From e3c67b64e63762e50899390afa7f5d4cca3d6c17 Mon Sep 17 00:00:00 2001
From: rmgr
Date: Sat, 8 Jun 2024 20:24:21 +0930
Subject: [PATCH] Make excluded file types more robust

---
 src/crawl.py | 10 +++++++---
 todo         |  2 ++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/crawl.py b/src/crawl.py
index 467b434..6966a25 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -23,6 +23,9 @@ Session = sessionmaker(bind=engine)
 
 excluded_domains = ['amazon.', 'news.ycombinator.', 'facebook.com', 'amzn', 'fb.com']
 
+excluded_filetypes = [".jpg", ".xml", ".mp4",
+                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
+
 
 def get_html(url: str) -> str:
     response = requests.get(url)
@@ -85,6 +88,9 @@ def parse_youtube(video_url: str) -> bool:
 
 
 def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
+    for domain in excluded_domains:
+        if domain in url:
+            return
     if "youtube.com" in url:
         parse_youtube(url)
         return
@@ -124,15 +130,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
         s.add(existing_website)
         s.commit()
         s.close()
-    x = open(f'data/links.txt', 'a')
-    x.close()
     links = soup.find_all("a", href=True)
     for link in links:
         found = False
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
+        if any(ext in link for ext in excluded_filetypes):
             continue
         if "http" not in link:
             link = urljoin(url, link)
diff --git a/todo b/todo
index 328320b..2f5f3e5 100644
--- a/todo
+++ b/todo
@@ -6,4 +6,6 @@
 [x] Add clustered index to document_ngrams table model
 [x] Add clustered index to document_tokens table model
 [ ] Add ddl command to create partition tables
+[ ] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
+[ ] Instead of starting from a random page on the site, go to root and find site map and crawl that
 
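
Below is a small standalone sketch, not part of the patch itself, showing how the check the patch introduces (any(ext in link for ext in excluded_filetypes)) behaves, alongside one possible stricter variant that compares the URL path's actual extension. The helper names link_is_excluded_substring and link_is_excluded_by_extension are hypothetical and do not exist in src/crawl.py.

# Standalone sketch: the substring check from the patch next to an
# illustrative extension-based variant. Helper names are hypothetical.
from os.path import splitext
from urllib.parse import urlparse

excluded_filetypes = [".jpg", ".xml", ".mp4",
                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]


def link_is_excluded_substring(link: str) -> bool:
    # Mirrors the patch: drop any link that contains an excluded extension
    # anywhere in the string (fast, but also matches query strings).
    return any(ext in link for ext in excluded_filetypes)


def link_is_excluded_by_extension(link: str) -> bool:
    # Stricter variant: only drop links whose path really ends in an
    # excluded extension, compared case-insensitively.
    return splitext(urlparse(link).path)[1].lower() in excluded_filetypes


if __name__ == "__main__":
    for sample in ["https://example.com/photo.JPG",
                   "https://example.com/download?file=chart.png",
                   "https://example.com/feed.xml"]:
        print(sample,
              link_is_excluded_substring(sample),
              link_is_excluded_by_extension(sample))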