From e3c67b64e63762e50899390afa7f5d4cca3d6c17 Mon Sep 17 00:00:00 2001
From: rmgr
Date: Sat, 8 Jun 2024 20:24:21 +0930
Subject: [PATCH] Make excluded file types more robust

---
 src/crawl.py | 10 +++++++---
 todo         |  2 ++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/crawl.py b/src/crawl.py
index 467b434..6966a25 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -23,6 +23,9 @@ Session = sessionmaker(bind=engine)
 
 excluded_domains = ['amazon.', 'news.ycombinator.', 'facebook.com', 'amzn', 'fb.com']
 
+excluded_filetypes = [".jpg", ".xml", ".mp4",
+                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
+
 
 def get_html(url: str) -> str:
     response = requests.get(url)
@@ -85,6 +88,9 @@ def parse_youtube(video_url: str) -> bool:
 
 
 def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
+    for domain in excluded_domains:
+        if domain in url:
+            return
     if "youtube.com" in url:
         parse_youtube(url)
         return
@@ -124,15 +130,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
         s.add(existing_website)
         s.commit()
         s.close()
-    x = open(f'data/links.txt', 'a')
-    x.close()
     links = soup.find_all("a", href=True)
     for link in links:
         found = False
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
+        if any(ext in link for ext in excluded_filetypes):
             continue
         if "http" not in link:
             link = urljoin(url, link)
diff --git a/todo b/todo
index 328320b..2f5f3e5 100644
--- a/todo
+++ b/todo
@@ -6,4 +6,6 @@
 [x] Add clustered index to document_ngrams table model
 [x] Add clustered index to document_tokens table model
 [ ] Add ddl command to create partition tables
+[ ] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
+[ ] Instead of starting from a random page on the site, go to root and find site map and crawl that
 
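
Below is a small standalone sketch, not part of the patch itself, showing how the check the patch introduces (any(ext in link for ext in excluded_filetypes)) behaves, alongside one possible stricter variant that compares the URL path's actual extension. The helper names link_is_excluded_substring and link_is_excluded_by_extension are hypothetical and do not exist in src/crawl.py.

# Standalone sketch: the substring check from the patch next to an
# illustrative extension-based variant. Helper names are hypothetical.
from os.path import splitext
from urllib.parse import urlparse

excluded_filetypes = [".jpg", ".xml", ".mp4",
                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]


def link_is_excluded_substring(link: str) -> bool:
    # Mirrors the patch: drop any link that contains an excluded extension
    # anywhere in the string (fast, but also matches query strings).
    return any(ext in link for ext in excluded_filetypes)


def link_is_excluded_by_extension(link: str) -> bool:
    # Stricter variant: only drop links whose path really ends in an
    # excluded extension, compared case-insensitively.
    return splitext(urlparse(link).path)[1].lower() in excluded_filetypes


if __name__ == "__main__":
    for sample in ["https://example.com/photo.JPG",
                   "https://example.com/download?file=chart.png",
                   "https://example.com/feed.xml"]:
        print(sample,
              link_is_excluded_substring(sample),
              link_is_excluded_by_extension(sample))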