Make excluded file types more robust
commit e3c67b64e6
parent 98efe9d1a2
2 changed files with 9 additions and 3 deletions
src/crawl.py (10 changes)
@@ -23,6 +23,9 @@ Session = sessionmaker(bind=engine)
 excluded_domains = ['amazon.', 'news.ycombinator.',
                     'facebook.com', 'amzn', 'fb.com']
 
+excluded_filetypes = [".jpg", ".xml", ".mp4",
+                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
+
 
 def get_html(url: str) -> str:
     response = requests.get(url)
@@ -85,6 +88,9 @@ def parse_youtube(video_url: str) -> bool:
 
 
 def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
+    for domain in excluded_domains:
+        if domain in url:
+            return
     if "youtube.com" in url:
         parse_youtube(url)
         return
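One thing worth noting about the new guard: `domain in url` is a plain substring test, so a short fragment like 'amzn' also matches unrelated URLs whose path or query happens to contain that text. A stricter variant could match against the parsed hostname only; the sketch below is illustrative, not part of this commit, and is_excluded_domain is a hypothetical helper:

from urllib.parse import urlparse

def is_excluded_domain(url: str) -> bool:
    # Match fragments against the hostname only, not the full URL,
    # so path or query text cannot trigger a false positive.
    host = urlparse(url).netloc.lower()
    return any(domain in host for domain in excluded_domains)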
@@ -124,15 +130,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
         s.add(existing_website)
         s.commit()
         s.close()
-    x = open(f'data/links.txt', 'a')
-    x.close()
     links = soup.find_all("a", href=True)
     for link in links:
         found = False
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
+        if any(ext in link for ext in excluded_filetypes):
             continue
         if "http" not in link:
             link = urljoin(url, link)
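The rewritten check consolidates the old chain of `or` clauses and additionally filters .xml, .mp4, .mp3, and .tiff. One subtle behavioral change: the old chain matched ".jpeg", which is absent from excluded_filetypes (and ".jpg" is not a substring of ".jpeg"), so .jpeg links now pass the filter. Also, because `ext in link` matches anywhere in the URL, an extension appearing mid-path or in a query string still triggers the filter; a suffix-only variant could look like the sketch below (illustrative only; has_excluded_filetype is a hypothetical helper):

import os
from urllib.parse import urlparse

def has_excluded_filetype(link: str) -> bool:
    # Inspect only the extension of the URL path, so query-string
    # text such as ?file=a.jpg cannot cause a false positive.
    path = urlparse(link).path.lower()
    return os.path.splitext(path)[1] in excluded_filetypes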
todo (2 changes)
@@ -6,4 +6,6 @@
 [x] Add clustered index to document_ngrams table model
 [x] Add clustered index to document_tokens table model
 [ ] Add ddl command to create partition tables
+[ ] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
+[ ] Instead of starting from a random page on the site, go to root and find the site map and crawl that
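The two new todo items pair naturally, since robots.txt often advertises the sitemap directly. Below is a minimal sketch assuming the standard library's urllib.robotparser (site_maps() requires Python 3.8+) and the requests dependency the crawler already uses; discover_sitemap_urls is a hypothetical helper, not existing code:

import urllib.robotparser
from urllib.parse import urljoin
from xml.etree import ElementTree
import requests

SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

def discover_sitemap_urls(root_url: str) -> list:
    # robots.txt answers can_fetch() queries and may list sitemaps.
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(root_url, "/robots.txt"))
    rp.read()
    sitemaps = rp.site_maps() or [urljoin(root_url, "/sitemap.xml")]
    urls = []
    for sitemap in sitemaps:
        tree = ElementTree.fromstring(requests.get(sitemap).content)
        # Each <url><loc>...</loc></url> entry is a crawlable page.
        urls.extend(loc.text for loc in tree.iter(SITEMAP_NS + "loc"))
    return urls

The same RobotFileParser instance can then answer rp.can_fetch(user_agent, url) per link, which would settle the question of how aggressive robots.txt actually is for the sites being crawled.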