Make excluded file types more robust

rmgr 2024-06-08 20:24:21 +09:30
parent 98efe9d1a2
commit e3c67b64e6
2 changed files with 9 additions and 3 deletions


@@ -23,6 +23,9 @@ Session = sessionmaker(bind=engine)
 excluded_domains = ['amazon.', 'news.ycombinator.',
                     'facebook.com', 'amzn', 'fb.com']
+excluded_filetypes = [".jpg", ".xml", ".mp4",
+                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
 
 def get_html(url: str) -> str:
     response = requests.get(url)
@@ -85,6 +88,9 @@ def parse_youtube(video_url: str) -> bool:
 def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
+    for domain in excluded_domains:
+        if domain in url:
+            return
     if "youtube.com" in url:
         parse_youtube(url)
         return
@@ -124,15 +130,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
         s.add(existing_website)
         s.commit()
         s.close()
-    x = open(f'data/links.txt', 'a')
-    x.close()
     links = soup.find_all("a", href=True)
     for link in links:
         found = False
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
+        if any(ext in link for ext in excluded_filetypes):
             continue
         if "http" not in link:
             link = urljoin(url, link)

todo

@@ -6,4 +6,6 @@
 [x] Add clustered index to document_ngrams table model
 [x] Add clustered index to document_tokens table model
 [ ] Add ddl command to create partition tables
+[ ] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
+[ ] Instead of starting from a random page on the site, go to the root, find the site map, and crawl that
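
The last todo item (crawl from the site map rather than a random page) could start roughly like the sketch below. This is an assumption, not code from this repository: the helper name `sitemap_urls` is hypothetical, and it relies on the `Sitemap:` directive that sites commonly declare in robots.txt (exposed by `RobotFileParser.site_maps()` in Python 3.8+).

    import requests
    import xml.etree.ElementTree as ET
    from urllib.parse import urljoin
    from urllib.robotparser import RobotFileParser

    def sitemap_urls(root_url: str) -> list[str]:
        # robots.txt may declare one or more "Sitemap:" entries; fall back to
        # the conventional /sitemap.xml location if none are listed.
        rp = RobotFileParser(urljoin(root_url, "/robots.txt"))
        rp.read()
        sitemaps = rp.site_maps() or [urljoin(root_url, "/sitemap.xml")]

        urls = []
        ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
        for sm in sitemaps:
            tree = ET.fromstring(requests.get(sm, timeout=10).content)
            # A urlset lists pages directly; a sitemapindex nests further
            # sitemaps, which this sketch does not descend into.
            urls.extend(loc.text.strip() for loc in tree.iter(f"{ns}loc") if loc.text)
        return urls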