Make excluded file types more robust
parent 98efe9d1a2
commit e3c67b64e6

2 changed files with 9 additions and 3 deletions
src/crawl.py (10 changes)

@@ -23,6 +23,9 @@ Session = sessionmaker(bind=engine)
 excluded_domains = ['amazon.', 'news.ycombinator.',
                     'facebook.com', 'amzn', 'fb.com']
+
+excluded_filetypes = [".jpg", ".xml", ".mp4",
+                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
 
 
 def get_html(url: str) -> str:
     response = requests.get(url)
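An editor's aside, not part of the commit: testing extensions as raw substrings also fires when the extension appears mid-URL, e.g. "https://example.com/article.pdf-reader" or a query string like "?fallback=.png". A minimal sketch of a stricter check against the end of the URL path, using the stdlib urllib.parse (has_excluded_filetype is a hypothetical helper, not in the source):

from urllib.parse import urlparse

excluded_filetypes = [".jpg", ".xml", ".mp4",
                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]

def has_excluded_filetype(link: str) -> bool:
    # Look only at the end of the URL path, so query strings and
    # fragments cannot accidentally contain a matching extension.
    path = urlparse(link).path.lower()
    return any(path.endswith(ext) for ext in excluded_filetypes)

For example, has_excluded_filetype("https://example.com/cat.PNG") is True, while has_excluded_filetype("https://example.com/article.pdf-reader") is False.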
@@ -85,6 +88,9 @@ def parse_youtube(video_url: str) -> bool:
 
 
 def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
+    for domain in excluded_domains:
+        if domain in url:
+            return
     if "youtube.com" in url:
         parse_youtube(url)
         return
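Another editor's aside: "domain in url" tests the entire URL string, so a short entry like 'amzn' would also exclude a page such as "https://example.com/blog/amzn-earnings". If stricter behavior were wanted, one sketch is to restrict the test to the parsed hostname (is_excluded_domain is a hypothetical helper, assuming the excluded_domains list above):

from urllib.parse import urlparse

excluded_domains = ['amazon.', 'news.ycombinator.',
                    'facebook.com', 'amzn', 'fb.com']

def is_excluded_domain(url: str) -> bool:
    # Only the hostname is inspected, so path or query text
    # containing a domain fragment no longer causes a skip.
    host = (urlparse(url).hostname or "").lower()
    return any(domain in host for domain in excluded_domains)

Under this check, is_excluded_domain("https://example.com/blog/amzn-earnings") is False, while is_excluded_domain("https://www.amazon.com/dp/123") is still True.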
@@ -124,15 +130,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
         s.add(existing_website)
         s.commit()
         s.close()
-        x = open(f'data/links.txt', 'a')
-        x.close()
     links = soup.find_all("a", href=True)
     for link in links:
         found = False
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
+        if any(ext in link for ext in excluded_filetypes):
             continue
         if "http" not in link:
             link = urljoin(url, link)
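For readers new to any(): the replacement line short-circuits over excluded_filetypes and is equivalent to unrolling the deleted or-chain, one "ext in link" test per entry. A small illustration with the list from this commit:

excluded_filetypes = [".jpg", ".xml", ".mp4",
                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]

link = "https://example.com/photos/cat.webp"

# any() with a generator: stops at the first extension found in the link.
print(any(ext in link for ext in excluded_filetypes))   # True

# Equivalent explicit loop, as the old or-chain would have unrolled:
matched = False
for ext in excluded_filetypes:
    if ext in link:
        matched = True
        break
print(matched)   # True

One behavioral difference worth flagging: the old condition also matched ".jpeg", which does not appear in the new excluded_filetypes list (and ".jpg" is not a substring of ".jpeg"), so .jpeg links are no longer skipped.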
todo (2 changes)

@@ -6,4 +6,6 @@
 [x] Add clustered index to document_ngrams table model
 [x] Add clustered index to document_tokens table model
+[ ] Add ddl command to create partition tables
+[ ] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
 [ ] Instead of starting from a random page on the site, go to root and find site map and crawl that
 