Make excluded file types more robust
commit e3c67b64e6
parent 98efe9d1a2
2 changed files with 9 additions and 3 deletions
src/crawl.py (10 changes)
@@ -23,6 +23,9 @@ Session = sessionmaker(bind=engine)
 excluded_domains = ['amazon.', 'news.ycombinator.',
                     'facebook.com', 'amzn', 'fb.com']
 
+excluded_filetypes = [".jpg", ".xml", ".mp4",
+                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
+
 
 def get_html(url: str) -> str:
     response = requests.get(url)
@@ -85,6 +88,9 @@ def parse_youtube(video_url: str) -> bool:
 
 
 def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
+    for domain in excluded_domains:
+        if domain in url:
+            return
     if "youtube.com" in url:
         parse_youtube(url)
         return
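One thing worth noting about the new guard: `domain in url` is a plain substring test, so a short fragment like 'amzn' also matches unrelated URLs whose path or query happens to contain that text. A stricter variant could match against the parsed hostname only; the sketch below is illustrative, not part of this commit, and is_excluded_domain is a hypothetical helper:

from urllib.parse import urlparse

def is_excluded_domain(url: str) -> bool:
    # Match fragments against the hostname only, not the full URL,
    # so path or query text cannot trigger a false positive.
    host = urlparse(url).netloc.lower()
    return any(domain in host for domain in excluded_domains)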
@@ -124,15 +130,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
         s.add(existing_website)
         s.commit()
         s.close()
-    x = open(f'data/links.txt', 'a')
-    x.close()
     links = soup.find_all("a", href=True)
     for link in links:
         found = False
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
+        if any(ext in link for ext in excluded_filetypes):
             continue
         if "http" not in link:
             link = urljoin(url, link)
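The rewritten check consolidates the old chain of `or` clauses and additionally filters .xml, .mp4, .mp3, and .tiff. One subtle behavioral change: the old chain matched ".jpeg", which is absent from excluded_filetypes (and ".jpg" is not a substring of ".jpeg"), so .jpeg links now pass the filter. Also, because `ext in link` matches anywhere in the URL, an extension appearing mid-path or in a query string still triggers the filter; a suffix-only variant could look like the sketch below (illustrative only; has_excluded_filetype is a hypothetical helper):

import os
from urllib.parse import urlparse

def has_excluded_filetype(link: str) -> bool:
    # Inspect only the extension of the URL path, so query-string
    # text such as ?file=a.jpg cannot cause a false positive.
    path = urlparse(link).path.lower()
    return os.path.splitext(path)[1] in excluded_filetypes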
todo (2 changes)
@@ -6,4 +6,6 @@
 [x] Add clustered index to document_ngrams table model
 [x] Add clustered index to document_tokens table model
 [ ] Add ddl command to create partition tables
+[ ] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
+[ ] Instead of starting from a random page on the site, go to root and find the site map and crawl that
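The two new todo items pair naturally, since robots.txt often advertises the sitemap directly. Below is a minimal sketch assuming the standard library's urllib.robotparser (site_maps() requires Python 3.8+) and the requests dependency the crawler already uses; discover_sitemap_urls is a hypothetical helper, not existing code:

import urllib.robotparser
from urllib.parse import urljoin
from xml.etree import ElementTree
import requests

SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

def discover_sitemap_urls(root_url: str) -> list:
    # robots.txt answers can_fetch() queries and may list sitemaps.
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(root_url, "/robots.txt"))
    rp.read()
    sitemaps = rp.site_maps() or [urljoin(root_url, "/sitemap.xml")]
    urls = []
    for sitemap in sitemaps:
        tree = ElementTree.fromstring(requests.get(sitemap).content)
        # Each <url><loc>...</loc></url> entry is a crawlable page.
        urls.extend(loc.text for loc in tree.iter(SITEMAP_NS + "loc"))
    return urls

The same RobotFileParser instance can then answer rp.can_fetch(user_agent, url) per link, which would settle the question of how aggressive robots.txt actually is for the sites being crawled.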