Respect robots.txt
parent b43343e0ee · commit f4ea8ad1d7
4 changed files with 25 additions and 2 deletions
src/crawl.py (18 changed lines)
@@ -3,21 +3,32 @@ import argparse
 import requests
 import hashlib
 from urllib.parse import urlparse, urljoin
+import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
 # TODO- Handle gemini/gopher links
 # TODO- Keep a list of traversed links and check before traversing again
 
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
+    rp = urllib.robotparser.RobotFileParser()
     print(url)
     print(recursion)
     urlparts = urlparse(url)
     baseurl = urlparts.scheme + "://" + urlparts.netloc
+    if baseurl not in robots:
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        robots[baseurl] = rp
+    else:
+        rp = robots[baseurl]
+    if not rp.can_fetch("*", url):
+        print("Robots prevents crawling url: " + url)
+        return
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
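Review note: the pattern this hunk adds is one RobotFileParser per base URL, cached in a dict and consulted before any fetch. A minimal standalone sketch of that pattern, assuming the same urllib.robotparser API and a hypothetical example.com URL (not part of the commit):

    import urllib.robotparser
    from urllib.parse import urlparse

    robots = {}  # base URL -> RobotFileParser, mirroring the `robots` argument above

    def allowed(url: str, agent: str = "*") -> bool:
        parts = urlparse(url)
        baseurl = parts.scheme + "://" + parts.netloc
        if baseurl not in robots:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(baseurl + "/robots.txt")
            rp.read()  # fetch and parse robots.txt once per host
            robots[baseurl] = rp
        return robots[baseurl].can_fetch(agent, url)

    # Hypothetical usage:
    # allowed("https://example.com/some/page") -> True or False, per that host's robots.txt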
@@ -36,6 +47,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
     links = soup.find_all("a")
     for link in links:
         found = False
+        if "href" not in link:
+            continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
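Review note on the new href guard: with BeautifulSoup, attribute presence is usually tested with Tag.has_attr() or Tag.get() rather than the `in` operator. A small sketch of that idiom, assuming bs4 and a throwaway HTML snippet (not the crawler's input):

    from bs4 import BeautifulSoup

    html = '<a href="/about">About</a><a name="top">no href</a>'  # toy markup for illustration
    soup = BeautifulSoup(html, 'html.parser')

    for link in soup.find_all("a"):
        href = link.get("href")  # returns None when the attribute is missing
        if href is None:         # equivalent to: if not link.has_attr("href")
            continue
        print(href)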
@@ -80,4 +93,5 @@ if __name__ == "__main__":
         # parse_html(line, get_html(line))
         # except:
         # pass
 
+    os.remove('data/links.txt')
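Review note on the cleanup line: os.remove() raises FileNotFoundError if data/links.txt does not exist. A more defensive variant, shown only as a sketch and not what the commit does:

    import contextlib
    import os

    # Remove the links file if present; ignore the case where it was never created.
    with contextlib.suppress(FileNotFoundError):
        os.remove('data/links.txt')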