Respect robots.txt

rmgr 2024-01-01 19:53:22 +10:30
parent b43343e0ee
commit f4ea8ad1d7
4 changed files with 25 additions and 2 deletions


@@ -3,21 +3,32 @@ import argparse
import requests
import hashlib
from urllib.parse import urlparse, urljoin
import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
# TODO- Handle gemini/gopher links
# TODO- Keep a list of traversed links and check before traversing again
def get_html(url: str) -> str:
    response = requests.get(url)
    return response.content

def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
    rp = urllib.robotparser.RobotFileParser()
    print(url)
    print(recursion)
    urlparts = urlparse(url)
    baseurl = urlparts.scheme + "://" + urlparts.netloc
    if baseurl not in robots:
        rp.set_url(baseurl + "/robots.txt")
        rp.read()
        robots[baseurl] = rp
    else:
        rp = robots[baseurl]
    if not rp.can_fetch("*", url):
        print("Robots prevents crawling url: " + url)
        return False
    soup = BeautifulSoup(html, 'html.parser')
    hash = hashlib.sha256()
    hash.update(url.encode('ascii'))
@@ -36,6 +47,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
    links = soup.find_all("a")
    for link in links:
        found = False
if "href" not in link:
continue
        link = link["href"]
        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
            continue
@@ -80,4 +93,5 @@ if __name__ == "__main__":
    # parse_html(line, get_html(line))
    # except:
    # pass
    os.remove('data/links.txt')
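
For context, the core of this change is a per-host robots.txt cache that is consulted before every fetch. Below is a minimal standalone sketch of that pattern, assuming a generic fetch_if_allowed helper and https://example.com/ as a placeholder URL; neither name comes from the crawler itself.

import urllib.robotparser
from typing import Optional
from urllib.parse import urlparse

import requests

# One RobotFileParser per scheme://netloc, so each host's robots.txt
# is downloaded and parsed at most once per run.
robots_cache = {}

def robots_for(url: str) -> urllib.robotparser.RobotFileParser:
    parts = urlparse(url)
    base = parts.scheme + "://" + parts.netloc
    if base not in robots_cache:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(base + "/robots.txt")
        rp.read()  # fetch and parse robots.txt for this host
        robots_cache[base] = rp
    return robots_cache[base]

def fetch_if_allowed(url: str, user_agent: str = "*") -> Optional[bytes]:
    # Illustrative helper, not part of the crawler: skip URLs that the
    # host's robots.txt disallows for this user agent.
    if not robots_for(url).can_fetch(user_agent, url):
        print("Robots prevents crawling url: " + url)
        return None
    return requests.get(url).content

if __name__ == "__main__":
    page = fetch_if_allowed("https://example.com/")
    print(page is not None)

As in the commit, the wildcard user agent "*" is checked; a crawler with its own identity would pass that user-agent string to can_fetch and to the request instead.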