Respect robots.txt

rmgr 2024-01-01 19:53:22 +10:30
parent b43343e0ee
commit f4ea8ad1d7
4 changed files with 25 additions and 2 deletions

beehave.txt (new file)

@@ -0,0 +1 @@
+https://github.com/bitbrain/beehave


@@ -3,21 +3,32 @@ import argparse
 import requests
 import hashlib
 from urllib.parse import urlparse, urljoin
+import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
 
 # TODO- Handle gemini/gopher links
 # TODO- Keep a list of traversed links and check before traversing again
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
+    rp = urllib.robotparser.RobotFileParser()
     print(url)
     print(recursion)
     urlparts = urlparse(url)
     baseurl = urlparts.scheme + "://" + urlparts.netloc
+    if baseurl not in robots:
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        robots[baseurl] = rp
+    else:
+        rp = robots[baseurl]
+    if not rp.can_fetch("*", url):
+        print("Robots prevents crawling url: " + url)
+        return
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
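
The hunk above keeps one parsed robots.txt per host: urllib.robotparser.RobotFileParser downloads the file on read() and then answers per-URL queries via can_fetch(agent, url). A minimal standalone sketch of the same caching idea (the helper name and module-level cache are illustrative, not from this repo):

    import urllib.robotparser
    from urllib.parse import urlparse

    # One parsed robots.txt per host, fetched at most once.
    _robots_cache = {}

    def allowed(url: str, agent: str = "*") -> bool:
        parts = urlparse(url)
        base = parts.scheme + "://" + parts.netloc
        if base not in _robots_cache:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(base + "/robots.txt")
            rp.read()  # fetches and parses robots.txt
            _robots_cache[base] = rp
        return _robots_cache[base].can_fetch(agent, url)

    print(allowed("https://github.com/bitbrain/beehave"))

Note the commit threads its cache through a mutable default argument (robots = {}), which Python evaluates once at definition time, so the dict does persist across calls; a module-level dict as above expresses that lifetime explicitly.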
@@ -36,6 +47,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
     links = soup.find_all("a")
     for link in links:
         found = False
+        if not link.has_attr("href"):
+            continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
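
One subtlety behind the has_attr() check: membership tests on a BeautifulSoup Tag (`"href" in link`) look at the tag's children, not its attributes, so attribute presence has to be checked with has_attr() or .get(). A quick standalone illustration:

    from bs4 import BeautifulSoup

    tag = BeautifulSoup('<a href="/about">about us</a>', 'html.parser').a
    print('href' in tag.attrs)   # True  - explicit attribute lookup
    print(tag.has_attr('href'))  # True  - same check, more idiomatic
    print(tag.get('href'))       # '/about', or None when absent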
@@ -80,4 +93,5 @@ if __name__ == "__main__":
     #     parse_html(line, get_html(line))
     # except:
     #     pass
+    os.remove('data/links.txt')


@@ -7,6 +7,13 @@ import json
 # investigate ngrams for "multi word" matching
 ignored_words = ['a', 'the','is']
 
+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
+
+
 def build_index():
     with open(f"data/index.json", "w") as index:
         # get a list of all content files
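
As an aside, the same stripping can be done in a single pass with str.translate instead of one replace() per punctuation character; string.punctuation covers a similar, slightly larger character set than the hand-written list. A sketch of that alternative, not what the commit does:

    import string

    # Deletion table built once; translate() then strips in one pass.
    _strip_punct = str.maketrans('', '', string.punctuation)

    def remove_punctuation(input_string):
        return input_string.translate(_strip_punct)

    print(remove_punctuation("don't panic!"))  # dont panic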
@@ -20,6 +27,7 @@ def build_index():
             content_words = content.split()
             for word in content_words:
                 word = word.lower()
+                word = remove_punctuation(word)
                 if not word in ignored_words:
                     if not word in dictionary:
                         dictionary[word] = []
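
Taken together, the indexing loop now normalises each token with lower() and remove_punctuation() before filtering stopwords and appending the document to that word's entry. A compressed, self-contained sketch of the flow (the document ids and the word -> documents shape of `dictionary` are assumed from the surrounding diff):

    import string

    ignored_words = ['a', 'the', 'is']
    _strip = str.maketrans('', '', string.punctuation)

    def tokenize(content: str):
        # lower-case, strip punctuation, drop stopwords
        for word in content.split():
            word = word.lower().translate(_strip)
            if word and word not in ignored_words:
                yield word

    dictionary = {}
    for doc_id, text in [('doc1', 'The cat sat.'), ('doc2', 'A cat, again!')]:
        for word in tokenize(text):
            dictionary.setdefault(word, []).append(doc_id)

    print(dictionary)  # {'cat': ['doc1', 'doc2'], 'sat': ['doc1'], 'again': ['doc2']}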