Respect robots.txt

rmgr 2024-01-01 19:53:22 +10:30
parent b43343e0ee
commit f4ea8ad1d7
4 changed files with 25 additions and 2 deletions

beehave.txt Normal file

@@ -0,0 +1 @@
+https://github.com/bitbrain/beehave


@@ -3,21 +3,32 @@ import argparse
 import requests
 import hashlib
 from urllib.parse import urlparse, urljoin
+import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
 # TODO- Handle gemini/gopher links
+# TODO- Keep a list of traversed links and check before traversing again
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
-def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
+    rp = urllib.robotparser.RobotFileParser()
     print(url)
     print(recursion)
     urlparts = urlparse(url)
     baseurl = urlparts.scheme + "://" + urlparts.netloc
+    if baseurl not in robots:
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        robots[baseurl] = rp
+    else:
+        rp = robots[baseurl]
+    if not rp.can_fetch("*", url):
+        print("Robots prevents crawling url: " + url)
+        return
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
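
The hunk above makes the crawler honour robots.txt: it keeps one urllib.robotparser.RobotFileParser per host in the robots dict, keyed by scheme://netloc, and refuses to fetch any URL that can_fetch("*", url) rejects. Below is a minimal standalone sketch of that pattern; allowed_by_robots and _robots_cache are illustrative names, not part of the crawler.

# Minimal sketch of the per-host robots.txt caching pattern shown above.
# allowed_by_robots and _robots_cache are illustrative names only.
import urllib.robotparser
from urllib.parse import urlparse

_robots_cache = {}

def allowed_by_robots(url: str, user_agent: str = "*") -> bool:
    parts = urlparse(url)
    base = parts.scheme + "://" + parts.netloc
    rp = _robots_cache.get(base)
    if rp is None:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(base + "/robots.txt")
        rp.read()  # fetch and parse robots.txt once per host
        _robots_cache[base] = rp
    return rp.can_fetch(user_agent, url)

# e.g. allowed_by_robots("https://github.com/bitbrain/beehave")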
@@ -36,6 +47,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
     links = soup.find_all("a")
     for link in links:
         found = False
+        if "href" not in link:
+            continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
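
This hunk skips <a> tags that carry no href before the crawler tries to follow them. Here is a rough standalone illustration of that guard, using BeautifulSoup's Tag.get() (which returns None for a missing attribute) rather than the membership test in the diff; sample_html is made up for the example.

# Rough illustration of the "skip links without an href" guard above.
# Uses Tag.get(); sample_html is invented for demonstration.
from bs4 import BeautifulSoup

sample_html = '<a>no target</a> <a href="#top">in-page</a> <a href="https://example.com">ok</a>'
soup = BeautifulSoup(sample_html, 'html.parser')

for link in soup.find_all("a"):
    href = link.get("href")
    if not href or href.startswith("#") or "localhost" in href:
        continue  # no target, in-page anchor, or local link
    print(href)  # prints only https://example.com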
@@ -80,4 +93,5 @@ if __name__ == "__main__":
 # parse_html(line, get_html(line))
 # except:
 # pass
     os.remove('data/links.txt')


@@ -7,6 +7,13 @@ import json
 # investigate ngrams for "multi word" matching
 ignored_words = ['a', 'the','is']
+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
 def build_index():
     with open(f"data/index.json", "w") as index:
         # get a list of all content files
@@ -20,6 +27,7 @@ def build_index():
             content_words = content.split()
             for word in content_words:
                 word = word.lower()
+                word = remove_punctuation(word)
                 if not word in ignored_words:
                     if not word in dictionary:
                         dictionary[word] = []
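
The indexer change lower-cases each word and strips punctuation via the new remove_punctuation helper before it is added to the index, so "Beehave," and "beehave" are indexed as the same term. A small sketch of that normalization, using str.translate as a compact stand-in for the replace() loop; normalize is an illustrative name, not code from the commit.

# Illustrative equivalent of lower() + remove_punctuation() per word.
# str.translate stands in for the commit's replace() loop; normalize is a made-up name.
_punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
_strip_punc = str.maketrans('', '', _punc)

def normalize(word: str) -> str:
    return word.lower().translate(_strip_punc)

print(normalize("Beehave,"))     # beehave
print(normalize("robots.txt!"))  # robotstxt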