From f4ea8ad1d776ea76241357186ed9cc1fb26e11a8 Mon Sep 17 00:00:00 2001
From: rmgr
Date: Mon, 1 Jan 2024 19:53:22 +1030
Subject: [PATCH] Respect robots.txt

---
 beehave.txt                            |   1 +
 src/__pycache__/search.cpython-310.pyc | Bin 685 -> 1033 bytes
 src/crawl.py                           |  18 ++++++++++++++++--
 src/index.py                           |   8 ++++++++
 4 files changed, 25 insertions(+), 2 deletions(-)
 create mode 100644 beehave.txt

diff --git a/beehave.txt b/beehave.txt
new file mode 100644
index 0000000..e3415b6
--- /dev/null
+++ b/beehave.txt
@@ -0,0 +1 @@
+https://github.com/bitbrain/beehave
diff --git a/src/__pycache__/search.cpython-310.pyc b/src/__pycache__/search.cpython-310.pyc
index c7402826f1c52ce28e47c499f6b8c84b197fb854..f3e8621f1765bb2ca051342506fd63ec8716cacf 100644
GIT binary patch
delta 673
[base85-encoded binary delta omitted]

delta 301
[base85-encoded binary delta omitted]

diff --git a/src/crawl.py b/src/crawl.py
--- a/src/crawl.py
+++ b/src/crawl.py
@@ ... @@ def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
+    rp = urllib.robotparser.RobotFileParser()
     print(url)
     print(recursion)
     urlparts = urlparse(url)
     baseurl = urlparts.scheme + "://" + urlparts.netloc
+    if baseurl not in robots:
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        robots[baseurl] = rp
+    else:
+        rp = robots[baseurl]
+    if not rp.can_fetch("*", url):
+        print("Robots prevents crawling url: " + url)
+        return False
+
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
@@ -36,6 +47,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
     links = soup.find_all("a")
     for link in links:
         found = False
+        if not link.has_attr("href"):
+            continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
@@ -80,4 +93,5 @@ if __name__ == "__main__":
 #            parse_html(line, get_html(line))
 #        except:
 #            pass
 
+    os.remove('data/links.txt')
diff --git a/src/index.py b/src/index.py
index f55a356..7532247 100755
--- a/src/index.py
+++ b/src/index.py
@@ -7,6 +7,13 @@
 import json
 # investigate ngrams for "multi word" matching
 ignored_words = ['a', 'the','is']
+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
+
+
 def build_index():
     with open(f"data/index.json", "w") as index:
         # get a list of all content files
@@ -20,6 +27,7 @@ def build_index():
             content_words = content.split()
             for word in content_words:
                 word = word.lower()
+                word = remove_punctuation(word)
                 if not word in ignored_words:
                     if not word in dictionary:
                         dictionary[word] = []
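
Aside (illustrative, not part of the patch): the robots.txt handling above boils down to caching one RobotFileParser per scheme://netloc and asking it can_fetch() before crawling a URL. A minimal standalone sketch of that approach follows; the helper name allowed_by_robots and the module-level cache are made up for this note, and the error handling around read() is an addition the patch itself does not have.

import urllib.robotparser
from urllib.parse import urlparse

# Illustrative cache keyed by scheme://netloc, mirroring the `robots` dict
# that parse_html() threads through its recursive calls.
_robots_cache = {}

def allowed_by_robots(url, user_agent="*"):
    """Return True if the host's robots.txt permits fetching `url`."""
    parts = urlparse(url)
    base = parts.scheme + "://" + parts.netloc
    rp = _robots_cache.get(base)
    if rp is None:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(base + "/robots.txt")
        try:
            rp.read()  # downloads and parses <base>/robots.txt
        except OSError:
            # Fetch failed entirely: leave the parser unread, so can_fetch()
            # conservatively returns False for this host.
            pass
        _robots_cache[base] = rp
    return rp.can_fetch(user_agent, url)

if __name__ == "__main__":
    print(allowed_by_robots("https://github.com/bitbrain/beehave"))

Caching the parser per host matters because parse_html() recurses over discovered links; without the shared robots dict, every link would trigger a fresh robots.txt download for the same site.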
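
A similar aside for the indexing change: remove_punctuation() strips characters one at a time with str.replace(), and str.translate() can do the same normalization in a single pass. The normalize() helper below is hypothetical, and string.punctuation covers a slightly different character set than the patch's punc string.

import string

# Illustrative one-pass alternative to the loop-based remove_punctuation().
_strip_punct = str.maketrans("", "", string.punctuation)

def normalize(word):
    return word.lower().translate(_strip_punct)

print(normalize("Robots.txt,"))  # -> robotstxt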