diff --git a/beehave.txt b/beehave.txt
new file mode 100644
index 0000000..e3415b6
--- /dev/null
+++ b/beehave.txt
@@ -0,0 +1 @@
+https://github.com/bitbrain/beehave
diff --git a/src/__pycache__/search.cpython-310.pyc b/src/__pycache__/search.cpython-310.pyc
index c740282..f3e8621 100644
Binary files a/src/__pycache__/search.cpython-310.pyc and b/src/__pycache__/search.cpython-310.pyc differ
diff --git a/src/crawl.py b/src/crawl.py
index dcac7ed..da6bffa 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -3,21 +3,32 @@ import argparse
 import requests
 import hashlib
 from urllib.parse import urlparse, urljoin
+import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
 
 # TODO- Handle gemini/gopher links
-# TODO- Keep a list of traversed links and check before traversing again
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
+    rp = urllib.robotparser.RobotFileParser()
     print(url)
     print(recursion)
     urlparts = urlparse(url)
     baseurl = urlparts.scheme + "://" + urlparts.netloc
+    if baseurl not in robots:
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        robots[baseurl] = rp
+    else:
+        rp = robots[baseurl]
+    if not rp.can_fetch("*", url):
+        print("Robots prevents crawling url: " + url)
+        return
+
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
@@ -36,6 +47,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
     links = soup.find_all("a")
     for link in links:
         found = False
+        if "href" not in link.attrs:
+            continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
@@ -80,4 +93,5 @@ if __name__ == "__main__":
 #        parse_html(line, get_html(line))
 #    except:
 #        pass
+    os.remove('data/links.txt')
 
diff --git a/src/index.py b/src/index.py
index f55a356..7532247 100755
--- a/src/index.py
+++ b/src/index.py
@@ -7,6 +7,13 @@ import json
 # investigate ngrams for "multi word" matching
 ignored_words = ['a', 'the','is']
 
+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
+
+
 def build_index():
     with open(f"data/index.json", "w") as index:
         # get a list of all content files
@@ -20,6 +27,7 @@ def build_index():
             content_words = content.split()
             for word in content_words:
                 word = word.lower()
+                word = remove_punctuation(word)
                 if not word in ignored_words:
                     if not word in dictionary:
                         dictionary[word] = []
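
For context, here is a minimal standalone sketch of the robots.txt gate that the crawl.py hunk introduces: one urllib.robotparser.RobotFileParser per host, cached in a dict keyed by scheme://netloc and consulted before a URL is crawled. The helper name "allowed" and the example.com URL are illustrative only; the crawler inlines this logic inside parse_html.

# Standalone sketch, not part of the repository; "allowed" and the
# example.com URL are made up for illustration.
import urllib.robotparser
from urllib.parse import urlparse

robots = {}  # cache: one parser per scheme://netloc, so robots.txt is fetched once per host

def allowed(url: str, agent: str = "*") -> bool:
    parts = urlparse(url)
    base = parts.scheme + "://" + parts.netloc
    if base not in robots:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(base + "/robots.txt")
        rp.read()  # download and parse robots.txt for this host
        robots[base] = rp
    return robots[base].can_fetch(agent, url)

if __name__ == "__main__":
    print(allowed("https://example.com/some/page"))

The remove_punctuation helper added to index.py strips characters one at a time with str.replace; a single-pass alternative would be input_string.translate(str.maketrans('', '', punc)), which produces the same result for this index.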