From f36ab2fbfb556934838e00dfc152372cf6421456 Mon Sep 17 00:00:00 2001
From: rmgr
Date: Tue, 28 Nov 2023 20:51:54 +1030
Subject: [PATCH] Initial commit

---
 requirements.txt                      | 21 +++++++++
 src/__pycache__/search.cpython-39.pyc | Bin 0 -> 679 bytes
 src/crawl.py                          | 63 ++++++++++++++++++++++++++
 src/index.py                          | 36 +++++++++++++++
 src/search.py                         | 24 ++++++++++
 5 files changed, 144 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 src/__pycache__/search.cpython-39.pyc
 create mode 100755 src/crawl.py
 create mode 100755 src/index.py
 create mode 100755 src/search.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3321311
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,21 @@
+beautifulsoup4==4.12.2
+blinker==1.7.0
+bs4==0.0.1
+certifi==2023.11.17
+chardet==5.2.0
+charset-normalizer==3.3.2
+click==8.1.7
+cssselect==1.2.0
+flask==3.0.0
+idna==3.6
+importlib-metadata==6.8.0
+itsdangerous==2.1.2
+Jinja2==3.1.2
+lxml==4.9.3
+MarkupSafe==2.1.3
+readability-lxml==0.8.1
+requests==2.31.0
+soupsieve==2.5
+urllib3==2.1.0
+werkzeug==3.0.1
+zipp==3.17.0
diff --git a/src/__pycache__/search.cpython-39.pyc b/src/__pycache__/search.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22a345c6aa6159e978e60df4597a596caf62a433
GIT binary patch
literal 679
[binary literal not recoverable]

diff --git a/src/crawl.py b/src/crawl.py
new file mode 100755
--- /dev/null
+++ b/src/crawl.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python3
+import requests
+from bs4 import BeautifulSoup
+import hashlib
+import os
+import argparse
+
+def get_html(url: str) -> bytes:
+    response = requests.get(url)
+    return response.content
+
+def parse_html(url: str, html: bytes) -> None:
+    soup = BeautifulSoup(html, 'html.parser')
+    hash = hashlib.sha256()
+    hash.update(url.encode('ascii'))
+    filename_text = hash.hexdigest() + '.txt'
+    filename_html = hash.hexdigest() + '.html'
+    # store the extracted text and the raw HTML under a hash of the URL
+    with open(f'data/content/{filename_text}', 'w') as outfile:
+        outfile.write(url)
+        outfile.write('\n')
+        outfile.write(soup.get_text())
+    with open(f'data/content/{filename_html}', 'w') as outfile:
+        outfile.write(url)
+        outfile.write('\n')
+        outfile.write(soup.prettify())
+    # make sure the links file exists before it is read back below
+    open('data/links.txt', 'a').close()
+    links = soup.find_all("a")
+    for link in links:
+        found = False
+        if link.has_attr('href'):
+            link = link["href"]
+        else:
+            continue
+        if "http" not in link:
+            link = url + "/" + link
+        # only record links that haven't been seen before
+        with open('data/links.txt', 'r+') as linksfile:
+            while line := linksfile.readline():
+                if line.strip() == link.strip():
+                    found = True
+            if not found:
+                linksfile.write(f'{link}\n')
+
+if __name__ == "__main__":
+    os.makedirs("data/content", exist_ok=True)
+    # check inputs
+    parser = argparse.ArgumentParser()
+    parser.add_argument("url", help="URL of the webpage to be crawled")
+    parser.add_argument('-f', "--followlinks", action="store_true")
+
+    args = parser.parse_args()
+    html = get_html(args.url)
+    parse_html(args.url, html)
+
+    if args.followlinks:
+        with open('data/links.txt', 'r+') as linksfile:
+            while line := linksfile.readline():
+                if "http" in line:
+                    try:
+                        parse_html(line.strip(), get_html(line.strip()))
+
+                    except Exception:
+                        pass
diff --git a/src/index.py b/src/index.py
new file mode 100755
index 0000000..781b9b6
--- /dev/null
+++ b/src/index.py
@@ -0,0 +1,36 @@
+#!/usr/bin/python3
+
+from pathlib import Path
+import argparse
+import os
+import json
+
+ignored_words = ['a', 'the', 'is']
+
+def build_index():
+    with open("data/index.json", "w") as index:
+        # get a list of all content files
+        # split on whitespace and add to index
+        dictionary = {}
+        pathlist = Path('data/content').rglob('*.txt')
+        for path in pathlist:
+            with open(str(path)) as content_file:
+                url = content_file.readline()
+                content = content_file.read()
+                content_words = content.split()
+                for word in content_words:
+                    word = word.lower()
+                    if word not in ignored_words:
+                        if word not in dictionary:
+                            dictionary[word] = []
+                        if url.strip() not in dictionary[word]:
+                            dictionary[word].append(url.strip())
+
+        index.write(json.dumps(dictionary))
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    args = parser.parse_args()
+    if args.rebuild:
+        build_index()
diff --git a/src/search.py b/src/search.py
new file mode 100755
index 0000000..1a71241
--- /dev/null
+++ b/src/search.py
@@ -0,0 +1,24 @@
+#!/usr/bin/python3
+
+from flask import Flask
+import json
+from urllib.parse import unquote
+
+app = Flask(__name__)
+
+@app.route("/search/<query>")
+def search(query):
+    with open('data/index.json', 'r') as index_json:
+        index = json.load(index_json)
+        query = unquote(query)
+        query_split = query.split()
+        result = []
+        for q in query_split:
+            q = q.lower()
+            if q in index:
+                # one list of matching URLs per query term
+                result.append(index[q])
+        print(result)
+        return result
+
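
Not part of the patch itself: a minimal sketch of how the search endpoint added in src/search.py might be exercised once the Flask app is running locally. The host, port, and example query below are assumptions (Flask's development server defaults to 127.0.0.1:5000); nothing in this commit starts the server or fixes those values.

# Hypothetical client for the /search/<query> route, assuming src/search.py is
# being served by the Flask development server and data/index.json has already
# been built by "python src/index.py -r".
import requests
from urllib.parse import quote

query = "example terms"  # illustrative query, not taken from the patch
response = requests.get(f"http://127.0.0.1:5000/search/{quote(query)}")
# The route returns one list of matching URLs per query term.
print(response.json())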