commit f36ab2fbfb556934838e00dfc152372cf6421456
Author: rmgr
Date:   Tue Nov 28 20:51:54 2023 +1030

    Initial commit

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3321311
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,21 @@
+beautifulsoup4==4.12.2
+blinker==1.7.0
+bs4==0.0.1
+certifi==2023.11.17
+chardet==5.2.0
+charset-normalizer==3.3.2
+click==8.1.7
+cssselect==1.2.0
+flask==3.0.0
+idna==3.6
+importlib-metadata==6.8.0
+itsdangerous==2.1.2
+Jinja2==3.1.2
+lxml==4.9.3
+MarkupSafe==2.1.3
+readability-lxml==0.8.1
+requests==2.31.0
+soupsieve==2.5
+urllib3==2.1.0
+werkzeug==3.0.1
+zipp==3.17.0
diff --git a/src/__pycache__/search.cpython-39.pyc b/src/__pycache__/search.cpython-39.pyc
new file mode 100644
index 0000000..22a345c
Binary files /dev/null and b/src/__pycache__/search.cpython-39.pyc differ
diff --git a/src/crawl.py b/src/crawl.py
new file mode 100755
index 0000000..c071595
--- /dev/null
+++ b/src/crawl.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python3
+import argparse
+import requests
+import hashlib
+import os
+from bs4 import BeautifulSoup
+
+def get_html(url: str) -> str:
+    response = requests.get(url)
+    return response.text
+
+def parse_html(url: str, html: str) -> None:
+    soup = BeautifulSoup(html, 'html.parser')
+    hash = hashlib.sha256()
+    hash.update(url.encode('utf-8'))
+    filename_text = hash.hexdigest() + '.txt'
+    filename_html = hash.hexdigest() + '.html'
+    with open(f'data/content/{filename_text}', 'w') as outfile:
+        outfile.write(url)
+        outfile.write('\n')
+        outfile.write(soup.get_text())
+    with open(f'data/content/{filename_html}', 'w') as outfile:
+        outfile.write(url)
+        outfile.write('\n')
+        outfile.write(soup.prettify())
+    x = open('data/links.txt', 'a')
+    x.close()
+    links = soup.find_all("a")
+    for link in links:
+        found = False
+        if link.has_attr('href'):
+            link = link["href"]
+        else:
+            continue
+        if "http" not in link:
+            link = url + "/" + link
+        with open('data/links.txt', 'r+') as linksfile:
+            while line := linksfile.readline():
+                if line.strip() == link.strip():
+                    found = True
+            if not found:
+                linksfile.write(f'{link}\n')
+
+if __name__ == "__main__":
+    os.makedirs("data/content", exist_ok=True)
+    # check inputs
+    parser = argparse.ArgumentParser()
+    parser.add_argument("url", help="URL of the webpage to be crawled")
+    parser.add_argument('-f', "--followlinks", action="store_true")
+
+    args = parser.parse_args()
+    html = get_html(args.url)
+    parse_html(args.url, html)
+
+    if args.followlinks:
+        with open('data/links.txt', 'r+') as linksfile:
+            while line := linksfile.readline():
+                if "http" in line:
+                    try:
+                        parse_html(line.strip(), get_html(line.strip()))
+
+                    except Exception:
+                        pass
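A note on the storage layout: parse_html names each saved page after the SHA-256 digest of its URL and writes two copies under data/content/, a .txt file with the extracted text and an .html file with the prettified markup, each with the source URL on its first line. Below is a minimal sketch of how a stored page could be located from its URL, assuming the hashing scheme used in parse_html above; the helper name content_paths_for is illustrative only and is not defined anywhere in the repository.

    import hashlib

    def content_paths_for(url: str) -> tuple[str, str]:
        # Same scheme as parse_html: SHA-256 of the URL, plus .txt/.html suffixes.
        digest = hashlib.sha256(url.encode('utf-8')).hexdigest()
        return (f'data/content/{digest}.txt', f'data/content/{digest}.html')

    # Example: find where a crawled page would have been written.
    text_path, html_path = content_paths_for('https://example.com')
    print(text_path, html_path)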
diff --git a/src/index.py b/src/index.py
new file mode 100755
index 0000000..781b9b6
--- /dev/null
+++ b/src/index.py
@@ -0,0 +1,36 @@
+#!/usr/bin/python3
+
+from pathlib import Path
+import argparse
+import os
+import json
+
+ignored_words = ['a', 'the', 'is']
+
+def build_index():
+    with open("data/index.json", "w") as index:
+        # get a list of all content files
+        # split on whitespace and add to index
+        dictionary = {}
+        pathlist = Path('data/content').rglob('*.txt')
+        for path in pathlist:
+            with open(str(path)) as content_file:
+                url = content_file.readline()
+                content = content_file.read()
+                content_words = content.split()
+                for word in content_words:
+                    word = word.lower()
+                    if word not in ignored_words:
+                        if word not in dictionary:
+                            dictionary[word] = []
+                        if url.strip() not in dictionary[word]:
+                            dictionary[word].append(url.strip())
+
+        index.write(json.dumps(dictionary))
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    args = parser.parse_args()
+    if args.rebuild:
+        build_index()
diff --git a/src/search.py b/src/search.py
new file mode 100755
index 0000000..1a71241
--- /dev/null
+++ b/src/search.py
@@ -0,0 +1,24 @@
+#!/usr/bin/python3
+
+from flask import Flask
+from flask import Request
+import json
+from urllib.parse import unquote
+
+app = Flask(__name__)
+
+@app.route("/search/<query>")
+def search(query):
+    with open('data/index.json', 'r') as index_json:
+        index = json.load(index_json)
+        query = unquote(query)
+        query_split = query.split()
+        result = []
+        for q in query_split:
+            q = q.lower()
+            if q in index:
+                result.append(index[q])
+        print(result)
+        return result
+
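Taken together, the three scripts form a crawl, index, search pipeline: crawl.py fetches pages into data/content/ and collects links in data/links.txt, index.py -r rebuilds the inverted index in data/index.json, and search.py serves lookups over HTTP. A rough usage sketch follows, assuming the commands are run from the repository root and that search.py has been started under Flask's development server on its default host and port; the invocations in the comments are assumptions, not commands defined by the commit.

    # 1. python3 src/crawl.py https://example.com -f   # crawl a page, optionally following its links
    # 2. python3 src/index.py -r                       # rebuild data/index.json from data/content/
    # 3. start src/search.py with Flask's development server, then query it:
    import requests
    from urllib.parse import quote

    query = "example search terms"
    response = requests.get(f"http://127.0.0.1:5000/search/{quote(query)}")
    # The endpoint returns one list of matching URLs per query word found in the index.
    print(response.json())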