From f36ab2fbfb556934838e00dfc152372cf6421456 Mon Sep 17 00:00:00 2001
From: rmgr
Date: Tue, 28 Nov 2023 20:51:54 +1030
Subject: [PATCH] Initial commit

---
 requirements.txt                      | 21 +++++++++
 src/__pycache__/search.cpython-39.pyc | Bin 0 -> 679 bytes
 src/crawl.py                          | 63 ++++++++++++++++++++++++++
 src/index.py                          | 36 +++++++++++++++
 src/search.py                         | 24 ++++++++++
 5 files changed, 144 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 src/__pycache__/search.cpython-39.pyc
 create mode 100755 src/crawl.py
 create mode 100755 src/index.py
 create mode 100755 src/search.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3321311
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,21 @@
+beautifulsoup4==4.12.2
+blinker==1.7.0
+bs4==0.0.1
+certifi==2023.11.17
+chardet==5.2.0
+charset-normalizer==3.3.2
+click==8.1.7
+cssselect==1.2.0
+flask==3.0.0
+idna==3.6
+importlib-metadata==6.8.0
+itsdangerous==2.1.2
+Jinja2==3.1.2
+lxml==4.9.3
+MarkupSafe==2.1.3
+readability-lxml==0.8.1
+requests==2.31.0
+soupsieve==2.5
+urllib3==2.1.0
+werkzeug==3.0.1
+zipp==3.17.0
diff --git a/src/__pycache__/search.cpython-39.pyc b/src/__pycache__/search.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22a345c6aa6159e978e60df4597a596caf62a433
GIT binary patch
literal 679
[binary literal not recoverable]

diff --git a/src/crawl.py b/src/crawl.py
new file mode 100755
--- /dev/null
+++ b/src/crawl.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python3
+import requests
+from bs4 import BeautifulSoup
+import hashlib
+import os
+import argparse
+
+def get_html(url: str) -> bytes:
+    response = requests.get(url)
+    return response.content
+
+def parse_html(url: str, html: bytes) -> None:
+    soup = BeautifulSoup(html, 'html.parser')
+    hash = hashlib.sha256()
+    hash.update(url.encode('ascii'))
+    filename_text = hash.hexdigest() + '.txt'
+    filename_html = hash.hexdigest() + '.html'
+    # store the extracted text and the raw HTML under a hash of the URL
+    with open(f'data/content/{filename_text}', 'w') as outfile:
+        outfile.write(url)
+        outfile.write('\n')
+        outfile.write(soup.get_text())
+    with open(f'data/content/{filename_html}', 'w') as outfile:
+        outfile.write(url)
+        outfile.write('\n')
+        outfile.write(soup.prettify())
+    # make sure the links file exists before it is read back below
+    open('data/links.txt', 'a').close()
+    links = soup.find_all("a")
+    for link in links:
+        found = False
+        if link.has_attr('href'):
+            link = link["href"]
+        else:
+            continue
+        if "http" not in link:
+            link = url + "/" + link
+        # only record links that haven't been seen before
+        with open('data/links.txt', 'r+') as linksfile:
+            while line := linksfile.readline():
+                if line.strip() == link.strip():
+                    found = True
+            if not found:
+                linksfile.write(f'{link}\n')
+
+if __name__ == "__main__":
+    os.makedirs("data/content", exist_ok=True)
+    # check inputs
+    parser = argparse.ArgumentParser()
+    parser.add_argument("url", help="URL of the webpage to be crawled")
+    parser.add_argument('-f', "--followlinks", action="store_true")
+
+    args = parser.parse_args()
+    html = get_html(args.url)
+    parse_html(args.url, html)
+
+    if args.followlinks:
+        with open('data/links.txt', 'r+') as linksfile:
+            while line := linksfile.readline():
+                if "http" in line:
+                    try:
+                        parse_html(line.strip(), get_html(line.strip()))
+
+                    except Exception:
+                        pass
diff --git a/src/index.py b/src/index.py
new file mode 100755
index 0000000..781b9b6
--- /dev/null
+++ b/src/index.py
@@ -0,0 +1,36 @@
+#!/usr/bin/python3
+
+from pathlib import Path
+import argparse
+import os
+import json
+
+ignored_words = ['a', 'the', 'is']
+
+def build_index():
+    with open("data/index.json", "w") as index:
+        # get a list of all content files
+        # split on whitespace and add to index
+        dictionary = {}
+        pathlist = Path('data/content').rglob('*.txt')
+        for path in pathlist:
+            with open(str(path)) as content_file:
+                url = content_file.readline()
+                content = content_file.read()
+                content_words = content.split()
+                for word in content_words:
+                    word = word.lower()
+                    if word not in ignored_words:
+                        if word not in dictionary:
+                            dictionary[word] = []
+                        if url.strip() not in dictionary[word]:
+                            dictionary[word].append(url.strip())
+
+        index.write(json.dumps(dictionary))
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    args = parser.parse_args()
+    if args.rebuild:
+        build_index()
diff --git a/src/search.py b/src/search.py
new file mode 100755
index 0000000..1a71241
--- /dev/null
+++ b/src/search.py
@@ -0,0 +1,24 @@
+#!/usr/bin/python3
+
+from flask import Flask
+import json
+from urllib.parse import unquote
+
+app = Flask(__name__)
+
+@app.route("/search/<query>")
+def search(query):
+    with open('data/index.json', 'r') as index_json:
+        index = json.load(index_json)
+        query = unquote(query)
+        query_split = query.split()
+        result = []
+        for q in query_split:
+            q = q.lower()
+            if q in index:
+                # one list of matching URLs per query term
+                result.append(index[q])
+        print(result)
+        return result
+
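
Not part of the patch itself: a minimal sketch of how the search endpoint added in src/search.py might be exercised once the Flask app is running locally. The host, port, and example query below are assumptions (Flask's development server defaults to 127.0.0.1:5000); nothing in this commit starts the server or fixes those values.

# Hypothetical client for the /search/<query> route, assuming src/search.py is
# being served by the Flask development server and data/index.json has already
# been built by "python src/index.py -r".
import requests
from urllib.parse import quote

query = "example terms"  # illustrative query, not taken from the patch
response = requests.get(f"http://127.0.0.1:5000/search/{quote(query)}")
# The route returns one list of matching URLs per query term.
print(response.json())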