From 9fc2e1af537613f93ab62d509720733aea323fbe Mon Sep 17 00:00:00 2001
From: rmgr
Date: Wed, 6 Dec 2023 08:29:39 +1030
Subject: [PATCH] Implement recursive page crawling

---
 .gitignore                            |  4 +-
 src/__pycache__/search.cpython-39.pyc | Bin 679 -> 0 bytes
 src/crawl.py                          | 63 +++++++++++++++++---------
 src/index.py                          |  2 +-
 src/search.py                         |  4 +-
 5 files changed, 47 insertions(+), 26 deletions(-)
 delete mode 100644 src/__pycache__/search.cpython-39.pyc

diff --git a/.gitignore b/.gitignore
index 550d67d..9e56094 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
-src/__pycache__
+src/__pycache__/
+data
+env
diff --git a/src/__pycache__/search.cpython-39.pyc b/src/__pycache__/search.cpython-39.pyc
deleted file mode 100644
index 22a345c6aa6159e978e60df4597a596caf62a433..0000000000000000000000000000000000000000
GIT binary patch
[base85 payload for the deleted search.cpython-39.pyc omitted]
diff --git a/src/crawl.py b/src/crawl.py
--- a/src/crawl.py
+++ b/src/crawl.py
@@ ... @@ def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0) -> bool:
+    print(url)
+    print(recursion)
+    urlparts = urlparse(url)
+    baseurl = urlparts.scheme + "://" + urlparts.netloc
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
@@ -28,18 +36,26 @@ def parse_html(url: str, html: str) -> bool:
     links = soup.find_all("a")
     for link in links:
         found = False
-        if 'href' in link:
-            link = link["href"]
-        else:
+        link = link["href"]
+        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
         if not "http" in link:
-            link = url + "/" + link
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if line.strip() == link.strip():
-                    found = True
-            if not found:
-                linksfile.write(f'{link}\n')
+            link = urljoin(url, link)
+        if (recursion > 0):
+            try:
+                link_html = get_html(link)
+                r = recursion -1
+                sleep(0.5)
+                parse_html(link, link_html)
+            except:
+                pass
+#        else:
+#            with open(f'data/links.txt', 'r+') as linksfile:
+#                while line := linksfile.readline():
+#                    if line.strip() == link.strip():
+#                        found = True
+#                if not found:
+#                    linksfile.write(f'{link}\n')
 
 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
@@ -47,17 +63,20 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-
+    max_recursion = 4
     args = parser.parse_args()
     html = get_html(args.url)
-    parse_html(args.url, html)
+    parse_html(args.url, html, max_recursion)
 
-    if (args.followlinks):
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if "http" in line:
-                    try:
-                        parse_html(line, get_html(line))
-
-                    except:
-                        pass
+#    recursion = 0
+#    if (args.followlinks):
+#        with open(f'data/links.txt', 'r+') as linksfile:
+#            while line := linksfile.readline():
+#                if recursion < max_recursion:
+#                    if "http" in line:
+#                        recursion += 1
+#                        try:
+#                            parse_html(line, get_html(line))
+#                        except:
+#                            pass
+    os.remove('data/links.txt')
diff --git a/src/index.py b/src/index.py
index 83d4342..1947f56 100755
--- a/src/index.py
+++ b/src/index.py
@@ -27,7 +27,7 @@ def build_index():
                 if len(matching_urls) == 0:
                     # if not url.strip() in dictionary[word]:
                     entries = dictionary[word]
-                    entry = {"url": url.strip(), "count": 1}
+                    entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                     dictionary[word].append(entry)
                 else:
                     entries = dictionary[word]
diff --git a/src/search.py b/src/search.py
index 0dcb85e..17668f9 100755
--- a/src/search.py
+++ b/src/search.py
@@ -23,8 +23,8 @@ def search(query):
             result.append(item)
         else:
            matching_results[0]["count"] += item["count"]
-    #result.append(index[q])
-#    result.sort(reverse= True,key=lambda entry: int(entry.count))
     return result
 
 
+def handle_and():
+    pass
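
Note: the bounded recursion this patch works towards can be illustrated with the minimal sketch below. It is not the project's code: the crawl() helper, the seen set, the timeout value, and the start URL are assumptions, and unlike the patch it passes the decremented depth to the recursive call and tracks visited URLs in memory rather than in data/links.txt.

from time import sleep
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def get_html(url: str) -> bytes:
    # Fetch the raw page body; network errors are left to the caller.
    return requests.get(url, timeout=10).content


def crawl(url: str, depth: int, seen: set) -> None:
    # Stop on repeat visits or once the depth budget is exhausted.
    if url in seen or depth < 0:
        return
    seen.add(url)

    soup = BeautifulSoup(get_html(url), "html.parser")
    for anchor in soup.find_all("a", href=True):
        link = anchor["href"]
        # Skip in-page fragments and local links, as the patch does.
        if link.startswith("#") or "localhost" in link:
            continue
        # Resolve relative links against the current page.
        link = urljoin(url, link)
        if urlparse(link).scheme not in ("http", "https"):
            continue
        sleep(0.5)  # small delay between requests
        try:
            crawl(link, depth - 1, seen)
        except requests.RequestException:
            pass  # skip pages that fail to download


if __name__ == "__main__":
    crawl("https://example.com", depth=2, seen=set())

Keeping the visited set in memory avoids re-reading data/links.txt for every discovered link, which is what the commented-out blocks in the patch were doing.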