Implement recursive page crawling
This commit is contained in:
parent 3d7b72e5ef
commit 9fc2e1af53
5 changed files with 47 additions and 26 deletions
src/crawl.py (63)
@@ -2,14 +2,22 @@
 import argparse
 import requests
 import hashlib
 from urllib.parse import urlparse, urljoin
 import os
+from time import sleep
 from bs4 import BeautifulSoup
 # TODO- Handle gemini/gopher links
+# TODO- Keep a list of traversed links and check before traversing again

 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content

-def parse_html(url: str, html: str) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0) -> bool:
     print(url)
+    print(recursion)
     urlparts = urlparse(url)
     baseurl = urlparts.scheme + "://" + urlparts.netloc
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
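urljoin, imported above, is what replaces the manual url + "/" + link concatenation further down when hrefs are resolved. A few illustrative calls, with hypothetical URLs, showing how it handles relative, root-relative, and absolute links:

    from urllib.parse import urljoin

    # Resolve a discovered href against the page it was found on.
    print(urljoin("https://example.com/blog/post.html", "about.html"))         # https://example.com/blog/about.html
    print(urljoin("https://example.com/blog/post.html", "/index.html"))        # https://example.com/index.html
    print(urljoin("https://example.com/blog/post.html", "https://other.org"))  # https://other.org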
@@ -28,18 +36,26 @@ def parse_html(url: str, html: str) -> bool:
     links = soup.find_all("a")
     for link in links:
         found = False
-        if 'href' in link:
-            link = link["href"]
-        else:
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if not "http" in link:
-            link = url + "/" + link
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if line.strip() == link.strip():
-                    found = True
-            if not found:
-                linksfile.write(f'{link}\n')
+        link = urljoin(url, link)
+        if (recursion > 0):
+            try:
+                link_html = get_html(link)
+                r = recursion - 1
+                sleep(0.5)
+                parse_html(link, link_html)
+            except:
+                pass
+        # else:
+        #     with open(f'data/links.txt', 'r+') as linksfile:
+        #         while line := linksfile.readline():
+        #             if line.strip() == link.strip():
+        #                 found = True
+        #             if not found:
+        #                 linksfile.write(f'{link}\n')

 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
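In the new branch the reduced depth is stored in r but the recursive call falls back to the default recursion=0, so the crawl currently stops one level down, and the traversed-links TODO is still open. A minimal sketch of how a depth limit and a visited set could combine, assuming an in-memory set; crawl, visited, and the 0.5 s delay are illustrative names and values, not part of this commit:

    from time import sleep
    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup

    visited = set()  # pages already traversed, checked before recursing again

    def crawl(url: str, depth: int) -> None:
        if depth <= 0 or url in visited:
            return
        visited.add(url)
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        for a in soup.find_all("a", href=True):
            link = urljoin(url, a["href"])          # resolve relative hrefs
            if link.startswith("http") and link not in visited:
                sleep(0.5)                          # small delay between requests
                crawl(link, depth - 1)              # pass the reduced depth down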
@@ -47,17 +63,20 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")

+    max_recursion = 4
     args = parser.parse_args()
     html = get_html(args.url)
-    parse_html(args.url, html)
+    parse_html(args.url, html, max_recursion)

-    if (args.followlinks):
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if "http" in line:
-                    try:
-                        parse_html(line, get_html(line))
-                    except:
-                        pass
+    # recursion = 0
+    # if (args.followlinks):
+    #     with open(f'data/links.txt', 'r+') as linksfile:
+    #         while line := linksfile.readline():
+    #             if recursion < max_recursion:
+    #                 if "http" in line:
+    #                     recursion += 1
+    #                     try:
+    #                         parse_html(line, get_html(line))
+    #                     except:
+    #                         pass
     os.remove('data/links.txt')
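For reference, example invocations after this change (the URL is a placeholder):

    python src/crawl.py https://example.com
    python src/crawl.py https://example.com --followlinks

The positional url argument is required; -f/--followlinks remains an optional flag.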