Initial commit

rmgr 2023-11-28 20:51:54 +10:30
commit f36ab2fbfb
5 changed files with 144 additions and 0 deletions

src/crawl.py (new executable file, 63 lines)

@@ -0,0 +1,63 @@
#!/usr/bin/python3
import argparse
import hashlib
import os

import requests
from bs4 import BeautifulSoup


def get_html(url: str) -> str:
    # Fetch the page and return the decoded response body.
    response = requests.get(url)
    return response.text


def parse_html(url: str, html: str) -> None:
    soup = BeautifulSoup(html, 'html.parser')
    # Name the output files after the SHA-256 hash of the URL.
    url_hash = hashlib.sha256(url.encode('utf-8')).hexdigest()
    filename_text = url_hash + '.txt'
    filename_html = url_hash + '.html'
    # Save the extracted text and the prettified HTML, each prefixed
    # with the source URL on its own line.
    with open(f'data/content/{filename_text}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.get_text())
    with open(f'data/content/{filename_html}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.prettify())
    # Ensure the links file exists before opening it in r+ mode below.
    open('data/links.txt', 'a').close()
    links = soup.find_all("a")
    for link in links:
        found = False
        if link.has_attr("href"):
            link = link["href"]
        else:
            continue
        # Treat links without a scheme as relative to the page URL.
        if "http" not in link:
            link = url + "/" + link
        # Record the link only if it has not been seen before.
        with open('data/links.txt', 'r+') as linksfile:
            while line := linksfile.readline():
                if line.strip() == link.strip():
                    found = True
            if not found:
                linksfile.write(f'{link}\n')


if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)
    # check inputs
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true",
                        help="also crawl every link collected from the page")
    args = parser.parse_args()
    html = get_html(args.url)
    parse_html(args.url, html)
    if args.followlinks:
        # Crawl every link recorded in data/links.txt.
        with open('data/links.txt', 'r') as linksfile:
            while line := linksfile.readline():
                line = line.strip()
                if "http" in line:
                    try:
                        parse_html(line, get_html(line))
                    except Exception:
                        pass
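
A minimal usage sketch, assuming the script is executable and run from the repository root (the data/ directory is created on first run; example.com is a placeholder URL):

    ./src/crawl.py https://example.com
    ./src/crawl.py -f https://example.com

With -f/--followlinks, every link collected into data/links.txt is fetched and parsed as well; per-page output lands in data/content/ as <sha256-of-url>.txt and .html.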