Implement recursive page crawling
parent 3d7b72e5ef
commit 9fc2e1af53
5 changed files with 47 additions and 26 deletions
.gitignore (vendored): 4 changes

@@ -1 +1,3 @@
-src/__pycache__
+src/__pycache__/
+data
+env
Binary file not shown.
src/crawl.py: 61 changes

@@ -2,14 +2,22 @@
 import argparse
 import requests
 import hashlib
+from urllib.parse import urlparse, urljoin
 import os
+from time import sleep
 from bs4 import BeautifulSoup
+# TODO- Handle gemini/gopher links
+# TODO- Keep a list of traversed links and check before traversing again
 
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0) -> bool:
+    print(url)
+    print(recursion)
+    urlparts = urlparse(url)
+    baseurl = urlparts.scheme + "://" + urlparts.netloc
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
@@ -28,18 +36,26 @@ def parse_html(url: str, html: str) -> bool:
     links = soup.find_all("a")
     for link in links:
         found = False
-        if 'href' in link:
-            link = link["href"]
-        else:
+        link = link["href"]
+        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
         if not "http" in link:
-            link = url + "/" + link
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if line.strip() == link.strip():
-                    found = True
-            if not found:
-                linksfile.write(f'{link}\n')
+            link = urljoin(url, link)
+        if (recursion > 0):
+            try:
+                link_html = get_html(link)
+                r = recursion -1
+                sleep(0.5)
+                parse_html(link, link_html)
+            except:
+                pass
+        # else:
+        #     with open(f'data/links.txt', 'r+') as linksfile:
+        #         while line := linksfile.readline():
+        #             if line.strip() == link.strip():
+        #                 found = True
+        #     if not found:
+        #         linksfile.write(f'{link}\n')
 
 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
@@ -47,17 +63,20 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
+    max_recursion = 4
     args = parser.parse_args()
     html = get_html(args.url)
-    parse_html(args.url, html)
+    parse_html(args.url, html, max_recursion)
 
-    if (args.followlinks):
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if "http" in line:
-                    try:
-                        parse_html(line, get_html(line))
-                    except:
-                        pass
+    # recursion = 0
+    # if (args.followlinks):
+    #     with open(f'data/links.txt', 'r+') as linksfile:
+    #         while line := linksfile.readline():
+    #             if recursion < max_recursion:
+    #                 if "http" in line:
+    #                     recursion += 1
+    #                     try:
+    #                         parse_html(line, get_html(line))
+    #                     except:
+    #                         pass
+    os.remove('data/links.txt')
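For reference, a minimal sketch of the same crawl with the two TODOs folded in: the depth limit is threaded through the recursive call (the committed branch computes r = recursion -1 but never passes it on, so the crawl only descends one level regardless of max_recursion), and an in-memory set of visited URLs stands in for data/links.txt. The crawl function name, the visited parameter, and the request timeout are illustrative assumptions, not part of this commit.

import requests
from urllib.parse import urljoin, urlparse
from time import sleep
from bs4 import BeautifulSoup

def crawl(url: str, depth: int, visited: set[str]) -> None:
    # Stop when the depth budget is spent or the page was already crawled.
    if depth <= 0 or url in visited:
        return
    visited.add(url)
    try:
        html = requests.get(url, timeout=10).content
    except requests.RequestException:
        return
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.find_all("a", href=True):
        link = a["href"]
        if link.startswith("#") or "localhost" in link:
            continue
        link = urljoin(url, link)          # resolve relative links against the page URL
        if urlparse(link).scheme not in ("http", "https"):
            continue                       # skip gemini/gopher/mailto etc.
        sleep(0.5)                         # be polite between requests
        crawl(link, depth - 1, visited)    # pass the reduced depth down

# usage: crawl("https://example.com", depth=4, visited=set())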
@@ -27,7 +27,7 @@ def build_index():
             if len(matching_urls) == 0:
                 # if not url.strip() in dictionary[word]:
                 entries = dictionary[word]
-                entry = {"url": url.strip(), "count": 1}
+                entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                 dictionary[word].append(entry)
             else:
                 entries = dictionary[word]
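The index entry now records the on-disk filename alongside the URL and the term count. As a rough sketch of the data structure this hunk implies, assuming dictionary maps each word to a list of such entries (the helper name add_occurrence and its call site are hypothetical, not shown in the commit):

from collections import defaultdict
from pathlib import Path

# dictionary: word -> list of {"url": str, "count": int, "filename": str}
dictionary: dict[str, list[dict]] = defaultdict(list)

def add_occurrence(word: str, url: str, path: Path) -> None:
    # Mirror the matching_urls check: one entry per (word, url) pair.
    matching = [e for e in dictionary[word] if e["url"] == url.strip()]
    if len(matching) == 0:
        dictionary[word].append({"url": url.strip(), "count": 1, "filename": str(path)})
    else:
        matching[0]["count"] += 1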
@@ -23,8 +23,8 @@ def search(query):
                 result.append(item)
             else:
                 matching_results[0]["count"] += item["count"]
-    #result.append(index[q])
-    # result.sort(reverse= True,key=lambda entry: int(entry.count))
     return result
 
+def handle_and():
+    pass
 
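This hunk sums the counts of duplicate URLs and drops the commented-out ranking line. A possible follow-up, sketched under the assumption that each result is one of the entry dicts above (the rank name is illustrative, not part of the commit):

def rank(results: list[dict]) -> list[dict]:
    # Highest aggregate count first; count is stored as an int, so no cast is needed.
    return sorted(results, key=lambda entry: entry["count"], reverse=True)

# e.g. return rank(result) instead of return result at the end of search()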