Fix crawling. Add initial linksfile crawling. Still need to remove records as they are processed.
parent f4ea8ad1d7
commit efe6dea1f5

1 changed file with 22 additions and 22 deletions

src/crawl.py
@@ -47,7 +47,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     links = soup.find_all("a")
     for link in links:
         found = False
-        if "href" not in link:
+        if not hasattr(link, "href"):
             continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
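For reference on the attribute check changed above: in BeautifulSoup, neither "href" in tag nor hasattr(tag, "href") inspects a tag's HTML attributes (both are routed to child lookups), so the library's own test is tag.has_attr("href") or "href" in tag.attrs. Below is a minimal sketch of the same filtering loop written that way; the HTML string is a stand-in, not the project's fetched page.

from bs4 import BeautifulSoup

page_html = '<a href="https://example.com/page">ok</a><a name="top">no href</a>'  # stand-in page
soup = BeautifulSoup(page_html, "html.parser")

for link in soup.find_all("a"):
    # has_attr() (equivalently: "href" in link.attrs) tests the HTML attribute;
    # "href" in link checks the tag's children, and hasattr(link, "href") looks
    # for a child <href> tag, so neither of those examines the attribute itself.
    if not link.has_attr("href"):
        continue
    href = link["href"]
    # skip in-page anchors and local links, mirroring the checks in the hunk above
    if href.startswith("#") or "localhost" in href:
        continue
    print(href)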
@@ -63,13 +63,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
                 parse_html(link, link_html, r, traversed_links)
             except:
                 pass
-        # else:
-        #     with open(f'data/links.txt', 'r+') as linksfile:
-        #         while line := linksfile.readline():
-        #             if line.strip() == link.strip():
-        #                 found = True
-        #         if not found:
-        #             linksfile.write(f'{link}\n')
+        elif link not in traversed_links:
+            with open(f'data/links.txt', 'r+') as linksfile:
+                while line := linksfile.readline():
+                    if line.strip() == link.strip():
+                        found = True
+                if not found:
+                    linksfile.write(f'{link}\n')
 
 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
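The block uncommented in this hunk is the "initial linksfile crawling" bookkeeping from the commit message: a link that was not followed and has not been traversed is compared line by line against data/links.txt and appended only if it is not already there. The same idea as a standalone helper sketch; record_link is an illustrative name, not a function in the repo, and like the diff it assumes data/links.txt already exists, since 'r+' does not create files.

def record_link(link: str, path: str = "data/links.txt") -> None:
    # Scan the whole file for an identical entry before writing anything.
    found = False
    with open(path, "r+") as linksfile:
        while line := linksfile.readline():
            if line.strip() == link.strip():
                found = True
        # After the read loop the file position sits at end-of-file,
        # so this write appends a new record rather than overwriting one.
        if not found:
            linksfile.write(f"{link}\n")

Breaking out of the read loop on the first match would be a small optimisation; the sketch keeps the diff's full scan.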
@@ -77,21 +77,21 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-    max_recursion = 4
+    max_recursion = 2
     args = parser.parse_args()
-    html = get_html(args.url)
-    parse_html(args.url, html, max_recursion)
+    if args.url == "links":
+        with open(f'data/links.txt', 'r+') as linksfile:
+            while line := linksfile.readline():
+                if "http" in line:
+                    try:
+                        parse_html(line, get_html(line))
+                    except:
+                        pass
+
+    else:
+        html = get_html(args.url)
+        parse_html(args.url, html, max_recursion)
 
     # recursion = 0
     # if (args.followlinks):
-    #     with open(f'data/links.txt', 'r+') as linksfile:
-    #         while line := linksfile.readline():
-    #             if recursion < max_recursion:
-    #                 if "http" in line:
-    #                     recursion += 1
-    #                     try:
-    #                         parse_html(line, get_html(line))
-    #                     except:
-    #                         pass
-
-    os.remove('data/links.txt')
+    #     os.remove('data/links.txt')
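Taken together, the __main__ changes give src/crawl.py two modes: a normal URL argument crawls that page as before (with max_recursion lowered from 4 to 2), while the literal argument links replays the queue in data/links.txt recorded by earlier runs. A condensed sketch of that replay loop as it might sit in src/crawl.py, reusing its own get_html and parse_html; crawl_linksfile is an illustrative name only.

def crawl_linksfile(path: str = "data/links.txt") -> None:
    # Replay every recorded URL; lines not containing "http" are skipped.
    with open(path, "r") as linksfile:
        while line := linksfile.readline():
            if "http" not in line:
                continue
            try:
                parse_html(line, get_html(line))
            except Exception:
                # keep going if one page fails, as the blanket except in the diff does
                pass
    # Per the commit message, processed lines are not removed yet,
    # so running the "links" mode twice re-crawls the same URLs.

Invocation stays the same otherwise: python src/crawl.py https://example.com crawls one page and fills data/links.txt, and python src/crawl.py links works through that file afterwards.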