Fix recursive crawling.

rmgr 2023-12-12 17:35:15 +10:30
parent 21961fced6
commit b43343e0ee


@@ -13,7 +13,7 @@ def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str, recursion: int = 0) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
     print(url)
     print(recursion)
     urlparts = urlparse(url)
@@ -41,12 +41,13 @@ def parse_html(url: str, html: str, recursion: int = 0) -> bool:
             continue
         if not "http" in link:
             link = urljoin(url, link)
-        if (recursion > 0):
+        if (recursion > 0 and link not in traversed_links):
             try:
+                traversed_links.append(link)
                 link_html = get_html(link)
+                r = recursion -1
-                sleep(0.5)
-                parse_html(link, link_html)
+                sleep(1)
+                parse_html(link, link_html, r, traversed_links)
             except:
                 pass
         # else:
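
The fix threads two things through the recursive call that were previously dropped: the decremented depth (`r = recursion -1`) and a shared `traversed_links` list, so already-visited pages are not crawled again. For context, here is a minimal self-contained sketch of the same depth-limited, visited-set crawl pattern. All names (`crawl`, `visited`, `depth`, `LinkCollector`) are illustrative, not from this repository, and the stdlib `HTMLParser` stands in for whatever link extraction `parse_html` actually does:

from html.parser import HTMLParser
from urllib.parse import urljoin

import requests


class LinkCollector(HTMLParser):
    # Collects href values from anchor tags (illustrative stand-in for the
    # crawler's real link extraction).
    def __init__(self) -> None:
        super().__init__()
        self.links: list[str] = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)


def crawl(url: str, depth: int, visited: set[str] | None = None) -> None:
    # Thread the visited set through every recursive call, as the commit does
    # with traversed_links. Note that a mutable default like
    # traversed_links = [] is created once and shared across all top-level
    # calls in the same process, so the sketch defaults to None instead.
    if visited is None:
        visited = set()
    if depth < 0 or url in visited:
        return
    visited.add(url)
    try:
        html = requests.get(url, timeout=10).text
    except requests.RequestException:
        return
    collector = LinkCollector()
    collector.feed(html)
    for href in collector.links:
        # Resolve relative links and recurse one level shallower.
        crawl(urljoin(url, href), depth - 1, visited)


crawl("https://example.com", depth=1)

Creating the set inside the function when none is passed avoids Python's shared-mutable-default pitfall; the `traversed_links = []` default in the diff works for a single crawl, but the list would persist between separate top-level `parse_html` calls.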