From b43343e0eee5239aa5ca199c6afe5cc06fc23324 Mon Sep 17 00:00:00 2001 From: rmgr Date: Tue, 12 Dec 2023 17:35:15 +1030 Subject: [PATCH] Fix recursive crawling. --- src/crawl.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index 39ccdc2..dcac7ed 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -13,7 +13,7 @@ def get_html(url: str) -> str: response = requests.get(url) return response.content -def parse_html(url: str, html: str, recursion: int = 0) -> bool: +def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool: print(url) print(recursion) urlparts = urlparse(url) @@ -41,12 +41,13 @@ def parse_html(url: str, html: str, recursion: int = 0) -> bool: continue if not "http" in link: link = urljoin(url, link) - if (recursion > 0): + if (recursion > 0 and link not in traversed_links): try: + traversed_links.append(link) link_html = get_html(link) r = recursion -1 - sleep(0.5) - parse_html(link, link_html) + sleep(1) + parse_html(link, link_html, r, traversed_links) except: pass # else: