diff --git a/src/crawl.py b/src/crawl.py index 39ccdc2..dcac7ed 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -13,7 +13,7 @@ def get_html(url: str) -> str: response = requests.get(url) return response.content -def parse_html(url: str, html: str, recursion: int = 0) -> bool: +def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool: print(url) print(recursion) urlparts = urlparse(url) @@ -41,12 +41,13 @@ def parse_html(url: str, html: str, recursion: int = 0) -> bool: continue if not "http" in link: link = urljoin(url, link) - if (recursion > 0): + if (recursion > 0 and link not in traversed_links): try: + traversed_links.append(link) link_html = get_html(link) r = recursion -1 - sleep(0.5) - parse_html(link, link_html) + sleep(1) + parse_html(link, link_html, r, traversed_links) except: pass # else: