Fix recursive crawling.

rmgr 2023-12-12 17:35:15 +10:30
parent 21961fced6
commit b43343e0ee


@@ -13,7 +13,7 @@ def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str, recursion: int = 0) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
     print(url)
     print(recursion)
     urlparts = urlparse(url)
@@ -41,12 +41,13 @@ def parse_html(url: str, html: str, recursion: int = 0) -> bool:
             continue
         if not "http" in link:
             link = urljoin(url, link)
-        if (recursion > 0):
+        if (recursion > 0 and link not in traversed_links):
             try:
+                traversed_links.append(link)
                 link_html = get_html(link)
+                r = recursion -1
-                sleep(0.5)
-                parse_html(link, link_html)
+                sleep(1)
+                parse_html(link, link_html, r, traversed_links)
             except:
                 pass
         # else:
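
The fix threads two things through the recursive call that were previously dropped: the decremented depth (`r = recursion -1`) and a shared `traversed_links` list, so already-visited pages are not crawled again. For context, here is a minimal self-contained sketch of the same depth-limited, visited-set crawl pattern. All names (`crawl`, `visited`, `depth`, `LinkCollector`) are illustrative, not from this repository, and the stdlib `HTMLParser` stands in for whatever link extraction `parse_html` actually does:

from html.parser import HTMLParser
from urllib.parse import urljoin

import requests


class LinkCollector(HTMLParser):
    # Collects href values from anchor tags (illustrative stand-in for the
    # crawler's real link extraction).
    def __init__(self) -> None:
        super().__init__()
        self.links: list[str] = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)


def crawl(url: str, depth: int, visited: set[str] | None = None) -> None:
    # Thread the visited set through every recursive call, as the commit does
    # with traversed_links. Note that a mutable default like
    # traversed_links = [] is created once and shared across all top-level
    # calls in the same process, so the sketch defaults to None instead.
    if visited is None:
        visited = set()
    if depth < 0 or url in visited:
        return
    visited.add(url)
    try:
        html = requests.get(url, timeout=10).text
    except requests.RequestException:
        return
    collector = LinkCollector()
    collector.feed(html)
    for href in collector.links:
        # Resolve relative links and recurse one level shallower.
        crawl(urljoin(url, href), depth - 1, visited)


crawl("https://example.com", depth=1)

Creating the set inside the function when none is passed avoids Python's shared-mutable-default pitfall; the `traversed_links = []` default in the diff works for a single crawl, but the list would persist between separate top-level `parse_html` calls.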