Fix recursive crawling.
parent 21961fced6
commit b43343e0ee

1 changed file with 5 additions and 4 deletions
@@ -13,7 +13,7 @@ def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str, recursion: int = 0) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
     print(url)
     print(recursion)
     urlparts = urlparse(url)
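A note on the new signature: the mutable default traversed_links = [] is evaluated once, when the function is defined, so every top-level call to parse_html shares the same list. For one crawl per process that is fine, but a second crawl in the same process would start with the first crawl's links already marked as traversed. A minimal sketch of the usual None-sentinel alternative (hypothetical, not part of this commit):

    # Sketch only: the None-sentinel idiom that avoids a shared mutable default.
    def parse_html(url: str, html: str, recursion: int = 0,
                   traversed_links=None) -> bool:
        if traversed_links is None:
            traversed_links = []  # fresh list for each top-level call
        ...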
@@ -41,12 +41,13 @@ def parse_html(url: str, html: str, recursion: int = 0) -> bool:
             continue
         if not "http" in link:
             link = urljoin(url, link)
-        if (recursion > 0):
+        if (recursion > 0 and link not in traversed_links):
             try:
+                traversed_links.append(link)
                 link_html = get_html(link)
                 r = recursion -1
-                sleep(0.5)
-                parse_html(link, link_html)
+                sleep(1)
+                parse_html(link, link_html, r, traversed_links)
             except:
                 pass
         # else:
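Why this fixes the recursion: the old call parse_html(link, link_html) dropped the depth argument, so every nested call ran with the default recursion = 0 and the crawl stopped one level down; the already-computed r was never used. The new call forwards the decremented depth together with the shared traversed_links list, which keeps the crawler from revisiting pages, and the delay between requests grows from 0.5 s to 1 s. Below is a sketch of how the whole function might read after this commit; the link-extraction step (BeautifulSoup here) and the loop around it are assumptions, since the diff does not show them:

    from time import sleep
    from urllib.parse import urljoin, urlparse

    import requests
    from bs4 import BeautifulSoup  # assumption: the parser is not visible in this diff

    def get_html(url: str) -> str:
        response = requests.get(url)
        return response.content  # note: .content is bytes despite the str annotation

    def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[]) -> bool:
        print(url)
        print(recursion)
        urlparts = urlparse(url)  # presumably used for link filtering not shown here
        soup = BeautifulSoup(html, "html.parser")  # assumed extraction step
        for link in (a.get("href") for a in soup.find_all("a")):
            if not link:
                continue
            if not "http" in link:
                link = urljoin(url, link)  # make relative links absolute
            if (recursion > 0 and link not in traversed_links):
                try:
                    traversed_links.append(link)  # mark visited before descending
                    link_html = get_html(link)
                    r = recursion - 1  # shrink the remaining depth budget
                    sleep(1)  # throttle between requests
                    parse_html(link, link_html, r, traversed_links)  # the actual fix
                except:
                    pass  # the commit swallows all errors on a failed fetch
        return True  # assumed; the diff does not show the return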