Fix recursive crawling.
parent 21961fced6
commit b43343e0ee
1 changed file with 5 additions and 4 deletions
@@ -13,7 +13,7 @@ def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content

-def parse_html(url: str, html: str, recursion: int = 0) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
     print(url)
     print(recursion)
     urlparts = urlparse(url)
@@ -41,12 +41,13 @@ def parse_html(url: str, html: str, recursion: int = 0) -> bool:
             continue
         if not "http" in link:
             link = urljoin(url, link)
-        if (recursion > 0):
+        if (recursion > 0 and link not in traversed_links):
             try:
+                traversed_links.append(link)
                 link_html = get_html(link)
                 r = recursion -1
-                sleep(0.5)
-                parse_html(link, link_html)
+                sleep(1)
+                parse_html(link, link_html, r, traversed_links)
             except:
                 pass
         # else:
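For context, a minimal runnable sketch of the crawler after this fix. Function and variable names follow the diff; the link-extraction step (BeautifulSoup), the imports, the start URL, and the None-sentinel default are assumptions filled in for illustration, since the commit only shows the two hunks above.

from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def get_html(url: str) -> str:
    # As in the diff context; note that response.content is bytes,
    # despite the str annotation carried over from the original.
    response = requests.get(url)
    return response.content


def parse_html(url: str, html: str, recursion: int = 0, traversed_links=None) -> bool:
    # None sentinel instead of the committed mutable default ([]),
    # so each top-level call starts with a fresh visited list.
    if traversed_links is None:
        traversed_links = []
    print(url)
    print(recursion)
    soup = BeautifulSoup(html, "html.parser")  # extraction step assumed
    for anchor in soup.find_all("a"):
        link = anchor.get("href")
        if not link:
            continue
        if "http" not in link:
            link = urljoin(url, link)  # resolve relative links
        if recursion > 0 and link not in traversed_links:
            try:
                # Record the link before fetching so the same URL is
                # never requested twice, even if this fetch fails.
                traversed_links.append(link)
                link_html = get_html(link)
                sleep(1)  # throttle between requests
                parse_html(link, link_html, recursion - 1, traversed_links)
            except Exception:
                pass
    return True


if __name__ == "__main__":
    start = "https://example.com/"  # hypothetical start URL
    parse_html(start, get_html(start), recursion=2)

One design note: the committed signature uses a mutable default (traversed_links = []). Python evaluates default values once at definition time, so that list is shared across every top-level call that omits the argument; the None sentinel above sidesteps that, at the cost of diverging slightly from the committed code.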