Fix crawling. Add initial linksfile crawling. Still need to remove records as they are processed.

rmgr 2024-01-01 20:52:12 +10:30
parent f4ea8ad1d7
commit efe6dea1f5


@@ -47,7 +47,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
links = soup.find_all("a")
for link in links:
found = False
if "href" not in link:
if not hasattr(link, "href"):
continue
link = link["href"]
if (len(link) > 0 and link[0] == "#") or "localhost" in link:
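
Aside, not part of this diff: on a BeautifulSoup Tag, plain attribute access falls back to a child-tag search, so hasattr(link, "href") tends to be truthy even when the anchor has no href attribute. The usual presence checks are Tag.has_attr("href"), "href" in link.attrs, or link.get("href"). A minimal standalone sketch, assuming bs4 is what produced soup.find_all("a") above:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<a>no href</a><a href="/page">ok</a>', "html.parser")
    for link in soup.find_all("a"):
        if not link.has_attr("href"):   # dict-style presence check on Tag attributes
            continue
        print(link.get("href"))         # prints "/page"; .get() returns None when absent
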
@@ -63,13 +63,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
parse_html(link, link_html, r, traversed_links)
except:
pass
-# else:
-# with open(f'data/links.txt', 'r+') as linksfile:
-# while line := linksfile.readline():
-# if line.strip() == link.strip():
-# found = True
-# if not found:
-# linksfile.write(f'{link}\n')
+elif link not in traversed_links:
+with open(f'data/links.txt', 'r+') as linksfile:
+while line := linksfile.readline():
+if line.strip() == link.strip():
+found = True
+if not found:
+linksfile.write(f'{link}\n')
if __name__ == "__main__":
os.makedirs("data/content", exist_ok=True)
@@ -77,21 +77,21 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("url", help="URL of the webpage to be crawled")
parser.add_argument('-f', "--followlinks", action="store_true")
-max_recursion = 4
+max_recursion = 2
args = parser.parse_args()
-html = get_html(args.url)
-parse_html(args.url, html, max_recursion)
+if args.url == "links":
+with open(f'data/links.txt', 'r+') as linksfile:
+while line := linksfile.readline():
+if "http" in line:
+try:
+parse_html(line, get_html(line))
+except:
+pass
+else:
+html = get_html(args.url)
+parse_html(args.url, html, max_recursion)
# recursion = 0
# if (args.followlinks):
# with open(f'data/links.txt', 'r+') as linksfile:
# while line := linksfile.readline():
# if recursion < max_recursion:
# if "http" in line:
# recursion += 1
# try:
# parse_html(line, get_html(line))
# except:
# pass
-os.remove('data/links.txt')
+# os.remove('data/links.txt')
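
With this change, passing the literal argument "links" as the url crawls the saved links file instead of a single page (e.g. python crawler.py links, where the script name is a guess). The commit message notes that records still need to be removed as they are processed; the os.remove call that used to delete the whole file at the end is now commented out. One way per-record removal could look, sketched with a hypothetical pop_link helper rather than code from this repository:

    def pop_link(path="data/links.txt"):
        # Take the first non-empty line out of the links file and rewrite the file without it.
        with open(path, "r+") as linksfile:
            lines = [line for line in linksfile if line.strip()]
            if not lines:
                return None
            linksfile.seek(0)
            linksfile.writelines(lines[1:])
            linksfile.truncate()
        return lines[0].strip()

    # Usage sketch, reusing get_html/parse_html from the crawler:
    # while (url := pop_link()) is not None:
    #     parse_html(url, get_html(url))
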