From efe6dea1f575480d14db9f5df75848cd5bb44482 Mon Sep 17 00:00:00 2001
From: rmgr
Date: Mon, 1 Jan 2024 20:52:12 +1030
Subject: [PATCH] Fix crawling. Add initial linksfile crawling. Still need to
 remove records as they are processed.

---
 src/crawl.py | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/crawl.py b/src/crawl.py
index da6bffa..a0d2e64 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -47,7 +47,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     links = soup.find_all("a")
     for link in links:
         found = False
-        if "href" not in link:
+        if not hasattr(link, "href"):
             continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
@@ -63,13 +63,13 @@
                 parse_html(link, link_html, r, traversed_links)
             except:
                 pass
-#        else:
-#            with open(f'data/links.txt', 'r+') as linksfile:
-#                while line := linksfile.readline():
-#                    if line.strip() == link.strip():
-#                        found = True
-#                if not found:
-#                    linksfile.write(f'{link}\n')
+        elif link not in traversed_links:
+            with open(f'data/links.txt', 'r+') as linksfile:
+                while line := linksfile.readline():
+                    if line.strip() == link.strip():
+                        found = True
+                if not found:
+                    linksfile.write(f'{link}\n')
 
 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
@@ -77,21 +77,21 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-    max_recursion = 4
+    max_recursion = 2
     args = parser.parse_args()
-    html = get_html(args.url)
-    parse_html(args.url, html, max_recursion)
+    if args.url == "links":
+        with open(f'data/links.txt', 'r+') as linksfile:
+            while line := linksfile.readline():
+                if "http" in line:
+                    try:
+                        parse_html(line, get_html(line))
+                    except:
+                        pass
+
+    else:
+        html = get_html(args.url)
+        parse_html(args.url, html, max_recursion)
 
 #    recursion = 0
 #    if (args.followlinks):
-#        with open(f'data/links.txt', 'r+') as linksfile:
-#            while line := linksfile.readline():
-#                if recursion < max_recursion:
-#                    if "http" in line:
-#                        recursion += 1
-#                        try:
-#                            parse_html(line, get_html(line))
-#                        except:
-#                            pass
-
-    os.remove('data/links.txt')
+#    os.remove('data/links.txt')
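
Note (not applied by this patch): if link is a BeautifulSoup Tag, as
soup.find_all("a") suggests, hasattr(link, "href") tests for a Python
attribute rather than an HTML attribute, so anchors without an href may
still slip through to link["href"]. A minimal sketch of the attribute
check, assuming bs4 Tag objects:

    # Sketch only: assumes link is a bs4.element.Tag from soup.find_all("a").
    href = link.get("href")   # returns None when the anchor has no href
    if href is None:
        continue
    link = href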
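
The subject line notes that processed records still need to be removed from
data/links.txt. One possible shape for that follow-up, sketched under the
assumption that the links file stays small enough to rewrite in full
(pop_next_link is a hypothetical helper, not something already in crawl.py):

    import os

    LINKS_FILE = "data/links.txt"

    def pop_next_link():
        # Sketch only: return one pending link and rewrite the file without it.
        if not os.path.exists(LINKS_FILE):
            return None
        with open(LINKS_FILE, "r") as linksfile:
            pending = [line.strip() for line in linksfile if line.strip()]
        if not pending:
            return None
        next_link, remaining = pending[0], pending[1:]
        with open(LINKS_FILE, "w") as linksfile:
            for link in remaining:
                linksfile.write(f"{link}\n")
        return next_link

The "links" branch in __main__ could then call pop_next_link() until it
returns None instead of reading linksfile directly.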