From 9fc2e1af537613f93ab62d509720733aea323fbe Mon Sep 17 00:00:00 2001
From: rmgr
Date: Wed, 6 Dec 2023 08:29:39 +1030
Subject: [PATCH] Implement recursive page crawling

---
 .gitignore                            |  4 +-
 src/__pycache__/search.cpython-39.pyc | Bin 679 -> 0 bytes
 src/crawl.py                          | 63 +++++++++++++++++---------
 src/index.py                          |  2 +-
 src/search.py                         |  4 +-
 5 files changed, 47 insertions(+), 26 deletions(-)
 delete mode 100644 src/__pycache__/search.cpython-39.pyc

diff --git a/.gitignore b/.gitignore
index 550d67d..9e56094 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
-src/__pycache__
+src/__pycache__/
+data
+env
diff --git a/src/__pycache__/search.cpython-39.pyc b/src/__pycache__/search.cpython-39.pyc
deleted file mode 100644
index 22a345c6aa6159e978e60df4597a596caf62a433..0000000000000000000000000000000000000000
GIT binary patch
[base85 payload for the deleted search.cpython-39.pyc omitted]
diff --git a/src/crawl.py b/src/crawl.py
--- a/src/crawl.py
+++ b/src/crawl.py
@@ ... @@ def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0) -> bool:
+    print(url)
+    print(recursion)
+    urlparts = urlparse(url)
+    baseurl = urlparts.scheme + "://" + urlparts.netloc
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
@@ -28,18 +36,26 @@ def parse_html(url: str, html: str) -> bool:
     links = soup.find_all("a")
     for link in links:
         found = False
-        if 'href' in link:
-            link = link["href"]
-        else:
+        link = link["href"]
+        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
         if not "http" in link:
-            link = url + "/" + link
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if line.strip() == link.strip():
-                    found = True
-            if not found:
-                linksfile.write(f'{link}\n')
+            link = urljoin(url, link)
+        if (recursion > 0):
+            try:
+                link_html = get_html(link)
+                r = recursion -1
+                sleep(0.5)
+                parse_html(link, link_html)
+            except:
+                pass
+#        else:
+#            with open(f'data/links.txt', 'r+') as linksfile:
+#                while line := linksfile.readline():
+#                    if line.strip() == link.strip():
+#                        found = True
+#                if not found:
+#                    linksfile.write(f'{link}\n')
 
 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
@@ -47,17 +63,20 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-
+    max_recursion = 4
     args = parser.parse_args()
     html = get_html(args.url)
-    parse_html(args.url, html)
+    parse_html(args.url, html, max_recursion)
 
-    if (args.followlinks):
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if "http" in line:
-                    try:
-                        parse_html(line, get_html(line))
-
-                    except:
-                        pass
+#    recursion = 0
+#    if (args.followlinks):
+#        with open(f'data/links.txt', 'r+') as linksfile:
+#            while line := linksfile.readline():
+#                if recursion < max_recursion:
+#                    if "http" in line:
+#                        recursion += 1
+#                        try:
+#                            parse_html(line, get_html(line))
+#                        except:
+#                            pass
+    os.remove('data/links.txt')
diff --git a/src/index.py b/src/index.py
index 83d4342..1947f56 100755
--- a/src/index.py
+++ b/src/index.py
@@ -27,7 +27,7 @@ def build_index():
                 if len(matching_urls) == 0:
                     # if not url.strip() in dictionary[word]:
                     entries = dictionary[word]
-                    entry = {"url": url.strip(), "count": 1}
+                    entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                     dictionary[word].append(entry)
                 else:
                     entries = dictionary[word]
diff --git a/src/search.py b/src/search.py
index 0dcb85e..17668f9 100755
--- a/src/search.py
+++ b/src/search.py
@@ -23,8 +23,8 @@ def search(query):
             result.append(item)
         else:
            matching_results[0]["count"] += item["count"]
-    #result.append(index[q])
-#    result.sort(reverse= True,key=lambda entry: int(entry.count))
     return result
 
 
+def handle_and():
+    pass
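
Note: the bounded recursion this patch works towards can be illustrated with the minimal sketch below. It is not the project's code: the crawl() helper, the seen set, the timeout value, and the start URL are assumptions, and unlike the patch it passes the decremented depth to the recursive call and tracks visited URLs in memory rather than in data/links.txt.

from time import sleep
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def get_html(url: str) -> bytes:
    # Fetch the raw page body; network errors are left to the caller.
    return requests.get(url, timeout=10).content


def crawl(url: str, depth: int, seen: set) -> None:
    # Stop on repeat visits or once the depth budget is exhausted.
    if url in seen or depth < 0:
        return
    seen.add(url)

    soup = BeautifulSoup(get_html(url), "html.parser")
    for anchor in soup.find_all("a", href=True):
        link = anchor["href"]
        # Skip in-page fragments and local links, as the patch does.
        if link.startswith("#") or "localhost" in link:
            continue
        # Resolve relative links against the current page.
        link = urljoin(url, link)
        if urlparse(link).scheme not in ("http", "https"):
            continue
        sleep(0.5)  # small delay between requests
        try:
            crawl(link, depth - 1, seen)
        except requests.RequestException:
            pass  # skip pages that fail to download


if __name__ == "__main__":
    crawl("https://example.com", depth=2, seen=set())

Keeping the visited set in memory avoids re-reading data/links.txt for every discovered link, which is what the commented-out blocks in the patch were doing.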