diff --git a/.gitignore b/.gitignore
index 550d67d..9e56094 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
-src/__pycache__
+src/__pycache__/
+data
+env
diff --git a/src/__pycache__/search.cpython-39.pyc b/src/__pycache__/search.cpython-39.pyc
deleted file mode 100644
index 22a345c..0000000
Binary files a/src/__pycache__/search.cpython-39.pyc and /dev/null differ
diff --git a/src/crawl.py b/src/crawl.py
index c071595..39ccdc2 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -2,14 +2,22 @@
 import argparse
 import requests
 import hashlib
+from urllib.parse import urlparse, urljoin
 import os
+from time import sleep
 from bs4 import BeautifulSoup
 
+# TODO- Handle gemini/gopher links
+# TODO- Keep a list of traversed links and check before traversing again
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0) -> bool:
+    print(url)
+    print(recursion)
+    urlparts = urlparse(url)
+    baseurl = urlparts.scheme + "://" + urlparts.netloc
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
@@ -28,18 +36,26 @@ def parse_html(url: str, html: str) -> bool:
     links = soup.find_all("a")
     for link in links:
         found = False
-        if 'href' in link:
-            link = link["href"]
-        else:
+        link = link["href"]
+        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
         if not "http" in link:
-            link = url + "/" + link
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if line.strip() == link.strip():
-                    found = True
-            if not found:
-                linksfile.write(f'{link}\n')
+            link = urljoin(url, link)
+        if (recursion > 0):
+            try:
+                link_html = get_html(link)
+                r = recursion - 1
+                sleep(0.5)
+                parse_html(link, link_html, r)
+            except:
+                pass
+#        else:
+#            with open(f'data/links.txt', 'r+') as linksfile:
+#                while line := linksfile.readline():
+#                    if line.strip() == link.strip():
+#                        found = True
+#                    if not found:
+#                        linksfile.write(f'{link}\n')
 
 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
@@ -47,17 +63,20 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true")
-
+    max_recursion = 4
     args = parser.parse_args()
     html = get_html(args.url)
-    parse_html(args.url, html)
+    parse_html(args.url, html, max_recursion)
 
-    if (args.followlinks):
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if "http" in line:
-                    try:
-                        parse_html(line, get_html(line))
-
-                    except:
-                        pass
+#    recursion = 0
+#    if (args.followlinks):
+#        with open(f'data/links.txt', 'r+') as linksfile:
+#            while line := linksfile.readline():
+#                if recursion < max_recursion:
+#                    if "http" in line:
+#                        recursion += 1
+#                        try:
+#                            parse_html(line, get_html(line))
+#                        except:
+#                            pass
+    os.remove('data/links.txt')
diff --git a/src/index.py b/src/index.py
index 83d4342..1947f56 100755
--- a/src/index.py
+++ b/src/index.py
@@ -27,7 +27,7 @@ def build_index():
             if len(matching_urls) == 0:
 #                if not url.strip() in dictionary[word]:
                 entries = dictionary[word]
-                entry = {"url": url.strip(), "count": 1}
+                entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                 dictionary[word].append(entry)
             else:
                 entries = dictionary[word]
diff --git a/src/search.py b/src/search.py
index 0dcb85e..17668f9 100755
--- a/src/search.py
+++ b/src/search.py
@@ -23,8 +23,8 @@ def search(query):
             result.append(item)
         else:
             matching_results[0]["count"] += item["count"]
-    #result.append(index[q])
-#    result.sort(reverse= True,key=lambda entry: int(entry.count))
     return result
 
 
+def handle_and():
+    pass
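
The crawl.py change above adds a TODO about keeping a list of traversed links and comments out the old data/links.txt bookkeeping. Below is a minimal sketch of one way to do that with an in-memory set; the crawl() helper and the visited set are placeholder names for illustration, not code from this repository.

# Sketch only, not part of this commit: dedupe crawled URLs with an in-memory
# set instead of the commented-out data/links.txt file walk.
import requests
from bs4 import BeautifulSoup
from time import sleep
from urllib.parse import urljoin

visited = set()  # hypothetical registry of URLs already fetched this run

def crawl(url: str, recursion: int = 0) -> None:
    if url in visited:        # the "check before traversing again" from the TODO
        return
    visited.add(url)
    html = requests.get(url).content
    # ...hash and store the page here, as parse_html() already does...
    if recursion <= 0:
        return
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.find_all("a", href=True):
        link = a["href"]
        if link.startswith("#") or "localhost" in link:
            continue
        sleep(0.5)                               # same politeness delay as above
        crawl(urljoin(url, link), recursion - 1)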