Tidy up crawling and implement boolean search

rmgr 2024-04-04 20:46:34 +10:30
parent d4bb3fb8dc
commit 7ee9d978b2
4 changed files with 91 additions and 30 deletions

@@ -40,21 +40,21 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
if not rp.can_fetch("*", url):
print("Robots prevents crawling url: " + url)
return
-soup = BeautifulSoup(html,'html.parser')
+soup = BeautifulSoup(html, 'html.parser')
hash = hashlib.sha256()
hash.update(url.encode('ascii'))
s = Session()
existing_website = s.query(Documents).filter_by(url=url).first()
print (existing_website)
-if existing_website == None:
+if existing_website is None:
website = Documents(
url=url,
text_content=soup.get_text(),
html_content=soup.prettify(),
first_crawl_date=datetime.datetime.now(),
-last_crawl_date = datetime.datetime.now()
+last_crawl_date=datetime.datetime.now(),
+last_index_date=None
)
s.add(website)
else:
@@ -64,44 +64,44 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
s.close()
x = open(f'data/links.txt', 'a')
x.close()
-links = soup.find_all("a")
+links = soup.find_all("a", href=True)
for link in links:
found = False
-if not hasattr(link, "href"):
-continue
link = link["href"]
if (len(link) > 0 and link[0] == "#") or "localhost" in link:
continue
if not "http" in link:
if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
continue
if "http" not in link:
link = urljoin(url, link)
if (recursion > 0 and link not in traversed_links):
try:
traversed_links.append(link)
link_html = get_html(link)
-r = recursion -1
-sleep(1)
+r = recursion -1
+sleep(0.5)
parse_html(link, link_html, r, traversed_links)
except:
pass
elif link not in traversed_links:
-with open(f'data/links.txt', 'r+') as linksfile:
+with open('data/links.txt', 'r+') as linksfile:
while line := linksfile.readline():
if line.strip() == link.strip():
found = True
if not found:
linksfile.write(f'{link}\n')
-if __name__ == "__main__":
+if __name__ == "__main__":
os.makedirs("data/content", exist_ok=True)
# check inputs
parser = argparse.ArgumentParser()
parser.add_argument("url", help="URL of the webpage to be crawled")
parser.add_argument('-f', "--followlinks", action="store_true")
-max_recursion = 2
+max_recursion = 4
args = parser.parse_args()
if args.url == "links":
-with open(f'data/links.txt', 'r+') as linksfile:
+with open('data/links.txt', 'r+') as linksfile:
while line := linksfile.readline():
if "http" in line:
try: