From 7ee9d978b26faa7ef5a73c83f060fef7549ab349 Mon Sep 17 00:00:00 2001
From: rmgr
Date: Thu, 4 Apr 2024 20:46:34 +1030
Subject: [PATCH] Tidy up crawling and implement boolean search

---
 src/crawl.py  | 30 +++++++++----------
 src/index.py  |  9 ++++--
 src/models.py |  1 +
 src/search.py | 81 ++++++++++++++++++++++++++++++++++++++++++---------
 4 files changed, 91 insertions(+), 30 deletions(-)

diff --git a/src/crawl.py b/src/crawl.py
index 3856300..e7e35be 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -40,21 +40,21 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     if not rp.can_fetch("*", url):
         print("Robots prevents crawling url: " + url)
         return
-
-    soup = BeautifulSoup(html,'html.parser')
+
+    soup = BeautifulSoup(html, 'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
     s = Session()
     existing_website = s.query(Documents).filter_by(url=url).first()
-    print (existing_website)
-    if existing_website == None:
+    if existing_website is None:
         website = Documents(
                 url=url,
                 text_content=soup.get_text(),
                 html_content=soup.prettify(),
                 first_crawl_date=datetime.datetime.now(),
-                last_crawl_date = datetime.datetime.now()
+                last_crawl_date=datetime.datetime.now(),
+                last_index_date=None
                 )
         s.add(website)
     else:
@@ -64,44 +64,44 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     s.close()
     x = open(f'data/links.txt', 'a')
     x.close()
-    links = soup.find_all("a")
+    links = soup.find_all("a", href=True)
     for link in links:
         found = False
-        if not hasattr(link, "href"):
-            continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if not "http" in link:
+        if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
+            continue
+        if "http" not in link:
             link = urljoin(url, link)
         if (recursion > 0 and link not in traversed_links):
             try:
                 traversed_links.append(link)
                 link_html = get_html(link)
-                r = recursion -1
-                sleep(1)
+                r = recursion - 1
+                sleep(0.5)
                 parse_html(link, link_html, r, traversed_links)
             except:
                 pass
         elif link not in traversed_links:
-            with open(f'data/links.txt', 'r+') as linksfile:
+            with open('data/links.txt', 'r+') as linksfile:
                 while line := linksfile.readline():
                     if line.strip() == link.strip():
                         found = True
                 if not found:
                     linksfile.write(f'{link}\n')
 
-if __name__ == "__main__":
+if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
     # check inputs
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-    max_recursion = 2
+    max_recursion = 4
     args = parser.parse_args()
     if args.url == "links":
-        with open(f'data/links.txt', 'r+') as linksfile:
+        with open('data/links.txt', 'r+') as linksfile:
             while line := linksfile.readline():
                 if "http" in line:
                     try:
diff --git a/src/index.py b/src/index.py
index 227815e..d7259ce 100644
--- a/src/index.py
+++ b/src/index.py
@@ -1,10 +1,11 @@
 #!/usr/bin/python3
 import argparse
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, or_
 from config import DATABASE_URI
 from models import Base, Documents, Document_Tokens, Tokens
 from sqlalchemy.orm import sessionmaker
 import uuid
+import datetime
 
 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
@@ -14,18 +15,22 @@ Session = sessionmaker(bind=engine)
 def build_index():
     session = Session()
     # Read list of 1000 documents from db
-    documents = session.query(Documents).limit(1000)
+    documents = session.query(Documents).filter(or_(Documents.last_index_date.is_(None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000)
     for document in documents:
         words = document.text_content.split()
         for word in words:
+            if len(word) > 50:
+                continue
             token = session.query(Tokens).filter_by(token=word).first()
             if token is None:
                 token = Tokens(token=word, id=uuid.uuid4())
                 session.add(token)
             document_token = Document_Tokens(document_id=document.id, token_id=token.id)
             session.add(document_token)
+        document.last_index_date = datetime.datetime.now()
+        session.add(document)
     session.commit()
diff --git a/src/models.py b/src/models.py
index c2c1d07..de7e7a9 100644
--- a/src/models.py
+++ b/src/models.py
@@ -15,6 +15,7 @@ class Documents(Base):
     html_content = Column(String)
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
+    last_index_date = Column(DateTime)
 
     document_tokens = relationship("Document_Tokens", back_populates="document")
 
diff --git a/src/search.py b/src/search.py
index b95a83f..c5c233e 100755
--- a/src/search.py
+++ b/src/search.py
@@ -1,9 +1,10 @@
 #!/usr/bin/python3
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, func
 from config import DATABASE_URI
-from models import Base, Tokens
+from models import Base, Tokens, Documents, Document_Tokens
 from sqlalchemy.orm import sessionmaker
-
+from sqlalchemy.sql.expression import distinct
+import time
 from flask import Flask
 from urllib.parse import unquote
 
@@ -14,17 +15,71 @@ Session = sessionmaker(bind=engine)
 
 # Todo - Boolean search (AND/OR/NOT/"")
 
+def split_query(query):
+    result = {'ands': [], 'ors': [], 'words': []}
+    query_words = query.split()
+    i = 0
+    while i < len(query_words):
+        if i + 1 < len(query_words):
+            if query_words[i + 1].lower() == "and":
+                if i + 2 < len(query_words):
+                    result['ands'].append(
+                        query_words[i] + ',' + query_words[i+2])
+                    i = i + 3
+                    continue
+        result['words'].append(query_words[i])
+        i += 1
+    return result
+
+
 @app.route("/search/<query>")
 def search(query):
+    start_time = time.time_ns()
     session = Session()
-    result = []
-    query_words = unquote(query).split()
-    for word in query_words:
-        word = word.lower()
-        matching_token = session.query(Tokens).filter_by(token=word).first()
-        if session is None:
-            continue
-        for document_token in matching_token.document_tokens:
+    results = {}
+    query_words = split_query(unquote(query))
+    for a in query_words['ands']:
+        query = session.query(Documents.url, func.count(1)).\
+            join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
+            join(Tokens, Document_Tokens.token_id == Tokens.id).\
+            filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
+            group_by(Documents.url).\
+            having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+            order_by(func.count(1).desc())
+        for result in query.all():
+            if result[0] in results.keys():
+                results[result[0]] += result[1]
+            else:
+                results[result[0]] = result[1]
+    x = session.query(Tokens).filter(
+        Tokens.token.in_(query_words['words'])).limit(1000)
+    for y in x:
+        for document_token in y.document_tokens:
+            if document_token.document.url in results.keys():
+                results[document_token.document.url] += 1
+            else:
+                results[document_token.document.url] = 1
-            result.append(document_token.document.url)
-    return result
+    print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
+
+# @app.route("/search/<query>")
+# def search(query):
+#     start_time = time.time_ns()
+#     session = Session()
+#     result = {}
+#     query_words = unquote(query).split()
+#     x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
+#     for word in query_words:
+#         word = word.lower()
+#         matching_token = session.query(Tokens).filter_by(token=word).first()
+#
+#         if matching_token is None:
+#             continue
+#         for document_token in matching_token.document_tokens:
+#             if document_token.document.url in result.keys():
+#                 result[document_token.document.url] += 1
+#             else:
+#                 result[document_token.document.url] = 1
+#     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+#     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
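
A note on the new parser for anyone reviewing: split_query() only implements AND for now ('ors' is declared but never populated), and it collapses each "x and y" pair into a single comma-joined entry under 'ands'. A minimal sanity check, assuming it is run from the src/ directory so that config.DATABASE_URI resolves (create_engine does not open a connection at import time):

# Sketch only -- exercises the query parser added in src/search.py above.
from search import split_query

parsed = split_query("python and flask indexing")
print(parsed)
# Expected: {'ands': ['python,flask'], 'ors': [], 'words': ['indexing']}
# "python and flask" becomes the comma-joined pair consumed by the AND branch;
# "indexing" falls through to the plain word list and is scored per token hit.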
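And a rough manual check of the endpoint itself (a sketch, not part of the patch: it assumes the Flask app is served on localhost:5000, Flask's default dev port, that the index has been built, and a Flask version recent enough to serialise a returned list as JSON):

# Sketch only -- hits the /search/<query> route with an AND query.
import urllib.parse
import urllib.request

q = urllib.parse.quote("python and flask")  # parsed into the 'ands' bucket
with urllib.request.urlopen(f"http://localhost:5000/search/{q}") as resp:
    print(resp.read().decode())             # up to ten [url, score] pairs, best first

The interesting bit server-side is the HAVING count(distinct Document_Tokens.token_id) == 2 clause: grouping by URL and requiring two distinct token ids is what turns the plain IN (...) filter into "both terms must appear in the same document".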