From f4ea8ad1d776ea76241357186ed9cc1fb26e11a8 Mon Sep 17 00:00:00 2001 From: rmgr Date: Mon, 1 Jan 2024 19:53:22 +1030 Subject: [PATCH 01/15] Respect robots.txt --- beehave.txt | 1 + src/__pycache__/search.cpython-310.pyc | Bin 685 -> 1033 bytes src/crawl.py | 18 ++++++++++++++++-- src/index.py | 8 ++++++++ 4 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 beehave.txt diff --git a/beehave.txt b/beehave.txt new file mode 100644 index 0000000..e3415b6 --- /dev/null +++ b/beehave.txt @@ -0,0 +1 @@ +https://github.com/bitbrain/beehave diff --git a/src/__pycache__/search.cpython-310.pyc b/src/__pycache__/search.cpython-310.pyc index c7402826f1c52ce28e47c499f6b8c84b197fb854..f3e8621f1765bb2ca051342506fd63ec8716cacf 100644 GIT binary patch delta 673 zcmZuu&1w`u5U%R^+1bhLx>2I8F6zM*28pN$UWP@85HJveIrsw-#_8GYvc0oOcdtKT zLWBvJoMhRvkQ{UH6?_RVGLPXCSUo{Ru#4}jrn{=Ys_JjfvDch58a~i!vJYu&&6*v2 zKD+y@;qm$*y!N@Z386pRTe)7N7>{5P)OR8c962l4myOxa?!lvdCwESqQ!qYf??A5Q zEO*-Q6y9`?LA$#48T!aAZXbHNU9bZ8UG3#=f!rA}*~{&d`VpKW!frQrIc0L!fi@hG zZ^L8Kjp%0gN^ZXZ1{`d%!a~@D2cTYFd{2qSUJ*DQhPC+yF0CaEjG}EC!-^M!~TMU&h-TqMMRF;-Vv(k-scG8Sf#b5E^LD?F| zIvEVJ?Jb$AsnANA(aXA_1@oSJE@ysC-69H@j~i=Nf9g;XMM)qII|j&5re6$!m_PJ z0yY-;LtFVLwDxW~=WxE6a}QjYcR3SYT&?ly&DrpIn%z(dE5`9oX-cm7Y9mn$Oc$jbb~%M zDi?iy9VNb-ec~bi0$tC3*};ix!a$#26`wR~gbJ3KB=I>V^B4{#BHR4Y%T)X2TwL@5 rbE}b(b_$fT4`;Vow7UKuOB8c?bQi0KUU str: response = requests.get(url) return response.content -def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool: +def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool: + rp = urllib.robotparser.RobotFileParser() print(url) print(recursion) urlparts = urlparse(url) baseurl = urlparts.scheme + "://" + urlparts.netloc + if baseurl not in robots: + rp.set_url(baseurl + "/robots.txt") + rp.read() + robots[baseurl] = rp + else: + rp = robots[baseurl] + if not rp.can_fetch("*", url): + print("Robots prevents crawling url: " + url) + return + soup = BeautifulSoup(html,'html.parser') hash = hashlib.sha256() hash.update(url.encode('ascii')) @@ -36,6 +47,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> links = soup.find_all("a") for link in links: found = False + if "href" not in link: + continue link = link["href"] if (len(link) > 0 and link[0] == "#") or "localhost" in link: continue @@ -80,4 +93,5 @@ if __name__ == "__main__": # parse_html(line, get_html(line)) # except: # pass + os.remove('data/links.txt') diff --git a/src/index.py b/src/index.py index f55a356..7532247 100755 --- a/src/index.py +++ b/src/index.py @@ -7,6 +7,13 @@ import json # investigate ngrams for "multi word" matching ignored_words = ['a', 'the','is'] +def remove_punctuation(input_string): + punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~''' + for p in punc: + input_string = input_string.replace(p, '') + return input_string + + def build_index(): with open(f"data/index.json", "w") as index: # get a list of all content files @@ -20,6 +27,7 @@ def build_index(): content_words = content.split() for word in content_words: word = word.lower() + word = remove_punctuation(word) if not word in ignored_words: if not word in dictionary: dictionary[word] = [] From efe6dea1f575480d14db9f5df75848cd5bb44482 Mon Sep 17 00:00:00 2001 From: rmgr Date: Mon, 1 Jan 2024 20:52:12 +1030 Subject: [PATCH 02/15] Fix crawling. Add initial linksfile crawling. Still need to remove records as they are processed. 
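A rough sketch of the flow this patch aims for, for anyone reading the diff
below (names mirror src/crawl.py; the import path and the eventual cleanup of
processed records are assumptions at this stage, as the subject notes):

    # Drain the crawl queue that parse_html() keeps appending to in
    # data/links.txt. Run from src/ so the local module import resolves.
    from crawl import get_html, parse_html

    with open('data/links.txt', 'r+') as linksfile:
        while line := linksfile.readline():
            if "http" in line:
                try:
                    parse_html(line, get_html(line))
                except Exception:
                    # Fetch/parse errors just skip the entry for now; the
                    # record is not yet removed from links.txt after it is
                    # processed.
                    pass

In practice that means "crawl.py <url>" seeds the queue and "crawl.py links"
drains it.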
--- src/crawl.py | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index da6bffa..a0d2e64 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -47,7 +47,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro links = soup.find_all("a") for link in links: found = False - if "href" not in link: + if not hasattr(link, "href"): continue link = link["href"] if (len(link) > 0 and link[0] == "#") or "localhost" in link: @@ -63,13 +63,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro parse_html(link, link_html, r, traversed_links) except: pass -# else: -# with open(f'data/links.txt', 'r+') as linksfile: -# while line := linksfile.readline(): -# if line.strip() == link.strip(): -# found = True -# if not found: -# linksfile.write(f'{link}\n') + elif link not in traversed_links: + with open(f'data/links.txt', 'r+') as linksfile: + while line := linksfile.readline(): + if line.strip() == link.strip(): + found = True + if not found: + linksfile.write(f'{link}\n') if __name__ == "__main__": os.makedirs("data/content", exist_ok=True) @@ -77,21 +77,21 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("url", help="URL of the webpage to be crawled") parser.add_argument('-f', "--followlinks", action="store_true") - max_recursion = 4 + max_recursion = 2 args = parser.parse_args() - html = get_html(args.url) - parse_html(args.url, html, max_recursion) + if args.url == "links": + with open(f'data/links.txt', 'r+') as linksfile: + while line := linksfile.readline(): + if "http" in line: + try: + parse_html(line, get_html(line)) + except: + pass + + else: + html = get_html(args.url) + parse_html(args.url, html, max_recursion) # recursion = 0 # if (args.followlinks): -# with open(f'data/links.txt', 'r+') as linksfile: -# while line := linksfile.readline(): -# if recursion < max_recursion: -# if "http" in line: -# recursion += 1 -# try: -# parse_html(line, get_html(line)) -# except: -# pass - - os.remove('data/links.txt') +# os.remove('data/links.txt') From aed568d11ed36132e418e4eb8b1def9855dc3ca8 Mon Sep 17 00:00:00 2001 From: rmgr Date: Sat, 2 Mar 2024 19:54:53 +1030 Subject: [PATCH 03/15] Remove beehave.txt note --- beehave.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 beehave.txt diff --git a/beehave.txt b/beehave.txt deleted file mode 100644 index e3415b6..0000000 --- a/beehave.txt +++ /dev/null @@ -1 +0,0 @@ -https://github.com/bitbrain/beehave From 8605ee6b2c8ed8b6da9f7a4bd3d1427f48673878 Mon Sep 17 00:00:00 2001 From: rmgr Date: Sat, 2 Mar 2024 19:58:10 +1030 Subject: [PATCH 04/15] Add todo file --- todo | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 todo diff --git a/todo b/todo new file mode 100644 index 0000000..2c7e8cc --- /dev/null +++ b/todo @@ -0,0 +1,6 @@ +[ ] Refactor website table to generic document table (maybe using URN instead of URL?) 
+[ ] Define tokens table FKed to document table +[ ] Refactor index.py to tokenize input into tokens table +[ ] Define N-Grams table +[ ] Add N-Gram generation to index.py + From 20d198e5595f33244d7a364ab2538e983dd8ab71 Mon Sep 17 00:00:00 2001 From: rmgr Date: Thu, 7 Mar 2024 20:44:34 +1030 Subject: [PATCH 05/15] Refactor to use postgresql end to end --- src/crawl.py | 9 +++--- src/index.py | 83 ++++++++++++++++++++++++------------------------ src/index.py.old | 54 +++++++++++++++++++++++++++++++ src/models.py | 26 ++++++++++++--- src/search.py | 44 ++++++++++++------------- 5 files changed, 144 insertions(+), 72 deletions(-) create mode 100644 src/index.py.old diff --git a/src/crawl.py b/src/crawl.py index 9521b5d..3856300 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -9,7 +9,7 @@ from time import sleep from bs4 import BeautifulSoup from sqlalchemy import create_engine from config import DATABASE_URI -from models import Base, Website +from models import Base, Documents, Document_Tokens, Tokens from sqlalchemy.orm import sessionmaker from sqlalchemy import create_engine import datetime @@ -19,11 +19,12 @@ engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) -def get_html(url: str) -> str: +def get_html(url: str) -> str: response = requests.get(url) return response.content + def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool: rp = urllib.robotparser.RobotFileParser() print(url) @@ -45,10 +46,10 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro hash.update(url.encode('ascii')) s = Session() - existing_website = s.query(Website).filter_by(url=url).first() + existing_website = s.query(Documents).filter_by(url=url).first() print (existing_website) if existing_website == None: - website = Website( + website = Documents( url=url, text_content=soup.get_text(), html_content=soup.prettify(), diff --git a/src/index.py b/src/index.py index e04c787..c80b5e7 100644 --- a/src/index.py +++ b/src/index.py @@ -1,54 +1,53 @@ +#!/usr/bin/python3 +import argparse +import requests +import hashlib +from urllib.parse import urlparse, urljoin +import urllib.robotparser +import os +from time import sleep +from bs4 import BeautifulSoup from sqlalchemy import create_engine from config import DATABASE_URI -from models import Base, Website -from pathlib import Path -import argparse -import os -import json -# investigate ngrams for "multi word" matching -ignored_words = ['a', 'the','is'] +from models import Base, Documents, Document_Tokens, Tokens +from sqlalchemy.orm import sessionmaker +from sqlalchemy import create_engine +import datetime -def remove_punctuation(input_string): - punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!''' - for p in punc: - input_string = input_string.replace(p, '') - return input_string +engine = create_engine(DATABASE_URI) +Base.metadata.create_all(engine) +Session = sessionmaker(bind=engine) def build_index(): - with open(f"data/index.json", "w") as index: - # get a list of all content files - # split on whitespace and add to index - dictionary = {} - pathlist = Path('data/content').rglob('*.txt') - for path in pathlist: - with open(str(path)) as content_file: - url = content_file.readline() - content = content_file.read() - content_words = content.split() - for word in content_words: - word = word.lower() - word = remove_punctuation(word) - if not word in ignored_words: - if not word in dictionary: - dictionary[word] = [] - matching_urls = list(filter(lambda 
entry: entry["url"] == url.strip(), dictionary[word])) - if len(matching_urls) == 0: -# if not url.strip() in dictionary[word]: - entries = dictionary[word] - entry = {"url": url.strip(), "count": 1, "filename": str(path)} - dictionary[word].append(entry) - else: - entries = dictionary[word] - entry = matching_urls[0] - entry["count"] += 1 - entries.sort(reverse=True, key=lambda entry: entry["count"]) - index.write(json.dumps(dictionary)) + session = Session() + # Read list of 1000 documents from db + documents = session.query(Documents).limit(1000) + for document in documents: + print(document.url) + content_words = document.text_content.split() + for word in content_words: + word = word.lower() + token = session.query(Tokens).filter_by(token=word).first() + if token is None: + token = Tokens(token=word) + session.add(token) + document_token = Document_Tokens(document_id=document.id, token_id=token.id) + session.add(document_token) + session.commit() + + # Foreach document, break into words + # Check if word exists in database + # Create if not exist + # Link to document + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild") + parser.add_argument('-r', + "--rebuild", + action="store_true", + help="Blow away the index and rebuild") args = parser.parse_args() if args.rebuild: build_index() - diff --git a/src/index.py.old b/src/index.py.old new file mode 100644 index 0000000..6ec8e21 --- /dev/null +++ b/src/index.py.old @@ -0,0 +1,54 @@ +from sqlalchemy import create_engine +from config import DATABASE_URI +from models import Base, Website +from pathlib import Path +import argparse +import os +import json +# investigate ngrams for "multi word" matching +ignored_words = ['a', 'the','is'] + +def remove_punctuation(input_string): + punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!''' + for p in punc: + input_string = input_string.replace(p, '') + return input_string + + +def build_index(): + with open("data/index.json", "w") as index: + # get a list of all content files + # split on whitespace and add to index + dictionary = {} + pathlist = Path('data/content').rglob('*.txt') + for path in pathlist: + with open(str(path)) as content_file: + url = content_file.readline() + content = content_file.read() + content_words = content.split() + for word in content_words: + word = word.lower() + word = remove_punctuation(word) + if word not in ignored_words: + if word not in dictionary: + dictionary[word] = [] + matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word])) + if len(matching_urls) == 0: +# if not url.strip() in dictionary[word]: + entries = dictionary[word] + entry = {"url": url.strip(), "count": 1, "filename": str(path)} + dictionary[word].append(entry) + else: + entries = dictionary[word] + entry = matching_urls[0] + entry["count"] += 1 + entries.sort(reverse=True, key=lambda entry: entry["count"]) + index.write(json.dumps(dictionary)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild") + args = parser.parse_args() + if args.rebuild: + build_index() + diff --git a/src/models.py b/src/models.py index ee768d4..c2c1d07 100644 --- a/src/models.py +++ b/src/models.py @@ -1,18 +1,36 @@ from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, DateTime +from sqlalchemy import Column, String, DateTime, 
ForeignKey, Index from sqlalchemy.dialects.postgresql import UUID +from sqlalchemy.orm import relationship, mapped_column import uuid Base = declarative_base() -class Website(Base): - __tablename__ = 'websites' - id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4) +class Documents(Base): + __tablename__ = 'documents' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) url = Column(String) text_content = Column(String) html_content = Column(String) first_crawl_date = Column(DateTime) last_crawl_date = Column(DateTime) + document_tokens = relationship("Document_Tokens", back_populates="document") +class Document_Tokens(Base): + __tablename__ = 'document_tokens' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document_id = mapped_column(ForeignKey("documents.id")) + # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + token_id = mapped_column(ForeignKey("tokens.id")) + #Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document = relationship("Documents", back_populates="document_tokens", uselist=False) + token = relationship("Tokens", back_populates="document_tokens") + + +class Tokens(Base): + __tablename__ = 'tokens' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + token = Column(String, index=True) + document_tokens = relationship("Document_Tokens", back_populates="token") diff --git a/src/search.py b/src/search.py index 17668f9..b95a83f 100755 --- a/src/search.py +++ b/src/search.py @@ -1,30 +1,30 @@ -#!/bin/bash +#!/usr/bin/python3 +from sqlalchemy import create_engine +from config import DATABASE_URI +from models import Base, Tokens +from sqlalchemy.orm import sessionmaker from flask import Flask -from flask import Request -import json from urllib.parse import unquote app = Flask(__name__) -## Todo - Boolean search (AND/OR/NOT/"") +engine = create_engine(DATABASE_URI) +Base.metadata.create_all(engine) +Session = sessionmaker(bind=engine) +# Todo - Boolean search (AND/OR/NOT/"") + + @app.route("/search/") def search(query): - with open('data/index.json', 'r') as index_json: - index = json.load(index_json) - query = unquote(query) - query_split = query.split() - result = [] - for q in query_split: - q = q.lower() - if q in index: - for item in index[q]: - matching_results = list(filter(lambda entry: entry['url'] == item["url"], result)) - if len(matching_results) == 0: - result.append(item) - else: - matching_results[0]["count"] += item["count"] - return result - -def handle_and(): - pass + session = Session() + result = [] + query_words = unquote(query).split() + for word in query_words: + word = word.lower() + matching_token = session.query(Tokens).filter_by(token=word).first() + if session is None: + continue + for document_token in matching_token.document_tokens: + result.append(document_token.document.url) + return result From d4bb3fb8dc00d2f8c8bdee00def5731fa5a053c3 Mon Sep 17 00:00:00 2001 From: rmgr Date: Thu, 7 Mar 2024 21:12:19 +1030 Subject: [PATCH 06/15] Tidy up index.py --- src/index.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/index.py b/src/index.py index c80b5e7..227815e 100644 --- a/src/index.py +++ b/src/index.py @@ -1,18 +1,10 @@ #!/usr/bin/python3 import argparse -import requests -import hashlib -from urllib.parse import urlparse, urljoin -import urllib.robotparser -import os -from time import sleep -from bs4 import BeautifulSoup from sqlalchemy import create_engine from config import DATABASE_URI 
from models import Base, Documents, Document_Tokens, Tokens from sqlalchemy.orm import sessionmaker -from sqlalchemy import create_engine -import datetime +import uuid engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) @@ -30,16 +22,11 @@ def build_index(): word = word.lower() token = session.query(Tokens).filter_by(token=word).first() if token is None: - token = Tokens(token=word) + token = Tokens(token=word, id=uuid.uuid4()) session.add(token) document_token = Document_Tokens(document_id=document.id, token_id=token.id) session.add(document_token) session.commit() - - # Foreach document, break into words - # Check if word exists in database - # Create if not exist - # Link to document if __name__ == "__main__": From 7ee9d978b26faa7ef5a73c83f060fef7549ab349 Mon Sep 17 00:00:00 2001 From: rmgr Date: Thu, 4 Apr 2024 20:46:34 +1030 Subject: [PATCH 07/15] Tidy up crawling and implement boolean search --- src/crawl.py | 30 +++++++++---------- src/index.py | 9 ++++-- src/models.py | 1 + src/search.py | 81 ++++++++++++++++++++++++++++++++++++++++++--------- 4 files changed, 91 insertions(+), 30 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index 3856300..e7e35be 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -40,21 +40,21 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro if not rp.can_fetch("*", url): print("Robots prevents crawling url: " + url) return - - soup = BeautifulSoup(html,'html.parser') + + soup = BeautifulSoup(html, 'html.parser') hash = hashlib.sha256() hash.update(url.encode('ascii')) s = Session() existing_website = s.query(Documents).filter_by(url=url).first() - print (existing_website) - if existing_website == None: + if existing_website is None: website = Documents( url=url, text_content=soup.get_text(), html_content=soup.prettify(), first_crawl_date=datetime.datetime.now(), - last_crawl_date = datetime.datetime.now() + last_crawl_date=datetime.datetime.now(), + last_index_date=None ) s.add(website) else: @@ -64,44 +64,44 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro s.close() x = open(f'data/links.txt', 'a') x.close() - links = soup.find_all("a") + links = soup.find_all("a", href=True) for link in links: found = False - if not hasattr(link, "href"): - continue link = link["href"] if (len(link) > 0 and link[0] == "#") or "localhost" in link: continue - if not "http" in link: + if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link: + continue + if "http" not in link: link = urljoin(url, link) if (recursion > 0 and link not in traversed_links): try: traversed_links.append(link) link_html = get_html(link) - r = recursion -1 - sleep(1) + r = recursion -1 + sleep(0.5) parse_html(link, link_html, r, traversed_links) except: pass elif link not in traversed_links: - with open(f'data/links.txt', 'r+') as linksfile: + with open('data/links.txt', 'r+') as linksfile: while line := linksfile.readline(): if line.strip() == link.strip(): found = True if not found: linksfile.write(f'{link}\n') -if __name__ == "__main__": +if __name__ == "__main__": os.makedirs("data/content", exist_ok=True) # check inputs parser = argparse.ArgumentParser() parser.add_argument("url", help="URL of the webpage to be crawled") parser.add_argument('-f', "--followlinks", action="store_true") - max_recursion = 2 + max_recursion = 4 args = parser.parse_args() if args.url == "links": - with open(f'data/links.txt', 'r+') as linksfile: + with open('data/links.txt', 
'r+') as linksfile: while line := linksfile.readline(): if "http" in line: try: diff --git a/src/index.py b/src/index.py index 227815e..d7259ce 100644 --- a/src/index.py +++ b/src/index.py @@ -1,10 +1,11 @@ #!/usr/bin/python3 import argparse -from sqlalchemy import create_engine +from sqlalchemy import create_engine, or_ from config import DATABASE_URI from models import Base, Documents, Document_Tokens, Tokens from sqlalchemy.orm import sessionmaker import uuid +import datetime engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) @@ -14,18 +15,22 @@ Session = sessionmaker(bind=engine) def build_index(): session = Session() # Read list of 1000 documents from db - documents = session.query(Documents).limit(1000) + documents = session.query(Documents).filter(or_(Documents.last_index_date.is_(None), Documents.last_index_date 50: + continue token = session.query(Tokens).filter_by(token=word).first() if token is None: token = Tokens(token=word, id=uuid.uuid4()) session.add(token) document_token = Document_Tokens(document_id=document.id, token_id=token.id) session.add(document_token) + document.last_index_date = datetime.datetime.now() + session.add(document) session.commit() diff --git a/src/models.py b/src/models.py index c2c1d07..de7e7a9 100644 --- a/src/models.py +++ b/src/models.py @@ -15,6 +15,7 @@ class Documents(Base): html_content = Column(String) first_crawl_date = Column(DateTime) last_crawl_date = Column(DateTime) + last_index_date = Column(DateTime) document_tokens = relationship("Document_Tokens", back_populates="document") diff --git a/src/search.py b/src/search.py index b95a83f..c5c233e 100755 --- a/src/search.py +++ b/src/search.py @@ -1,9 +1,10 @@ #!/usr/bin/python3 -from sqlalchemy import create_engine +from sqlalchemy import create_engine, func from config import DATABASE_URI -from models import Base, Tokens +from models import Base, Tokens, Documents, Document_Tokens from sqlalchemy.orm import sessionmaker - +from sqlalchemy.sql.expression import distinct +import time from flask import Flask from urllib.parse import unquote @@ -14,17 +15,71 @@ Session = sessionmaker(bind=engine) # Todo - Boolean search (AND/OR/NOT/"") +def split_query(query): + result = {'ands': [], 'ors': [], 'words': []} + query_words = query.split() + i = 0 + while i < len(query_words): + if i + 1 < len(query_words): + if query_words[i + 1].lower() == "and": + if i + 2 < len(query_words): + result['ands'].append( + query_words[i] + ',' + query_words[i+2]) + i = i + 3 + continue + result['words'].append(query_words[i]) + i += 1 + return result + + @app.route("/search/") def search(query): + start_time = time.time_ns() session = Session() - result = [] - query_words = unquote(query).split() - for word in query_words: - word = word.lower() - matching_token = session.query(Tokens).filter_by(token=word).first() - if session is None: - continue - for document_token in matching_token.document_tokens: + results = {} + query_words = split_query(unquote(query)) + for a in query_words['ands']: + query = session.query(Documents.url, func.count(1)).\ + join(Document_Tokens, Documents.id == Document_Tokens.document_id).\ + join(Tokens, Document_Tokens.token_id == Tokens.id).\ + filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\ + group_by(Documents.url).\ + having(func.count(distinct(Document_Tokens.token_id)) == 2).\ + order_by(func.count(1).desc()) + for result in query.all(): + if result[0] in results.keys(): + results[result[0]] += result[1] + else: + results[result[0]] = result[1] + x 
= session.query(Tokens).filter( + Tokens.token.in_(query_words['words'])).limit(1000) + for y in x: + for document_token in y.document_tokens: + if document_token.document.url in results.keys(): + results[document_token.document.url] += 1 + else: + results[document_token.document.url] = 1 - result.append(document_token.document.url) - return result + print(str((time.time_ns() - start_time) // 1_000_000) + "ms") + return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10] + +# @app.route("/search/") +# def search(query): +# start_time = time.time_ns() +# session = Session() +# result = {} +# query_words = unquote(query).split() +# x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000) +# for word in query_words: +# word = word.lower() +# matching_token = session.query(Tokens).filter_by(token=word).first() +# +# if matching_token is None: +# continue +# for document_token in matching_token.document_tokens: +# if document_token.document.url in result.keys(): +# result[document_token.document.url] += 1 +# else: +# result[document_token.document.url] = 1 +# print(str((time.time_ns() - start_time) // 1_000_000) + "ms") +# return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10] From 343410e62f5a8754f845d0cba064900285e2edaa Mon Sep 17 00:00:00 2001 From: rmgr Date: Fri, 5 Apr 2024 06:22:56 +1030 Subject: [PATCH 08/15] Add first pass youtube subtitle indexer --- src/crawl.py | 82 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 70 insertions(+), 12 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index e7e35be..bf814e2 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -13,6 +13,7 @@ from models import Base, Documents, Document_Tokens, Tokens from sqlalchemy.orm import sessionmaker from sqlalchemy import create_engine import datetime +import yt_dlp as youtube_dl # TODO- Handle gemini/gopher links engine = create_engine(DATABASE_URI) @@ -25,7 +26,64 @@ def get_html(url: str) -> str: return response.content -def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool: +def parse_youtube(video_url: str) -> bool: + # Language preference for subtitles (set to None for auto-generated) + # Change this to 'en' for English subtitles, or None for auto-generated + subtitle_language = 'en' + # Options for youtube_dl + ydl_opts = { + 'writesubtitles': True, + 'allsubtitles': True, + 'skip_download': True, # We only want to fetch metadata + 'subtitleslangs': [subtitle_language] if subtitle_language else None, + } + + # Initialize youtube_dl object + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + # Download metadata + info_dict = ydl.extract_info(video_url, download=False) + + # Extract subtitles + subtitles = info_dict.get('subtitles') + subtitles_text = "" + # Print available subtitles + if subtitles: + for subs in subtitles.values(): + for sub in subs: + subtitle_url = sub['url'] + with youtube_dl.YoutubeDL({}) as ydl: + subtitle_info = ydl.extract_info( + subtitle_url, download=False) + for subtitle in subtitle_info['subtitles'][subtitle_language]: + if subtitle["ext"] == "srv1": + soup = BeautifulSoup( + get_html(subtitle["url"]), 'html.parser') + subtitles_text = soup.get_text() + + s = Session() + existing_website = s.query( + Documents).filter_by(url=video_url).first() + if existing_website is None: + website = Documents( + url=video_url, + text_content=subtitles_text, + html_content=None, # soup.prettify(), + first_crawl_date=datetime.datetime.now(), + last_crawl_date=datetime.datetime.now(), + 
last_index_date=None + ) + s.add(website) + else: + existing_website.last_crawl_date = datetime.datetime.now() + s.add(existing_website) + s.commit() + s.close() + + +def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool: + if "youtube.com" in url: + parse_youtube(url) + return rp = urllib.robotparser.RobotFileParser() print(url) print(recursion) @@ -49,13 +107,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro existing_website = s.query(Documents).filter_by(url=url).first() if existing_website is None: website = Documents( - url=url, - text_content=soup.get_text(), - html_content=soup.prettify(), - first_crawl_date=datetime.datetime.now(), - last_crawl_date=datetime.datetime.now(), - last_index_date=None - ) + url=url, + text_content=soup.get_text(), + html_content=soup.prettify(), + first_crawl_date=datetime.datetime.now(), + last_crawl_date=datetime.datetime.now(), + last_index_date=None + ) s.add(website) else: existing_website.last_crawl_date = datetime.datetime.now() @@ -78,7 +136,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro try: traversed_links.append(link) link_html = get_html(link) - r = recursion -1 + r = recursion - 1 sleep(0.5) parse_html(link, link_html, r, traversed_links) except: @@ -98,9 +156,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("url", help="URL of the webpage to be crawled") parser.add_argument('-f', "--followlinks", action="store_true") - max_recursion = 4 + max_recursion = 4 args = parser.parse_args() - if args.url == "links": + if args.url == "links": with open('data/links.txt', 'r+') as linksfile: while line := linksfile.readline(): if "http" in line: @@ -112,7 +170,7 @@ if __name__ == "__main__": else: html = get_html(args.url) parse_html(args.url, html, max_recursion) - + # recursion = 0 # if (args.followlinks): # os.remove('data/links.txt') From 9d57f66cd763032c7d6ac4d9a6d0b49f8d74cffe Mon Sep 17 00:00:00 2001 From: rmgr Date: Fri, 5 Apr 2024 21:36:15 +1030 Subject: [PATCH 09/15] Add beginnings of ngram search capability --- src/crawl.py | 4 ++-- src/index.py | 60 +++++++++++++++++++++++++++++++++++++++++++++------ src/models.py | 32 +++++++++++++++++++++++---- src/search.py | 31 ++++++++++++++++++++++---- 4 files changed, 110 insertions(+), 17 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index bf814e2..c62f4a9 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -1,4 +1,5 @@ #!/usr/bin/python3 + import argparse import requests import hashlib @@ -9,9 +10,8 @@ from time import sleep from bs4 import BeautifulSoup from sqlalchemy import create_engine from config import DATABASE_URI -from models import Base, Documents, Document_Tokens, Tokens +from models import Base, Documents, Document_Tokens from sqlalchemy.orm import sessionmaker -from sqlalchemy import create_engine import datetime import yt_dlp as youtube_dl # TODO- Handle gemini/gopher links diff --git a/src/index.py b/src/index.py index d7259ce..e73c93d 100644 --- a/src/index.py +++ b/src/index.py @@ -1,24 +1,29 @@ #!/usr/bin/python3 + import argparse from sqlalchemy import create_engine, or_ from config import DATABASE_URI -from models import Base, Documents, Document_Tokens, Tokens +from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams from sqlalchemy.orm import sessionmaker import uuid import datetime +import re +from multiprocessing import Pool engine = create_engine(DATABASE_URI) 
Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) -def build_index(): +def build_index_chunk(document_chunk): session = Session() - # Read list of 1000 documents from db - documents = session.query(Documents).filter(or_(Documents.last_index_date.is_(None), Documents.last_index_date 50: @@ -27,11 +32,52 @@ def build_index(): if token is None: token = Tokens(token=word, id=uuid.uuid4()) session.add(token) - document_token = Document_Tokens(document_id=document.id, token_id=token.id) + document_token = Document_Tokens( + document_id=document.id, token_id=token.id) session.add(document_token) document.last_index_date = datetime.datetime.now() session.add(document) session.commit() + session.close() + + +def build_index(): + session = Session() + documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_( + None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000) + session.close() + + documents = list(documents_query) # Execute the query to get the result set + + chunk_size = 100 + document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)] + + with Pool() as pool: + pool.map(build_index_chunk, document_chunks) + + +def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str): + i = 0 + while i < len(corpus): + if i + size >= len(corpus): + i = len(corpus) + gram = '' + for n in range(0, size): + if i + n >= len(corpus): + break + gram += corpus[i+n] + ' ' + gram = gram.rstrip().lower() + print(gram) + + ngram = session.query(NGrams).filter_by(gram=gram).first() + if ngram is None: + ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram) + session.add(ngram) + document_ngram = Document_NGrams( + document_id=document_id, ngram_id=ngram.id) + session.add(document_ngram) + session.commit() + i += 1 if __name__ == "__main__": diff --git a/src/models.py b/src/models.py index de7e7a9..c73ea7d 100644 --- a/src/models.py +++ b/src/models.py @@ -1,5 +1,5 @@ from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, String, DateTime, ForeignKey, Index +from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.orm import relationship, mapped_column import uuid @@ -16,7 +16,10 @@ class Documents(Base): first_crawl_date = Column(DateTime) last_crawl_date = Column(DateTime) last_index_date = Column(DateTime) - document_tokens = relationship("Document_Tokens", back_populates="document") + document_tokens = relationship( + "Document_Tokens", back_populates="document") + document_ngrams = relationship( + "Document_NGrams", back_populates="document") class Document_Tokens(Base): @@ -25,8 +28,9 @@ class Document_Tokens(Base): document_id = mapped_column(ForeignKey("documents.id")) # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) token_id = mapped_column(ForeignKey("tokens.id")) - #Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) - document = relationship("Documents", back_populates="document_tokens", uselist=False) + # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document = relationship( + "Documents", back_populates="document_tokens", uselist=False) token = relationship("Tokens", back_populates="document_tokens") @@ -35,3 +39,23 @@ class Tokens(Base): id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) token = Column(String, index=True) document_tokens = relationship("Document_Tokens", back_populates="token") + + 
+class NGrams(Base): + __tablename__ = 'ngrams' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + size = Column(Integer, index=True) + gram = Column(String, index=True) + document_ngrams = relationship("Document_NGrams", back_populates="ngram") + + +class Document_NGrams(Base): + __tablename__ = 'document_ngrams' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document_id = mapped_column(ForeignKey("documents.id")) + # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + ngram_id = mapped_column(ForeignKey("ngrams.id")) + # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document = relationship( + "Documents", back_populates="document_ngrams", uselist=False) + ngram = relationship("NGrams", back_populates="document_ngrams") diff --git a/src/search.py b/src/search.py index c5c233e..f77927b 100755 --- a/src/search.py +++ b/src/search.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 from sqlalchemy import create_engine, func from config import DATABASE_URI -from models import Base, Tokens, Documents, Document_Tokens +from models import Base, Tokens, Documents, Document_Tokens, NGrams from sqlalchemy.orm import sessionmaker from sqlalchemy.sql.expression import distinct import time @@ -16,7 +16,7 @@ Session = sessionmaker(bind=engine) def split_query(query): - result = {'ands': [], 'ors': [], 'words': []} + result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []} query_words = query.split() i = 0 while i < len(query_words): @@ -27,19 +27,31 @@ def split_query(query): query_words[i] + ',' + query_words[i+2]) i = i + 3 continue + if query_words[i][0] == '"': + n = 0 + quoted_query = "" + while i+n < len(query_words): + quoted_query += query_words[i+n] + ' ' + if query_words[i+n][len(query_words[i+n])-1] == '"': + break + n += 1 + result['ngrams'].append( + quoted_query[1:len(quoted_query)-2].rstrip()) + i += n + continue result['words'].append(query_words[i]) i += 1 return result -@app.route("/search/") +@ app.route("/search/") def search(query): start_time = time.time_ns() session = Session() results = {} query_words = split_query(unquote(query)) for a in query_words['ands']: - query = session.query(Documents.url, func.count(1)).\ + query = session.query(Documents.url, func.count(1)). 
\ join(Document_Tokens, Documents.id == Document_Tokens.document_id).\ join(Tokens, Document_Tokens.token_id == Tokens.id).\ filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\ @@ -51,6 +63,17 @@ def search(query): results[result[0]] += result[1] else: results[result[0]] = result[1] + x = session.query(NGrams).filter( + NGrams.gram.in_(query_words['ngrams'])).all() + + for y in x: + print(y.gram) + for document_ngram in y.document_ngrams: + if document_ngram.document.url in results.keys(): + results[document_ngram.document.url] += 1 + else: + results[document_ngram.document.url] = 1 + x = session.query(Tokens).filter( Tokens.token.in_(query_words['words'])).limit(1000) for y in x: From 9f0e7e6b299c8439d43494f902410be83576f130 Mon Sep 17 00:00:00 2001 From: rmgr Date: Sat, 6 Apr 2024 19:34:59 +1030 Subject: [PATCH 10/15] Indexer and query optimisations --- src/index.py | 33 ++++++++++++++--------- src/search.py | 73 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 63 insertions(+), 43 deletions(-) diff --git a/src/index.py b/src/index.py index e73c93d..4629c75 100644 --- a/src/index.py +++ b/src/index.py @@ -17,10 +17,12 @@ Session = sessionmaker(bind=engine) def build_index_chunk(document_chunk): session = Session() + print(len(document_chunk)) for document in document_chunk: print(document.url) content = re.sub(r'[^\w\s]', '', str(document.text_content)) content_words = content.split() + build_ngrams(2, content_words, session, document.id) build_ngrams(3, content_words, session, document.id) build_ngrams(4, content_words, session, document.id) build_ngrams(5, content_words, session, document.id) @@ -43,17 +45,21 @@ def build_index_chunk(document_chunk): def build_index(): session = Session() - documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_( - None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000) - session.close() - - documents = list(documents_query) # Execute the query to get the result set + while True: + documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_( + None), Documents.last_index_date < Documents.last_crawl_date)).limit(100) + session.close() - chunk_size = 100 - document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)] + # Execute the query to get the result set + documents = list(documents_query) + if len(documents) == 0: + return + chunk_size = 10 + document_chunks = [documents[i:i+chunk_size] + for i in range(0, len(documents), chunk_size)] - with Pool() as pool: - pool.map(build_index_chunk, document_chunks) + with Pool() as pool: + pool.map(build_index_chunk, document_chunks) def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str): @@ -66,9 +72,10 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str if i + n >= len(corpus): break gram += corpus[i+n] + ' ' - gram = gram.rstrip().lower() - print(gram) - + gram = gram.strip().lower() + if len(gram) > 4000: + i += 1 + continue ngram = session.query(NGrams).filter_by(gram=gram).first() if ngram is None: ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram) @@ -76,7 +83,7 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str document_ngram = Document_NGrams( document_id=document_id, ngram_id=ngram.id) session.add(document_ngram) - session.commit() + # session.commit() i += 1 diff --git a/src/search.py b/src/search.py index f77927b..0dedf77 100755 --- a/src/search.py +++ b/src/search.py @@ 
-50,38 +50,51 @@ def search(query): session = Session() results = {} query_words = split_query(unquote(query)) - for a in query_words['ands']: - query = session.query(Documents.url, func.count(1)). \ - join(Document_Tokens, Documents.id == Document_Tokens.document_id).\ - join(Tokens, Document_Tokens.token_id == Tokens.id).\ - filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\ - group_by(Documents.url).\ - having(func.count(distinct(Document_Tokens.token_id)) == 2).\ - order_by(func.count(1).desc()) - for result in query.all(): - if result[0] in results.keys(): - results[result[0]] += result[1] - else: - results[result[0]] = result[1] - x = session.query(NGrams).filter( - NGrams.gram.in_(query_words['ngrams'])).all() + if len(query_words['ands']) > 0: + for a in query_words['ands']: + query = session.query(Documents.url, func.count(1)). \ + join(Document_Tokens, Documents.id == Document_Tokens.document_id).\ + join(Tokens, Document_Tokens.token_id == Tokens.id).\ + filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\ + group_by(Documents.url).\ + having(func.count(distinct(Document_Tokens.token_id)) == 2).\ + order_by(func.count(1).desc()) + for result in query.all(): + if result[0] in results.keys(): + results[result[0]] += result[1] + else: + results[result[0]] = result[1] + if len(query_words['ngrams']) > 0: + print('entering ngrams: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") - for y in x: - print(y.gram) - for document_ngram in y.document_ngrams: - if document_ngram.document.url in results.keys(): - results[document_ngram.document.url] += 1 - else: - results[document_ngram.document.url] = 1 + q = session.query(NGrams) + for ngram in query_words['ngrams']: + q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram) + print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") - x = session.query(Tokens).filter( - Tokens.token.in_(query_words['words'])).limit(1000) - for y in x: - for document_token in y.document_tokens: - if document_token.document.url in results.keys(): - results[document_token.document.url] += 1 - else: - results[document_token.document.url] = 1 + x = q.all() + for y in x: + for document_ngram in y.document_ngrams: + if document_ngram.document.url in results.keys(): + results[document_ngram.document.url] += 1 + else: + results[document_ngram.document.url] = 1 + print('exiting ngrams: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + if len(query_words['words']) > 0: + print('entering words: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + x = session.query(Tokens).filter( + Tokens.token.in_(query_words['words'])).limit(1000) + for y in x: + for document_token in y.document_tokens: + if document_token.document.url in results.keys(): + results[document_token.document.url] += 1 + else: + results[document_token.document.url] = 1 + print('exiting words: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") print(str((time.time_ns() - start_time) // 1_000_000) + "ms") return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10] From bdb4064acce0d3a204d73acd4d96e8ce39bfae32 Mon Sep 17 00:00:00 2001 From: rmgr Date: Sat, 4 May 2024 21:10:46 +0930 Subject: [PATCH 11/15] Rework ngram generation. Greatly improve performance of indexer. 
Commit horrendous sql sins --- src/crawl.py | 10 ++++- src/index.py | 115 +++++++++++++++++++++++++++++++++++++------------- src/models.py | 15 ++++++- src/search.py | 59 +++++++++++++++++--------- todo | 13 +++--- 5 files changed, 155 insertions(+), 57 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index c62f4a9..467b434 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -20,6 +20,9 @@ engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) +excluded_domains = ['amazon.', 'news.ycombinator.', + 'facebook.com', 'amzn', 'fb.com'] + def get_html(url: str) -> str: response = requests.get(url) @@ -36,6 +39,7 @@ def parse_youtube(video_url: str) -> bool: 'allsubtitles': True, 'skip_download': True, # We only want to fetch metadata 'subtitleslangs': [subtitle_language] if subtitle_language else None, + 'extractor-args': {'youtube': {'player_client': 'ios,web'}}, } # Initialize youtube_dl object @@ -132,6 +136,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo continue if "http" not in link: link = urljoin(url, link) + link = link.split('?')[0] + link = link.split('#')[0] if (recursion > 0 and link not in traversed_links): try: traversed_links.append(link) @@ -156,8 +162,10 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("url", help="URL of the webpage to be crawled") parser.add_argument('-f', "--followlinks", action="store_true") - max_recursion = 4 + parser.add_argument('-r', "--max-recursion", help="", type=int, default=1) + args = parser.parse_args() + max_recursion = int(args.max_recursion) if args.url == "links": with open('data/links.txt', 'r+') as linksfile: while line := linksfile.readline(): diff --git a/src/index.py b/src/index.py index 4629c75..542424c 100644 --- a/src/index.py +++ b/src/index.py @@ -1,51 +1,55 @@ #!/usr/bin/python3 import argparse -from sqlalchemy import create_engine, or_ +from sqlalchemy import create_engine, or_, text +from sqlalchemy import Table, Column, String, Integer from config import DATABASE_URI +from sqlalchemy.dialects.postgresql import UUID from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams from sqlalchemy.orm import sessionmaker +from sqlalchemy.exc import SQLAlchemyError import uuid import datetime +import time import re +import random from multiprocessing import Pool engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) +# https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html + + +def contains_latin(text): + latin_pattern = r'[a-zA-ZÀ-ÖØ-öø-ÿ]' + return bool(re.search(latin_pattern, text)) def build_index_chunk(document_chunk): session = Session() print(len(document_chunk)) + start_time = time.time_ns() for document in document_chunk: print(document.url) - content = re.sub(r'[^\w\s]', '', str(document.text_content)) + content = re.sub(r'[.,?!]', ' ', str(document.text_content)) + content = re.sub(r'[^\w\s]', '', str(content)) content_words = content.split() - build_ngrams(2, content_words, session, document.id) - build_ngrams(3, content_words, session, document.id) - build_ngrams(4, content_words, session, document.id) - build_ngrams(5, content_words, session, document.id) - for word in content_words: - word = word.lower() - if len(word) > 50: - continue - token = session.query(Tokens).filter_by(token=word).first() - if token is None: - token = Tokens(token=word, id=uuid.uuid4()) - session.add(token) - document_token = Document_Tokens( - 
document_id=document.id, token_id=token.id) - session.add(document_token) + build_ngrams(1, content_words, document.id) + build_ngrams(2, content_words, document.id) + build_ngrams(3, content_words, document.id) + build_ngrams(4, content_words, document.id) + build_ngrams(5, content_words, document.id) + document.last_index_date = datetime.datetime.now() - session.add(document) + session.merge(document) session.commit() session.close() def build_index(): - session = Session() while True: + session = Session() documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_( None), Documents.last_index_date < Documents.last_crawl_date)).limit(100) session.close() @@ -54,16 +58,62 @@ def build_index(): documents = list(documents_query) if len(documents) == 0: return + build_index_chunk(documents) + continue chunk_size = 10 document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)] - with Pool() as pool: pool.map(build_index_chunk, document_chunks) -def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str): +def zip_ngrams(size: int, corpus, document_id): + size = int(size) + connection = engine.connect() + temptbl_name = 'temp_del_{}'.format(random.randint(100000, 9999999)) + temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column( + 'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True) + + try: + # Start transaction + with connection.begin(): + temptbl.create(engine) + insert_grams = [] + grams = zip(*[corpus[i:] for i in range(size)]) + for gram in grams: + gram = ' '.join(gram).lower() + insert_grams.append( + {"id": uuid.uuid4(), "gram": gram, "size": size}) + connection.execute(temptbl.insert().values(insert_grams)) + connection.execute(text("UPDATE " + temptbl_name + + " SET id = ngrams.id FROM ngrams WHERE ngrams.gram = " + + temptbl_name + ".gram;")) + connection.execute(text("INSERT INTO ngrams (id, gram, size) SELECT " + + " distinct t.id, t.gram as gram, t.size FROM " + + temptbl_name + " t LEFT JOIN ngrams on ngrams.gram = " + + "t.gram WHERE ngrams.id is null and t.size is not null " + " ON CONFLICT DO NOTHING;")) + connection.execute(text("INSERT INTO document_ngrams(id, document_id, ngram_id) SELECT DISTINCT " + + "uuid_generate_v4() , '" + str(document_id) + "'::UUID, t.id FROM " + temptbl_name + " t;")) + except SQLAlchemyError as e: + # Handle exceptions + print("An error occurred:", e) + # Rollback transaction + connection.rollback() + else: + # Commit transaction if no exceptions occurred + connection.commit() + finally: + connection.close() + # Drop table outside the transaction block + temptbl.drop(engine) + + +def build_ngrams(size: int, corpus: str, document_id: str): + session = Session() + zip_ngrams(size, corpus, document_id) + return i = 0 + grams = [] while i < len(corpus): if i + size >= len(corpus): i = len(corpus) @@ -73,18 +123,23 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str break gram += corpus[i+n] + ' ' gram = gram.strip().lower() - if len(gram) > 4000: + if len(gram) > 1000 or gram in grams or not contains_latin(gram): i += 1 continue - ngram = session.query(NGrams).filter_by(gram=gram).first() - if ngram is None: - ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram) - session.add(ngram) - document_ngram = Document_NGrams( - document_id=document_id, ngram_id=ngram.id) - session.add(document_ngram) - # session.commit() + grams.append(gram) + if (len(gram) > 1): + 
ngram = session.query(NGrams).filter_by( + gram=gram).filter_by(size=size).first() + if ngram is None: + ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram) + session.add(ngram) + document_ngram = Document_NGrams( + document_id=document_id, ngram_id=ngram.id) + session.add(document_ngram) + session.commit() i += 1 +# print(str((time.time_ns() - start_time)//1_000_000)) + session.close() if __name__ == "__main__": diff --git a/src/models.py b/src/models.py index c73ea7d..50010b6 100644 --- a/src/models.py +++ b/src/models.py @@ -32,6 +32,12 @@ class Document_Tokens(Base): document = relationship( "Documents", back_populates="document_tokens", uselist=False) token = relationship("Tokens", back_populates="document_tokens") + __table_args__ = ( + Index('idx_document_tokens_document_id_token_id', 'document_id', + 'token_id', unique=True, postgresql_using='hash'), + Index('idx_document_tokens_clustered', 'document_id', + 'token_id', postgresql_using='hash'), + ) class Tokens(Base): @@ -53,9 +59,14 @@ class Document_NGrams(Base): __tablename__ = 'document_ngrams' id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) document_id = mapped_column(ForeignKey("documents.id")) - # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) ngram_id = mapped_column(ForeignKey("ngrams.id")) - # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) document = relationship( "Documents", back_populates="document_ngrams", uselist=False) ngram = relationship("NGrams", back_populates="document_ngrams") + + __table_args__ = ( + Index('idx_document_ngrams_document_id_ngram_id', 'document_id', + 'ngram_id', unique=True, postgresql_using='hash'), + Index('idx_document_ngrams_clustered', 'document_id', + 'ngram_id', postgresql_using='hash'), + ) diff --git a/src/search.py b/src/search.py index 0dedf77..6033e60 100755 --- a/src/search.py +++ b/src/search.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 from sqlalchemy import create_engine, func from config import DATABASE_URI -from models import Base, Tokens, Documents, Document_Tokens, NGrams +from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams from sqlalchemy.orm import sessionmaker from sqlalchemy.sql.expression import distinct import time @@ -37,9 +37,9 @@ def split_query(query): n += 1 result['ngrams'].append( quoted_query[1:len(quoted_query)-2].rstrip()) - i += n + i += n + 1 continue - result['words'].append(query_words[i]) + result['ngrams'].append(query_words[i]) i += 1 return result @@ -50,6 +50,7 @@ def search(query): session = Session() results = {} query_words = split_query(unquote(query)) + print(query_words) if len(query_words['ands']) > 0: for a in query_words['ands']: query = session.query(Documents.url, func.count(1)). 
\ @@ -68,35 +69,55 @@ def search(query): print('entering ngrams: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") - q = session.query(NGrams) + q = session.query(Documents.url, func.count(1)) \ + .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \ + .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \ + .group_by(Documents.url) for ngram in query_words['ngrams']: q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram) print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") - + print(q) x = q.all() - for y in x: - for document_ngram in y.document_ngrams: - if document_ngram.document.url in results.keys(): - results[document_ngram.document.url] += 1 - else: - results[document_ngram.document.url] = 1 + print('query executed: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + print(x) + for result in x: + if result[0] in results.keys(): + results[result[0]] += result[1] + else: + results[result[0]] = result[1] +# for y in x: +# print(y) +# for document_ngram in y.document_ngrams: +# if document_ngram.document.url in results.keys(): +# results[document_ngram.document.url] += 1 +# else: +# results[document_ngram.document.url] = 1 print('exiting ngrams: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") if len(query_words['words']) > 0: print('entering words: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") - x = session.query(Tokens).filter( - Tokens.token.in_(query_words['words'])).limit(1000) - for y in x: - for document_token in y.document_tokens: - if document_token.document.url in results.keys(): - results[document_token.document.url] += 1 - else: - results[document_token.document.url] = 1 + q = session.query(Documents.url, func.count(1)) \ + .join(Document_Tokens, Documents.id == Document_Tokens.document_id) \ + .join(Tokens, Document_Tokens.token_id == Tokens.id) \ + .group_by(Documents.url).filter(Tokens.token.in_(query_words['words'])) + + print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") + print(q) + x = q.all() + print('query executed: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + for result in x: + if result[0] in results.keys(): + results[result[0]] += result[1] + else: + results[result[0]] = result[1] print('exiting words: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") print(str((time.time_ns() - start_time) // 1_000_000) + "ms") + session.close() return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10] # @app.route("/search/") diff --git a/todo b/todo index 2c7e8cc..328320b 100644 --- a/todo +++ b/todo @@ -1,6 +1,9 @@ -[ ] Refactor website table to generic document table (maybe using URN instead of URL?) -[ ] Define tokens table FKed to document table -[ ] Refactor index.py to tokenize input into tokens table -[ ] Define N-Grams table -[ ] Add N-Gram generation to index.py +[x] Refactor website table to generic document table (maybe using URN instead of URL?) +[x] Define tokens table FKed to document table +[x] Refactor index.py to tokenize input into tokens table +[x] Define N-Grams table +[x] Add N-Gram generation to index.py +[x] Add clustered index to document_ngrams table model +[x] Add clustered index to document_tokens table model +[ ] Add ddl command to create partition tables From 98efe9d1a2cb5fe913f4ca7bf8719c872793cdbe Mon Sep 17 00:00:00 2001 From: rmgr Date: Sun, 5 May 2024 19:06:56 +0930 Subject: [PATCH 12/15] Fix temp table being randomly dropped due to name collision. 
Fix multi-word non-phrase search --- src/index.py | 3 ++- src/search.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/index.py b/src/index.py index 542424c..679d312 100644 --- a/src/index.py +++ b/src/index.py @@ -70,7 +70,8 @@ def build_index(): def zip_ngrams(size: int, corpus, document_id): size = int(size) connection = engine.connect() - temptbl_name = 'temp_del_{}'.format(random.randint(100000, 9999999)) + temptbl_name = 'temp_del_{}'.format( + time.time_ns() + random.randint(100000, 9999999)) temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column( 'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True) diff --git a/src/search.py b/src/search.py index 6033e60..d643eb2 100755 --- a/src/search.py +++ b/src/search.py @@ -1,5 +1,5 @@ #!/usr/bin/python3 -from sqlalchemy import create_engine, func +from sqlalchemy import create_engine, func, and_, or_ from config import DATABASE_URI from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams from sqlalchemy.orm import sessionmaker @@ -73,8 +73,14 @@ def search(query): .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \ .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \ .group_by(Documents.url) + conditions = [] for ngram in query_words['ngrams']: - q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram) + conditions.append( + (NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram)) +# q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram) + and_conditions = [and_(*condition_pair) + for condition_pair in conditions] + q = q.filter(or_(*and_conditions)) print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") print(q) x = q.all() From e3c67b64e63762e50899390afa7f5d4cca3d6c17 Mon Sep 17 00:00:00 2001 From: rmgr Date: Sat, 8 Jun 2024 20:24:21 +0930 Subject: [PATCH 13/15] Make excluded file types more robust --- src/crawl.py | 10 +++++++--- todo | 2 ++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index 467b434..6966a25 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -23,6 +23,9 @@ Session = sessionmaker(bind=engine) excluded_domains = ['amazon.', 'news.ycombinator.', 'facebook.com', 'amzn', 'fb.com'] +excluded_filetypes = [".jpg", ".xml", ".mp4", + ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"] + def get_html(url: str) -> str: response = requests.get(url) @@ -85,6 +88,9 @@ def parse_youtube(video_url: str) -> bool: def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool: + for domain in excluded_domains: + if domain in url: + return if "youtube.com" in url: parse_youtube(url) return @@ -124,15 +130,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo s.add(existing_website) s.commit() s.close() - x = open(f'data/links.txt', 'a') - x.close() links = soup.find_all("a", href=True) for link in links: found = False link = link["href"] if (len(link) > 0 and link[0] == "#") or "localhost" in link: continue - if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link: + if any(ext in link for ext in excluded_filetypes): continue if "http" not in link: link = urljoin(url, link) diff --git a/todo b/todo index 328320b..2f5f3e5 100644 --- a/todo +++ b/todo @@ -6,4 +6,6 @@ [x] Add clustered index to document_ngrams table model [x] Add clustered index to document_tokens table model [ ] 
Add ddl command to create partition tables +[ ] Investigate whether or not robots.txt is as aggressive as I'm making ito ut to be +[ ] Instead of starting from a random page on the site, go to root and find site map and crawl that From 2a99a61dbe098774067d47f92e76c16175d8dedf Mon Sep 17 00:00:00 2001 From: rmgr Date: Sat, 8 Jun 2024 20:43:05 +0930 Subject: [PATCH 14/15] Add site map crawl option --- src/crawl.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/crawl.py b/src/crawl.py index 6966a25..0816e1b 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -166,6 +166,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("url", help="URL of the webpage to be crawled") parser.add_argument('-f', "--followlinks", action="store_true") + parser.add_argument('-s', "--crawl-sitemap", action="store_true") parser.add_argument('-r', "--max-recursion", help="", type=int, default=1) args = parser.parse_args() @@ -178,7 +179,21 @@ if __name__ == "__main__": parse_html(line, get_html(line)) except: pass - + elif args.crawl_sitemap: + rp = urllib.robotparser.RobotFileParser() + urlparts = urlparse(args.url) + baseurl = urlparts.scheme + "://" + urlparts.netloc + rp.set_url(baseurl + "/robots.txt") + rp.read() + if not rp.can_fetch("*", args.url): + print("Robots prevents crawling url: " + args.url) + exit(0) + if len(rp.site_maps()) > 0: + map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml') + for loc in map.find_all('loc'): + url = loc.contents[0] + html = get_html(url) + parse_html(url, html, max_recursion) else: html = get_html(args.url) parse_html(args.url, html, max_recursion) From bbba459480971d678880584efd7871d7acc5328d Mon Sep 17 00:00:00 2001 From: rmgr Date: Sun, 9 Jun 2024 21:53:57 +0930 Subject: [PATCH 15/15] Clean up site map scanning. Return all results instead of 10 --- client/src/css/styles.css | 33 +++++++++++++++++ client/src/index.html | 16 +++++++++ client/src/js/index.js | 28 +++++++++++++++ src/crawl.py | 42 +++++++++++++--------- src/search.py | 74 +++++++++++++++++++-------------------- todo | 4 +-- 6 files changed, 140 insertions(+), 57 deletions(-) create mode 100644 client/src/css/styles.css create mode 100644 client/src/index.html create mode 100644 client/src/js/index.js diff --git a/client/src/css/styles.css b/client/src/css/styles.css new file mode 100644 index 0000000..37323ab --- /dev/null +++ b/client/src/css/styles.css @@ -0,0 +1,33 @@ +html, body { + height: 100%; +} +body { + margin: 0; +} +input { + padding: 7px; + font-size: 1.1rem; +} +.search-container { + display: flex; + justify-content: center; + align-items: center; + text-align: center; + min-height: 25vh; +} + +.flex-container { + padding: 0; + margin: 0; + display: flex; + align-items: center; + justify-content: center; + flex-direction: column; +} +.flex-item { +} +.result { + display:block; + max-width: 60vw; + overflow-x: hidden; +} diff --git a/client/src/index.html b/client/src/index.html new file mode 100644 index 0000000..a748d6c --- /dev/null +++ b/client/src/index.html @@ -0,0 +1,16 @@ + + + + + + +
+ +
+
+
+
+
+ + + diff --git a/client/src/js/index.js b/client/src/js/index.js new file mode 100644 index 0000000..09b0bb2 --- /dev/null +++ b/client/src/js/index.js @@ -0,0 +1,28 @@ +function debounce(func, timeout = 300){ + let timer; + return (...args) => { + clearTimeout(timer); + timer = setTimeout(() => { func.apply(this, args); }, timeout); + }; +} +async function search(searchBox){ + const response = await fetch(`http://localhost:5000/search/${searchBox.value}`); + const results = await response.json(); + + const resultView = document.getElementById("results"); + resultView.replaceChildren(); + for (let i = 0; i < results.length; i++){ + let result = results[i]; + let resultElement = document.createElement("a"); + resultElement.innerText = result[0]; + resultElement.href = result[0]; + resultElement.className = "flex-item result"; + resultView.appendChild(resultElement); + } +} + +const searchBoxKeyUp = debounce(() => search()) + +const searchBox = document.getElementById("searchbox"); + +searchBox.addEventListener("keyup", debounce(() => search(searchBox))) diff --git a/src/crawl.py b/src/crawl.py index 0816e1b..1480b4e 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -2,7 +2,6 @@ import argparse import requests -import hashlib from urllib.parse import urlparse, urljoin import urllib.robotparser import os @@ -10,7 +9,7 @@ from time import sleep from bs4 import BeautifulSoup from sqlalchemy import create_engine from config import DATABASE_URI -from models import Base, Documents, Document_Tokens +from models import Base, Documents from sqlalchemy.orm import sessionmaker import datetime import yt_dlp as youtube_dl @@ -23,7 +22,7 @@ Session = sessionmaker(bind=engine) excluded_domains = ['amazon.', 'news.ycombinator.', 'facebook.com', 'amzn', 'fb.com'] -excluded_filetypes = [".jpg", ".xml", ".mp4", +excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db", ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"] @@ -33,6 +32,7 @@ def get_html(url: str) -> str: def parse_youtube(video_url: str) -> bool: + return # Language preference for subtitles (set to None for auto-generated) # Change this to 'en' for English subtitles, or None for auto-generated subtitle_language = 'en' @@ -91,6 +91,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo for domain in excluded_domains: if domain in url: return + if any(ext in url for ext in excluded_filetypes): + return if "youtube.com" in url: parse_youtube(url) return @@ -110,8 +112,6 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo return soup = BeautifulSoup(html, 'html.parser') - hash = hashlib.sha256() - hash.update(url.encode('ascii')) s = Session() existing_website = s.query(Documents).filter_by(url=url).first() @@ -151,13 +151,25 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo parse_html(link, link_html, r, traversed_links) except: pass - elif link not in traversed_links: - with open('data/links.txt', 'r+') as linksfile: - while line := linksfile.readline(): - if line.strip() == link.strip(): - found = True - if not found: - linksfile.write(f'{link}\n') +# elif link not in traversed_links: +# with open('data/links.txt', 'r+') as linksfile: +# while line := linksfile.readline(): +# if line.strip() == link.strip(): +# found = True +# if not found: +# linksfile.write(f'{link}\n') + + +def parse_site_map(base_url): + map = BeautifulSoup(requests.get(base_url).content, 'xml') + print(map.find_all('loc')) + for loc in map.find_all('loc'): + if "xml" in 
loc.contents[0]: + parse_site_map(loc.contents[0]) + else: + url = loc.contents[0] + html = get_html(url) + parse_html(url, html, max_recursion) if __name__ == "__main__": @@ -189,11 +201,7 @@ if __name__ == "__main__": print("Robots prevents crawling url: " + args.url) exit(0) if len(rp.site_maps()) > 0: - map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml') - for loc in map.find_all('loc'): - url = loc.contents[0] - html = get_html(url) - parse_html(url, html, max_recursion) + parse_site_map(rp.site_maps()[0]) else: html = get_html(args.url) parse_html(args.url, html, max_recursion) diff --git a/src/search.py b/src/search.py index d643eb2..fd013bc 100755 --- a/src/search.py +++ b/src/search.py @@ -1,14 +1,17 @@ #!/usr/bin/python3 -from sqlalchemy import create_engine, func, and_, or_ +from sqlalchemy import create_engine, func, and_, or_, not_ from config import DATABASE_URI -from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams +from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams from sqlalchemy.orm import sessionmaker from sqlalchemy.sql.expression import distinct import time from flask import Flask +from flask_cors import CORS +from flask import send_from_directory from urllib.parse import unquote -app = Flask(__name__) +app = Flask(__name__, static_url_path='/static/') +CORS(app) engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) @@ -16,7 +19,9 @@ Session = sessionmaker(bind=engine) def split_query(query): - result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []} + query = query.lower() + result = {'ands': [], 'ors': [], 'words': [], + 'ngrams': [], 'exclusions': []} query_words = query.split() i = 0 while i < len(query_words): @@ -39,6 +44,11 @@ def split_query(query): quoted_query[1:len(quoted_query)-2].rstrip()) i += n + 1 continue + elif query_words[i][0] == "-": + excluded_query = query_words[i][1: len(query_words[i])] + result['exclusions'].append(excluded_query) + i += 1 + continue result['ngrams'].append(query_words[i]) i += 1 return result @@ -52,19 +62,26 @@ def search(query): query_words = split_query(unquote(query)) print(query_words) if len(query_words['ands']) > 0: + print('entering ands: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") for a in query_words['ands']: query = session.query(Documents.url, func.count(1)). \ - join(Document_Tokens, Documents.id == Document_Tokens.document_id).\ - join(Tokens, Document_Tokens.token_id == Tokens.id).\ - filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\ - group_by(Documents.url).\ - having(func.count(distinct(Document_Tokens.token_id)) == 2).\ + join(Document_NGrams, Documents.id == Document_NGrams.document_id). \ + join(NGrams, Document_NGrams.ngram_id == NGrams.id). \ + filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\ + group_by(Documents.url). \ + having(func.count(distinct(Document_NGrams.ngram_id)) == 2). 
\ order_by(func.count(1).desc()) + +# limit(100) + print(query) for result in query.all(): if result[0] in results.keys(): results[result[0]] += result[1] else: results[result[0]] = result[1] + print('exiting ands: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") if len(query_words['ngrams']) > 0: print('entering ngrams: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") @@ -83,7 +100,7 @@ def search(query): q = q.filter(or_(*and_conditions)) print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") print(q) - x = q.all() + x = q.limit(100).all() print('query executed: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") print(x) @@ -101,30 +118,11 @@ def search(query): # results[document_ngram.document.url] = 1 print('exiting ngrams: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") - if len(query_words['words']) > 0: - print('entering words: ' + - str((time.time_ns() - start_time) // 1_000_000) + "ms") - q = session.query(Documents.url, func.count(1)) \ - .join(Document_Tokens, Documents.id == Document_Tokens.document_id) \ - .join(Tokens, Document_Tokens.token_id == Tokens.id) \ - .group_by(Documents.url).filter(Tokens.token.in_(query_words['words'])) - - print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") - print(q) - x = q.all() - print('query executed: ' + - str((time.time_ns() - start_time) // 1_000_000) + "ms") - for result in x: - if result[0] in results.keys(): - results[result[0]] += result[1] - else: - results[result[0]] = result[1] - print('exiting words: ' + - str((time.time_ns() - start_time) // 1_000_000) + "ms") print(str((time.time_ns() - start_time) // 1_000_000) + "ms") session.close() - return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10] + return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())] + # @app.route("/search/") # def search(query): @@ -132,17 +130,17 @@ def search(query): # session = Session() # result = {} # query_words = unquote(query).split() -# x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000) +# x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000) # for word in query_words: # word = word.lower() -# matching_token = session.query(Tokens).filter_by(token=word).first() +# matching_ngram = session.query(NGrams).filter_by(ngram=word).first() # -# if matching_token is None: +# if matching_ngram is None: # continue -# for document_token in matching_token.document_tokens: -# if document_token.document.url in result.keys(): -# result[document_token.document.url] += 1 +# for document_ngram in matching_ngram.document_ngrams: +# if document_ngram.document.url in result.keys(): +# result[document_ngram.document.url] += 1 # else: -# result[document_token.document.url] = 1 +# result[document_ngram.document.url] = 1 # print(str((time.time_ns() - start_time) // 1_000_000) + "ms") # return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10] diff --git a/todo b/todo index 2f5f3e5..ddda3bd 100644 --- a/todo +++ b/todo @@ -6,6 +6,6 @@ [x] Add clustered index to document_ngrams table model [x] Add clustered index to document_tokens table model [ ] Add ddl command to create partition tables -[ ] Investigate whether or not robots.txt is as aggressive as I'm making ito ut to be -[ ] Instead of starting from a random page on the site, go to root and find site map and crawl that +[x] Investigate whether or not robots.txt is as aggressive as I'm making ito ut to be +[x] Instead of starting 
from a random page on the site, go to root and find site map and crawl that
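
Note on the models.py index hunk: the new __table_args__ on Document_Tokens and Document_NGrams declare two-column, unique indexes with postgresql_using='hash'. PostgreSQL hash indexes are single-column only and cannot enforce uniqueness, so those CREATE INDEX statements will be rejected when the metadata is created; PostgreSQL also has no self-maintaining clustered indexes (CLUSTER is a one-off physical reorder). A minimal sketch of the same composite, unique lookup with the default btree access method follows; the model name and plain UUID columns are stand-ins, not the real models.py definitions.

    import uuid
    from sqlalchemy import Column, Index
    from sqlalchemy.dialects.postgresql import UUID
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()

    class DocumentTokensSketch(Base):            # stand-in for Document_Tokens
        __tablename__ = 'document_tokens_sketch'
        id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
        document_id = Column(UUID(as_uuid=True))
        token_id = Column(UUID(as_uuid=True))
        __table_args__ = (
            # btree (the default) supports composite and unique indexes;
            # a single two-column unique index also covers plain lookups on
            # document_id, so a separate non-unique index is not needed.
            Index('idx_document_tokens_document_id_token_id',
                  'document_id', 'token_id', unique=True),
        )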
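
Note on patch 12 (zip_ngrams): deriving the temp table name from time.time_ns() plus a random integer shrinks the collision window but does not close it for concurrent indexers. A uuid4-based suffix is collision-free in practice; the sketch below reuses the column layout from src/index.py and imports Base from models as the other modules do.

    import uuid
    from sqlalchemy import Table, Column, String, Integer
    from sqlalchemy.dialects.postgresql import UUID
    from models import Base  # declarative Base from src/models.py

    # One name per call, no dependence on clock resolution or RNG overlap.
    temptbl_name = 'temp_del_{}'.format(uuid.uuid4().hex)
    temptbl = Table(temptbl_name, Base.metadata,
                    Column('id', UUID(as_uuid=True), index=True),
                    Column('gram', String, index=True),
                    Column('size', Integer, index=True),
                    extend_existing=True)

A session-scoped temporary table (Table(..., prefixes=['TEMPORARY'])) would be another way to keep the name private to each connection, assuming the rest of zip_ngrams runs on a single connection.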
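
Note on the excluded file types (patches 13 and 15): the substring test any(ext in link ...) also rejects URLs that merely contain an extension somewhere in the path or query string (for example a path segment containing ".db"). Matching against the end of the parsed path is slightly stricter; a small sketch with the same extension list, where has_excluded_extension is a hypothetical helper name:

    from urllib.parse import urlparse

    # Extension list as in src/crawl.py (patch 15).
    excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
                          ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]

    def has_excluded_extension(link: str) -> bool:
        # Only look at the path component, and only at its suffix.
        path = urlparse(link).path.lower()
        return any(path.endswith(ext) for ext in excluded_filetypes)

    # In the crawl loop:  if has_excluded_extension(link): continue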
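
Note on the --crawl-sitemap branch: it calls len(rp.site_maps()) unconditionally, but urllib.robotparser.RobotFileParser.site_maps() returns None when robots.txt declares no Sitemap entries, so a site without one raises TypeError instead of falling back to a normal crawl. A guarded sketch; find_sitemaps is a hypothetical helper, and the commented fallback mirrors the existing else branch in __main__.

    import urllib.robotparser
    from urllib.parse import urlparse

    def find_sitemaps(url: str) -> list:
        rp = urllib.robotparser.RobotFileParser()
        parts = urlparse(url)
        rp.set_url(parts.scheme + "://" + parts.netloc + "/robots.txt")
        rp.read()
        # site_maps() may return None; normalise to an empty list.
        return rp.site_maps() or []

    # sitemaps = find_sitemaps(args.url)
    # if sitemaps:
    #     parse_site_map(sitemaps[0])
    # else:
    #     parse_html(args.url, get_html(args.url), max_recursion)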
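
Note on the rewritten 'ands' branch in search(): it assumes exactly two comma-separated terms, slicing a.split(',')[0] and [1] and hard-coding having(... == 2). Counting distinct matched grams against the number of requested terms handles any length; the sketch below assumes the enclosing search() scope (session, a) and otherwise only uses imports already present in src/search.py.

    from sqlalchemy import func
    from sqlalchemy.sql.expression import distinct
    from models import Documents, Document_NGrams, NGrams

    # Require a document to match every comma-separated term.
    terms = [t.strip() for t in a.split(',') if t.strip()]
    query = session.query(Documents.url, func.count(1)) \
        .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
        .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
        .filter(NGrams.gram.in_(terms)) \
        .group_by(Documents.url) \
        .having(func.count(distinct(Document_NGrams.ngram_id)) == len(terms)) \
        .order_by(func.count(1).desc())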
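
Note on the "-term" exclusions added in patch 15: split_query() collects them into query_words['exclusions'] and search.py now imports not_, but the hunks shown never filter on them. One way to honour exclusions against the same models, assuming the q, session and query_words names from search(); materialising the excluded ids keeps the sketch simple, while a not_() / NOT EXISTS subquery would avoid the extra round trip on a large corpus.

    if query_words['exclusions']:
        # Any document containing an excluded gram is dropped from q.
        excluded_ids = [row[0] for row in
                        session.query(Document_NGrams.document_id)
                               .join(NGrams, Document_NGrams.ngram_id == NGrams.id)
                               .filter(NGrams.gram.in_(query_words['exclusions']))
                               .all()]
        if excluded_ids:
            q = q.filter(~Documents.id.in_(excluded_ids))

Separately, sorted() already returns every result now that the [:10] slice is gone, so the trailing [:len(results.items())] slice is a no-op and could simply be dropped.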