Clean up site map scanning. Return all results instead of 10

Add site map crawl option
Make excluded file types more robust
2024-06-09 21:53:57 +09:30 · 2024-06-08 20:43:05 +09:30 · 2024-06-08 20:24:21 +09:30 · 2024-05-05 19:06:56 +09:30 · 2024-05-04 21:10:46 +09:30 · 2024-04-06 19:34:59 +10:30
10 changed files with 632 additions and 113 deletions
--- a/client/src/css/styles.css
+++ b/client/src/css/styles.css
@ -0,0 +1,33 @@
+html, body {
+    height: 100%;
+}
+body {
+    margin: 0;
+}
+input {
+    padding: 7px;
+    font-size: 1.1rem;
+}
+.search-container {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    text-align: center;
+    min-height: 25vh;
+}
+
+.flex-container {
+    padding: 0;
+    margin: 0;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    flex-direction: column;
+}
+.flex-item {
+}
+.result {
+    display:block;
+    max-width: 60vw;
+    overflow-x: hidden;
+}
--- a/client/src/index.html
+++ b/client/src/index.html
@ -0,0 +1,16 @@
+<html>
+
+	<head>
+		<link rel="stylesheet" href="css/styles.css">
+	</head>
+	<body>
+		<div class="search-container">
+			<input type="text" class="searchbox" id="searchbox">
+		</div>
+		<div class="flex-container">
+			<div class="flex-item" id="results">
+			</div>
+		</div>
+		<script src="js/index.js"></script>
+	</body>
+</html>
--- a/client/src/js/index.js
+++ b/client/src/js/index.js
@ -0,0 +1,28 @@
+function debounce(func, timeout = 300){
+  let timer;
+  return (...args) => {
+    clearTimeout(timer);
+    timer = setTimeout(() => { func.apply(this, args); }, timeout);
+  };
+}
+async function search(searchBox){
+  const response = await fetch(`http://localhost:5000/search/${searchBox.value}`);
+  const results = await response.json();
+  
+  const resultView = document.getElementById("results");
+  resultView.replaceChildren();
+  for (let i = 0; i < results.length; i++){
+    let result = results[i];
+    let resultElement = document.createElement("a");
+    resultElement.innerText = result[0];
+    resultElement.href = result[0];
+    resultElement.className = "flex-item result";
+    resultView.appendChild(resultElement);
+  }
+}
+
+const searchBoxKeyUp = debounce(() => search())
+
+const searchBox = document.getElementById("searchbox");
+
+searchBox.addEventListener("keyup", debounce(() => search(searchBox)))
--- a/src/pycache/search.cpython-310.pyc
+++ b/src/pycache/search.cpython-310.pyc
--- a/src/crawl.py
+++ b/src/crawl.py
@ -1,49 +1,83 @@
 #!/usr/bin/python3
+
 import argparse
 import requests
-import hashlib
 from urllib.parse import urlparse, urljoin
+import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Website
+from models import Base, Documents
 from sqlalchemy.orm import sessionmaker
-from sqlalchemy import create_engine
 import datetime
+import yt_dlp as youtube_dl
 # TODO- Handle gemini/gopher links
-# TODO- Keep a list of traversed links and check before traversing again

 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)

-def get_html(url: str) -> str:
+excluded_domains = ['amazon.', 'news.ycombinator.',
+                    'facebook.com', 'amzn', 'fb.com']

+excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
+                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
+
+
+def get_html(url: str) -> str:
    response = requests.get(url)
    return response.content

-def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:

-    print(url)
-    print(recursion)
-    urlparts = urlparse(url)
-    baseurl = urlparts.scheme + "://" + urlparts.netloc
-    soup = BeautifulSoup(html,'html.parser')
-    hash = hashlib.sha256()
-    hash.update(url.encode('ascii'))
+def parse_youtube(video_url: str) -> bool:
+    return
+    # Language preference for subtitles (set to None for auto-generated)
+    # Change this to 'en' for English subtitles, or None for auto-generated
+    subtitle_language = 'en'
+    # Options for youtube_dl
+    ydl_opts = {
+        'writesubtitles': True,
+        'allsubtitles': True,
+        'skip_download': True,  # We only want to fetch metadata
+        'subtitleslangs': [subtitle_language] if subtitle_language else None,
+        'extractor-args': {'youtube': {'player_client': 'ios,web'}},
+    }
+
+    # Initialize youtube_dl object
+    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+        # Download metadata
+        info_dict = ydl.extract_info(video_url, download=False)
+
+    # Extract subtitles
+    subtitles = info_dict.get('subtitles')
+    subtitles_text = ""
+    # Print available subtitles
+    if subtitles:
+        for subs in subtitles.values():
+            for sub in subs:
+                subtitle_url = sub['url']
+                with youtube_dl.YoutubeDL({}) as ydl:
+                    subtitle_info = ydl.extract_info(
+                        subtitle_url, download=False)
+                    for subtitle in subtitle_info['subtitles'][subtitle_language]:
+                        if subtitle["ext"] == "srv1":
+                            soup = BeautifulSoup(
+                                get_html(subtitle["url"]), 'html.parser')
+                            subtitles_text = soup.get_text()

                            s = Session()
-    existing_website = s.query(Website).filter_by(url=url).first()
-    print (existing_website)
-    if existing_website == None:
-        website = Website(
-                url=url,
-                text_content=soup.get_text(),
-                html_content=soup.prettify(),
+                            existing_website = s.query(
+                                Documents).filter_by(url=video_url).first()
+                            if existing_website is None:
+                                website = Documents(
+                                    url=video_url,
+                                    text_content=subtitles_text,
+                                    html_content=None,  # soup.prettify(),
                                    first_crawl_date=datetime.datetime.now(),
-                last_crawl_date = datetime.datetime.now()
+                                    last_crawl_date=datetime.datetime.now(),
+                                    last_index_date=None
                                )
                                s.add(website)
                            else:
@ -51,54 +85,127 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
                                s.add(existing_website)
                            s.commit()
                            s.close()
-    x = open(f'data/links.txt', 'a')
-    x.close()
-    links = soup.find_all("a")
+
+
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
+    for domain in excluded_domains:
+        if domain in url:
+            return
+    if any(ext in url for ext in excluded_filetypes):
+        return
+    if "youtube.com" in url:
+        parse_youtube(url)
+        return
+    rp = urllib.robotparser.RobotFileParser()
+    print(url)
+    print(recursion)
+    urlparts = urlparse(url)
+    baseurl = urlparts.scheme + "://" + urlparts.netloc
+    if baseurl not in robots:
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        robots[baseurl] = rp
+    else:
+        rp = robots[baseurl]
+    if not rp.can_fetch("*", url):
+        print("Robots prevents crawling url: " + url)
+        return
+
+    soup = BeautifulSoup(html, 'html.parser')
+
+    s = Session()
+    existing_website = s.query(Documents).filter_by(url=url).first()
+    if existing_website is None:
+        website = Documents(
+            url=url,
+            text_content=soup.get_text(),
+            html_content=soup.prettify(),
+            first_crawl_date=datetime.datetime.now(),
+            last_crawl_date=datetime.datetime.now(),
+            last_index_date=None
+        )
+        s.add(website)
+    else:
+        existing_website.last_crawl_date = datetime.datetime.now()
+        s.add(existing_website)
+    s.commit()
+    s.close()
+    links = soup.find_all("a", href=True)
    for link in links:
        found = False
        link = link["href"]
        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
            continue
-        if not "http" in link:
+        if any(ext in link for ext in excluded_filetypes):
+            continue
+        if "http" not in link:
            link = urljoin(url, link)
+        link = link.split('?')[0]
+        link = link.split('#')[0]
        if (recursion > 0 and link not in traversed_links):
            try:
                traversed_links.append(link)
                link_html = get_html(link)
                r = recursion - 1
-                sleep(1)
+                sleep(0.5)
                parse_html(link, link_html, r, traversed_links)
            except:
                pass
-#        else:
-#            with open(f'data/links.txt', 'r+') as linksfile:
+#        elif link not in traversed_links:
+#            with open('data/links.txt', 'r+') as linksfile:
 #                while line := linksfile.readline():
 #                    if line.strip() == link.strip():
 #                        found = True
 #                if not found:
 #                    linksfile.write(f'{link}\n')

-if __name__ == "__main__":

+def parse_site_map(base_url):
+    map = BeautifulSoup(requests.get(base_url).content, 'xml')
+    print(map.find_all('loc'))
+    for loc in map.find_all('loc'):
+        if "xml" in loc.contents[0]:
+            parse_site_map(loc.contents[0])
+        else:
+            url = loc.contents[0]
+            html = get_html(url)
+            parse_html(url, html, max_recursion)
+
+
+if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)
    # check inputs
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true")
-    max_recursion = 4
+    parser.add_argument('-s', "--crawl-sitemap", action="store_true")
+    parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)
+
    args = parser.parse_args()
+    max_recursion = int(args.max_recursion)
+    if args.url == "links":
+        with open('data/links.txt', 'r+') as linksfile:
+            while line := linksfile.readline():
+                if "http" in line:
+                    try:
+                        parse_html(line, get_html(line))
+                    except:
+                        pass
+    elif args.crawl_sitemap:
+        rp = urllib.robotparser.RobotFileParser()
+        urlparts = urlparse(args.url)
+        baseurl = urlparts.scheme + "://" + urlparts.netloc
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        if not rp.can_fetch("*", args.url):
+            print("Robots prevents crawling url: " + args.url)
+            exit(0)
+        if len(rp.site_maps()) > 0:
+            parse_site_map(rp.site_maps()[0])
+    else:
        html = get_html(args.url)
        parse_html(args.url, html, max_recursion)

 #    recursion = 0
 #    if (args.followlinks):
-#        with open(f'data/links.txt', 'r+') as linksfile:
-#            while line := linksfile.readline():
-#                if recursion < max_recursion:
-#                    if "http" in line:
-#                        recursion += 1
-#                        try:
-#                            parse_html(line, get_html(line))
-#                        except:
-#                            pass
-    os.remove('data/links.txt')
+#    os.remove('data/links.txt')
--- a/src/index.py
+++ b/src/index.py
@ -1,54 +1,154 @@
-from sqlalchemy import create_engine
-from config import DATABASE_URI
-from models import Base, Website
-from pathlib import Path
-import argparse
-import os
-import json
-# investigate ngrams for "multi word" matching
-ignored_words = ['a', 'the','is']
+#!/usr/bin/python3

-def remove_punctuation(input_string):
-    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
-    for p in punc:
-        input_string = input_string.replace(p, '')
-    return input_string
+import argparse
+from sqlalchemy import create_engine, or_, text
+from sqlalchemy import Table, Column, String, Integer
+from config import DATABASE_URI
+from sqlalchemy.dialects.postgresql import UUID
+from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.exc import SQLAlchemyError
+import uuid
+import datetime
+import time
+import re
+import random
+from multiprocessing import Pool
+
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+# https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html
+
+
+def contains_latin(text):
+    latin_pattern = r'[a-zA-ZÀ-ÖØ-öø-ÿ]'
+    return bool(re.search(latin_pattern, text))
+
+
+def build_index_chunk(document_chunk):
+    session = Session()
+    print(len(document_chunk))
+    start_time = time.time_ns()
+    for document in document_chunk:
+        print(document.url)
+        content = re.sub(r'[.,?!]', ' ', str(document.text_content))
+        content = re.sub(r'[^\w\s]', '', str(content))
+        content_words = content.split()
+        build_ngrams(1, content_words, document.id)
+        build_ngrams(2, content_words, document.id)
+        build_ngrams(3, content_words, document.id)
+        build_ngrams(4, content_words, document.id)
+        build_ngrams(5, content_words, document.id)
+
+        document.last_index_date = datetime.datetime.now()
+        session.merge(document)
+        session.commit()
+    session.close()


 def build_index():
-    with open(f"data/index.json", "w") as index:
-        # get a list of all content files
-        # split on whitespace and add to index
-        dictionary = {}
-        pathlist = Path('data/content').rglob('*.txt')
-        for path in pathlist:
-            with open(str(path)) as content_file:
-                url = content_file.readline()
-                content = content_file.read()
-                content_words = content.split()
-                for word in content_words:
-                    word = word.lower()
-                    word = remove_punctuation(word)
-                    if not word in ignored_words:
-                        if not word in dictionary:
-                            dictionary[word] = []
-                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
-                        if len(matching_urls) == 0:
-#                        if not url.strip() in dictionary[word]:
-                            entries = dictionary[word]
-                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
-                            dictionary[word].append(entry)
+    while True:
+        session = Session()
+        documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
+            None), Documents.last_index_date < Documents.last_crawl_date)).limit(100)
+        session.close()
+
+        # Execute the query to get the result set
+        documents = list(documents_query)
+        if len(documents) == 0:
+            return
+        build_index_chunk(documents)
+        continue
+        chunk_size = 10
+        document_chunks = [documents[i:i+chunk_size]
+                           for i in range(0, len(documents), chunk_size)]
+        with Pool() as pool:
+            pool.map(build_index_chunk, document_chunks)
+
+
+def zip_ngrams(size: int, corpus, document_id):
+    size = int(size)
+    connection = engine.connect()
+    temptbl_name = 'temp_del_{}'.format(
+        time.time_ns() + random.randint(100000, 9999999))
+    temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column(
+        'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True)
+
+    try:
+        # Start transaction
+        with connection.begin():
+            temptbl.create(engine)
+            insert_grams = []
+            grams = zip(*[corpus[i:] for i in range(size)])
+            for gram in grams:
+                gram = ' '.join(gram).lower()
+                insert_grams.append(
+                    {"id": uuid.uuid4(), "gram": gram, "size": size})
+            connection.execute(temptbl.insert().values(insert_grams))
+            connection.execute(text("UPDATE " + temptbl_name +
+                                    " SET id = ngrams.id FROM ngrams WHERE ngrams.gram = "
+                                    + temptbl_name + ".gram;"))
+            connection.execute(text("INSERT INTO ngrams (id, gram, size) SELECT " +
+                                    " distinct t.id, t.gram as gram, t.size FROM " +
+                                    temptbl_name + " t LEFT JOIN ngrams on ngrams.gram = " +
+                                    "t.gram WHERE ngrams.id is null and t.size is not null " + " ON CONFLICT DO NOTHING;"))
+            connection.execute(text("INSERT INTO document_ngrams(id, document_id, ngram_id) SELECT DISTINCT " +
+                                    "uuid_generate_v4() , '" + str(document_id) + "'::UUID, t.id FROM " + temptbl_name + " t;"))
+    except SQLAlchemyError as e:
+        # Handle exceptions
+        print("An error occurred:", e)
+        # Rollback transaction
+        connection.rollback()
    else:
-                            entries = dictionary[word]
-                            entry = matching_urls[0]
-                            entry["count"] += 1
-                            entries.sort(reverse=True, key=lambda entry: entry["count"])
-        index.write(json.dumps(dictionary))
+        # Commit transaction if no exceptions occurred
+        connection.commit()
+    finally:
+        connection.close()
+        # Drop table outside the transaction block
+        temptbl.drop(engine)
+
+
+def build_ngrams(size: int, corpus: str, document_id: str):
+    session = Session()
+    zip_ngrams(size, corpus, document_id)
+    return
+    i = 0
+    grams = []
+    while i < len(corpus):
+        if i + size >= len(corpus):
+            i = len(corpus)
+        gram = ''
+        for n in range(0, size):
+            if i + n >= len(corpus):
+                break
+            gram += corpus[i+n] + ' '
+        gram = gram.strip().lower()
+        if len(gram) > 1000 or gram in grams or not contains_latin(gram):
+            i += 1
+            continue
+        grams.append(gram)
+        if (len(gram) > 1):
+            ngram = session.query(NGrams).filter_by(
+                gram=gram).filter_by(size=size).first()
+            if ngram is None:
+                ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
+                session.add(ngram)
+            document_ngram = Document_NGrams(
+                document_id=document_id, ngram_id=ngram.id)
+            session.add(document_ngram)
+            session.commit()
+        i += 1
+#    print(str((time.time_ns() - start_time)//1_000_000))
+    session.close()
+

 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    parser.add_argument('-r',
+                        "--rebuild",
+                        action="store_true",
+                        help="Blow away the index and rebuild")
    args = parser.parse_args()
    if args.rebuild:
        build_index()
-
--- a/src/index.py.old
+++ b/src/index.py.old
@ -0,0 +1,54 @@
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
+from pathlib import Path
+import argparse
+import os
+import json
+# investigate ngrams for "multi word" matching
+ignored_words = ['a', 'the','is']
+
+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
+
+
+def build_index():
+    with open("data/index.json", "w") as index:
+        # get a list of all content files
+        # split on whitespace and add to index
+        dictionary = {}
+        pathlist = Path('data/content').rglob('*.txt')
+        for path in pathlist:
+            with open(str(path)) as content_file:
+                url = content_file.readline()
+                content = content_file.read()
+                content_words = content.split()
+                for word in content_words:
+                    word = word.lower()
+                    word = remove_punctuation(word)
+                    if word not in ignored_words:
+                        if word not in dictionary:
+                            dictionary[word] = []
+                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
+                        if len(matching_urls) == 0:
+#                        if not url.strip() in dictionary[word]:
+                            entries = dictionary[word]
+                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
+                            dictionary[word].append(entry)
+                        else:
+                            entries = dictionary[word]
+                            entry = matching_urls[0]
+                            entry["count"] += 1
+                            entries.sort(reverse=True, key=lambda entry: entry["count"])
+        index.write(json.dumps(dictionary))
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    args = parser.parse_args()
+    if args.rebuild:
+        build_index()
+
--- a/src/models.py
+++ b/src/models.py
@ -1,18 +1,72 @@
 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, Integer, String, DateTime
+from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer
 from sqlalchemy.dialects.postgresql import UUID
+from sqlalchemy.orm import relationship, mapped_column
 import uuid

 Base = declarative_base()

-class Website(Base):

-    __tablename__ = 'websites'
+class Documents(Base):
+    __tablename__ = 'documents'
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    url = Column(String)
    text_content = Column(String)
    html_content = Column(String)
    first_crawl_date = Column(DateTime)
    last_crawl_date = Column(DateTime)
+    last_index_date = Column(DateTime)
+    document_tokens = relationship(
+        "Document_Tokens", back_populates="document")
+    document_ngrams = relationship(
+        "Document_NGrams", back_populates="document")


+class Document_Tokens(Base):
+    __tablename__ = 'document_tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token_id = mapped_column(ForeignKey("tokens.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document = relationship(
+        "Documents", back_populates="document_tokens", uselist=False)
+    token = relationship("Tokens", back_populates="document_tokens")
+    __table_args__ = (
+        Index('idx_document_tokens_document_id_token_id', 'document_id',
+              'token_id', unique=True, postgresql_using='hash'),
+        Index('idx_document_tokens_clustered', 'document_id',
+              'token_id', postgresql_using='hash'),
+    )
+
+
+class Tokens(Base):
+    __tablename__ = 'tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token = Column(String, index=True)
+    document_tokens = relationship("Document_Tokens", back_populates="token")
+
+
+class NGrams(Base):
+    __tablename__ = 'ngrams'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    size = Column(Integer, index=True)
+    gram = Column(String, index=True)
+    document_ngrams = relationship("Document_NGrams", back_populates="ngram")
+
+
+class Document_NGrams(Base):
+    __tablename__ = 'document_ngrams'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    ngram_id = mapped_column(ForeignKey("ngrams.id"))
+    document = relationship(
+        "Documents", back_populates="document_ngrams", uselist=False)
+    ngram = relationship("NGrams", back_populates="document_ngrams")
+
+    __table_args__ = (
+        Index('idx_document_ngrams_document_id_ngram_id', 'document_id',
+              'ngram_id', unique=True, postgresql_using='hash'),
+        Index('idx_document_ngrams_clustered', 'document_id',
+              'ngram_id', postgresql_using='hash'),
+    )
--- a/src/search.py
+++ b/src/search.py
@ -1,30 +1,146 @@
-#!/bin/bash
-
+#!/usr/bin/python3
+from sqlalchemy import create_engine, func, and_, or_, not_
+from config import DATABASE_URI
+from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.sql.expression import distinct
+import time
 from flask import Flask
-from flask import Request
-import json
+from flask_cors import CORS
+from flask import send_from_directory
 from urllib.parse import unquote

-app = Flask(__name__)
-## Todo - Boolean search (AND/OR/NOT/"")
-@app.route("/search/<query>")
-def search(query):
-    with open('data/index.json', 'r') as index_json:
-        index = json.load(index_json)
-        query = unquote(query)
-        query_split = query.split()
-        result = []
-        for q in query_split:
-            q = q.lower()
-            if q in index:
-                for item in index[q]:
-                    matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
-                    if len(matching_results) == 0:
-                        result.append(item)
-                    else:
-                        matching_results[0]["count"] += item["count"]
+app = Flask(__name__, static_url_path='/static/')
+CORS(app)
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+# Todo - Boolean search (AND/OR/NOT/"")
+
+
+def split_query(query):
+    query = query.lower()
+    result = {'ands': [], 'ors': [], 'words': [],
+              'ngrams': [], 'exclusions': []}
+    query_words = query.split()
+    i = 0
+    while i < len(query_words):
+        if i + 1 < len(query_words):
+            if query_words[i + 1].lower() == "and":
+                if i + 2 < len(query_words):
+                    result['ands'].append(
+                        query_words[i] + ',' + query_words[i+2])
+                    i = i + 3
+                    continue
+        if query_words[i][0] == '"':
+            n = 0
+            quoted_query = ""
+            while i+n < len(query_words):
+                quoted_query += query_words[i+n] + ' '
+                if query_words[i+n][len(query_words[i+n])-1] == '"':
+                    break
+                n += 1
+            result['ngrams'].append(
+                quoted_query[1:len(quoted_query)-2].rstrip())
+            i += n + 1
+            continue
+        elif query_words[i][0] == "-":
+            excluded_query = query_words[i][1: len(query_words[i])]
+            result['exclusions'].append(excluded_query)
+            i += 1
+            continue
+        result['ngrams'].append(query_words[i])
+        i += 1
    return result

-def handle_and():
-    pass

+@ app.route("/search/<query>")
+def search(query):
+    start_time = time.time_ns()
+    session = Session()
+    results = {}
+    query_words = split_query(unquote(query))
+    print(query_words)
+    if len(query_words['ands']) > 0:
+        print('entering ands: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        for a in query_words['ands']:
+            query = session.query(Documents.url, func.count(1)). \
+                join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
+                join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
+                filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
+                group_by(Documents.url). \
+                having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
+                order_by(func.count(1).desc())
+
+#                limit(100)
+            print(query)
+            for result in query.all():
+                if result[0] in results.keys():
+                    results[result[0]] += result[1]
+                else:
+                    results[result[0]] = result[1]
+        print('exiting ands: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    if len(query_words['ngrams']) > 0:
+        print('entering ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+
+        q = session.query(Documents.url, func.count(1)) \
+            .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
+            .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
+            .group_by(Documents.url)
+        conditions = []
+        for ngram in query_words['ngrams']:
+            conditions.append(
+                (NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram))
+#            q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
+        and_conditions = [and_(*condition_pair)
+                          for condition_pair in conditions]
+        q = q.filter(or_(*and_conditions))
+        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        print(q)
+        x = q.limit(100).all()
+        print('query executed: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        print(x)
+        for result in x:
+            if result[0] in results.keys():
+                results[result[0]] += result[1]
+            else:
+                results[result[0]] = result[1]
+#        for y in x:
+#            print(y)
+#            for document_ngram in y.document_ngrams:
+#                if document_ngram.document.url in results.keys():
+#                    results[document_ngram.document.url] += 1
+#                else:
+#                    results[document_ngram.document.url] = 1
+        print('exiting ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+
+    print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    session.close()
+    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())]
+
+
+# @app.route("/search/<query>")
+# def search(query):
+#    start_time = time.time_ns()
+#    session = Session()
+#    result = {}
+#    query_words = unquote(query).split()
+# x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
+#    for word in query_words:
+#        word = word.lower()
+#        matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
+#
+#        if matching_ngram is None:
+#            continue
+#        for document_ngram in matching_ngram.document_ngrams:
+#            if document_ngram.document.url in result.keys():
+#                result[document_ngram.document.url] += 1
+#            else:
+#                result[document_ngram.document.url] = 1
+#    print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+#    return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
--- a/11
+++ b/11
@ -0,0 +1,11 @@
+[x] Refactor website table to generic document table (maybe using URN instead of URL?)
+[x] Define tokens table FKed to document table
+[x] Refactor index.py to tokenize input into tokens table
+[x] Define N-Grams table 
+[x] Add N-Gram generation to index.py
+[x] Add clustered index to document_ngrams table model
+[x] Add clustered index to document_tokens table model
+[ ] Add ddl command to create partition tables
+[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
+[x] Instead of starting from a random page on the site, go to root and find site map and crawl that
+
Author	SHA1	Message	Date
rmgr	bbba459480	Clean up site map scanning. Return all results instead of 10	2024-06-09 21:53:57 +09:30
rmgr	2a99a61dbe	Add site map crawl option	2024-06-08 20:43:05 +09:30
rmgr	e3c67b64e6	Make excluded file types more robust	2024-06-08 20:24:21 +09:30
rmgr	98efe9d1a2	Fix temp table being randomly dropped due to name collision. Fix multi-word non-phrase search	2024-05-05 19:06:56 +09:30
rmgr	bdb4064acc	Rework ngram generation. Greatly improve performance of indexer. Commit horrendous sql sins	2024-05-04 21:10:46 +09:30
rmgr	9f0e7e6b29	Indexer and query optimisations	2024-04-06 19:34:59 +10:30
rmgr	9d57f66cd7	Add beginnings of ngram search capability	2024-04-05 21:36:15 +10:30
rmgr	343410e62f	Add first pass youtube subtitle indexer	2024-04-05 06:22:56 +10:30
rmgr	7ee9d978b2	Tidy up crawling and implement boolean search	2024-04-04 20:46:34 +10:30
rmgr	d4bb3fb8dc	Tidy up index.py	2024-03-07 21:12:19 +10:30
rmgr	20d198e559	Refactor to use postgresql end to end	2024-03-07 21:00:11 +10:30
rmgr	8605ee6b2c	Add todo file	2024-03-02 19:58:10 +10:30
rmgr	aed568d11e	Remove beehave.txt note	2024-03-02 19:54:53 +10:30
rmgr	8903f7a3e5	Merge postgres chagnes	2024-03-02 19:53:58 +10:30
rmgr	efe6dea1f5	Fix crawling. Add initial linksfile crawling. Still need to remove records as they are processed.	2024-01-01 20:52:12 +10:30
rmgr	f4ea8ad1d7	Respect robots.txt	2024-01-01 19:53:22 +10:30