Compare commits
16 commits
| Author | SHA1 | Date |
|---|---|---|
| | bbba459480 | |
| | 2a99a61dbe | |
| | e3c67b64e6 | |
| | 98efe9d1a2 | |
| | bdb4064acc | |
| | 9f0e7e6b29 | |
| | 9d57f66cd7 | |
| | 343410e62f | |
| | 7ee9d978b2 | |
| | d4bb3fb8dc | |
| | 20d198e559 | |
| | 8605ee6b2c | |
| | aed568d11e | |
| | 8903f7a3e5 | |
| | efe6dea1f5 | |
| | f4ea8ad1d7 | |
10 changed files with 632 additions and 113 deletions
33 client/src/css/styles.css (new file)
@@ -0,0 +1,33 @@
html, body {
    height: 100%;
}
body {
    margin: 0;
}
input {
    padding: 7px;
    font-size: 1.1rem;
}
.search-container {
    display: flex;
    justify-content: center;
    align-items: center;
    text-align: center;
    min-height: 25vh;
}

.flex-container {
    padding: 0;
    margin: 0;
    display: flex;
    align-items: center;
    justify-content: center;
    flex-direction: column;
}
.flex-item {
}
.result {
    display:block;
    max-width: 60vw;
    overflow-x: hidden;
}
16 client/src/index.html (new file)
@@ -0,0 +1,16 @@
<html>

<head>
    <link rel="stylesheet" href="css/styles.css">
</head>
<body>
    <div class="search-container">
        <input type="text" class="searchbox" id="searchbox">
    </div>
    <div class="flex-container">
        <div class="flex-item" id="results">
        </div>
    </div>
    <script src="js/index.js"></script>
</body>
</html>
28 client/src/js/index.js (new file)
@@ -0,0 +1,28 @@
function debounce(func, timeout = 300){
    let timer;
    return (...args) => {
        clearTimeout(timer);
        timer = setTimeout(() => { func.apply(this, args); }, timeout);
    };
}
async function search(searchBox){
    const response = await fetch(`http://localhost:5000/search/${searchBox.value}`);
    const results = await response.json();

    const resultView = document.getElementById("results");
    resultView.replaceChildren();
    for (let i = 0; i < results.length; i++){
        let result = results[i];
        let resultElement = document.createElement("a");
        resultElement.innerText = result[0];
        resultElement.href = result[0];
        resultElement.className = "flex-item result";
        resultView.appendChild(resultElement);
    }
}

const searchBoxKeyUp = debounce(() => search())

const searchBox = document.getElementById("searchbox");

searchBox.addEventListener("keyup", debounce(() => search(searchBox)))
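The client renders each element of the JSON response as a link, using `result[0]` for both the anchor text and the `href`. That implies the backend returns an array of `[url, count]` pairs, which matches what `search()` in src/search.py produces with `sorted(results.items(), ...)`. A minimal sketch of that contract; the URLs and counts below are made-up sample values, not real output:

```python
# Hypothetical sample of the /search/<query> response body the client code expects:
# a JSON array of [url, count] pairs, already sorted by descending count.
sample_response = [
    ["https://example.com/a", 12],  # result[0] -> anchor text and href, result[1] -> match count
    ["https://example.com/b", 7],
]

for result in sample_response:
    url, count = result
    assert isinstance(url, str) and isinstance(count, int)
```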
Binary file not shown.
187 src/crawl.py
@@ -1,49 +1,83 @@
 #!/usr/bin/python3

 import argparse
 import requests
-import hashlib
 from urllib.parse import urlparse, urljoin
+import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Website
+from models import Base, Documents
 from sqlalchemy.orm import sessionmaker
-from sqlalchemy import create_engine
 import datetime
+import yt_dlp as youtube_dl
 # TODO- Handle gemini/gopher links
-# TODO- Keep a list of traversed links and check before traversing again

 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)

-def get_html(url: str) -> str:
+excluded_domains = ['amazon.', 'news.ycombinator.',
+                    'facebook.com', 'amzn', 'fb.com']

+excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
+                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]


+def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content

-def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:

-    print(url)
+def parse_youtube(video_url: str) -> bool:
-    print(recursion)
+    return
-    urlparts = urlparse(url)
+    # Language preference for subtitles (set to None for auto-generated)
-    baseurl = urlparts.scheme + "://" + urlparts.netloc
+    # Change this to 'en' for English subtitles, or None for auto-generated
-    soup = BeautifulSoup(html,'html.parser')
+    subtitle_language = 'en'
-    hash = hashlib.sha256()
+    # Options for youtube_dl
-    hash.update(url.encode('ascii'))
+    ydl_opts = {
+        'writesubtitles': True,
+        'allsubtitles': True,
+        'skip_download': True,  # We only want to fetch metadata
+        'subtitleslangs': [subtitle_language] if subtitle_language else None,
+        'extractor-args': {'youtube': {'player_client': 'ios,web'}},
+    }

+    # Initialize youtube_dl object
+    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+        # Download metadata
+        info_dict = ydl.extract_info(video_url, download=False)

+        # Extract subtitles
+        subtitles = info_dict.get('subtitles')
+        subtitles_text = ""
+        # Print available subtitles
+        if subtitles:
+            for subs in subtitles.values():
+                for sub in subs:
+                    subtitle_url = sub['url']
+                    with youtube_dl.YoutubeDL({}) as ydl:
+                        subtitle_info = ydl.extract_info(
+                            subtitle_url, download=False)
+                    for subtitle in subtitle_info['subtitles'][subtitle_language]:
+                        if subtitle["ext"] == "srv1":
+                            soup = BeautifulSoup(
+                                get_html(subtitle["url"]), 'html.parser')
+                            subtitles_text = soup.get_text()

     s = Session()
-    existing_website = s.query(Website).filter_by(url=url).first()
+    existing_website = s.query(
-    print (existing_website)
+        Documents).filter_by(url=video_url).first()
-    if existing_website == None:
+    if existing_website is None:
-        website = Website(
+        website = Documents(
-            url=url,
+            url=video_url,
-            text_content=soup.get_text(),
+            text_content=subtitles_text,
-            html_content=soup.prettify(),
+            html_content=None,  # soup.prettify(),
             first_crawl_date=datetime.datetime.now(),
-            last_crawl_date = datetime.datetime.now()
+            last_crawl_date=datetime.datetime.now(),
+            last_index_date=None
         )
         s.add(website)
     else:

@@ -51,54 +85,127 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
         s.add(existing_website)
     s.commit()
     s.close()
-    x = open(f'data/links.txt', 'a')
-    x.close()
-    links = soup.find_all("a")

+def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
+    for domain in excluded_domains:
+        if domain in url:
+            return
+    if any(ext in url for ext in excluded_filetypes):
+        return
+    if "youtube.com" in url:
+        parse_youtube(url)
+        return
+    rp = urllib.robotparser.RobotFileParser()
+    print(url)
+    print(recursion)
+    urlparts = urlparse(url)
+    baseurl = urlparts.scheme + "://" + urlparts.netloc
+    if baseurl not in robots:
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        robots[baseurl] = rp
+    else:
+        rp = robots[baseurl]
+    if not rp.can_fetch("*", url):
+        print("Robots prevents crawling url: " + url)
+        return

+    soup = BeautifulSoup(html, 'html.parser')

+    s = Session()
+    existing_website = s.query(Documents).filter_by(url=url).first()
+    if existing_website is None:
+        website = Documents(
+            url=url,
+            text_content=soup.get_text(),
+            html_content=soup.prettify(),
+            first_crawl_date=datetime.datetime.now(),
+            last_crawl_date=datetime.datetime.now(),
+            last_index_date=None
+        )
+        s.add(website)
+    else:
+        existing_website.last_crawl_date = datetime.datetime.now()
+        s.add(existing_website)
+    s.commit()
+    s.close()
+    links = soup.find_all("a", href=True)
     for link in links:
         found = False
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if not "http" in link:
+        if any(ext in link for ext in excluded_filetypes):
+            continue
+        if "http" not in link:
             link = urljoin(url, link)
+        link = link.split('?')[0]
+        link = link.split('#')[0]
         if (recursion > 0 and link not in traversed_links):
             try:
                 traversed_links.append(link)
                 link_html = get_html(link)
                 r = recursion - 1
-                sleep(1)
+                sleep(0.5)
                 parse_html(link, link_html, r, traversed_links)
             except:
                 pass
-        # else:
+        # elif link not in traversed_links:
-        #     with open(f'data/links.txt', 'r+') as linksfile:
+        #     with open('data/links.txt', 'r+') as linksfile:
         #         while line := linksfile.readline():
         #             if line.strip() == link.strip():
         #                 found = True
         #         if not found:
         #             linksfile.write(f'{link}\n')

-if __name__ == "__main__":

+def parse_site_map(base_url):
+    map = BeautifulSoup(requests.get(base_url).content, 'xml')
+    print(map.find_all('loc'))
+    for loc in map.find_all('loc'):
+        if "xml" in loc.contents[0]:
+            parse_site_map(loc.contents[0])
+        else:
+            url = loc.contents[0]
+            html = get_html(url)
+            parse_html(url, html, max_recursion)


+if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
     # check inputs
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-    max_recursion = 4
+    parser.add_argument('-s', "--crawl-sitemap", action="store_true")
+    parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)

     args = parser.parse_args()
+    max_recursion = int(args.max_recursion)
+    if args.url == "links":
+        with open('data/links.txt', 'r+') as linksfile:
+            while line := linksfile.readline():
+                if "http" in line:
+                    try:
+                        parse_html(line, get_html(line))
+                    except:
+                        pass
+    elif args.crawl_sitemap:
+        rp = urllib.robotparser.RobotFileParser()
+        urlparts = urlparse(args.url)
+        baseurl = urlparts.scheme + "://" + urlparts.netloc
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        if not rp.can_fetch("*", args.url):
+            print("Robots prevents crawling url: " + args.url)
+            exit(0)
+        if len(rp.site_maps()) > 0:
+            parse_site_map(rp.site_maps()[0])
+    else:
         html = get_html(args.url)
         parse_html(args.url, html, max_recursion)

     # recursion = 0
     # if (args.followlinks):
-    #     with open(f'data/links.txt', 'r+') as linksfile:
+    #     os.remove('data/links.txt')
-    #         while line := linksfile.readline():
-    #             if recursion < max_recursion:
-    #                 if "http" in line:
-    #                     recursion += 1
-    #                     try:
-    #                         parse_html(line, get_html(line))
-    #                     except:
-    #                         pass
-    os.remove('data/links.txt')
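For reference, the robots.txt handling added to parse_html amounts to one RobotFileParser per site, cached in the `robots` dict so each domain's robots.txt is only fetched once. A minimal standalone sketch of that pattern, assuming a reachable robots.txt at the example domain; the function and cache names here are illustrative, not part of the module:

```python
# Minimal sketch of the per-domain robots.txt cache used in parse_html (illustrative only).
import urllib.robotparser
from urllib.parse import urlparse

robots_cache = {}  # hypothetical name; parse_html threads this through as its `robots` argument

def allowed(url: str) -> bool:
    parts = urlparse(url)
    baseurl = parts.scheme + "://" + parts.netloc
    rp = robots_cache.get(baseurl)
    if rp is None:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(baseurl + "/robots.txt")
        rp.read()                      # fetched once per domain, then reused
        robots_cache[baseurl] = rp
    return rp.can_fetch("*", url)      # same wildcard user agent crawl.py uses

if __name__ == "__main__":
    print(allowed("https://example.com/some/page"))
```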
186 src/index.py
@@ -1,54 +1,154 @@
-from sqlalchemy import create_engine
+#!/usr/bin/python3
-from config import DATABASE_URI
-from models import Base, Website
-from pathlib import Path
-import argparse
-import os
-import json
-# investigate ngrams for "multi word" matching
-ignored_words = ['a', 'the','is']

-def remove_punctuation(input_string):
+import argparse
-    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
+from sqlalchemy import create_engine, or_, text
-    for p in punc:
+from sqlalchemy import Table, Column, String, Integer
-        input_string = input_string.replace(p, '')
+from config import DATABASE_URI
-    return input_string
+from sqlalchemy.dialects.postgresql import UUID
+from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.exc import SQLAlchemyError
+import uuid
+import datetime
+import time
+import re
+import random
+from multiprocessing import Pool

+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+# https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html


+def contains_latin(text):
+    latin_pattern = r'[a-zA-ZÀ-ÖØ-öø-ÿ]'
+    return bool(re.search(latin_pattern, text))


+def build_index_chunk(document_chunk):
+    session = Session()
+    print(len(document_chunk))
+    start_time = time.time_ns()
+    for document in document_chunk:
+        print(document.url)
+        content = re.sub(r'[.,?!]', ' ', str(document.text_content))
+        content = re.sub(r'[^\w\s]', '', str(content))
+        content_words = content.split()
+        build_ngrams(1, content_words, document.id)
+        build_ngrams(2, content_words, document.id)
+        build_ngrams(3, content_words, document.id)
+        build_ngrams(4, content_words, document.id)
+        build_ngrams(5, content_words, document.id)

+        document.last_index_date = datetime.datetime.now()
+        session.merge(document)
+        session.commit()
+    session.close()


 def build_index():
-    with open(f"data/index.json", "w") as index:
+    while True:
-        # get a list of all content files
+        session = Session()
-        # split on whitespace and add to index
+        documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
-        dictionary = {}
+            None), Documents.last_index_date < Documents.last_crawl_date)).limit(100)
-        pathlist = Path('data/content').rglob('*.txt')
+        session.close()
-        for path in pathlist:

-            with open(str(path)) as content_file:
+        # Execute the query to get the result set
-                url = content_file.readline()
+        documents = list(documents_query)
-                content = content_file.read()
+        if len(documents) == 0:
-                content_words = content.split()
+            return
-                for word in content_words:
+        build_index_chunk(documents)
-                    word = word.lower()
+        continue
-                    word = remove_punctuation(word)
+        chunk_size = 10
-                    if not word in ignored_words:
+        document_chunks = [documents[i:i+chunk_size]
-                        if not word in dictionary:
+                           for i in range(0, len(documents), chunk_size)]
-                            dictionary[word] = []
+        with Pool() as pool:
-                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
+            pool.map(build_index_chunk, document_chunks)
-                        if len(matching_urls) == 0:
-                        # if not url.strip() in dictionary[word]:

-                            entries = dictionary[word]
+def zip_ngrams(size: int, corpus, document_id):
-                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
+    size = int(size)
-                            dictionary[word].append(entry)
+    connection = engine.connect()
+    temptbl_name = 'temp_del_{}'.format(
+        time.time_ns() + random.randint(100000, 9999999))
+    temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column(
+        'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True)

+    try:
+        # Start transaction
+        with connection.begin():
+            temptbl.create(engine)
+            insert_grams = []
+            grams = zip(*[corpus[i:] for i in range(size)])
+            for gram in grams:
+                gram = ' '.join(gram).lower()
+                insert_grams.append(
+                    {"id": uuid.uuid4(), "gram": gram, "size": size})
+            connection.execute(temptbl.insert().values(insert_grams))
+            connection.execute(text("UPDATE " + temptbl_name +
+                                    " SET id = ngrams.id FROM ngrams WHERE ngrams.gram = "
+                                    + temptbl_name + ".gram;"))
+            connection.execute(text("INSERT INTO ngrams (id, gram, size) SELECT " +
+                                    " distinct t.id, t.gram as gram, t.size FROM " +
+                                    temptbl_name + " t LEFT JOIN ngrams on ngrams.gram = " +
+                                    "t.gram WHERE ngrams.id is null and t.size is not null " + " ON CONFLICT DO NOTHING;"))
+            connection.execute(text("INSERT INTO document_ngrams(id, document_id, ngram_id) SELECT DISTINCT " +
+                                    "uuid_generate_v4() , '" + str(document_id) + "'::UUID, t.id FROM " + temptbl_name + " t;"))
+    except SQLAlchemyError as e:
+        # Handle exceptions
+        print("An error occurred:", e)
+        # Rollback transaction
+        connection.rollback()
     else:
-                            entries = dictionary[word]
+        # Commit transaction if no exceptions occurred
-                            entry = matching_urls[0]
+        connection.commit()
-                            entry["count"] += 1
+    finally:
-                            entries.sort(reverse=True, key=lambda entry: entry["count"])
+        connection.close()
-        index.write(json.dumps(dictionary))
+        # Drop table outside the transaction block
+        temptbl.drop(engine)


+def build_ngrams(size: int, corpus: str, document_id: str):
+    session = Session()
+    zip_ngrams(size, corpus, document_id)
+    return
+    i = 0
+    grams = []
+    while i < len(corpus):
+        if i + size >= len(corpus):
+            i = len(corpus)
+        gram = ''
+        for n in range(0, size):
+            if i + n >= len(corpus):
+                break
+            gram += corpus[i+n] + ' '
+        gram = gram.strip().lower()
+        if len(gram) > 1000 or gram in grams or not contains_latin(gram):
+            i += 1
+            continue
+        grams.append(gram)
+        if (len(gram) > 1):
+            ngram = session.query(NGrams).filter_by(
+                gram=gram).filter_by(size=size).first()
+            if ngram is None:
+                ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
+                session.add(ngram)
+            document_ngram = Document_NGrams(
+                document_id=document_id, ngram_id=ngram.id)
+            session.add(document_ngram)
+            session.commit()
+        i += 1
+        # print(str((time.time_ns() - start_time)//1_000_000))
+    session.close()


 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    parser.add_argument('-r',
+                        "--rebuild",
+                        action="store_true",
+                        help="Blow away the index and rebuild")
     args = parser.parse_args()
     if args.rebuild:
         build_index()
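The core of the new indexer is the sliding-window expansion in zip_ngrams: `zip(*[corpus[i:] for i in range(size)])` builds `size` progressively shifted copies of the token list and zips them, so each tuple is one n-gram. A small self-contained illustration of just that expression; the corpus below is a made-up sample, not project data:

```python
# Illustration of the sliding-window n-gram trick used in zip_ngrams.
corpus = ["the", "quick", "brown", "fox"]   # made-up sample token list
size = 2                                    # bigrams

grams = zip(*[corpus[i:] for i in range(size)])
print([' '.join(g).lower() for g in grams])
# prints: ['the quick', 'quick brown', 'brown fox']
```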
54 src/index.py.old (new file)
@@ -0,0 +1,54 @@
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
import json
# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the','is']

def remove_punctuation(input_string):
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
    for p in punc:
        input_string = input_string.replace(p, '')
    return input_string


def build_index():
    with open("data/index.json", "w") as index:
        # get a list of all content files
        # split on whitespace and add to index
        dictionary = {}
        pathlist = Path('data/content').rglob('*.txt')
        for path in pathlist:
            with open(str(path)) as content_file:
                url = content_file.readline()
                content = content_file.read()
                content_words = content.split()
                for word in content_words:
                    word = word.lower()
                    word = remove_punctuation(word)
                    if word not in ignored_words:
                        if word not in dictionary:
                            dictionary[word] = []
                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
                        if len(matching_urls) == 0:
                            # if not url.strip() in dictionary[word]:
                            entries = dictionary[word]
                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                            dictionary[word].append(entry)
                        else:
                            entries = dictionary[word]
                            entry = matching_urls[0]
                            entry["count"] += 1
                            entries.sort(reverse=True, key=lambda entry: entry["count"])
        index.write(json.dumps(dictionary))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
    args = parser.parse_args()
    if args.rebuild:
        build_index()
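For context, the retired indexer wrote a single JSON inverted index keyed by word, each entry holding per-URL hit counts. A sketch of the data/index.json shape it produced; the words, URLs, and counts below are invented for illustration:

```python
# Hypothetical example of the index.json structure written by src/index.py.old.
sample_index = {
    "search": [
        {"url": "https://example.com/a", "count": 4, "filename": "data/content/a.txt"},
        {"url": "https://example.com/b", "count": 1, "filename": "data/content/b.txt"},
    ],
}
```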
src/models.py
@@ -1,18 +1,72 @@
 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, Integer, String, DateTime
+from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer
 from sqlalchemy.dialects.postgresql import UUID
+from sqlalchemy.orm import relationship, mapped_column
 import uuid

 Base = declarative_base()

-class Website(Base):

-    __tablename__ = 'websites'
+class Documents(Base):
+    __tablename__ = 'documents'
     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
     url = Column(String)
     text_content = Column(String)
     html_content = Column(String)
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
+    last_index_date = Column(DateTime)
+    document_tokens = relationship(
+        "Document_Tokens", back_populates="document")
+    document_ngrams = relationship(
+        "Document_NGrams", back_populates="document")


+class Document_Tokens(Base):
+    __tablename__ = 'document_tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token_id = mapped_column(ForeignKey("tokens.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document = relationship(
+        "Documents", back_populates="document_tokens", uselist=False)
+    token = relationship("Tokens", back_populates="document_tokens")
+    __table_args__ = (
+        Index('idx_document_tokens_document_id_token_id', 'document_id',
+              'token_id', unique=True, postgresql_using='hash'),
+        Index('idx_document_tokens_clustered', 'document_id',
+              'token_id', postgresql_using='hash'),
+    )


+class Tokens(Base):
+    __tablename__ = 'tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token = Column(String, index=True)
+    document_tokens = relationship("Document_Tokens", back_populates="token")


+class NGrams(Base):
+    __tablename__ = 'ngrams'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    size = Column(Integer, index=True)
+    gram = Column(String, index=True)
+    document_ngrams = relationship("Document_NGrams", back_populates="ngram")


+class Document_NGrams(Base):
+    __tablename__ = 'document_ngrams'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    ngram_id = mapped_column(ForeignKey("ngrams.id"))
+    document = relationship(
+        "Documents", back_populates="document_ngrams", uselist=False)
+    ngram = relationship("NGrams", back_populates="document_ngrams")

+    __table_args__ = (
+        Index('idx_document_ngrams_document_id_ngram_id', 'document_id',
+              'ngram_id', unique=True, postgresql_using='hash'),
+        Index('idx_document_ngrams_clustered', 'document_id',
+              'ngram_id', postgresql_using='hash'),
+    )
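The schema above is plain declarative SQLAlchemy, so it can be created and populated the same way crawl.py does it. A minimal usage sketch, assuming DATABASE_URI in the project-local config module points at a reachable PostgreSQL instance (the UUID columns are PostgreSQL-specific); the URL and content values are sample data:

```python
# Minimal sketch of using the models, mirroring how crawl.py stores a crawled page.
import datetime
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI   # project-local module
from models import Base, Documents

engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)   # creates the documents/tokens/ngrams tables if missing
Session = sessionmaker(bind=engine)

session = Session()
session.add(Documents(url="https://example.com/",          # sample values
                      text_content="example text",
                      html_content="<html></html>",
                      first_crawl_date=datetime.datetime.now(),
                      last_crawl_date=datetime.datetime.now(),
                      last_index_date=None))
session.commit()
session.close()
```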
164 src/search.py
@@ -1,30 +1,146 @@
-#!/bin/bash
+#!/usr/bin/python3
+from sqlalchemy import create_engine, func, and_, or_, not_
+from config import DATABASE_URI
+from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.sql.expression import distinct
+import time
 from flask import Flask
-from flask import Request
+from flask_cors import CORS
-import json
+from flask import send_from_directory
 from urllib.parse import unquote

-app = Flask(__name__)
+app = Flask(__name__, static_url_path='/static/')
-## Todo - Boolean search (AND/OR/NOT/"")
+CORS(app)
-@app.route("/search/<query>")
+engine = create_engine(DATABASE_URI)
-def search(query):
+Base.metadata.create_all(engine)
-    with open('data/index.json', 'r') as index_json:
+Session = sessionmaker(bind=engine)
-        index = json.load(index_json)
+# Todo - Boolean search (AND/OR/NOT/"")
-    query = unquote(query)
-    query_split = query.split()

-    result = []
+def split_query(query):
-    for q in query_split:
+    query = query.lower()
-        q = q.lower()
+    result = {'ands': [], 'ors': [], 'words': [],
-        if q in index:
+              'ngrams': [], 'exclusions': []}
-            for item in index[q]:
+    query_words = query.split()
-                matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
+    i = 0
-                if len(matching_results) == 0:
+    while i < len(query_words):
-                    result.append(item)
+        if i + 1 < len(query_words):
-                else:
+            if query_words[i + 1].lower() == "and":
-                    matching_results[0]["count"] += item["count"]
+                if i + 2 < len(query_words):
+                    result['ands'].append(
+                        query_words[i] + ',' + query_words[i+2])
+                    i = i + 3
+                    continue
+        if query_words[i][0] == '"':
+            n = 0
+            quoted_query = ""
+            while i+n < len(query_words):
+                quoted_query += query_words[i+n] + ' '
+                if query_words[i+n][len(query_words[i+n])-1] == '"':
+                    break
+                n += 1
+            result['ngrams'].append(
+                quoted_query[1:len(quoted_query)-2].rstrip())
+            i += n + 1
+            continue
+        elif query_words[i][0] == "-":
+            excluded_query = query_words[i][1: len(query_words[i])]
+            result['exclusions'].append(excluded_query)
+            i += 1
+            continue
+        result['ngrams'].append(query_words[i])
+        i += 1
     return result

-def handle_and():
-    pass

+@ app.route("/search/<query>")
+def search(query):
+    start_time = time.time_ns()
+    session = Session()
+    results = {}
+    query_words = split_query(unquote(query))
+    print(query_words)
+    if len(query_words['ands']) > 0:
+        print('entering ands: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        for a in query_words['ands']:
+            query = session.query(Documents.url, func.count(1)). \
+                join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
+                join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
+                filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
+                group_by(Documents.url). \
+                having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
+                order_by(func.count(1).desc())

+            # limit(100)
+            print(query)
+            for result in query.all():
+                if result[0] in results.keys():
+                    results[result[0]] += result[1]
+                else:
+                    results[result[0]] = result[1]
+        print('exiting ands: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    if len(query_words['ngrams']) > 0:
+        print('entering ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")

+        q = session.query(Documents.url, func.count(1)) \
+            .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
+            .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
+            .group_by(Documents.url)
+        conditions = []
+        for ngram in query_words['ngrams']:
+            conditions.append(
+                (NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram))
+            # q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
+        and_conditions = [and_(*condition_pair)
+                          for condition_pair in conditions]
+        q = q.filter(or_(*and_conditions))
+        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        print(q)
+        x = q.limit(100).all()
+        print('query executed: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        print(x)
+        for result in x:
+            if result[0] in results.keys():
+                results[result[0]] += result[1]
+            else:
+                results[result[0]] = result[1]
+        # for y in x:
+        #     print(y)
+        #     for document_ngram in y.document_ngrams:
+        #         if document_ngram.document.url in results.keys():
+        #             results[document_ngram.document.url] += 1
+        #         else:
+        #             results[document_ngram.document.url] = 1
+        print('exiting ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")

+    print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    session.close()
+    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())]


+# @app.route("/search/<query>")
+# def search(query):
+#     start_time = time.time_ns()
+#     session = Session()
+#     result = {}
+#     query_words = unquote(query).split()
+#     x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
+#     for word in query_words:
+#         word = word.lower()
+#         matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
+#
+#         if matching_ngram is None:
+#             continue
+#         for document_ngram in matching_ngram.document_ngrams:
+#             if document_ngram.document.url in result.keys():
+#                 result[document_ngram.document.url] += 1
+#             else:
+#                 result[document_ngram.document.url] = 1
+#     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+#     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
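To make the query grammar concrete, here is how split_query appears to break up a couple of inputs, traced by hand from the code above; treat the expected values as a sketch of intent rather than tested output. The import assumes src/ is on the path and the database named in config.py is reachable, since the module creates its tables at import time.

```python
# Hand-traced examples of split_query's behaviour (values inferred from the code above).
from search import split_query

print(split_query("apple and banana"))
# expected: {'ands': ['apple,banana'], 'ors': [], 'words': [], 'ngrams': [], 'exclusions': []}

print(split_query('"climate change" -politics'))
# expected: {'ands': [], 'ors': [], 'words': [],
#            'ngrams': ['climate change'], 'exclusions': ['politics']}
```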
11 todo (new file)
@@ -0,0 +1,11 @@
[x] Refactor website table to generic document table (maybe using URN instead of URL?)
[x] Define tokens table FKed to document table
[x] Refactor index.py to tokenize input into tokens table
[x] Define N-Grams table
[x] Add N-Gram generation to index.py
[x] Add clustered index to document_ngrams table model
[x] Add clustered index to document_tokens table model
[ ] Add ddl command to create partition tables
[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
[x] Instead of starting from a random page on the site, go to root and find site map and crawl that