diff --git a/client/src/css/styles.css b/client/src/css/styles.css
new file mode 100644
index 0000000..37323ab
--- /dev/null
+++ b/client/src/css/styles.css
@@ -0,0 +1,33 @@
+html, body {
+ height: 100%;
+}
+body {
+ margin: 0;
+}
+input {
+ padding: 7px;
+ font-size: 1.1rem;
+}
+.search-container {
+ display: flex;
+ justify-content: center;
+ align-items: center;
+ text-align: center;
+ min-height: 25vh;
+}
+
+.flex-container {
+ padding: 0;
+ margin: 0;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ flex-direction: column;
+}
+.flex-item {
+}
+.result {
+ display:block;
+ max-width: 60vw;
+ overflow-x: hidden;
+}
diff --git a/client/src/index.html b/client/src/index.html
new file mode 100644
index 0000000..a748d6c
--- /dev/null
+++ b/client/src/index.html
@@ -0,0 +1,16 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>Search</title>
+    <link rel="stylesheet" href="css/styles.css">
+</head>
+<body>
+    <div class="search-container">
+        <input type="text" id="searchbox">
+    </div>
+    <div id="results" class="flex-container">
+    </div>
+    <script src="js/index.js"></script>
+</body>
+</html>
diff --git a/client/src/js/index.js b/client/src/js/index.js
new file mode 100644
index 0000000..09b0bb2
--- /dev/null
+++ b/client/src/js/index.js
@@ -0,0 +1,28 @@
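+// Debounce: returns a wrapper that delays calls to func until no new call has arrived for timeout ms.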
+function debounce(func, timeout = 300){
+ let timer;
+ return (...args) => {
+ clearTimeout(timer);
+ timer = setTimeout(() => { func.apply(this, args); }, timeout);
+ };
+}
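+// Fetch results for the current searchbox value and render each URL as a link inside #results.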
+async function search(searchBox){
+    const response = await fetch(`http://localhost:5000/search/${encodeURIComponent(searchBox.value)}`);
+ const results = await response.json();
+
+ const resultView = document.getElementById("results");
+ resultView.replaceChildren();
+ for (let i = 0; i < results.length; i++){
+ let result = results[i];
+ let resultElement = document.createElement("a");
+ resultElement.innerText = result[0];
+ resultElement.href = result[0];
+ resultElement.className = "flex-item result";
+ resultView.appendChild(resultElement);
+ }
+}
+
+const searchBox = document.getElementById("searchbox");
+
+const searchBoxKeyUp = debounce(() => search(searchBox));
+
+searchBox.addEventListener("keyup", searchBoxKeyUp);
diff --git a/src/__pycache__/search.cpython-310.pyc b/src/__pycache__/search.cpython-310.pyc
index c740282..f3e8621 100644
Binary files a/src/__pycache__/search.cpython-310.pyc and b/src/__pycache__/search.cpython-310.pyc differ
diff --git a/src/crawl.py b/src/crawl.py
index bc6470d..1480b4e 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -1,104 +1,211 @@
#!/usr/bin/python3
+
import argparse
import requests
-import hashlib
from urllib.parse import urlparse, urljoin
+import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
-from models import Base, Website
+from models import Base, Documents
from sqlalchemy.orm import sessionmaker
-from sqlalchemy import create_engine
import datetime
+import yt_dlp as youtube_dl
# TODO- Handle gemini/gopher links
-# TODO- Keep a list of traversed links and check before traversing again
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
-def get_html(url: str) -> str:
+excluded_domains = ['amazon.', 'news.ycombinator.',
+ 'facebook.com', 'amzn', 'fb.com']
+excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
+ ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
+
+
+def get_html(url: str) -> str:
response = requests.get(url)
return response.content
-def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
+def parse_youtube(video_url: str) -> bool:
+ return
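+    # NOTE: returning here disables YouTube subtitle parsing; the code below is currently unreachable.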
+ # Language preference for subtitles (set to None for auto-generated)
+ # Change this to 'en' for English subtitles, or None for auto-generated
+ subtitle_language = 'en'
+ # Options for youtube_dl
+ ydl_opts = {
+ 'writesubtitles': True,
+ 'allsubtitles': True,
+ 'skip_download': True, # We only want to fetch metadata
+ 'subtitleslangs': [subtitle_language] if subtitle_language else None,
+ 'extractor-args': {'youtube': {'player_client': 'ios,web'}},
+ }
+
+ # Initialize youtube_dl object
+ with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+ # Download metadata
+ info_dict = ydl.extract_info(video_url, download=False)
+
+ # Extract subtitles
+ subtitles = info_dict.get('subtitles')
+ subtitles_text = ""
+ # Print available subtitles
+ if subtitles:
+ for subs in subtitles.values():
+ for sub in subs:
+ subtitle_url = sub['url']
+ with youtube_dl.YoutubeDL({}) as ydl:
+ subtitle_info = ydl.extract_info(
+ subtitle_url, download=False)
+ for subtitle in subtitle_info['subtitles'][subtitle_language]:
+ if subtitle["ext"] == "srv1":
+ soup = BeautifulSoup(
+ get_html(subtitle["url"]), 'html.parser')
+ subtitles_text = soup.get_text()
+
+ s = Session()
+ existing_website = s.query(
+ Documents).filter_by(url=video_url).first()
+ if existing_website is None:
+ website = Documents(
+ url=video_url,
+ text_content=subtitles_text,
+ html_content=None, # soup.prettify(),
+ first_crawl_date=datetime.datetime.now(),
+ last_crawl_date=datetime.datetime.now(),
+ last_index_date=None
+ )
+ s.add(website)
+ else:
+ existing_website.last_crawl_date = datetime.datetime.now()
+ s.add(existing_website)
+ s.commit()
+ s.close()
+
+
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
+ for domain in excluded_domains:
+ if domain in url:
+ return
+ if any(ext in url for ext in excluded_filetypes):
+ return
+ if "youtube.com" in url:
+ parse_youtube(url)
+ return
+ rp = urllib.robotparser.RobotFileParser()
print(url)
print(recursion)
urlparts = urlparse(url)
baseurl = urlparts.scheme + "://" + urlparts.netloc
- soup = BeautifulSoup(html,'html.parser')
- hash = hashlib.sha256()
- hash.update(url.encode('ascii'))
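+    # Cache one parsed robots.txt per scheme://host so it is only fetched once per site.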
+ if baseurl not in robots:
+ rp.set_url(baseurl + "/robots.txt")
+ rp.read()
+ robots[baseurl] = rp
+ else:
+ rp = robots[baseurl]
+ if not rp.can_fetch("*", url):
+ print("Robots prevents crawling url: " + url)
+ return
+
+ soup = BeautifulSoup(html, 'html.parser')
s = Session()
- existing_website = s.query(Website).filter_by(url=url).first()
- print (existing_website)
- if existing_website == None:
- website = Website(
- url=url,
- text_content=soup.get_text(),
- html_content=soup.prettify(),
- first_crawl_date=datetime.datetime.now(),
- last_crawl_date = datetime.datetime.now()
- )
+ existing_website = s.query(Documents).filter_by(url=url).first()
+ if existing_website is None:
+ website = Documents(
+ url=url,
+ text_content=soup.get_text(),
+ html_content=soup.prettify(),
+ first_crawl_date=datetime.datetime.now(),
+ last_crawl_date=datetime.datetime.now(),
+ last_index_date=None
+ )
s.add(website)
else:
existing_website.last_crawl_date = datetime.datetime.now()
s.add(existing_website)
s.commit()
s.close()
- x = open(f'data/links.txt', 'a')
- x.close()
- links = soup.find_all("a")
+ links = soup.find_all("a", href=True)
for link in links:
found = False
link = link["href"]
if (len(link) > 0 and link[0] == "#") or "localhost" in link:
continue
- if not "http" in link:
+ if any(ext in link for ext in excluded_filetypes):
+ continue
+ if "http" not in link:
link = urljoin(url, link)
+ link = link.split('?')[0]
+ link = link.split('#')[0]
if (recursion > 0 and link not in traversed_links):
try:
traversed_links.append(link)
link_html = get_html(link)
- r = recursion -1
- sleep(1)
+ r = recursion - 1
+ sleep(0.5)
parse_html(link, link_html, r, traversed_links)
except:
pass
-# else:
-# with open(f'data/links.txt', 'r+') as linksfile:
+# elif link not in traversed_links:
+# with open('data/links.txt', 'r+') as linksfile:
# while line := linksfile.readline():
# if line.strip() == link.strip():
# found = True
# if not found:
# linksfile.write(f'{link}\n')
-if __name__ == "__main__":
+def parse_site_map(base_url):
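+    # Sitemap indexes can nest other sitemaps: recurse into nested .xml entries and crawl everything else.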
+ map = BeautifulSoup(requests.get(base_url).content, 'xml')
+ print(map.find_all('loc'))
+ for loc in map.find_all('loc'):
+ if "xml" in loc.contents[0]:
+ parse_site_map(loc.contents[0])
+ else:
+ url = loc.contents[0]
+ html = get_html(url)
+ parse_html(url, html, max_recursion)
+
+
+if __name__ == "__main__":
os.makedirs("data/content", exist_ok=True)
# check inputs
parser = argparse.ArgumentParser()
parser.add_argument("url", help="URL of the webpage to be crawled")
parser.add_argument('-f', "--followlinks", action="store_true")
- max_recursion = 4
+ parser.add_argument('-s', "--crawl-sitemap", action="store_true")
+    parser.add_argument('-r', "--max-recursion", help="Maximum link recursion depth", type=int, default=1)
+
args = parser.parse_args()
- html = get_html(args.url)
- parse_html(args.url, html, max_recursion)
+ max_recursion = int(args.max_recursion)
+ if args.url == "links":
+ with open('data/links.txt', 'r+') as linksfile:
+ while line := linksfile.readline():
+                line = line.strip()
+                if "http" in line:
+                    try:
+                        parse_html(line, get_html(line))
+                    except:
+                        pass
+ elif args.crawl_sitemap:
+ rp = urllib.robotparser.RobotFileParser()
+ urlparts = urlparse(args.url)
+ baseurl = urlparts.scheme + "://" + urlparts.netloc
+ rp.set_url(baseurl + "/robots.txt")
+ rp.read()
+ if not rp.can_fetch("*", args.url):
+ print("Robots prevents crawling url: " + args.url)
+ exit(0)
+        sitemaps = rp.site_maps()
+        if sitemaps:
+            parse_site_map(sitemaps[0])
+ else:
+ html = get_html(args.url)
+ parse_html(args.url, html, max_recursion)
# recursion = 0
# if (args.followlinks):
-# with open(f'data/links.txt', 'r+') as linksfile:
-# while line := linksfile.readline():
-# if recursion < max_recursion:
-# if "http" in line:
-# recursion += 1
-# try:
-# parse_html(line, get_html(line))
-# except:
-# pass
- os.remove('data/links.txt')
+# os.remove('data/links.txt')
diff --git a/src/index.py b/src/index.py
index e04c787..679d312 100644
--- a/src/index.py
+++ b/src/index.py
@@ -1,54 +1,154 @@
-from sqlalchemy import create_engine
-from config import DATABASE_URI
-from models import Base, Website
-from pathlib import Path
-import argparse
-import os
-import json
-# investigate ngrams for "multi word" matching
-ignored_words = ['a', 'the','is']
+#!/usr/bin/python3
-def remove_punctuation(input_string):
- punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
- for p in punc:
- input_string = input_string.replace(p, '')
- return input_string
+import argparse
+from sqlalchemy import create_engine, or_, text
+from sqlalchemy import Table, Column, String, Integer
+from config import DATABASE_URI
+from sqlalchemy.dialects.postgresql import UUID
+from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.exc import SQLAlchemyError
+import uuid
+import datetime
+import time
+import re
+import random
+from multiprocessing import Pool
+
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+# https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html
+
+
+def contains_latin(text):
+ latin_pattern = r'[a-zA-ZÀ-ÖØ-öø-ÿ]'
+ return bool(re.search(latin_pattern, text))
+
+
+def build_index_chunk(document_chunk):
+ session = Session()
+ print(len(document_chunk))
+ start_time = time.time_ns()
+ for document in document_chunk:
+ print(document.url)
+ content = re.sub(r'[.,?!]', ' ', str(document.text_content))
+ content = re.sub(r'[^\w\s]', '', str(content))
+ content_words = content.split()
+ build_ngrams(1, content_words, document.id)
+ build_ngrams(2, content_words, document.id)
+ build_ngrams(3, content_words, document.id)
+ build_ngrams(4, content_words, document.id)
+ build_ngrams(5, content_words, document.id)
+
+ document.last_index_date = datetime.datetime.now()
+ session.merge(document)
+ session.commit()
+ session.close()
def build_index():
- with open(f"data/index.json", "w") as index:
- # get a list of all content files
- # split on whitespace and add to index
- dictionary = {}
- pathlist = Path('data/content').rglob('*.txt')
- for path in pathlist:
- with open(str(path)) as content_file:
- url = content_file.readline()
- content = content_file.read()
- content_words = content.split()
- for word in content_words:
- word = word.lower()
- word = remove_punctuation(word)
- if not word in ignored_words:
- if not word in dictionary:
- dictionary[word] = []
- matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
- if len(matching_urls) == 0:
-# if not url.strip() in dictionary[word]:
- entries = dictionary[word]
- entry = {"url": url.strip(), "count": 1, "filename": str(path)}
- dictionary[word].append(entry)
- else:
- entries = dictionary[word]
- entry = matching_urls[0]
- entry["count"] += 1
- entries.sort(reverse=True, key=lambda entry: entry["count"])
- index.write(json.dumps(dictionary))
+ while True:
+ session = Session()
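+        # Fetch documents that have never been indexed or were re-crawled since their last index pass.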
+ documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
+ None), Documents.last_index_date < Documents.last_crawl_date)).limit(100)
+ session.close()
+
+ # Execute the query to get the result set
+ documents = list(documents_query)
+ if len(documents) == 0:
+ return
+ build_index_chunk(documents)
+ continue
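+        # NOTE: the continue above bypasses the multiprocessing chunking below.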
+ chunk_size = 10
+ document_chunks = [documents[i:i+chunk_size]
+ for i in range(0, len(documents), chunk_size)]
+ with Pool() as pool:
+ pool.map(build_index_chunk, document_chunks)
+
+
+def zip_ngrams(size: int, corpus, document_id):
+ size = int(size)
+ connection = engine.connect()
+ temptbl_name = 'temp_del_{}'.format(
+ time.time_ns() + random.randint(100000, 9999999))
+ temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column(
+ 'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True)
+
+ try:
+ # Start transaction
+ with connection.begin():
+ temptbl.create(engine)
+ insert_grams = []
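+            # Zipping staggered slices of the corpus yields every overlapping n-gram of the requested size.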
+ grams = zip(*[corpus[i:] for i in range(size)])
+ for gram in grams:
+ gram = ' '.join(gram).lower()
+ insert_grams.append(
+ {"id": uuid.uuid4(), "gram": gram, "size": size})
+ connection.execute(temptbl.insert().values(insert_grams))
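+            # Resolve ids for grams that already exist, insert the genuinely new ones, then link them all to the document.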
+ connection.execute(text("UPDATE " + temptbl_name +
+ " SET id = ngrams.id FROM ngrams WHERE ngrams.gram = "
+ + temptbl_name + ".gram;"))
+ connection.execute(text("INSERT INTO ngrams (id, gram, size) SELECT " +
+ " distinct t.id, t.gram as gram, t.size FROM " +
+ temptbl_name + " t LEFT JOIN ngrams on ngrams.gram = " +
+ "t.gram WHERE ngrams.id is null and t.size is not null " + " ON CONFLICT DO NOTHING;"))
+ connection.execute(text("INSERT INTO document_ngrams(id, document_id, ngram_id) SELECT DISTINCT " +
+ "uuid_generate_v4() , '" + str(document_id) + "'::UUID, t.id FROM " + temptbl_name + " t;"))
+ except SQLAlchemyError as e:
+ # Handle exceptions
+ print("An error occurred:", e)
+ # Rollback transaction
+ connection.rollback()
+ else:
+ # Commit transaction if no exceptions occurred
+ connection.commit()
+ finally:
+ connection.close()
+ # Drop table outside the transaction block
+ temptbl.drop(engine)
+
+
+def build_ngrams(size: int, corpus: list, document_id):
+ session = Session()
+ zip_ngrams(size, corpus, document_id)
+ return
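+    # NOTE: the return above bypasses the older per-gram ORM loop below, which is no longer executed.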
+ i = 0
+ grams = []
+ while i < len(corpus):
+ if i + size >= len(corpus):
+ i = len(corpus)
+ gram = ''
+ for n in range(0, size):
+ if i + n >= len(corpus):
+ break
+ gram += corpus[i+n] + ' '
+ gram = gram.strip().lower()
+ if len(gram) > 1000 or gram in grams or not contains_latin(gram):
+ i += 1
+ continue
+ grams.append(gram)
+ if (len(gram) > 1):
+ ngram = session.query(NGrams).filter_by(
+ gram=gram).filter_by(size=size).first()
+ if ngram is None:
+ ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
+ session.add(ngram)
+ document_ngram = Document_NGrams(
+ document_id=document_id, ngram_id=ngram.id)
+ session.add(document_ngram)
+ session.commit()
+ i += 1
+# print(str((time.time_ns() - start_time)//1_000_000))
+ session.close()
+
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+ parser.add_argument('-r',
+ "--rebuild",
+ action="store_true",
+ help="Blow away the index and rebuild")
args = parser.parse_args()
if args.rebuild:
build_index()
-
diff --git a/src/index.py.old b/src/index.py.old
new file mode 100644
index 0000000..6ec8e21
--- /dev/null
+++ b/src/index.py.old
@@ -0,0 +1,54 @@
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
+from pathlib import Path
+import argparse
+import os
+import json
+# investigate ngrams for "multi word" matching
+ignored_words = ['a', 'the','is']
+
+def remove_punctuation(input_string):
+ punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
+ for p in punc:
+ input_string = input_string.replace(p, '')
+ return input_string
+
+
+def build_index():
+ with open("data/index.json", "w") as index:
+ # get a list of all content files
+ # split on whitespace and add to index
+ dictionary = {}
+ pathlist = Path('data/content').rglob('*.txt')
+ for path in pathlist:
+ with open(str(path)) as content_file:
+ url = content_file.readline()
+ content = content_file.read()
+ content_words = content.split()
+ for word in content_words:
+ word = word.lower()
+ word = remove_punctuation(word)
+ if word not in ignored_words:
+ if word not in dictionary:
+ dictionary[word] = []
+ matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
+ if len(matching_urls) == 0:
+# if not url.strip() in dictionary[word]:
+ entries = dictionary[word]
+ entry = {"url": url.strip(), "count": 1, "filename": str(path)}
+ dictionary[word].append(entry)
+ else:
+ entries = dictionary[word]
+ entry = matching_urls[0]
+ entry["count"] += 1
+ entries.sort(reverse=True, key=lambda entry: entry["count"])
+ index.write(json.dumps(dictionary))
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+ args = parser.parse_args()
+ if args.rebuild:
+ build_index()
+
diff --git a/src/models.py b/src/models.py
index ee768d4..50010b6 100644
--- a/src/models.py
+++ b/src/models.py
@@ -1,18 +1,72 @@
from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, Integer, String, DateTime
+from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer
from sqlalchemy.dialects.postgresql import UUID
+from sqlalchemy.orm import relationship, mapped_column
import uuid
Base = declarative_base()
-class Website(Base):
- __tablename__ = 'websites'
- id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4)
+class Documents(Base):
+ __tablename__ = 'documents'
+ id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
url = Column(String)
text_content = Column(String)
html_content = Column(String)
first_crawl_date = Column(DateTime)
last_crawl_date = Column(DateTime)
+ last_index_date = Column(DateTime)
+ document_tokens = relationship(
+ "Document_Tokens", back_populates="document")
+ document_ngrams = relationship(
+ "Document_NGrams", back_populates="document")
+class Document_Tokens(Base):
+ __tablename__ = 'document_tokens'
+ id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+ document_id = mapped_column(ForeignKey("documents.id"))
+ # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+ token_id = mapped_column(ForeignKey("tokens.id"))
+ # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+ document = relationship(
+ "Documents", back_populates="document_tokens", uselist=False)
+ token = relationship("Tokens", back_populates="document_tokens")
+    __table_args__ = (
+        # PostgreSQL hash indexes cannot be unique or span multiple columns, so use the default btree.
+        Index('idx_document_tokens_document_id_token_id', 'document_id',
+              'token_id', unique=True),
+        Index('idx_document_tokens_clustered', 'document_id',
+              'token_id'),
+    )
+
+
+class Tokens(Base):
+ __tablename__ = 'tokens'
+ id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+ token = Column(String, index=True)
+ document_tokens = relationship("Document_Tokens", back_populates="token")
+
+
+class NGrams(Base):
+ __tablename__ = 'ngrams'
+ id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+ size = Column(Integer, index=True)
+ gram = Column(String, index=True)
+ document_ngrams = relationship("Document_NGrams", back_populates="ngram")
+
+
+class Document_NGrams(Base):
+ __tablename__ = 'document_ngrams'
+ id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+ document_id = mapped_column(ForeignKey("documents.id"))
+ ngram_id = mapped_column(ForeignKey("ngrams.id"))
+ document = relationship(
+ "Documents", back_populates="document_ngrams", uselist=False)
+ ngram = relationship("NGrams", back_populates="document_ngrams")
+
+    __table_args__ = (
+        Index('idx_document_ngrams_document_id_ngram_id', 'document_id',
+              'ngram_id', unique=True),
+        Index('idx_document_ngrams_clustered', 'document_id',
+              'ngram_id'),
+    )
diff --git a/src/search.py b/src/search.py
index 17668f9..fd013bc 100755
--- a/src/search.py
+++ b/src/search.py
@@ -1,30 +1,146 @@
-#!/bin/bash
-
+#!/usr/bin/python3
+from sqlalchemy import create_engine, func, and_, or_, not_
+from config import DATABASE_URI
+from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.sql.expression import distinct
+import time
from flask import Flask
-from flask import Request
-import json
+from flask_cors import CORS
+from flask import send_from_directory
from urllib.parse import unquote
-app = Flask(__name__)
-## Todo - Boolean search (AND/OR/NOT/"")
-@app.route("/search/")
+app = Flask(__name__, static_url_path='/static/')
+CORS(app)
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+# Todo - Boolean search (AND/OR/NOT/"")
+
+
+def split_query(query):
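+    # Split the raw query into AND pairs, quoted phrases and single terms (both kept as ngrams), and "-" exclusions.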
+ query = query.lower()
+ result = {'ands': [], 'ors': [], 'words': [],
+ 'ngrams': [], 'exclusions': []}
+ query_words = query.split()
+ i = 0
+ while i < len(query_words):
+ if i + 1 < len(query_words):
+ if query_words[i + 1].lower() == "and":
+ if i + 2 < len(query_words):
+ result['ands'].append(
+ query_words[i] + ',' + query_words[i+2])
+ i = i + 3
+ continue
+ if query_words[i][0] == '"':
+ n = 0
+ quoted_query = ""
+ while i+n < len(query_words):
+ quoted_query += query_words[i+n] + ' '
+ if query_words[i+n][len(query_words[i+n])-1] == '"':
+ break
+ n += 1
+ result['ngrams'].append(
+ quoted_query[1:len(quoted_query)-2].rstrip())
+ i += n + 1
+ continue
+ elif query_words[i][0] == "-":
+ excluded_query = query_words[i][1: len(query_words[i])]
+ result['exclusions'].append(excluded_query)
+ i += 1
+ continue
+ result['ngrams'].append(query_words[i])
+ i += 1
+ return result
+
+
+@ app.route("/search/")
def search(query):
- with open('data/index.json', 'r') as index_json:
- index = json.load(index_json)
- query = unquote(query)
- query_split = query.split()
- result = []
- for q in query_split:
- q = q.lower()
- if q in index:
- for item in index[q]:
- matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
- if len(matching_results) == 0:
- result.append(item)
- else:
- matching_results[0]["count"] += item["count"]
- return result
+ start_time = time.time_ns()
+ session = Session()
+ results = {}
+ query_words = split_query(unquote(query))
+ print(query_words)
+ if len(query_words['ands']) > 0:
+ print('entering ands: ' +
+ str((time.time_ns() - start_time) // 1_000_000) + "ms")
+ for a in query_words['ands']:
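+        # A document matches an AND pair only if it contains both grams (two distinct matching ngram ids).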
+ query = session.query(Documents.url, func.count(1)). \
+ join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
+ join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
+ filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
+ group_by(Documents.url). \
+ having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
+ order_by(func.count(1).desc())
-def handle_and():
- pass
+# limit(100)
+ print(query)
+ for result in query.all():
+ if result[0] in results.keys():
+ results[result[0]] += result[1]
+ else:
+ results[result[0]] = result[1]
+ print('exiting ands: ' +
+ str((time.time_ns() - start_time) // 1_000_000) + "ms")
+ if len(query_words['ngrams']) > 0:
+ print('entering ngrams: ' +
+ str((time.time_ns() - start_time) // 1_000_000) + "ms")
+ q = session.query(Documents.url, func.count(1)) \
+ .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
+ .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
+ .group_by(Documents.url)
+ conditions = []
+ for ngram in query_words['ngrams']:
+ conditions.append(
+ (NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram))
+# q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
+ and_conditions = [and_(*condition_pair)
+ for condition_pair in conditions]
+ q = q.filter(or_(*and_conditions))
+ print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
+ print(q)
+ x = q.limit(100).all()
+ print('query executed: ' +
+ str((time.time_ns() - start_time) // 1_000_000) + "ms")
+ print(x)
+ for result in x:
+ if result[0] in results.keys():
+ results[result[0]] += result[1]
+ else:
+ results[result[0]] = result[1]
+# for y in x:
+# print(y)
+# for document_ngram in y.document_ngrams:
+# if document_ngram.document.url in results.keys():
+# results[document_ngram.document.url] += 1
+# else:
+# results[document_ngram.document.url] = 1
+ print('exiting ngrams: ' +
+ str((time.time_ns() - start_time) // 1_000_000) + "ms")
+
+ print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+ session.close()
+    return sorted(results.items(), key=lambda x: x[1], reverse=True)
+
+
+# @app.route("/search/")
+# def search(query):
+# start_time = time.time_ns()
+# session = Session()
+# result = {}
+# query_words = unquote(query).split()
+# x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
+# for word in query_words:
+# word = word.lower()
+# matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
+#
+# if matching_ngram is None:
+# continue
+# for document_ngram in matching_ngram.document_ngrams:
+# if document_ngram.document.url in result.keys():
+# result[document_ngram.document.url] += 1
+# else:
+# result[document_ngram.document.url] = 1
+# print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+# return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
diff --git a/todo b/todo
new file mode 100644
index 0000000..ddda3bd
--- /dev/null
+++ b/todo
@@ -0,0 +1,11 @@
+[x] Refactor website table to generic document table (maybe using URN instead of URL?)
+[x] Define tokens table FKed to document table
+[x] Refactor index.py to tokenize input into tokens table
+[x] Define N-Grams table
+[x] Add N-Gram generation to index.py
+[x] Add clustered index to document_ngrams table model
+[x] Add clustered index to document_tokens table model
+[ ] Add ddl command to create partition tables
+[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
+[x] Instead of starting from a random page on the site, go to root and find site map and crawl that
+