Compare commits

main..postgres

No commits in common. "main" and "postgres" have entirely different histories.

10 changed files with 112 additions and 631 deletions

View file

@@ -1,33 +0,0 @@
html, body {
    height: 100%;
}
body {
    margin: 0;
}
input {
    padding: 7px;
    font-size: 1.1rem;
}
.search-container {
    display: flex;
    justify-content: center;
    align-items: center;
    text-align: center;
    min-height: 25vh;
}
.flex-container {
    padding: 0;
    margin: 0;
    display: flex;
    align-items: center;
    justify-content: center;
    flex-direction: column;
}
.flex-item {
}
.result {
    display: block;
    max-width: 60vw;
    overflow-x: hidden;
}

View file

@@ -1,16 +0,0 @@
<html>
  <head>
    <link rel="stylesheet" href="css/styles.css">
  </head>
  <body>
    <div class="search-container">
      <input type="text" class="searchbox" id="searchbox">
    </div>
    <div class="flex-container">
      <div class="flex-item" id="results">
      </div>
    </div>
    <script src="js/index.js"></script>
  </body>
</html>

View file

@@ -1,28 +0,0 @@
function debounce(func, timeout = 300) {
    let timer;
    return (...args) => {
        clearTimeout(timer);
        timer = setTimeout(() => { func.apply(this, args); }, timeout);
    };
}

async function search(searchBox) {
    const response = await fetch(`http://localhost:5000/search/${searchBox.value}`);
    const results = await response.json();
    const resultView = document.getElementById("results");
    resultView.replaceChildren();
    for (let i = 0; i < results.length; i++) {
        let result = results[i];
        let resultElement = document.createElement("a");
        resultElement.innerText = result[0];
        resultElement.href = result[0];
        resultElement.className = "flex-item result";
        resultView.appendChild(resultElement);
    }
}

const searchBox = document.getElementById("searchbox");
const searchBoxKeyUp = debounce(() => search(searchBox));
searchBox.addEventListener("keyup", searchBoxKeyUp);

View file

@@ -1,128 +1,49 @@
#!/usr/bin/python3
import argparse
import requests
import hashlib
from urllib.parse import urlparse, urljoin
import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Documents
from models import Base, Website
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import datetime
import yt_dlp as youtube_dl
# TODO- Handle gemini/gopher links
# TODO- Keep a list of traversed links and check before traversing again
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
excluded_domains = ['amazon.', 'news.ycombinator.',
'facebook.com', 'amzn', 'fb.com']
excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
def get_html(url: str) -> str:
response = requests.get(url)
return response.content
def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
def parse_youtube(video_url: str) -> bool:
return
# Language preference for subtitles (set to None for auto-generated)
# Change this to 'en' for English subtitles, or None for auto-generated
subtitle_language = 'en'
# Options for youtube_dl
ydl_opts = {
'writesubtitles': True,
'allsubtitles': True,
'skip_download': True, # We only want to fetch metadata
'subtitleslangs': [subtitle_language] if subtitle_language else None,
'extractor-args': {'youtube': {'player_client': 'ios,web'}},
}
# Initialize youtube_dl object
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
# Download metadata
info_dict = ydl.extract_info(video_url, download=False)
# Extract subtitles
subtitles = info_dict.get('subtitles')
subtitles_text = ""
# Print available subtitles
if subtitles:
for subs in subtitles.values():
for sub in subs:
subtitle_url = sub['url']
with youtube_dl.YoutubeDL({}) as ydl:
subtitle_info = ydl.extract_info(
subtitle_url, download=False)
for subtitle in subtitle_info['subtitles'][subtitle_language]:
if subtitle["ext"] == "srv1":
soup = BeautifulSoup(
get_html(subtitle["url"]), 'html.parser')
subtitles_text = soup.get_text()
s = Session()
existing_website = s.query(
Documents).filter_by(url=video_url).first()
if existing_website is None:
website = Documents(
url=video_url,
text_content=subtitles_text,
html_content=None, # soup.prettify(),
first_crawl_date=datetime.datetime.now(),
last_crawl_date=datetime.datetime.now(),
last_index_date=None
)
s.add(website)
else:
existing_website.last_crawl_date = datetime.datetime.now()
s.add(existing_website)
s.commit()
s.close()
def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
for domain in excluded_domains:
if domain in url:
return
if any(ext in url for ext in excluded_filetypes):
return
if "youtube.com" in url:
parse_youtube(url)
return
rp = urllib.robotparser.RobotFileParser()
print(url)
print(recursion)
urlparts = urlparse(url)
baseurl = urlparts.scheme + "://" + urlparts.netloc
if baseurl not in robots:
rp.set_url(baseurl + "/robots.txt")
rp.read()
robots[baseurl] = rp
else:
rp = robots[baseurl]
if not rp.can_fetch("*", url):
print("Robots prevents crawling url: " + url)
return
soup = BeautifulSoup(html, 'html.parser')
soup = BeautifulSoup(html,'html.parser')
hash = hashlib.sha256()
hash.update(url.encode('ascii'))
s = Session()
existing_website = s.query(Documents).filter_by(url=url).first()
if existing_website is None:
website = Documents(
existing_website = s.query(Website).filter_by(url=url).first()
print (existing_website)
if existing_website == None:
website = Website(
url=url,
text_content=soup.get_text(),
html_content=soup.prettify(),
first_crawl_date=datetime.datetime.now(),
last_crawl_date=datetime.datetime.now(),
last_index_date=None
last_crawl_date = datetime.datetime.now()
)
s.add(website)
else:
@@ -130,82 +51,54 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
s.add(existing_website)
s.commit()
s.close()
links = soup.find_all("a", href=True)
x = open(f'data/links.txt', 'a')
x.close()
links = soup.find_all("a")
for link in links:
found = False
link = link["href"]
if (len(link) > 0 and link[0] == "#") or "localhost" in link:
continue
if any(ext in link for ext in excluded_filetypes):
continue
if "http" not in link:
if not "http" in link:
link = urljoin(url, link)
link = link.split('?')[0]
link = link.split('#')[0]
if (recursion > 0 and link not in traversed_links):
try:
traversed_links.append(link)
link_html = get_html(link)
r = recursion - 1
sleep(0.5)
r = recursion -1
sleep(1)
parse_html(link, link_html, r, traversed_links)
except:
pass
# elif link not in traversed_links:
# with open('data/links.txt', 'r+') as linksfile:
# else:
# with open(f'data/links.txt', 'r+') as linksfile:
# while line := linksfile.readline():
# if line.strip() == link.strip():
# found = True
# if not found:
# linksfile.write(f'{link}\n')
def parse_site_map(base_url):
map = BeautifulSoup(requests.get(base_url).content, 'xml')
print(map.find_all('loc'))
for loc in map.find_all('loc'):
if "xml" in loc.contents[0]:
parse_site_map(loc.contents[0])
else:
url = loc.contents[0]
html = get_html(url)
parse_html(url, html, max_recursion)
if __name__ == "__main__":
os.makedirs("data/content", exist_ok=True)
# check inputs
parser = argparse.ArgumentParser()
parser.add_argument("url", help="URL of the webpage to be crawled")
parser.add_argument('-f', "--followlinks", action="store_true")
parser.add_argument('-s', "--crawl-sitemap", action="store_true")
parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)
max_recursion = 4
args = parser.parse_args()
max_recursion = int(args.max_recursion)
if args.url == "links":
with open('data/links.txt', 'r+') as linksfile:
while line := linksfile.readline():
if "http" in line:
try:
parse_html(line, get_html(line))
except:
pass
elif args.crawl_sitemap:
rp = urllib.robotparser.RobotFileParser()
urlparts = urlparse(args.url)
baseurl = urlparts.scheme + "://" + urlparts.netloc
rp.set_url(baseurl + "/robots.txt")
rp.read()
if not rp.can_fetch("*", args.url):
print("Robots prevents crawling url: " + args.url)
exit(0)
if len(rp.site_maps()) > 0:
parse_site_map(rp.site_maps()[0])
else:
html = get_html(args.url)
parse_html(args.url, html, max_recursion)
# recursion = 0
# if (args.followlinks):
# os.remove('data/links.txt')
# with open(f'data/links.txt', 'r+') as linksfile:
# while line := linksfile.readline():
# if recursion < max_recursion:
# if "http" in line:
# recursion += 1
# try:
# parse_html(line, get_html(line))
# except:
# pass
os.remove('data/links.txt')
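For reference, the per-host robots.txt handling above boils down to caching one RobotFileParser per scheme://netloc and consulting it before every fetch. A minimal standalone sketch of that pattern, assuming only the standard library; the robots dict and the can_crawl helper are illustrative names, not part of this repo:

import urllib.robotparser
from urllib.parse import urlparse

robots = {}  # one parsed robots.txt per scheme://host

def can_crawl(url: str, user_agent: str = "*") -> bool:
    parts = urlparse(url)
    base = parts.scheme + "://" + parts.netloc
    if base not in robots:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(base + "/robots.txt")
        rp.read()  # fetches and parses robots.txt once per host
        robots[base] = rp
    return robots[base].can_fetch(user_agent, url)

# parse_html() above returns early when the equivalent check fails.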

View file

@@ -1,154 +1,54 @@
#!/usr/bin/python3
import argparse
from sqlalchemy import create_engine, or_, text
from sqlalchemy import Table, Column, String, Integer
from sqlalchemy import create_engine
from config import DATABASE_URI
from sqlalchemy.dialects.postgresql import UUID
from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import SQLAlchemyError
import uuid
import datetime
import time
import re
import random
from multiprocessing import Pool
from models import Base, Website
from pathlib import Path
import argparse
import os
import json
# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the','is']
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
# https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html
def contains_latin(text):
latin_pattern = r'[a-zA-ZÀ-ÖØ-öø-ÿ]'
return bool(re.search(latin_pattern, text))
def build_index_chunk(document_chunk):
session = Session()
print(len(document_chunk))
start_time = time.time_ns()
for document in document_chunk:
print(document.url)
content = re.sub(r'[.,?!]', ' ', str(document.text_content))
content = re.sub(r'[^\w\s]', '', str(content))
content_words = content.split()
build_ngrams(1, content_words, document.id)
build_ngrams(2, content_words, document.id)
build_ngrams(3, content_words, document.id)
build_ngrams(4, content_words, document.id)
build_ngrams(5, content_words, document.id)
document.last_index_date = datetime.datetime.now()
session.merge(document)
session.commit()
session.close()
def remove_punctuation(input_string):
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
for p in punc:
input_string = input_string.replace(p, '')
return input_string
def build_index():
while True:
session = Session()
documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
None), Documents.last_index_date < Documents.last_crawl_date)).limit(100)
session.close()
# Execute the query to get the result set
documents = list(documents_query)
if len(documents) == 0:
return
build_index_chunk(documents)
continue
chunk_size = 10
document_chunks = [documents[i:i+chunk_size]
for i in range(0, len(documents), chunk_size)]
with Pool() as pool:
pool.map(build_index_chunk, document_chunks)
def zip_ngrams(size: int, corpus, document_id):
size = int(size)
connection = engine.connect()
temptbl_name = 'temp_del_{}'.format(
time.time_ns() + random.randint(100000, 9999999))
temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column(
'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True)
try:
# Start transaction
with connection.begin():
temptbl.create(engine)
insert_grams = []
grams = zip(*[corpus[i:] for i in range(size)])
for gram in grams:
gram = ' '.join(gram).lower()
insert_grams.append(
{"id": uuid.uuid4(), "gram": gram, "size": size})
connection.execute(temptbl.insert().values(insert_grams))
connection.execute(text("UPDATE " + temptbl_name +
" SET id = ngrams.id FROM ngrams WHERE ngrams.gram = "
+ temptbl_name + ".gram;"))
connection.execute(text("INSERT INTO ngrams (id, gram, size) SELECT " +
" distinct t.id, t.gram as gram, t.size FROM " +
temptbl_name + " t LEFT JOIN ngrams on ngrams.gram = " +
"t.gram WHERE ngrams.id is null and t.size is not null " + " ON CONFLICT DO NOTHING;"))
connection.execute(text("INSERT INTO document_ngrams(id, document_id, ngram_id) SELECT DISTINCT " +
"uuid_generate_v4() , '" + str(document_id) + "'::UUID, t.id FROM " + temptbl_name + " t;"))
except SQLAlchemyError as e:
# Handle exceptions
print("An error occurred:", e)
# Rollback transaction
connection.rollback()
with open(f"data/index.json", "w") as index:
# get a list of all content files
# split on whitespace and add to index
dictionary = {}
pathlist = Path('data/content').rglob('*.txt')
for path in pathlist:
with open(str(path)) as content_file:
url = content_file.readline()
content = content_file.read()
content_words = content.split()
for word in content_words:
word = word.lower()
word = remove_punctuation(word)
if not word in ignored_words:
if not word in dictionary:
dictionary[word] = []
matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
if len(matching_urls) == 0:
# if not url.strip() in dictionary[word]:
entries = dictionary[word]
entry = {"url": url.strip(), "count": 1, "filename": str(path)}
dictionary[word].append(entry)
else:
# Commit transaction if no exceptions occurred
connection.commit()
finally:
connection.close()
# Drop table outside the transaction block
temptbl.drop(engine)
def build_ngrams(size: int, corpus: str, document_id: str):
session = Session()
zip_ngrams(size, corpus, document_id)
return
i = 0
grams = []
while i < len(corpus):
if i + size >= len(corpus):
i = len(corpus)
gram = ''
for n in range(0, size):
if i + n >= len(corpus):
break
gram += corpus[i+n] + ' '
gram = gram.strip().lower()
if len(gram) > 1000 or gram in grams or not contains_latin(gram):
i += 1
continue
grams.append(gram)
if (len(gram) > 1):
ngram = session.query(NGrams).filter_by(
gram=gram).filter_by(size=size).first()
if ngram is None:
ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
session.add(ngram)
document_ngram = Document_NGrams(
document_id=document_id, ngram_id=ngram.id)
session.add(document_ngram)
session.commit()
i += 1
# print(str((time.time_ns() - start_time)//1_000_000))
session.close()
entries = dictionary[word]
entry = matching_urls[0]
entry["count"] += 1
entries.sort(reverse=True, key=lambda entry: entry["count"])
index.write(json.dumps(dictionary))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-r',
"--rebuild",
action="store_true",
help="Blow away the index and rebuild")
parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
args = parser.parse_args()
if args.rebuild:
build_index()
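A quick worked example of the zip idiom used by zip_ngrams() above to build sliding n-gram windows; the four-word corpus is invented for illustration:

corpus = "the quick brown fox".split()
size = 2
grams = zip(*[corpus[i:] for i in range(size)])
# Each slice is shifted one word further, so zipping them yields sliding windows:
print([' '.join(g).lower() for g in grams])
# ['the quick', 'quick brown', 'brown fox']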

View file

@@ -1,54 +0,0 @@
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
import json

# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the', 'is']

def remove_punctuation(input_string):
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
    for p in punc:
        input_string = input_string.replace(p, '')
    return input_string

def build_index():
    with open("data/index.json", "w") as index:
        # get a list of all content files
        # split on whitespace and add to index
        dictionary = {}
        pathlist = Path('data/content').rglob('*.txt')
        for path in pathlist:
            with open(str(path)) as content_file:
                url = content_file.readline()
                content = content_file.read()
                content_words = content.split()
                for word in content_words:
                    word = word.lower()
                    word = remove_punctuation(word)
                    if word not in ignored_words:
                        if word not in dictionary:
                            dictionary[word] = []
                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
                        if len(matching_urls) == 0:
                            # if not url.strip() in dictionary[word]:
                            entries = dictionary[word]
                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                            dictionary[word].append(entry)
                        else:
                            entries = dictionary[word]
                            entry = matching_urls[0]
                            entry["count"] += 1
                        entries.sort(reverse=True, key=lambda entry: entry["count"])
        index.write(json.dumps(dictionary))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
    args = parser.parse_args()
    if args.rebuild:
        build_index()
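For orientation, the shape of the data/index.json file that this file-based build_index() writes: one list of {url, count, filename} entries per word, kept sorted so the highest-count URL comes first. The words, URLs and counts below are invented:

index = {
    "search": [
        {"url": "https://example.com/blog", "count": 7, "filename": "data/content/abc.txt"},
        {"url": "https://example.com/about", "count": 2, "filename": "data/content/def.txt"},
    ],
    "engine": [
        {"url": "https://example.com/blog", "count": 3, "filename": "data/content/abc.txt"},
    ],
}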

View file

@@ -1,72 +1,18 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import relationship, mapped_column
import uuid
Base = declarative_base()
class Website(Base):
class Documents(Base):
__tablename__ = 'documents'
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
__tablename__ = 'websites'
id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4)
url = Column(String)
text_content = Column(String)
html_content = Column(String)
first_crawl_date = Column(DateTime)
last_crawl_date = Column(DateTime)
last_index_date = Column(DateTime)
document_tokens = relationship(
"Document_Tokens", back_populates="document")
document_ngrams = relationship(
"Document_NGrams", back_populates="document")
class Document_Tokens(Base):
__tablename__ = 'document_tokens'
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
document_id = mapped_column(ForeignKey("documents.id"))
# Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
token_id = mapped_column(ForeignKey("tokens.id"))
# Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
document = relationship(
"Documents", back_populates="document_tokens", uselist=False)
token = relationship("Tokens", back_populates="document_tokens")
__table_args__ = (
Index('idx_document_tokens_document_id_token_id', 'document_id',
'token_id', unique=True, postgresql_using='hash'),
Index('idx_document_tokens_clustered', 'document_id',
'token_id', postgresql_using='hash'),
)
class Tokens(Base):
__tablename__ = 'tokens'
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
token = Column(String, index=True)
document_tokens = relationship("Document_Tokens", back_populates="token")
class NGrams(Base):
__tablename__ = 'ngrams'
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
size = Column(Integer, index=True)
gram = Column(String, index=True)
document_ngrams = relationship("Document_NGrams", back_populates="ngram")
class Document_NGrams(Base):
__tablename__ = 'document_ngrams'
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
document_id = mapped_column(ForeignKey("documents.id"))
ngram_id = mapped_column(ForeignKey("ngrams.id"))
document = relationship(
"Documents", back_populates="document_ngrams", uselist=False)
ngram = relationship("NGrams", back_populates="document_ngrams")
__table_args__ = (
Index('idx_document_ngrams_document_id_ngram_id', 'document_id',
'ngram_id', unique=True, postgresql_using='hash'),
Index('idx_document_ngrams_clustered', 'document_id',
'ngram_id', postgresql_using='hash'),
)
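A rough usage sketch of how the postgres-branch models above hang together: documents and ngrams are linked through the document_ngrams join table. It assumes the tables already exist (the crawler's create_all creates them) and a valid DATABASE_URI in config.py; all values are invented:

import datetime
import uuid
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import Documents, NGrams, Document_NGrams

engine = create_engine(DATABASE_URI)
session = sessionmaker(bind=engine)()

# One crawled page, one unigram, and the document_ngrams row linking them.
doc = Documents(id=uuid.uuid4(), url="https://example.com/",
                text_content="example text", html_content="<html></html>",
                first_crawl_date=datetime.datetime.now(),
                last_crawl_date=datetime.datetime.now())
gram = NGrams(id=uuid.uuid4(), size=1, gram="example")
session.add_all([doc, gram, Document_NGrams(document_id=doc.id, ngram_id=gram.id)])
session.commit()
session.close()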

View file

@@ -1,146 +1,30 @@
#!/usr/bin/python3
from sqlalchemy import create_engine, func, and_, or_, not_
from config import DATABASE_URI
from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql.expression import distinct
import time
#!/bin/bash
from flask import Flask
from flask_cors import CORS
from flask import send_from_directory
from flask import Request
import json
from urllib.parse import unquote
app = Flask(__name__, static_url_path='/static/')
CORS(app)
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
# Todo - Boolean search (AND/OR/NOT/"")
def split_query(query):
query = query.lower()
result = {'ands': [], 'ors': [], 'words': [],
'ngrams': [], 'exclusions': []}
query_words = query.split()
i = 0
while i < len(query_words):
if i + 1 < len(query_words):
if query_words[i + 1].lower() == "and":
if i + 2 < len(query_words):
result['ands'].append(
query_words[i] + ',' + query_words[i+2])
i = i + 3
continue
if query_words[i][0] == '"':
n = 0
quoted_query = ""
while i+n < len(query_words):
quoted_query += query_words[i+n] + ' '
if query_words[i+n][len(query_words[i+n])-1] == '"':
break
n += 1
result['ngrams'].append(
quoted_query[1:len(quoted_query)-2].rstrip())
i += n + 1
continue
elif query_words[i][0] == "-":
excluded_query = query_words[i][1: len(query_words[i])]
result['exclusions'].append(excluded_query)
i += 1
continue
result['ngrams'].append(query_words[i])
i += 1
app = Flask(__name__)
## Todo - Boolean search (AND/OR/NOT/"")
@app.route("/search/<query>")
def search(query):
with open('data/index.json', 'r') as index_json:
index = json.load(index_json)
query = unquote(query)
query_split = query.split()
result = []
for q in query_split:
q = q.lower()
if q in index:
for item in index[q]:
matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
if len(matching_results) == 0:
result.append(item)
else:
matching_results[0]["count"] += item["count"]
return result
def handle_and():
pass
@ app.route("/search/<query>")
def search(query):
start_time = time.time_ns()
session = Session()
results = {}
query_words = split_query(unquote(query))
print(query_words)
if len(query_words['ands']) > 0:
print('entering ands: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
for a in query_words['ands']:
query = session.query(Documents.url, func.count(1)). \
join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
group_by(Documents.url). \
having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
order_by(func.count(1).desc())
# limit(100)
print(query)
for result in query.all():
if result[0] in results.keys():
results[result[0]] += result[1]
else:
results[result[0]] = result[1]
print('exiting ands: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
if len(query_words['ngrams']) > 0:
print('entering ngrams: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
q = session.query(Documents.url, func.count(1)) \
.join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
.join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
.group_by(Documents.url)
conditions = []
for ngram in query_words['ngrams']:
conditions.append(
(NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram))
# q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
and_conditions = [and_(*condition_pair)
for condition_pair in conditions]
q = q.filter(or_(*and_conditions))
print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
print(q)
x = q.limit(100).all()
print('query executed: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
print(x)
for result in x:
if result[0] in results.keys():
results[result[0]] += result[1]
else:
results[result[0]] = result[1]
# for y in x:
# print(y)
# for document_ngram in y.document_ngrams:
# if document_ngram.document.url in results.keys():
# results[document_ngram.document.url] += 1
# else:
# results[document_ngram.document.url] = 1
print('exiting ngrams: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
session.close()
return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())]
# @app.route("/search/<query>")
# def search(query):
# start_time = time.time_ns()
# session = Session()
# result = {}
# query_words = unquote(query).split()
# x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
# for word in query_words:
# word = word.lower()
# matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
#
# if matching_ngram is None:
# continue
# for document_ngram in matching_ngram.document_ngrams:
# if document_ngram.document.url in result.keys():
# result[document_ngram.document.url] += 1
# else:
# result[document_ngram.document.url] = 1
# print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
# return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
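To make the query grammar in split_query() above concrete: an infix "and" pairs its neighbours, a leading quote starts a phrase, and a leading "-" marks an exclusion; everything else lands in 'ngrams'. A hypothetical query and the result dict inferred from the parsing code (an expectation, not verified output):

split_query('foo and bar "hello world" -spam eggs')
# Expected:
# {'ands': ['foo,bar'], 'ors': [], 'words': [],
#  'ngrams': ['hello world', 'eggs'], 'exclusions': ['spam']}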

todo
View file

@@ -1,11 +0,0 @@
[x] Refactor website table to generic document table (maybe using URN instead of URL?)
[x] Define tokens table FKed to document table
[x] Refactor index.py to tokenize input into tokens table
[x] Define N-Grams table
[x] Add N-Gram generation to index.py
[x] Add clustered index to document_ngrams table model
[x] Add clustered index to document_tokens table model
[ ] Add ddl command to create partition tables (see the partitioning sketch after this list)
[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
[x] Instead of starting from a random page on the site, go to root and find site map and crawl that
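The unchecked partition-tables item above is not implemented in either branch; one loosely sketched way it could look with PostgreSQL hash partitioning, run through the same SQLAlchemy engine. The table name document_ngrams_part and the partition count of 8 are arbitrary choices; column names follow the models above:

from sqlalchemy import create_engine, text
from config import DATABASE_URI

engine = create_engine(DATABASE_URI)
with engine.begin() as conn:
    # Partition the join table by hash of document_id so all rows for one document land in one partition.
    conn.execute(text(
        "CREATE TABLE IF NOT EXISTS document_ngrams_part ("
        " id UUID NOT NULL, document_id UUID NOT NULL, ngram_id UUID NOT NULL"
        ") PARTITION BY HASH (document_id);"))
    for i in range(8):
        conn.execute(text(
            f"CREATE TABLE IF NOT EXISTS document_ngrams_part_{i} "
            f"PARTITION OF document_ngrams_part "
            f"FOR VALUES WITH (MODULUS 8, REMAINDER {i});"))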