Compare commits


No commits in common. "main" and "postgres" have entirely different histories.

10 changed files with 112 additions and 631 deletions

View file

@@ -1,33 +0,0 @@
html, body {
height: 100%;
}
body {
margin: 0;
}
input {
padding: 7px;
font-size: 1.1rem;
}
.search-container {
display: flex;
justify-content: center;
align-items: center;
text-align: center;
min-height: 25vh;
}
.flex-container {
padding: 0;
margin: 0;
display: flex;
align-items: center;
justify-content: center;
flex-direction: column;
}
.flex-item {
}
.result {
display:block;
max-width: 60vw;
overflow-x: hidden;
}

View file

@@ -1,16 +0,0 @@
<html>
<head>
<link rel="stylesheet" href="css/styles.css">
</head>
<body>
<div class="search-container">
<input type="text" class="searchbox" id="searchbox">
</div>
<div class="flex-container">
<div class="flex-item" id="results">
</div>
</div>
<script src="js/index.js"></script>
</body>
</html>

View file

@@ -1,28 +0,0 @@
function debounce(func, timeout = 300){
    let timer;
    return (...args) => {
        clearTimeout(timer);
        timer = setTimeout(() => { func.apply(this, args); }, timeout);
    };
}

async function search(searchBox){
    const response = await fetch(`http://localhost:5000/search/${searchBox.value}`);
    const results = await response.json();
    const resultView = document.getElementById("results");
    resultView.replaceChildren();
    for (let i = 0; i < results.length; i++){
        let result = results[i];
        let resultElement = document.createElement("a");
        resultElement.innerText = result[0];
        resultElement.href = result[0];
        resultElement.className = "flex-item result";
        resultView.appendChild(resultElement);
    }
}

const searchBoxKeyUp = debounce(() => search())
const searchBox = document.getElementById("searchbox");
searchBox.addEventListener("keyup", debounce(() => search(searchBox)))

View file

@@ -1,211 +1,104 @@
#!/usr/bin/python3
import argparse
import requests
+import hashlib
from urllib.parse import urlparse, urljoin
-import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
-from models import Base, Documents
+from models import Base, Website
from sqlalchemy.orm import sessionmaker
+from sqlalchemy import create_engine
import datetime
-import yt_dlp as youtube_dl

# TODO- Handle gemini/gopher links
+# TODO- Keep a list of traversed links and check before traversing again

engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

-excluded_domains = ['amazon.', 'news.ycombinator.',
-                    'facebook.com', 'amzn', 'fb.com']
-excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
-                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]

def get_html(url: str) -> str:
    response = requests.get(url)
    return response.content

-def parse_youtube(video_url: str) -> bool:
-    return
-    # Language preference for subtitles (set to None for auto-generated)
-    # Change this to 'en' for English subtitles, or None for auto-generated
-    subtitle_language = 'en'
-    # Options for youtube_dl
-    ydl_opts = {
-        'writesubtitles': True,
-        'allsubtitles': True,
-        'skip_download': True, # We only want to fetch metadata
-        'subtitleslangs': [subtitle_language] if subtitle_language else None,
-        'extractor-args': {'youtube': {'player_client': 'ios,web'}},
-    }
-    # Initialize youtube_dl object
-    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
-        # Download metadata
-        info_dict = ydl.extract_info(video_url, download=False)
-        # Extract subtitles
-        subtitles = info_dict.get('subtitles')
-        subtitles_text = ""
-        # Print available subtitles
-        if subtitles:
-            for subs in subtitles.values():
-                for sub in subs:
-                    subtitle_url = sub['url']
-                    with youtube_dl.YoutubeDL({}) as ydl:
-                        subtitle_info = ydl.extract_info(
-                            subtitle_url, download=False)
-                        for subtitle in subtitle_info['subtitles'][subtitle_language]:
-                            if subtitle["ext"] == "srv1":
-                                soup = BeautifulSoup(
-                                    get_html(subtitle["url"]), 'html.parser')
-                                subtitles_text = soup.get_text()
-    s = Session()
-    existing_website = s.query(
-        Documents).filter_by(url=video_url).first()
-    if existing_website is None:
-        website = Documents(
-            url=video_url,
-            text_content=subtitles_text,
-            html_content=None, # soup.prettify(),
-            first_crawl_date=datetime.datetime.now(),
-            last_crawl_date=datetime.datetime.now(),
-            last_index_date=None
-        )
-        s.add(website)
-    else:
-        existing_website.last_crawl_date = datetime.datetime.now()
-        s.add(existing_website)
-    s.commit()
-    s.close()

-def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
-    for domain in excluded_domains:
-        if domain in url:
-            return
-    if any(ext in url for ext in excluded_filetypes):
-        return
-    if "youtube.com" in url:
-        parse_youtube(url)
-        return
-    rp = urllib.robotparser.RobotFileParser()
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
    print(url)
    print(recursion)
    urlparts = urlparse(url)
    baseurl = urlparts.scheme + "://" + urlparts.netloc
-    if baseurl not in robots:
-        rp.set_url(baseurl + "/robots.txt")
-        rp.read()
-        robots[baseurl] = rp
-    else:
-        rp = robots[baseurl]
-    if not rp.can_fetch("*", url):
-        print("Robots prevents crawling url: " + url)
-        return
-    soup = BeautifulSoup(html, 'html.parser')
+    soup = BeautifulSoup(html,'html.parser')
+    hash = hashlib.sha256()
+    hash.update(url.encode('ascii'))
    s = Session()
-    existing_website = s.query(Documents).filter_by(url=url).first()
-    if existing_website is None:
-        website = Documents(
-            url=url,
-            text_content=soup.get_text(),
-            html_content=soup.prettify(),
-            first_crawl_date=datetime.datetime.now(),
-            last_crawl_date=datetime.datetime.now(),
-            last_index_date=None
-        )
+    existing_website = s.query(Website).filter_by(url=url).first()
+    print (existing_website)
+    if existing_website == None:
+        website = Website(
+            url=url,
+            text_content=soup.get_text(),
+            html_content=soup.prettify(),
+            first_crawl_date=datetime.datetime.now(),
+            last_crawl_date = datetime.datetime.now()
+        )
        s.add(website)
    else:
        existing_website.last_crawl_date = datetime.datetime.now()
        s.add(existing_website)
    s.commit()
    s.close()
-    links = soup.find_all("a", href=True)
+    x = open(f'data/links.txt', 'a')
+    x.close()
+    links = soup.find_all("a")
    for link in links:
        found = False
        link = link["href"]
        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
            continue
-        if any(ext in link for ext in excluded_filetypes):
-            continue
-        if "http" not in link:
+        if not "http" in link:
            link = urljoin(url, link)
-        link = link.split('?')[0]
-        link = link.split('#')[0]
        if (recursion > 0 and link not in traversed_links):
            try:
                traversed_links.append(link)
                link_html = get_html(link)
-                r = recursion - 1
-                sleep(0.5)
+                r = recursion -1
+                sleep(1)
                parse_html(link, link_html, r, traversed_links)
            except:
                pass
-        # elif link not in traversed_links:
-        #     with open('data/links.txt', 'r+') as linksfile:
+        # else:
+        #     with open(f'data/links.txt', 'r+') as linksfile:
        #         while line := linksfile.readline():
        #             if line.strip() == link.strip():
        #                 found = True
        #         if not found:
        #             linksfile.write(f'{link}\n')

-def parse_site_map(base_url):
-    map = BeautifulSoup(requests.get(base_url).content, 'xml')
-    print(map.find_all('loc'))
-    for loc in map.find_all('loc'):
-        if "xml" in loc.contents[0]:
-            parse_site_map(loc.contents[0])
-        else:
-            url = loc.contents[0]
-            html = get_html(url)
-            parse_html(url, html, max_recursion)

if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)
    # check inputs
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true")
-    parser.add_argument('-s', "--crawl-sitemap", action="store_true")
-    parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)
+    max_recursion = 4
    args = parser.parse_args()
-    max_recursion = int(args.max_recursion)
-    if args.url == "links":
-        with open('data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if "http" in line:
-                    try:
-                        parse_html(line, get_html(line))
-                    except:
-                        pass
-    elif args.crawl_sitemap:
-        rp = urllib.robotparser.RobotFileParser()
-        urlparts = urlparse(args.url)
-        baseurl = urlparts.scheme + "://" + urlparts.netloc
-        rp.set_url(baseurl + "/robots.txt")
-        rp.read()
-        if not rp.can_fetch("*", args.url):
-            print("Robots prevents crawling url: " + args.url)
-            exit(0)
-        if len(rp.site_maps()) > 0:
-            parse_site_map(rp.site_maps()[0])
-    else:
-        html = get_html(args.url)
-        parse_html(args.url, html, max_recursion)
+    html = get_html(args.url)
+    parse_html(args.url, html, max_recursion)
    # recursion = 0
    # if (args.followlinks):
-    #     os.remove('data/links.txt')
+    #     with open(f'data/links.txt', 'r+') as linksfile:
+    #         while line := linksfile.readline():
+    #             if recursion < max_recursion:
+    #                 if "http" in line:
+    #                     recursion += 1
+    #                     try:
+    #                         parse_html(line, get_html(line))
+    #                     except:
+    #                         pass
+    os.remove('data/links.txt')

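For reference, the robots.txt handling that exists only on the removed side of this file boils down to caching one RobotFileParser per host and consulting it before every fetch. A minimal standalone sketch of that pattern, using only the standard library (the example URL and function name are placeholders, not part of the repo):

# Sketch of the per-host robots.txt cache used by the removed crawler code above.
import urllib.robotparser
from urllib.parse import urlparse

robots_cache = {}  # baseurl -> RobotFileParser, so each host's robots.txt is fetched once

def allowed_to_crawl(url: str, user_agent: str = "*") -> bool:
    parts = urlparse(url)
    baseurl = parts.scheme + "://" + parts.netloc
    if baseurl not in robots_cache:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(baseurl + "/robots.txt")
        rp.read()  # fetches and parses robots.txt
        robots_cache[baseurl] = rp
    return robots_cache[baseurl].can_fetch(user_agent, url)

if __name__ == "__main__":
    print(allowed_to_crawl("https://example.com/page"))  # placeholder URL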
View file

@@ -1,154 +1,54 @@
-#!/usr/bin/python3
-import argparse
-from sqlalchemy import create_engine, or_, text
-from sqlalchemy import Table, Column, String, Integer
+from sqlalchemy import create_engine
from config import DATABASE_URI
-from sqlalchemy.dialects.postgresql import UUID
-from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy.exc import SQLAlchemyError
-import uuid
-import datetime
-import time
-import re
-import random
-from multiprocessing import Pool
-engine = create_engine(DATABASE_URI)
-Base.metadata.create_all(engine)
-Session = sessionmaker(bind=engine)
-# https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html
-def contains_latin(text):
-    latin_pattern = r'[a-zA-ZÀ-ÖØ-öø-ÿ]'
-    return bool(re.search(latin_pattern, text))
-def build_index_chunk(document_chunk):
-    session = Session()
-    print(len(document_chunk))
-    start_time = time.time_ns()
-    for document in document_chunk:
-        print(document.url)
-        content = re.sub(r'[.,?!]', ' ', str(document.text_content))
-        content = re.sub(r'[^\w\s]', '', str(content))
-        content_words = content.split()
-        build_ngrams(1, content_words, document.id)
-        build_ngrams(2, content_words, document.id)
-        build_ngrams(3, content_words, document.id)
-        build_ngrams(4, content_words, document.id)
-        build_ngrams(5, content_words, document.id)
-        document.last_index_date = datetime.datetime.now()
-        session.merge(document)
-        session.commit()
-    session.close()
+from models import Base, Website
+from pathlib import Path
+import argparse
+import os
+import json
+# investigate ngrams for "multi word" matching
+ignored_words = ['a', 'the','is']
+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string

def build_index():
-    while True:
-        session = Session()
-        documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
-            None), Documents.last_index_date < Documents.last_crawl_date)).limit(100)
-        session.close()
-        # Execute the query to get the result set
-        documents = list(documents_query)
-        if len(documents) == 0:
-            return
-        build_index_chunk(documents)
-        continue
-        chunk_size = 10
-        document_chunks = [documents[i:i+chunk_size]
-                           for i in range(0, len(documents), chunk_size)]
-        with Pool() as pool:
-            pool.map(build_index_chunk, document_chunks)
-def zip_ngrams(size: int, corpus, document_id):
-    size = int(size)
-    connection = engine.connect()
-    temptbl_name = 'temp_del_{}'.format(
-        time.time_ns() + random.randint(100000, 9999999))
-    temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column(
-        'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True)
-    try:
-        # Start transaction
-        with connection.begin():
-            temptbl.create(engine)
-            insert_grams = []
-            grams = zip(*[corpus[i:] for i in range(size)])
-            for gram in grams:
-                gram = ' '.join(gram).lower()
-                insert_grams.append(
-                    {"id": uuid.uuid4(), "gram": gram, "size": size})
-            connection.execute(temptbl.insert().values(insert_grams))
-            connection.execute(text("UPDATE " + temptbl_name +
-                                    " SET id = ngrams.id FROM ngrams WHERE ngrams.gram = "
-                                    + temptbl_name + ".gram;"))
-            connection.execute(text("INSERT INTO ngrams (id, gram, size) SELECT " +
-                                    " distinct t.id, t.gram as gram, t.size FROM " +
-                                    temptbl_name + " t LEFT JOIN ngrams on ngrams.gram = " +
-                                    "t.gram WHERE ngrams.id is null and t.size is not null " + " ON CONFLICT DO NOTHING;"))
-            connection.execute(text("INSERT INTO document_ngrams(id, document_id, ngram_id) SELECT DISTINCT " +
-                                    "uuid_generate_v4() , '" + str(document_id) + "'::UUID, t.id FROM " + temptbl_name + " t;"))
-    except SQLAlchemyError as e:
-        # Handle exceptions
-        print("An error occurred:", e)
-        # Rollback transaction
-        connection.rollback()
-    else:
-        # Commit transaction if no exceptions occurred
-        connection.commit()
-    finally:
-        connection.close()
-        # Drop table outside the transaction block
-        temptbl.drop(engine)
-def build_ngrams(size: int, corpus: str, document_id: str):
-    session = Session()
-    zip_ngrams(size, corpus, document_id)
-    return
-    i = 0
-    grams = []
-    while i < len(corpus):
-        if i + size >= len(corpus):
-            i = len(corpus)
-        gram = ''
-        for n in range(0, size):
-            if i + n >= len(corpus):
-                break
-            gram += corpus[i+n] + ' '
-        gram = gram.strip().lower()
-        if len(gram) > 1000 or gram in grams or not contains_latin(gram):
-            i += 1
-            continue
-        grams.append(gram)
-        if (len(gram) > 1):
-            ngram = session.query(NGrams).filter_by(
-                gram=gram).filter_by(size=size).first()
-            if ngram is None:
-                ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
-                session.add(ngram)
-            document_ngram = Document_NGrams(
-                document_id=document_id, ngram_id=ngram.id)
-            session.add(document_ngram)
-            session.commit()
-        i += 1
-    # print(str((time.time_ns() - start_time)//1_000_000))
-    session.close()
+    with open(f"data/index.json", "w") as index:
+        # get a list of all content files
+        # split on whitespace and add to index
+        dictionary = {}
+        pathlist = Path('data/content').rglob('*.txt')
+        for path in pathlist:
+            with open(str(path)) as content_file:
+                url = content_file.readline()
+                content = content_file.read()
+                content_words = content.split()
+                for word in content_words:
+                    word = word.lower()
+                    word = remove_punctuation(word)
+                    if not word in ignored_words:
+                        if not word in dictionary:
+                            dictionary[word] = []
+                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
+                        if len(matching_urls) == 0:
+                            # if not url.strip() in dictionary[word]:
+                            entries = dictionary[word]
+                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
+                            dictionary[word].append(entry)
+                        else:
+                            entries = dictionary[word]
+                            entry = matching_urls[0]
+                            entry["count"] += 1
+                        entries.sort(reverse=True, key=lambda entry: entry["count"])
+        index.write(json.dumps(dictionary))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('-r',
-                        "--rebuild",
-                        action="store_true",
-                        help="Blow away the index and rebuild")
+    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
    args = parser.parse_args()
    if args.rebuild:
        build_index()

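The zip_ngrams helper on the removed side builds its n-grams with the slice-and-zip idiom zip(*[corpus[i:] for i in range(size)]). A small sketch of just that idiom, with a made-up corpus, to show the grams it produces:

# Sketch of the n-gram windowing idiom from zip_ngrams above:
# zipping the corpus against its own shifted slices yields every contiguous window of `size` words.
corpus = "the quick brown fox jumps".split()
size = 3
grams = [' '.join(gram).lower() for gram in zip(*[corpus[i:] for i in range(size)])]
print(grams)  # ['the quick brown', 'quick brown fox', 'brown fox jumps']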
View file

@@ -1,54 +0,0 @@
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
import json

# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the','is']

def remove_punctuation(input_string):
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
    for p in punc:
        input_string = input_string.replace(p, '')
    return input_string

def build_index():
    with open("data/index.json", "w") as index:
        # get a list of all content files
        # split on whitespace and add to index
        dictionary = {}
        pathlist = Path('data/content').rglob('*.txt')
        for path in pathlist:
            with open(str(path)) as content_file:
                url = content_file.readline()
                content = content_file.read()
                content_words = content.split()
                for word in content_words:
                    word = word.lower()
                    word = remove_punctuation(word)
                    if word not in ignored_words:
                        if word not in dictionary:
                            dictionary[word] = []
                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
                        if len(matching_urls) == 0:
                            # if not url.strip() in dictionary[word]:
                            entries = dictionary[word]
                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                            dictionary[word].append(entry)
                        else:
                            entries = dictionary[word]
                            entry = matching_urls[0]
                            entry["count"] += 1
                        entries.sort(reverse=True, key=lambda entry: entry["count"])
        index.write(json.dumps(dictionary))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
    args = parser.parse_args()
    if args.rebuild:
        build_index()

View file

@@ -1,72 +1,18 @@
from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer
+from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy.dialects.postgresql import UUID
-from sqlalchemy.orm import relationship, mapped_column
import uuid

Base = declarative_base()

-class Documents(Base):
-    __tablename__ = 'documents'
-    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+class Website(Base):
+    __tablename__ = 'websites'
+    id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4)
    url = Column(String)
    text_content = Column(String)
    html_content = Column(String)
    first_crawl_date = Column(DateTime)
    last_crawl_date = Column(DateTime)
-    last_index_date = Column(DateTime)
-    document_tokens = relationship(
-        "Document_Tokens", back_populates="document")
-    document_ngrams = relationship(
-        "Document_NGrams", back_populates="document")
-class Document_Tokens(Base):
-    __tablename__ = 'document_tokens'
-    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    document_id = mapped_column(ForeignKey("documents.id"))
-    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    token_id = mapped_column(ForeignKey("tokens.id"))
-    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    document = relationship(
-        "Documents", back_populates="document_tokens", uselist=False)
-    token = relationship("Tokens", back_populates="document_tokens")
-    __table_args__ = (
-        Index('idx_document_tokens_document_id_token_id', 'document_id',
-              'token_id', unique=True, postgresql_using='hash'),
-        Index('idx_document_tokens_clustered', 'document_id',
-              'token_id', postgresql_using='hash'),
-    )
-class Tokens(Base):
-    __tablename__ = 'tokens'
-    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    token = Column(String, index=True)
-    document_tokens = relationship("Document_Tokens", back_populates="token")
-class NGrams(Base):
-    __tablename__ = 'ngrams'
-    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    size = Column(Integer, index=True)
-    gram = Column(String, index=True)
-    document_ngrams = relationship("Document_NGrams", back_populates="ngram")
-class Document_NGrams(Base):
-    __tablename__ = 'document_ngrams'
-    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    document_id = mapped_column(ForeignKey("documents.id"))
-    ngram_id = mapped_column(ForeignKey("ngrams.id"))
-    document = relationship(
-        "Documents", back_populates="document_ngrams", uselist=False)
-    ngram = relationship("NGrams", back_populates="document_ngrams")
-    __table_args__ = (
-        Index('idx_document_ngrams_document_id_ngram_id', 'document_id',
-              'ngram_id', unique=True, postgresql_using='hash'),
-        Index('idx_document_ngrams_clustered', 'document_id',
-              'ngram_id', postgresql_using='hash'),
-    )

View file

@@ -1,146 +1,30 @@
-#!/usr/bin/python3
-from sqlalchemy import create_engine, func, and_, or_, not_
-from config import DATABASE_URI
-from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy.sql.expression import distinct
-import time
+#!/bin/bash
from flask import Flask
-from flask_cors import CORS
-from flask import send_from_directory
+from flask import Request
+import json
from urllib.parse import unquote
-app = Flask(__name__, static_url_path='/static/')
-CORS(app)
-engine = create_engine(DATABASE_URI)
-Base.metadata.create_all(engine)
-Session = sessionmaker(bind=engine)
-# Todo - Boolean search (AND/OR/NOT/"")
-def split_query(query):
-    query = query.lower()
-    result = {'ands': [], 'ors': [], 'words': [],
-              'ngrams': [], 'exclusions': []}
-    query_words = query.split()
-    i = 0
-    while i < len(query_words):
-        if i + 1 < len(query_words):
-            if query_words[i + 1].lower() == "and":
-                if i + 2 < len(query_words):
-                    result['ands'].append(
-                        query_words[i] + ',' + query_words[i+2])
-                    i = i + 3
-                    continue
-        if query_words[i][0] == '"':
-            n = 0
-            quoted_query = ""
-            while i+n < len(query_words):
-                quoted_query += query_words[i+n] + ' '
-                if query_words[i+n][len(query_words[i+n])-1] == '"':
-                    break
-                n += 1
-            result['ngrams'].append(
-                quoted_query[1:len(quoted_query)-2].rstrip())
-            i += n + 1
-            continue
-        elif query_words[i][0] == "-":
-            excluded_query = query_words[i][1: len(query_words[i])]
-            result['exclusions'].append(excluded_query)
-            i += 1
-            continue
-        result['ngrams'].append(query_words[i])
-        i += 1
-    return result
-@ app.route("/search/<query>")
+app = Flask(__name__)
+## Todo - Boolean search (AND/OR/NOT/"")
+@app.route("/search/<query>")
def search(query):
-    start_time = time.time_ns()
-    session = Session()
-    results = {}
-    query_words = split_query(unquote(query))
-    print(query_words)
-    if len(query_words['ands']) > 0:
-        print('entering ands: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        for a in query_words['ands']:
-            query = session.query(Documents.url, func.count(1)). \
-                join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
-                join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
-                filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
-                group_by(Documents.url). \
-                having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
-                order_by(func.count(1).desc())
-            # limit(100)
-            print(query)
-            for result in query.all():
-                if result[0] in results.keys():
-                    results[result[0]] += result[1]
-                else:
-                    results[result[0]] = result[1]
-        print('exiting ands: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-    if len(query_words['ngrams']) > 0:
-        print('entering ngrams: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        q = session.query(Documents.url, func.count(1)) \
-            .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
-            .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
-            .group_by(Documents.url)
-        conditions = []
-        for ngram in query_words['ngrams']:
-            conditions.append(
-                (NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram))
-            # q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
-        and_conditions = [and_(*condition_pair)
-                          for condition_pair in conditions]
-        q = q.filter(or_(*and_conditions))
-        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        print(q)
-        x = q.limit(100).all()
-        print('query executed: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        print(x)
-        for result in x:
-            if result[0] in results.keys():
-                results[result[0]] += result[1]
-            else:
-                results[result[0]] = result[1]
-        # for y in x:
-        #     print(y)
-        #     for document_ngram in y.document_ngrams:
-        #         if document_ngram.document.url in results.keys():
-        #             results[document_ngram.document.url] += 1
-        #         else:
-        #             results[document_ngram.document.url] = 1
-        print('exiting ngrams: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-    print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
-    session.close()
-    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())]
-# @app.route("/search/<query>")
-# def search(query):
-#     start_time = time.time_ns()
-#     session = Session()
-#     result = {}
-#     query_words = unquote(query).split()
-#     x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
-#     for word in query_words:
-#         word = word.lower()
-#         matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
-#
-#         if matching_ngram is None:
-#             continue
-#         for document_ngram in matching_ngram.document_ngrams:
-#             if document_ngram.document.url in result.keys():
-#                 result[document_ngram.document.url] += 1
-#             else:
-#                 result[document_ngram.document.url] = 1
-#     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
-#     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
+    with open('data/index.json', 'r') as index_json:
+        index = json.load(index_json)
+        query = unquote(query)
+        query_split = query.split()
+        result = []
+        for q in query_split:
+            q = q.lower()
+            if q in index:
+                for item in index[q]:
+                    matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
+                    if len(matching_results) == 0:
+                        result.append(item)
+                    else:
+                        matching_results[0]["count"] += item["count"]
+        return result
+def handle_and():
+    pass

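Both versions of /search merge per-word hit counts by URL; the removed one also sorts by the summed count before returning. A minimal sketch of that merge-and-rank step, with a made-up in-memory index shaped like the word -> postings mapping in data/index.json:

# Sketch of the count-merging done by both /search implementations above
# (index contents and URLs here are illustrative only).
index = {
    "cast": [{"url": "https://example.com/skillet", "count": 4}],
    "iron": [{"url": "https://example.com/skillet", "count": 6},
             {"url": "https://example.com/anvil", "count": 2}],
}

def rank(query: str):
    results = {}  # url -> summed count across the query words
    for word in query.lower().split():
        for posting in index.get(word, []):
            results[posting["url"]] = results.get(posting["url"], 0) + posting["count"]
    return sorted(results.items(), key=lambda item: item[1], reverse=True)

print(rank("cast iron"))  # [('https://example.com/skillet', 10), ('https://example.com/anvil', 2)]

The (url, count) pair shape matches what the removed js/index.js reads when it uses result[0] as both the link text and the href.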
todo
View file

@@ -1,11 +0,0 @@
[x] Refactor website table to generic document table (maybe using URN instead of URL?)
[x] Define tokens table FKed to document table
[x] Refactor index.py to tokenize input into tokens table
[x] Define N-Grams table
[x] Add N-Gram generation to index.py
[x] Add clustered index to document_ngrams table model
[x] Add clustered index to document_tokens table model
[ ] Add ddl command to create partition tables
[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
[x] Instead of starting from a random page on the site, go to root and find site map and crawl that