diff --git a/client/src/css/styles.css b/client/src/css/styles.css
new file mode 100644
index 0000000..37323ab
--- /dev/null
+++ b/client/src/css/styles.css
@@ -0,0 +1,33 @@
+html, body {
+    height: 100%;
+}
+body {
+    margin: 0;
+}
+input {
+    padding: 7px;
+    font-size: 1.1rem;
+}
+.search-container {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    text-align: center;
+    min-height: 25vh;
+}
+
+.flex-container {
+    padding: 0;
+    margin: 0;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    flex-direction: column;
+}
+.flex-item {
+}
+.result {
+    display: block;
+    max-width: 60vw;
+    overflow-x: hidden;
+}
diff --git a/client/src/index.html b/client/src/index.html
new file mode 100644
index 0000000..a748d6c
--- /dev/null
+++ b/client/src/index.html
@@ -0,0 +1,16 @@
+
+
+
+
+
+
+
+
+
+
+
+
+ + + diff --git a/client/src/js/index.js b/client/src/js/index.js new file mode 100644 index 0000000..09b0bb2 --- /dev/null +++ b/client/src/js/index.js @@ -0,0 +1,28 @@ +function debounce(func, timeout = 300){ + let timer; + return (...args) => { + clearTimeout(timer); + timer = setTimeout(() => { func.apply(this, args); }, timeout); + }; +} +async function search(searchBox){ + const response = await fetch(`http://localhost:5000/search/${searchBox.value}`); + const results = await response.json(); + + const resultView = document.getElementById("results"); + resultView.replaceChildren(); + for (let i = 0; i < results.length; i++){ + let result = results[i]; + let resultElement = document.createElement("a"); + resultElement.innerText = result[0]; + resultElement.href = result[0]; + resultElement.className = "flex-item result"; + resultView.appendChild(resultElement); + } +} + +const searchBoxKeyUp = debounce(() => search()) + +const searchBox = document.getElementById("searchbox"); + +searchBox.addEventListener("keyup", debounce(() => search(searchBox))) diff --git a/src/__pycache__/search.cpython-310.pyc b/src/__pycache__/search.cpython-310.pyc index c740282..f3e8621 100644 Binary files a/src/__pycache__/search.cpython-310.pyc and b/src/__pycache__/search.cpython-310.pyc differ diff --git a/src/crawl.py b/src/crawl.py index bc6470d..1480b4e 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -1,104 +1,211 @@ #!/usr/bin/python3 + import argparse import requests -import hashlib from urllib.parse import urlparse, urljoin +import urllib.robotparser import os from time import sleep from bs4 import BeautifulSoup from sqlalchemy import create_engine from config import DATABASE_URI -from models import Base, Website +from models import Base, Documents from sqlalchemy.orm import sessionmaker -from sqlalchemy import create_engine import datetime +import yt_dlp as youtube_dl # TODO- Handle gemini/gopher links -# TODO- Keep a list of traversed links and check before traversing again engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) -def get_html(url: str) -> str: +excluded_domains = ['amazon.', 'news.ycombinator.', + 'facebook.com', 'amzn', 'fb.com'] +excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db", + ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"] + + +def get_html(url: str) -> str: response = requests.get(url) return response.content -def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool: +def parse_youtube(video_url: str) -> bool: + return + # Language preference for subtitles (set to None for auto-generated) + # Change this to 'en' for English subtitles, or None for auto-generated + subtitle_language = 'en' + # Options for youtube_dl + ydl_opts = { + 'writesubtitles': True, + 'allsubtitles': True, + 'skip_download': True, # We only want to fetch metadata + 'subtitleslangs': [subtitle_language] if subtitle_language else None, + 'extractor-args': {'youtube': {'player_client': 'ios,web'}}, + } + + # Initialize youtube_dl object + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + # Download metadata + info_dict = ydl.extract_info(video_url, download=False) + + # Extract subtitles + subtitles = info_dict.get('subtitles') + subtitles_text = "" + # Print available subtitles + if subtitles: + for subs in subtitles.values(): + for sub in subs: + subtitle_url = sub['url'] + with youtube_dl.YoutubeDL({}) as ydl: + subtitle_info = ydl.extract_info( + subtitle_url, download=False) + for subtitle in 
subtitle_info['subtitles'][subtitle_language]: + if subtitle["ext"] == "srv1": + soup = BeautifulSoup( + get_html(subtitle["url"]), 'html.parser') + subtitles_text = soup.get_text() + + s = Session() + existing_website = s.query( + Documents).filter_by(url=video_url).first() + if existing_website is None: + website = Documents( + url=video_url, + text_content=subtitles_text, + html_content=None, # soup.prettify(), + first_crawl_date=datetime.datetime.now(), + last_crawl_date=datetime.datetime.now(), + last_index_date=None + ) + s.add(website) + else: + existing_website.last_crawl_date = datetime.datetime.now() + s.add(existing_website) + s.commit() + s.close() + + +def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool: + for domain in excluded_domains: + if domain in url: + return + if any(ext in url for ext in excluded_filetypes): + return + if "youtube.com" in url: + parse_youtube(url) + return + rp = urllib.robotparser.RobotFileParser() print(url) print(recursion) urlparts = urlparse(url) baseurl = urlparts.scheme + "://" + urlparts.netloc - soup = BeautifulSoup(html,'html.parser') - hash = hashlib.sha256() - hash.update(url.encode('ascii')) + if baseurl not in robots: + rp.set_url(baseurl + "/robots.txt") + rp.read() + robots[baseurl] = rp + else: + rp = robots[baseurl] + if not rp.can_fetch("*", url): + print("Robots prevents crawling url: " + url) + return + + soup = BeautifulSoup(html, 'html.parser') s = Session() - existing_website = s.query(Website).filter_by(url=url).first() - print (existing_website) - if existing_website == None: - website = Website( - url=url, - text_content=soup.get_text(), - html_content=soup.prettify(), - first_crawl_date=datetime.datetime.now(), - last_crawl_date = datetime.datetime.now() - ) + existing_website = s.query(Documents).filter_by(url=url).first() + if existing_website is None: + website = Documents( + url=url, + text_content=soup.get_text(), + html_content=soup.prettify(), + first_crawl_date=datetime.datetime.now(), + last_crawl_date=datetime.datetime.now(), + last_index_date=None + ) s.add(website) else: existing_website.last_crawl_date = datetime.datetime.now() s.add(existing_website) s.commit() s.close() - x = open(f'data/links.txt', 'a') - x.close() - links = soup.find_all("a") + links = soup.find_all("a", href=True) for link in links: found = False link = link["href"] if (len(link) > 0 and link[0] == "#") or "localhost" in link: continue - if not "http" in link: + if any(ext in link for ext in excluded_filetypes): + continue + if "http" not in link: link = urljoin(url, link) + link = link.split('?')[0] + link = link.split('#')[0] if (recursion > 0 and link not in traversed_links): try: traversed_links.append(link) link_html = get_html(link) - r = recursion -1 - sleep(1) + r = recursion - 1 + sleep(0.5) parse_html(link, link_html, r, traversed_links) except: pass -# else: -# with open(f'data/links.txt', 'r+') as linksfile: +# elif link not in traversed_links: +# with open('data/links.txt', 'r+') as linksfile: # while line := linksfile.readline(): # if line.strip() == link.strip(): # found = True # if not found: # linksfile.write(f'{link}\n') -if __name__ == "__main__": +def parse_site_map(base_url): + map = BeautifulSoup(requests.get(base_url).content, 'xml') + print(map.find_all('loc')) + for loc in map.find_all('loc'): + if "xml" in loc.contents[0]: + parse_site_map(loc.contents[0]) + else: + url = loc.contents[0] + html = get_html(url) + parse_html(url, html, max_recursion) + + +if __name__ == 
"__main__": os.makedirs("data/content", exist_ok=True) # check inputs parser = argparse.ArgumentParser() parser.add_argument("url", help="URL of the webpage to be crawled") parser.add_argument('-f', "--followlinks", action="store_true") - max_recursion = 4 + parser.add_argument('-s', "--crawl-sitemap", action="store_true") + parser.add_argument('-r', "--max-recursion", help="", type=int, default=1) + args = parser.parse_args() - html = get_html(args.url) - parse_html(args.url, html, max_recursion) + max_recursion = int(args.max_recursion) + if args.url == "links": + with open('data/links.txt', 'r+') as linksfile: + while line := linksfile.readline(): + if "http" in line: + try: + parse_html(line, get_html(line)) + except: + pass + elif args.crawl_sitemap: + rp = urllib.robotparser.RobotFileParser() + urlparts = urlparse(args.url) + baseurl = urlparts.scheme + "://" + urlparts.netloc + rp.set_url(baseurl + "/robots.txt") + rp.read() + if not rp.can_fetch("*", args.url): + print("Robots prevents crawling url: " + args.url) + exit(0) + if len(rp.site_maps()) > 0: + parse_site_map(rp.site_maps()[0]) + else: + html = get_html(args.url) + parse_html(args.url, html, max_recursion) # recursion = 0 # if (args.followlinks): -# with open(f'data/links.txt', 'r+') as linksfile: -# while line := linksfile.readline(): -# if recursion < max_recursion: -# if "http" in line: -# recursion += 1 -# try: -# parse_html(line, get_html(line)) -# except: -# pass - os.remove('data/links.txt') +# os.remove('data/links.txt') diff --git a/src/index.py b/src/index.py index e04c787..679d312 100644 --- a/src/index.py +++ b/src/index.py @@ -1,54 +1,154 @@ -from sqlalchemy import create_engine -from config import DATABASE_URI -from models import Base, Website -from pathlib import Path -import argparse -import os -import json -# investigate ngrams for "multi word" matching -ignored_words = ['a', 'the','is'] +#!/usr/bin/python3 -def remove_punctuation(input_string): - punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!''' - for p in punc: - input_string = input_string.replace(p, '') - return input_string +import argparse +from sqlalchemy import create_engine, or_, text +from sqlalchemy import Table, Column, String, Integer +from config import DATABASE_URI +from sqlalchemy.dialects.postgresql import UUID +from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams +from sqlalchemy.orm import sessionmaker +from sqlalchemy.exc import SQLAlchemyError +import uuid +import datetime +import time +import re +import random +from multiprocessing import Pool + +engine = create_engine(DATABASE_URI) +Base.metadata.create_all(engine) +Session = sessionmaker(bind=engine) +# https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html + + +def contains_latin(text): + latin_pattern = r'[a-zA-ZÀ-ÖØ-öø-ÿ]' + return bool(re.search(latin_pattern, text)) + + +def build_index_chunk(document_chunk): + session = Session() + print(len(document_chunk)) + start_time = time.time_ns() + for document in document_chunk: + print(document.url) + content = re.sub(r'[.,?!]', ' ', str(document.text_content)) + content = re.sub(r'[^\w\s]', '', str(content)) + content_words = content.split() + build_ngrams(1, content_words, document.id) + build_ngrams(2, content_words, document.id) + build_ngrams(3, content_words, document.id) + build_ngrams(4, content_words, document.id) + build_ngrams(5, content_words, document.id) + + document.last_index_date = datetime.datetime.now() + session.merge(document) + session.commit() + session.close() def build_index(): - 
with open(f"data/index.json", "w") as index: - # get a list of all content files - # split on whitespace and add to index - dictionary = {} - pathlist = Path('data/content').rglob('*.txt') - for path in pathlist: - with open(str(path)) as content_file: - url = content_file.readline() - content = content_file.read() - content_words = content.split() - for word in content_words: - word = word.lower() - word = remove_punctuation(word) - if not word in ignored_words: - if not word in dictionary: - dictionary[word] = [] - matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word])) - if len(matching_urls) == 0: -# if not url.strip() in dictionary[word]: - entries = dictionary[word] - entry = {"url": url.strip(), "count": 1, "filename": str(path)} - dictionary[word].append(entry) - else: - entries = dictionary[word] - entry = matching_urls[0] - entry["count"] += 1 - entries.sort(reverse=True, key=lambda entry: entry["count"]) - index.write(json.dumps(dictionary)) + while True: + session = Session() + documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_( + None), Documents.last_index_date < Documents.last_crawl_date)).limit(100) + session.close() + + # Execute the query to get the result set + documents = list(documents_query) + if len(documents) == 0: + return + build_index_chunk(documents) + continue + chunk_size = 10 + document_chunks = [documents[i:i+chunk_size] + for i in range(0, len(documents), chunk_size)] + with Pool() as pool: + pool.map(build_index_chunk, document_chunks) + + +def zip_ngrams(size: int, corpus, document_id): + size = int(size) + connection = engine.connect() + temptbl_name = 'temp_del_{}'.format( + time.time_ns() + random.randint(100000, 9999999)) + temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column( + 'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True) + + try: + # Start transaction + with connection.begin(): + temptbl.create(engine) + insert_grams = [] + grams = zip(*[corpus[i:] for i in range(size)]) + for gram in grams: + gram = ' '.join(gram).lower() + insert_grams.append( + {"id": uuid.uuid4(), "gram": gram, "size": size}) + connection.execute(temptbl.insert().values(insert_grams)) + connection.execute(text("UPDATE " + temptbl_name + + " SET id = ngrams.id FROM ngrams WHERE ngrams.gram = " + + temptbl_name + ".gram;")) + connection.execute(text("INSERT INTO ngrams (id, gram, size) SELECT " + + " distinct t.id, t.gram as gram, t.size FROM " + + temptbl_name + " t LEFT JOIN ngrams on ngrams.gram = " + + "t.gram WHERE ngrams.id is null and t.size is not null " + " ON CONFLICT DO NOTHING;")) + connection.execute(text("INSERT INTO document_ngrams(id, document_id, ngram_id) SELECT DISTINCT " + + "uuid_generate_v4() , '" + str(document_id) + "'::UUID, t.id FROM " + temptbl_name + " t;")) + except SQLAlchemyError as e: + # Handle exceptions + print("An error occurred:", e) + # Rollback transaction + connection.rollback() + else: + # Commit transaction if no exceptions occurred + connection.commit() + finally: + connection.close() + # Drop table outside the transaction block + temptbl.drop(engine) + + +def build_ngrams(size: int, corpus: str, document_id: str): + session = Session() + zip_ngrams(size, corpus, document_id) + return + i = 0 + grams = [] + while i < len(corpus): + if i + size >= len(corpus): + i = len(corpus) + gram = '' + for n in range(0, size): + if i + n >= len(corpus): + break + gram += corpus[i+n] + ' ' + gram = 
gram.strip().lower() + if len(gram) > 1000 or gram in grams or not contains_latin(gram): + i += 1 + continue + grams.append(gram) + if (len(gram) > 1): + ngram = session.query(NGrams).filter_by( + gram=gram).filter_by(size=size).first() + if ngram is None: + ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram) + session.add(ngram) + document_ngram = Document_NGrams( + document_id=document_id, ngram_id=ngram.id) + session.add(document_ngram) + session.commit() + i += 1 +# print(str((time.time_ns() - start_time)//1_000_000)) + session.close() + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild") + parser.add_argument('-r', + "--rebuild", + action="store_true", + help="Blow away the index and rebuild") args = parser.parse_args() if args.rebuild: build_index() - diff --git a/src/index.py.old b/src/index.py.old new file mode 100644 index 0000000..6ec8e21 --- /dev/null +++ b/src/index.py.old @@ -0,0 +1,54 @@ +from sqlalchemy import create_engine +from config import DATABASE_URI +from models import Base, Website +from pathlib import Path +import argparse +import os +import json +# investigate ngrams for "multi word" matching +ignored_words = ['a', 'the','is'] + +def remove_punctuation(input_string): + punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!''' + for p in punc: + input_string = input_string.replace(p, '') + return input_string + + +def build_index(): + with open("data/index.json", "w") as index: + # get a list of all content files + # split on whitespace and add to index + dictionary = {} + pathlist = Path('data/content').rglob('*.txt') + for path in pathlist: + with open(str(path)) as content_file: + url = content_file.readline() + content = content_file.read() + content_words = content.split() + for word in content_words: + word = word.lower() + word = remove_punctuation(word) + if word not in ignored_words: + if word not in dictionary: + dictionary[word] = [] + matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word])) + if len(matching_urls) == 0: +# if not url.strip() in dictionary[word]: + entries = dictionary[word] + entry = {"url": url.strip(), "count": 1, "filename": str(path)} + dictionary[word].append(entry) + else: + entries = dictionary[word] + entry = matching_urls[0] + entry["count"] += 1 + entries.sort(reverse=True, key=lambda entry: entry["count"]) + index.write(json.dumps(dictionary)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild") + args = parser.parse_args() + if args.rebuild: + build_index() + diff --git a/src/models.py b/src/models.py index ee768d4..50010b6 100644 --- a/src/models.py +++ b/src/models.py @@ -1,18 +1,72 @@ from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, DateTime +from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer from sqlalchemy.dialects.postgresql import UUID +from sqlalchemy.orm import relationship, mapped_column import uuid Base = declarative_base() -class Website(Base): - __tablename__ = 'websites' - id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4) +class Documents(Base): + __tablename__ = 'documents' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) url = Column(String) text_content = Column(String) html_content = Column(String) first_crawl_date = Column(DateTime) 
last_crawl_date = Column(DateTime) + last_index_date = Column(DateTime) + document_tokens = relationship( + "Document_Tokens", back_populates="document") + document_ngrams = relationship( + "Document_NGrams", back_populates="document") +class Document_Tokens(Base): + __tablename__ = 'document_tokens' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document_id = mapped_column(ForeignKey("documents.id")) + # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + token_id = mapped_column(ForeignKey("tokens.id")) + # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document = relationship( + "Documents", back_populates="document_tokens", uselist=False) + token = relationship("Tokens", back_populates="document_tokens") + __table_args__ = ( + Index('idx_document_tokens_document_id_token_id', 'document_id', + 'token_id', unique=True, postgresql_using='hash'), + Index('idx_document_tokens_clustered', 'document_id', + 'token_id', postgresql_using='hash'), + ) + + +class Tokens(Base): + __tablename__ = 'tokens' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + token = Column(String, index=True) + document_tokens = relationship("Document_Tokens", back_populates="token") + + +class NGrams(Base): + __tablename__ = 'ngrams' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + size = Column(Integer, index=True) + gram = Column(String, index=True) + document_ngrams = relationship("Document_NGrams", back_populates="ngram") + + +class Document_NGrams(Base): + __tablename__ = 'document_ngrams' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document_id = mapped_column(ForeignKey("documents.id")) + ngram_id = mapped_column(ForeignKey("ngrams.id")) + document = relationship( + "Documents", back_populates="document_ngrams", uselist=False) + ngram = relationship("NGrams", back_populates="document_ngrams") + + __table_args__ = ( + Index('idx_document_ngrams_document_id_ngram_id', 'document_id', + 'ngram_id', unique=True, postgresql_using='hash'), + Index('idx_document_ngrams_clustered', 'document_id', + 'ngram_id', postgresql_using='hash'), + ) diff --git a/src/search.py b/src/search.py index 17668f9..fd013bc 100755 --- a/src/search.py +++ b/src/search.py @@ -1,30 +1,146 @@ -#!/bin/bash - +#!/usr/bin/python3 +from sqlalchemy import create_engine, func, and_, or_, not_ +from config import DATABASE_URI +from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams +from sqlalchemy.orm import sessionmaker +from sqlalchemy.sql.expression import distinct +import time from flask import Flask -from flask import Request -import json +from flask_cors import CORS +from flask import send_from_directory from urllib.parse import unquote -app = Flask(__name__) -## Todo - Boolean search (AND/OR/NOT/"") -@app.route("/search/") +app = Flask(__name__, static_url_path='/static/') +CORS(app) +engine = create_engine(DATABASE_URI) +Base.metadata.create_all(engine) +Session = sessionmaker(bind=engine) +# Todo - Boolean search (AND/OR/NOT/"") + + +def split_query(query): + query = query.lower() + result = {'ands': [], 'ors': [], 'words': [], + 'ngrams': [], 'exclusions': []} + query_words = query.split() + i = 0 + while i < len(query_words): + if i + 1 < len(query_words): + if query_words[i + 1].lower() == "and": + if i + 2 < len(query_words): + result['ands'].append( + query_words[i] + ',' + query_words[i+2]) + i = i + 3 + continue + if query_words[i][0] == '"': + n = 0 + quoted_query = 
"" + while i+n < len(query_words): + quoted_query += query_words[i+n] + ' ' + if query_words[i+n][len(query_words[i+n])-1] == '"': + break + n += 1 + result['ngrams'].append( + quoted_query[1:len(quoted_query)-2].rstrip()) + i += n + 1 + continue + elif query_words[i][0] == "-": + excluded_query = query_words[i][1: len(query_words[i])] + result['exclusions'].append(excluded_query) + i += 1 + continue + result['ngrams'].append(query_words[i]) + i += 1 + return result + + +@ app.route("/search/") def search(query): - with open('data/index.json', 'r') as index_json: - index = json.load(index_json) - query = unquote(query) - query_split = query.split() - result = [] - for q in query_split: - q = q.lower() - if q in index: - for item in index[q]: - matching_results = list(filter(lambda entry: entry['url'] == item["url"], result)) - if len(matching_results) == 0: - result.append(item) - else: - matching_results[0]["count"] += item["count"] - return result + start_time = time.time_ns() + session = Session() + results = {} + query_words = split_query(unquote(query)) + print(query_words) + if len(query_words['ands']) > 0: + print('entering ands: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + for a in query_words['ands']: + query = session.query(Documents.url, func.count(1)). \ + join(Document_NGrams, Documents.id == Document_NGrams.document_id). \ + join(NGrams, Document_NGrams.ngram_id == NGrams.id). \ + filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\ + group_by(Documents.url). \ + having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \ + order_by(func.count(1).desc()) -def handle_and(): - pass +# limit(100) + print(query) + for result in query.all(): + if result[0] in results.keys(): + results[result[0]] += result[1] + else: + results[result[0]] = result[1] + print('exiting ands: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + if len(query_words['ngrams']) > 0: + print('entering ngrams: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + q = session.query(Documents.url, func.count(1)) \ + .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \ + .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \ + .group_by(Documents.url) + conditions = [] + for ngram in query_words['ngrams']: + conditions.append( + (NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram)) +# q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram) + and_conditions = [and_(*condition_pair) + for condition_pair in conditions] + q = q.filter(or_(*and_conditions)) + print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") + print(q) + x = q.limit(100).all() + print('query executed: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + print(x) + for result in x: + if result[0] in results.keys(): + results[result[0]] += result[1] + else: + results[result[0]] = result[1] +# for y in x: +# print(y) +# for document_ngram in y.document_ngrams: +# if document_ngram.document.url in results.keys(): +# results[document_ngram.document.url] += 1 +# else: +# results[document_ngram.document.url] = 1 + print('exiting ngrams: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + + print(str((time.time_ns() - start_time) // 1_000_000) + "ms") + session.close() + return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())] + + +# @app.route("/search/") +# def search(query): +# start_time = time.time_ns() +# session = Session() +# result = {} +# query_words = unquote(query).split() +# 
x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
+#     for word in query_words:
+#         word = word.lower()
+#         matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
+#
+#         if matching_ngram is None:
+#             continue
+#         for document_ngram in matching_ngram.document_ngrams:
+#             if document_ngram.document.url in result.keys():
+#                 result[document_ngram.document.url] += 1
+#             else:
+#                 result[document_ngram.document.url] = 1
+#     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+#     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
diff --git a/todo b/todo
new file mode 100644
index 0000000..ddda3bd
--- /dev/null
+++ b/todo
@@ -0,0 +1,11 @@
+[x] Refactor website table to generic document table (maybe using URN instead of URL?)
+[x] Define tokens table FKed to document table
+[x] Refactor index.py to tokenize input into tokens table
+[x] Define N-Grams table
+[x] Add N-Gram generation to index.py
+[x] Add clustered index to document_ngrams table model
+[x] Add clustered index to document_tokens table model
+[ ] Add ddl command to create partition tables
+[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
+[x] Instead of starting from a random page on the site, go to root and find site map and crawl that
+
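A note on the robots.txt handling added to parse_html in src/crawl.py: the crawler keeps one RobotFileParser per scheme+host so each site's robots.txt is fetched only once per run (the diff stores the cache in the mutable default argument robots={}, which is shared across the recursive calls). A minimal standalone sketch of that caching pattern, using only the standard library; the function and variable names here are illustrative, not taken from the diff:

import urllib.robotparser
from urllib.parse import urlparse

robots_cache = {}

def allowed(url, agent="*"):
    parts = urlparse(url)
    base = parts.scheme + "://" + parts.netloc
    rp = robots_cache.get(base)
    if rp is None:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(base + "/robots.txt")
        rp.read()  # fetch and parse robots.txt once per host
        robots_cache[base] = rp
    return rp.can_fetch(agent, url)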
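In src/index.py, build_index pulls up to 100 documents whose last_index_date is missing or older than last_crawl_date, and the continue statement currently makes the chunking/multiprocessing branch below it unreachable. The pattern that branch sketches is: split the pending documents into fixed-size chunks and map them across worker processes. A small self-contained sketch of that pattern, with a placeholder worker standing in for build_index_chunk:

from multiprocessing import Pool

def process_chunk(chunk):
    # placeholder for per-chunk indexing work
    return [len(doc.split()) for doc in chunk]

if __name__ == "__main__":
    documents = ["first document text", "second one", "third", "fourth document"]
    chunk_size = 2
    chunks = [documents[i:i + chunk_size]
              for i in range(0, len(documents), chunk_size)]
    with Pool() as pool:
        counts = pool.map(process_chunk, chunks)
    print(counts)  # [[3, 2], [1, 2]]

If the Pool branch is ever re-enabled, each worker process should open its own database connections; SQLAlchemy engines and sessions are not meant to be shared across a fork.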
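The core of the n-gram generation in zip_ngrams (src/index.py) is zip(*[corpus[i:] for i in range(size)]), which walks every window of `size` consecutive words. A minimal sketch of what that produces (names here are illustrative):

import re

def word_ngrams(words, size):
    # zipping staggered copies of the word list yields each consecutive window
    return [" ".join(gram).lower() for gram in zip(*[words[i:] for i in range(size)])]

text = "The quick brown fox jumps"
words = re.sub(r"[.,?!]", " ", text).split()
print(word_ngrams(words, 2))
# ['the quick', 'quick brown', 'brown fox', 'fox jumps']
print(word_ngrams(words, 3))
# ['the quick brown', 'quick brown fox', 'brown fox jumps']

build_index_chunk runs this for sizes 1 through 5, so exact phrase queries of up to five words can be matched against stored grams.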
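The `ands` branch of the /search endpoint (src/search.py) answers a query like "foo AND bar" by joining documents to document_ngrams to ngrams, restricting to the two grams, grouping by URL, and keeping only groups whose distinct ngram count is 2. Roughly the SQL that ORM query corresponds to, runnable through the same engine; this is a hand-written approximation using the table and column names from src/models.py, not the exact statement SQLAlchemy emits:

from sqlalchemy import create_engine, text
from config import DATABASE_URI

AND_QUERY = text("""
    SELECT d.url, COUNT(1) AS hits
    FROM documents d
    JOIN document_ngrams dn ON dn.document_id = d.id
    JOIN ngrams n ON n.id = dn.ngram_id
    WHERE n.gram IN (:gram_a, :gram_b)
    GROUP BY d.url
    HAVING COUNT(DISTINCT dn.ngram_id) = 2   -- both grams must be present
    ORDER BY COUNT(1) DESC
""")

engine = create_engine(DATABASE_URI)
with engine.connect() as conn:
    for url, hits in conn.execute(AND_QUERY, {"gram_a": "foo", "gram_b": "bar"}):
        print(url, hits)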
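For reference, split_query (src/search.py) buckets the raw query string before the endpoint builds its ORM queries: "X and Y" pairs go to 'ands', quoted phrases and bare words go to 'ngrams', and -word terms go to 'exclusions' (only 'ands' and 'ngrams' are consumed further down). A usage sketch, assuming src/ is on PYTHONPATH and the database in config.py is reachable, since search.py creates tables at import time; the expected output was traced by hand from the parsing logic in the diff:

from search import split_query

print(split_query('cats and dogs -ads "free software"'))
# {'ands': ['cats,dogs'], 'ors': [], 'words': [],
#  'ngrams': ['free software'], 'exclusions': ['ads']}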
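For the still-open todo item "Add ddl command to create partition tables", one possible shape is PostgreSQL declarative hash partitioning on document_ngrams.document_id. This is only a sketch under that assumption: the table name document_ngrams_partitioned and the partition count are made up, and note that the primary key must include the partition key, which differs from the single-column id key in models.py:

from sqlalchemy import create_engine, text
from config import DATABASE_URI

ddl = [
    """
    CREATE TABLE document_ngrams_partitioned (
        id UUID NOT NULL,
        document_id UUID NOT NULL,
        ngram_id UUID NOT NULL,
        PRIMARY KEY (document_id, id)  -- partition key has to be part of the PK
    ) PARTITION BY HASH (document_id)
    """,
]
# one child table per hash bucket
ddl += [
    f"CREATE TABLE document_ngrams_p{r} PARTITION OF document_ngrams_partitioned "
    f"FOR VALUES WITH (MODULUS 4, REMAINDER {r})"
    for r in range(4)
]

engine = create_engine(DATABASE_URI)
with engine.begin() as conn:
    for statement in ddl:
        conn.execute(text(statement))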