diff --git a/client/src/css/styles.css b/client/src/css/styles.css
deleted file mode 100644
index 37323ab..0000000
--- a/client/src/css/styles.css
+++ /dev/null
@@ -1,33 +0,0 @@
-html, body {
-    height: 100%;
-}
-body {
-    margin: 0;
-}
-input {
-    padding: 7px;
-    font-size: 1.1rem;
-}
-.search-container {
-    display: flex;
-    justify-content: center;
-    align-items: center;
-    text-align: center;
-    min-height: 25vh;
-}
-
-.flex-container {
-    padding: 0;
-    margin: 0;
-    display: flex;
-    align-items: center;
-    justify-content: center;
-    flex-direction: column;
-}
-.flex-item {
-}
-.result {
-    display:block;
-    max-width: 60vw;
-    overflow-x: hidden;
-}
diff --git a/client/src/index.html b/client/src/index.html
deleted file mode 100644
index a748d6c..0000000
--- a/client/src/index.html
+++ /dev/null
@@ -1,16 +0,0 @@
-[16 deleted lines of HTML markup; the tags were stripped from this capture and are not recoverable]
diff --git a/client/src/js/index.js b/client/src/js/index.js
deleted file mode 100644
index 09b0bb2..0000000
--- a/client/src/js/index.js
+++ /dev/null
@@ -1,28 +0,0 @@
-function debounce(func, timeout = 300){
-    let timer;
-    return (...args) => {
-        clearTimeout(timer);
-        timer = setTimeout(() => { func.apply(this, args); }, timeout);
-    };
-}
-async function search(searchBox){
-    const response = await fetch(`http://localhost:5000/search/${searchBox.value}`);
-    const results = await response.json();
-
-    const resultView = document.getElementById("results");
-    resultView.replaceChildren();
-    for (let i = 0; i < results.length; i++){
-        let result = results[i];
-        let resultElement = document.createElement("a");
-        resultElement.innerText = result[0];
-        resultElement.href = result[0];
-        resultElement.className = "flex-item result";
-        resultView.appendChild(resultElement);
-    }
-}
-
-const searchBoxKeyUp = debounce(() => search())
-
-const searchBox = document.getElementById("searchbox");
-
-searchBox.addEventListener("keyup", debounce(() => search(searchBox)))
diff --git a/src/__pycache__/search.cpython-310.pyc b/src/__pycache__/search.cpython-310.pyc
index f3e8621..c740282 100644
Binary files a/src/__pycache__/search.cpython-310.pyc and b/src/__pycache__/search.cpython-310.pyc differ
diff --git a/src/crawl.py b/src/crawl.py
index 1480b4e..bc6470d 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -1,211 +1,104 @@
 #!/usr/bin/python3
-
 import argparse
 import requests
+import hashlib
 from urllib.parse import urlparse, urljoin
-import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Documents
+from models import Base, Website
 from sqlalchemy.orm import sessionmaker
+from sqlalchemy import create_engine
 import datetime
-import yt_dlp as youtube_dl
 # TODO- Handle gemini/gopher links
+# TODO- Keep a list of traversed links and check before traversing again
 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
-excluded_domains = ['amazon.', 'news.ycombinator.',
-                    'facebook.com', 'amzn', 'fb.com']
-
-excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
-                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
-
-
 def get_html(url: str) -> str:
+
     response = requests.get(url)
     return response.content
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
-def parse_youtube(video_url: str) -> bool:
-    return
-    # Language preference for subtitles (set to None for auto-generated)
-    # Change this to 'en' for English subtitles, or None for auto-generated
-    subtitle_language = 'en'
-    # Options for youtube_dl
-    ydl_opts = {
-        'writesubtitles': True,
-        'allsubtitles': True,
-        'skip_download': True,  # We only want to fetch metadata
-        'subtitleslangs': [subtitle_language] if subtitle_language else None,
-        'extractor-args': {'youtube': {'player_client': 'ios,web'}},
-    }
-
-    # Initialize youtube_dl object
-    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
-        # Download metadata
-        info_dict = ydl.extract_info(video_url, download=False)
-
-        # Extract subtitles
-        subtitles = info_dict.get('subtitles')
-        subtitles_text = ""
-        # Print available subtitles
-        if subtitles:
-            for subs in subtitles.values():
-                for sub in subs:
-                    subtitle_url = sub['url']
-                    with youtube_dl.YoutubeDL({}) as ydl:
-                        subtitle_info = ydl.extract_info(
-                            subtitle_url, download=False)
-                        for subtitle in subtitle_info['subtitles'][subtitle_language]:
-                            if subtitle["ext"] == "srv1":
-                                soup = BeautifulSoup(
-                                    get_html(subtitle["url"]), 'html.parser')
-                                subtitles_text = soup.get_text()
-
-    s = Session()
-    existing_website = s.query(
-        Documents).filter_by(url=video_url).first()
-    if existing_website is None:
-        website = Documents(
-            url=video_url,
-            text_content=subtitles_text,
-            html_content=None,  # soup.prettify(),
-            first_crawl_date=datetime.datetime.now(),
-            last_crawl_date=datetime.datetime.now(),
-            last_index_date=None
-        )
-        s.add(website)
-    else:
-        existing_website.last_crawl_date = datetime.datetime.now()
-        s.add(existing_website)
-    s.commit()
-    s.close()
-
-
-def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
-    for domain in excluded_domains:
-        if domain in url:
-            return
-    if any(ext in url for ext in excluded_filetypes):
-        return
-    if "youtube.com" in url:
-        parse_youtube(url)
-        return
-    rp = urllib.robotparser.RobotFileParser()
     print(url)
     print(recursion)
     urlparts = urlparse(url)
     baseurl = urlparts.scheme + "://" + urlparts.netloc
-    if baseurl not in robots:
-        rp.set_url(baseurl + "/robots.txt")
-        rp.read()
-        robots[baseurl] = rp
-    else:
-        rp = robots[baseurl]
-    if not rp.can_fetch("*", url):
-        print("Robots prevents crawling url: " + url)
-        return
-
-    soup = BeautifulSoup(html, 'html.parser')
+    soup = BeautifulSoup(html,'html.parser')
+    hash = hashlib.sha256()
+    hash.update(url.encode('ascii'))
     s = Session()
-    existing_website = s.query(Documents).filter_by(url=url).first()
-    if existing_website is None:
-        website = Documents(
-            url=url,
-            text_content=soup.get_text(),
-            html_content=soup.prettify(),
-            first_crawl_date=datetime.datetime.now(),
-            last_crawl_date=datetime.datetime.now(),
-            last_index_date=None
-        )
+    existing_website = s.query(Website).filter_by(url=url).first()
+    print (existing_website)
+    if existing_website == None:
+        website = Website(
+                url=url,
+                text_content=soup.get_text(),
+                html_content=soup.prettify(),
+                first_crawl_date=datetime.datetime.now(),
+                last_crawl_date = datetime.datetime.now()
+                )
         s.add(website)
     else:
         existing_website.last_crawl_date = datetime.datetime.now()
         s.add(existing_website)
     s.commit()
     s.close()
-    links = soup.find_all("a", href=True)
+    x = open(f'data/links.txt', 'a')
+    x.close()
+    links = soup.find_all("a")
     for link in links:
         found = False
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if any(ext in link for ext in excluded_filetypes):
-            continue
-        if "http" not in link:
+        if not "http" in link:
            link = urljoin(url, link)
-        link = link.split('?')[0]
-        link = link.split('#')[0]
         if (recursion > 0 and link not in traversed_links):
             try:
                 traversed_links.append(link)
                 link_html = get_html(link)
-                r = recursion - 1
-                sleep(0.5)
+                r = recursion -1
+                sleep(1)
                 parse_html(link, link_html, r, traversed_links)
             except:
                 pass
-#        elif link not in traversed_links:
-#            with open('data/links.txt', 'r+') as linksfile:
+#        else:
+#            with open(f'data/links.txt', 'r+') as linksfile:
 #                while line := linksfile.readline():
 #                    if line.strip() == link.strip():
 #                        found = True
 #                if not found:
 #                    linksfile.write(f'{link}\n')
-
-def parse_site_map(base_url):
-    map = BeautifulSoup(requests.get(base_url).content, 'xml')
-    print(map.find_all('loc'))
-    for loc in map.find_all('loc'):
-        if "xml" in loc.contents[0]:
-            parse_site_map(loc.contents[0])
-        else:
-            url = loc.contents[0]
-            html = get_html(url)
-            parse_html(url, html, max_recursion)
-
-
 if __name__ == "__main__":
+    os.makedirs("data/content", exist_ok=True)
     # check inputs
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-    parser.add_argument('-s', "--crawl-sitemap", action="store_true")
-    parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)
-
+    max_recursion = 4
     args = parser.parse_args()
-    max_recursion = int(args.max_recursion)
-    if args.url == "links":
-        with open('data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if "http" in line:
-                    try:
-                        parse_html(line, get_html(line))
-                    except:
-                        pass
-    elif args.crawl_sitemap:
-        rp = urllib.robotparser.RobotFileParser()
-        urlparts = urlparse(args.url)
-        baseurl = urlparts.scheme + "://" + urlparts.netloc
-        rp.set_url(baseurl + "/robots.txt")
-        rp.read()
-        if not rp.can_fetch("*", args.url):
-            print("Robots prevents crawling url: " + args.url)
-            exit(0)
-        if len(rp.site_maps()) > 0:
-            parse_site_map(rp.site_maps()[0])
-    else:
-        html = get_html(args.url)
-        parse_html(args.url, html, max_recursion)
+    html = get_html(args.url)
+    parse_html(args.url, html, max_recursion)
 #    recursion = 0
 #    if (args.followlinks):
-#        os.remove('data/links.txt')
+#        with open(f'data/links.txt', 'r+') as linksfile:
+#            while line := linksfile.readline():
+#                if recursion < max_recursion:
+#                    if "http" in line:
+#                        recursion += 1
+#                        try:
+#                            parse_html(line, get_html(line))
+#                        except:
+#                            pass
+    os.remove('data/links.txt')
diff --git a/src/index.py b/src/index.py
index 679d312..e04c787 100644
--- a/src/index.py
+++ b/src/index.py
@@ -1,154 +1,54 @@
-#!/usr/bin/python3
-
-import argparse
-from sqlalchemy import create_engine, or_, text
-from sqlalchemy import Table, Column, String, Integer
 from config import DATABASE_URI
-from sqlalchemy.dialects.postgresql import UUID
-from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy.exc import SQLAlchemyError
-import uuid
-import datetime
-import time
-import re
-import random
-from multiprocessing import Pool
+from models import Base, Website
+from pathlib import Path
+import argparse
+import os
+import json
 # investigate ngrams for "multi word" matching
 ignored_words = ['a', 'the','is']
-engine = create_engine(DATABASE_URI)
-Base.metadata.create_all(engine)
-Session = sessionmaker(bind=engine)
-# https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html
-
-
-def contains_latin(text):
-    latin_pattern = r'[a-zA-ZÀ-ÖØ-öø-ÿ]'
-    return bool(re.search(latin_pattern, text))
-
-
-def build_index_chunk(document_chunk):
-    session = Session()
-    print(len(document_chunk))
-    start_time = time.time_ns()
-    for document in document_chunk:
-        print(document.url)
-        content = re.sub(r'[.,?!]', ' ', str(document.text_content))
-        content = re.sub(r'[^\w\s]', '', str(content))
-        content_words = content.split()
-        build_ngrams(1, content_words, document.id)
-        build_ngrams(2, content_words, document.id)
-        build_ngrams(3, content_words, document.id)
-        build_ngrams(4, content_words, document.id)
-        build_ngrams(5, content_words, document.id)
-
-        document.last_index_date = datetime.datetime.now()
-        session.merge(document)
-        session.commit()
-    session.close()
+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
 def build_index():
-    while True:
-        session = Session()
-        documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
-            None), Documents.last_index_date < Documents.last_crawl_date)).limit(100)
-        session.close()
-
-        # Execute the query to get the result set
-        documents = list(documents_query)
-        if len(documents) == 0:
-            return
-        build_index_chunk(documents)
-        continue
-        chunk_size = 10
-        document_chunks = [documents[i:i+chunk_size]
-                           for i in range(0, len(documents), chunk_size)]
-        with Pool() as pool:
-            pool.map(build_index_chunk, document_chunks)
-
-
-def zip_ngrams(size: int, corpus, document_id):
-    size = int(size)
-    connection = engine.connect()
-    temptbl_name = 'temp_del_{}'.format(
-        time.time_ns() + random.randint(100000, 9999999))
-    temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column(
-        'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True)
-
-    try:
-        # Start transaction
-        with connection.begin():
-            temptbl.create(engine)
-            insert_grams = []
-            grams = zip(*[corpus[i:] for i in range(size)])
-            for gram in grams:
-                gram = ' '.join(gram).lower()
-                insert_grams.append(
-                    {"id": uuid.uuid4(), "gram": gram, "size": size})
-            connection.execute(temptbl.insert().values(insert_grams))
-            connection.execute(text("UPDATE " + temptbl_name +
-                                    " SET id = ngrams.id FROM ngrams WHERE ngrams.gram = "
-                                    + temptbl_name + ".gram;"))
-            connection.execute(text("INSERT INTO ngrams (id, gram, size) SELECT " +
-                                    " distinct t.id, t.gram as gram, t.size FROM " +
-                                    temptbl_name + " t LEFT JOIN ngrams on ngrams.gram = " +
-                                    "t.gram WHERE ngrams.id is null and t.size is not null " + " ON CONFLICT DO NOTHING;"))
-            connection.execute(text("INSERT INTO document_ngrams(id, document_id, ngram_id) SELECT DISTINCT " +
-                                    "uuid_generate_v4() , '" + str(document_id) + "'::UUID, t.id FROM " + temptbl_name + " t;"))
-    except SQLAlchemyError as e:
-        # Handle exceptions
-        print("An error occurred:", e)
-        # Rollback transaction
-        connection.rollback()
-    else:
-        # Commit transaction if no exceptions occurred
-        connection.commit()
-    finally:
-        connection.close()
-        # Drop table outside the transaction block
-        temptbl.drop(engine)
-
-
-def build_ngrams(size: int, corpus: str, document_id: str):
-    session = Session()
-    zip_ngrams(size, corpus, document_id)
-    return
-    i = 0
-    grams = []
-    while i < len(corpus):
-        if i + size >= len(corpus):
-            i = len(corpus)
-        gram = ''
-        for n in range(0, size):
-            if i + n >= len(corpus):
-                break
-            gram += corpus[i+n] + ' '
-        gram = gram.strip().lower()
-        if len(gram) > 1000 or gram in grams or not contains_latin(gram):
-            i += 1
-            continue
-        grams.append(gram)
-        if (len(gram) > 1):
-            ngram = session.query(NGrams).filter_by(
-                gram=gram).filter_by(size=size).first()
-            if ngram is None:
-                ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
-                session.add(ngram)
-            document_ngram = Document_NGrams(
-                document_id=document_id, ngram_id=ngram.id)
-            session.add(document_ngram)
-            session.commit()
-        i += 1
-#    print(str((time.time_ns() - start_time)//1_000_000))
-    session.close()
-
+    with open(f"data/index.json", "w") as index:
+        # get a list of all content files
+        # split on whitespace and add to index
+        dictionary = {}
+        pathlist = Path('data/content').rglob('*.txt')
+        for path in pathlist:
+            with open(str(path)) as content_file:
+                url = content_file.readline()
+                content = content_file.read()
+                content_words = content.split()
+                for word in content_words:
+                    word = word.lower()
+                    word = remove_punctuation(word)
+                    if not word in ignored_words:
+                        if not word in dictionary:
+                            dictionary[word] = []
+                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
+                        if len(matching_urls) == 0:
+#                        if not url.strip() in dictionary[word]:
+                            entries = dictionary[word]
+                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
+                            dictionary[word].append(entry)
+                        else:
+                            entries = dictionary[word]
+                            entry = matching_urls[0]
+                            entry["count"] += 1
+                            entries.sort(reverse=True, key=lambda entry: entry["count"])
+        index.write(json.dumps(dictionary))
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-r',
-                        "--rebuild",
-                        action="store_true",
-                        help="Blow away the index and rebuild")
+    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
     args = parser.parse_args()
     if args.rebuild:
         build_index()
+
diff --git a/src/index.py.old b/src/index.py.old
deleted file mode 100644
index 6ec8e21..0000000
--- a/src/index.py.old
+++ /dev/null
@@ -1,54 +0,0 @@
-from sqlalchemy import create_engine
-from config import DATABASE_URI
-from models import Base, Website
-from pathlib import Path
-import argparse
-import os
-import json
-# investigate ngrams for "multi word" matching
-ignored_words = ['a', 'the','is']
-
-def remove_punctuation(input_string):
-    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
-    for p in punc:
-        input_string = input_string.replace(p, '')
-    return input_string
-
-
-def build_index():
-    with open("data/index.json", "w") as index:
-        # get a list of all content files
-        # split on whitespace and add to index
-        dictionary = {}
-        pathlist = Path('data/content').rglob('*.txt')
-        for path in pathlist:
-            with open(str(path)) as content_file:
-                url = content_file.readline()
-                content = content_file.read()
-                content_words = content.split()
-                for word in content_words:
-                    word = word.lower()
-                    word = remove_punctuation(word)
-                    if word not in ignored_words:
-                        if word not in dictionary:
-                            dictionary[word] = []
-                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
-                        if len(matching_urls) == 0:
-#                        if not url.strip() in dictionary[word]:
-                            entries = dictionary[word]
-                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
-                            dictionary[word].append(entry)
-                        else:
-                            entries = dictionary[word]
-                            entry = matching_urls[0]
-                            entry["count"] += 1
-                            entries.sort(reverse=True, key=lambda entry: entry["count"])
-        index.write(json.dumps(dictionary))
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
-    args = parser.parse_args()
-    if args.rebuild:
-        build_index()
-
diff --git a/src/models.py b/src/models.py
index 50010b6..ee768d4 100644
--- a/src/models.py
+++ b/src/models.py
@@ -1,72 +1,18 @@
 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer
+from sqlalchemy import Column, Integer, String, DateTime
 from sqlalchemy.dialects.postgresql import UUID
-from sqlalchemy.orm import relationship, mapped_column
 import uuid
 Base = declarative_base()
+class Website(Base):
-class Documents(Base):
-    __tablename__ = 'documents'
-    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    __tablename__ = 'websites'
+    id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4)
     url = Column(String)
     text_content = Column(String)
     html_content = Column(String)
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
-    last_index_date = Column(DateTime)
-    document_tokens = relationship(
-        "Document_Tokens", back_populates="document")
-    document_ngrams = relationship(
-        "Document_NGrams", back_populates="document")
-class Document_Tokens(Base):
-    __tablename__ = 'document_tokens'
-    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    document_id = mapped_column(ForeignKey("documents.id"))
-    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    token_id = mapped_column(ForeignKey("tokens.id"))
-    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    document = relationship(
-        "Documents", back_populates="document_tokens", uselist=False)
-    token = relationship("Tokens", back_populates="document_tokens")
-    __table_args__ = (
-        Index('idx_document_tokens_document_id_token_id', 'document_id',
-              'token_id', unique=True, postgresql_using='hash'),
-        Index('idx_document_tokens_clustered', 'document_id',
-              'token_id', postgresql_using='hash'),
-    )
-
-
-class Tokens(Base):
-    __tablename__ = 'tokens'
-    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    token = Column(String, index=True)
-    document_tokens = relationship("Document_Tokens", back_populates="token")
-
-
-class NGrams(Base):
-    __tablename__ = 'ngrams'
-    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    size = Column(Integer, index=True)
-    gram = Column(String, index=True)
-    document_ngrams = relationship("Document_NGrams", back_populates="ngram")
-
-
-class Document_NGrams(Base):
-    __tablename__ = 'document_ngrams'
-    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    document_id = mapped_column(ForeignKey("documents.id"))
-    ngram_id = mapped_column(ForeignKey("ngrams.id"))
-    document = relationship(
-        "Documents", back_populates="document_ngrams", uselist=False)
-    ngram = relationship("NGrams", back_populates="document_ngrams")
-
-    __table_args__ = (
-        Index('idx_document_ngrams_document_id_ngram_id', 'document_id',
-              'ngram_id', unique=True, postgresql_using='hash'),
-        Index('idx_document_ngrams_clustered', 'document_id',
-              'ngram_id', postgresql_using='hash'),
-    )
diff --git a/src/search.py b/src/search.py
index fd013bc..17668f9 100755
--- a/src/search.py
+++ b/src/search.py
@@ -1,146 +1,30 @@
-#!/usr/bin/python3
-from sqlalchemy import create_engine, func, and_, or_, not_
-from config import DATABASE_URI
-from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy.sql.expression import distinct
-import time
+#!/bin/bash
+
 from flask import Flask
-from flask_cors import CORS
-from flask import send_from_directory
+from flask import Request
+import json
 from urllib.parse import unquote
-app = Flask(__name__, static_url_path='/static/')
-CORS(app)
-engine = create_engine(DATABASE_URI)
-Base.metadata.create_all(engine)
-Session = sessionmaker(bind=engine)
-# Todo - Boolean search (AND/OR/NOT/"")
-
-
-def split_query(query):
-    query = query.lower()
-    result = {'ands': [], 'ors': [], 'words': [],
-              'ngrams': [], 'exclusions': []}
-    query_words = query.split()
-    i = 0
-    while i < len(query_words):
-        if i + 1 < len(query_words):
-            if query_words[i + 1].lower() == "and":
-                if i + 2 < len(query_words):
-                    result['ands'].append(
-                        query_words[i] + ',' + query_words[i+2])
-                    i = i + 3
-                    continue
-        if query_words[i][0] == '"':
-            n = 0
-            quoted_query = ""
-            while i+n < len(query_words):
-                quoted_query += query_words[i+n] + ' '
-                if query_words[i+n][len(query_words[i+n])-1] == '"':
-                    break
-                n += 1
-            result['ngrams'].append(
-                quoted_query[1:len(quoted_query)-2].rstrip())
-            i += n + 1
-            continue
-        elif query_words[i][0] == "-":
-            excluded_query = query_words[i][1: len(query_words[i])]
-            result['exclusions'].append(excluded_query)
-            i += 1
-            continue
-        result['ngrams'].append(query_words[i])
-        i += 1
-    return result
-
-
-@ app.route("/search/<query>")
+app = Flask(__name__)
+## Todo - Boolean search (AND/OR/NOT/"")
+@app.route("/search/<query>")
 def search(query):
-    start_time = time.time_ns()
-    session = Session()
-    results = {}
-    query_words = split_query(unquote(query))
-    print(query_words)
-    if len(query_words['ands']) > 0:
-        print('entering ands: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        for a in query_words['ands']:
-            query = session.query(Documents.url, func.count(1)). \
-                join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
-                join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
-                filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
-                group_by(Documents.url). \
-                having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
-                order_by(func.count(1).desc())
+    with open('data/index.json', 'r') as index_json:
+        index = json.load(index_json)
+    query = unquote(query)
+    query_split = query.split()
+    result = []
+    for q in query_split:
+        q = q.lower()
+        if q in index:
+            for item in index[q]:
+                matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
+                if len(matching_results) == 0:
+                    result.append(item)
+                else:
+                    matching_results[0]["count"] += item["count"]
+    return result
-#            limit(100)
-            print(query)
-            for result in query.all():
-                if result[0] in results.keys():
-                    results[result[0]] += result[1]
-                else:
-                    results[result[0]] = result[1]
-        print('exiting ands: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-    if len(query_words['ngrams']) > 0:
-        print('entering ngrams: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+def handle_and():
+    pass
-
-        q = session.query(Documents.url, func.count(1)) \
-            .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
-            .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
-            .group_by(Documents.url)
-        conditions = []
-        for ngram in query_words['ngrams']:
-            conditions.append(
-                (NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram))
-#            q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
-        and_conditions = [and_(*condition_pair)
-                          for condition_pair in conditions]
-        q = q.filter(or_(*and_conditions))
-        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        print(q)
-        x = q.limit(100).all()
-        print('query executed: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        print(x)
-        for result in x:
-            if result[0] in results.keys():
-                results[result[0]] += result[1]
-            else:
-                results[result[0]] = result[1]
-#        for y in x:
-#            print(y)
-#            for document_ngram in y.document_ngrams:
-#                if document_ngram.document.url in results.keys():
-#                    results[document_ngram.document.url] += 1
-#                else:
-#                    results[document_ngram.document.url] = 1
-        print('exiting ngrams: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-
-    print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
-    session.close()
-    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())]
-
-
-# @app.route("/search/<query>")
-# def search(query):
-#     start_time = time.time_ns()
-#     session = Session()
-#     result = {}
-#     query_words = unquote(query).split()
-#     x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
-#     for word in query_words:
-#         word = word.lower()
-#         matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
-#
-#         if matching_ngram is None:
-#             continue
-#         for document_ngram in matching_ngram.document_ngrams:
-#             if document_ngram.document.url in result.keys():
-#                 result[document_ngram.document.url] += 1
-#             else:
-#                 result[document_ngram.document.url] = 1
-#     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
-#     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
diff --git a/todo b/todo
deleted file mode 100644
index ddda3bd..0000000
--- a/todo
+++ /dev/null
@@ -1,11 +0,0 @@
-[x] Refactor website table to generic document table (maybe using URN instead of URL?)
-[x] Define tokens table FKed to document table
-[x] Refactor index.py to tokenize input into tokens table
-[x] Define N-Grams table
-[x] Add N-Gram generation to index.py
-[x] Add clustered index to document_ngrams table model
-[x] Add clustered index to document_tokens table model
-[ ] Add ddl command to create partition tables
-[x] Investigate whether or not robots.txt is as aggressive as I'm making ito ut to be
-[x] Instead of starting from a random page on the site, go to root and find site map and crawl that
-