diff --git a/client/src/css/styles.css b/client/src/css/styles.css
deleted file mode 100644
index 37323ab..0000000
--- a/client/src/css/styles.css
+++ /dev/null
@@ -1,33 +0,0 @@
-html, body {
- height: 100%;
-}
-body {
- margin: 0;
-}
-input {
- padding: 7px;
- font-size: 1.1rem;
-}
-.search-container {
- display: flex;
- justify-content: center;
- align-items: center;
- text-align: center;
- min-height: 25vh;
-}
-
-.flex-container {
- padding: 0;
- margin: 0;
- display: flex;
- align-items: center;
- justify-content: center;
- flex-direction: column;
-}
-.flex-item {
-}
-.result {
- display:block;
- max-width: 60vw;
- overflow-x: hidden;
-}
diff --git a/client/src/index.html b/client/src/index.html
deleted file mode 100644
index a748d6c..0000000
--- a/client/src/index.html
+++ /dev/null
@@ -1,16 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/client/src/js/index.js b/client/src/js/index.js
deleted file mode 100644
index 09b0bb2..0000000
--- a/client/src/js/index.js
+++ /dev/null
@@ -1,28 +0,0 @@
-function debounce(func, timeout = 300){
- let timer;
- return (...args) => {
- clearTimeout(timer);
- timer = setTimeout(() => { func.apply(this, args); }, timeout);
- };
-}
-async function search(searchBox){
- const response = await fetch(`http://localhost:5000/search/${searchBox.value}`);
- const results = await response.json();
-
- const resultView = document.getElementById("results");
- resultView.replaceChildren();
- for (let i = 0; i < results.length; i++){
- let result = results[i];
- let resultElement = document.createElement("a");
- resultElement.innerText = result[0];
- resultElement.href = result[0];
- resultElement.className = "flex-item result";
- resultView.appendChild(resultElement);
- }
-}
-
-const searchBoxKeyUp = debounce(() => search())
-
-const searchBox = document.getElementById("searchbox");
-
-searchBox.addEventListener("keyup", debounce(() => search(searchBox)))
diff --git a/src/__pycache__/search.cpython-310.pyc b/src/__pycache__/search.cpython-310.pyc
index f3e8621..c740282 100644
Binary files a/src/__pycache__/search.cpython-310.pyc and b/src/__pycache__/search.cpython-310.pyc differ
diff --git a/src/crawl.py b/src/crawl.py
index 1480b4e..bc6470d 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -1,211 +1,104 @@
#!/usr/bin/python3
-
import argparse
import requests
+import hashlib
from urllib.parse import urlparse, urljoin
-import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
-from models import Base, Documents
+from models import Base, Website
from sqlalchemy.orm import sessionmaker
import datetime
-import yt_dlp as youtube_dl
# TODO- Handle gemini/gopher links
+# TODO- Keep a list of traversed links and check before traversing again
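+# A possible shape for that TODO (hypothetical sketch, not wired in below): before
+# fetching a link, skip it if the websites table shows it was crawled recently, e.g.
+#     s = Session()
+#     seen = s.query(Website).filter_by(url=link).first()
+#     s.close()
+#     if seen is not None and seen.last_crawl_date is not None:
+#         if datetime.datetime.now() - seen.last_crawl_date < datetime.timedelta(days=1):
+#             continue  # crawled within the last day; skip re-traversal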
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
-excluded_domains = ['amazon.', 'news.ycombinator.',
- 'facebook.com', 'amzn', 'fb.com']
-
-excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
- ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
-
-
def get_html(url: str) -> str:
+
response = requests.get(url)
return response.content
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[]) -> bool:
-def parse_youtube(video_url: str) -> bool:
- return
- # Language preference for subtitles (set to None for auto-generated)
- # Change this to 'en' for English subtitles, or None for auto-generated
- subtitle_language = 'en'
- # Options for youtube_dl
- ydl_opts = {
- 'writesubtitles': True,
- 'allsubtitles': True,
- 'skip_download': True, # We only want to fetch metadata
- 'subtitleslangs': [subtitle_language] if subtitle_language else None,
- 'extractor-args': {'youtube': {'player_client': 'ios,web'}},
- }
-
- # Initialize youtube_dl object
- with youtube_dl.YoutubeDL(ydl_opts) as ydl:
- # Download metadata
- info_dict = ydl.extract_info(video_url, download=False)
-
- # Extract subtitles
- subtitles = info_dict.get('subtitles')
- subtitles_text = ""
- # Print available subtitles
- if subtitles:
- for subs in subtitles.values():
- for sub in subs:
- subtitle_url = sub['url']
- with youtube_dl.YoutubeDL({}) as ydl:
- subtitle_info = ydl.extract_info(
- subtitle_url, download=False)
- for subtitle in subtitle_info['subtitles'][subtitle_language]:
- if subtitle["ext"] == "srv1":
- soup = BeautifulSoup(
- get_html(subtitle["url"]), 'html.parser')
- subtitles_text = soup.get_text()
-
- s = Session()
- existing_website = s.query(
- Documents).filter_by(url=video_url).first()
- if existing_website is None:
- website = Documents(
- url=video_url,
- text_content=subtitles_text,
- html_content=None, # soup.prettify(),
- first_crawl_date=datetime.datetime.now(),
- last_crawl_date=datetime.datetime.now(),
- last_index_date=None
- )
- s.add(website)
- else:
- existing_website.last_crawl_date = datetime.datetime.now()
- s.add(existing_website)
- s.commit()
- s.close()
-
-
-def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
- for domain in excluded_domains:
- if domain in url:
- return
- if any(ext in url for ext in excluded_filetypes):
- return
- if "youtube.com" in url:
- parse_youtube(url)
- return
- rp = urllib.robotparser.RobotFileParser()
print(url)
print(recursion)
urlparts = urlparse(url)
baseurl = urlparts.scheme + "://" + urlparts.netloc
- if baseurl not in robots:
- rp.set_url(baseurl + "/robots.txt")
- rp.read()
- robots[baseurl] = rp
- else:
- rp = robots[baseurl]
- if not rp.can_fetch("*", url):
- print("Robots prevents crawling url: " + url)
- return
-
- soup = BeautifulSoup(html, 'html.parser')
+    soup = BeautifulSoup(html, 'html.parser')
+ hash = hashlib.sha256()
+ hash.update(url.encode('ascii'))
s = Session()
- existing_website = s.query(Documents).filter_by(url=url).first()
- if existing_website is None:
- website = Documents(
- url=url,
- text_content=soup.get_text(),
- html_content=soup.prettify(),
- first_crawl_date=datetime.datetime.now(),
- last_crawl_date=datetime.datetime.now(),
- last_index_date=None
- )
+    existing_website = s.query(Website).filter_by(url=url).first()
+    print(existing_website)
+    if existing_website is None:
+        website = Website(
+            url=url,
+            text_content=soup.get_text(),
+            html_content=soup.prettify(),
+            first_crawl_date=datetime.datetime.now(),
+            last_crawl_date=datetime.datetime.now()
+        )
s.add(website)
else:
existing_website.last_crawl_date = datetime.datetime.now()
s.add(existing_website)
s.commit()
s.close()
- links = soup.find_all("a", href=True)
+    # ensure data/links.txt exists (it is removed at the end of a crawl run)
+    open('data/links.txt', 'a').close()
+    links = soup.find_all("a", href=True)
for link in links:
found = False
link = link["href"]
if (len(link) > 0 and link[0] == "#") or "localhost" in link:
continue
- if any(ext in link for ext in excluded_filetypes):
- continue
- if "http" not in link:
+ if not "http" in link:
link = urljoin(url, link)
- link = link.split('?')[0]
- link = link.split('#')[0]
if (recursion > 0 and link not in traversed_links):
try:
traversed_links.append(link)
link_html = get_html(link)
- r = recursion - 1
- sleep(0.5)
+                r = recursion - 1
+ sleep(1)
parse_html(link, link_html, r, traversed_links)
except:
pass
-# elif link not in traversed_links:
-# with open('data/links.txt', 'r+') as linksfile:
+# else:
+# with open(f'data/links.txt', 'r+') as linksfile:
# while line := linksfile.readline():
# if line.strip() == link.strip():
# found = True
# if not found:
# linksfile.write(f'{link}\n')
-
-def parse_site_map(base_url):
- map = BeautifulSoup(requests.get(base_url).content, 'xml')
- print(map.find_all('loc'))
- for loc in map.find_all('loc'):
- if "xml" in loc.contents[0]:
- parse_site_map(loc.contents[0])
- else:
- url = loc.contents[0]
- html = get_html(url)
- parse_html(url, html, max_recursion)
-
-
if __name__ == "__main__":
+
os.makedirs("data/content", exist_ok=True)
# check inputs
parser = argparse.ArgumentParser()
parser.add_argument("url", help="URL of the webpage to be crawled")
parser.add_argument('-f', "--followlinks", action="store_true")
- parser.add_argument('-s', "--crawl-sitemap", action="store_true")
- parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)
-
+ max_recursion = 4
args = parser.parse_args()
- max_recursion = int(args.max_recursion)
- if args.url == "links":
- with open('data/links.txt', 'r+') as linksfile:
- while line := linksfile.readline():
- if "http" in line:
- try:
- parse_html(line, get_html(line))
- except:
- pass
- elif args.crawl_sitemap:
- rp = urllib.robotparser.RobotFileParser()
- urlparts = urlparse(args.url)
- baseurl = urlparts.scheme + "://" + urlparts.netloc
- rp.set_url(baseurl + "/robots.txt")
- rp.read()
- if not rp.can_fetch("*", args.url):
- print("Robots prevents crawling url: " + args.url)
- exit(0)
- if len(rp.site_maps()) > 0:
- parse_site_map(rp.site_maps()[0])
- else:
- html = get_html(args.url)
- parse_html(args.url, html, max_recursion)
+ html = get_html(args.url)
+ parse_html(args.url, html, max_recursion)
# recursion = 0
# if (args.followlinks):
-# os.remove('data/links.txt')
+# with open(f'data/links.txt', 'r+') as linksfile:
+# while line := linksfile.readline():
+# if recursion < max_recursion:
+# if "http" in line:
+# recursion += 1
+# try:
+# parse_html(line, get_html(line))
+# except:
+# pass
+ os.remove('data/links.txt')
diff --git a/src/index.py b/src/index.py
index 679d312..e04c787 100644
--- a/src/index.py
+++ b/src/index.py
@@ -1,154 +1,54 @@
-#!/usr/bin/python3
-
-import argparse
-from sqlalchemy import create_engine, or_, text
-from sqlalchemy import Table, Column, String, Integer
+from sqlalchemy import create_engine
from config import DATABASE_URI
-from sqlalchemy.dialects.postgresql import UUID
-from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy.exc import SQLAlchemyError
-import uuid
-import datetime
-import time
-import re
-import random
-from multiprocessing import Pool
+from models import Base, Website
+from pathlib import Path
+import argparse
+import os
+import json
+# investigate ngrams for "multi word" matching
+ignored_words = ['a', 'the', 'is']
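+# Rough illustration of the ngram note above (hypothetical helper, not used below):
+#     def ngrams(words, n):
+#         return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]
+#     # ngrams(['full', 'text', 'search'], 2) -> ['full text', 'text search']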
-engine = create_engine(DATABASE_URI)
-Base.metadata.create_all(engine)
-Session = sessionmaker(bind=engine)
-# https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html
-
-
-def contains_latin(text):
- latin_pattern = r'[a-zA-ZÀ-ÖØ-öø-ÿ]'
- return bool(re.search(latin_pattern, text))
-
-
-def build_index_chunk(document_chunk):
- session = Session()
- print(len(document_chunk))
- start_time = time.time_ns()
- for document in document_chunk:
- print(document.url)
- content = re.sub(r'[.,?!]', ' ', str(document.text_content))
- content = re.sub(r'[^\w\s]', '', str(content))
- content_words = content.split()
- build_ngrams(1, content_words, document.id)
- build_ngrams(2, content_words, document.id)
- build_ngrams(3, content_words, document.id)
- build_ngrams(4, content_words, document.id)
- build_ngrams(5, content_words, document.id)
-
- document.last_index_date = datetime.datetime.now()
- session.merge(document)
- session.commit()
- session.close()
+def remove_punctuation(input_string):
+    punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
+ for p in punc:
+ input_string = input_string.replace(p, '')
+ return input_string
def build_index():
- while True:
- session = Session()
- documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
- None), Documents.last_index_date < Documents.last_crawl_date)).limit(100)
- session.close()
-
- # Execute the query to get the result set
- documents = list(documents_query)
- if len(documents) == 0:
- return
- build_index_chunk(documents)
- continue
- chunk_size = 10
- document_chunks = [documents[i:i+chunk_size]
- for i in range(0, len(documents), chunk_size)]
- with Pool() as pool:
- pool.map(build_index_chunk, document_chunks)
-
-
-def zip_ngrams(size: int, corpus, document_id):
- size = int(size)
- connection = engine.connect()
- temptbl_name = 'temp_del_{}'.format(
- time.time_ns() + random.randint(100000, 9999999))
- temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column(
- 'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True)
-
- try:
- # Start transaction
- with connection.begin():
- temptbl.create(engine)
- insert_grams = []
- grams = zip(*[corpus[i:] for i in range(size)])
- for gram in grams:
- gram = ' '.join(gram).lower()
- insert_grams.append(
- {"id": uuid.uuid4(), "gram": gram, "size": size})
- connection.execute(temptbl.insert().values(insert_grams))
- connection.execute(text("UPDATE " + temptbl_name +
- " SET id = ngrams.id FROM ngrams WHERE ngrams.gram = "
- + temptbl_name + ".gram;"))
- connection.execute(text("INSERT INTO ngrams (id, gram, size) SELECT " +
- " distinct t.id, t.gram as gram, t.size FROM " +
- temptbl_name + " t LEFT JOIN ngrams on ngrams.gram = " +
- "t.gram WHERE ngrams.id is null and t.size is not null " + " ON CONFLICT DO NOTHING;"))
- connection.execute(text("INSERT INTO document_ngrams(id, document_id, ngram_id) SELECT DISTINCT " +
- "uuid_generate_v4() , '" + str(document_id) + "'::UUID, t.id FROM " + temptbl_name + " t;"))
- except SQLAlchemyError as e:
- # Handle exceptions
- print("An error occurred:", e)
- # Rollback transaction
- connection.rollback()
- else:
- # Commit transaction if no exceptions occurred
- connection.commit()
- finally:
- connection.close()
- # Drop table outside the transaction block
- temptbl.drop(engine)
-
-
-def build_ngrams(size: int, corpus: str, document_id: str):
- session = Session()
- zip_ngrams(size, corpus, document_id)
- return
- i = 0
- grams = []
- while i < len(corpus):
- if i + size >= len(corpus):
- i = len(corpus)
- gram = ''
- for n in range(0, size):
- if i + n >= len(corpus):
- break
- gram += corpus[i+n] + ' '
- gram = gram.strip().lower()
- if len(gram) > 1000 or gram in grams or not contains_latin(gram):
- i += 1
- continue
- grams.append(gram)
- if (len(gram) > 1):
- ngram = session.query(NGrams).filter_by(
- gram=gram).filter_by(size=size).first()
- if ngram is None:
- ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
- session.add(ngram)
- document_ngram = Document_NGrams(
- document_id=document_id, ngram_id=ngram.id)
- session.add(document_ngram)
- session.commit()
- i += 1
-# print(str((time.time_ns() - start_time)//1_000_000))
- session.close()
-
+ with open(f"data/index.json", "w") as index:
+ # get a list of all content files
+ # split on whitespace and add to index
+ dictionary = {}
+ pathlist = Path('data/content').rglob('*.txt')
+ for path in pathlist:
+ with open(str(path)) as content_file:
+ url = content_file.readline()
+ content = content_file.read()
+ content_words = content.split()
+ for word in content_words:
+ word = word.lower()
+ word = remove_punctuation(word)
+ if not word in ignored_words:
+ if not word in dictionary:
+ dictionary[word] = []
+ matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
+ if len(matching_urls) == 0:
+# if not url.strip() in dictionary[word]:
+ entries = dictionary[word]
+ entry = {"url": url.strip(), "count": 1, "filename": str(path)}
+ dictionary[word].append(entry)
+ else:
+ entries = dictionary[word]
+ entry = matching_urls[0]
+ entry["count"] += 1
+ entries.sort(reverse=True, key=lambda entry: entry["count"])
+ index.write(json.dumps(dictionary))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('-r',
- "--rebuild",
- action="store_true",
- help="Blow away the index and rebuild")
+ parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
args = parser.parse_args()
if args.rebuild:
build_index()
+
diff --git a/src/index.py.old b/src/index.py.old
deleted file mode 100644
index 6ec8e21..0000000
--- a/src/index.py.old
+++ /dev/null
@@ -1,54 +0,0 @@
-from sqlalchemy import create_engine
-from config import DATABASE_URI
-from models import Base, Website
-from pathlib import Path
-import argparse
-import os
-import json
-# investigate ngrams for "multi word" matching
-ignored_words = ['a', 'the','is']
-
-def remove_punctuation(input_string):
- punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
- for p in punc:
- input_string = input_string.replace(p, '')
- return input_string
-
-
-def build_index():
- with open("data/index.json", "w") as index:
- # get a list of all content files
- # split on whitespace and add to index
- dictionary = {}
- pathlist = Path('data/content').rglob('*.txt')
- for path in pathlist:
- with open(str(path)) as content_file:
- url = content_file.readline()
- content = content_file.read()
- content_words = content.split()
- for word in content_words:
- word = word.lower()
- word = remove_punctuation(word)
- if word not in ignored_words:
- if word not in dictionary:
- dictionary[word] = []
- matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
- if len(matching_urls) == 0:
-# if not url.strip() in dictionary[word]:
- entries = dictionary[word]
- entry = {"url": url.strip(), "count": 1, "filename": str(path)}
- dictionary[word].append(entry)
- else:
- entries = dictionary[word]
- entry = matching_urls[0]
- entry["count"] += 1
- entries.sort(reverse=True, key=lambda entry: entry["count"])
- index.write(json.dumps(dictionary))
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
- args = parser.parse_args()
- if args.rebuild:
- build_index()
-
diff --git a/src/models.py b/src/models.py
index 50010b6..ee768d4 100644
--- a/src/models.py
+++ b/src/models.py
@@ -1,72 +1,18 @@
from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer
+from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy.dialects.postgresql import UUID
-from sqlalchemy.orm import relationship, mapped_column
import uuid
Base = declarative_base()
+class Website(Base):
-class Documents(Base):
- __tablename__ = 'documents'
- id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+ __tablename__ = 'websites'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
url = Column(String)
text_content = Column(String)
html_content = Column(String)
first_crawl_date = Column(DateTime)
last_crawl_date = Column(DateTime)
- last_index_date = Column(DateTime)
- document_tokens = relationship(
- "Document_Tokens", back_populates="document")
- document_ngrams = relationship(
- "Document_NGrams", back_populates="document")
-class Document_Tokens(Base):
- __tablename__ = 'document_tokens'
- id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
- document_id = mapped_column(ForeignKey("documents.id"))
- # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
- token_id = mapped_column(ForeignKey("tokens.id"))
- # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
- document = relationship(
- "Documents", back_populates="document_tokens", uselist=False)
- token = relationship("Tokens", back_populates="document_tokens")
- __table_args__ = (
- Index('idx_document_tokens_document_id_token_id', 'document_id',
- 'token_id', unique=True, postgresql_using='hash'),
- Index('idx_document_tokens_clustered', 'document_id',
- 'token_id', postgresql_using='hash'),
- )
-
-
-class Tokens(Base):
- __tablename__ = 'tokens'
- id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
- token = Column(String, index=True)
- document_tokens = relationship("Document_Tokens", back_populates="token")
-
-
-class NGrams(Base):
- __tablename__ = 'ngrams'
- id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
- size = Column(Integer, index=True)
- gram = Column(String, index=True)
- document_ngrams = relationship("Document_NGrams", back_populates="ngram")
-
-
-class Document_NGrams(Base):
- __tablename__ = 'document_ngrams'
- id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
- document_id = mapped_column(ForeignKey("documents.id"))
- ngram_id = mapped_column(ForeignKey("ngrams.id"))
- document = relationship(
- "Documents", back_populates="document_ngrams", uselist=False)
- ngram = relationship("NGrams", back_populates="document_ngrams")
-
- __table_args__ = (
- Index('idx_document_ngrams_document_id_ngram_id', 'document_id',
- 'ngram_id', unique=True, postgresql_using='hash'),
- Index('idx_document_ngrams_clustered', 'document_id',
- 'ngram_id', postgresql_using='hash'),
- )
diff --git a/src/search.py b/src/search.py
index fd013bc..17668f9 100755
--- a/src/search.py
+++ b/src/search.py
@@ -1,146 +1,30 @@
-#!/usr/bin/python3
-from sqlalchemy import create_engine, func, and_, or_, not_
-from config import DATABASE_URI
-from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy.sql.expression import distinct
-import time
+#!/usr/bin/python3
+
from flask import Flask
-from flask_cors import CORS
-from flask import send_from_directory
+from flask import Request
+import json
from urllib.parse import unquote
-app = Flask(__name__, static_url_path='/static/')
-CORS(app)
-engine = create_engine(DATABASE_URI)
-Base.metadata.create_all(engine)
-Session = sessionmaker(bind=engine)
-# Todo - Boolean search (AND/OR/NOT/"")
-
-
-def split_query(query):
- query = query.lower()
- result = {'ands': [], 'ors': [], 'words': [],
- 'ngrams': [], 'exclusions': []}
- query_words = query.split()
- i = 0
- while i < len(query_words):
- if i + 1 < len(query_words):
- if query_words[i + 1].lower() == "and":
- if i + 2 < len(query_words):
- result['ands'].append(
- query_words[i] + ',' + query_words[i+2])
- i = i + 3
- continue
- if query_words[i][0] == '"':
- n = 0
- quoted_query = ""
- while i+n < len(query_words):
- quoted_query += query_words[i+n] + ' '
- if query_words[i+n][len(query_words[i+n])-1] == '"':
- break
- n += 1
- result['ngrams'].append(
- quoted_query[1:len(quoted_query)-2].rstrip())
- i += n + 1
- continue
- elif query_words[i][0] == "-":
- excluded_query = query_words[i][1: len(query_words[i])]
- result['exclusions'].append(excluded_query)
- i += 1
- continue
- result['ngrams'].append(query_words[i])
- i += 1
- return result
-
-
-@ app.route("/search/")
+app = Flask(__name__)
+## Todo - Boolean search (AND/OR/NOT/"")
+@app.route("/search/")
def search(query):
- start_time = time.time_ns()
- session = Session()
- results = {}
- query_words = split_query(unquote(query))
- print(query_words)
- if len(query_words['ands']) > 0:
- print('entering ands: ' +
- str((time.time_ns() - start_time) // 1_000_000) + "ms")
- for a in query_words['ands']:
- query = session.query(Documents.url, func.count(1)). \
- join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
- join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
- filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
- group_by(Documents.url). \
- having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
- order_by(func.count(1).desc())
+ with open('data/index.json', 'r') as index_json:
+ index = json.load(index_json)
+ query = unquote(query)
+ query_split = query.split()
+ result = []
+ for q in query_split:
+ q = q.lower()
+ if q in index:
+ for item in index[q]:
+ matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
+ if len(matching_results) == 0:
+ result.append(item)
+ else:
+ matching_results[0]["count"] += item["count"]
+ return result
-# limit(100)
- print(query)
- for result in query.all():
- if result[0] in results.keys():
- results[result[0]] += result[1]
- else:
- results[result[0]] = result[1]
- print('exiting ands: ' +
- str((time.time_ns() - start_time) // 1_000_000) + "ms")
- if len(query_words['ngrams']) > 0:
- print('entering ngrams: ' +
- str((time.time_ns() - start_time) // 1_000_000) + "ms")
+def handle_and():
+ pass
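+# One way handle_and() might eventually work (hypothetical sketch, not implemented):
+# intersect the per-term URL sets from the index and keep only entries common to all
+# terms, merging counts afterwards as search() already does, e.g.
+#     def handle_and(index, terms):
+#         url_sets = [{e["url"] for e in index.get(t.lower(), [])} for t in terms]
+#         common = set.intersection(*url_sets) if url_sets else set()
+#         return [e for t in terms for e in index.get(t.lower(), []) if e["url"] in common]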
- q = session.query(Documents.url, func.count(1)) \
- .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
- .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
- .group_by(Documents.url)
- conditions = []
- for ngram in query_words['ngrams']:
- conditions.append(
- (NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram))
-# q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
- and_conditions = [and_(*condition_pair)
- for condition_pair in conditions]
- q = q.filter(or_(*and_conditions))
- print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
- print(q)
- x = q.limit(100).all()
- print('query executed: ' +
- str((time.time_ns() - start_time) // 1_000_000) + "ms")
- print(x)
- for result in x:
- if result[0] in results.keys():
- results[result[0]] += result[1]
- else:
- results[result[0]] = result[1]
-# for y in x:
-# print(y)
-# for document_ngram in y.document_ngrams:
-# if document_ngram.document.url in results.keys():
-# results[document_ngram.document.url] += 1
-# else:
-# results[document_ngram.document.url] = 1
- print('exiting ngrams: ' +
- str((time.time_ns() - start_time) // 1_000_000) + "ms")
-
- print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
- session.close()
- return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())]
-
-
-# @app.route("/search/")
-# def search(query):
-# start_time = time.time_ns()
-# session = Session()
-# result = {}
-# query_words = unquote(query).split()
-# x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
-# for word in query_words:
-# word = word.lower()
-# matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
-#
-# if matching_ngram is None:
-# continue
-# for document_ngram in matching_ngram.document_ngrams:
-# if document_ngram.document.url in result.keys():
-# result[document_ngram.document.url] += 1
-# else:
-# result[document_ngram.document.url] = 1
-# print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
-# return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
diff --git a/todo b/todo
deleted file mode 100644
index ddda3bd..0000000
--- a/todo
+++ /dev/null
@@ -1,11 +0,0 @@
-[x] Refactor website table to generic document table (maybe using URN instead of URL?)
-[x] Define tokens table FKed to document table
-[x] Refactor index.py to tokenize input into tokens table
-[x] Define N-Grams table
-[x] Add N-Gram generation to index.py
-[x] Add clustered index to document_ngrams table model
-[x] Add clustered index to document_tokens table model
-[ ] Add ddl command to create partition tables
-[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
-[x] Instead of starting from a random page on the site, go to root and find site map and crawl that
-