search-engine/src/search.py

148 lines
5.9 KiB
Python
Executable file

#!/usr/bin/python3
from sqlalchemy import create_engine, func, and_, or_
from config import DATABASE_URI
from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql.expression import distinct
import time
from flask import Flask
from urllib.parse import unquote
# Flask application object; the /search route below is registered on it.
app = Flask(__name__)
# Database engine for the connection string supplied by config.DATABASE_URI.
engine = create_engine(DATABASE_URI)
# Runs at import time: asks SQLAlchemy to create the tables declared on Base
# (by default, tables that already exist are left untouched).
Base.metadata.create_all(engine)
# Session factory bound to the engine; search() opens one session per call.
Session = sessionmaker(bind=engine)
# TODO: Boolean search (AND / OR / NOT / "quoted phrases")
def split_query(query):
    """Split a raw search string into the term groups consumed by ``search``.

    The string is split on whitespace and scanned left to right:

    * ``X and Y`` (operator matched case-insensitively, and only when a word
      follows it) stores ``"X,Y"`` in ``'ands'`` and consumes all three
      words. This check runs before the quote check.
    * A word starting with ``"`` opens a phrase that extends through the
      first word ending with ``"`` -- or to the end of the query when the
      quote is never closed. The phrase, without its surrounding quotes, is
      stored in ``'ngrams'``.
    * Every other word is stored in ``'ngrams'`` unchanged.

    Args:
        query: The raw (already URL-decoded) query text.

    Returns:
        A dict with the keys ``'ands'``, ``'ors'``, ``'words'`` and
        ``'ngrams'``, each mapping to a list of strings. ``'ors'`` and
        ``'words'`` are never populated by this function.
    """
    result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []}
    terms = query.split()
    total = len(terms)
    i = 0
    while i < total:
        # "X and Y": needs a word on both sides of the operator.
        if i + 2 < total and terms[i + 1].lower() == "and":
            result['ands'].append(terms[i] + ',' + terms[i + 2])
            i += 3
            continue
        # str.split() never yields empty strings, so [0] is always valid.
        if terms[i][0] == '"':
            # Find the closing word; fall back to the last word of the query
            # when the phrase is never closed.
            end = i
            while end < total - 1 and not terms[end].endswith('"'):
                end += 1
            phrase = ' '.join(terms[i:end + 1])[1:]  # drop the opening quote
            # Only strip a closing quote that is really there. The previous
            # fixed-width slice chopped the last character of unterminated
            # phrases ('"hello world' -> 'hello worl').
            if phrase.endswith('"'):
                phrase = phrase[:-1]
            result['ngrams'].append(phrase.rstrip())
            i = end + 1
            continue
        result['ngrams'].append(terms[i])
        i += 1
    return result
def _elapsed_ms(start_ns):
    """Return whole milliseconds elapsed since ``start_ns`` (a ``time.time_ns()`` value)."""
    return (time.time_ns() - start_ns) // 1_000_000


def _add_counts(results, rows):
    """Add each row's count (``row[1]``) to ``results`` under its URL (``row[0]``)."""
    for row in rows:
        results[row[0]] = results.get(row[0], 0) + row[1]


@app.route("/search/<query>")
def search(query):
    """Score documents against a search query and return the ten best URLs.

    The URL-decoded query is parsed with ``split_query`` and each non-empty
    term group contributes to a per-URL score:

    * ``'ands'``: for every ``"X,Y"`` pair, documents joined to both tokens
      (``HAVING COUNT(DISTINCT token_id) == 2``) score their number of
      matching document/token rows.
    * ``'ngrams'``: documents score their number of rows matching any of the
      phrases, where a phrase matches on both ``NGrams.size`` (its word
      count) and ``NGrams.gram`` (its text).
    * ``'words'``: documents score their number of rows whose token is in
      the list. ``split_query`` in this file never fills this group, so the
      branch is currently dormant.

    Timing and debug information is printed to stdout along the way.

    Args:
        query: The URL path segment captured by the route.

    Returns:
        A list of at most ten ``(url, score)`` tuples, highest score first.
        NOTE(review): returning a bare list from a view relies on the
        installed Flask converting lists to JSON (added in Flask 2.2, as far
        as I can tell) -- confirm against the deployed version.
    """
    start_time = time.time_ns()
    session = Session()
    results = {}
    # try/finally so the session (and its connection) is released even when
    # parsing or one of the queries raises.
    try:
        query_words = split_query(unquote(query))
        print(query_words)

        if query_words['ands']:
            for pair in query_words['ands']:
                parts = pair.split(',')
                # Named pair_query so the ``query`` argument is not rebound.
                pair_query = session.query(Documents.url, func.count(1)). \
                    join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
                    join(Tokens, Document_Tokens.token_id == Tokens.id).\
                    filter(Tokens.token.in_([parts[0], parts[1]])).\
                    group_by(Documents.url).\
                    having(func.count(distinct(Document_Tokens.token_id)) == 2).\
                    order_by(func.count(1).desc())
                _add_counts(results, pair_query.all())

        if query_words['ngrams']:
            print('entering ngrams: ' + str(_elapsed_ms(start_time)) + "ms")
            q = session.query(Documents.url, func.count(1)) \
                .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
                .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
                .group_by(Documents.url)
            # One (size AND gram) clause per phrase, OR-ed together.
            phrase_clauses = [
                and_(NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram)
                for ngram in query_words['ngrams']
            ]
            q = q.filter(or_(*phrase_clauses))
            print('query built: ' + str(_elapsed_ms(start_time)) + "ms")
            print(q)
            x = q.all()
            print('query executed: ' + str(_elapsed_ms(start_time)) + "ms")
            print(x)
            _add_counts(results, x)
            print('exiting ngrams: ' + str(_elapsed_ms(start_time)) + "ms")

        if query_words['words']:
            print('entering words: ' + str(_elapsed_ms(start_time)) + "ms")
            q = session.query(Documents.url, func.count(1)) \
                .join(Document_Tokens, Documents.id == Document_Tokens.document_id) \
                .join(Tokens, Document_Tokens.token_id == Tokens.id) \
                .group_by(Documents.url).filter(Tokens.token.in_(query_words['words']))
            print('query built: ' + str(_elapsed_ms(start_time)) + "ms")
            print(q)
            x = q.all()
            print('query executed: ' + str(_elapsed_ms(start_time)) + "ms")
            _add_counts(results, x)
            print('exiting words: ' + str(_elapsed_ms(start_time)) + "ms")

        print(str(_elapsed_ms(start_time)) + "ms")
    finally:
        session.close()
    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
# @app.route("/search/<query>")
# def search(query):
# start_time = time.time_ns()
# session = Session()
# result = {}
# query_words = unquote(query).split()
# x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
# for word in query_words:
# word = word.lower()
# matching_token = session.query(Tokens).filter_by(token=word).first()
#
# if matching_token is None:
# continue
# for document_token in matching_token.document_tokens:
# if document_token.document.url in result.keys():
# result[document_token.document.url] += 1
# else:
# result[document_token.document.url] = 1
# print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
# return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]