#!/usr/bin/python3
|
|
from sqlalchemy import create_engine, func, and_, or_
|
|
from config import DATABASE_URI
|
|
from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams
|
|
from sqlalchemy.orm import sessionmaker
|
|
from sqlalchemy.sql.expression import distinct
|
|
import time
|
|
from flask import Flask
|
|
from urllib.parse import unquote
|
|
|
|
# Flask application object; routes are registered below via @app.route.
app = Flask(__name__)

# Database engine built from the project-level connection string.
engine = create_engine(DATABASE_URI)

# Create any missing tables for the ORM models before serving requests.
Base.metadata.create_all(engine)

# Session factory; each request opens (and must close) its own session.
Session = sessionmaker(bind=engine)

# Todo - Boolean search (AND/OR/NOT/"")
|
|
|
|
|
|
def split_query(query):
    """Break a raw search string into clause buckets.

    Returns a dict with keys:
      'ands'   - comma-joined pairs from "X and Y" patterns ('and' matched
                 case-insensitively, operand words kept verbatim),
      'ors'    - reserved, never filled here,
      'words'  - reserved, never filled here,
      'ngrams' - quoted phrases (quotes stripped) plus all remaining words.

    A trailing "and" with no right-hand operand is treated as an ordinary
    word and lands in 'ngrams'.
    """
    parsed = {'ands': [], 'ors': [], 'words': [], 'ngrams': []}
    words = query.split()
    total = len(words)
    pos = 0
    while pos < total:
        # "X and Y" only counts when both operands are present.
        if pos + 2 < total and words[pos + 1].lower() == "and":
            parsed['ands'].append(words[pos] + ',' + words[pos + 2])
            pos += 3
            continue
        if words[pos].startswith('"'):
            # Collect words until one ends with a closing quote
            # (or the input runs out).
            span = 0
            pieces = []
            while pos + span < total:
                pieces.append(words[pos + span])
                if words[pos + span].endswith('"'):
                    break
                span += 1
            phrase = ' '.join(pieces) + ' '
            # Drop the opening quote plus the final two characters (the
            # closing quote and trailing space in the terminated case).
            parsed['ngrams'].append(phrase[1:-2].rstrip())
            pos += span + 1
            continue
        parsed['ngrams'].append(words[pos])
        pos += 1
    return parsed
|
|
|
|
|
|
def _merge_counts(results, rows):
    """Accumulate (url, count) rows into the shared results dict."""
    for url, count in rows:
        results[url] = results.get(url, 0) + count


@app.route("/search/<query>")
def search(query):
    """Search documents for *query*; return the top 10 (url, score) pairs.

    Three clause types from split_query() are supported:
      - 'ands':   "a AND b" pairs; a document must contain both tokens.
      - 'ngrams': quoted phrases and single words, matched against NGrams
                  by (size, gram).
      - 'words':  single-token matches. NOTE(review): split_query() never
                  fills 'words', so this branch is currently unreachable;
                  kept for compatibility.

    Scores from all clause types are summed per document URL.
    """
    start_time = time.time_ns()

    def elapsed_ms():
        # Milliseconds since the request started, for the timing prints.
        return str((time.time_ns() - start_time) // 1_000_000) + "ms"

    session = Session()
    results = {}
    try:
        query_words = split_query(unquote(query))
        print(query_words)

        # "a AND b": a document must contain BOTH tokens —
        # count(distinct token_id) == 2 enforces that per URL group.
        for pair in query_words['ands']:
            left, right = pair.split(',')[0], pair.split(',')[1]
            and_query = session.query(Documents.url, func.count(1)). \
                join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
                join(Tokens, Document_Tokens.token_id == Tokens.id).\
                filter(Tokens.token.in_([left, right])).\
                group_by(Documents.url).\
                having(func.count(distinct(Document_Tokens.token_id)) == 2).\
                order_by(func.count(1).desc())
            _merge_counts(results, and_query.all())

        if query_words['ngrams']:
            print('entering ngrams: ' + elapsed_ms())
            q = session.query(Documents.url, func.count(1)) \
                .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
                .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
                .group_by(Documents.url)
            # OR together one (size, gram) condition pair per ngram so a
            # single query covers every phrase/word in the search.
            ngram_conditions = [
                and_(NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram)
                for ngram in query_words['ngrams']
            ]
            q = q.filter(or_(*ngram_conditions))
            print('query built: ' + elapsed_ms())
            print(q)
            x = q.all()
            print('query executed: ' + elapsed_ms())
            print(x)
            _merge_counts(results, x)
            print('exiting ngrams: ' + elapsed_ms())

        if query_words['words']:
            print('entering words: ' + elapsed_ms())
            q = session.query(Documents.url, func.count(1)) \
                .join(Document_Tokens, Documents.id == Document_Tokens.document_id) \
                .join(Tokens, Document_Tokens.token_id == Tokens.id) \
                .group_by(Documents.url) \
                .filter(Tokens.token.in_(query_words['words']))
            print('query built: ' + elapsed_ms())
            print(q)
            x = q.all()
            print('query executed: ' + elapsed_ms())
            _merge_counts(results, x)
            print('exiting words: ' + elapsed_ms())

        print(elapsed_ms())
        return sorted(results.items(), key=lambda item: item[1], reverse=True)[:10]
    finally:
        # Always release the session, even when a query raises — the
        # original leaked the session on any exception.
        session.close()
|
|
|
|
# @app.route("/search/<query>")
|
|
# def search(query):
|
|
# start_time = time.time_ns()
|
|
# session = Session()
|
|
# result = {}
|
|
# query_words = unquote(query).split()
|
|
# x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
|
|
# for word in query_words:
|
|
# word = word.lower()
|
|
# matching_token = session.query(Tokens).filter_by(token=word).first()
|
|
#
|
|
# if matching_token is None:
|
|
# continue
|
|
# for document_token in matching_token.document_tokens:
|
|
# if document_token.document.url in result.keys():
|
|
# result[document_token.document.url] += 1
|
|
# else:
|
|
# result[document_token.document.url] = 1
|
|
# print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
|
|
# return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
|