Indexer and query optimisations

rmgr 2024-04-06 19:34:59 +10:30
parent 9d57f66cd7
commit 9f0e7e6b29
2 changed files with 63 additions and 43 deletions

@@ -50,38 +50,51 @@ def search(query):
     session = Session()
     results = {}
     query_words = split_query(unquote(query))
-    for a in query_words['ands']:
-        query = session.query(Documents.url, func.count(1)). \
-            join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
-            join(Tokens, Document_Tokens.token_id == Tokens.id).\
-            filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
-            group_by(Documents.url).\
-            having(func.count(distinct(Document_Tokens.token_id)) == 2).\
-            order_by(func.count(1).desc())
-        for result in query.all():
-            if result[0] in results.keys():
-                results[result[0]] += result[1]
-            else:
-                results[result[0]] = result[1]
-    x = session.query(NGrams).filter(
-        NGrams.gram.in_(query_words['ngrams'])).all()
+    if len(query_words['ands']) > 0:
+        for a in query_words['ands']:
+            query = session.query(Documents.url, func.count(1)). \
+                join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
+                join(Tokens, Document_Tokens.token_id == Tokens.id).\
+                filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
+                group_by(Documents.url).\
+                having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+                order_by(func.count(1).desc())
+            for result in query.all():
+                if result[0] in results.keys():
+                    results[result[0]] += result[1]
+                else:
+                    results[result[0]] = result[1]
+    if len(query_words['ngrams']) > 0:
+        print('entering ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-    for y in x:
-        print(y.gram)
-        for document_ngram in y.document_ngrams:
-            if document_ngram.document.url in results.keys():
-                results[document_ngram.document.url] += 1
-            else:
-                results[document_ngram.document.url] = 1
+        q = session.query(NGrams)
+        for ngram in query_words['ngrams']:
+            q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
+        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
-    x = session.query(Tokens).filter(
-        Tokens.token.in_(query_words['words'])).limit(1000)
-    for y in x:
-        for document_token in y.document_tokens:
-            if document_token.document.url in results.keys():
-                results[document_token.document.url] += 1
-            else:
-                results[document_token.document.url] = 1
+        x = q.all()
+        for y in x:
+            for document_ngram in y.document_ngrams:
+                if document_ngram.document.url in results.keys():
+                    results[document_ngram.document.url] += 1
+                else:
+                    results[document_ngram.document.url] = 1
+        print('exiting ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    if len(query_words['words']) > 0:
+        print('entering words: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        x = session.query(Tokens).filter(
+            Tokens.token.in_(query_words['words'])).limit(1000)
+        for y in x:
+            for document_token in y.document_tokens:
+                if document_token.document.url in results.keys():
+                    results[document_token.document.url] += 1
+                else:
+                    results[document_token.document.url] = 1
+        print('exiting words: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
     return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
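The optimisation visible in the diff is twofold: each branch ('ands', 'ngrams', 'words') is now skipped outright when its part of the parsed query is empty, and the n-gram lookup is composed filter-by-filter and executed once with .all() instead of fetching a broad in_() match and dumping each gram as it goes. A minimal self-contained sketch of that incremental-composition pattern follows; the NGrams model shape here is assumed from the filter_by(size=..., gram=...) calls above, since the real model is defined elsewhere in the repo:

    # sketch.py -- illustrative only; the NGrams columns are assumptions
    # inferred from the diff, not the repo's actual model definition.
    from sqlalchemy import Column, Integer, String, create_engine
    from sqlalchemy.orm import declarative_base, sessionmaker

    Base = declarative_base()

    class NGrams(Base):
        __tablename__ = 'ngrams'
        id = Column(Integer, primary_key=True)
        gram = Column(String)   # the n-gram text, e.g. "open source"
        size = Column(Integer)  # word count of the gram

    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()

    def lookup_ngrams(query_ngrams):
        # Compose the query lazily: each filter_by() returns a new Query
        # object, and nothing touches the database until .all() runs.
        q = session.query(NGrams)
        for ngram in query_ngrams:
            q = q.filter_by(size=len(ngram.split(' ')), gram=ngram)
        return q.all()

Chained filter_by() calls are ANDed together, and the Query stays lazy throughout the loop, so the whole lookup costs a single SELECT when .all() finally fires.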
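Every branch also repeats the same scoring step: bump a per-URL counter, creating the key on first sight. For comparison, that bookkeeping plus the final sorted(...)[:10] is what collections.Counter provides out of the box; a hypothetical equivalent, not part of the commit:

    from collections import Counter

    results = Counter()
    for url in ['a.html', 'b.html', 'a.html']:
        results[url] += 1  # missing keys default to 0; no "in results" check

    # Equivalent to sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
    print(results.most_common(10))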