Rework ngram generation. Greatly improve performance of indexer. Commit horrendous SQL sins.
This commit is contained in:
parent
9f0e7e6b29
commit
bdb4064acc
5 changed files with 155 additions and 57 deletions
|
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/python3
|
||||
from sqlalchemy import create_engine, func
|
||||
from config import DATABASE_URI
|
||||
from models import Base, Tokens, Documents, Document_Tokens, NGrams
|
||||
from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from sqlalchemy.sql.expression import distinct
|
||||
import time
|
||||
|
|
@ -37,9 +37,9 @@ def split_query(query):
|
|||
n += 1
|
||||
result['ngrams'].append(
|
||||
quoted_query[1:len(quoted_query)-2].rstrip())
|
||||
i += n
|
||||
i += n + 1
|
||||
continue
|
||||
result['words'].append(query_words[i])
|
||||
result['ngrams'].append(query_words[i])
|
||||
i += 1
|
||||
return result
|
||||
|
||||
|
|
@ -50,6 +50,7 @@ def search(query):
|
|||
session = Session()
|
||||
results = {}
|
||||
query_words = split_query(unquote(query))
|
||||
print(query_words)
|
||||
if len(query_words['ands']) > 0:
|
||||
for a in query_words['ands']:
|
||||
query = session.query(Documents.url, func.count(1)). \
|
||||
|
|
@ -68,35 +69,55 @@ def search(query):
|
|||
print('entering ngrams: ' +
|
||||
str((time.time_ns() - start_time) // 1_000_000) + "ms")
|
||||
|
||||
q = session.query(NGrams)
|
||||
q = session.query(Documents.url, func.count(1)) \
|
||||
.join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
|
||||
.join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
|
||||
.group_by(Documents.url)
|
||||
for ngram in query_words['ngrams']:
|
||||
q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
|
||||
print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
|
||||
|
||||
print(q)
|
||||
x = q.all()
|
||||
for y in x:
|
||||
for document_ngram in y.document_ngrams:
|
||||
if document_ngram.document.url in results.keys():
|
||||
results[document_ngram.document.url] += 1
|
||||
else:
|
||||
results[document_ngram.document.url] = 1
|
||||
print('query executed: ' +
|
||||
str((time.time_ns() - start_time) // 1_000_000) + "ms")
|
||||
print(x)
|
||||
for result in x:
|
||||
if result[0] in results.keys():
|
||||
results[result[0]] += result[1]
|
||||
else:
|
||||
results[result[0]] = result[1]
|
||||
# for y in x:
|
||||
# print(y)
|
||||
# for document_ngram in y.document_ngrams:
|
||||
# if document_ngram.document.url in results.keys():
|
||||
# results[document_ngram.document.url] += 1
|
||||
# else:
|
||||
# results[document_ngram.document.url] = 1
|
||||
print('exiting ngrams: ' +
|
||||
str((time.time_ns() - start_time) // 1_000_000) + "ms")
|
||||
if len(query_words['words']) > 0:
|
||||
print('entering words: ' +
|
||||
str((time.time_ns() - start_time) // 1_000_000) + "ms")
|
||||
x = session.query(Tokens).filter(
|
||||
Tokens.token.in_(query_words['words'])).limit(1000)
|
||||
for y in x:
|
||||
for document_token in y.document_tokens:
|
||||
if document_token.document.url in results.keys():
|
||||
results[document_token.document.url] += 1
|
||||
else:
|
||||
results[document_token.document.url] = 1
|
||||
q = session.query(Documents.url, func.count(1)) \
|
||||
.join(Document_Tokens, Documents.id == Document_Tokens.document_id) \
|
||||
.join(Tokens, Document_Tokens.token_id == Tokens.id) \
|
||||
.group_by(Documents.url).filter(Tokens.token.in_(query_words['words']))
|
||||
|
||||
print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
|
||||
print(q)
|
||||
x = q.all()
|
||||
print('query executed: ' +
|
||||
str((time.time_ns() - start_time) // 1_000_000) + "ms")
|
||||
for result in x:
|
||||
if result[0] in results.keys():
|
||||
results[result[0]] += result[1]
|
||||
else:
|
||||
results[result[0]] = result[1]
|
||||
print('exiting words: ' +
|
||||
str((time.time_ns() - start_time) // 1_000_000) + "ms")
|
||||
|
||||
print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
|
||||
session.close()
|
||||
return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
|
||||
|
||||
# @app.route("/search/<query>")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue