Indexer and query optimisations
parent 9d57f66cd7
commit 9f0e7e6b29

2 changed files with 63 additions and 43 deletions
@@ -50,38 +50,51 @@ def search(query):
     session = Session()
     results = {}
     query_words = split_query(unquote(query))
-    for a in query_words['ands']:
-        query = session.query(Documents.url, func.count(1)). \
-            join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
-            join(Tokens, Document_Tokens.token_id == Tokens.id).\
-            filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
-            group_by(Documents.url).\
-            having(func.count(distinct(Document_Tokens.token_id)) == 2).\
-            order_by(func.count(1).desc())
-        for result in query.all():
-            if result[0] in results.keys():
-                results[result[0]] += result[1]
-            else:
-                results[result[0]] = result[1]
-
-    x = session.query(NGrams).filter(
-        NGrams.gram.in_(query_words['ngrams'])).all()
-    for y in x:
-        print(y.gram)
-        for document_ngram in y.document_ngrams:
-            if document_ngram.document.url in results.keys():
-                results[document_ngram.document.url] += 1
-            else:
-                results[document_ngram.document.url] = 1
-
-    x = session.query(Tokens).filter(
-        Tokens.token.in_(query_words['words'])).limit(1000)
-    for y in x:
-        for document_token in y.document_tokens:
-            if document_token.document.url in results.keys():
-                results[document_token.document.url] += 1
-            else:
-                results[document_token.document.url] = 1
+    if len(query_words['ands']) > 0:
+        for a in query_words['ands']:
+            query = session.query(Documents.url, func.count(1)). \
+                join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
+                join(Tokens, Document_Tokens.token_id == Tokens.id).\
+                filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
+                group_by(Documents.url).\
+                having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+                order_by(func.count(1).desc())
+            for result in query.all():
+                if result[0] in results.keys():
+                    results[result[0]] += result[1]
+                else:
+                    results[result[0]] = result[1]
+    if len(query_words['ngrams']) > 0:
+        print('entering ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        q = session.query(NGrams)
+        for ngram in query_words['ngrams']:
+            q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
+        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
+
+        x = q.all()
+        for y in x:
+            for document_ngram in y.document_ngrams:
+                if document_ngram.document.url in results.keys():
+                    results[document_ngram.document.url] += 1
+                else:
+                    results[document_ngram.document.url] = 1
+        print('exiting ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    if len(query_words['words']) > 0:
+        print('entering words: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        x = session.query(Tokens).filter(
+            Tokens.token.in_(query_words['words'])).limit(1000)
+        for y in x:
+            for document_token in y.document_tokens:
+                if document_token.document.url in results.keys():
+                    results[document_token.document.url] += 1
+                else:
+                    results[document_token.document.url] = 1
+        print('exiting words: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
 
     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
     return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
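
A note on inputs: split_query() is defined elsewhere in the repository, so its exact return shape is not visible in this hunk. From the way search() consumes it above, it appears to yield a dict along the following lines; the sample query terms here are illustrative assumptions only.

    # Hypothetical shape of split_query() output, inferred purely from usage:
    # 'ands' entries are comma-joined token pairs (note a.split(',')[0] and
    # a.split(',')[1] above), 'ngrams' are multi-word phrases matched against
    # NGrams.gram, and 'words' are single tokens matched against Tokens.token.
    query_words = {
        'ands': ['foo,bar'],     # documents must contain both 'foo' and 'bar'
        'ngrams': ['big cats'],  # filtered by size=2 and gram='big cats'
        'words': ['fish'],       # plain keyword hits, capped at 1000 token rows
    }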
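The queries also lean on SQLAlchemy models defined elsewhere in the repo. The following is a minimal sketch of the schema the hunk appears to assume; table names, column types, and relationship spellings are guesses based only on the attributes the code touches.

    from sqlalchemy import Column, ForeignKey, Integer, String
    from sqlalchemy.orm import declarative_base, relationship

    Base = declarative_base()

    class Documents(Base):
        __tablename__ = 'documents'              # assumed table name
        id = Column(Integer, primary_key=True)
        url = Column(String)                     # ranked and returned by search()

    class Tokens(Base):
        __tablename__ = 'tokens'                 # assumed table name
        id = Column(Integer, primary_key=True)
        token = Column(String)                   # matched against query_words['words']
        document_tokens = relationship('Document_Tokens')  # used as y.document_tokens

    class Document_Tokens(Base):
        __tablename__ = 'document_tokens'        # assumed table name
        id = Column(Integer, primary_key=True)
        document_id = Column(Integer, ForeignKey('documents.id'))
        token_id = Column(Integer, ForeignKey('tokens.id'))
        document = relationship('Documents')     # used as document_token.document.url

    class NGrams(Base):
        __tablename__ = 'ngrams'                 # assumed table name
        id = Column(Integer, primary_key=True)
        gram = Column(String)                    # the n-gram text
        size = Column(Integer)                   # word count, used by filter_by(size=...)
        document_ngrams = relationship('Document_NGrams')  # used as y.document_ngrams

    class Document_NGrams(Base):
        __tablename__ = 'document_ngrams'        # assumed table name
        id = Column(Integer, primary_key=True)
        document_id = Column(Integer, ForeignKey('documents.id'))
        ngram_id = Column(Integer, ForeignKey('ngrams.id'))
        document = relationship('Documents')     # used as document_ngram.document.url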
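Hypothetical call site, assuming search() and the module-level start_time timer are wired up as in the rest of the file (neither the caller nor start_time's initialisation is part of this hunk):

    # Illustrative only: query strings arrive URL-encoded and unquote() decodes them.
    top_hits = search('foo%20bar')
    for url, score in top_hits:   # up to 10 (url, hit count) pairs, best first
        print(score, url)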