Indexer and query optimisations

parent 9d57f66cd7
commit 9f0e7e6b29

2 changed files with 63 additions and 43 deletions

src/index.py (33 changes)

src/index.py
@@ -17,10 +17,12 @@ Session = sessionmaker(bind=engine)

 def build_index_chunk(document_chunk):
     session = Session()
     print(len(document_chunk))
     for document in document_chunk:
         print(document.url)
         content = re.sub(r'[^\w\s]', '', str(document.text_content))
         content_words = content.split()
         build_ngrams(2, content_words, session, document.id)
         build_ngrams(3, content_words, session, document.id)
+        build_ngrams(4, content_words, session, document.id)
+        build_ngrams(5, content_words, session, document.id)
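The first hunk extends indexing from 2- and 3-grams to 4- and 5-grams over the punctuation-stripped word list. For reference, a minimal sketch of what n-grams of those sizes look like (make_ngrams below is an illustrative helper, not the repository's build_ngrams, which also persists rows to the database):

import re

def make_ngrams(size, words):
    # Slide a window of `size` words over the token list and join each window.
    return [' '.join(words[i:i + size]).lower()
            for i in range(len(words) - size + 1)]

words = re.sub(r'[^\w\s]', '', "The quick, brown fox jumps!").split()
print(make_ngrams(2, words))  # ['the quick', 'quick brown', 'brown fox', 'fox jumps']
print(make_ngrams(5, words))  # ['the quick brown fox jumps']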
@@ -43,17 +45,21 @@ def build_index_chunk(document_chunk):

 def build_index():
     session = Session()
-    documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
-        None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000)
-    session.close()
-
-    documents = list(documents_query)  # Execute the query to get the result set
-    chunk_size = 100
-    document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)]
-
-    with Pool() as pool:
-        pool.map(build_index_chunk, document_chunks)
+    while True:
+        documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
+            None), Documents.last_index_date < Documents.last_crawl_date)).limit(100)
+        session.close()
+
+        # Execute the query to get the result set
+        documents = list(documents_query)
+        if len(documents) == 0:
+            return
+        chunk_size = 10
+        document_chunks = [documents[i:i+chunk_size]
+                           for i in range(0, len(documents), chunk_size)]
+
+        with Pool() as pool:
+            pool.map(build_index_chunk, document_chunks)


 def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str):
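The rewritten build_index replaces a single 1000-row query with a loop that repeatedly pulls a 100-row batch of stale documents, splits it into chunks of 10, indexes the chunks in parallel, and stops once the query comes back empty. A standalone sketch of that pattern, assuming a picklable worker function (drain_in_batches and fetch_batch are illustrative names, not the project's API):

from multiprocessing import Pool

def drain_in_batches(fetch_batch, process_chunk, batch_size=100, chunk_size=10):
    # Keep pulling fixed-size batches until the source reports nothing left,
    # farming each batch out to worker processes in small chunks.
    while True:
        batch = fetch_batch(batch_size)
        if len(batch) == 0:
            return
        chunks = [batch[i:i + chunk_size]
                  for i in range(0, len(batch), chunk_size)]
        with Pool() as pool:
            pool.map(process_chunk, chunks)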
@@ -66,9 +72,10 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str
             if i + n >= len(corpus):
                 break
             gram += corpus[i+n] + ' '
-        gram = gram.rstrip().lower()
-        print(gram)
-
+        gram = gram.strip().lower()
+        if len(gram) > 4000:
+            i += 1
+            continue
         ngram = session.query(NGrams).filter_by(gram=gram).first()
         if ngram is None:
             ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
@@ -76,7 +83,7 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str
         document_ngram = Document_NGrams(
             document_id=document_id, ngram_id=ngram.id)
         session.add(document_ngram)
-        session.commit()
+        # session.commit()
         i += 1

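The third and fourth hunks skip grams longer than 4000 characters and stop committing after every single Document_NGrams row; rows are only staged with session.add, with the commit presumably issued once later, outside the lines shown here. A rough sketch of that get-or-create-then-commit-once pattern with SQLAlchemy (link_grams is an illustrative helper built around the model names in the diff, not code from the commit):

import uuid

def link_grams(session, document_id, grams):
    # Get-or-create each gram, stage the link rows, and commit once at the end
    # instead of once per gram.
    for gram in grams:
        if len(gram) > 4000:  # skip oversized grams, mirroring the diff's guard
            continue
        ngram = session.query(NGrams).filter_by(gram=gram).first()
        if ngram is None:
            ngram = NGrams(id=uuid.uuid4(), size=len(gram.split(' ')), gram=gram)
            session.add(ngram)
        session.add(Document_NGrams(document_id=document_id, ngram_id=ngram.id))
    session.commit()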
@@ -50,38 +50,51 @@ def search(query):
     session = Session()
     results = {}
     query_words = split_query(unquote(query))
-    for a in query_words['ands']:
-        query = session.query(Documents.url, func.count(1)). \
-            join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
-            join(Tokens, Document_Tokens.token_id == Tokens.id).\
-            filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
-            group_by(Documents.url).\
-            having(func.count(distinct(Document_Tokens.token_id)) == 2).\
-            order_by(func.count(1).desc())
-        for result in query.all():
-            if result[0] in results.keys():
-                results[result[0]] += result[1]
-            else:
-                results[result[0]] = result[1]
-    x = session.query(NGrams).filter(
-        NGrams.gram.in_(query_words['ngrams'])).all()
-    for y in x:
-        print(y.gram)
-        for document_ngram in y.document_ngrams:
-            if document_ngram.document.url in results.keys():
-                results[document_ngram.document.url] += 1
-            else:
-                results[document_ngram.document.url] = 1
-    x = session.query(Tokens).filter(
-        Tokens.token.in_(query_words['words'])).limit(1000)
-    for y in x:
-        for document_token in y.document_tokens:
-            if document_token.document.url in results.keys():
-                results[document_token.document.url] += 1
-            else:
-                results[document_token.document.url] = 1
+    if len(query_words['ands']) > 0:
+        for a in query_words['ands']:
+            query = session.query(Documents.url, func.count(1)). \
+                join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
+                join(Tokens, Document_Tokens.token_id == Tokens.id).\
+                filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
+                group_by(Documents.url).\
+                having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+                order_by(func.count(1).desc())
+            for result in query.all():
+                if result[0] in results.keys():
+                    results[result[0]] += result[1]
+                else:
+                    results[result[0]] = result[1]
+    if len(query_words['ngrams']) > 0:
+        print('entering ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+
+        q = session.query(NGrams)
+        for ngram in query_words['ngrams']:
+            q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
+        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
+
+        x = q.all()
+        for y in x:
+            for document_ngram in y.document_ngrams:
+                if document_ngram.document.url in results.keys():
+                    results[document_ngram.document.url] += 1
+                else:
+                    results[document_ngram.document.url] = 1
+        print('exiting ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    if len(query_words['words']) > 0:
+        print('entering words: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        x = session.query(Tokens).filter(
+            Tokens.token.in_(query_words['words'])).limit(1000)
+        for y in x:
+            for document_token in y.document_tokens:
+                if document_token.document.url in results.keys():
+                    results[document_token.document.url] += 1
+                else:
+                    results[document_token.document.url] = 1
+        print('exiting words: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")

     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
     return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
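The search hunk follows one pattern throughout: each scoring stage ('ands', 'ngrams', 'words') is skipped entirely when its slice of the parsed query is empty, so empty query parts no longer cost a database round trip, and coarse millisecond timings are printed on entry and exit. A compact, hypothetical illustration of that structure (timed_stage and score_fn are not names from the repository):

import time

def timed_stage(name, items, start_time, score_fn, results):
    # Skip the stage, and its database round trip, when there is nothing to look up.
    if len(items) == 0:
        return
    print('entering ' + name + ': ' +
          str((time.time_ns() - start_time) // 1_000_000) + "ms")
    for url, score in score_fn(items):
        results[url] = results.get(url, 0) + score
    print('exiting ' + name + ': ' +
          str((time.time_ns() - start_time) // 1_000_000) + "ms")

Using results.get(url, 0) is a more compact form of the if/else bookkeeping the diff uses to accumulate scores per URL.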