Indexer and query optimisations

rmgr 2024-04-06 19:34:59 +10:30
parent 9d57f66cd7
commit 9f0e7e6b29
2 changed files with 63 additions and 43 deletions

View file

@@ -17,10 +17,12 @@ Session = sessionmaker(bind=engine)
 def build_index_chunk(document_chunk):
     session = Session()
+    print(len(document_chunk))
     for document in document_chunk:
         print(document.url)
         content = re.sub(r'[^\w\s]', '', str(document.text_content))
         content_words = content.split()
+        build_ngrams(2, content_words, session, document.id)
         build_ngrams(3, content_words, session, document.id)
         build_ngrams(4, content_words, session, document.id)
         build_ngrams(5, content_words, session, document.id)
@@ -43,17 +45,21 @@ def build_index_chunk(document_chunk):
 def build_index():
     session = Session()
-    documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
-        None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000)
-    session.close()
-    documents = list(documents_query)  # Execute the query to get the result set
-    chunk_size = 100
-    document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)]
-    with Pool() as pool:
-        pool.map(build_index_chunk, document_chunks)
+    while True:
+        documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
+            None), Documents.last_index_date < Documents.last_crawl_date)).limit(100)
+        session.close()
+        # Execute the query to get the result set
+        documents = list(documents_query)
+        if len(documents) == 0:
+            return
+        chunk_size = 10
+        document_chunks = [documents[i:i+chunk_size]
+                           for i in range(0, len(documents), chunk_size)]
+        with Pool() as pool:
+            pool.map(build_index_chunk, document_chunks)


 def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str):
@@ -66,9 +72,10 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str
             if i + n >= len(corpus):
                 break
             gram += corpus[i+n] + ' '
-        gram = gram.rstrip().lower()
-        print(gram)
+        gram = gram.strip().lower()
+        if len(gram) > 4000:
+            i += 1
+            continue
         ngram = session.query(NGrams).filter_by(gram=gram).first()
         if ngram is None:
             ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
@@ -76,7 +83,7 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str
         document_ngram = Document_NGrams(
             document_id=document_id, ngram_id=ngram.id)
         session.add(document_ngram)
-        session.commit()
+        # session.commit()
         i += 1
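
For reference, the reworked build_index now drains the backlog in a loop: fetch a small batch of stale documents, split the batch into chunks, and fan the chunks out to a multiprocessing Pool until nothing is left. Below is a minimal, self-contained sketch of that pattern; fetch_unindexed and index_chunk are hypothetical stand-ins for the SQLAlchemy query and build_index_chunk, not names from this codebase.

from multiprocessing import Pool

# Toy work queue; the real code queries the Documents table instead.
PENDING = [f"doc-{n}" for n in range(250)]

def fetch_unindexed(limit):
    # Stand-in for session.query(Documents)...limit(limit)
    batch = PENDING[:limit]
    del PENDING[:limit]
    return batch

def index_chunk(chunk):
    # Stand-in for build_index_chunk: extract tokens/n-grams for each document.
    return len(chunk)

def index_all(batch_size=100, chunk_size=10):
    while True:
        documents = fetch_unindexed(batch_size)  # small batches keep memory bounded
        if not documents:
            return                               # backlog drained, stop looping
        chunks = [documents[i:i + chunk_size]
                  for i in range(0, len(documents), chunk_size)]
        with Pool() as pool:                     # one worker process per CPU core by default
            pool.map(index_chunk, chunks)

if __name__ == "__main__":
    index_all()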

View file

@@ -50,38 +50,51 @@ def search(query):
     session = Session()
     results = {}
     query_words = split_query(unquote(query))
-    for a in query_words['ands']:
-        query = session.query(Documents.url, func.count(1)). \
-            join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
-            join(Tokens, Document_Tokens.token_id == Tokens.id).\
-            filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
-            group_by(Documents.url).\
-            having(func.count(distinct(Document_Tokens.token_id)) == 2).\
-            order_by(func.count(1).desc())
-        for result in query.all():
-            if result[0] in results.keys():
-                results[result[0]] += result[1]
-            else:
-                results[result[0]] = result[1]
-    x = session.query(NGrams).filter(
-        NGrams.gram.in_(query_words['ngrams'])).all()
-    for y in x:
-        print(y.gram)
-        for document_ngram in y.document_ngrams:
-            if document_ngram.document.url in results.keys():
-                results[document_ngram.document.url] += 1
-            else:
-                results[document_ngram.document.url] = 1
-    x = session.query(Tokens).filter(
-        Tokens.token.in_(query_words['words'])).limit(1000)
-    for y in x:
-        for document_token in y.document_tokens:
-            if document_token.document.url in results.keys():
-                results[document_token.document.url] += 1
-            else:
-                results[document_token.document.url] = 1
+    if len(query_words['ands']) > 0:
+        for a in query_words['ands']:
+            query = session.query(Documents.url, func.count(1)). \
+                join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
+                join(Tokens, Document_Tokens.token_id == Tokens.id).\
+                filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
+                group_by(Documents.url).\
+                having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+                order_by(func.count(1).desc())
+            for result in query.all():
+                if result[0] in results.keys():
+                    results[result[0]] += result[1]
+                else:
+                    results[result[0]] = result[1]
+    if len(query_words['ngrams']) > 0:
+        print('entering ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        q = session.query(NGrams)
+        for ngram in query_words['ngrams']:
+            q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
+        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        x = q.all()
+        for y in x:
+            for document_ngram in y.document_ngrams:
+                if document_ngram.document.url in results.keys():
+                    results[document_ngram.document.url] += 1
+                else:
+                    results[document_ngram.document.url] = 1
+        print('exiting ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    if len(query_words['words']) > 0:
+        print('entering words: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        x = session.query(Tokens).filter(
+            Tokens.token.in_(query_words['words'])).limit(1000)
+        for y in x:
+            for document_token in y.document_tokens:
+                if document_token.document.url in results.keys():
+                    results[document_token.document.url] += 1
+                else:
+                    results[document_token.document.url] = 1
+        print('exiting words: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
     return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
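
For reference, the scoring scheme in search() is unchanged by this commit: every match from the and/n-gram/word passes increments a per-URL count, and the ten highest-scoring URLs are returned. A minimal sketch of that accumulation follows, using collections.Counter as an equivalent shorthand; rank and matches are illustrative names, not part of this codebase.

from collections import Counter

def rank(matches):
    # matches: iterable of (url, weight) pairs gathered by the query passes
    scores = Counter()
    for url, weight in matches:
        scores[url] += weight
    # Counter.most_common(10) mirrors sorted(results.items(), ...)[:10]
    return scores.most_common(10)

print(rank([("a.example", 3), ("b.example", 1), ("a.example", 2)]))
# [('a.example', 5), ('b.example', 1)]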