Indexer and query optimisations
parent 9d57f66cd7
commit 9f0e7e6b29
2 changed files with 63 additions and 43 deletions
33  src/index.py
@@ -17,10 +17,12 @@ Session = sessionmaker(bind=engine)
 def build_index_chunk(document_chunk):
     session = Session()
+    print(len(document_chunk))
     for document in document_chunk:
         print(document.url)
         content = re.sub(r'[^\w\s]', '', str(document.text_content))
         content_words = content.split()
+        build_ngrams(2, content_words, session, document.id)
         build_ngrams(3, content_words, session, document.id)
         build_ngrams(4, content_words, session, document.id)
         build_ngrams(5, content_words, session, document.id)
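For context, the n-gram sizes passed to build_ngrams are just overlapping word windows over the cleaned token list. The standalone sketch below is not the repository's build_ngrams (it skips the database writes and the trailing partial grams); the word_ngrams helper is a hypothetical illustration of what sizes 2 through 5 produce.

import re

def word_ngrams(words, size):
    # Overlapping windows of `size` words, joined with spaces.
    return [' '.join(words[i:i + size]) for i in range(len(words) - size + 1)]

# Mirror the indexer's normalisation: strip punctuation, then split on whitespace.
content = re.sub(r'[^\w\s]', '', 'An example document, for the index.')
content_words = content.split()

for size in (2, 3, 4, 5):
    print(size, word_ngrams(content_words, size))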
@@ -43,17 +45,21 @@ def build_index_chunk(document_chunk):
 def build_index():
     session = Session()
-    documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
-        None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000)
-    session.close()
-    documents = list(documents_query)  # Execute the query to get the result set
-
-    chunk_size = 100
-    document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)]
+    while True:
+        documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
+            None), Documents.last_index_date < Documents.last_crawl_date)).limit(100)
+        session.close()
+
+        # Execute the query to get the result set
+        documents = list(documents_query)
+        if len(documents) == 0:
+            return
+        chunk_size = 10
+        document_chunks = [documents[i:i+chunk_size]
+                           for i in range(0, len(documents), chunk_size)]
 
-    with Pool() as pool:
-        pool.map(build_index_chunk, document_chunks)
+        with Pool() as pool:
+            pool.map(build_index_chunk, document_chunks)
 
 
 def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str):
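The reworked build_index turns one large fetch into a fetch-until-empty loop: pull at most 100 stale documents, split them into chunks of 10, fan the chunks out to worker processes, and return once a fetch comes back empty. The sketch below shows the same pattern with the SQLAlchemy query stubbed out; fetch_batch and process_chunk are hypothetical stand-ins, not functions from this repository.

from multiprocessing import Pool

def fetch_batch(limit=100):
    # Hypothetical stand-in for the SQLAlchemy query that returns up to `limit` stale documents.
    return []  # an empty batch ends the loop

def process_chunk(chunk):
    print(f"indexing {len(chunk)} documents")

def run():
    chunk_size = 10
    while True:
        batch = fetch_batch(limit=100)
        if len(batch) == 0:
            return  # nothing left to index
        chunks = [batch[i:i + chunk_size] for i in range(0, len(batch), chunk_size)]
        with Pool() as pool:
            pool.map(process_chunk, chunks)

if __name__ == '__main__':
    run()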
@@ -66,9 +72,10 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str
             if i + n >= len(corpus):
                 break
             gram += corpus[i+n] + ' '
-        gram = gram.rstrip().lower()
-        print(gram)
+        gram = gram.strip().lower()
+        if len(gram) > 4000:
+            i += 1
+            continue
         ngram = session.query(NGrams).filter_by(gram=gram).first()
         if ngram is None:
             ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
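The new length guard skips any gram longer than 4000 characters instead of attempting the lookup and insert; the figure presumably matches a limit on the gram column, which is an assumption here. A minimal sketch of the loop shape, with the database calls replaced by a print:

def build_ngrams_sketch(size, corpus):
    # Sliding-window n-gram loop with the over-length guard; DB lookup/insert omitted.
    i = 0
    while i < len(corpus):
        gram = ''
        for n in range(size):
            if i + n >= len(corpus):
                break
            gram += corpus[i + n] + ' '
        gram = gram.strip().lower()
        if len(gram) > 4000:  # skip grams that would not fit the gram column (assumed limit)
            i += 1
            continue
        print(gram)           # stand-in for the NGrams lookup and insert
        i += 1

build_ngrams_sketch(2, ['a', 'small', 'example', 'corpus'])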
@@ -76,7 +83,7 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str
         document_ngram = Document_NGrams(
             document_id=document_id, ngram_id=ngram.id)
         session.add(document_ngram)
-        session.commit()
+        # session.commit()
         i += 1
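With the per-ngram session.commit() commented out, each Document_NGrams row is only staged via session.add(), and the actual commit is deferred to some later point not visible in this hunk. The throwaway sketch below (assuming SQLAlchemy 1.4+, with a hypothetical Pair model and an in-memory SQLite engine) shows the difference between committing per row and committing once per batch.

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Pair(Base):  # hypothetical stand-in for Document_NGrams
    __tablename__ = 'pairs'
    id = Column(Integer, primary_key=True)
    gram = Column(String)

engine = create_engine('sqlite://')  # throwaway in-memory database
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

session = Session()
for gram in ['one two', 'two three', 'three four']:
    session.add(Pair(gram=gram))
    # session.commit()  # per-row commit: one transaction per n-gram (the old behaviour)
session.commit()         # single commit for the whole batch
session.close()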
@@ -50,38 +50,51 @@ def search(query):
     session = Session()
     results = {}
     query_words = split_query(unquote(query))
-    for a in query_words['ands']:
-        query = session.query(Documents.url, func.count(1)). \
-            join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
-            join(Tokens, Document_Tokens.token_id == Tokens.id).\
-            filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
-            group_by(Documents.url).\
-            having(func.count(distinct(Document_Tokens.token_id)) == 2).\
-            order_by(func.count(1).desc())
-        for result in query.all():
-            if result[0] in results.keys():
-                results[result[0]] += result[1]
-            else:
-                results[result[0]] = result[1]
-    x = session.query(NGrams).filter(
-        NGrams.gram.in_(query_words['ngrams'])).all()
+    if len(query_words['ands']) > 0:
+        for a in query_words['ands']:
+            query = session.query(Documents.url, func.count(1)). \
+                join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
+                join(Tokens, Document_Tokens.token_id == Tokens.id).\
+                filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
+                group_by(Documents.url).\
+                having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+                order_by(func.count(1).desc())
+            for result in query.all():
+                if result[0] in results.keys():
+                    results[result[0]] += result[1]
+                else:
+                    results[result[0]] = result[1]
+    if len(query_words['ngrams']) > 0:
+        print('entering ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
 
-    for y in x:
-        print(y.gram)
-        for document_ngram in y.document_ngrams:
-            if document_ngram.document.url in results.keys():
-                results[document_ngram.document.url] += 1
-            else:
-                results[document_ngram.document.url] = 1
+        q = session.query(NGrams)
+        for ngram in query_words['ngrams']:
+            q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
+        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
 
-    x = session.query(Tokens).filter(
-        Tokens.token.in_(query_words['words'])).limit(1000)
-    for y in x:
-        for document_token in y.document_tokens:
-            if document_token.document.url in results.keys():
-                results[document_token.document.url] += 1
-            else:
-                results[document_token.document.url] = 1
+        x = q.all()
+        for y in x:
+            for document_ngram in y.document_ngrams:
+                if document_ngram.document.url in results.keys():
+                    results[document_ngram.document.url] += 1
+                else:
+                    results[document_ngram.document.url] = 1
+        print('exiting ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    if len(query_words['words']) > 0:
+        print('entering words: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        x = session.query(Tokens).filter(
+            Tokens.token.in_(query_words['words'])).limit(1000)
+        for y in x:
+            for document_token in y.document_tokens:
+                if document_token.document.url in results.keys():
+                    results[document_token.document.url] += 1
+                else:
+                    results[document_token.document.url] = 1
+        print('exiting words: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
 
     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
     return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
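The timing prints added to search() all follow one pattern: take time.time_ns() once near the top of the function (start_time is set earlier, outside this hunk) and report elapsed whole milliseconds at each phase boundary. A standalone sketch of that instrumentation:

import time

start_time = time.time_ns()

def elapsed_ms():
    # Elapsed whole milliseconds since start_time, matching the prints in search().
    return (time.time_ns() - start_time) // 1_000_000

print('entering ngrams: ' + str(elapsed_ms()) + 'ms')
time.sleep(0.05)  # stand-in for the n-gram queries
print('exiting ngrams: ' + str(elapsed_ms()) + 'ms')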
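Whichever branches run, every hit is folded into the same results dict keyed by URL, and the function returns the ten highest-scoring URLs. A small illustration of that accumulation and ranking, using made-up sample data:

# Hits coming back from the different branches (ands / ngrams / words), keyed by URL.
results = {}
hits = ['https://a.example', 'https://b.example', 'https://a.example']

for url in hits:
    if url in results:
        results[url] += 1
    else:
        results[url] = 1

# Ten best URLs by score, highest first, as search() returns them.
top = sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
print(top)  # [('https://a.example', 2), ('https://b.example', 1)]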