From 9f0e7e6b299c8439d43494f902410be83576f130 Mon Sep 17 00:00:00 2001
From: rmgr
Date: Sat, 6 Apr 2024 19:34:59 +1030
Subject: [PATCH] Indexer and query optimisations

---
 src/index.py  | 33 ++++++++++++++---------
 src/search.py | 73 ++++++++++++++++++++++++++++++---------------------
 2 files changed, 63 insertions(+), 43 deletions(-)

diff --git a/src/index.py b/src/index.py
index e73c93d..4629c75 100644
--- a/src/index.py
+++ b/src/index.py
@@ -17,10 +17,12 @@ Session = sessionmaker(bind=engine)
 
 def build_index_chunk(document_chunk):
     session = Session()
+    print(len(document_chunk))
     for document in document_chunk:
         print(document.url)
         content = re.sub(r'[^\w\s]', '', str(document.text_content))
         content_words = content.split()
+        build_ngrams(2, content_words, session, document.id)
         build_ngrams(3, content_words, session, document.id)
         build_ngrams(4, content_words, session, document.id)
         build_ngrams(5, content_words, session, document.id)
@@ -43,17 +45,21 @@ def build_index_chunk(document_chunk):
 
 def build_index():
     session = Session()
-    documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
-        None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000)
-    session.close()
-
-    documents = list(documents_query) # Execute the query to get the result set
+    while True:
+        documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
+            None), Documents.last_index_date < Documents.last_crawl_date)).limit(100)
+        session.close()
 
-    chunk_size = 100
-    document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)]
+        # Execute the query to get the result set
+        documents = list(documents_query)
+        if len(documents) == 0:
+            return
+        chunk_size = 10
+        document_chunks = [documents[i:i+chunk_size]
+                           for i in range(0, len(documents), chunk_size)]
 
-    with Pool() as pool:
-        pool.map(build_index_chunk, document_chunks)
+        with Pool() as pool:
+            pool.map(build_index_chunk, document_chunks)
 
 
 def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str):
@@ -66,9 +72,10 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str
             if i + n >= len(corpus):
                 break
             gram += corpus[i+n] + ' '
-        gram = gram.rstrip().lower()
-        print(gram)
-
+        gram = gram.strip().lower()
+        if len(gram) > 4000:
+            i += 1
+            continue
         ngram = session.query(NGrams).filter_by(gram=gram).first()
         if ngram is None:
             ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
@@ -76,7 +83,7 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str
             document_ngram = Document_NGrams(
                 document_id=document_id, ngram_id=ngram.id)
             session.add(document_ngram)
-            session.commit()
+            # session.commit()
         i += 1
 
 
diff --git a/src/search.py b/src/search.py
index f77927b..0dedf77 100755
--- a/src/search.py
+++ b/src/search.py
@@ -50,38 +50,51 @@ def search(query):
     session = Session()
     results = {}
     query_words = split_query(unquote(query))
-    for a in query_words['ands']:
-        query = session.query(Documents.url, func.count(1)). \
-            join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
-            join(Tokens, Document_Tokens.token_id == Tokens.id).\
-            filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
-            group_by(Documents.url).\
-            having(func.count(distinct(Document_Tokens.token_id)) == 2).\
-            order_by(func.count(1).desc())
-        for result in query.all():
-            if result[0] in results.keys():
-                results[result[0]] += result[1]
-            else:
-                results[result[0]] = result[1]
-    x = session.query(NGrams).filter(
-        NGrams.gram.in_(query_words['ngrams'])).all()
+    if len(query_words['ands']) > 0:
+        for a in query_words['ands']:
+            query = session.query(Documents.url, func.count(1)). \
+                join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
+                join(Tokens, Document_Tokens.token_id == Tokens.id).\
+                filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
+                group_by(Documents.url).\
+                having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+                order_by(func.count(1).desc())
+            for result in query.all():
+                if result[0] in results.keys():
+                    results[result[0]] += result[1]
+                else:
+                    results[result[0]] = result[1]
+    if len(query_words['ngrams']) > 0:
+        print('entering ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
 
-    for y in x:
-        print(y.gram)
-        for document_ngram in y.document_ngrams:
-            if document_ngram.document.url in results.keys():
-                results[document_ngram.document.url] += 1
-            else:
-                results[document_ngram.document.url] = 1
+        q = session.query(NGrams)
+        for ngram in query_words['ngrams']:
+            q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
+        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
 
-    x = session.query(Tokens).filter(
-        Tokens.token.in_(query_words['words'])).limit(1000)
-    for y in x:
-        for document_token in y.document_tokens:
-            if document_token.document.url in results.keys():
-                results[document_token.document.url] += 1
-            else:
-                results[document_token.document.url] = 1
+        x = q.all()
+        for y in x:
+            for document_ngram in y.document_ngrams:
+                if document_ngram.document.url in results.keys():
+                    results[document_ngram.document.url] += 1
+                else:
+                    results[document_ngram.document.url] = 1
+        print('exiting ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    if len(query_words['words']) > 0:
+        print('entering words: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        x = session.query(Tokens).filter(
+            Tokens.token.in_(query_words['words'])).limit(1000)
+        for y in x:
+            for document_token in y.document_tokens:
+                if document_token.document.url in results.keys():
+                    results[document_token.document.url] += 1
+                else:
+                    results[document_token.document.url] = 1
+        print('exiting words: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
 
     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
     return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
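
For reference, the windowing that build_ngrams applies is easy to see in isolation. Below is a minimal, database-free sketch, assuming a make_ngrams helper and a sample sentence that are hypothetical stand-ins (neither exists in this repo); the deduplication against the NGrams table and the Document_NGrams linking are omitted. The boundary handling and the new 4000-character guard mirror the patched loop above.

# Standalone sketch of the sliding window in build_ngrams (no SQLAlchemy).
# make_ngrams is a hypothetical helper, not part of the patch.
def make_ngrams(size, corpus):
    grams = []
    i = 0
    while i < len(corpus):
        gram = ''
        for n in range(size):
            # Same boundary handling as the patched loop: a window that
            # runs off the end yields a shorter trailing gram.
            if i + n >= len(corpus):
                break
            gram += corpus[i + n] + ' '
        gram = gram.strip().lower()
        # Mirrors the patch's new `if len(gram) > 4000` guard.
        if gram and len(gram) <= 4000:
            grams.append(gram)
        i += 1
    return grams

if __name__ == '__main__':
    words = 'The quick brown fox jumps'.split()
    for size in (2, 3, 4, 5):
        print(size, make_ngrams(size, words))

Since sizes 2 through 5 are generated per document, a document of N words yields roughly 4N grams; storing the size column is what lets the patched search path narrow candidates by word count (filter_by(size=len(ngram.split(' ')))) before comparing gram strings.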