#!/usr/bin/python3
"""Small Flask search service backed by SQLAlchemy token / n-gram tables.

Exposes one endpoint, ``/search/<query>``, which parses the query string with
``split_query`` and ranks document URLs by the number of matching rows.
"""
import time
from urllib.parse import unquote

from flask import Flask
from sqlalchemy import create_engine, func, and_, or_
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql.expression import distinct

from config import DATABASE_URI
from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams

app = Flask(__name__)
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

# TODO: Boolean search (AND/OR/NOT/"") - only AND and quoted phrases are
# handled so far.


def split_query(query):
    """Split a raw query string into the buckets consumed by ``search``.

    The query is split on whitespace and scanned left to right:

    * ``left AND right`` (``and`` matched case-insensitively) is stored in
      ``'ands'`` as the single string ``"left,right"``.
    * A word starting with ``"`` opens a phrase that runs up to the next word
      ending with ``"``; the phrase, without its quotes, is stored in
      ``'ngrams'``. If the closing quote is missing, the phrase extends to
      the end of the query.
    * Every other word is stored in ``'ngrams'`` as well.

    Args:
        query: The already URL-decoded query text.

    Returns:
        A dict with the keys ``'ands'``, ``'ors'``, ``'words'`` and
        ``'ngrams'``, each mapping to a list of strings. ``'ors'`` and
        ``'words'`` are never filled by this function.
    """
    result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []}
    query_words = query.split()
    total = len(query_words)
    i = 0
    while i < total:
        word = query_words[i]

        # "<left> and <right>": needs both neighbours to exist.
        if i + 2 < total and query_words[i + 1].lower() == "and":
            result['ands'].append(word + ',' + query_words[i + 2])
            i += 3
            continue

        if word.startswith('"'):
            # Find the word that closes the phrase. A lone '"' that opens the
            # phrase must not be mistaken for its own closing quote.
            end = i
            closed = False
            while end < total:
                candidate = query_words[end]
                if candidate.endswith('"') and (end > i or len(candidate) > 1):
                    closed = True
                    break
                end += 1
            last = end if closed else total - 1

            phrase = ' '.join(query_words[i:last + 1])[1:]  # drop opening quote
            if closed:
                # Only strip a trailing character when it really is the
                # closing quote; otherwise the last letter would be lost.
                phrase = phrase[:-1]
            phrase = phrase.strip()
            if phrase:  # skip empty phrases such as '""'
                result['ngrams'].append(phrase)
            i = last + 1
            continue

        result['ngrams'].append(word)
        i += 1
    return result


def _print_elapsed(label, start_time):
    """Print ``label`` followed by the milliseconds elapsed since ``start_time``."""
    print(label + str((time.time_ns() - start_time) // 1_000_000) + "ms")


def _add_counts(totals, rows):
    """Add each ``(url, count)`` row into the ``totals`` dict, in place."""
    for url, count in rows:
        totals[url] = totals.get(url, 0) + count


@app.route("/search/<query>")
def search(query):
    """Return the ten best-scoring document URLs for ``query``.

    The score of a URL is the sum of the row counts returned for it by the
    AND, n-gram and word queries below.

    Args:
        query: URL path segment holding the search text; it is passed
            through ``unquote`` before parsing.

    Returns:
        A list of at most ten ``(url, score)`` tuples, highest score first.
        NOTE(review): Flask converts a list return value to a JSON response
        only from version 2.2 on - confirm the deployed Flask version.
    """
    start_time = time.time_ns()
    session = Session()
    results = {}
    try:
        query_words = split_query(unquote(query))
        print(query_words)

        for pair in query_words['ands']:
            # 'ands' entries are "left,right"; de-duplicate so that
            # "x and x" still matches documents containing x.
            terms = list(dict.fromkeys(pair.split(',')[:2]))
            and_query = (
                session.query(Documents.url, func.count(1))
                .join(Document_Tokens, Documents.id == Document_Tokens.document_id)
                .join(Tokens, Document_Tokens.token_id == Tokens.id)
                .filter(Tokens.token.in_(terms))
                .group_by(Documents.url)
                # Keep only documents that contain every requested token.
                .having(func.count(distinct(Document_Tokens.token_id)) == len(terms))
                .order_by(func.count(1).desc())
            )
            _add_counts(results, and_query.all())

        if query_words['ngrams']:
            _print_elapsed('entering ngrams: ', start_time)
            # One (size AND gram) condition per phrase, OR-ed together.
            gram_filters = [
                and_(NGrams.size == len(gram.split(' ')), NGrams.gram == gram)
                for gram in query_words['ngrams']
            ]
            q = (
                session.query(Documents.url, func.count(1))
                .join(Document_NGrams, Documents.id == Document_NGrams.document_id)
                .join(NGrams, Document_NGrams.ngram_id == NGrams.id)
                .group_by(Documents.url)
                .filter(or_(*gram_filters))
            )
            _print_elapsed('query built: ', start_time)
            print(q)
            rows = q.all()
            _print_elapsed('query executed: ', start_time)
            print(rows)
            _add_counts(results, rows)
            _print_elapsed('exiting ngrams: ', start_time)

        if query_words['words']:
            _print_elapsed('entering words: ', start_time)
            q = (
                session.query(Documents.url, func.count(1))
                .join(Document_Tokens, Documents.id == Document_Tokens.document_id)
                .join(Tokens, Document_Tokens.token_id == Tokens.id)
                .group_by(Documents.url)
                .filter(Tokens.token.in_(query_words['words']))
            )
            _print_elapsed('query built: ', start_time)
            print(q)
            rows = q.all()
            _print_elapsed('query executed: ', start_time)
            _add_counts(results, rows)
            _print_elapsed('exiting words: ', start_time)

        _print_elapsed('', start_time)
    finally:
        # Always release the session, even when a query raises.
        session.close()

    return sorted(results.items(), key=lambda item: item[1], reverse=True)[:10]


# Legacy per-token implementation, superseded by search() above and kept
# for reference only.
#
# @app.route("/search/")
# def search(query):
#     start_time = time.time_ns()
#     session = Session()
#     result = {}
#     query_words = unquote(query).split()
#     x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
#     for word in query_words:
#         word = word.lower()
#         matching_token = session.query(Tokens).filter_by(token=word).first()
#
#         if matching_token is None:
#             continue
#         for document_token in matching_token.document_tokens:
#             if document_token.document.url in result.keys():
#                 result[document_token.document.url] += 1
#             else:
#                 result[document_token.document.url] = 1
#     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
#     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]