Tidy up crawling and implement boolean search
parent d4bb3fb8dc
commit 7ee9d978b2
4 changed files with 91 additions and 30 deletions
@@ -1,9 +1,10 @@
 #!/usr/bin/python3
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, func
 from config import DATABASE_URI
-from models import Base, Tokens
+from models import Base, Tokens, Documents, Document_Tokens
 from sqlalchemy.orm import sessionmaker
 
+from sqlalchemy.sql.expression import distinct
 import time
 from flask import Flask
 from urllib.parse import unquote
@@ -14,17 +15,71 @@ Session = sessionmaker(bind=engine)
 # Todo - Boolean search (AND/OR/NOT/"")
 
 
+def split_query(query):
+    result = {'ands': [], 'ors': [], 'words': []}
+    query_words = query.split()
+    i = 0
+    while i < len(query_words):
+        if i + 1 < len(query_words):
+            if query_words[i + 1].lower() == "and":
+                if i + 2 < len(query_words):
+                    result['ands'].append(
+                        query_words[i] + ',' + query_words[i+2])
+                    i = i + 3
+                    continue
+        result['words'].append(query_words[i])
+        i += 1
+    return result
+
+
 @app.route("/search/<query>")
 def search(query):
     start_time = time.time_ns()
     session = Session()
-    result = []
-    query_words = unquote(query).split()
-    for word in query_words:
-        word = word.lower()
-        matching_token = session.query(Tokens).filter_by(token=word).first()
-        if session is None:
-            continue
-        for document_token in matching_token.document_tokens:
-            result.append(document_token.document.url)
-    return result
+    results = {}
+    query_words = split_query(unquote(query))
+    for a in query_words['ands']:
+        query = session.query(Documents.url, func.count(1)).\
+            join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
+            join(Tokens, Document_Tokens.token_id == Tokens.id).\
+            filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
+            group_by(Documents.url).\
+            having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+            order_by(func.count(1).desc())
+        for result in query.all():
+            if result[0] in results.keys():
+                results[result[0]] += result[1]
+            else:
+                results[result[0]] = result[1]
+    x = session.query(Tokens).filter(
+        Tokens.token.in_(query_words['words'])).limit(1000)
+    for y in x:
+        for document_token in y.document_tokens:
+            if document_token.document.url in results.keys():
+                results[document_token.document.url] += 1
+            else:
+                results[document_token.document.url] = 1
+
+    print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
+
+# @app.route("/search/<query>")
+# def search(query):
+#     start_time = time.time_ns()
+#     session = Session()
+#     result = {}
+#     query_words = unquote(query).split()
+#     x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
+#     for word in query_words:
+#         word = word.lower()
+#         matching_token = session.query(Tokens).filter_by(token=word).first()
+#
+#         if matching_token is None:
+#             continue
+#         for document_token in matching_token.document_tokens:
+#             if document_token.document.url in result.keys():
+#                 result[document_token.document.url] += 1
+#             else:
+#                 result[document_token.document.url] = 1
+#     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+#     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
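As a quick illustration of what the new split_query helper produces (the sample query below is mine, not part of the commit): an "x AND y" triple is collapsed into a single comma-joined entry under 'ands', and everything else falls through to 'words'.

# Illustrative only - exercising split_query from the diff above.
print(split_query("cats AND dogs birds"))
# -> {'ands': ['cats,dogs'], 'ors': [], 'words': ['birds']}

Note that 'ors' is declared but never populated, so the OR (and NOT/"") handling from the Todo comment is still outstanding.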
|||
Loading…
Add table
Add a link
Reference in a new issue
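The AND branch relies on having(func.count(distinct(Document_Tokens.token_id)) == 2) to keep only documents that contain both words of the pair, then ranks by the total number of matching rows. A minimal pure-Python sketch of that scoring rule, assuming Document_Tokens holds one row per token occurrence in a document (the helper below is hypothetical, not part of the codebase):

# Hypothetical helper mirroring the SQL: score one document for an "x AND y" pair.
def and_score(doc_tokens, pair):
    # doc_tokens: tokens recorded for one document; pair: the two AND-ed words.
    hits = [t for t in doc_tokens if t in pair]
    # HAVING count(distinct token_id) == 2 -> both words must be present...
    if len(set(hits)) < 2:
        return 0
    # ...and count(1) -> the score is the number of matching occurrences.
    return len(hits)

print(and_score(["cats", "dogs", "cats", "fish"], ("cats", "dogs")))  # 3
print(and_score(["cats", "cats", "fish"], ("cats", "dogs")))          # 0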