Tidy up crawling and implement boolean search
commit 7ee9d978b2 (parent d4bb3fb8dc)
4 changed files with 91 additions and 30 deletions
src/crawl.py · 24 changes

@@ -47,14 +47,14 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     s = Session()
     existing_website = s.query(Documents).filter_by(url=url).first()
-    print (existing_website)
-    if existing_website == None:
+    if existing_website is None:
         website = Documents(
             url=url,
             text_content=soup.get_text(),
             html_content=soup.prettify(),
             first_crawl_date=datetime.datetime.now(),
-            last_crawl_date = datetime.datetime.now()
+            last_crawl_date=datetime.datetime.now(),
+            last_index_date=None
         )
         s.add(website)
     else:
@@ -64,44 +64,44 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     s.close()
     x = open(f'data/links.txt', 'a')
     x.close()
-    links = soup.find_all("a")
+    links = soup.find_all("a", href=True)
     for link in links:
         found = False
-        if not hasattr(link, "href"):
-            continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if not "http" in link:
+        if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
+            continue
+        if "http" not in link:
             link = urljoin(url, link)
         if (recursion > 0 and link not in traversed_links):
             try:
                 traversed_links.append(link)
                 link_html = get_html(link)
                 r = recursion -1
-                sleep(1)
+                sleep(0.5)
                 parse_html(link, link_html, r, traversed_links)
             except:
                 pass
         elif link not in traversed_links:
-            with open(f'data/links.txt', 'r+') as linksfile:
+            with open('data/links.txt', 'r+') as linksfile:
                 while line := linksfile.readline():
                     if line.strip() == link.strip():
                         found = True
                 if not found:
                     linksfile.write(f'{link}\n')
 
-if __name__ == "__main__":
 
+if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
     # check inputs
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-    max_recursion = 2
+    max_recursion = 4
     args = parser.parse_args()
     if args.url == "links":
-        with open(f'data/links.txt', 'r+') as linksfile:
+        with open('data/links.txt', 'r+') as linksfile:
            while line := linksfile.readline():
                if "http" in line:
                    try:
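For readability, here is a minimal standalone sketch of the link-filtering rules this hunk settles on: in-page anchors and localhost links are skipped, common image/PDF extensions are skipped, and relative links are resolved against the page URL. This is illustrative code, not part of the repo; normalise_link and SKIPPED_EXTENSIONS are made-up names.

# Sketch only: the filtering rules applied by parse_html, pulled into a helper.
from urllib.parse import urljoin

SKIPPED_EXTENSIONS = (".webp", ".jpeg", ".png", ".gif", ".pdf", ".jpg")  # same list as the diff

def normalise_link(base_url: str, href: str):
    """Return an absolute URL worth crawling, or None if the href should be skipped."""
    if (len(href) > 0 and href[0] == "#") or "localhost" in href:
        return None                      # in-page anchors and local links are ignored
    if any(ext in href for ext in SKIPPED_EXTENSIONS):
        return None                      # binary/image assets are not crawled
    if "http" not in href:
        href = urljoin(base_url, href)   # resolve relative links against the page URL
    return href

print(normalise_link("https://example.com/blog/", "post.html"))   # https://example.com/blog/post.html
print(normalise_link("https://example.com/", "#top"))             # None
print(normalise_link("https://example.com/", "/img/logo.png"))    # None

Judging by the argparse block, the script appears to be run as python src/crawl.py <url> (optionally with -f/--followlinks), or with the literal argument links to work through the URLs queued in data/links.txt.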
@@ -1,10 +1,11 @@
 #!/usr/bin/python3
 import argparse
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, or_
 from config import DATABASE_URI
 from models import Base, Documents, Document_Tokens, Tokens
 from sqlalchemy.orm import sessionmaker
 import uuid
+import datetime
 
 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
@@ -14,18 +15,22 @@ Session = sessionmaker(bind=engine)
 def build_index():
     session = Session()
     # Read list of 1000 documents from db
-    documents = session.query(Documents).limit(1000)
+    documents = session.query(Documents).filter(or_(Documents.last_index_date.is_(None), Documents.last_index_date<Documents.last_crawl_date)).limit(1000)
     for document in documents:
         print(document.url)
         content_words = document.text_content.split()
         for word in content_words:
             word = word.lower()
+            if len(word) > 50:
+                continue
             token = session.query(Tokens).filter_by(token=word).first()
             if token is None:
                 token = Tokens(token=word, id=uuid.uuid4())
                 session.add(token)
             document_token = Document_Tokens(document_id=document.id, token_id=token.id)
             session.add(document_token)
+        document.last_index_date = datetime.datetime.now()
+        session.add(document)
         session.commit()
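The key behavioural change in the indexer hunks is the new or_() filter: a document is only picked up if it has never been indexed, or has been crawled again since its last indexing pass, and it is stamped with last_index_date once processed. A plain-Python sketch of that predicate, with illustrative names only:

import datetime
from collections import namedtuple

def needs_indexing(doc) -> bool:
    # Same condition as the or_() filter: never indexed, or re-crawled since
    # the last indexing pass.
    return doc.last_index_date is None or doc.last_index_date < doc.last_crawl_date

# Tiny stand-in for a Documents row (illustrative only).
Doc = namedtuple("Doc", "last_index_date last_crawl_date")
now = datetime.datetime.now()
print(needs_indexing(Doc(None, now)))                               # True: never indexed
print(needs_indexing(Doc(now, now - datetime.timedelta(hours=1))))  # False: indexed after the last crawl

Combined with the per-document last_index_date stamp, the existing .limit(1000) now acts as a resumable batch: repeated runs of build_index() work through un-indexed or stale documents instead of re-reading the same first thousand rows.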
@@ -15,6 +15,7 @@ class Documents(Base):
     html_content = Column(String)
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
+    last_index_date = Column(DateTime)
     document_tokens = relationship("Document_Tokens", back_populates="document")
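One caveat worth noting: Base.metadata.create_all(engine), as used in the other scripts, only creates tables that are missing; it does not add the new last_index_date column to a table that already exists. For an existing database, a one-off statement along these lines (or a proper migration tool) would be needed; the table name "documents" and the TIMESTAMP type are assumptions about the schema and backend, not taken from the diff.

from sqlalchemy import create_engine, text
from config import DATABASE_URI

# One-off schema tweak for an existing database; assumes the Documents model
# maps to a table called "documents" and a PostgreSQL-style backend.
engine = create_engine(DATABASE_URI)
with engine.begin() as conn:
    conn.execute(text("ALTER TABLE documents ADD COLUMN last_index_date TIMESTAMP"))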
@@ -1,9 +1,10 @@
 #!/usr/bin/python3
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, func
 from config import DATABASE_URI
-from models import Base, Tokens
+from models import Base, Tokens, Documents, Document_Tokens
 from sqlalchemy.orm import sessionmaker
+from sqlalchemy.sql.expression import distinct
+import time
 from flask import Flask
 from urllib.parse import unquote
@@ -14,17 +15,71 @@ Session = sessionmaker(bind=engine)
 # Todo - Boolean search (AND/OR/NOT/"")
 
 
+def split_query(query):
+    result = {'ands': [], 'ors': [], 'words': []}
+    query_words = query.split()
+    i = 0
+    while i < len(query_words):
+        if i + 1 < len(query_words):
+            if query_words[i + 1].lower() == "and":
+                if i + 2 < len(query_words):
+                    result['ands'].append(
+                        query_words[i] + ',' + query_words[i+2])
+                    i = i + 3
+                    continue
+        result['words'].append(query_words[i])
+        i += 1
+    return result
 
 
 @app.route("/search/<query>")
 def search(query):
+    start_time = time.time_ns()
     session = Session()
-    result = []
-    query_words = unquote(query).split()
-    for word in query_words:
-        word = word.lower()
-        matching_token = session.query(Tokens).filter_by(token=word).first()
-        if session is None:
-            continue
-        for document_token in matching_token.document_tokens:
-            result.append(document_token.document.url)
-    return result
+    results = {}
+    query_words = split_query(unquote(query))
+    for a in query_words['ands']:
+        query = session.query(Documents.url, func.count(1)).\
+            join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
+            join(Tokens, Document_Tokens.token_id == Tokens.id).\
+            filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
+            group_by(Documents.url).\
+            having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+            order_by(func.count(1).desc())
+        for result in query.all():
+            if result[0] in results.keys():
+                results[result[0]] += result[1]
+            else:
+                results[result[0]] = result[1]
+    x = session.query(Tokens).filter(
+        Tokens.token.in_(query_words['words'])).limit(1000)
+    for y in x:
+        for document_token in y.document_tokens:
+            if document_token.document.url in results.keys():
+                results[document_token.document.url] += 1
+            else:
+                results[document_token.document.url] = 1
+
+    print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
+
+# @app.route("/search/<query>")
+# def search(query):
+#     start_time = time.time_ns()
+#     session = Session()
+#     result = {}
+#     query_words = unquote(query).split()
+#     x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
+#     for word in query_words:
+#         word = word.lower()
+#         matching_token = session.query(Tokens).filter_by(token=word).first()
+#
+#         if matching_token is None:
+#             continue
+#         for document_token in matching_token.document_tokens:
+#             if document_token.document.url in result.keys():
+#                 result[document_token.document.url] += 1
+#             else:
+#                 result[document_token.document.url] = 1
+#     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+#     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
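To make the new query parsing concrete, the snippet below runs split_query (copied from the diff above so the example is self-contained) on a URL-encoded query; the sample query string is only an illustration.

from urllib.parse import unquote

# Copied from the diff above.
def split_query(query):
    result = {'ands': [], 'ors': [], 'words': []}
    query_words = query.split()
    i = 0
    while i < len(query_words):
        if i + 1 < len(query_words):
            if query_words[i + 1].lower() == "and":
                if i + 2 < len(query_words):
                    result['ands'].append(
                        query_words[i] + ',' + query_words[i+2])
                    i = i + 3
                    continue
        result['words'].append(query_words[i])
        i += 1
    return result

# e.g. GET /search/python%20AND%20flask%20tutorial
print(split_query(unquote("python%20AND%20flask%20tutorial")))
# {'ands': ['python,flask'], 'ors': [], 'words': ['tutorial']}

Each resulting "word1,word2" pair is answered by the joined query in the new search(): grouping by Documents.url and keeping only groups where the count of distinct token ids equals 2 means a page is returned only when it contains both terms, and ordering by func.count(1) appears to rank pages by how often the pair's tokens occur. The 'ors' key is never populated, so OR/NOT/quoted phrases remain on the todo line.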