From 7ee9d978b26faa7ef5a73c83f060fef7549ab349 Mon Sep 17 00:00:00 2001
From: rmgr
Date: Thu, 4 Apr 2024 20:46:34 +1030
Subject: [PATCH] Tidy up crawling and implement boolean search

---
 src/crawl.py  | 30 +++++++++----------
 src/index.py  |  9 ++++--
 src/models.py |  1 +
 src/search.py | 81 ++++++++++++++++++++++++++++++++++++++++++---------
 4 files changed, 91 insertions(+), 30 deletions(-)

diff --git a/src/crawl.py b/src/crawl.py
index 3856300..e7e35be 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -40,21 +40,21 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     if not rp.can_fetch("*", url):
         print("Robots prevents crawling url: " + url)
         return
-
-    soup = BeautifulSoup(html,'html.parser')
+
+    soup = BeautifulSoup(html, 'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
     s = Session()
     existing_website = s.query(Documents).filter_by(url=url).first()
-    print (existing_website)
-    if existing_website == None:
+    if existing_website is None:
         website = Documents(
                 url=url,
                 text_content=soup.get_text(),
                 html_content=soup.prettify(),
                 first_crawl_date=datetime.datetime.now(),
-                last_crawl_date = datetime.datetime.now()
+                last_crawl_date=datetime.datetime.now(),
+                last_index_date=None
                 )
         s.add(website)
     else:
@@ -64,44 +64,44 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     s.close()
     x = open(f'data/links.txt', 'a')
     x.close()
-    links = soup.find_all("a")
+    links = soup.find_all("a", href=True)
     for link in links:
         found = False
-        if not hasattr(link, "href"):
-            continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if not "http" in link:
+        if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
+            continue
+        if "http" not in link:
             link = urljoin(url, link)
         if (recursion > 0 and link not in traversed_links):
             try:
                 traversed_links.append(link)
                 link_html = get_html(link)
-                r = recursion -1
-                sleep(1)
+                r = recursion - 1
+                sleep(0.5)
                 parse_html(link, link_html, r, traversed_links)
             except:
                 pass
         elif link not in traversed_links:
-            with open(f'data/links.txt', 'r+') as linksfile:
+            with open('data/links.txt', 'r+') as linksfile:
                 while line := linksfile.readline():
                     if line.strip() == link.strip():
                         found = True
                 if not found:
                     linksfile.write(f'{link}\n')
 
-if __name__ == "__main__":
+if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
     # check inputs
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-    max_recursion = 2
+    max_recursion = 4
     args = parser.parse_args()
     if args.url == "links":
-        with open(f'data/links.txt', 'r+') as linksfile:
+        with open('data/links.txt', 'r+') as linksfile:
             while line := linksfile.readline():
                 if "http" in line:
                     try:
diff --git a/src/index.py b/src/index.py
index 227815e..d7259ce 100644
--- a/src/index.py
+++ b/src/index.py
@@ -1,10 +1,11 @@
 #!/usr/bin/python3
 import argparse
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, or_
 from config import DATABASE_URI
 from models import Base, Documents, Document_Tokens, Tokens
 from sqlalchemy.orm import sessionmaker
 import uuid
+import datetime
 
 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
@@ -14,18 +15,22 @@ Session = sessionmaker(bind=engine)
 def build_index():
     session = Session()
     # Read list of 1000 documents from db
-    documents = session.query(Documents).limit(1000)
+    documents = session.query(Documents).filter(or_(Documents.last_index_date.is_(None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000)
     for document in documents:
         words = document.text_content.split()
         for word in words:
+            if len(word) > 50:
+                continue
             token = session.query(Tokens).filter_by(token=word).first()
             if token is None:
                 token = Tokens(token=word, id=uuid.uuid4())
                 session.add(token)
             document_token = Document_Tokens(document_id=document.id, token_id=token.id)
             session.add(document_token)
+        document.last_index_date = datetime.datetime.now()
+        session.add(document)
     session.commit()
diff --git a/src/models.py b/src/models.py
index c2c1d07..de7e7a9 100644
--- a/src/models.py
+++ b/src/models.py
@@ -15,6 +15,7 @@ class Documents(Base):
     html_content = Column(String)
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
+    last_index_date = Column(DateTime)
 
     document_tokens = relationship("Document_Tokens", back_populates="document")
 
diff --git a/src/search.py b/src/search.py
index b95a83f..c5c233e 100755
--- a/src/search.py
+++ b/src/search.py
@@ -1,9 +1,10 @@
 #!/usr/bin/python3
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, func
 from config import DATABASE_URI
-from models import Base, Tokens
+from models import Base, Tokens, Documents, Document_Tokens
 from sqlalchemy.orm import sessionmaker
-
+from sqlalchemy.sql.expression import distinct
+import time
 from flask import Flask
 from urllib.parse import unquote
 
@@ -14,17 +15,71 @@ Session = sessionmaker(bind=engine)
 
 # Todo - Boolean search (AND/OR/NOT/"")
 
+def split_query(query):
+    result = {'ands': [], 'ors': [], 'words': []}
+    query_words = query.split()
+    i = 0
+    while i < len(query_words):
+        if i + 1 < len(query_words):
+            if query_words[i + 1].lower() == "and":
+                if i + 2 < len(query_words):
+                    result['ands'].append(
+                        query_words[i] + ',' + query_words[i+2])
+                    i = i + 3
+                    continue
+        result['words'].append(query_words[i])
+        i += 1
+    return result
+
+
 @app.route("/search/<query>")
 def search(query):
+    start_time = time.time_ns()
     session = Session()
-    result = []
-    query_words = unquote(query).split()
-    for word in query_words:
-        word = word.lower()
-        matching_token = session.query(Tokens).filter_by(token=word).first()
-        if session is None:
-            continue
-        for document_token in matching_token.document_tokens:
+    results = {}
+    query_words = split_query(unquote(query))
+    for a in query_words['ands']:
+        query = session.query(Documents.url, func.count(1)).\
+            join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
+            join(Tokens, Document_Tokens.token_id == Tokens.id).\
+            filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
+            group_by(Documents.url).\
+            having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+            order_by(func.count(1).desc())
+        for result in query.all():
+            if result[0] in results.keys():
+                results[result[0]] += result[1]
+            else:
+                results[result[0]] = result[1]
+    x = session.query(Tokens).filter(
+        Tokens.token.in_(query_words['words'])).limit(1000)
+    for y in x:
+        for document_token in y.document_tokens:
+            if document_token.document.url in results.keys():
+                results[document_token.document.url] += 1
+            else:
+                results[document_token.document.url] = 1
-            result.append(document_token.document.url)
-    return result
+    print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
+
+# @app.route("/search/<query>")
+# def search(query):
+#     start_time = time.time_ns()
+#     session = Session()
+#     result = {}
+#     query_words = unquote(query).split()
+#     x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
+#     for word in query_words:
+#         word = word.lower()
+#         matching_token = session.query(Tokens).filter_by(token=word).first()
+#
+#         if matching_token is None:
+#             continue
+#         for document_token in matching_token.document_tokens:
+#             if document_token.document.url in result.keys():
+#                 result[document_token.document.url] += 1
+#             else:
+#                 result[document_token.document.url] = 1
+#     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+#     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
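
A note on the new parser for anyone reviewing: split_query() only implements AND for now ('ors' is declared but never populated), and it collapses each "x and y" pair into a single comma-joined entry under 'ands'. A minimal sanity check, assuming it is run from the src/ directory so that config.DATABASE_URI resolves (create_engine does not open a connection at import time):

# Sketch only -- exercises the query parser added in src/search.py above.
from search import split_query

parsed = split_query("python and flask indexing")
print(parsed)
# Expected: {'ands': ['python,flask'], 'ors': [], 'words': ['indexing']}
# "python and flask" becomes the comma-joined pair consumed by the AND branch;
# "indexing" falls through to the plain word list and is scored per token hit.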
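And a rough manual check of the endpoint itself (a sketch, not part of the patch: it assumes the Flask app is served on localhost:5000, Flask's default dev port, that the index has been built, and a Flask version recent enough to serialise a returned list as JSON):

# Sketch only -- hits the /search/<query> route with an AND query.
import urllib.parse
import urllib.request

q = urllib.parse.quote("python and flask")  # parsed into the 'ands' bucket
with urllib.request.urlopen(f"http://localhost:5000/search/{q}") as resp:
    print(resp.read().decode())             # up to ten [url, score] pairs, best first

The interesting bit server-side is the HAVING count(distinct Document_Tokens.token_id) == 2 clause: grouping by URL and requiring two distinct token ids is what turns the plain IN (...) filter into "both terms must appear in the same document".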