Tidy up crawling and implement boolean search

rmgr 2024-04-04 20:46:34 +10:30
parent d4bb3fb8dc
commit 7ee9d978b2
4 changed files with 91 additions and 30 deletions

View file

@@ -40,21 +40,21 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     if not rp.can_fetch("*", url):
        print("Robots prevents crawling url: " + url)
         return
-    soup = BeautifulSoup(html,'html.parser')
+    soup = BeautifulSoup(html, 'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
     s = Session()
     existing_website = s.query(Documents).filter_by(url=url).first()
-    print (existing_website)
-    if existing_website == None:
+    if existing_website is None:
         website = Documents(
             url=url,
             text_content=soup.get_text(),
             html_content=soup.prettify(),
             first_crawl_date=datetime.datetime.now(),
-            last_crawl_date = datetime.datetime.now()
+            last_crawl_date=datetime.datetime.now(),
+            last_index_date=None
         )
         s.add(website)
     else:
@@ -64,44 +64,44 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     s.close()
     x = open(f'data/links.txt', 'a')
     x.close()
-    links = soup.find_all("a")
+    links = soup.find_all("a", href=True)
     for link in links:
         found = False
-        if not hasattr(link, "href"):
-            continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if not "http" in link:
+        if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
+            continue
+        if "http" not in link:
             link = urljoin(url, link)
         if (recursion > 0 and link not in traversed_links):
             try:
                 traversed_links.append(link)
                 link_html = get_html(link)
                 r = recursion -1
-                sleep(1)
+                sleep(0.5)
                 parse_html(link, link_html, r, traversed_links)
             except:
                 pass
         elif link not in traversed_links:
-            with open(f'data/links.txt', 'r+') as linksfile:
+            with open('data/links.txt', 'r+') as linksfile:
                 while line := linksfile.readline():
                     if line.strip() == link.strip():
                         found = True
                 if not found:
                     linksfile.write(f'{link}\n')
 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
     # check inputs
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-    max_recursion = 2
+    max_recursion = 4
     args = parser.parse_args()
     if args.url == "links":
-        with open(f'data/links.txt', 'r+') as linksfile:
+        with open('data/links.txt', 'r+') as linksfile:
             while line := linksfile.readline():
                 if "http" in line:
                     try:
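Taken together, the crawler now only walks anchor tags that actually carry an href, and skips in-page anchors, localhost links, and common binary assets. A minimal sketch of that filter as a standalone predicate (the helper name should_follow and the constant are illustrative, not part of the commit):

# Hypothetical helper mirroring the crawler's new link rules; not in the commit itself.
SKIPPED_SUFFIXES = (".webp", ".jpeg", ".png", ".gif", ".pdf", ".jpg")

def should_follow(href: str) -> bool:
    """Return True if the crawler above would consider following this href."""
    if len(href) > 0 and href[0] == "#":
        return False  # in-page anchor
    if "localhost" in href:
        return False  # local link
    if any(suffix in href for suffix in SKIPPED_SUFFIXES):
        return False  # image/PDF asset, skipped like the diff above
    return True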

View file

@@ -1,10 +1,11 @@
 #!/usr/bin/python3
 import argparse
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, or_
 from config import DATABASE_URI
 from models import Base, Documents, Document_Tokens, Tokens
 from sqlalchemy.orm import sessionmaker
 import uuid
+import datetime
 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
@@ -14,18 +15,22 @@ Session = sessionmaker(bind=engine)
 def build_index():
     session = Session()
     # Read list of 1000 documents from db
-    documents = session.query(Documents).limit(1000)
+    documents = session.query(Documents).filter(or_(Documents.last_index_date.is_(None), Documents.last_index_date<Documents.last_crawl_date)).limit(1000)
     for document in documents:
         print(document.url)
         content_words = document.text_content.split()
         for word in content_words:
             word = word.lower()
+            if len(word) > 50:
+                continue
             token = session.query(Tokens).filter_by(token=word).first()
             if token is None:
                 token = Tokens(token=word, id=uuid.uuid4())
                 session.add(token)
             document_token = Document_Tokens(document_id=document.id, token_id=token.id)
             session.add(document_token)
+        document.last_index_date = datetime.datetime.now()
+        session.add(document)
     session.commit()
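In plain terms, the new filter makes indexing incremental: a document is picked up only if it has never been indexed, or if it has been crawled again since its last indexing pass; the last_index_date stamp written at the end of the loop is what keeps already-indexed documents out of the next run. A plain-Python restatement of that condition, assuming a loaded Documents row (the helper name needs_indexing is illustrative):

# Restatement of the or_() filter above; not part of the commit.
def needs_indexing(document) -> bool:
    return (document.last_index_date is None
            or document.last_index_date < document.last_crawl_date)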

View file

@@ -15,6 +15,7 @@ class Documents(Base):
     html_content = Column(String)
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
+    last_index_date = Column(DateTime)
     document_tokens = relationship("Document_Tokens", back_populates="document")
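One practical note: Base.metadata.create_all() only creates tables that do not exist yet, so an already-populated database will not pick up the new column automatically. A one-off migration along these lines would likely be needed; the table name "documents" and the TIMESTAMP column type are assumptions, since neither appears in the diff:

# One-off migration sketch; table name and column type are assumed, not taken from the commit.
from sqlalchemy import create_engine, text
from config import DATABASE_URI

engine = create_engine(DATABASE_URI)
with engine.begin() as conn:
    conn.execute(text("ALTER TABLE documents ADD COLUMN last_index_date TIMESTAMP"))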

View file

@@ -1,9 +1,10 @@
 #!/usr/bin/python3
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, func
 from config import DATABASE_URI
-from models import Base, Tokens
+from models import Base, Tokens, Documents, Document_Tokens
 from sqlalchemy.orm import sessionmaker
+from sqlalchemy.sql.expression import distinct
+import time
 from flask import Flask
 from urllib.parse import unquote
@@ -14,17 +15,71 @@ Session = sessionmaker(bind=engine)
 # Todo - Boolean search (AND/OR/NOT/"")
+def split_query(query):
+    result = {'ands': [], 'ors': [], 'words': []}
+    query_words = query.split()
+    i = 0
+    while i < len(query_words):
+        if i + 1 < len(query_words):
+            if query_words[i + 1].lower() == "and":
+                if i + 2 < len(query_words):
+                    result['ands'].append(
+                        query_words[i] + ',' + query_words[i+2])
+                    i = i + 3
+                    continue
+        result['words'].append(query_words[i])
+        i += 1
+    return result
 @app.route("/search/<query>")
 def search(query):
+    start_time = time.time_ns()
     session = Session()
-    result = []
-    query_words = unquote(query).split()
-    for word in query_words:
-        word = word.lower()
-        matching_token = session.query(Tokens).filter_by(token=word).first()
-        if session is None:
-            continue
-        for document_token in matching_token.document_tokens:
-            result.append(document_token.document.url)
-    return result
+    results = {}
+    query_words = split_query(unquote(query))
+    for a in query_words['ands']:
+        query = session.query(Documents.url, func.count(1)).\
+            join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
+            join(Tokens, Document_Tokens.token_id == Tokens.id).\
+            filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
+            group_by(Documents.url).\
+            having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+            order_by(func.count(1).desc())
+        for result in query.all():
+            if result[0] in results.keys():
+                results[result[0]] += result[1]
+            else:
+                results[result[0]] = result[1]
+    x = session.query(Tokens).filter(
+        Tokens.token.in_(query_words['words'])).limit(1000)
+    for y in x:
+        for document_token in y.document_tokens:
+            if document_token.document.url in results.keys():
+                results[document_token.document.url] += 1
+            else:
+                results[document_token.document.url] = 1
+    print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
+# @app.route("/search/<query>")
+# def search(query):
+#     start_time = time.time_ns()
+#     session = Session()
+#     result = {}
+#     query_words = unquote(query).split()
+#     x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
+#     for word in query_words:
+#         word = word.lower()
+#         matching_token = session.query(Tokens).filter_by(token=word).first()
+#
+#         if matching_token is None:
+#             continue
+#         for document_token in matching_token.document_tokens:
+#             if document_token.document.url in result.keys():
+#                 result[document_token.document.url] += 1
+#             else:
+#                 result[document_token.document.url] = 1
+#     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+#     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
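For a quick sense of the new parser, split_query turns each "<word> AND <word>" group into a comma-joined entry under 'ands' and leaves everything else under 'words' (the 'ors' bucket is declared but not yet used). Traced by hand from the code above, not from a test suite:

# Expected behaviour of split_query as defined in the diff above.
split_query("cats AND dogs fish")
# -> {'ands': ['cats,dogs'], 'ors': [], 'words': ['fish']}
split_query("apple AND banana")
# -> {'ands': ['apple,banana'], 'ors': [], 'words': []}

Each AND pair is then answered by the grouped join query, which keeps only documents containing both tokens (the having clause requires count(distinct token_id) == 2), while loose words fall back to the per-token tally. Note that, unlike build_index, this path does not lower-case the query terms, so they appear to match the stored lower-cased tokens case-sensitively.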