From f4ea8ad1d776ea76241357186ed9cc1fb26e11a8 Mon Sep 17 00:00:00 2001 From: rmgr Date: Mon, 1 Jan 2024 19:53:22 +1030 Subject: [PATCH 01/15] Respect robots.txt --- beehave.txt | 1 + src/__pycache__/search.cpython-310.pyc | Bin 685 -> 1033 bytes src/crawl.py | 18 ++++++++++++++++-- src/index.py | 8 ++++++++ 4 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 beehave.txt diff --git a/beehave.txt b/beehave.txt new file mode 100644 index 0000000..e3415b6 --- /dev/null +++ b/beehave.txt @@ -0,0 +1 @@ +https://github.com/bitbrain/beehave diff --git a/src/__pycache__/search.cpython-310.pyc b/src/__pycache__/search.cpython-310.pyc index c7402826f1c52ce28e47c499f6b8c84b197fb854..f3e8621f1765bb2ca051342506fd63ec8716cacf 100644 GIT binary patch delta 673 zcmZuu&1w`u5U%R^+1bhLx>2I8F6zM*28pN$UWP@85HJveIrsw-#_8GYvc0oOcdtKT zLWBvJoMhRvkQ{UH6?_RVGLPXCSUo{Ru#4}jrn{=Ys_JjfvDch58a~i!vJYu&&6*v2 zKD+y@;qm$*y!N@Z386pRTe)7N7>{5P)OR8c962l4myOxa?!lvdCwESqQ!qYf??A5Q zEO*-Q6y9`?LA$#48T!aAZXbHNU9bZ8UG3#=f!rA}*~{&d`VpKW!frQrIc0L!fi@hG zZ^L8Kjp%0gN^ZXZ1{`d%!a~@D2cTYFd{2qSUJ*DQhPC+yF0CaEjG}EC!-^M!~TMU&h-TqMMRF;-Vv(k-scG8Sf#b5E^LD?F| zIvEVJ?Jb$AsnANA(aXA_1@oSJE@ysC-69H@j~i=Nf9g;XMM)qII|j&5re6$!m_PJ z0yY-;LtFVLwDxW~=WxE6a}QjYcR3SYT&?ly&DrpIn%z(dE5`9oX-cm7Y9mn$Oc$jbb~%M zDi?iy9VNb-ec~bi0$tC3*};ix!a$#26`wR~gbJ3KB=I>V^B4{#BHR4Y%T)X2TwL@5 rbE}b(b_$fT4`;Vow7UKuOB8c?bQi0KUU str: response = requests.get(url) return response.content -def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool: +def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool: + rp = urllib.robotparser.RobotFileParser() print(url) print(recursion) urlparts = urlparse(url) baseurl = urlparts.scheme + "://" + urlparts.netloc + if baseurl not in robots: + rp.set_url(baseurl + "/robots.txt") + rp.read() + robots[baseurl] = rp + else: + rp = robots[baseurl] + if not rp.can_fetch("*", url): + print("Robots prevents crawling url: " + url) + return + soup = BeautifulSoup(html,'html.parser') hash = hashlib.sha256() hash.update(url.encode('ascii')) @@ -36,6 +47,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> links = soup.find_all("a") for link in links: found = False + if "href" not in link: + continue link = link["href"] if (len(link) > 0 and link[0] == "#") or "localhost" in link: continue @@ -80,4 +93,5 @@ if __name__ == "__main__": # parse_html(line, get_html(line)) # except: # pass + os.remove('data/links.txt') diff --git a/src/index.py b/src/index.py index f55a356..7532247 100755 --- a/src/index.py +++ b/src/index.py @@ -7,6 +7,13 @@ import json # investigate ngrams for "multi word" matching ignored_words = ['a', 'the','is'] +def remove_punctuation(input_string): + punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~''' + for p in punc: + input_string = input_string.replace(p, '') + return input_string + + def build_index(): with open(f"data/index.json", "w") as index: # get a list of all content files @@ -20,6 +27,7 @@ def build_index(): content_words = content.split() for word in content_words: word = word.lower() + word = remove_punctuation(word) if not word in ignored_words: if not word in dictionary: dictionary[word] = [] From efe6dea1f575480d14db9f5df75848cd5bb44482 Mon Sep 17 00:00:00 2001 From: rmgr Date: Mon, 1 Jan 2024 20:52:12 +1030 Subject: [PATCH 02/15] Fix crawling. Add initial linksfile crawling. Still need to remove records as they are processed. 
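A rough sketch of the flow this patch aims for, for anyone reading the diff
below (names mirror src/crawl.py; the import path and the eventual cleanup of
processed records are assumptions at this stage, as the subject notes):

    # Drain the crawl queue that parse_html() keeps appending to in
    # data/links.txt. Run from src/ so the local module import resolves.
    from crawl import get_html, parse_html

    with open('data/links.txt', 'r+') as linksfile:
        while line := linksfile.readline():
            if "http" in line:
                try:
                    parse_html(line, get_html(line))
                except Exception:
                    # Fetch/parse errors just skip the entry for now; the
                    # record is not yet removed from links.txt after it is
                    # processed.
                    pass

In practice that means "crawl.py <url>" seeds the queue and "crawl.py links"
drains it.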
--- src/crawl.py | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index da6bffa..a0d2e64 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -47,7 +47,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro links = soup.find_all("a") for link in links: found = False - if "href" not in link: + if not hasattr(link, "href"): continue link = link["href"] if (len(link) > 0 and link[0] == "#") or "localhost" in link: @@ -63,13 +63,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro parse_html(link, link_html, r, traversed_links) except: pass -# else: -# with open(f'data/links.txt', 'r+') as linksfile: -# while line := linksfile.readline(): -# if line.strip() == link.strip(): -# found = True -# if not found: -# linksfile.write(f'{link}\n') + elif link not in traversed_links: + with open(f'data/links.txt', 'r+') as linksfile: + while line := linksfile.readline(): + if line.strip() == link.strip(): + found = True + if not found: + linksfile.write(f'{link}\n') if __name__ == "__main__": os.makedirs("data/content", exist_ok=True) @@ -77,21 +77,21 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("url", help="URL of the webpage to be crawled") parser.add_argument('-f', "--followlinks", action="store_true") - max_recursion = 4 + max_recursion = 2 args = parser.parse_args() - html = get_html(args.url) - parse_html(args.url, html, max_recursion) + if args.url == "links": + with open(f'data/links.txt', 'r+') as linksfile: + while line := linksfile.readline(): + if "http" in line: + try: + parse_html(line, get_html(line)) + except: + pass + + else: + html = get_html(args.url) + parse_html(args.url, html, max_recursion) # recursion = 0 # if (args.followlinks): -# with open(f'data/links.txt', 'r+') as linksfile: -# while line := linksfile.readline(): -# if recursion < max_recursion: -# if "http" in line: -# recursion += 1 -# try: -# parse_html(line, get_html(line)) -# except: -# pass - - os.remove('data/links.txt') +# os.remove('data/links.txt') From aed568d11ed36132e418e4eb8b1def9855dc3ca8 Mon Sep 17 00:00:00 2001 From: rmgr Date: Sat, 2 Mar 2024 19:54:53 +1030 Subject: [PATCH 03/15] Remove beehave.txt note --- beehave.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 beehave.txt diff --git a/beehave.txt b/beehave.txt deleted file mode 100644 index e3415b6..0000000 --- a/beehave.txt +++ /dev/null @@ -1 +0,0 @@ -https://github.com/bitbrain/beehave From 8605ee6b2c8ed8b6da9f7a4bd3d1427f48673878 Mon Sep 17 00:00:00 2001 From: rmgr Date: Sat, 2 Mar 2024 19:58:10 +1030 Subject: [PATCH 04/15] Add todo file --- todo | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 todo diff --git a/todo b/todo new file mode 100644 index 0000000..2c7e8cc --- /dev/null +++ b/todo @@ -0,0 +1,6 @@ +[ ] Refactor website table to generic document table (maybe using URN instead of URL?) 
+[ ] Define tokens table FKed to document table +[ ] Refactor index.py to tokenize input into tokens table +[ ] Define N-Grams table +[ ] Add N-Gram generation to index.py + From 20d198e5595f33244d7a364ab2538e983dd8ab71 Mon Sep 17 00:00:00 2001 From: rmgr Date: Thu, 7 Mar 2024 20:44:34 +1030 Subject: [PATCH 05/15] Refactor to use postgresql end to end --- src/crawl.py | 9 +++--- src/index.py | 83 ++++++++++++++++++++++++------------------------ src/index.py.old | 54 +++++++++++++++++++++++++++++++ src/models.py | 26 ++++++++++++--- src/search.py | 44 ++++++++++++------------- 5 files changed, 144 insertions(+), 72 deletions(-) create mode 100644 src/index.py.old diff --git a/src/crawl.py b/src/crawl.py index 9521b5d..3856300 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -9,7 +9,7 @@ from time import sleep from bs4 import BeautifulSoup from sqlalchemy import create_engine from config import DATABASE_URI -from models import Base, Website +from models import Base, Documents, Document_Tokens, Tokens from sqlalchemy.orm import sessionmaker from sqlalchemy import create_engine import datetime @@ -19,11 +19,12 @@ engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) -def get_html(url: str) -> str: +def get_html(url: str) -> str: response = requests.get(url) return response.content + def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool: rp = urllib.robotparser.RobotFileParser() print(url) @@ -45,10 +46,10 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro hash.update(url.encode('ascii')) s = Session() - existing_website = s.query(Website).filter_by(url=url).first() + existing_website = s.query(Documents).filter_by(url=url).first() print (existing_website) if existing_website == None: - website = Website( + website = Documents( url=url, text_content=soup.get_text(), html_content=soup.prettify(), diff --git a/src/index.py b/src/index.py index e04c787..c80b5e7 100644 --- a/src/index.py +++ b/src/index.py @@ -1,54 +1,53 @@ +#!/usr/bin/python3 +import argparse +import requests +import hashlib +from urllib.parse import urlparse, urljoin +import urllib.robotparser +import os +from time import sleep +from bs4 import BeautifulSoup from sqlalchemy import create_engine from config import DATABASE_URI -from models import Base, Website -from pathlib import Path -import argparse -import os -import json -# investigate ngrams for "multi word" matching -ignored_words = ['a', 'the','is'] +from models import Base, Documents, Document_Tokens, Tokens +from sqlalchemy.orm import sessionmaker +from sqlalchemy import create_engine +import datetime -def remove_punctuation(input_string): - punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!''' - for p in punc: - input_string = input_string.replace(p, '') - return input_string +engine = create_engine(DATABASE_URI) +Base.metadata.create_all(engine) +Session = sessionmaker(bind=engine) def build_index(): - with open(f"data/index.json", "w") as index: - # get a list of all content files - # split on whitespace and add to index - dictionary = {} - pathlist = Path('data/content').rglob('*.txt') - for path in pathlist: - with open(str(path)) as content_file: - url = content_file.readline() - content = content_file.read() - content_words = content.split() - for word in content_words: - word = word.lower() - word = remove_punctuation(word) - if not word in ignored_words: - if not word in dictionary: - dictionary[word] = [] - matching_urls = list(filter(lambda 
entry: entry["url"] == url.strip(), dictionary[word])) - if len(matching_urls) == 0: -# if not url.strip() in dictionary[word]: - entries = dictionary[word] - entry = {"url": url.strip(), "count": 1, "filename": str(path)} - dictionary[word].append(entry) - else: - entries = dictionary[word] - entry = matching_urls[0] - entry["count"] += 1 - entries.sort(reverse=True, key=lambda entry: entry["count"]) - index.write(json.dumps(dictionary)) + session = Session() + # Read list of 1000 documents from db + documents = session.query(Documents).limit(1000) + for document in documents: + print(document.url) + content_words = document.text_content.split() + for word in content_words: + word = word.lower() + token = session.query(Tokens).filter_by(token=word).first() + if token is None: + token = Tokens(token=word) + session.add(token) + document_token = Document_Tokens(document_id=document.id, token_id=token.id) + session.add(document_token) + session.commit() + + # Foreach document, break into words + # Check if word exists in database + # Create if not exist + # Link to document + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild") + parser.add_argument('-r', + "--rebuild", + action="store_true", + help="Blow away the index and rebuild") args = parser.parse_args() if args.rebuild: build_index() - diff --git a/src/index.py.old b/src/index.py.old new file mode 100644 index 0000000..6ec8e21 --- /dev/null +++ b/src/index.py.old @@ -0,0 +1,54 @@ +from sqlalchemy import create_engine +from config import DATABASE_URI +from models import Base, Website +from pathlib import Path +import argparse +import os +import json +# investigate ngrams for "multi word" matching +ignored_words = ['a', 'the','is'] + +def remove_punctuation(input_string): + punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!''' + for p in punc: + input_string = input_string.replace(p, '') + return input_string + + +def build_index(): + with open("data/index.json", "w") as index: + # get a list of all content files + # split on whitespace and add to index + dictionary = {} + pathlist = Path('data/content').rglob('*.txt') + for path in pathlist: + with open(str(path)) as content_file: + url = content_file.readline() + content = content_file.read() + content_words = content.split() + for word in content_words: + word = word.lower() + word = remove_punctuation(word) + if word not in ignored_words: + if word not in dictionary: + dictionary[word] = [] + matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word])) + if len(matching_urls) == 0: +# if not url.strip() in dictionary[word]: + entries = dictionary[word] + entry = {"url": url.strip(), "count": 1, "filename": str(path)} + dictionary[word].append(entry) + else: + entries = dictionary[word] + entry = matching_urls[0] + entry["count"] += 1 + entries.sort(reverse=True, key=lambda entry: entry["count"]) + index.write(json.dumps(dictionary)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild") + args = parser.parse_args() + if args.rebuild: + build_index() + diff --git a/src/models.py b/src/models.py index ee768d4..c2c1d07 100644 --- a/src/models.py +++ b/src/models.py @@ -1,18 +1,36 @@ from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, DateTime +from sqlalchemy import Column, String, DateTime, 
ForeignKey, Index from sqlalchemy.dialects.postgresql import UUID +from sqlalchemy.orm import relationship, mapped_column import uuid Base = declarative_base() -class Website(Base): - __tablename__ = 'websites' - id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4) +class Documents(Base): + __tablename__ = 'documents' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) url = Column(String) text_content = Column(String) html_content = Column(String) first_crawl_date = Column(DateTime) last_crawl_date = Column(DateTime) + document_tokens = relationship("Document_Tokens", back_populates="document") +class Document_Tokens(Base): + __tablename__ = 'document_tokens' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document_id = mapped_column(ForeignKey("documents.id")) + # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + token_id = mapped_column(ForeignKey("tokens.id")) + #Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document = relationship("Documents", back_populates="document_tokens", uselist=False) + token = relationship("Tokens", back_populates="document_tokens") + + +class Tokens(Base): + __tablename__ = 'tokens' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + token = Column(String, index=True) + document_tokens = relationship("Document_Tokens", back_populates="token") diff --git a/src/search.py b/src/search.py index 17668f9..b95a83f 100755 --- a/src/search.py +++ b/src/search.py @@ -1,30 +1,30 @@ -#!/bin/bash +#!/usr/bin/python3 +from sqlalchemy import create_engine +from config import DATABASE_URI +from models import Base, Tokens +from sqlalchemy.orm import sessionmaker from flask import Flask -from flask import Request -import json from urllib.parse import unquote app = Flask(__name__) -## Todo - Boolean search (AND/OR/NOT/"") +engine = create_engine(DATABASE_URI) +Base.metadata.create_all(engine) +Session = sessionmaker(bind=engine) +# Todo - Boolean search (AND/OR/NOT/"") + + @app.route("/search/") def search(query): - with open('data/index.json', 'r') as index_json: - index = json.load(index_json) - query = unquote(query) - query_split = query.split() - result = [] - for q in query_split: - q = q.lower() - if q in index: - for item in index[q]: - matching_results = list(filter(lambda entry: entry['url'] == item["url"], result)) - if len(matching_results) == 0: - result.append(item) - else: - matching_results[0]["count"] += item["count"] - return result - -def handle_and(): - pass + session = Session() + result = [] + query_words = unquote(query).split() + for word in query_words: + word = word.lower() + matching_token = session.query(Tokens).filter_by(token=word).first() + if session is None: + continue + for document_token in matching_token.document_tokens: + result.append(document_token.document.url) + return result From d4bb3fb8dc00d2f8c8bdee00def5731fa5a053c3 Mon Sep 17 00:00:00 2001 From: rmgr Date: Thu, 7 Mar 2024 21:12:19 +1030 Subject: [PATCH 06/15] Tidy up index.py --- src/index.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/index.py b/src/index.py index c80b5e7..227815e 100644 --- a/src/index.py +++ b/src/index.py @@ -1,18 +1,10 @@ #!/usr/bin/python3 import argparse -import requests -import hashlib -from urllib.parse import urlparse, urljoin -import urllib.robotparser -import os -from time import sleep -from bs4 import BeautifulSoup from sqlalchemy import create_engine from config import DATABASE_URI 
from models import Base, Documents, Document_Tokens, Tokens from sqlalchemy.orm import sessionmaker -from sqlalchemy import create_engine -import datetime +import uuid engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) @@ -30,16 +22,11 @@ def build_index(): word = word.lower() token = session.query(Tokens).filter_by(token=word).first() if token is None: - token = Tokens(token=word) + token = Tokens(token=word, id=uuid.uuid4()) session.add(token) document_token = Document_Tokens(document_id=document.id, token_id=token.id) session.add(document_token) session.commit() - - # Foreach document, break into words - # Check if word exists in database - # Create if not exist - # Link to document if __name__ == "__main__": From 7ee9d978b26faa7ef5a73c83f060fef7549ab349 Mon Sep 17 00:00:00 2001 From: rmgr Date: Thu, 4 Apr 2024 20:46:34 +1030 Subject: [PATCH 07/15] Tidy up crawling and implement boolean search --- src/crawl.py | 30 +++++++++---------- src/index.py | 9 ++++-- src/models.py | 1 + src/search.py | 81 ++++++++++++++++++++++++++++++++++++++++++--------- 4 files changed, 91 insertions(+), 30 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index 3856300..e7e35be 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -40,21 +40,21 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro if not rp.can_fetch("*", url): print("Robots prevents crawling url: " + url) return - - soup = BeautifulSoup(html,'html.parser') + + soup = BeautifulSoup(html, 'html.parser') hash = hashlib.sha256() hash.update(url.encode('ascii')) s = Session() existing_website = s.query(Documents).filter_by(url=url).first() - print (existing_website) - if existing_website == None: + if existing_website is None: website = Documents( url=url, text_content=soup.get_text(), html_content=soup.prettify(), first_crawl_date=datetime.datetime.now(), - last_crawl_date = datetime.datetime.now() + last_crawl_date=datetime.datetime.now(), + last_index_date=None ) s.add(website) else: @@ -64,44 +64,44 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro s.close() x = open(f'data/links.txt', 'a') x.close() - links = soup.find_all("a") + links = soup.find_all("a", href=True) for link in links: found = False - if not hasattr(link, "href"): - continue link = link["href"] if (len(link) > 0 and link[0] == "#") or "localhost" in link: continue - if not "http" in link: + if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link: + continue + if "http" not in link: link = urljoin(url, link) if (recursion > 0 and link not in traversed_links): try: traversed_links.append(link) link_html = get_html(link) - r = recursion -1 - sleep(1) + r = recursion -1 + sleep(0.5) parse_html(link, link_html, r, traversed_links) except: pass elif link not in traversed_links: - with open(f'data/links.txt', 'r+') as linksfile: + with open('data/links.txt', 'r+') as linksfile: while line := linksfile.readline(): if line.strip() == link.strip(): found = True if not found: linksfile.write(f'{link}\n') -if __name__ == "__main__": +if __name__ == "__main__": os.makedirs("data/content", exist_ok=True) # check inputs parser = argparse.ArgumentParser() parser.add_argument("url", help="URL of the webpage to be crawled") parser.add_argument('-f', "--followlinks", action="store_true") - max_recursion = 2 + max_recursion = 4 args = parser.parse_args() if args.url == "links": - with open(f'data/links.txt', 'r+') as linksfile: + with open('data/links.txt', 
'r+') as linksfile: while line := linksfile.readline(): if "http" in line: try: diff --git a/src/index.py b/src/index.py index 227815e..d7259ce 100644 --- a/src/index.py +++ b/src/index.py @@ -1,10 +1,11 @@ #!/usr/bin/python3 import argparse -from sqlalchemy import create_engine +from sqlalchemy import create_engine, or_ from config import DATABASE_URI from models import Base, Documents, Document_Tokens, Tokens from sqlalchemy.orm import sessionmaker import uuid +import datetime engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) @@ -14,18 +15,22 @@ Session = sessionmaker(bind=engine) def build_index(): session = Session() # Read list of 1000 documents from db - documents = session.query(Documents).limit(1000) + documents = session.query(Documents).filter(or_(Documents.last_index_date.is_(None), Documents.last_index_date 50: + continue token = session.query(Tokens).filter_by(token=word).first() if token is None: token = Tokens(token=word, id=uuid.uuid4()) session.add(token) document_token = Document_Tokens(document_id=document.id, token_id=token.id) session.add(document_token) + document.last_index_date = datetime.datetime.now() + session.add(document) session.commit() diff --git a/src/models.py b/src/models.py index c2c1d07..de7e7a9 100644 --- a/src/models.py +++ b/src/models.py @@ -15,6 +15,7 @@ class Documents(Base): html_content = Column(String) first_crawl_date = Column(DateTime) last_crawl_date = Column(DateTime) + last_index_date = Column(DateTime) document_tokens = relationship("Document_Tokens", back_populates="document") diff --git a/src/search.py b/src/search.py index b95a83f..c5c233e 100755 --- a/src/search.py +++ b/src/search.py @@ -1,9 +1,10 @@ #!/usr/bin/python3 -from sqlalchemy import create_engine +from sqlalchemy import create_engine, func from config import DATABASE_URI -from models import Base, Tokens +from models import Base, Tokens, Documents, Document_Tokens from sqlalchemy.orm import sessionmaker - +from sqlalchemy.sql.expression import distinct +import time from flask import Flask from urllib.parse import unquote @@ -14,17 +15,71 @@ Session = sessionmaker(bind=engine) # Todo - Boolean search (AND/OR/NOT/"") +def split_query(query): + result = {'ands': [], 'ors': [], 'words': []} + query_words = query.split() + i = 0 + while i < len(query_words): + if i + 1 < len(query_words): + if query_words[i + 1].lower() == "and": + if i + 2 < len(query_words): + result['ands'].append( + query_words[i] + ',' + query_words[i+2]) + i = i + 3 + continue + result['words'].append(query_words[i]) + i += 1 + return result + + @app.route("/search/") def search(query): + start_time = time.time_ns() session = Session() - result = [] - query_words = unquote(query).split() - for word in query_words: - word = word.lower() - matching_token = session.query(Tokens).filter_by(token=word).first() - if session is None: - continue - for document_token in matching_token.document_tokens: + results = {} + query_words = split_query(unquote(query)) + for a in query_words['ands']: + query = session.query(Documents.url, func.count(1)).\ + join(Document_Tokens, Documents.id == Document_Tokens.document_id).\ + join(Tokens, Document_Tokens.token_id == Tokens.id).\ + filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\ + group_by(Documents.url).\ + having(func.count(distinct(Document_Tokens.token_id)) == 2).\ + order_by(func.count(1).desc()) + for result in query.all(): + if result[0] in results.keys(): + results[result[0]] += result[1] + else: + results[result[0]] = result[1] + x 
= session.query(Tokens).filter( + Tokens.token.in_(query_words['words'])).limit(1000) + for y in x: + for document_token in y.document_tokens: + if document_token.document.url in results.keys(): + results[document_token.document.url] += 1 + else: + results[document_token.document.url] = 1 - result.append(document_token.document.url) - return result + print(str((time.time_ns() - start_time) // 1_000_000) + "ms") + return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10] + +# @app.route("/search/") +# def search(query): +# start_time = time.time_ns() +# session = Session() +# result = {} +# query_words = unquote(query).split() +# x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000) +# for word in query_words: +# word = word.lower() +# matching_token = session.query(Tokens).filter_by(token=word).first() +# +# if matching_token is None: +# continue +# for document_token in matching_token.document_tokens: +# if document_token.document.url in result.keys(): +# result[document_token.document.url] += 1 +# else: +# result[document_token.document.url] = 1 +# print(str((time.time_ns() - start_time) // 1_000_000) + "ms") +# return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10] From 343410e62f5a8754f845d0cba064900285e2edaa Mon Sep 17 00:00:00 2001 From: rmgr Date: Fri, 5 Apr 2024 06:22:56 +1030 Subject: [PATCH 08/15] Add first pass youtube subtitle indexer --- src/crawl.py | 82 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 70 insertions(+), 12 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index e7e35be..bf814e2 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -13,6 +13,7 @@ from models import Base, Documents, Document_Tokens, Tokens from sqlalchemy.orm import sessionmaker from sqlalchemy import create_engine import datetime +import yt_dlp as youtube_dl # TODO- Handle gemini/gopher links engine = create_engine(DATABASE_URI) @@ -25,7 +26,64 @@ def get_html(url: str) -> str: return response.content -def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool: +def parse_youtube(video_url: str) -> bool: + # Language preference for subtitles (set to None for auto-generated) + # Change this to 'en' for English subtitles, or None for auto-generated + subtitle_language = 'en' + # Options for youtube_dl + ydl_opts = { + 'writesubtitles': True, + 'allsubtitles': True, + 'skip_download': True, # We only want to fetch metadata + 'subtitleslangs': [subtitle_language] if subtitle_language else None, + } + + # Initialize youtube_dl object + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + # Download metadata + info_dict = ydl.extract_info(video_url, download=False) + + # Extract subtitles + subtitles = info_dict.get('subtitles') + subtitles_text = "" + # Print available subtitles + if subtitles: + for subs in subtitles.values(): + for sub in subs: + subtitle_url = sub['url'] + with youtube_dl.YoutubeDL({}) as ydl: + subtitle_info = ydl.extract_info( + subtitle_url, download=False) + for subtitle in subtitle_info['subtitles'][subtitle_language]: + if subtitle["ext"] == "srv1": + soup = BeautifulSoup( + get_html(subtitle["url"]), 'html.parser') + subtitles_text = soup.get_text() + + s = Session() + existing_website = s.query( + Documents).filter_by(url=video_url).first() + if existing_website is None: + website = Documents( + url=video_url, + text_content=subtitles_text, + html_content=None, # soup.prettify(), + first_crawl_date=datetime.datetime.now(), + last_crawl_date=datetime.datetime.now(), + 
last_index_date=None + ) + s.add(website) + else: + existing_website.last_crawl_date = datetime.datetime.now() + s.add(existing_website) + s.commit() + s.close() + + +def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool: + if "youtube.com" in url: + parse_youtube(url) + return rp = urllib.robotparser.RobotFileParser() print(url) print(recursion) @@ -49,13 +107,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro existing_website = s.query(Documents).filter_by(url=url).first() if existing_website is None: website = Documents( - url=url, - text_content=soup.get_text(), - html_content=soup.prettify(), - first_crawl_date=datetime.datetime.now(), - last_crawl_date=datetime.datetime.now(), - last_index_date=None - ) + url=url, + text_content=soup.get_text(), + html_content=soup.prettify(), + first_crawl_date=datetime.datetime.now(), + last_crawl_date=datetime.datetime.now(), + last_index_date=None + ) s.add(website) else: existing_website.last_crawl_date = datetime.datetime.now() @@ -78,7 +136,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro try: traversed_links.append(link) link_html = get_html(link) - r = recursion -1 + r = recursion - 1 sleep(0.5) parse_html(link, link_html, r, traversed_links) except: @@ -98,9 +156,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("url", help="URL of the webpage to be crawled") parser.add_argument('-f', "--followlinks", action="store_true") - max_recursion = 4 + max_recursion = 4 args = parser.parse_args() - if args.url == "links": + if args.url == "links": with open('data/links.txt', 'r+') as linksfile: while line := linksfile.readline(): if "http" in line: @@ -112,7 +170,7 @@ if __name__ == "__main__": else: html = get_html(args.url) parse_html(args.url, html, max_recursion) - + # recursion = 0 # if (args.followlinks): # os.remove('data/links.txt') From 9d57f66cd763032c7d6ac4d9a6d0b49f8d74cffe Mon Sep 17 00:00:00 2001 From: rmgr Date: Fri, 5 Apr 2024 21:36:15 +1030 Subject: [PATCH 09/15] Add beginnings of ngram search capability --- src/crawl.py | 4 ++-- src/index.py | 60 +++++++++++++++++++++++++++++++++++++++++++++------ src/models.py | 32 +++++++++++++++++++++++---- src/search.py | 31 ++++++++++++++++++++++---- 4 files changed, 110 insertions(+), 17 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index bf814e2..c62f4a9 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -1,4 +1,5 @@ #!/usr/bin/python3 + import argparse import requests import hashlib @@ -9,9 +10,8 @@ from time import sleep from bs4 import BeautifulSoup from sqlalchemy import create_engine from config import DATABASE_URI -from models import Base, Documents, Document_Tokens, Tokens +from models import Base, Documents, Document_Tokens from sqlalchemy.orm import sessionmaker -from sqlalchemy import create_engine import datetime import yt_dlp as youtube_dl # TODO- Handle gemini/gopher links diff --git a/src/index.py b/src/index.py index d7259ce..e73c93d 100644 --- a/src/index.py +++ b/src/index.py @@ -1,24 +1,29 @@ #!/usr/bin/python3 + import argparse from sqlalchemy import create_engine, or_ from config import DATABASE_URI -from models import Base, Documents, Document_Tokens, Tokens +from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams from sqlalchemy.orm import sessionmaker import uuid import datetime +import re +from multiprocessing import Pool engine = create_engine(DATABASE_URI) 
Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) -def build_index(): +def build_index_chunk(document_chunk): session = Session() - # Read list of 1000 documents from db - documents = session.query(Documents).filter(or_(Documents.last_index_date.is_(None), Documents.last_index_date 50: @@ -27,11 +32,52 @@ def build_index(): if token is None: token = Tokens(token=word, id=uuid.uuid4()) session.add(token) - document_token = Document_Tokens(document_id=document.id, token_id=token.id) + document_token = Document_Tokens( + document_id=document.id, token_id=token.id) session.add(document_token) document.last_index_date = datetime.datetime.now() session.add(document) session.commit() + session.close() + + +def build_index(): + session = Session() + documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_( + None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000) + session.close() + + documents = list(documents_query) # Execute the query to get the result set + + chunk_size = 100 + document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)] + + with Pool() as pool: + pool.map(build_index_chunk, document_chunks) + + +def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str): + i = 0 + while i < len(corpus): + if i + size >= len(corpus): + i = len(corpus) + gram = '' + for n in range(0, size): + if i + n >= len(corpus): + break + gram += corpus[i+n] + ' ' + gram = gram.rstrip().lower() + print(gram) + + ngram = session.query(NGrams).filter_by(gram=gram).first() + if ngram is None: + ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram) + session.add(ngram) + document_ngram = Document_NGrams( + document_id=document_id, ngram_id=ngram.id) + session.add(document_ngram) + session.commit() + i += 1 if __name__ == "__main__": diff --git a/src/models.py b/src/models.py index de7e7a9..c73ea7d 100644 --- a/src/models.py +++ b/src/models.py @@ -1,5 +1,5 @@ from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, String, DateTime, ForeignKey, Index +from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.orm import relationship, mapped_column import uuid @@ -16,7 +16,10 @@ class Documents(Base): first_crawl_date = Column(DateTime) last_crawl_date = Column(DateTime) last_index_date = Column(DateTime) - document_tokens = relationship("Document_Tokens", back_populates="document") + document_tokens = relationship( + "Document_Tokens", back_populates="document") + document_ngrams = relationship( + "Document_NGrams", back_populates="document") class Document_Tokens(Base): @@ -25,8 +28,9 @@ class Document_Tokens(Base): document_id = mapped_column(ForeignKey("documents.id")) # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) token_id = mapped_column(ForeignKey("tokens.id")) - #Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) - document = relationship("Documents", back_populates="document_tokens", uselist=False) + # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document = relationship( + "Documents", back_populates="document_tokens", uselist=False) token = relationship("Tokens", back_populates="document_tokens") @@ -35,3 +39,23 @@ class Tokens(Base): id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) token = Column(String, index=True) document_tokens = relationship("Document_Tokens", back_populates="token") + + 
+class NGrams(Base): + __tablename__ = 'ngrams' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + size = Column(Integer, index=True) + gram = Column(String, index=True) + document_ngrams = relationship("Document_NGrams", back_populates="ngram") + + +class Document_NGrams(Base): + __tablename__ = 'document_ngrams' + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document_id = mapped_column(ForeignKey("documents.id")) + # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + ngram_id = mapped_column(ForeignKey("ngrams.id")) + # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + document = relationship( + "Documents", back_populates="document_ngrams", uselist=False) + ngram = relationship("NGrams", back_populates="document_ngrams") diff --git a/src/search.py b/src/search.py index c5c233e..f77927b 100755 --- a/src/search.py +++ b/src/search.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 from sqlalchemy import create_engine, func from config import DATABASE_URI -from models import Base, Tokens, Documents, Document_Tokens +from models import Base, Tokens, Documents, Document_Tokens, NGrams from sqlalchemy.orm import sessionmaker from sqlalchemy.sql.expression import distinct import time @@ -16,7 +16,7 @@ Session = sessionmaker(bind=engine) def split_query(query): - result = {'ands': [], 'ors': [], 'words': []} + result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []} query_words = query.split() i = 0 while i < len(query_words): @@ -27,19 +27,31 @@ def split_query(query): query_words[i] + ',' + query_words[i+2]) i = i + 3 continue + if query_words[i][0] == '"': + n = 0 + quoted_query = "" + while i+n < len(query_words): + quoted_query += query_words[i+n] + ' ' + if query_words[i+n][len(query_words[i+n])-1] == '"': + break + n += 1 + result['ngrams'].append( + quoted_query[1:len(quoted_query)-2].rstrip()) + i += n + continue result['words'].append(query_words[i]) i += 1 return result -@app.route("/search/") +@ app.route("/search/") def search(query): start_time = time.time_ns() session = Session() results = {} query_words = split_query(unquote(query)) for a in query_words['ands']: - query = session.query(Documents.url, func.count(1)).\ + query = session.query(Documents.url, func.count(1)). 
\ join(Document_Tokens, Documents.id == Document_Tokens.document_id).\ join(Tokens, Document_Tokens.token_id == Tokens.id).\ filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\ @@ -51,6 +63,17 @@ def search(query): results[result[0]] += result[1] else: results[result[0]] = result[1] + x = session.query(NGrams).filter( + NGrams.gram.in_(query_words['ngrams'])).all() + + for y in x: + print(y.gram) + for document_ngram in y.document_ngrams: + if document_ngram.document.url in results.keys(): + results[document_ngram.document.url] += 1 + else: + results[document_ngram.document.url] = 1 + x = session.query(Tokens).filter( Tokens.token.in_(query_words['words'])).limit(1000) for y in x: From 9f0e7e6b299c8439d43494f902410be83576f130 Mon Sep 17 00:00:00 2001 From: rmgr Date: Sat, 6 Apr 2024 19:34:59 +1030 Subject: [PATCH 10/15] Indexer and query optimisations --- src/index.py | 33 ++++++++++++++--------- src/search.py | 73 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 63 insertions(+), 43 deletions(-) diff --git a/src/index.py b/src/index.py index e73c93d..4629c75 100644 --- a/src/index.py +++ b/src/index.py @@ -17,10 +17,12 @@ Session = sessionmaker(bind=engine) def build_index_chunk(document_chunk): session = Session() + print(len(document_chunk)) for document in document_chunk: print(document.url) content = re.sub(r'[^\w\s]', '', str(document.text_content)) content_words = content.split() + build_ngrams(2, content_words, session, document.id) build_ngrams(3, content_words, session, document.id) build_ngrams(4, content_words, session, document.id) build_ngrams(5, content_words, session, document.id) @@ -43,17 +45,21 @@ def build_index_chunk(document_chunk): def build_index(): session = Session() - documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_( - None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000) - session.close() - - documents = list(documents_query) # Execute the query to get the result set + while True: + documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_( + None), Documents.last_index_date < Documents.last_crawl_date)).limit(100) + session.close() - chunk_size = 100 - document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)] + # Execute the query to get the result set + documents = list(documents_query) + if len(documents) == 0: + return + chunk_size = 10 + document_chunks = [documents[i:i+chunk_size] + for i in range(0, len(documents), chunk_size)] - with Pool() as pool: - pool.map(build_index_chunk, document_chunks) + with Pool() as pool: + pool.map(build_index_chunk, document_chunks) def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str): @@ -66,9 +72,10 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str if i + n >= len(corpus): break gram += corpus[i+n] + ' ' - gram = gram.rstrip().lower() - print(gram) - + gram = gram.strip().lower() + if len(gram) > 4000: + i += 1 + continue ngram = session.query(NGrams).filter_by(gram=gram).first() if ngram is None: ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram) @@ -76,7 +83,7 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str document_ngram = Document_NGrams( document_id=document_id, ngram_id=ngram.id) session.add(document_ngram) - session.commit() + # session.commit() i += 1 diff --git a/src/search.py b/src/search.py index f77927b..0dedf77 100755 --- a/src/search.py +++ b/src/search.py @@ 
-50,38 +50,51 @@ def search(query): session = Session() results = {} query_words = split_query(unquote(query)) - for a in query_words['ands']: - query = session.query(Documents.url, func.count(1)). \ - join(Document_Tokens, Documents.id == Document_Tokens.document_id).\ - join(Tokens, Document_Tokens.token_id == Tokens.id).\ - filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\ - group_by(Documents.url).\ - having(func.count(distinct(Document_Tokens.token_id)) == 2).\ - order_by(func.count(1).desc()) - for result in query.all(): - if result[0] in results.keys(): - results[result[0]] += result[1] - else: - results[result[0]] = result[1] - x = session.query(NGrams).filter( - NGrams.gram.in_(query_words['ngrams'])).all() + if len(query_words['ands']) > 0: + for a in query_words['ands']: + query = session.query(Documents.url, func.count(1)). \ + join(Document_Tokens, Documents.id == Document_Tokens.document_id).\ + join(Tokens, Document_Tokens.token_id == Tokens.id).\ + filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\ + group_by(Documents.url).\ + having(func.count(distinct(Document_Tokens.token_id)) == 2).\ + order_by(func.count(1).desc()) + for result in query.all(): + if result[0] in results.keys(): + results[result[0]] += result[1] + else: + results[result[0]] = result[1] + if len(query_words['ngrams']) > 0: + print('entering ngrams: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") - for y in x: - print(y.gram) - for document_ngram in y.document_ngrams: - if document_ngram.document.url in results.keys(): - results[document_ngram.document.url] += 1 - else: - results[document_ngram.document.url] = 1 + q = session.query(NGrams) + for ngram in query_words['ngrams']: + q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram) + print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") - x = session.query(Tokens).filter( - Tokens.token.in_(query_words['words'])).limit(1000) - for y in x: - for document_token in y.document_tokens: - if document_token.document.url in results.keys(): - results[document_token.document.url] += 1 - else: - results[document_token.document.url] = 1 + x = q.all() + for y in x: + for document_ngram in y.document_ngrams: + if document_ngram.document.url in results.keys(): + results[document_ngram.document.url] += 1 + else: + results[document_ngram.document.url] = 1 + print('exiting ngrams: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + if len(query_words['words']) > 0: + print('entering words: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + x = session.query(Tokens).filter( + Tokens.token.in_(query_words['words'])).limit(1000) + for y in x: + for document_token in y.document_tokens: + if document_token.document.url in results.keys(): + results[document_token.document.url] += 1 + else: + results[document_token.document.url] = 1 + print('exiting words: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") print(str((time.time_ns() - start_time) // 1_000_000) + "ms") return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10] From bdb4064acce0d3a204d73acd4d96e8ce39bfae32 Mon Sep 17 00:00:00 2001 From: rmgr Date: Sat, 4 May 2024 21:10:46 +0930 Subject: [PATCH 11/15] Rework ngram generation. Greatly improve performance of indexer. 
Commit horrendous sql sins --- src/crawl.py | 10 ++++- src/index.py | 115 +++++++++++++++++++++++++++++++++++++------------- src/models.py | 15 ++++++- src/search.py | 59 +++++++++++++++++--------- todo | 13 +++--- 5 files changed, 155 insertions(+), 57 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index c62f4a9..467b434 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -20,6 +20,9 @@ engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) +excluded_domains = ['amazon.', 'news.ycombinator.', + 'facebook.com', 'amzn', 'fb.com'] + def get_html(url: str) -> str: response = requests.get(url) @@ -36,6 +39,7 @@ def parse_youtube(video_url: str) -> bool: 'allsubtitles': True, 'skip_download': True, # We only want to fetch metadata 'subtitleslangs': [subtitle_language] if subtitle_language else None, + 'extractor-args': {'youtube': {'player_client': 'ios,web'}}, } # Initialize youtube_dl object @@ -132,6 +136,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo continue if "http" not in link: link = urljoin(url, link) + link = link.split('?')[0] + link = link.split('#')[0] if (recursion > 0 and link not in traversed_links): try: traversed_links.append(link) @@ -156,8 +162,10 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("url", help="URL of the webpage to be crawled") parser.add_argument('-f', "--followlinks", action="store_true") - max_recursion = 4 + parser.add_argument('-r', "--max-recursion", help="", type=int, default=1) + args = parser.parse_args() + max_recursion = int(args.max_recursion) if args.url == "links": with open('data/links.txt', 'r+') as linksfile: while line := linksfile.readline(): diff --git a/src/index.py b/src/index.py index 4629c75..542424c 100644 --- a/src/index.py +++ b/src/index.py @@ -1,51 +1,55 @@ #!/usr/bin/python3 import argparse -from sqlalchemy import create_engine, or_ +from sqlalchemy import create_engine, or_, text +from sqlalchemy import Table, Column, String, Integer from config import DATABASE_URI +from sqlalchemy.dialects.postgresql import UUID from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams from sqlalchemy.orm import sessionmaker +from sqlalchemy.exc import SQLAlchemyError import uuid import datetime +import time import re +import random from multiprocessing import Pool engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) +# https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html + + +def contains_latin(text): + latin_pattern = r'[a-zA-ZÀ-ÖØ-öø-ÿ]' + return bool(re.search(latin_pattern, text)) def build_index_chunk(document_chunk): session = Session() print(len(document_chunk)) + start_time = time.time_ns() for document in document_chunk: print(document.url) - content = re.sub(r'[^\w\s]', '', str(document.text_content)) + content = re.sub(r'[.,?!]', ' ', str(document.text_content)) + content = re.sub(r'[^\w\s]', '', str(content)) content_words = content.split() - build_ngrams(2, content_words, session, document.id) - build_ngrams(3, content_words, session, document.id) - build_ngrams(4, content_words, session, document.id) - build_ngrams(5, content_words, session, document.id) - for word in content_words: - word = word.lower() - if len(word) > 50: - continue - token = session.query(Tokens).filter_by(token=word).first() - if token is None: - token = Tokens(token=word, id=uuid.uuid4()) - session.add(token) - document_token = Document_Tokens( - 
document_id=document.id, token_id=token.id) - session.add(document_token) + build_ngrams(1, content_words, document.id) + build_ngrams(2, content_words, document.id) + build_ngrams(3, content_words, document.id) + build_ngrams(4, content_words, document.id) + build_ngrams(5, content_words, document.id) + document.last_index_date = datetime.datetime.now() - session.add(document) + session.merge(document) session.commit() session.close() def build_index(): - session = Session() while True: + session = Session() documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_( None), Documents.last_index_date < Documents.last_crawl_date)).limit(100) session.close() @@ -54,16 +58,62 @@ def build_index(): documents = list(documents_query) if len(documents) == 0: return + build_index_chunk(documents) + continue chunk_size = 10 document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)] - with Pool() as pool: pool.map(build_index_chunk, document_chunks) -def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str): +def zip_ngrams(size: int, corpus, document_id): + size = int(size) + connection = engine.connect() + temptbl_name = 'temp_del_{}'.format(random.randint(100000, 9999999)) + temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column( + 'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True) + + try: + # Start transaction + with connection.begin(): + temptbl.create(engine) + insert_grams = [] + grams = zip(*[corpus[i:] for i in range(size)]) + for gram in grams: + gram = ' '.join(gram).lower() + insert_grams.append( + {"id": uuid.uuid4(), "gram": gram, "size": size}) + connection.execute(temptbl.insert().values(insert_grams)) + connection.execute(text("UPDATE " + temptbl_name + + " SET id = ngrams.id FROM ngrams WHERE ngrams.gram = " + + temptbl_name + ".gram;")) + connection.execute(text("INSERT INTO ngrams (id, gram, size) SELECT " + + " distinct t.id, t.gram as gram, t.size FROM " + + temptbl_name + " t LEFT JOIN ngrams on ngrams.gram = " + + "t.gram WHERE ngrams.id is null and t.size is not null " + " ON CONFLICT DO NOTHING;")) + connection.execute(text("INSERT INTO document_ngrams(id, document_id, ngram_id) SELECT DISTINCT " + + "uuid_generate_v4() , '" + str(document_id) + "'::UUID, t.id FROM " + temptbl_name + " t;")) + except SQLAlchemyError as e: + # Handle exceptions + print("An error occurred:", e) + # Rollback transaction + connection.rollback() + else: + # Commit transaction if no exceptions occurred + connection.commit() + finally: + connection.close() + # Drop table outside the transaction block + temptbl.drop(engine) + + +def build_ngrams(size: int, corpus: str, document_id: str): + session = Session() + zip_ngrams(size, corpus, document_id) + return i = 0 + grams = [] while i < len(corpus): if i + size >= len(corpus): i = len(corpus) @@ -73,18 +123,23 @@ def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str break gram += corpus[i+n] + ' ' gram = gram.strip().lower() - if len(gram) > 4000: + if len(gram) > 1000 or gram in grams or not contains_latin(gram): i += 1 continue - ngram = session.query(NGrams).filter_by(gram=gram).first() - if ngram is None: - ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram) - session.add(ngram) - document_ngram = Document_NGrams( - document_id=document_id, ngram_id=ngram.id) - session.add(document_ngram) - # session.commit() + grams.append(gram) + if (len(gram) > 1): + 
ngram = session.query(NGrams).filter_by( + gram=gram).filter_by(size=size).first() + if ngram is None: + ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram) + session.add(ngram) + document_ngram = Document_NGrams( + document_id=document_id, ngram_id=ngram.id) + session.add(document_ngram) + session.commit() i += 1 +# print(str((time.time_ns() - start_time)//1_000_000)) + session.close() if __name__ == "__main__": diff --git a/src/models.py b/src/models.py index c73ea7d..50010b6 100644 --- a/src/models.py +++ b/src/models.py @@ -32,6 +32,12 @@ class Document_Tokens(Base): document = relationship( "Documents", back_populates="document_tokens", uselist=False) token = relationship("Tokens", back_populates="document_tokens") + __table_args__ = ( + Index('idx_document_tokens_document_id_token_id', 'document_id', + 'token_id', unique=True, postgresql_using='hash'), + Index('idx_document_tokens_clustered', 'document_id', + 'token_id', postgresql_using='hash'), + ) class Tokens(Base): @@ -53,9 +59,14 @@ class Document_NGrams(Base): __tablename__ = 'document_ngrams' id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) document_id = mapped_column(ForeignKey("documents.id")) - # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) ngram_id = mapped_column(ForeignKey("ngrams.id")) - # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) document = relationship( "Documents", back_populates="document_ngrams", uselist=False) ngram = relationship("NGrams", back_populates="document_ngrams") + + __table_args__ = ( + Index('idx_document_ngrams_document_id_ngram_id', 'document_id', + 'ngram_id', unique=True, postgresql_using='hash'), + Index('idx_document_ngrams_clustered', 'document_id', + 'ngram_id', postgresql_using='hash'), + ) diff --git a/src/search.py b/src/search.py index 0dedf77..6033e60 100755 --- a/src/search.py +++ b/src/search.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 from sqlalchemy import create_engine, func from config import DATABASE_URI -from models import Base, Tokens, Documents, Document_Tokens, NGrams +from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams from sqlalchemy.orm import sessionmaker from sqlalchemy.sql.expression import distinct import time @@ -37,9 +37,9 @@ def split_query(query): n += 1 result['ngrams'].append( quoted_query[1:len(quoted_query)-2].rstrip()) - i += n + i += n + 1 continue - result['words'].append(query_words[i]) + result['ngrams'].append(query_words[i]) i += 1 return result @@ -50,6 +50,7 @@ def search(query): session = Session() results = {} query_words = split_query(unquote(query)) + print(query_words) if len(query_words['ands']) > 0: for a in query_words['ands']: query = session.query(Documents.url, func.count(1)). 
\ @@ -68,35 +69,55 @@ def search(query): print('entering ngrams: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") - q = session.query(NGrams) + q = session.query(Documents.url, func.count(1)) \ + .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \ + .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \ + .group_by(Documents.url) for ngram in query_words['ngrams']: q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram) print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") - + print(q) x = q.all() - for y in x: - for document_ngram in y.document_ngrams: - if document_ngram.document.url in results.keys(): - results[document_ngram.document.url] += 1 - else: - results[document_ngram.document.url] = 1 + print('query executed: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + print(x) + for result in x: + if result[0] in results.keys(): + results[result[0]] += result[1] + else: + results[result[0]] = result[1] +# for y in x: +# print(y) +# for document_ngram in y.document_ngrams: +# if document_ngram.document.url in results.keys(): +# results[document_ngram.document.url] += 1 +# else: +# results[document_ngram.document.url] = 1 print('exiting ngrams: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") if len(query_words['words']) > 0: print('entering words: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") - x = session.query(Tokens).filter( - Tokens.token.in_(query_words['words'])).limit(1000) - for y in x: - for document_token in y.document_tokens: - if document_token.document.url in results.keys(): - results[document_token.document.url] += 1 - else: - results[document_token.document.url] = 1 + q = session.query(Documents.url, func.count(1)) \ + .join(Document_Tokens, Documents.id == Document_Tokens.document_id) \ + .join(Tokens, Document_Tokens.token_id == Tokens.id) \ + .group_by(Documents.url).filter(Tokens.token.in_(query_words['words'])) + + print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") + print(q) + x = q.all() + print('query executed: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") + for result in x: + if result[0] in results.keys(): + results[result[0]] += result[1] + else: + results[result[0]] = result[1] print('exiting words: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") print(str((time.time_ns() - start_time) // 1_000_000) + "ms") + session.close() return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10] # @app.route("/search/") diff --git a/todo b/todo index 2c7e8cc..328320b 100644 --- a/todo +++ b/todo @@ -1,6 +1,9 @@ -[ ] Refactor website table to generic document table (maybe using URN instead of URL?) -[ ] Define tokens table FKed to document table -[ ] Refactor index.py to tokenize input into tokens table -[ ] Define N-Grams table -[ ] Add N-Gram generation to index.py +[x] Refactor website table to generic document table (maybe using URN instead of URL?) +[x] Define tokens table FKed to document table +[x] Refactor index.py to tokenize input into tokens table +[x] Define N-Grams table +[x] Add N-Gram generation to index.py +[x] Add clustered index to document_ngrams table model +[x] Add clustered index to document_tokens table model +[ ] Add ddl command to create partition tables From 98efe9d1a2cb5fe913f4ca7bf8719c872793cdbe Mon Sep 17 00:00:00 2001 From: rmgr Date: Sun, 5 May 2024 19:06:56 +0930 Subject: [PATCH 12/15] Fix temp table being randomly dropped due to name collision. 
Fix multi-word non-phrase search --- src/index.py | 3 ++- src/search.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/index.py b/src/index.py index 542424c..679d312 100644 --- a/src/index.py +++ b/src/index.py @@ -70,7 +70,8 @@ def build_index(): def zip_ngrams(size: int, corpus, document_id): size = int(size) connection = engine.connect() - temptbl_name = 'temp_del_{}'.format(random.randint(100000, 9999999)) + temptbl_name = 'temp_del_{}'.format( + time.time_ns() + random.randint(100000, 9999999)) temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column( 'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True) diff --git a/src/search.py b/src/search.py index 6033e60..d643eb2 100755 --- a/src/search.py +++ b/src/search.py @@ -1,5 +1,5 @@ #!/usr/bin/python3 -from sqlalchemy import create_engine, func +from sqlalchemy import create_engine, func, and_, or_ from config import DATABASE_URI from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams from sqlalchemy.orm import sessionmaker @@ -73,8 +73,14 @@ def search(query): .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \ .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \ .group_by(Documents.url) + conditions = [] for ngram in query_words['ngrams']: - q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram) + conditions.append( + (NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram)) +# q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram) + and_conditions = [and_(*condition_pair) + for condition_pair in conditions] + q = q.filter(or_(*and_conditions)) print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") print(q) x = q.all() From e3c67b64e63762e50899390afa7f5d4cca3d6c17 Mon Sep 17 00:00:00 2001 From: rmgr Date: Sat, 8 Jun 2024 20:24:21 +0930 Subject: [PATCH 13/15] Make excluded file types more robust --- src/crawl.py | 10 +++++++--- todo | 2 ++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/crawl.py b/src/crawl.py index 467b434..6966a25 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -23,6 +23,9 @@ Session = sessionmaker(bind=engine) excluded_domains = ['amazon.', 'news.ycombinator.', 'facebook.com', 'amzn', 'fb.com'] +excluded_filetypes = [".jpg", ".xml", ".mp4", + ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"] + def get_html(url: str) -> str: response = requests.get(url) @@ -85,6 +88,9 @@ def parse_youtube(video_url: str) -> bool: def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool: + for domain in excluded_domains: + if domain in url: + return if "youtube.com" in url: parse_youtube(url) return @@ -124,15 +130,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo s.add(existing_website) s.commit() s.close() - x = open(f'data/links.txt', 'a') - x.close() links = soup.find_all("a", href=True) for link in links: found = False link = link["href"] if (len(link) > 0 and link[0] == "#") or "localhost" in link: continue - if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link: + if any(ext in link for ext in excluded_filetypes): continue if "http" not in link: link = urljoin(url, link) diff --git a/todo b/todo index 328320b..2f5f3e5 100644 --- a/todo +++ b/todo @@ -6,4 +6,6 @@ [x] Add clustered index to document_ngrams table model [x] Add clustered index to document_tokens table model [ ] 
Add ddl command to create partition tables +[ ] Investigate whether or not robots.txt is as aggressive as I'm making ito ut to be +[ ] Instead of starting from a random page on the site, go to root and find site map and crawl that From 2a99a61dbe098774067d47f92e76c16175d8dedf Mon Sep 17 00:00:00 2001 From: rmgr Date: Sat, 8 Jun 2024 20:43:05 +0930 Subject: [PATCH 14/15] Add site map crawl option --- src/crawl.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/crawl.py b/src/crawl.py index 6966a25..0816e1b 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -166,6 +166,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("url", help="URL of the webpage to be crawled") parser.add_argument('-f', "--followlinks", action="store_true") + parser.add_argument('-s', "--crawl-sitemap", action="store_true") parser.add_argument('-r', "--max-recursion", help="", type=int, default=1) args = parser.parse_args() @@ -178,7 +179,21 @@ if __name__ == "__main__": parse_html(line, get_html(line)) except: pass - + elif args.crawl_sitemap: + rp = urllib.robotparser.RobotFileParser() + urlparts = urlparse(args.url) + baseurl = urlparts.scheme + "://" + urlparts.netloc + rp.set_url(baseurl + "/robots.txt") + rp.read() + if not rp.can_fetch("*", args.url): + print("Robots prevents crawling url: " + args.url) + exit(0) + if len(rp.site_maps()) > 0: + map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml') + for loc in map.find_all('loc'): + url = loc.contents[0] + html = get_html(url) + parse_html(url, html, max_recursion) else: html = get_html(args.url) parse_html(args.url, html, max_recursion) From bbba459480971d678880584efd7871d7acc5328d Mon Sep 17 00:00:00 2001 From: rmgr Date: Sun, 9 Jun 2024 21:53:57 +0930 Subject: [PATCH 15/15] Clean up site map scanning. Return all results instead of 10 --- client/src/css/styles.css | 33 +++++++++++++++++ client/src/index.html | 16 +++++++++ client/src/js/index.js | 28 +++++++++++++++ src/crawl.py | 42 +++++++++++++--------- src/search.py | 74 +++++++++++++++++++-------------------- todo | 4 +-- 6 files changed, 140 insertions(+), 57 deletions(-) create mode 100644 client/src/css/styles.css create mode 100644 client/src/index.html create mode 100644 client/src/js/index.js diff --git a/client/src/css/styles.css b/client/src/css/styles.css new file mode 100644 index 0000000..37323ab --- /dev/null +++ b/client/src/css/styles.css @@ -0,0 +1,33 @@ +html, body { + height: 100%; +} +body { + margin: 0; +} +input { + padding: 7px; + font-size: 1.1rem; +} +.search-container { + display: flex; + justify-content: center; + align-items: center; + text-align: center; + min-height: 25vh; +} + +.flex-container { + padding: 0; + margin: 0; + display: flex; + align-items: center; + justify-content: center; + flex-direction: column; +} +.flex-item { +} +.result { + display:block; + max-width: 60vw; + overflow-x: hidden; +} diff --git a/client/src/index.html b/client/src/index.html new file mode 100644 index 0000000..a748d6c --- /dev/null +++ b/client/src/index.html @@ -0,0 +1,16 @@ + + + + + + +
+ +
+
+
+
+
+ + + diff --git a/client/src/js/index.js b/client/src/js/index.js new file mode 100644 index 0000000..09b0bb2 --- /dev/null +++ b/client/src/js/index.js @@ -0,0 +1,28 @@ +function debounce(func, timeout = 300){ + let timer; + return (...args) => { + clearTimeout(timer); + timer = setTimeout(() => { func.apply(this, args); }, timeout); + }; +} +async function search(searchBox){ + const response = await fetch(`http://localhost:5000/search/${searchBox.value}`); + const results = await response.json(); + + const resultView = document.getElementById("results"); + resultView.replaceChildren(); + for (let i = 0; i < results.length; i++){ + let result = results[i]; + let resultElement = document.createElement("a"); + resultElement.innerText = result[0]; + resultElement.href = result[0]; + resultElement.className = "flex-item result"; + resultView.appendChild(resultElement); + } +} + +const searchBoxKeyUp = debounce(() => search()) + +const searchBox = document.getElementById("searchbox"); + +searchBox.addEventListener("keyup", debounce(() => search(searchBox))) diff --git a/src/crawl.py b/src/crawl.py index 0816e1b..1480b4e 100755 --- a/src/crawl.py +++ b/src/crawl.py @@ -2,7 +2,6 @@ import argparse import requests -import hashlib from urllib.parse import urlparse, urljoin import urllib.robotparser import os @@ -10,7 +9,7 @@ from time import sleep from bs4 import BeautifulSoup from sqlalchemy import create_engine from config import DATABASE_URI -from models import Base, Documents, Document_Tokens +from models import Base, Documents from sqlalchemy.orm import sessionmaker import datetime import yt_dlp as youtube_dl @@ -23,7 +22,7 @@ Session = sessionmaker(bind=engine) excluded_domains = ['amazon.', 'news.ycombinator.', 'facebook.com', 'amzn', 'fb.com'] -excluded_filetypes = [".jpg", ".xml", ".mp4", +excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db", ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"] @@ -33,6 +32,7 @@ def get_html(url: str) -> str: def parse_youtube(video_url: str) -> bool: + return # Language preference for subtitles (set to None for auto-generated) # Change this to 'en' for English subtitles, or None for auto-generated subtitle_language = 'en' @@ -91,6 +91,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo for domain in excluded_domains: if domain in url: return + if any(ext in url for ext in excluded_filetypes): + return if "youtube.com" in url: parse_youtube(url) return @@ -110,8 +112,6 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo return soup = BeautifulSoup(html, 'html.parser') - hash = hashlib.sha256() - hash.update(url.encode('ascii')) s = Session() existing_website = s.query(Documents).filter_by(url=url).first() @@ -151,13 +151,25 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo parse_html(link, link_html, r, traversed_links) except: pass - elif link not in traversed_links: - with open('data/links.txt', 'r+') as linksfile: - while line := linksfile.readline(): - if line.strip() == link.strip(): - found = True - if not found: - linksfile.write(f'{link}\n') +# elif link not in traversed_links: +# with open('data/links.txt', 'r+') as linksfile: +# while line := linksfile.readline(): +# if line.strip() == link.strip(): +# found = True +# if not found: +# linksfile.write(f'{link}\n') + + +def parse_site_map(base_url): + map = BeautifulSoup(requests.get(base_url).content, 'xml') + print(map.find_all('loc')) + for loc in map.find_all('loc'): + if "xml" in 
loc.contents[0]: + parse_site_map(loc.contents[0]) + else: + url = loc.contents[0] + html = get_html(url) + parse_html(url, html, max_recursion) if __name__ == "__main__": @@ -189,11 +201,7 @@ if __name__ == "__main__": print("Robots prevents crawling url: " + args.url) exit(0) if len(rp.site_maps()) > 0: - map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml') - for loc in map.find_all('loc'): - url = loc.contents[0] - html = get_html(url) - parse_html(url, html, max_recursion) + parse_site_map(rp.site_maps()[0]) else: html = get_html(args.url) parse_html(args.url, html, max_recursion) diff --git a/src/search.py b/src/search.py index d643eb2..fd013bc 100755 --- a/src/search.py +++ b/src/search.py @@ -1,14 +1,17 @@ #!/usr/bin/python3 -from sqlalchemy import create_engine, func, and_, or_ +from sqlalchemy import create_engine, func, and_, or_, not_ from config import DATABASE_URI -from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams +from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams from sqlalchemy.orm import sessionmaker from sqlalchemy.sql.expression import distinct import time from flask import Flask +from flask_cors import CORS +from flask import send_from_directory from urllib.parse import unquote -app = Flask(__name__) +app = Flask(__name__, static_url_path='/static/') +CORS(app) engine = create_engine(DATABASE_URI) Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) @@ -16,7 +19,9 @@ Session = sessionmaker(bind=engine) def split_query(query): - result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []} + query = query.lower() + result = {'ands': [], 'ors': [], 'words': [], + 'ngrams': [], 'exclusions': []} query_words = query.split() i = 0 while i < len(query_words): @@ -39,6 +44,11 @@ def split_query(query): quoted_query[1:len(quoted_query)-2].rstrip()) i += n + 1 continue + elif query_words[i][0] == "-": + excluded_query = query_words[i][1: len(query_words[i])] + result['exclusions'].append(excluded_query) + i += 1 + continue result['ngrams'].append(query_words[i]) i += 1 return result @@ -52,19 +62,26 @@ def search(query): query_words = split_query(unquote(query)) print(query_words) if len(query_words['ands']) > 0: + print('entering ands: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") for a in query_words['ands']: query = session.query(Documents.url, func.count(1)). \ - join(Document_Tokens, Documents.id == Document_Tokens.document_id).\ - join(Tokens, Document_Tokens.token_id == Tokens.id).\ - filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\ - group_by(Documents.url).\ - having(func.count(distinct(Document_Tokens.token_id)) == 2).\ + join(Document_NGrams, Documents.id == Document_NGrams.document_id). \ + join(NGrams, Document_NGrams.ngram_id == NGrams.id). \ + filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\ + group_by(Documents.url). \ + having(func.count(distinct(Document_NGrams.ngram_id)) == 2). 
\ order_by(func.count(1).desc()) + +# limit(100) + print(query) for result in query.all(): if result[0] in results.keys(): results[result[0]] += result[1] else: results[result[0]] = result[1] + print('exiting ands: ' + + str((time.time_ns() - start_time) // 1_000_000) + "ms") if len(query_words['ngrams']) > 0: print('entering ngrams: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") @@ -83,7 +100,7 @@ def search(query): q = q.filter(or_(*and_conditions)) print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") print(q) - x = q.all() + x = q.limit(100).all() print('query executed: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") print(x) @@ -101,30 +118,11 @@ def search(query): # results[document_ngram.document.url] = 1 print('exiting ngrams: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") - if len(query_words['words']) > 0: - print('entering words: ' + - str((time.time_ns() - start_time) // 1_000_000) + "ms") - q = session.query(Documents.url, func.count(1)) \ - .join(Document_Tokens, Documents.id == Document_Tokens.document_id) \ - .join(Tokens, Document_Tokens.token_id == Tokens.id) \ - .group_by(Documents.url).filter(Tokens.token.in_(query_words['words'])) - - print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms") - print(q) - x = q.all() - print('query executed: ' + - str((time.time_ns() - start_time) // 1_000_000) + "ms") - for result in x: - if result[0] in results.keys(): - results[result[0]] += result[1] - else: - results[result[0]] = result[1] - print('exiting words: ' + - str((time.time_ns() - start_time) // 1_000_000) + "ms") print(str((time.time_ns() - start_time) // 1_000_000) + "ms") session.close() - return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10] + return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())] + # @app.route("/search/") # def search(query): @@ -132,17 +130,17 @@ def search(query): # session = Session() # result = {} # query_words = unquote(query).split() -# x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000) +# x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000) # for word in query_words: # word = word.lower() -# matching_token = session.query(Tokens).filter_by(token=word).first() +# matching_ngram = session.query(NGrams).filter_by(ngram=word).first() # -# if matching_token is None: +# if matching_ngram is None: # continue -# for document_token in matching_token.document_tokens: -# if document_token.document.url in result.keys(): -# result[document_token.document.url] += 1 +# for document_ngram in matching_ngram.document_ngrams: +# if document_ngram.document.url in result.keys(): +# result[document_ngram.document.url] += 1 # else: -# result[document_token.document.url] = 1 +# result[document_ngram.document.url] = 1 # print(str((time.time_ns() - start_time) // 1_000_000) + "ms") # return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10] diff --git a/todo b/todo index 2f5f3e5..ddda3bd 100644 --- a/todo +++ b/todo @@ -6,6 +6,6 @@ [x] Add clustered index to document_ngrams table model [x] Add clustered index to document_tokens table model [ ] Add ddl command to create partition tables -[ ] Investigate whether or not robots.txt is as aggressive as I'm making ito ut to be -[ ] Instead of starting from a random page on the site, go to root and find site map and crawl that +[x] Investigate whether or not robots.txt is as aggressive as I'm making ito ut to be +[x] Instead of starting 
from a random page on the site, go to root and find site map and crawl that
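
Note on the models.py index hunk: the new __table_args__ on Document_Tokens and Document_NGrams declare two-column, unique indexes with postgresql_using='hash'. PostgreSQL hash indexes are single-column only and cannot enforce uniqueness, so those CREATE INDEX statements will be rejected when the metadata is created; PostgreSQL also has no self-maintaining clustered indexes (CLUSTER is a one-off physical reorder). A minimal sketch of the same composite, unique lookup with the default btree access method follows; the model name and plain UUID columns are stand-ins, not the real models.py definitions.

    import uuid
    from sqlalchemy import Column, Index
    from sqlalchemy.dialects.postgresql import UUID
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()

    class DocumentTokensSketch(Base):            # stand-in for Document_Tokens
        __tablename__ = 'document_tokens_sketch'
        id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
        document_id = Column(UUID(as_uuid=True))
        token_id = Column(UUID(as_uuid=True))
        __table_args__ = (
            # btree (the default) supports composite and unique indexes;
            # a single two-column unique index also covers plain lookups on
            # document_id, so a separate non-unique index is not needed.
            Index('idx_document_tokens_document_id_token_id',
                  'document_id', 'token_id', unique=True),
        )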
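
Note on patch 12 (zip_ngrams): deriving the temp table name from time.time_ns() plus a random integer shrinks the collision window but does not close it for concurrent indexers. A uuid4-based suffix is collision-free in practice; the sketch below reuses the column layout from src/index.py and imports Base from models as the other modules do.

    import uuid
    from sqlalchemy import Table, Column, String, Integer
    from sqlalchemy.dialects.postgresql import UUID
    from models import Base  # declarative Base from src/models.py

    # One name per call, no dependence on clock resolution or RNG overlap.
    temptbl_name = 'temp_del_{}'.format(uuid.uuid4().hex)
    temptbl = Table(temptbl_name, Base.metadata,
                    Column('id', UUID(as_uuid=True), index=True),
                    Column('gram', String, index=True),
                    Column('size', Integer, index=True),
                    extend_existing=True)

A session-scoped temporary table (Table(..., prefixes=['TEMPORARY'])) would be another way to keep the name private to each connection, assuming the rest of zip_ngrams runs on a single connection.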
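
Note on the excluded file types (patches 13 and 15): the substring test any(ext in link ...) also rejects URLs that merely contain an extension somewhere in the path or query string (for example a path segment containing ".db"). Matching against the end of the parsed path is slightly stricter; a small sketch with the same extension list, where has_excluded_extension is a hypothetical helper name:

    from urllib.parse import urlparse

    # Extension list as in src/crawl.py (patch 15).
    excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
                          ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]

    def has_excluded_extension(link: str) -> bool:
        # Only look at the path component, and only at its suffix.
        path = urlparse(link).path.lower()
        return any(path.endswith(ext) for ext in excluded_filetypes)

    # In the crawl loop:  if has_excluded_extension(link): continue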
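
Note on the --crawl-sitemap branch: it calls len(rp.site_maps()) unconditionally, but urllib.robotparser.RobotFileParser.site_maps() returns None when robots.txt declares no Sitemap entries, so a site without one raises TypeError instead of falling back to a normal crawl. A guarded sketch; find_sitemaps is a hypothetical helper, and the commented fallback mirrors the existing else branch in __main__.

    import urllib.robotparser
    from urllib.parse import urlparse

    def find_sitemaps(url: str) -> list:
        rp = urllib.robotparser.RobotFileParser()
        parts = urlparse(url)
        rp.set_url(parts.scheme + "://" + parts.netloc + "/robots.txt")
        rp.read()
        # site_maps() may return None; normalise to an empty list.
        return rp.site_maps() or []

    # sitemaps = find_sitemaps(args.url)
    # if sitemaps:
    #     parse_site_map(sitemaps[0])
    # else:
    #     parse_html(args.url, get_html(args.url), max_recursion)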
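
Note on the rewritten 'ands' branch in search(): it assumes exactly two comma-separated terms, slicing a.split(',')[0] and [1] and hard-coding having(... == 2). Counting distinct matched grams against the number of requested terms handles any length; the sketch below assumes the enclosing search() scope (session, a) and otherwise only uses imports already present in src/search.py.

    from sqlalchemy import func
    from sqlalchemy.sql.expression import distinct
    from models import Documents, Document_NGrams, NGrams

    # Require a document to match every comma-separated term.
    terms = [t.strip() for t in a.split(',') if t.strip()]
    query = session.query(Documents.url, func.count(1)) \
        .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
        .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
        .filter(NGrams.gram.in_(terms)) \
        .group_by(Documents.url) \
        .having(func.count(distinct(Document_NGrams.ngram_id)) == len(terms)) \
        .order_by(func.count(1).desc())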
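
Note on the "-term" exclusions added in patch 15: split_query() collects them into query_words['exclusions'] and search.py now imports not_, but the hunks shown never filter on them. One way to honour exclusions against the same models, assuming the q, session and query_words names from search(); materialising the excluded ids keeps the sketch simple, while a not_() / NOT EXISTS subquery would avoid the extra round trip on a large corpus.

    if query_words['exclusions']:
        # Any document containing an excluded gram is dropped from q.
        excluded_ids = [row[0] for row in
                        session.query(Document_NGrams.document_id)
                               .join(NGrams, Document_NGrams.ngram_id == NGrams.id)
                               .filter(NGrams.gram.in_(query_words['exclusions']))
                               .all()]
        if excluded_ids:
            q = q.filter(~Documents.id.in_(excluded_ids))

Separately, sorted() already returns every result now that the [:10] slice is gone, so the trailing [:len(results.items())] slice is a no-op and could simply be dropped.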