From 20d198e5595f33244d7a364ab2538e983dd8ab71 Mon Sep 17 00:00:00 2001
From: rmgr
Date: Thu, 7 Mar 2024 20:44:34 +1030
Subject: [PATCH] Refactor to use postgresql end to end

---
 src/crawl.py     |  9 +++---
 src/index.py     | 83 ++++++++++++++++++++++++------------------------
 src/index.py.old | 54 +++++++++++++++++++++++++++++++
 src/models.py    | 26 ++++++++++++---
 src/search.py    | 44 ++++++++++++-------------
 5 files changed, 144 insertions(+), 72 deletions(-)
 create mode 100644 src/index.py.old

diff --git a/src/crawl.py b/src/crawl.py
index 9521b5d..3856300 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -9,7 +9,7 @@ from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Website
+from models import Base, Documents, Document_Tokens, Tokens
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy import create_engine
 import datetime
@@ -19,11 +19,12 @@ engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
 
 
-def get_html(url: str) -> str:
+def get_html(url: str) -> str: 
     response = requests.get(url)
     return response.content
 
+
 def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
     rp = urllib.robotparser.RobotFileParser()
     print(url)
@@ -45,10 +46,10 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     hash.update(url.encode('ascii'))
 
     s = Session()
-    existing_website = s.query(Website).filter_by(url=url).first()
+    existing_website = s.query(Documents).filter_by(url=url).first()
     print (existing_website)
     if existing_website == None:
-        website = Website(
+        website = Documents(
             url=url,
             text_content=soup.get_text(),
             html_content=soup.prettify(),
diff --git a/src/index.py b/src/index.py
index e04c787..c80b5e7 100644
--- a/src/index.py
+++ b/src/index.py
@@ -1,54 +1,53 @@
+#!/usr/bin/python3
+import argparse
+import requests
+import hashlib
+from urllib.parse import urlparse, urljoin
+import urllib.robotparser
+import os
+from time import sleep
+from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Website
-from pathlib import Path
-import argparse
-import os
-import json
-# investigate ngrams for "multi word" matching
-ignored_words = ['a', 'the','is']
+from models import Base, Documents, Document_Tokens, Tokens
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import create_engine
+import datetime
 
-def remove_punctuation(input_string):
-    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
-    for p in punc:
-        input_string = input_string.replace(p, '')
-    return input_string
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
 
 
 def build_index():
-    with open(f"data/index.json", "w") as index:
-        # get a list of all content files
-        # split on whitespace and add to index
-        dictionary = {}
-        pathlist = Path('data/content').rglob('*.txt')
-        for path in pathlist:
-            with open(str(path)) as content_file:
-                url = content_file.readline()
-                content = content_file.read()
-                content_words = content.split()
-                for word in content_words:
-                    word = word.lower()
-                    word = remove_punctuation(word)
-                    if not word in ignored_words:
-                        if not word in dictionary:
-                            dictionary[word] = []
-                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
-                        if len(matching_urls) == 0:
-#                        if not url.strip() in dictionary[word]:
-                            entries = dictionary[word]
-                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
-                            dictionary[word].append(entry)
-                        else:
-                            entries = dictionary[word]
-                            entry = matching_urls[0]
-                            entry["count"] += 1
-                            entries.sort(reverse=True, key=lambda entry: entry["count"])
-        index.write(json.dumps(dictionary))
+    session = Session()
+    # Read list of 1000 documents from db
+    documents = session.query(Documents).limit(1000)
+    for document in documents:
+        print(document.url)
+        content_words = document.text_content.split()
+        for word in content_words:
+            word = word.lower()
+            token = session.query(Tokens).filter_by(token=word).first()
+            if token is None:
+                token = Tokens(token=word)
+                session.add(token)
+            document_token = Document_Tokens(document=document, token=token)
+            session.add(document_token)
+        session.commit()
+
+    # Foreach document, break into words
+    # Check if word exists in database
+    # Create if not exist
+    # Link to document
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    parser.add_argument('-r',
+                        "--rebuild",
+                        action="store_true",
+                        help="Blow away the index and rebuild")
     args = parser.parse_args()
     if args.rebuild:
         build_index()
-
diff --git a/src/index.py.old b/src/index.py.old
new file mode 100644
index 0000000..6ec8e21
--- /dev/null
+++ b/src/index.py.old
@@ -0,0 +1,54 @@
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
+from pathlib import Path
+import argparse
+import os
+import json
+# investigate ngrams for "multi word" matching
+ignored_words = ['a', 'the','is']
+
+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
+
+
+def build_index():
+    with open("data/index.json", "w") as index:
+        # get a list of all content files
+        # split on whitespace and add to index
+        dictionary = {}
+        pathlist = Path('data/content').rglob('*.txt')
+        for path in pathlist:
+            with open(str(path)) as content_file:
+                url = content_file.readline()
+                content = content_file.read()
+                content_words = content.split()
+                for word in content_words:
+                    word = word.lower()
+                    word = remove_punctuation(word)
+                    if word not in ignored_words:
+                        if word not in dictionary:
+                            dictionary[word] = []
+                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
+                        if len(matching_urls) == 0:
+#                        if not url.strip() in dictionary[word]:
+                            entries = dictionary[word]
+                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
+                            dictionary[word].append(entry)
+                        else:
+                            entries = dictionary[word]
+                            entry = matching_urls[0]
+                            entry["count"] += 1
+                            entries.sort(reverse=True, key=lambda entry: entry["count"])
+        index.write(json.dumps(dictionary))
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    args = parser.parse_args()
+    if args.rebuild:
+        build_index()
+
diff --git a/src/models.py b/src/models.py
index ee768d4..c2c1d07 100644
--- a/src/models.py
+++ b/src/models.py
@@ -1,18 +1,36 @@
 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, Integer, String, DateTime
+from sqlalchemy import Column, String, DateTime, ForeignKey, Index
 from sqlalchemy.dialects.postgresql import UUID
+from sqlalchemy.orm import relationship, mapped_column
 import uuid
 
 Base = declarative_base()
 
 
-class Website(Base):
-    __tablename__ = 'websites'
-    id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4)
+class Documents(Base):
+    __tablename__ = 'documents'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
     url = Column(String)
     text_content = Column(String)
     html_content = Column(String)
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
+    document_tokens = relationship("Document_Tokens", back_populates="document")
 
 
+class Document_Tokens(Base):
+    __tablename__ = 'document_tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token_id = mapped_column(ForeignKey("tokens.id"))
+    #Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document = relationship("Documents", back_populates="document_tokens", uselist=False)
+    token = relationship("Tokens", back_populates="document_tokens")
+
+
+class Tokens(Base):
+    __tablename__ = 'tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token = Column(String, index=True)
+    document_tokens = relationship("Document_Tokens", back_populates="token")
diff --git a/src/search.py b/src/search.py
index 17668f9..b95a83f 100755
--- a/src/search.py
+++ b/src/search.py
@@ -1,30 +1,30 @@
-#!/bin/bash
+#!/usr/bin/python3
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Tokens
+from sqlalchemy.orm import sessionmaker
 from flask import Flask
-from flask import Request
-import json
 from urllib.parse import unquote
 
 app = Flask(__name__)
 
 
-## Todo - Boolean search (AND/OR/NOT/"")
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+# Todo - Boolean search (AND/OR/NOT/"")
+
+
 @app.route("/search/<query>")
 def search(query):
-    with open('data/index.json', 'r') as index_json:
-        index = json.load(index_json)
-        query = unquote(query)
-        query_split = query.split()
-        result = []
-        for q in query_split:
-            q = q.lower()
-            if q in index:
-                for item in index[q]:
-                    matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
-                    if len(matching_results) == 0:
-                        result.append(item)
-                    else:
-                        matching_results[0]["count"] += item["count"]
-    return result
-
-def handle_and():
-    pass
+    session = Session()
+    result = []
+    query_words = unquote(query).split()
+    for word in query_words:
+        word = word.lower()
+        matching_token = session.query(Tokens).filter_by(token=word).first()
+        if matching_token is None:
+            continue
+        for document_token in matching_token.document_tokens:
+            result.append(document_token.document.url)
+    return result
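
Not part of the patch, but worth sketching while the schema is fresh: each Document_Tokens row records one occurrence of a token in one document, so the per-URL "count" that the old data/index.json entries carried is no longer stored anywhere and has to be recovered by aggregating the join table. A minimal sketch of such a query, assuming the Documents/Document_Tokens/Tokens models above and the same DATABASE_URI config; the ranked_urls helper is hypothetical, not something that exists in the repo:

    from sqlalchemy import create_engine, func
    from sqlalchemy.orm import sessionmaker

    from config import DATABASE_URI
    from models import Documents, Document_Tokens, Tokens

    engine = create_engine(DATABASE_URI)
    Session = sessionmaker(bind=engine)


    def ranked_urls(word):
        # Returns (url, occurrence count) rows, most frequent document first.
        session = Session()
        return (
            session.query(Documents.url, func.count(Document_Tokens.id))
            .join(Document_Tokens, Document_Tokens.document_id == Documents.id)
            .join(Tokens, Tokens.id == Document_Tokens.token_id)
            .filter(Tokens.token == word.lower())
            .group_by(Documents.url)
            .order_by(func.count(Document_Tokens.id).desc())
            .all()
        )

Ordering in SQL takes over the job that the entries.sort(reverse=True, key=...) step performed per word in the old JSON index.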
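
Also not part of the patch: a quick way to poke the new /search/<query> route without standing up a server is Flask's built-in test client. A rough sketch, assuming the database behind DATABASE_URI is reachable, index.py --rebuild has been run, and a Flask version (2.2 or newer) that JSON-encodes a list returned from a view:

    # importing search connects to DATABASE_URI and runs Base.metadata.create_all
    from search import app

    with app.test_client() as client:
        # spaces travel URL-encoded inside the path segment
        response = client.get("/search/hello%20world")
        print(response.get_json())  # a list of matching document URLs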