Refactor to use postgresql end to end

rmgr 2024-03-07 20:44:34 +10:30
parent 8605ee6b2c
commit 20d198e559
5 changed files with 144 additions and 72 deletions


@@ -9,7 +9,7 @@ from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Website
+from models import Base, Documents, Document_Tokens, Tokens
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy import create_engine
 import datetime
@@ -19,11 +19,12 @@ engine = create_engine(DATABASE_URI)
 
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
 
+
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
 def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
     rp = urllib.robotparser.RobotFileParser()
     print(url)
@@ -45,10 +46,10 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
     hash.update(url.encode('ascii'))
     s = Session()
-    existing_website = s.query(Website).filter_by(url=url).first()
+    existing_website = s.query(Documents).filter_by(url=url).first()
     print (existing_website)
     if existing_website == None:
-        website = Website(
+        website = Documents(
             url=url,
             text_content=soup.get_text(),
             html_content=soup.prettify(),


@@ -1,54 +1,53 @@
+#!/usr/bin/python3
+import argparse
+import requests
+import hashlib
+from urllib.parse import urlparse, urljoin
+import urllib.robotparser
+import os
+from time import sleep
+from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Website
-from pathlib import Path
-import argparse
-import os
-import json
-
-# investigate ngrams for "multi word" matching
-ignored_words = ['a', 'the','is']
-
-def remove_punctuation(input_string):
-    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
-    for p in punc:
-        input_string = input_string.replace(p, '')
-    return input_string
+from models import Base, Documents, Document_Tokens, Tokens
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import create_engine
+import datetime
+
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
 
 def build_index():
-    with open(f"data/index.json", "w") as index:
-        # get a list of all content files
-        # split on whitespace and add to index
-        dictionary = {}
-        pathlist = Path('data/content').rglob('*.txt')
-        for path in pathlist:
-            with open(str(path)) as content_file:
-                url = content_file.readline()
-                content = content_file.read()
-                content_words = content.split()
-                for word in content_words:
-                    word = word.lower()
-                    word = remove_punctuation(word)
-                    if not word in ignored_words:
-                        if not word in dictionary:
-                            dictionary[word] = []
-                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
-                        if len(matching_urls) == 0:
-                            # if not url.strip() in dictionary[word]:
-                            entries = dictionary[word]
-                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
-                            dictionary[word].append(entry)
-                        else:
-                            entries = dictionary[word]
-                            entry = matching_urls[0]
-                            entry["count"] += 1
-                        entries.sort(reverse=True, key=lambda entry: entry["count"])
-        index.write(json.dumps(dictionary))
+    session = Session()
+    # Read list of 1000 documents from db
+    documents = session.query(Documents).limit(1000)
+    for document in documents:
+        print(document.url)
+        content_words = document.text_content.split()
+        for word in content_words:
+            word = word.lower()
+            token = session.query(Tokens).filter_by(token=word).first()
+            if token is None:
+                token = Tokens(token=word)
+                session.add(token)
+            document_token = Document_Tokens(document_id=document.id, token_id=token.id)
+            session.add(document_token)
+            session.commit()
+    # Foreach document, break into words
+    # Check if word exists in database
+    # Create if not exist
+    # Link to document
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    parser.add_argument('-r',
+                        "--rebuild",
+                        action="store_true",
+                        help="Blow away the index and rebuild")
     args = parser.parse_args()
     if args.rebuild:
         build_index()
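
With this change the on-disk JSON index is gone: build_index() now writes one Tokens row per distinct word and one Document_Tokens row per document/word link. As a rough illustration of what that schema gives you, here is a minimal read-side sketch (not part of the commit) that looks up the documents containing a word; it assumes the same config.DATABASE_URI and the models above, and mirrors what the rewritten Flask search endpoint further down does.

# Sketch only: resolve a word to document URLs via Tokens -> Document_Tokens -> Documents.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import Tokens

engine = create_engine(DATABASE_URI)
Session = sessionmaker(bind=engine)

def urls_for_word(word: str) -> list:
    session = Session()
    token = session.query(Tokens).filter_by(token=word.lower()).first()
    if token is None:
        return []
    # Walk the association rows back to their parent documents.
    return [dt.document.url for dt in token.document_tokens]

print(urls_for_word("python"))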

src/index.py.old (new file)

@@ -0,0 +1,54 @@
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
import json

# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the','is']

def remove_punctuation(input_string):
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
    for p in punc:
        input_string = input_string.replace(p, '')
    return input_string

def build_index():
    with open("data/index.json", "w") as index:
        # get a list of all content files
        # split on whitespace and add to index
        dictionary = {}
        pathlist = Path('data/content').rglob('*.txt')
        for path in pathlist:
            with open(str(path)) as content_file:
                url = content_file.readline()
                content = content_file.read()
                content_words = content.split()
                for word in content_words:
                    word = word.lower()
                    word = remove_punctuation(word)
                    if word not in ignored_words:
                        if word not in dictionary:
                            dictionary[word] = []
                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
                        if len(matching_urls) == 0:
                            # if not url.strip() in dictionary[word]:
                            entries = dictionary[word]
                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                            dictionary[word].append(entry)
                        else:
                            entries = dictionary[word]
                            entry = matching_urls[0]
                            entry["count"] += 1
                        entries.sort(reverse=True, key=lambda entry: entry["count"])
        index.write(json.dumps(dictionary))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
    args = parser.parse_args()
    if args.rebuild:
        build_index()


@@ -1,18 +1,36 @@
 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, Integer, String, DateTime
+from sqlalchemy import Column, String, DateTime, ForeignKey, Index
 from sqlalchemy.dialects.postgresql import UUID
+from sqlalchemy.orm import relationship, mapped_column
 import uuid
 
 Base = declarative_base()
 
-class Website(Base):
-    __tablename__ = 'websites'
+class Documents(Base):
+    __tablename__ = 'documents'
     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
     url = Column(String)
     text_content = Column(String)
     html_content = Column(String)
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
+    document_tokens = relationship("Document_Tokens", back_populates="document")
+
+class Document_Tokens(Base):
+    __tablename__ = 'document_tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token_id = mapped_column(ForeignKey("tokens.id"))
+    #Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document = relationship("Documents", back_populates="document_tokens", uselist=False)
+    token = relationship("Tokens", back_populates="document_tokens")
+
+class Tokens(Base):
+    __tablename__ = 'tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token = Column(String, index=True)
+    document_tokens = relationship("Document_Tokens", back_populates="token")
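
These three models are the heart of the refactor: Documents holds crawled pages, Tokens holds distinct words, and Document_Tokens is the association between them. The write-side sketch below (not part of the commit) shows the same pattern the crawler and indexer above use, condensed into one snippet; it assumes DATABASE_URI points at a reachable PostgreSQL database, since the UUID column type is Postgres-specific, and the example URL and text are placeholders.

# Sketch only: create the tables and link one document to one token.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import Base, Documents, Document_Tokens, Tokens

engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

doc = Documents(url="https://example.com", text_content="hello world", html_content="<p>hello world</p>")
token = Tokens(token="hello")
session.add_all([doc, token])
session.flush()  # assigns the UUID primary keys so they can be referenced below

session.add(Document_Tokens(document_id=doc.id, token_id=token.id))
session.commit()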


@@ -1,30 +1,30 @@
-#!/bin/bash
+#!/usr/bin/python3
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Tokens
+from sqlalchemy.orm import sessionmaker
 from flask import Flask
-from flask import Request
-import json
 from urllib.parse import unquote
 
 app = Flask(__name__)
 
-## Todo - Boolean search (AND/OR/NOT/"")
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+
+# Todo - Boolean search (AND/OR/NOT/"")
 
 @app.route("/search/<query>")
 def search(query):
-    with open('data/index.json', 'r') as index_json:
-        index = json.load(index_json)
-    query = unquote(query)
-    query_split = query.split()
+    session = Session()
     result = []
-    for q in query_split:
-        q = q.lower()
-        if q in index:
-            for item in index[q]:
-                matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
-                if len(matching_results) == 0:
-                    result.append(item)
-                else:
-                    matching_results[0]["count"] += item["count"]
+    query_words = unquote(query).split()
+    for word in query_words:
+        word = word.lower()
+        matching_token = session.query(Tokens).filter_by(token=word).first()
+        if session is None:
+            continue
+        for document_token in matching_token.document_tokens:
+            result.append(document_token.document.url)
     return result
-
-def handle_and():
-    pass
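
The search endpoint now answers queries straight from Postgres instead of the JSON index. A quick way to exercise it without a browser is Flask's built-in test client; the snippet below is only a sketch, and it assumes the file is importable as server (the real module name is not shown in the diff) and a Flask version recent enough to serialize a returned list as JSON.

# Sketch only: call /search/<query> through Flask's test client.
# `server` is a placeholder module name; adjust it to wherever `app` lives.
from server import app

with app.test_client() as client:
    response = client.get("/search/hello%20world")
    print(response.status_code)
    print(response.get_json())  # a list of matching document URLs when the words are indexed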