Refactor to use PostgreSQL end to end

rmgr 2024-03-07 20:44:34 +10:30
parent 8605ee6b2c
commit 20d198e559
5 changed files with 144 additions and 72 deletions

View file

@@ -9,7 +9,7 @@ from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from models import Base, Documents, Document_Tokens, Tokens
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import datetime
@@ -19,11 +19,12 @@ engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
def get_html(url: str) -> str:
response = requests.get(url)
return response.content
def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
rp = urllib.robotparser.RobotFileParser()
print(url)
@@ -45,10 +46,10 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
hash.update(url.encode('ascii'))
s = Session()
existing_website = s.query(Website).filter_by(url=url).first()
existing_website = s.query(Documents).filter_by(url=url).first()
print (existing_website)
if existing_website == None:
website = Website(
website = Documents(
url=url,
text_content=soup.get_text(),
html_content=soup.prettify(),
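The hunk above cuts off mid-constructor; a minimal sketch of the full upsert it implies, assuming the new crawl-date columns are stamped with datetime.datetime.now() and that a previously crawled URL gets its content and last_crawl_date refreshed (that update branch is an assumption, not shown in the diff):

s = Session()
existing_website = s.query(Documents).filter_by(url=url).first()
now = datetime.datetime.now()
if existing_website is None:
    # first visit: store the page and stamp both crawl dates
    website = Documents(
        url=url,
        text_content=soup.get_text(),
        html_content=soup.prettify(),
        first_crawl_date=now,
        last_crawl_date=now,
    )
    s.add(website)
else:
    # already crawled: refresh the stored content and the last crawl date (assumed behaviour)
    existing_website.text_content = soup.get_text()
    existing_website.html_content = soup.prettify()
    existing_website.last_crawl_date = now
s.commit()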

View file

@@ -1,54 +1,53 @@
#!/usr/bin/python3
import argparse
import requests
import hashlib
from urllib.parse import urlparse, urljoin
import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
import json
# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the','is']
from models import Base, Documents, Document_Tokens, Tokens
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import datetime
def remove_punctuation(input_string):
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
for p in punc:
input_string = input_string.replace(p, '')
return input_string
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
def build_index():
with open(f"data/index.json", "w") as index:
# get a list of all content files
# split on whitespace and add to index
dictionary = {}
pathlist = Path('data/content').rglob('*.txt')
for path in pathlist:
with open(str(path)) as content_file:
url = content_file.readline()
content = content_file.read()
content_words = content.split()
session = Session()
# Read list of 1000 documents from db
documents = session.query(Documents).limit(1000)
for document in documents:
print(document.url)
content_words = document.text_content.split()
for word in content_words:
word = word.lower()
word = remove_punctuation(word)
if not word in ignored_words:
if not word in dictionary:
dictionary[word] = []
matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
if len(matching_urls) == 0:
# if not url.strip() in dictionary[word]:
entries = dictionary[word]
entry = {"url": url.strip(), "count": 1, "filename": str(path)}
dictionary[word].append(entry)
else:
entries = dictionary[word]
entry = matching_urls[0]
entry["count"] += 1
entries.sort(reverse=True, key=lambda entry: entry["count"])
index.write(json.dumps(dictionary))
token = session.query(Tokens).filter_by(token=word).first()
if token is None:
token = Tokens(token=word)
session.add(token)
session.flush()  # the UUID default is only applied at flush, so flush before reading token.id
document_token = Document_Tokens(document_id=document.id, token_id=token.id)
session.add(document_token)
session.commit()
# Foreach document, break into words
# Check if word exists in database
# Create if not exist
# Link to document
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
parser.add_argument('-r',
"--rebuild",
action="store_true",
help="Blow away the index and rebuild")
args = parser.parse_args()
if args.rebuild:
build_index()
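Querying the tokens table and committing once per word will be slow on a large crawl. A sketch of an alternative build loop (not part of the commit) that keeps an in-memory token cache and lets the ORM relationships fill in the UUID foreign keys at flush time, assuming the ignored_words list and remove_punctuation helper are kept from the old indexer:

def build_index():
    session = Session()
    token_cache = {}  # token text -> Tokens instance, avoids one SELECT per word
    for document in session.query(Documents).limit(1000):
        for word in document.text_content.split():
            word = remove_punctuation(word.lower())
            if not word or word in ignored_words:
                continue
            token = token_cache.get(word)
            if token is None:
                token = session.query(Tokens).filter_by(token=word).first()
                if token is None:
                    token = Tokens(token=word)
                    session.add(token)
                token_cache[word] = token
            # relationship attributes wire up document_id/token_id when the session flushes
            session.add(Document_Tokens(document=document, token=token))
        session.commit()  # one commit per document instead of per word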

src/index.py.old (new file, 54 lines)
View file

@@ -0,0 +1,54 @@
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
import json
# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the','is']
def remove_punctuation(input_string):
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
for p in punc:
input_string = input_string.replace(p, '')
return input_string
def build_index():
with open("data/index.json", "w") as index:
# get a list of all content files
# split on whitespace and add to index
dictionary = {}
pathlist = Path('data/content').rglob('*.txt')
for path in pathlist:
with open(str(path)) as content_file:
url = content_file.readline()
content = content_file.read()
content_words = content.split()
for word in content_words:
word = word.lower()
word = remove_punctuation(word)
if word not in ignored_words:
if word not in dictionary:
dictionary[word] = []
matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
if len(matching_urls) == 0:
# if not url.strip() in dictionary[word]:
entries = dictionary[word]
entry = {"url": url.strip(), "count": 1, "filename": str(path)}
dictionary[word].append(entry)
else:
entries = dictionary[word]
entry = matching_urls[0]
entry["count"] += 1
entries.sort(reverse=True, key=lambda entry: entry["count"])
index.write(json.dumps(dictionary))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
args = parser.parse_args()
if args.rebuild:
build_index()

View file

@@ -1,18 +1,36 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy import Column, String, DateTime, ForeignKey, Index
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import relationship, mapped_column
import uuid
Base = declarative_base()
class Website(Base):
__tablename__ = 'websites'
class Documents(Base):
__tablename__ = 'documents'
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
url = Column(String)
text_content = Column(String)
html_content = Column(String)
first_crawl_date = Column(DateTime)
last_crawl_date = Column(DateTime)
document_tokens = relationship("Document_Tokens", back_populates="document")
class Document_Tokens(Base):
__tablename__ = 'document_tokens'
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
document_id = mapped_column(ForeignKey("documents.id"))
# Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
token_id = mapped_column(ForeignKey("tokens.id"))
#Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
document = relationship("Documents", back_populates="document_tokens", uselist=False)
token = relationship("Tokens", back_populates="document_tokens")
class Tokens(Base):
__tablename__ = 'tokens'
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
token = Column(String, index=True)
document_tokens = relationship("Document_Tokens", back_populates="token")
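Because build_index() adds one document_tokens row per word occurrence, term frequency can be recovered with an aggregate. A sketch of a ranked lookup built on these models (not part of the commit; the query shape is an assumption about how the data will be used):

from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import Documents, Document_Tokens, Tokens

Session = sessionmaker(bind=create_engine(DATABASE_URI))

def ranked_urls(word: str):
    # join documents -> document_tokens -> tokens and count occurrences per URL
    session = Session()
    rows = (
        session.query(Documents.url, func.count(Document_Tokens.id).label("count"))
        .join(Document_Tokens, Document_Tokens.document_id == Documents.id)
        .join(Tokens, Tokens.id == Document_Tokens.token_id)
        .filter(Tokens.token == word.lower())
        .group_by(Documents.url)
        .order_by(func.count(Document_Tokens.id).desc())
        .all()
    )
    return [{"url": url, "count": count} for url, count in rows]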

View file

@@ -1,30 +1,30 @@
#!/bin/bash
#!/usr/bin/python3
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Tokens
from sqlalchemy.orm import sessionmaker
from flask import Flask
from flask import Request
import json
from urllib.parse import unquote
app = Flask(__name__)
## Todo - Boolean search (AND/OR/NOT/"")
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
# Todo - Boolean search (AND/OR/NOT/"")
@app.route("/search/<query>")
def search(query):
with open('data/index.json', 'r') as index_json:
index = json.load(index_json)
query = unquote(query)
query_split = query.split()
session = Session()
result = []
for q in query_split:
q = q.lower()
if q in index:
for item in index[q]:
matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
if len(matching_results) == 0:
result.append(item)
else:
matching_results[0]["count"] += item["count"]
query_words = unquote(query).split()
for word in query_words:
word = word.lower()
matching_token = session.query(Tokens).filter_by(token=word).first()
if matching_token is None:
continue
for document_token in matching_token.document_tokens:
result.append(document_token.document.url)
return result
def handle_and():
pass
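handle_and() is still a stub; a sketch of one way it could be filled in, assuming AND means intersecting the sets of URLs matched by each term (this also deduplicates the per-term results, which the loop in search() does not):

def urls_for(word: str, session) -> set:
    # all URLs whose documents contain the given token
    token = session.query(Tokens).filter_by(token=word.lower()).first()
    if token is None:
        return set()
    return {dt.document.url for dt in token.document_tokens}

def handle_and(words, session):
    # a document matches only if it contains every term
    sets = [urls_for(w, session) for w in words]
    return sorted(set.intersection(*sets)) if sets else []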