Refactor to use postgresql end to end

rmgr 2024-03-07 20:44:34 +10:30
parent 8605ee6b2c
commit 20d198e559
5 changed files with 144 additions and 72 deletions


@@ -9,7 +9,7 @@ from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Website
+from models import Base, Documents, Document_Tokens, Tokens
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy import create_engine
 import datetime
@@ -19,11 +19,12 @@ engine = create_engine(DATABASE_URI)
 
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
 
+
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
 def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
     rp = urllib.robotparser.RobotFileParser()
     print(url)
@@ -45,10 +46,10 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
     hash.update(url.encode('ascii'))
     s = Session()
-    existing_website = s.query(Website).filter_by(url=url).first()
+    existing_website = s.query(Documents).filter_by(url=url).first()
     print (existing_website)
     if existing_website == None:
-        website = Website(
+        website = Documents(
             url=url,
             text_content=soup.get_text(),
             html_content=soup.prettify(),


@@ -1,54 +1,53 @@
+#!/usr/bin/python3
+import argparse
+import requests
+import hashlib
+from urllib.parse import urlparse, urljoin
+import urllib.robotparser
+import os
+from time import sleep
+from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Website
-from pathlib import Path
-import argparse
-import os
-import json
-
-# investigate ngrams for "multi word" matching
-ignored_words = ['a', 'the','is']
-
-def remove_punctuation(input_string):
-    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
-    for p in punc:
-        input_string = input_string.replace(p, '')
-    return input_string
+from models import Base, Documents, Document_Tokens, Tokens
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import create_engine
+import datetime
+
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
 
 def build_index():
-    with open(f"data/index.json", "w") as index:
-        # get a list of all content files
-        # split on whitespace and add to index
-        dictionary = {}
-        pathlist = Path('data/content').rglob('*.txt')
-        for path in pathlist:
-            with open(str(path)) as content_file:
-                url = content_file.readline()
-                content = content_file.read()
-                content_words = content.split()
-                for word in content_words:
-                    word = word.lower()
-                    word = remove_punctuation(word)
-                    if not word in ignored_words:
-                        if not word in dictionary:
-                            dictionary[word] = []
-                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
-                        if len(matching_urls) == 0:
-                            # if not url.strip() in dictionary[word]:
-                            entries = dictionary[word]
-                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
-                            dictionary[word].append(entry)
-                        else:
-                            entries = dictionary[word]
-                            entry = matching_urls[0]
-                            entry["count"] += 1
-                        entries.sort(reverse=True, key=lambda entry: entry["count"])
-        index.write(json.dumps(dictionary))
+    session = Session()
+    # Read list of 1000 documents from db
+    documents = session.query(Documents).limit(1000)
+    for document in documents:
+        print(document.url)
+        content_words = document.text_content.split()
+        for word in content_words:
+            word = word.lower()
+            token = session.query(Tokens).filter_by(token=word).first()
+            if token is None:
+                token = Tokens(token=word)
+                session.add(token)
+            document_token = Document_Tokens(document_id=document.id, token_id=token.id)
+            session.add(document_token)
+            session.commit()
+    # Foreach document, break into words
+    # Check if word exists in database
+    # Create if not exist
+    # Link to document
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    parser.add_argument('-r',
+                        "--rebuild",
+                        action="store_true",
+                        help="Blow away the index and rebuild")
     args = parser.parse_args()
     if args.rebuild:
         build_index()
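
With this change the on-disk JSON index is gone: build_index() now writes one Tokens row per distinct word and one Document_Tokens row per document/word link. As a rough illustration of what that schema gives you, here is a minimal read-side sketch (not part of the commit) that looks up the documents containing a word; it assumes the same config.DATABASE_URI and the models above, and mirrors what the rewritten Flask search endpoint further down does.

# Sketch only: resolve a word to document URLs via Tokens -> Document_Tokens -> Documents.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import Tokens

engine = create_engine(DATABASE_URI)
Session = sessionmaker(bind=engine)

def urls_for_word(word: str) -> list:
    session = Session()
    token = session.query(Tokens).filter_by(token=word.lower()).first()
    if token is None:
        return []
    # Walk the association rows back to their parent documents.
    return [dt.document.url for dt in token.document_tokens]

print(urls_for_word("python"))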

src/index.py.old (new file)

@@ -0,0 +1,54 @@
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
import json

# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the','is']

def remove_punctuation(input_string):
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
    for p in punc:
        input_string = input_string.replace(p, '')
    return input_string

def build_index():
    with open("data/index.json", "w") as index:
        # get a list of all content files
        # split on whitespace and add to index
        dictionary = {}
        pathlist = Path('data/content').rglob('*.txt')
        for path in pathlist:
            with open(str(path)) as content_file:
                url = content_file.readline()
                content = content_file.read()
                content_words = content.split()
                for word in content_words:
                    word = word.lower()
                    word = remove_punctuation(word)
                    if word not in ignored_words:
                        if word not in dictionary:
                            dictionary[word] = []
                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
                        if len(matching_urls) == 0:
                            # if not url.strip() in dictionary[word]:
                            entries = dictionary[word]
                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                            dictionary[word].append(entry)
                        else:
                            entries = dictionary[word]
                            entry = matching_urls[0]
                            entry["count"] += 1
                        entries.sort(reverse=True, key=lambda entry: entry["count"])
        index.write(json.dumps(dictionary))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
    args = parser.parse_args()
    if args.rebuild:
        build_index()


@@ -1,18 +1,36 @@
 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, Integer, String, DateTime
+from sqlalchemy import Column, String, DateTime, ForeignKey, Index
 from sqlalchemy.dialects.postgresql import UUID
+from sqlalchemy.orm import relationship, mapped_column
 import uuid
 
 Base = declarative_base()
 
-class Website(Base):
-    __tablename__ = 'websites'
+class Documents(Base):
+    __tablename__ = 'documents'
     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
     url = Column(String)
     text_content = Column(String)
     html_content = Column(String)
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
+    document_tokens = relationship("Document_Tokens", back_populates="document")
+
+class Document_Tokens(Base):
+    __tablename__ = 'document_tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token_id = mapped_column(ForeignKey("tokens.id"))
+    #Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document = relationship("Documents", back_populates="document_tokens", uselist=False)
+    token = relationship("Tokens", back_populates="document_tokens")
+
+class Tokens(Base):
+    __tablename__ = 'tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token = Column(String, index=True)
+    document_tokens = relationship("Document_Tokens", back_populates="token")
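
These three models are the heart of the refactor: Documents holds crawled pages, Tokens holds distinct words, and Document_Tokens is the association between them. The write-side sketch below (not part of the commit) shows the same pattern the crawler and indexer above use, condensed into one snippet; it assumes DATABASE_URI points at a reachable PostgreSQL database, since the UUID column type is Postgres-specific, and the example URL and text are placeholders.

# Sketch only: create the tables and link one document to one token.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import Base, Documents, Document_Tokens, Tokens

engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

doc = Documents(url="https://example.com", text_content="hello world", html_content="<p>hello world</p>")
token = Tokens(token="hello")
session.add_all([doc, token])
session.flush()  # assigns the UUID primary keys so they can be referenced below

session.add(Document_Tokens(document_id=doc.id, token_id=token.id))
session.commit()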


@@ -1,30 +1,30 @@
-#!/bin/bash
+#!/usr/bin/python3
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Tokens
+from sqlalchemy.orm import sessionmaker
 from flask import Flask
-from flask import Request
-import json
 from urllib.parse import unquote
 
 app = Flask(__name__)
 
-## Todo - Boolean search (AND/OR/NOT/"")
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+
+# Todo - Boolean search (AND/OR/NOT/"")
 
 @app.route("/search/<query>")
 def search(query):
-    with open('data/index.json', 'r') as index_json:
-        index = json.load(index_json)
-    query = unquote(query)
-    query_split = query.split()
+    session = Session()
     result = []
-    for q in query_split:
-        q = q.lower()
-        if q in index:
-            for item in index[q]:
-                matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
-                if len(matching_results) == 0:
-                    result.append(item)
-                else:
-                    matching_results[0]["count"] += item["count"]
+    query_words = unquote(query).split()
+    for word in query_words:
+        word = word.lower()
+        matching_token = session.query(Tokens).filter_by(token=word).first()
+        if session is None:
+            continue
+        for document_token in matching_token.document_tokens:
+            result.append(document_token.document.url)
     return result
-
-def handle_and():
-    pass
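
The search endpoint now answers queries straight from Postgres instead of the JSON index. A quick way to exercise it without a browser is Flask's built-in test client; the snippet below is only a sketch, and it assumes the file is importable as server (the real module name is not shown in the diff) and a Flask version recent enough to serialize a returned list as JSON.

# Sketch only: call /search/<query> through Flask's test client.
# `server` is a placeholder module name; adjust it to wherever `app` lives.
from server import app

with app.test_client() as client:
    response = client.get("/search/hello%20world")
    print(response.status_code)
    print(response.get_json())  # a list of matching document URLs when the words are indexed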