Refactor to use PostgreSQL end to end

rmgr 2024-03-07 20:44:34 +10:30
parent 8605ee6b2c
commit 20d198e559
5 changed files with 144 additions and 72 deletions

View file

@@ -9,7 +9,7 @@ from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from models import Base, Documents, Document_Tokens, Tokens
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import datetime
@@ -19,11 +19,12 @@ engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
def get_html(url: str) -> str:
response = requests.get(url)
return response.content
def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
rp = urllib.robotparser.RobotFileParser()
print(url)
@@ -45,10 +46,10 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
hash.update(url.encode('ascii'))
s = Session()
existing_website = s.query(Website).filter_by(url=url).first()
existing_website = s.query(Documents).filter_by(url=url).first()
print (existing_website)
if existing_website == None:
website = Website(
website = Documents(
url=url,
text_content=soup.get_text(),
html_content=soup.prettify(),
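The hunk above cuts off mid-constructor; a minimal sketch of the full upsert it implies, assuming the new crawl-date columns are stamped with datetime.datetime.now() and that a previously crawled URL gets its content and last_crawl_date refreshed (that update branch is an assumption, not shown in the diff):

s = Session()
existing_website = s.query(Documents).filter_by(url=url).first()
now = datetime.datetime.now()
if existing_website is None:
    # first visit: store the page and stamp both crawl dates
    website = Documents(
        url=url,
        text_content=soup.get_text(),
        html_content=soup.prettify(),
        first_crawl_date=now,
        last_crawl_date=now,
    )
    s.add(website)
else:
    # already crawled: refresh the stored content and the last crawl date (assumed behaviour)
    existing_website.text_content = soup.get_text()
    existing_website.html_content = soup.prettify()
    existing_website.last_crawl_date = now
s.commit()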

View file

@@ -1,54 +1,53 @@
#!/usr/bin/python3
import argparse
import requests
import hashlib
from urllib.parse import urlparse, urljoin
import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
import json
# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the','is']
from models import Base, Documents, Document_Tokens, Tokens
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import datetime
def remove_punctuation(input_string):
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
for p in punc:
input_string = input_string.replace(p, '')
return input_string
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
def build_index():
with open(f"data/index.json", "w") as index:
# get a list of all content files
# split on whitespace and add to index
dictionary = {}
pathlist = Path('data/content').rglob('*.txt')
for path in pathlist:
with open(str(path)) as content_file:
url = content_file.readline()
content = content_file.read()
content_words = content.split()
session = Session()
# Read list of 1000 documents from db
documents = session.query(Documents).limit(1000)
for document in documents:
print(document.url)
content_words = document.text_content.split()
for word in content_words:
word = word.lower()
word = remove_punctuation(word)
if not word in ignored_words:
if not word in dictionary:
dictionary[word] = []
matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
if len(matching_urls) == 0:
# if not url.strip() in dictionary[word]:
entries = dictionary[word]
entry = {"url": url.strip(), "count": 1, "filename": str(path)}
dictionary[word].append(entry)
else:
entries = dictionary[word]
entry = matching_urls[0]
entry["count"] += 1
entries.sort(reverse=True, key=lambda entry: entry["count"])
index.write(json.dumps(dictionary))
token = session.query(Tokens).filter_by(token=word).first()
if token is None:
token = Tokens(token=word)
session.add(token)
session.flush()  # the UUID default is only applied at flush, so flush before reading token.id
document_token = Document_Tokens(document_id=document.id, token_id=token.id)
session.add(document_token)
session.commit()
# Foreach document, break into words
# Check if word exists in database
# Create if not exist
# Link to document
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
parser.add_argument('-r',
"--rebuild",
action="store_true",
help="Blow away the index and rebuild")
args = parser.parse_args()
if args.rebuild:
build_index()
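Querying the tokens table and committing once per word will be slow on a large crawl. A sketch of an alternative build loop (not part of the commit) that keeps an in-memory token cache and lets the ORM relationships fill in the UUID foreign keys at flush time, assuming the ignored_words list and remove_punctuation helper are kept from the old indexer:

def build_index():
    session = Session()
    token_cache = {}  # token text -> Tokens instance, avoids one SELECT per word
    for document in session.query(Documents).limit(1000):
        for word in document.text_content.split():
            word = remove_punctuation(word.lower())
            if not word or word in ignored_words:
                continue
            token = token_cache.get(word)
            if token is None:
                token = session.query(Tokens).filter_by(token=word).first()
                if token is None:
                    token = Tokens(token=word)
                    session.add(token)
                token_cache[word] = token
            # relationship attributes wire up document_id/token_id when the session flushes
            session.add(Document_Tokens(document=document, token=token))
        session.commit()  # one commit per document instead of per word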

src/index.py.old (new file, 54 lines)
View file

@@ -0,0 +1,54 @@
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
import json
# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the','is']
def remove_punctuation(input_string):
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
for p in punc:
input_string = input_string.replace(p, '')
return input_string
def build_index():
with open("data/index.json", "w") as index:
# get a list of all content files
# split on whitespace and add to index
dictionary = {}
pathlist = Path('data/content').rglob('*.txt')
for path in pathlist:
with open(str(path)) as content_file:
url = content_file.readline()
content = content_file.read()
content_words = content.split()
for word in content_words:
word = word.lower()
word = remove_punctuation(word)
if word not in ignored_words:
if word not in dictionary:
dictionary[word] = []
matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
if len(matching_urls) == 0:
# if not url.strip() in dictionary[word]:
entries = dictionary[word]
entry = {"url": url.strip(), "count": 1, "filename": str(path)}
dictionary[word].append(entry)
else:
entries = dictionary[word]
entry = matching_urls[0]
entry["count"] += 1
entries.sort(reverse=True, key=lambda entry: entry["count"])
index.write(json.dumps(dictionary))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
args = parser.parse_args()
if args.rebuild:
build_index()

View file

@@ -1,18 +1,36 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy import Column, String, DateTime, ForeignKey, Index
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import relationship, mapped_column
import uuid
Base = declarative_base()
class Website(Base):
__tablename__ = 'websites'
class Documents(Base):
__tablename__ = 'documents'
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
url = Column(String)
text_content = Column(String)
html_content = Column(String)
first_crawl_date = Column(DateTime)
last_crawl_date = Column(DateTime)
document_tokens = relationship("Document_Tokens", back_populates="document")
class Document_Tokens(Base):
__tablename__ = 'document_tokens'
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
document_id = mapped_column(ForeignKey("documents.id"))
# Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
token_id = mapped_column(ForeignKey("tokens.id"))
#Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
document = relationship("Documents", back_populates="document_tokens", uselist=False)
token = relationship("Tokens", back_populates="document_tokens")
class Tokens(Base):
__tablename__ = 'tokens'
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
token = Column(String, index=True)
document_tokens = relationship("Document_Tokens", back_populates="token")
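Because build_index() adds one document_tokens row per word occurrence, term frequency can be recovered with an aggregate. A sketch of a ranked lookup built on these models (not part of the commit; the query shape is an assumption about how the data will be used):

from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import Documents, Document_Tokens, Tokens

Session = sessionmaker(bind=create_engine(DATABASE_URI))

def ranked_urls(word: str):
    # join documents -> document_tokens -> tokens and count occurrences per URL
    session = Session()
    rows = (
        session.query(Documents.url, func.count(Document_Tokens.id).label("count"))
        .join(Document_Tokens, Document_Tokens.document_id == Documents.id)
        .join(Tokens, Tokens.id == Document_Tokens.token_id)
        .filter(Tokens.token == word.lower())
        .group_by(Documents.url)
        .order_by(func.count(Document_Tokens.id).desc())
        .all()
    )
    return [{"url": url, "count": count} for url, count in rows]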

View file

@@ -1,30 +1,30 @@
#!/bin/bash
#!/usr/bin/python3
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Tokens
from sqlalchemy.orm import sessionmaker
from flask import Flask
from flask import Request
import json
from urllib.parse import unquote
app = Flask(__name__)
## Todo - Boolean search (AND/OR/NOT/"")
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
# Todo - Boolean search (AND/OR/NOT/"")
@app.route("/search/<query>")
def search(query):
with open('data/index.json', 'r') as index_json:
index = json.load(index_json)
query = unquote(query)
query_split = query.split()
session = Session()
result = []
for q in query_split:
q = q.lower()
if q in index:
for item in index[q]:
matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
if len(matching_results) == 0:
result.append(item)
else:
matching_results[0]["count"] += item["count"]
query_words = unquote(query).split()
for word in query_words:
word = word.lower()
matching_token = session.query(Tokens).filter_by(token=word).first()
if matching_token is None:
continue
for document_token in matching_token.document_tokens:
result.append(document_token.document.url)
return result
def handle_and():
pass
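handle_and() is still a stub; a sketch of one way it could be filled in, assuming AND means intersecting the sets of URLs matched by each term (this also deduplicates the per-term results, which the loop in search() does not):

def urls_for(word: str, session) -> set:
    # all URLs whose documents contain the given token
    token = session.query(Tokens).filter_by(token=word.lower()).first()
    if token is None:
        return set()
    return {dt.document.url for dt in token.document_tokens}

def handle_and(words, session):
    # a document matches only if it contains every term
    sets = [urls_for(w, session) for w in words]
    return sorted(set.intersection(*sets)) if sets else []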