Begin adding PostgreSQL support instead of filesystem flat files

This commit is contained in:
rmgr 2024-03-01 21:12:40 +10:30
parent b43343e0ee
commit 24ee04c0ff
6 changed files with 80 additions and 13 deletions

2
src/config.py Normal file
View file

@@ -0,0 +1,2 @@
import os

# SQLAlchemy connection string for the search database.
# NOTE(review): credentials were committed in plain text — the 'S3cret'
# password is now in git history and should be rotated. Prefer supplying
# DATABASE_URI via the environment; the hard-coded value remains only as
# a backward-compatible local-dev default.
DATABASE_URI = os.environ.get(
    'DATABASE_URI',
    'postgresql+psycopg2://postgres:S3cret@localhost:5432/search',
)

View file

@@ -6,14 +6,26 @@ from urllib.parse import urlparse, urljoin
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import datetime
# TODO- Handle gemini/gopher links
# TODO- Keep a list of traversed links and check before traversing again
# Create the engine once at import time; connections are opened lazily
# on first use, not here.
engine = create_engine(DATABASE_URI)
# Idempotent: emits CREATE TABLE only for tables that do not yet exist.
Base.metadata.create_all(engine)
# Session factory bound to this engine; call Session() per unit of work.
Session = sessionmaker(bind=engine)
def get_html(url: str) -> bytes:
    """Fetch *url* and return the raw response body.

    Returns bytes (``response.content``), which BeautifulSoup accepts
    directly, so callers need no decoding step. The original ``-> str``
    annotation was wrong — the function has always returned bytes.
    """
    # A timeout keeps the crawler from hanging forever on an
    # unresponsive host; 30s is generous for a single page fetch.
    response = requests.get(url, timeout=30)
    # NOTE(review): consider response.raise_for_status() so HTTP error
    # pages are not stored as content — omitted to preserve current
    # behavior.
    return response.content
def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
print(url)
print(recursion)
urlparts = urlparse(url)
@@ -21,16 +33,24 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
soup = BeautifulSoup(html,'html.parser')
hash = hashlib.sha256()
hash.update(url.encode('ascii'))
filename_text = hash.hexdigest() + '.txt'
filename_html = hash.hexdigest() + '.html'
with open(f'data/content/{filename_text}', 'w') as outfile:
outfile.write(url)
outfile.write('\n')
outfile.write(soup.get_text())
with open(f'data/content/{filename_html}', 'w') as outfile:
outfile.write(url)
outfile.write('\n')
outfile.write(soup.prettify())
s = Session()
existing_website = s.query(Website).filter_by(url=url).first()
print (existing_website)
if existing_website == None:
website = Website(
url=url,
text_content=soup.get_text(),
html_content=soup.prettify(),
first_crawl_date=datetime.datetime.now(),
last_crawl_date = datetime.datetime.now()
)
s.add(website)
else:
existing_website.last_crawl_date = datetime.datetime.now()
s.add(existing_website)
s.commit()
s.close()
x = open(f'data/links.txt', 'a')
x.close()
links = soup.find_all("a")
@@ -59,6 +79,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
# linksfile.write(f'{link}\n')
if __name__ == "__main__":
os.makedirs("data/content", exist_ok=True)
# check inputs
parser = argparse.ArgumentParser()
@@ -68,7 +89,7 @@ if __name__ == "__main__":
args = parser.parse_args()
html = get_html(args.url)
parse_html(args.url, html, max_recursion)
# recursion = 0
# if (args.followlinks):
# with open(f'data/links.txt', 'r+') as linksfile:

14
src/index.py Executable file → Normal file
View file

@@ -1,5 +1,6 @@
#!/usr/bin/python3
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
@@ -7,6 +8,13 @@ import json
# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the','is']
def remove_punctuation(input_string):
    """Return *input_string* with all punctuation characters deleted.

    Removes the same character set as the original replace-loop version.
    Note the original ``'''...\,...'''`` literal is not a recognized
    escape, so the set includes a literal backslash as well as the comma.
    """
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
    # str.translate does one C-level pass instead of ~30 chained
    # .replace() calls; mapping each character to None deletes it
    # (duplicate entries like the repeated '?' and '!' are harmless).
    return input_string.translate(str.maketrans('', '', punc))
def build_index():
with open(f"data/index.json", "w") as index:
# get a list of all content files
@@ -20,6 +28,7 @@ def build_index():
content_words = content.split()
for word in content_words:
word = word.lower()
word = remove_punctuation(word)
if not word in ignored_words:
if not word in dictionary:
dictionary[word] = []
@@ -42,3 +51,4 @@ if __name__ == "__main__":
args = parser.parse_args()
if args.rebuild:
build_index()

18
src/models.py Normal file
View file

@@ -0,0 +1,18 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy.dialects.postgresql import UUID
import uuid
Base = declarative_base()
class Website(Base):
    """ORM model for one crawled page: its URL, extracted text/HTML
    snapshots, and first/last crawl timestamps."""
    __tablename__ = 'websites'
    # Random UUID primary key; as_uuid=True yields uuid.UUID objects
    # on the Python side rather than strings.
    id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4)
    # NOTE(review): the crawler looks rows up by url (filter_by(url=...)),
    # so this column should probably be unique=True and indexed — confirm.
    url = Column(String)
    # Plain-text extraction of the page body.
    text_content = Column(String)
    # Prettified HTML snapshot of the page.
    html_content = Column(String)
    # Set once when the page is first stored.
    first_crawl_date = Column(DateTime)
    # Updated on every subsequent crawl of the same url.
    last_crawl_date = Column(DateTime)