Begin adding Postgresql support instead of filesystem flat files

parent b43343e0ee · commit 24ee04c0ff

6 changed files with 80 additions and 13 deletions

db/docker-compose.yaml (new file, 12 additions)
@@ -0,0 +1,12 @@
+version: '3.9'
+
+services:
+  postgres:
+    image: postgres:15.3-alpine
+    ports:
+      - 5432:5432
+    volumes:
+      - ./postgres:/var/lib/postgresql/data
+    environment:
+      - POSTGRES_PASSWORD=S3cret
+      - POSTGRES_DB=search
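
With the container defined above, running "docker compose up -d" from the db/ directory brings up PostgreSQL on localhost:5432 with the "search" database and the S3cret password that src/config.py expects. A minimal connectivity check, as a sketch only, assuming the container is running and src/ is on the import path:

    # Sketch: verify the compose-managed Postgres is reachable via the
    # same URI the crawler will use.
    from sqlalchemy import create_engine, text

    from config import DATABASE_URI

    engine = create_engine(DATABASE_URI)
    with engine.connect() as conn:
        print(conn.execute(text("SELECT version()")).scalar())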

requirements.txt (4 additions)
@@ -7,15 +7,19 @@ charset-normalizer==3.3.2
 click==8.1.7
 cssselect==1.2.0
 flask==3.0.0
+greenlet==3.0.3
 idna==3.6
 importlib-metadata==6.8.0
 itsdangerous==2.1.2
 Jinja2==3.1.2
 lxml==4.9.3
 MarkupSafe==2.1.3
+psycopg2-binary==2.9.9
 readability-lxml==0.8.1
 requests==2.31.0
 soupsieve==2.5
+SQLAlchemy==2.0.27
+typing-extensions==4.10.0
 urllib3==2.1.0
 werkzeug==3.0.1
 zipp==3.17.0

src/config.py (new file, 2 additions)
@@ -0,0 +1,2 @@
+DATABASE_URI = 'postgresql+psycopg2://postgres:S3cret@localhost:5432/search'
+
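
DATABASE_URI hard-codes the same credentials and database name as db/docker-compose.yaml. A hypothetical variation, not part of this commit, would let an environment variable override the default so the password does not have to live in source:

    # Hypothetical variant (illustration only): read DATABASE_URI from the
    # environment, falling back to the local development default.
    import os

    DATABASE_URI = os.environ.get(
        'DATABASE_URI',
        'postgresql+psycopg2://postgres:S3cret@localhost:5432/search',
    )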

src/crawl.py (43 changes)
@@ -6,14 +6,26 @@ from urllib.parse import urlparse, urljoin
 import os
 from time import sleep
 from bs4 import BeautifulSoup
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import create_engine
+import datetime
 # TODO- Handle gemini/gopher links
 # TODO- Keep a list of traversed links and check before traversing again

+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+
 def get_html(url: str) -> str:

     response = requests.get(url)
     return response.content


 def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:

     print(url)
     print(recursion)
     urlparts = urlparse(url)
@@ -21,16 +33,24 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
-    filename_text = hash.hexdigest() + '.txt'
-    filename_html = hash.hexdigest() + '.html'
-    with open(f'data/content/{filename_text}', 'w') as outfile:
-        outfile.write(url)
-        outfile.write('\n')
-        outfile.write(soup.get_text())
-    with open(f'data/content/{filename_html}', 'w') as outfile:
-        outfile.write(url)
-        outfile.write('\n')
-        outfile.write(soup.prettify())
+    s = Session()
+    existing_website = s.query(Website).filter_by(url=url).first()
+    print (existing_website)
+    if existing_website == None:
+        website = Website(
+            url=url,
+            text_content=soup.get_text(),
+            html_content=soup.prettify(),
+            first_crawl_date=datetime.datetime.now(),
+            last_crawl_date = datetime.datetime.now()
+        )
+        s.add(website)
+    else:
+        existing_website.last_crawl_date = datetime.datetime.now()
+        s.add(existing_website)
+    s.commit()
+    s.close()
     x = open(f'data/links.txt', 'a')
     x.close()
     links = soup.find_all("a")
@@ -59,6 +79,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
             # linksfile.write(f'{link}\n')

 if __name__ == "__main__":

     os.makedirs("data/content", exist_ok=True)
     # check inputs
     parser = argparse.ArgumentParser()
@@ -68,7 +89,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
     html = get_html(args.url)
     parse_html(args.url, html, max_recursion)

     # recursion = 0
     # if (args.followlinks):
     #     with open(f'data/links.txt', 'r+') as linksfile:
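
The new persistence path in parse_html opens a session, checks whether the URL has been stored before, inserts a Website row or refreshes last_crawl_date, then commits and closes. A sketch of the same insert-or-update flow using SQLAlchemy's session-as-context-manager style, purely as an illustration (the helper name save_page is hypothetical; it assumes the Session factory and Website model introduced in this commit):

    import datetime

    def save_page(url: str, text: str, html: str) -> None:
        # Session.begin() commits on success, rolls back on error, and
        # closes the session, so no explicit s.commit()/s.close() is needed.
        with Session.begin() as s:
            existing = s.query(Website).filter_by(url=url).first()
            if existing is None:
                s.add(Website(
                    url=url,
                    text_content=text,
                    html_content=html,
                    first_crawl_date=datetime.datetime.now(),
                    last_crawl_date=datetime.datetime.now(),
                ))
            else:
                existing.last_crawl_date = datetime.datetime.now()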

src/index.py (14 changes, Executable file → Normal file)
@@ -1,5 +1,6 @@
-#!/usr/bin/python3
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
 from pathlib import Path
 import argparse
 import os
@@ -7,6 +8,13 @@ import json
 # investigate ngrams for "multi word" matching
 ignored_words = ['a', 'the','is']

+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
+
+
 def build_index():
     with open(f"data/index.json", "w") as index:
         # get a list of all content files
@@ -20,6 +28,7 @@ def build_index():
             content_words = content.split()
             for word in content_words:
                 word = word.lower()
+                word = remove_punctuation(word)
                 if not word in ignored_words:
                     if not word in dictionary:
                         dictionary[word] = []
@@ -42,3 +51,4 @@ if __name__ == "__main__":
     args = parser.parse_args()
     if args.rebuild:
         build_index()
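
The new remove_punctuation helper strips a fixed set of characters from each token after lower-casing, so variants of the same word collapse to a single index key. A small illustration, not part of the commit:

    # Illustration only: assumes remove_punctuation from src/index.py is importable.
    from index import remove_punctuation

    print(remove_punctuation("Hello, world!"))        # Hello world
    print(remove_punctuation("state-of-the-art..."))  # stateoftheart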

src/models.py (new file, 18 additions)
@@ -0,0 +1,18 @@
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Column, Integer, String, DateTime
+from sqlalchemy.dialects.postgresql import UUID
+import uuid
+
+Base = declarative_base()
+
+class Website(Base):
+
+    __tablename__ = 'websites'
+    id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4)
+    url = Column(String)
+    text_content = Column(String)
+    html_content = Column(String)
+    first_crawl_date = Column(DateTime)
+    last_crawl_date = Column(DateTime)
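
With the Website model in place, crawled pages can be inspected straight from PostgreSQL instead of from data/content/. A minimal sketch, assuming the DATABASE_URI and models introduced in this commit:

    # Sketch: list crawled URLs, most recently crawled first.
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    from config import DATABASE_URI
    from models import Website

    engine = create_engine(DATABASE_URI)
    Session = sessionmaker(bind=engine)

    with Session() as s:
        for site in s.query(Website).order_by(Website.last_crawl_date.desc()).all():
            print(site.last_crawl_date, site.url)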