Merge postgres changes

This commit is contained in:
rmgr 2024-03-02 19:53:58 +10:30
commit 8903f7a3e5
6 changed files with 71 additions and 13 deletions

12
db/docker-compose.yaml Normal file
View file

@ -0,0 +1,12 @@
# Local development Postgres instance backing the search crawler/indexer.
# NOTE(review): the `version` key is ignored by modern Compose (v2+) — harmless, confirm before removing.
version: '3.9'
services:
  postgres:
    image: postgres:15.3-alpine
    ports:
      # Expose the default Postgres port on the host so src/config.py can
      # reach it at localhost:5432.
      - 5432:5432
    volumes:
      # Persist cluster data across container restarts in ./postgres.
      - ./postgres:/var/lib/postgresql/data
    environment:
      # NOTE(review): hardcoded dev-only credentials (they match the DSN in
      # src/config.py) — do not reuse outside local development.
      - POSTGRES_PASSWORD=S3cret
      - POSTGRES_DB=search

View file

@ -7,15 +7,19 @@ charset-normalizer==3.3.2
 click==8.1.7
 cssselect==1.2.0
 flask==3.0.0
+greenlet==3.0.3
 idna==3.6
 importlib-metadata==6.8.0
 itsdangerous==2.1.2
 Jinja2==3.1.2
 lxml==4.9.3
 MarkupSafe==2.1.3
+psycopg2-binary==2.9.9
 readability-lxml==0.8.1
 requests==2.31.0
 soupsieve==2.5
+SQLAlchemy==2.0.27
+typing-extensions==4.10.0
 urllib3==2.1.0
 werkzeug==3.0.1
 zipp==3.17.0

2
src/config.py Normal file
View file

@ -0,0 +1,2 @@
# SQLAlchemy connection string for the crawler/indexer's Postgres database.
# NOTE(review): credentials, host, and db name are hardcoded and match the
# db/docker-compose.yaml dev defaults — move to environment variables before
# deploying anywhere shared.
DATABASE_URI = 'postgresql+psycopg2://postgres:S3cret@localhost:5432/search'

View file

@ -7,9 +7,20 @@ import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import create_engine
+import datetime
 
 # TODO- Handle gemini/gopher links
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
@ -32,16 +43,24 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
-    filename_text = hash.hexdigest() + '.txt'
-    filename_html = hash.hexdigest() + '.html'
-    with open(f'data/content/{filename_text}', 'w') as outfile:
-        outfile.write(url)
-        outfile.write('\n')
-        outfile.write(soup.get_text())
-    with open(f'data/content/{filename_html}', 'w') as outfile:
-        outfile.write(url)
-        outfile.write('\n')
-        outfile.write(soup.prettify())
+    s = Session()
+    existing_website = s.query(Website).filter_by(url=url).first()
+    print (existing_website)
+    if existing_website == None:
+        website = Website(
+            url=url,
+            text_content=soup.get_text(),
+            html_content=soup.prettify(),
+            first_crawl_date=datetime.datetime.now(),
+            last_crawl_date = datetime.datetime.now()
+        )
+        s.add(website)
+    else:
+        existing_website.last_crawl_date = datetime.datetime.now()
+        s.add(existing_website)
+    s.commit()
+    s.close()
     x = open(f'data/links.txt', 'a')
     x.close()
     links = soup.find_all("a")
@ -72,6 +91,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
         linksfile.write(f'{link}\n')
 
 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
     # check inputs
     parser = argparse.ArgumentParser()

8
src/index.py Executable file → Normal file
View file

@ -1,5 +1,6 @@
-#!/usr/bin/python3
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
 from pathlib import Path
 import argparse
 import os
@ -8,7 +9,7 @@ import json
 ignored_words = ['a', 'the','is']
 
 def remove_punctuation(input_string):
-    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
     for p in punc:
         input_string = input_string.replace(p, '')
     return input_string
@ -50,3 +51,4 @@ if __name__ == "__main__":
     args = parser.parse_args()
     if args.rebuild:
         build_index()

18
src/models.py Normal file
View file

@ -0,0 +1,18 @@
"""SQLAlchemy ORM model for pages stored by the crawler."""
import uuid

from sqlalchemy import Column, DateTime, String
from sqlalchemy.dialects.postgresql import UUID
# declarative_base() lives in sqlalchemy.orm since 1.4; importing it from
# sqlalchemy.ext.declarative is deprecated under the pinned SQLAlchemy 2.0.x.
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Website(Base):
    """One crawled web page: its URL, extracted text/HTML, and crawl dates."""

    __tablename__ = 'websites'

    # Surrogate primary key, generated client-side (uuid4) — no DB sequence needed.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # NOTE(review): the crawler looks pages up by url; consider unique=True/index=True.
    url = Column(String)
    # Plain-text extraction of the page body.
    text_content = Column(String)
    # Prettified raw HTML of the page.
    html_content = Column(String)
    first_crawl_date = Column(DateTime)
    last_crawl_date = Column(DateTime)

    def __repr__(self) -> str:
        return f"Website(id={self.id!r}, url={self.url!r})"