Merge postgres changes
commit 8903f7a3e5
6 changed files with 71 additions and 13 deletions

src/config.py | 2 (new file)
@@ -0,0 +1,2 @@
DATABASE_URI = 'postgresql+psycopg2://postgres:S3cret@localhost:5432/search'
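
The URI above hardcodes credentials. As a sketch only, not part of this commit, the same value could come from the environment, with the hardcoded string as a local-dev fallback:

# Sketch, not part of the commit: keep credentials out of version control.
import os

DATABASE_URI = os.environ.get(
    'DATABASE_URI',
    'postgresql+psycopg2://postgres:S3cret@localhost:5432/search',
)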

src/crawl.py | 40
@@ -7,9 +7,20 @@ import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import datetime
# TODO- Handle gemini/gopher links

engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

def get_html(url: str) -> str:

    response = requests.get(url)
    return response.content
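
A side note on the hunk above, as a sketch rather than a change to the commit: requests' response.content is bytes, while get_html is annotated -> str; response.text is the variant that actually returns str:

# Sketch only: a get_html variant whose return value matches its annotation.
import requests

def get_html(url: str) -> str:
    response = requests.get(url)
    return response.text  # .text decodes to str; .content is raw bytes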

@@ -32,16 +43,24 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
    soup = BeautifulSoup(html, 'html.parser')
    hash = hashlib.sha256()
    hash.update(url.encode('ascii'))
    filename_text = hash.hexdigest() + '.txt'
    filename_html = hash.hexdigest() + '.html'
    with open(f'data/content/{filename_text}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.get_text())
    with open(f'data/content/{filename_html}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.prettify())

    s = Session()
    existing_website = s.query(Website).filter_by(url=url).first()
    print(existing_website)
    if existing_website is None:
        website = Website(
            url=url,
            text_content=soup.get_text(),
            html_content=soup.prettify(),
            first_crawl_date=datetime.datetime.now(),
            last_crawl_date=datetime.datetime.now()
        )
        s.add(website)
    else:
        existing_website.last_crawl_date = datetime.datetime.now()
        s.add(existing_website)
    s.commit()
    s.close()
    x = open(f'data/links.txt', 'a')
    x.close()
    links = soup.find_all("a")
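
The session block above is a manual upsert keyed on url: insert the row if it is new, otherwise refresh last_crawl_date. The same pattern in isolation, as a hypothetical helper (upsert_website is not in this commit):

# Hypothetical helper, not part of the commit: parse_html's
# query-then-insert-or-update pattern on the Website model.
import datetime
from models import Website

def upsert_website(session, url, text, html):
    row = session.query(Website).filter_by(url=url).first()
    now = datetime.datetime.now()
    if row is None:
        row = Website(url=url, text_content=text, html_content=html,
                      first_crawl_date=now, last_crawl_date=now)
    else:
        row.last_crawl_date = now
    session.add(row)
    session.commit()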

@@ -72,6 +91,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
        linksfile.write(f'{link}\n')

if __name__ == "__main__":

    os.makedirs("data/content", exist_ok=True)
    # check inputs
    parser = argparse.ArgumentParser()

src/index.py | 8 (Executable file → Normal file)
@@ -1,5 +1,6 @@
#!/usr/bin/python3

from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os

@@ -8,7 +9,7 @@ import json
ignored_words = ['a', 'the', 'is']

def remove_punctuation(input_string):
-    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
    for p in punc:
        input_string = input_string.replace(p, '')
    return input_string
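
For illustration only (assuming src/ is on the import path and importing index stays inside its __main__ guard), exercising remove_punctuation:

from index import remove_punctuation

print(remove_punctuation('Hello, world! (test)'))  # prints: Hello world test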

@@ -50,3 +51,4 @@ if __name__ == "__main__":
    args = parser.parse_args()
    if args.rebuild:
        build_index()

src/models.py | 18 (new file)
@@ -0,0 +1,18 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy.dialects.postgresql import UUID
import uuid

Base = declarative_base()

class Website(Base):

    __tablename__ = 'websites'
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    url = Column(String)
    text_content = Column(String)
    html_content = Column(String)
    first_crawl_date = Column(DateTime)
    last_crawl_date = Column(DateTime)
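
A short sketch (not part of the commit) of the model in use, mirroring what crawl.py does, assuming the Postgres instance from src/config.py is reachable:

# Sketch only: create the schema and store one crawled page.
import datetime
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import Base, Website

engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

now = datetime.datetime.now()
session.add(Website(url='https://example.com', text_content='hello',
                    html_content='<p>hello</p>',
                    first_crawl_date=now, last_crawl_date=now))
session.commit()
session.close()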