Begin adding PostgreSQL support instead of filesystem flat files

This commit is contained in:
rmgr 2024-03-01 21:12:40 +10:30
parent b43343e0ee
commit 24ee04c0ff
6 changed files with 80 additions and 13 deletions

2
src/config.py Normal file
View file

@@ -0,0 +1,2 @@
import os

# SQLAlchemy connection string for the search database.
# NOTE(review): credentials were committed in plain text — the 'S3cret'
# password is now in git history and should be rotated. Prefer supplying
# DATABASE_URI via the environment; the hard-coded value remains only as
# a backward-compatible local-dev default.
DATABASE_URI = os.environ.get(
    'DATABASE_URI',
    'postgresql+psycopg2://postgres:S3cret@localhost:5432/search',
)

View file

@@ -6,14 +6,26 @@ from urllib.parse import urlparse, urljoin
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import datetime
# TODO- Handle gemini/gopher links
# TODO- Keep a list of traversed links and check before traversing again
# Create the engine once at import time; connections are opened lazily
# on first use, not here.
engine = create_engine(DATABASE_URI)
# Idempotent: emits CREATE TABLE only for tables that do not yet exist.
Base.metadata.create_all(engine)
# Session factory bound to this engine; call Session() per unit of work.
Session = sessionmaker(bind=engine)
def get_html(url: str) -> bytes:
    """Fetch *url* and return the raw response body.

    Returns bytes (``response.content``), which BeautifulSoup accepts
    directly, so callers need no decoding step. The original ``-> str``
    annotation was wrong — the function has always returned bytes.
    """
    # A timeout keeps the crawler from hanging forever on an
    # unresponsive host; 30s is generous for a single page fetch.
    response = requests.get(url, timeout=30)
    # NOTE(review): consider response.raise_for_status() so HTTP error
    # pages are not stored as content — omitted to preserve current
    # behavior.
    return response.content
def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
print(url)
print(recursion)
urlparts = urlparse(url)
@@ -21,16 +33,24 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
soup = BeautifulSoup(html,'html.parser')
hash = hashlib.sha256()
hash.update(url.encode('ascii'))
filename_text = hash.hexdigest() + '.txt'
filename_html = hash.hexdigest() + '.html'
with open(f'data/content/{filename_text}', 'w') as outfile:
outfile.write(url)
outfile.write('\n')
outfile.write(soup.get_text())
with open(f'data/content/{filename_html}', 'w') as outfile:
outfile.write(url)
outfile.write('\n')
outfile.write(soup.prettify())
s = Session()
existing_website = s.query(Website).filter_by(url=url).first()
print (existing_website)
if existing_website == None:
website = Website(
url=url,
text_content=soup.get_text(),
html_content=soup.prettify(),
first_crawl_date=datetime.datetime.now(),
last_crawl_date = datetime.datetime.now()
)
s.add(website)
else:
existing_website.last_crawl_date = datetime.datetime.now()
s.add(existing_website)
s.commit()
s.close()
x = open(f'data/links.txt', 'a')
x.close()
links = soup.find_all("a")
@@ -59,6 +79,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
# linksfile.write(f'{link}\n')
if __name__ == "__main__":
os.makedirs("data/content", exist_ok=True)
# check inputs
parser = argparse.ArgumentParser()
@@ -68,7 +89,7 @@ if __name__ == "__main__":
args = parser.parse_args()
html = get_html(args.url)
parse_html(args.url, html, max_recursion)
# recursion = 0
# if (args.followlinks):
# with open(f'data/links.txt', 'r+') as linksfile:

14
src/index.py Executable file → Normal file
View file

@@ -1,5 +1,6 @@
#!/usr/bin/python3
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
@@ -7,6 +8,13 @@ import json
# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the','is']
def remove_punctuation(input_string):
    """Return *input_string* with all punctuation characters deleted.

    Removes the same character set as the original replace-loop version.
    Note the original ``'''...\,...'''`` literal is not a recognized
    escape, so the set includes a literal backslash as well as the comma.
    """
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
    # str.translate does one C-level pass instead of ~30 chained
    # .replace() calls; mapping each character to None deletes it
    # (duplicate entries like the repeated '?' and '!' are harmless).
    return input_string.translate(str.maketrans('', '', punc))
def build_index():
with open(f"data/index.json", "w") as index:
# get a list of all content files
@@ -20,6 +28,7 @@ def build_index():
content_words = content.split()
for word in content_words:
word = word.lower()
word = remove_punctuation(word)
if not word in ignored_words:
if not word in dictionary:
dictionary[word] = []
@@ -42,3 +51,4 @@ if __name__ == "__main__":
args = parser.parse_args()
if args.rebuild:
build_index()

18
src/models.py Normal file
View file

@@ -0,0 +1,18 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy.dialects.postgresql import UUID
import uuid
Base = declarative_base()
class Website(Base):
    """ORM model for one crawled page: its URL, extracted text/HTML
    snapshots, and first/last crawl timestamps."""
    __tablename__ = 'websites'
    # Random UUID primary key; as_uuid=True yields uuid.UUID objects
    # on the Python side rather than strings.
    id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4)
    # NOTE(review): the crawler looks rows up by url (filter_by(url=...)),
    # so this column should probably be unique=True and indexed — confirm.
    url = Column(String)
    # Plain-text extraction of the page body.
    text_content = Column(String)
    # Prettified HTML snapshot of the page.
    html_content = Column(String)
    # Set once when the page is first stored.
    first_crawl_date = Column(DateTime)
    # Updated on every subsequent crawl of the same url.
    last_crawl_date = Column(DateTime)