From 24ee04c0ff3c49c2646aaccffd6544cf033878b1 Mon Sep 17 00:00:00 2001
From: rmgr
Date: Fri, 1 Mar 2024 21:12:40 +1030
Subject: [PATCH] Begin adding Postgresql support instead of filesystem flat
 files

---
 db/docker-compose.yaml | 12 ++++++++++++
 requirements.txt       |  4 ++++
 src/config.py          |  2 ++
 src/crawl.py           | 43 +++++++++++++++++++++++++++++++-----------
 src/index.py           | 14 ++++++++++++--
 src/models.py          | 18 ++++++++++++++++++
 6 files changed, 80 insertions(+), 13 deletions(-)
 create mode 100644 db/docker-compose.yaml
 create mode 100644 src/config.py
 mode change 100755 => 100644 src/index.py
 create mode 100644 src/models.py

diff --git a/db/docker-compose.yaml b/db/docker-compose.yaml
new file mode 100644
index 0000000..f7d2c11
--- /dev/null
+++ b/db/docker-compose.yaml
@@ -0,0 +1,12 @@
+version: '3.9'
+
+services:
+  postgres:
+    image: postgres:15.3-alpine
+    ports:
+      - 5432:5432
+    volumes:
+      - ./postgres:/var/lib/postgresql/data
+    environment:
+      - POSTGRES_PASSWORD=S3cret
+      - POSTGRES_DB=search
diff --git a/requirements.txt b/requirements.txt
index 3321311..b2f64e6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,15 +7,19 @@ charset-normalizer==3.3.2
 click==8.1.7
 cssselect==1.2.0
 flask==3.0.0
+greenlet==3.0.3
 idna==3.6
 importlib-metadata==6.8.0
 itsdangerous==2.1.2
 Jinja2==3.1.2
 lxml==4.9.3
 MarkupSafe==2.1.3
+psycopg2-binary==2.9.9
 readability-lxml==0.8.1
 requests==2.31.0
 soupsieve==2.5
+SQLAlchemy==2.0.27
+typing-extensions==4.10.0
 urllib3==2.1.0
 werkzeug==3.0.1
 zipp==3.17.0
diff --git a/src/config.py b/src/config.py
new file mode 100644
index 0000000..2b38ec2
--- /dev/null
+++ b/src/config.py
@@ -0,0 +1,2 @@
+DATABASE_URI = 'postgresql+psycopg2://postgres:S3cret@localhost:5432/search'
+
diff --git a/src/crawl.py b/src/crawl.py
index dcac7ed..bc6470d 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -6,14 +6,26 @@ from urllib.parse import urlparse, urljoin
 import os
 from time import sleep
 from bs4 import BeautifulSoup
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import create_engine
+import datetime
 
 # TODO- Handle gemini/gopher links
 # TODO- Keep a list of traversed links and check before traversing again
 
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+
 def get_html(url: str) -> str:
+
     response = requests.get(url)
     return response.content
 
 def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
+    print(url)
     print(recursion)
     urlparts = urlparse(url)
@@ -21,16 +33,24 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
-    filename_text = hash.hexdigest() + '.txt'
-    filename_html = hash.hexdigest() + '.html'
-    with open(f'data/content/{filename_text}', 'w') as outfile:
-        outfile.write(url)
-        outfile.write('\n')
-        outfile.write(soup.get_text())
-    with open(f'data/content/{filename_html}', 'w') as outfile:
-        outfile.write(url)
-        outfile.write('\n')
-        outfile.write(soup.prettify())
+
+    s = Session()
+    existing_website = s.query(Website).filter_by(url=url).first()
+    print (existing_website)
+    if existing_website == None:
+        website = Website(
+            url=url,
+            text_content=soup.get_text(),
+            html_content=soup.prettify(),
+            first_crawl_date=datetime.datetime.now(),
+            last_crawl_date = datetime.datetime.now()
+        )
+        s.add(website)
+    else:
+        existing_website.last_crawl_date = datetime.datetime.now()
+        s.add(existing_website)
+    s.commit()
+    s.close()
     x = open(f'data/links.txt', 'a')
     x.close()
     links = soup.find_all("a")
@@ -59,6 +79,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
 #            linksfile.write(f'{link}\n')
 
 if __name__ == "__main__":
+    os.makedirs("data/content", exist_ok=True)
     # check inputs
 
     parser = argparse.ArgumentParser()
@@ -68,7 +89,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
     html = get_html(args.url)
     parse_html(args.url, html, max_recursion)
-    
+
 #    recursion = 0
 #    if (args.followlinks):
 #        with open(f'data/links.txt', 'r+') as linksfile:
diff --git a/src/index.py b/src/index.py
old mode 100755
new mode 100644
index f55a356..e04c787
--- a/src/index.py
+++ b/src/index.py
@@ -1,5 +1,6 @@
-#!/usr/bin/python3
-
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
 from pathlib import Path
 import argparse
 import os
@@ -7,6 +8,13 @@ import json
 # investigate ngrams for "multi word" matching
 ignored_words = ['a', 'the','is']
 
+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
+
+
 def build_index():
     with open(f"data/index.json", "w") as index:
         # get a list of all content files
@@ -20,6 +28,7 @@ def build_index():
             content_words = content.split()
             for word in content_words:
                 word = word.lower()
+                word = remove_punctuation(word)
                 if not word in ignored_words:
                     if not word in dictionary:
                         dictionary[word] = []
@@ -42,3 +51,4 @@ if __name__ == "__main__":
     args = parser.parse_args()
     if args.rebuild:
         build_index()
+
diff --git a/src/models.py b/src/models.py
new file mode 100644
index 0000000..ee768d4
--- /dev/null
+++ b/src/models.py
@@ -0,0 +1,18 @@
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Column, Integer, String, DateTime
+from sqlalchemy.dialects.postgresql import UUID
+import uuid
+
+Base = declarative_base()
+
+class Website(Base):
+
+    __tablename__ = 'websites'
+    id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4)
+    url = Column(String)
+    text_content = Column(String)
+    html_content = Column(String)
+    first_crawl_date = Column(DateTime)
+    last_crawl_date = Column(DateTime)
+
+
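
A quick way to exercise the new Postgres-backed crawl path (a sketch, not
guaranteed by the patch itself; it assumes Docker is available and that
crawl.py takes the URL as a positional argument, since the
parser.add_argument calls fall outside the diff context):

    cd db && docker compose up -d      # postgres:15.3-alpine on localhost:5432
    cd .. && pip install -r requirements.txt
    cd src && python crawl.py https://example.com

And a minimal sketch for inspecting what the crawler stored, mirroring the
engine/session setup crawl.py uses; run it from src/ so config.py and
models.py resolve. The query itself is illustrative, not part of the patch:

    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    from config import DATABASE_URI
    from models import Website

    engine = create_engine(DATABASE_URI)
    session = sessionmaker(bind=engine)()

    # Most recently crawled pages first; Website maps the 'websites'
    # table created by Base.metadata.create_all() in crawl.py.
    for site in session.query(Website).order_by(
            Website.last_crawl_date.desc()).limit(5):
        print(site.url, site.last_crawl_date)
    session.close()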