Merge postgres changes

This commit is contained in:
rmgr 2024-03-02 19:53:58 +10:30
commit 8903f7a3e5
6 changed files with 71 additions and 13 deletions

12
db/docker-compose.yaml Normal file
View file

@ -0,0 +1,12 @@
# Local development Postgres instance backing the search crawler/indexer.
# NOTE(review): the `version` key is ignored by modern Compose (v2+) — harmless, confirm before removing.
version: '3.9'
services:
  postgres:
    image: postgres:15.3-alpine
    ports:
      # Expose the default Postgres port on the host so src/config.py can
      # reach it at localhost:5432.
      - 5432:5432
    volumes:
      # Persist cluster data across container restarts in ./postgres.
      - ./postgres:/var/lib/postgresql/data
    environment:
      # NOTE(review): hardcoded dev-only credentials (they match the DSN in
      # src/config.py) — do not reuse outside local development.
      - POSTGRES_PASSWORD=S3cret
      - POSTGRES_DB=search

View file

@ -7,15 +7,19 @@ charset-normalizer==3.3.2
 click==8.1.7
 cssselect==1.2.0
 flask==3.0.0
+greenlet==3.0.3
 idna==3.6
 importlib-metadata==6.8.0
 itsdangerous==2.1.2
 Jinja2==3.1.2
 lxml==4.9.3
 MarkupSafe==2.1.3
+psycopg2-binary==2.9.9
 readability-lxml==0.8.1
 requests==2.31.0
 soupsieve==2.5
+SQLAlchemy==2.0.27
+typing-extensions==4.10.0
 urllib3==2.1.0
 werkzeug==3.0.1
 zipp==3.17.0

2
src/config.py Normal file
View file

@ -0,0 +1,2 @@
# SQLAlchemy connection string for the crawler/indexer's Postgres database.
# NOTE(review): credentials, host, and db name are hardcoded and match the
# db/docker-compose.yaml dev defaults — move to environment variables before
# deploying anywhere shared.
DATABASE_URI = 'postgresql+psycopg2://postgres:S3cret@localhost:5432/search'

View file

@ -7,9 +7,20 @@ import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import create_engine
+import datetime
 
 # TODO- Handle gemini/gopher links
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
@ -32,16 +43,24 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
-    filename_text = hash.hexdigest() + '.txt'
-    filename_html = hash.hexdigest() + '.html'
-    with open(f'data/content/{filename_text}', 'w') as outfile:
-        outfile.write(url)
-        outfile.write('\n')
-        outfile.write(soup.get_text())
-    with open(f'data/content/{filename_html}', 'w') as outfile:
-        outfile.write(url)
-        outfile.write('\n')
-        outfile.write(soup.prettify())
+    s = Session()
+    existing_website = s.query(Website).filter_by(url=url).first()
+    print (existing_website)
+    if existing_website == None:
+        website = Website(
+            url=url,
+            text_content=soup.get_text(),
+            html_content=soup.prettify(),
+            first_crawl_date=datetime.datetime.now(),
+            last_crawl_date = datetime.datetime.now()
+        )
+        s.add(website)
+    else:
+        existing_website.last_crawl_date = datetime.datetime.now()
+        s.add(existing_website)
+    s.commit()
+    s.close()
     x = open(f'data/links.txt', 'a')
     x.close()
     links = soup.find_all("a")
@ -72,6 +91,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
         linksfile.write(f'{link}\n')
 
 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
     # check inputs
     parser = argparse.ArgumentParser()

8
src/index.py Executable file → Normal file
View file

@ -1,5 +1,6 @@
-#!/usr/bin/python3
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
 from pathlib import Path
 import argparse
 import os
@ -8,7 +9,7 @@ import json
 ignored_words = ['a', 'the','is']
 
 def remove_punctuation(input_string):
-    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
     for p in punc:
         input_string = input_string.replace(p, '')
     return input_string
@ -50,3 +51,4 @@ if __name__ == "__main__":
     args = parser.parse_args()
     if args.rebuild:
         build_index()

18
src/models.py Normal file
View file

@ -0,0 +1,18 @@
"""SQLAlchemy ORM model for pages stored by the crawler."""
import uuid

from sqlalchemy import Column, DateTime, String
from sqlalchemy.dialects.postgresql import UUID
# declarative_base() lives in sqlalchemy.orm since 1.4; importing it from
# sqlalchemy.ext.declarative is deprecated under the pinned SQLAlchemy 2.0.x.
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Website(Base):
    """One crawled web page: its URL, extracted text/HTML, and crawl dates."""

    __tablename__ = 'websites'

    # Surrogate primary key, generated client-side (uuid4) — no DB sequence needed.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # NOTE(review): the crawler looks pages up by url; consider unique=True/index=True.
    url = Column(String)
    # Plain-text extraction of the page body.
    text_content = Column(String)
    # Prettified raw HTML of the page.
    html_content = Column(String)
    first_crawl_date = Column(DateTime)
    last_crawl_date = Column(DateTime)

    def __repr__(self) -> str:
        return f"Website(id={self.id!r}, url={self.url!r})"