Refactor to use postgresql end to end

This commit is contained in:
rmgr 2024-03-07 20:44:34 +10:30
parent 8605ee6b2c
commit 20d198e559
5 changed files with 144 additions and 72 deletions

View file

@@ -9,7 +9,7 @@ from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from models import Base, Documents, Document_Tokens, Tokens
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import datetime
@@ -19,11 +19,12 @@ engine = create_engine(DATABASE_URI)
# Issue CREATE TABLE for every model registered on Base's metadata
# (SQLAlchemy skips tables that already exist by default).
Base.metadata.create_all(engine)
# Session factory bound to the engine; call Session() to open a new session.
Session = sessionmaker(bind=engine)
def get_html(url: str) -> str:
def get_html(url: str, timeout: float = 30.0) -> bytes:
    """Fetch *url* with an HTTP GET and return the raw response body.

    The body is returned undecoded (``response.content`` is ``bytes``), so
    the caller or HTML parser decides the character encoding.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The response body as bytes. The HTTP status code is not checked.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return response.content
def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
rp = urllib.robotparser.RobotFileParser()
print(url)
@@ -45,10 +46,10 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
hash.update(url.encode('ascii'))
s = Session()
existing_website = s.query(Website).filter_by(url=url).first()
existing_website = s.query(Documents).filter_by(url=url).first()
print (existing_website)
if existing_website == None:
website = Website(
website = Documents(
url=url,
text_content=soup.get_text(),
html_content=soup.prettify(),