Merge postgres changes

rmgr 2024-03-02 19:53:58 +10:30
commit 8903f7a3e5
6 changed files with 71 additions and 13 deletions


@@ -7,9 +7,20 @@ import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import datetime
# TODO- Handle gemini/gopher links
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
def get_html(url: str) -> str:
    response = requests.get(url)
    return response.content
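The new imports above pull in DATABASE_URI from config and Base/Website from models, neither of which appears in this hunk. A minimal sketch of what those modules might contain, inferred only from the fields used further down in the diff (module layout, table name, and column types are assumptions, not the repository's actual definitions):

# models.py (sketch, inferred from the fields the crawler sets below)
from sqlalchemy import Column, Integer, String, Text, DateTime
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Website(Base):
    __tablename__ = 'websites'            # assumed table name
    id = Column(Integer, primary_key=True)
    url = Column(String, unique=True)      # looked up with filter_by(url=url)
    text_content = Column(Text)
    html_content = Column(Text)
    first_crawl_date = Column(DateTime)
    last_crawl_date = Column(DateTime)

# config.py (sketch) -- a postgres connection string of some form, e.g.:
# DATABASE_URI = 'postgresql://user:password@localhost:5432/crawler'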
@@ -32,16 +43,24 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
    soup = BeautifulSoup(html,'html.parser')
    hash = hashlib.sha256()
    hash.update(url.encode('ascii'))
    filename_text = hash.hexdigest() + '.txt'
    filename_html = hash.hexdigest() + '.html'
    with open(f'data/content/{filename_text}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.get_text())
    with open(f'data/content/{filename_html}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.prettify())
    s = Session()
    existing_website = s.query(Website).filter_by(url=url).first()
    print (existing_website)
    if existing_website == None:
        website = Website(
            url=url,
            text_content=soup.get_text(),
            html_content=soup.prettify(),
            first_crawl_date=datetime.datetime.now(),
            last_crawl_date = datetime.datetime.now()
        )
        s.add(website)
    else:
        existing_website.last_crawl_date = datetime.datetime.now()
        s.add(existing_website)
    s.commit()
    s.close()
    x = open(f'data/links.txt', 'a')
    x.close()
    links = soup.find_all("a")
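The hunk above persists each crawled page with a read-then-write pattern: query the row by url, insert it with both crawl dates on first sight, otherwise just bump last_crawl_date. A small usage sketch, assuming the Website model sketched earlier, showing how the stored rows could be read back for inspection (not part of this commit):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import Website

engine = create_engine(DATABASE_URI)
Session = sessionmaker(bind=engine)

s = Session()
# List the ten most recently crawled pages, newest first.
for site in s.query(Website).order_by(Website.last_crawl_date.desc()).limit(10):
    print(site.url, site.last_crawl_date)
s.close()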
@@ -72,6 +91,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
        linksfile.write(f'{link}\n')
if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)
    # check inputs
    parser = argparse.ArgumentParser()