Refactor to use postgresql end to end
parent 8605ee6b2c
commit 20d198e559
5 changed files with 144 additions and 72 deletions
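The commit title says the pipeline now runs on PostgreSQL end to end, so DATABASE_URI in config.py presumably points at a PostgreSQL server. As a rough illustration only (config.py is not reproduced in this diff, and the user, password, host, and database name below are made up), a SQLAlchemy-style connection URL would look like:

# config.py -- hypothetical values; only the URL format is standard SQLAlchemy
DATABASE_URI = 'postgresql+psycopg2://scraper:secret@localhost:5432/search_index'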
@@ -9,7 +9,7 @@ from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Website
+from models import Base, Documents, Document_Tokens, Tokens
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy import create_engine
 import datetime
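The new import pulls in Documents, Document_Tokens and Tokens from models; models.py is one of the five changed files, but its hunks are not reproduced here. A minimal sketch of what those declarative models could look like, assuming SQLAlchemy 1.4+ and inferring the Documents columns from the fields used later in this file (url, text_content, html_content); the two token tables are purely illustrative:

from sqlalchemy import Column, ForeignKey, Integer, String, Text
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Documents(Base):
    __tablename__ = 'documents'
    id = Column(Integer, primary_key=True)
    url = Column(String, unique=True)   # lookup key used in parse_html
    text_content = Column(Text)         # soup.get_text()
    html_content = Column(Text)         # soup.prettify()

class Tokens(Base):
    __tablename__ = 'tokens'
    id = Column(Integer, primary_key=True)
    token = Column(String, unique=True)

class Document_Tokens(Base):
    __tablename__ = 'document_tokens'
    id = Column(Integer, primary_key=True)
    document_id = Column(Integer, ForeignKey('documents.id'))
    token_id = Column(Integer, ForeignKey('tokens.id'))
    frequency = Column(Integer)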
@@ -19,11 +19,12 @@ engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
 
-def get_html(url: str) -> str:
+
+def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
 
 def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
     rp = urllib.robotparser.RobotFileParser()
     print(url)
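The sessionmaker bound to the engine is what the scraping code uses further down (s = Session()). A small sketch of the usual open/commit/rollback/close pattern around such a session; save_document and doc are hypothetical names, not part of this commit:

def save_document(doc):
    # Session is the sessionmaker(bind=engine) created above
    s = Session()
    try:
        s.add(doc)
        s.commit()
    except Exception:
        s.rollback()   # keep the session usable after a failed flush
        raise
    finally:
        s.close()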
@@ -45,10 +46,10 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     hash.update(url.encode('ascii'))
 
     s = Session()
-    existing_website = s.query(Website).filter_by(url=url).first()
+    existing_website = s.query(Documents).filter_by(url=url).first()
     print (existing_website)
     if existing_website == None:
-        website = Website(
+        website = Documents(
            url=url,
            text_content=soup.get_text(),
            html_content=soup.prettify(),
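The hunk ends mid-constructor; the lines that actually persist the new Documents row fall outside the shown context. A hedged sketch of how the lookup-then-insert pattern typically completes, assuming the session is committed shortly after the constructor (url, soup and s are the variables already in scope in parse_html):

    # illustrative continuation only -- not shown in the diff
    if existing_website is None:
        website = Documents(
            url=url,
            text_content=soup.get_text(),
            html_content=soup.prettify(),
        )
        s.add(website)
        s.commit()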