Merge postgres changes
commit 8903f7a3e5
6 changed files with 71 additions and 13 deletions

src/config.py | 2 (new file)
@@ -0,0 +1,2 @@
DATABASE_URI = 'postgresql+psycopg2://postgres:S3cret@localhost:5432/search'
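
The URI above hardcodes credentials. As a sketch only, not part of this commit, the same value could come from the environment, with the hardcoded string as a local-dev fallback:

# Sketch, not part of the commit: keep credentials out of version control.
import os

DATABASE_URI = os.environ.get(
    'DATABASE_URI',
    'postgresql+psycopg2://postgres:S3cret@localhost:5432/search',
)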

src/crawl.py | 40
@@ -7,9 +7,20 @@ import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import datetime
# TODO- Handle gemini/gopher links

engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

def get_html(url: str) -> str:

    response = requests.get(url)
    return response.content
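
A side note on the hunk above, as a sketch rather than a change to the commit: requests' response.content is bytes, while get_html is annotated -> str; response.text is the variant that actually returns str:

# Sketch only: a get_html variant whose return value matches its annotation.
import requests

def get_html(url: str) -> str:
    response = requests.get(url)
    return response.text  # .text decodes to str; .content is raw bytes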

@@ -32,16 +43,24 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
    soup = BeautifulSoup(html, 'html.parser')
    hash = hashlib.sha256()
    hash.update(url.encode('ascii'))
    filename_text = hash.hexdigest() + '.txt'
    filename_html = hash.hexdigest() + '.html'
    with open(f'data/content/{filename_text}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.get_text())
    with open(f'data/content/{filename_html}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.prettify())

    s = Session()
    existing_website = s.query(Website).filter_by(url=url).first()
    print(existing_website)
    if existing_website is None:
        website = Website(
            url=url,
            text_content=soup.get_text(),
            html_content=soup.prettify(),
            first_crawl_date=datetime.datetime.now(),
            last_crawl_date=datetime.datetime.now()
        )
        s.add(website)
    else:
        existing_website.last_crawl_date = datetime.datetime.now()
        s.add(existing_website)
    s.commit()
    s.close()
    x = open(f'data/links.txt', 'a')
    x.close()
    links = soup.find_all("a")
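
The session block above is a manual upsert keyed on url: insert the row if it is new, otherwise refresh last_crawl_date. The same pattern in isolation, as a hypothetical helper (upsert_website is not in this commit):

# Hypothetical helper, not part of the commit: parse_html's
# query-then-insert-or-update pattern on the Website model.
import datetime
from models import Website

def upsert_website(session, url, text, html):
    row = session.query(Website).filter_by(url=url).first()
    now = datetime.datetime.now()
    if row is None:
        row = Website(url=url, text_content=text, html_content=html,
                      first_crawl_date=now, last_crawl_date=now)
    else:
        row.last_crawl_date = now
    session.add(row)
    session.commit()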

@@ -72,6 +91,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
        linksfile.write(f'{link}\n')

if __name__ == "__main__":

    os.makedirs("data/content", exist_ok=True)
    # check inputs
    parser = argparse.ArgumentParser()

src/index.py | 8 (Executable file → Normal file)
@@ -1,5 +1,6 @@
#!/usr/bin/python3

from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os

@@ -8,7 +9,7 @@ import json
ignored_words = ['a', 'the', 'is']

def remove_punctuation(input_string):
-    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
    for p in punc:
        input_string = input_string.replace(p, '')
    return input_string
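
For illustration only (assuming src/ is on the import path and importing index stays inside its __main__ guard), exercising remove_punctuation:

from index import remove_punctuation

print(remove_punctuation('Hello, world! (test)'))  # prints: Hello world test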

@@ -50,3 +51,4 @@ if __name__ == "__main__":
    args = parser.parse_args()
    if args.rebuild:
        build_index()

src/models.py | 18 (new file)
@@ -0,0 +1,18 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy.dialects.postgresql import UUID
import uuid

Base = declarative_base()

class Website(Base):

    __tablename__ = 'websites'
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    url = Column(String)
    text_content = Column(String)
    html_content = Column(String)
    first_crawl_date = Column(DateTime)
    last_crawl_date = Column(DateTime)
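
A short sketch (not part of the commit) of the model in use, mirroring what crawl.py does, assuming the Postgres instance from src/config.py is reachable:

# Sketch only: create the schema and store one crawled page.
import datetime
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import Base, Website

engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

now = datetime.datetime.now()
session.add(Website(url='https://example.com', text_content='hello',
                    html_content='<p>hello</p>',
                    first_crawl_date=now, last_crawl_date=now))
session.commit()
session.close()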