Begin adding Postgresql support instead of filesystem flat files

parent b43343e0ee · commit 24ee04c0ff

6 changed files with 80 additions and 13 deletions

db/docker-compose.yaml (new file, 12 additions)
@@ -0,0 +1,12 @@
+version: '3.9'
+
+services:
+  postgres:
+    image: postgres:15.3-alpine
+    ports:
+      - 5432:5432
+    volumes:
+      - ./postgres:/var/lib/postgresql/data
+    environment:
+      - POSTGRES_PASSWORD=S3cret
+      - POSTGRES_DB=search
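
With the container defined above, running "docker compose up -d" from the db/ directory brings up PostgreSQL on localhost:5432 with the "search" database and the S3cret password that src/config.py expects. A minimal connectivity check, as a sketch only, assuming the container is running and src/ is on the import path:

    # Sketch: verify the compose-managed Postgres is reachable via the
    # same URI the crawler will use.
    from sqlalchemy import create_engine, text

    from config import DATABASE_URI

    engine = create_engine(DATABASE_URI)
    with engine.connect() as conn:
        print(conn.execute(text("SELECT version()")).scalar())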

requirements.txt (4 additions)
@@ -7,15 +7,19 @@ charset-normalizer==3.3.2
 click==8.1.7
 cssselect==1.2.0
 flask==3.0.0
+greenlet==3.0.3
 idna==3.6
 importlib-metadata==6.8.0
 itsdangerous==2.1.2
 Jinja2==3.1.2
 lxml==4.9.3
 MarkupSafe==2.1.3
+psycopg2-binary==2.9.9
 readability-lxml==0.8.1
 requests==2.31.0
 soupsieve==2.5
+SQLAlchemy==2.0.27
+typing-extensions==4.10.0
 urllib3==2.1.0
 werkzeug==3.0.1
 zipp==3.17.0

src/config.py (new file, 2 additions)
@@ -0,0 +1,2 @@
+DATABASE_URI = 'postgresql+psycopg2://postgres:S3cret@localhost:5432/search'
+
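
DATABASE_URI hard-codes the same credentials and database name as db/docker-compose.yaml. A hypothetical variation, not part of this commit, would let an environment variable override the default so the password does not have to live in source:

    # Hypothetical variant (illustration only): read DATABASE_URI from the
    # environment, falling back to the local development default.
    import os

    DATABASE_URI = os.environ.get(
        'DATABASE_URI',
        'postgresql+psycopg2://postgres:S3cret@localhost:5432/search',
    )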

src/crawl.py (43 changes)
@@ -6,14 +6,26 @@ from urllib.parse import urlparse, urljoin
 import os
 from time import sleep
 from bs4 import BeautifulSoup
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import create_engine
+import datetime
 # TODO- Handle gemini/gopher links
 # TODO- Keep a list of traversed links and check before traversing again

+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+
 def get_html(url: str) -> str:

     response = requests.get(url)
     return response.content


 def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:

     print(url)
     print(recursion)
     urlparts = urlparse(url)
@@ -21,16 +33,24 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
-    filename_text = hash.hexdigest() + '.txt'
-    filename_html = hash.hexdigest() + '.html'
-    with open(f'data/content/{filename_text}', 'w') as outfile:
-        outfile.write(url)
-        outfile.write('\n')
-        outfile.write(soup.get_text())
-    with open(f'data/content/{filename_html}', 'w') as outfile:
-        outfile.write(url)
-        outfile.write('\n')
-        outfile.write(soup.prettify())
+    s = Session()
+    existing_website = s.query(Website).filter_by(url=url).first()
+    print (existing_website)
+    if existing_website == None:
+        website = Website(
+            url=url,
+            text_content=soup.get_text(),
+            html_content=soup.prettify(),
+            first_crawl_date=datetime.datetime.now(),
+            last_crawl_date = datetime.datetime.now()
+        )
+        s.add(website)
+    else:
+        existing_website.last_crawl_date = datetime.datetime.now()
+        s.add(existing_website)
+    s.commit()
+    s.close()
     x = open(f'data/links.txt', 'a')
     x.close()
     links = soup.find_all("a")
@@ -59,6 +79,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
             # linksfile.write(f'{link}\n')

 if __name__ == "__main__":

     os.makedirs("data/content", exist_ok=True)
     # check inputs
     parser = argparse.ArgumentParser()
@@ -68,7 +89,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
     html = get_html(args.url)
     parse_html(args.url, html, max_recursion)

     # recursion = 0
     # if (args.followlinks):
     #     with open(f'data/links.txt', 'r+') as linksfile:
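
The new persistence path in parse_html opens a session, checks whether the URL has been stored before, inserts a Website row or refreshes last_crawl_date, then commits and closes. A sketch of the same insert-or-update flow using SQLAlchemy's session-as-context-manager style, purely as an illustration (the helper name save_page is hypothetical; it assumes the Session factory and Website model introduced in this commit):

    import datetime

    def save_page(url: str, text: str, html: str) -> None:
        # Session.begin() commits on success, rolls back on error, and
        # closes the session, so no explicit s.commit()/s.close() is needed.
        with Session.begin() as s:
            existing = s.query(Website).filter_by(url=url).first()
            if existing is None:
                s.add(Website(
                    url=url,
                    text_content=text,
                    html_content=html,
                    first_crawl_date=datetime.datetime.now(),
                    last_crawl_date=datetime.datetime.now(),
                ))
            else:
                existing.last_crawl_date = datetime.datetime.now()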

src/index.py (14 changes, Executable file → Normal file)
@@ -1,5 +1,6 @@
-#!/usr/bin/python3
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
 from pathlib import Path
 import argparse
 import os
@@ -7,6 +8,13 @@ import json
 # investigate ngrams for "multi word" matching
 ignored_words = ['a', 'the','is']

+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
+
+
 def build_index():
     with open(f"data/index.json", "w") as index:
         # get a list of all content files
@@ -20,6 +28,7 @@ def build_index():
             content_words = content.split()
             for word in content_words:
                 word = word.lower()
+                word = remove_punctuation(word)
                 if not word in ignored_words:
                     if not word in dictionary:
                         dictionary[word] = []
@@ -42,3 +51,4 @@ if __name__ == "__main__":
     args = parser.parse_args()
     if args.rebuild:
         build_index()
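
The new remove_punctuation helper strips a fixed set of characters from each token after lower-casing, so variants of the same word collapse to a single index key. A small illustration, not part of the commit:

    # Illustration only: assumes remove_punctuation from src/index.py is importable.
    from index import remove_punctuation

    print(remove_punctuation("Hello, world!"))        # Hello world
    print(remove_punctuation("state-of-the-art..."))  # stateoftheart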

src/models.py (new file, 18 additions)
@@ -0,0 +1,18 @@
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Column, Integer, String, DateTime
+from sqlalchemy.dialects.postgresql import UUID
+import uuid
+
+Base = declarative_base()
+
+class Website(Base):
+
+    __tablename__ = 'websites'
+    id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4)
+    url = Column(String)
+    text_content = Column(String)
+    html_content = Column(String)
+    first_crawl_date = Column(DateTime)
+    last_crawl_date = Column(DateTime)
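
With the Website model in place, crawled pages can be inspected straight from PostgreSQL instead of from data/content/. A minimal sketch, assuming the DATABASE_URI and models introduced in this commit:

    # Sketch: list crawled URLs, most recently crawled first.
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    from config import DATABASE_URI
    from models import Website

    engine = create_engine(DATABASE_URI)
    Session = sessionmaker(bind=engine)

    with Session() as s:
        for site in s.query(Website).order_by(Website.last_crawl_date.desc()).all():
            print(site.last_crawl_date, site.url)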