Refactor to use postgresql end to end
parent 8605ee6b2c
commit 20d198e559
5 changed files with 144 additions and 72 deletions

@@ -9,7 +9,7 @@ from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Website
+from models import Base, Documents, Document_Tokens, Tokens
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy import create_engine
 import datetime
@@ -19,11 +19,12 @@ engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
 
-def get_html(url: str) -> str:
+
+def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
 
 def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
     rp = urllib.robotparser.RobotFileParser()
     print(url)
@@ -45,10 +46,10 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     hash.update(url.encode('ascii'))
 
     s = Session()
-    existing_website = s.query(Website).filter_by(url=url).first()
+    existing_website = s.query(Documents).filter_by(url=url).first()
     print (existing_website)
     if existing_website == None:
-        website = Website(
+        website = Documents(
            url=url,
            text_content=soup.get_text(),
            html_content=soup.prettify(),
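
For orientation, a minimal sketch (not code from the commit) of the look-up-then-insert pattern the crawler now follows against the documents table. The store_page helper name is hypothetical, and the crawl-date values are an assumption based on the Documents model further down.

# Sketch only: query Documents by url, insert a new row if no match exists.
import datetime

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from config import DATABASE_URI
from models import Base, Documents

engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)


def store_page(url: str, text: str, html: str) -> None:
    # Hypothetical helper mirroring the diff: skip pages that were already stored.
    s = Session()
    if s.query(Documents).filter_by(url=url).first() is None:
        s.add(Documents(
            url=url,
            text_content=text,
            html_content=html,
            first_crawl_date=datetime.datetime.now(),  # assumed; the truncated hunk does not show these fields
            last_crawl_date=datetime.datetime.now(),
        ))
        s.commit()
    s.close()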

src/index.py (83 changed lines)
@@ -1,54 +1,53 @@
+#!/usr/bin/python3
+import argparse
+import requests
+import hashlib
+from urllib.parse import urlparse, urljoin
+import urllib.robotparser
+import os
+from time import sleep
+from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Website
-from pathlib import Path
-import argparse
-import os
-import json
-# investigate ngrams for "multi word" matching
-ignored_words = ['a', 'the','is']
-
-def remove_punctuation(input_string):
-    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
-    for p in punc:
-        input_string = input_string.replace(p, '')
-    return input_string
+from models import Base, Documents, Document_Tokens, Tokens
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import create_engine
+import datetime
+
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
 
 
 def build_index():
-    with open(f"data/index.json", "w") as index:
-        # get a list of all content files
-        # split on whitespace and add to index
-        dictionary = {}
-        pathlist = Path('data/content').rglob('*.txt')
-        for path in pathlist:
-            with open(str(path)) as content_file:
-                url = content_file.readline()
-                content = content_file.read()
-                content_words = content.split()
-                for word in content_words:
-                    word = word.lower()
-                    word = remove_punctuation(word)
-                    if not word in ignored_words:
-                        if not word in dictionary:
-                            dictionary[word] = []
-                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
-                        if len(matching_urls) == 0:
-                            # if not url.strip() in dictionary[word]:
-                            entries = dictionary[word]
-                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
-                            dictionary[word].append(entry)
-                        else:
-                            entries = dictionary[word]
-                            entry = matching_urls[0]
-                            entry["count"] += 1
-                        entries.sort(reverse=True, key=lambda entry: entry["count"])
-        index.write(json.dumps(dictionary))
+    session = Session()
+    # Read list of 1000 documents from db
+    documents = session.query(Documents).limit(1000)
+    for document in documents:
+        print(document.url)
+        content_words = document.text_content.split()
+        for word in content_words:
+            word = word.lower()
+            token = session.query(Tokens).filter_by(token=word).first()
+            if token is None:
+                token = Tokens(token=word)
+                session.add(token)
+            document_token = Document_Tokens(document_id=document.id, token_id=token.id)
+            session.add(document_token)
+        session.commit()
+
+    # Foreach document, break into words
+    # Check if word exists in database
+    # Create if not exist
+    # Link to document
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    parser.add_argument('-r',
+                        "--rebuild",
+                        action="store_true",
+                        help="Blow away the index and rebuild")
     args = parser.parse_args()
     if args.rebuild:
         build_index()
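
As before, the rebuild is triggered from the command line with "python3 src/index.py --rebuild"; the difference is that build_index() now reads up to 1000 crawled pages from the documents table and records each word as a tokens row linked through document_tokens, instead of serialising a dictionary to data/index.json.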

src/index.py.old (54 lines, new file)
@@ -0,0 +1,54 @@
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Website
+from pathlib import Path
+import argparse
+import os
+import json
+# investigate ngrams for "multi word" matching
+ignored_words = ['a', 'the','is']
+
+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
+
+
+def build_index():
+    with open("data/index.json", "w") as index:
+        # get a list of all content files
+        # split on whitespace and add to index
+        dictionary = {}
+        pathlist = Path('data/content').rglob('*.txt')
+        for path in pathlist:
+            with open(str(path)) as content_file:
+                url = content_file.readline()
+                content = content_file.read()
+                content_words = content.split()
+                for word in content_words:
+                    word = word.lower()
+                    word = remove_punctuation(word)
+                    if word not in ignored_words:
+                        if word not in dictionary:
+                            dictionary[word] = []
+                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
+                        if len(matching_urls) == 0:
+                            # if not url.strip() in dictionary[word]:
+                            entries = dictionary[word]
+                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
+                            dictionary[word].append(entry)
+                        else:
+                            entries = dictionary[word]
+                            entry = matching_urls[0]
+                            entry["count"] += 1
+                        entries.sort(reverse=True, key=lambda entry: entry["count"])
+        index.write(json.dumps(dictionary))
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    args = parser.parse_args()
+    if args.rebuild:
+        build_index()

@@ -1,18 +1,36 @@
 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, Integer, String, DateTime
+from sqlalchemy import Column, String, DateTime, ForeignKey, Index
 from sqlalchemy.dialects.postgresql import UUID
+from sqlalchemy.orm import relationship, mapped_column
 import uuid
 
 Base = declarative_base()
 
-class Website(Base):
-
-    __tablename__ = 'websites'
-    id = Column(UUID(as_uuid=True), primary_key=True, default = uuid.uuid4)
+class Documents(Base):
+    __tablename__ = 'documents'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
     url = Column(String)
     text_content = Column(String)
     html_content = Column(String)
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
+    document_tokens = relationship("Document_Tokens", back_populates="document")
+
+
+class Document_Tokens(Base):
+    __tablename__ = 'document_tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token_id = mapped_column(ForeignKey("tokens.id"))
+    #Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document = relationship("Documents", back_populates="document_tokens", uselist=False)
+    token = relationship("Tokens", back_populates="document_tokens")
+
+
+class Tokens(Base):
+    __tablename__ = 'tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token = Column(String, index=True)
+    document_tokens = relationship("Document_Tokens", back_populates="token")
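
To make the new schema concrete, a minimal sketch (not code from the commit) of how documents, tokens, and document_tokens fit together, assuming DATABASE_URI points at a reachable PostgreSQL database.

# Sketch only: linking one document and one token through the new tables.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from config import DATABASE_URI
from models import Base, Documents, Document_Tokens, Tokens

engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()

doc = Documents(url="https://example.com", text_content="example page", html_content="<html></html>")
token = Tokens(token="example")
session.add_all([doc, token])
session.flush()  # populates the uuid4 primary keys so they can be used as foreign keys

session.add(Document_Tokens(document_id=doc.id, token_id=token.id))
session.commit()

# Walk back from a token to the documents that contain it via the relationships.
hit = session.query(Tokens).filter_by(token="example").first()
print([dt.document.url for dt in hit.document_tokens])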

@@ -1,30 +1,30 @@
-#!/bin/bash
+#!/usr/bin/python3
+from sqlalchemy import create_engine
+from config import DATABASE_URI
+from models import Base, Tokens
+from sqlalchemy.orm import sessionmaker
+
 from flask import Flask
-from flask import Request
-import json
 from urllib.parse import unquote
 
 app = Flask(__name__)
-## Todo - Boolean search (AND/OR/NOT/"")
+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+# Todo - Boolean search (AND/OR/NOT/"")
 
 
 @app.route("/search/<query>")
 def search(query):
-    with open('data/index.json', 'r') as index_json:
-        index = json.load(index_json)
-        query = unquote(query)
-        query_split = query.split()
-        result = []
-        for q in query_split:
-            q = q.lower()
-            if q in index:
-                for item in index[q]:
-                    matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
-                    if len(matching_results) == 0:
-                        result.append(item)
-                    else:
-                        matching_results[0]["count"] += item["count"]
-        return result
-
-def handle_and():
-    pass
+    session = Session()
+    result = []
+    query_words = unquote(query).split()
+    for word in query_words:
+        word = word.lower()
+        matching_token = session.query(Tokens).filter_by(token=word).first()
+        if session is None:
+            continue
+        for document_token in matching_token.document_tokens:
+            result.append(document_token.document.url)
+    return result
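
A quick way to exercise the rewritten endpoint is Flask's test client. The sketch below is an illustration under assumptions, not code from the repository: the search_app module name is hypothetical, Flask 2.2 or newer is assumed so the returned list is serialised to JSON, and every queried word is assumed to already exist in the tokens table.

# Sketch only: exercising the new /search/<query> route with Flask's test client.
from urllib.parse import quote

from search_app import app  # hypothetical module name; use the real filename of the Flask app above

client = app.test_client()
response = client.get("/search/" + quote("hello world"))
print(response.get_json())  # expected: a JSON list of matching document URLs

Note that, as committed, search() tests whether session is None rather than whether matching_token is None, so a word with no tokens row would presumably raise an AttributeError instead of being skipped.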