Compare commits
16 commits
| Author | SHA1 | Date |
|---|---|---|
| | bbba459480 | |
| | 2a99a61dbe | |
| | e3c67b64e6 | |
| | 98efe9d1a2 | |
| | bdb4064acc | |
| | 9f0e7e6b29 | |
| | 9d57f66cd7 | |
| | 343410e62f | |
| | 7ee9d978b2 | |
| | d4bb3fb8dc | |
| | 20d198e559 | |
| | 8605ee6b2c | |
| | aed568d11e | |
| | 8903f7a3e5 | |
| | efe6dea1f5 | |
| | f4ea8ad1d7 | |
10 changed files with 632 additions and 113 deletions
33 client/src/css/styles.css (new file)
@@ -0,0 +1,33 @@
html, body {
    height: 100%;
}
body {
    margin: 0;
}
input {
    padding: 7px;
    font-size: 1.1rem;
}
.search-container {
    display: flex;
    justify-content: center;
    align-items: center;
    text-align: center;
    min-height: 25vh;
}

.flex-container {
    padding: 0;
    margin: 0;
    display: flex;
    align-items: center;
    justify-content: center;
    flex-direction: column;
}
.flex-item {
}
.result {
    display:block;
    max-width: 60vw;
    overflow-x: hidden;
}
16 client/src/index.html (new file)
@@ -0,0 +1,16 @@
<html>

<head>
    <link rel="stylesheet" href="css/styles.css">
</head>
<body>
    <div class="search-container">
        <input type="text" class="searchbox" id="searchbox">
    </div>
    <div class="flex-container">
        <div class="flex-item" id="results">
        </div>
    </div>
    <script src="js/index.js"></script>
</body>
</html>
28 client/src/js/index.js (new file)
@@ -0,0 +1,28 @@
function debounce(func, timeout = 300){
    let timer;
    return (...args) => {
        clearTimeout(timer);
        timer = setTimeout(() => { func.apply(this, args); }, timeout);
    };
}
async function search(searchBox){
    const response = await fetch(`http://localhost:5000/search/${searchBox.value}`);
    const results = await response.json();

    const resultView = document.getElementById("results");
    resultView.replaceChildren();
    for (let i = 0; i < results.length; i++){
        let result = results[i];
        let resultElement = document.createElement("a");
        resultElement.innerText = result[0];
        resultElement.href = result[0];
        resultElement.className = "flex-item result";
        resultView.appendChild(resultElement);
    }
}

const searchBoxKeyUp = debounce(() => search())

const searchBox = document.getElementById("searchbox");

searchBox.addEventListener("keyup", debounce(() => search(searchBox)))
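The client renders each element of the JSON response as a link, using `result[0]` for both the anchor text and the `href`. That implies the backend returns an array of `[url, count]` pairs, which matches what `search()` in src/search.py produces with `sorted(results.items(), ...)`. A minimal sketch of that contract; the URLs and counts below are made-up sample values, not real output:

```python
# Hypothetical sample of the /search/<query> response body the client code expects:
# a JSON array of [url, count] pairs, already sorted by descending count.
sample_response = [
    ["https://example.com/a", 12],  # result[0] -> anchor text and href, result[1] -> match count
    ["https://example.com/b", 7],
]

for result in sample_response:
    url, count = result
    assert isinstance(url, str) and isinstance(count, int)
```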
Binary file not shown.
187 src/crawl.py
@@ -1,49 +1,83 @@
 #!/usr/bin/python3

 import argparse
 import requests
-import hashlib
 from urllib.parse import urlparse, urljoin
+import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Website
+from models import Base, Documents
 from sqlalchemy.orm import sessionmaker
-from sqlalchemy import create_engine
 import datetime
+import yt_dlp as youtube_dl
 # TODO- Handle gemini/gopher links
-# TODO- Keep a list of traversed links and check before traversing again

 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)

-def get_html(url: str) -> str:
+excluded_domains = ['amazon.', 'news.ycombinator.',
+                    'facebook.com', 'amzn', 'fb.com']

+excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
+                      ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]


+def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content

-def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:

-    print(url)
+def parse_youtube(video_url: str) -> bool:
-    print(recursion)
+    return
-    urlparts = urlparse(url)
+    # Language preference for subtitles (set to None for auto-generated)
-    baseurl = urlparts.scheme + "://" + urlparts.netloc
+    # Change this to 'en' for English subtitles, or None for auto-generated
-    soup = BeautifulSoup(html,'html.parser')
+    subtitle_language = 'en'
-    hash = hashlib.sha256()
+    # Options for youtube_dl
-    hash.update(url.encode('ascii'))
+    ydl_opts = {
+        'writesubtitles': True,
+        'allsubtitles': True,
+        'skip_download': True,  # We only want to fetch metadata
+        'subtitleslangs': [subtitle_language] if subtitle_language else None,
+        'extractor-args': {'youtube': {'player_client': 'ios,web'}},
+    }

+    # Initialize youtube_dl object
+    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+        # Download metadata
+        info_dict = ydl.extract_info(video_url, download=False)

+        # Extract subtitles
+        subtitles = info_dict.get('subtitles')
+        subtitles_text = ""
+        # Print available subtitles
+        if subtitles:
+            for subs in subtitles.values():
+                for sub in subs:
+                    subtitle_url = sub['url']
+                    with youtube_dl.YoutubeDL({}) as ydl:
+                        subtitle_info = ydl.extract_info(
+                            subtitle_url, download=False)
+                    for subtitle in subtitle_info['subtitles'][subtitle_language]:
+                        if subtitle["ext"] == "srv1":
+                            soup = BeautifulSoup(
+                                get_html(subtitle["url"]), 'html.parser')
+                            subtitles_text = soup.get_text()

     s = Session()
-    existing_website = s.query(Website).filter_by(url=url).first()
+    existing_website = s.query(
-    print (existing_website)
+        Documents).filter_by(url=video_url).first()
-    if existing_website == None:
+    if existing_website is None:
-        website = Website(
+        website = Documents(
-            url=url,
+            url=video_url,
-            text_content=soup.get_text(),
+            text_content=subtitles_text,
-            html_content=soup.prettify(),
+            html_content=None,  # soup.prettify(),
             first_crawl_date=datetime.datetime.now(),
-            last_crawl_date = datetime.datetime.now()
+            last_crawl_date=datetime.datetime.now(),
+            last_index_date=None
         )
         s.add(website)
     else:

@@ -51,54 +85,127 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
         s.add(existing_website)
     s.commit()
     s.close()
-    x = open(f'data/links.txt', 'a')
-    x.close()
-    links = soup.find_all("a")

+def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> bool:
+    for domain in excluded_domains:
+        if domain in url:
+            return
+    if any(ext in url for ext in excluded_filetypes):
+        return
+    if "youtube.com" in url:
+        parse_youtube(url)
+        return
+    rp = urllib.robotparser.RobotFileParser()
+    print(url)
+    print(recursion)
+    urlparts = urlparse(url)
+    baseurl = urlparts.scheme + "://" + urlparts.netloc
+    if baseurl not in robots:
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        robots[baseurl] = rp
+    else:
+        rp = robots[baseurl]
+    if not rp.can_fetch("*", url):
+        print("Robots prevents crawling url: " + url)
+        return

+    soup = BeautifulSoup(html, 'html.parser')

+    s = Session()
+    existing_website = s.query(Documents).filter_by(url=url).first()
+    if existing_website is None:
+        website = Documents(
+            url=url,
+            text_content=soup.get_text(),
+            html_content=soup.prettify(),
+            first_crawl_date=datetime.datetime.now(),
+            last_crawl_date=datetime.datetime.now(),
+            last_index_date=None
+        )
+        s.add(website)
+    else:
+        existing_website.last_crawl_date = datetime.datetime.now()
+        s.add(existing_website)
+    s.commit()
+    s.close()
+    links = soup.find_all("a", href=True)
     for link in links:
         found = False
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if not "http" in link:
+        if any(ext in link for ext in excluded_filetypes):
+            continue
+        if "http" not in link:
             link = urljoin(url, link)
+        link = link.split('?')[0]
+        link = link.split('#')[0]
         if (recursion > 0 and link not in traversed_links):
             try:
                 traversed_links.append(link)
                 link_html = get_html(link)
                 r = recursion - 1
-                sleep(1)
+                sleep(0.5)
                 parse_html(link, link_html, r, traversed_links)
             except:
                 pass
-        # else:
+        # elif link not in traversed_links:
-        #     with open(f'data/links.txt', 'r+') as linksfile:
+        #     with open('data/links.txt', 'r+') as linksfile:
         #         while line := linksfile.readline():
         #             if line.strip() == link.strip():
         #                 found = True
         #         if not found:
         #             linksfile.write(f'{link}\n')

-if __name__ == "__main__":

+def parse_site_map(base_url):
+    map = BeautifulSoup(requests.get(base_url).content, 'xml')
+    print(map.find_all('loc'))
+    for loc in map.find_all('loc'):
+        if "xml" in loc.contents[0]:
+            parse_site_map(loc.contents[0])
+        else:
+            url = loc.contents[0]
+            html = get_html(url)
+            parse_html(url, html, max_recursion)


+if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
     # check inputs
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-    max_recursion = 4
+    parser.add_argument('-s', "--crawl-sitemap", action="store_true")
+    parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)

     args = parser.parse_args()
+    max_recursion = int(args.max_recursion)
+    if args.url == "links":
+        with open('data/links.txt', 'r+') as linksfile:
+            while line := linksfile.readline():
+                if "http" in line:
+                    try:
+                        parse_html(line, get_html(line))
+                    except:
+                        pass
+    elif args.crawl_sitemap:
+        rp = urllib.robotparser.RobotFileParser()
+        urlparts = urlparse(args.url)
+        baseurl = urlparts.scheme + "://" + urlparts.netloc
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        if not rp.can_fetch("*", args.url):
+            print("Robots prevents crawling url: " + args.url)
+            exit(0)
+        if len(rp.site_maps()) > 0:
+            parse_site_map(rp.site_maps()[0])
+    else:
         html = get_html(args.url)
         parse_html(args.url, html, max_recursion)

     # recursion = 0
     # if (args.followlinks):
-    #     with open(f'data/links.txt', 'r+') as linksfile:
+    #     os.remove('data/links.txt')
-    #         while line := linksfile.readline():
-    #             if recursion < max_recursion:
-    #                 if "http" in line:
-    #                     recursion += 1
-    #                     try:
-    #                         parse_html(line, get_html(line))
-    #                     except:
-    #                         pass
-    os.remove('data/links.txt')
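For reference, the robots.txt handling added to parse_html amounts to one RobotFileParser per site, cached in the `robots` dict so each domain's robots.txt is only fetched once. A minimal standalone sketch of that pattern, assuming a reachable robots.txt at the example domain; the function and cache names here are illustrative, not part of the module:

```python
# Minimal sketch of the per-domain robots.txt cache used in parse_html (illustrative only).
import urllib.robotparser
from urllib.parse import urlparse

robots_cache = {}  # hypothetical name; parse_html threads this through as its `robots` argument

def allowed(url: str) -> bool:
    parts = urlparse(url)
    baseurl = parts.scheme + "://" + parts.netloc
    rp = robots_cache.get(baseurl)
    if rp is None:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(baseurl + "/robots.txt")
        rp.read()                      # fetched once per domain, then reused
        robots_cache[baseurl] = rp
    return rp.can_fetch("*", url)      # same wildcard user agent crawl.py uses

if __name__ == "__main__":
    print(allowed("https://example.com/some/page"))
```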
186 src/index.py
@@ -1,54 +1,154 @@
-from sqlalchemy import create_engine
+#!/usr/bin/python3
-from config import DATABASE_URI
-from models import Base, Website
-from pathlib import Path
-import argparse
-import os
-import json
-# investigate ngrams for "multi word" matching
-ignored_words = ['a', 'the','is']

-def remove_punctuation(input_string):
+import argparse
-    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
+from sqlalchemy import create_engine, or_, text
-    for p in punc:
+from sqlalchemy import Table, Column, String, Integer
-        input_string = input_string.replace(p, '')
+from config import DATABASE_URI
-    return input_string
+from sqlalchemy.dialects.postgresql import UUID
+from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.exc import SQLAlchemyError
+import uuid
+import datetime
+import time
+import re
+import random
+from multiprocessing import Pool

+engine = create_engine(DATABASE_URI)
+Base.metadata.create_all(engine)
+Session = sessionmaker(bind=engine)
+# https://docs.sqlalchemy.org/en/20/orm/queryguide/dml.html


+def contains_latin(text):
+    latin_pattern = r'[a-zA-ZÀ-ÖØ-öø-ÿ]'
+    return bool(re.search(latin_pattern, text))


+def build_index_chunk(document_chunk):
+    session = Session()
+    print(len(document_chunk))
+    start_time = time.time_ns()
+    for document in document_chunk:
+        print(document.url)
+        content = re.sub(r'[.,?!]', ' ', str(document.text_content))
+        content = re.sub(r'[^\w\s]', '', str(content))
+        content_words = content.split()
+        build_ngrams(1, content_words, document.id)
+        build_ngrams(2, content_words, document.id)
+        build_ngrams(3, content_words, document.id)
+        build_ngrams(4, content_words, document.id)
+        build_ngrams(5, content_words, document.id)

+        document.last_index_date = datetime.datetime.now()
+        session.merge(document)
+        session.commit()
+    session.close()


 def build_index():
-    with open(f"data/index.json", "w") as index:
+    while True:
-        # get a list of all content files
+        session = Session()
-        # split on whitespace and add to index
+        documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
-        dictionary = {}
+            None), Documents.last_index_date < Documents.last_crawl_date)).limit(100)
-        pathlist = Path('data/content').rglob('*.txt')
+        session.close()
-        for path in pathlist:

-            with open(str(path)) as content_file:
+        # Execute the query to get the result set
-                url = content_file.readline()
+        documents = list(documents_query)
-                content = content_file.read()
+        if len(documents) == 0:
-                content_words = content.split()
+            return
-                for word in content_words:
+        build_index_chunk(documents)
-                    word = word.lower()
+        continue
-                    word = remove_punctuation(word)
+        chunk_size = 10
-                    if not word in ignored_words:
+        document_chunks = [documents[i:i+chunk_size]
-                        if not word in dictionary:
+                           for i in range(0, len(documents), chunk_size)]
-                            dictionary[word] = []
+        with Pool() as pool:
-                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
+            pool.map(build_index_chunk, document_chunks)
-                        if len(matching_urls) == 0:
-                        # if not url.strip() in dictionary[word]:

-                            entries = dictionary[word]
+def zip_ngrams(size: int, corpus, document_id):
-                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
+    size = int(size)
-                            dictionary[word].append(entry)
+    connection = engine.connect()
+    temptbl_name = 'temp_del_{}'.format(
+        time.time_ns() + random.randint(100000, 9999999))
+    temptbl = Table(temptbl_name, Base.metadata, Column('id', UUID(as_uuid=True), index=True), Column(
+        'gram', String, index=True), Column('size', Integer, index=True), extend_existing=True)

+    try:
+        # Start transaction
+        with connection.begin():
+            temptbl.create(engine)
+            insert_grams = []
+            grams = zip(*[corpus[i:] for i in range(size)])
+            for gram in grams:
+                gram = ' '.join(gram).lower()
+                insert_grams.append(
+                    {"id": uuid.uuid4(), "gram": gram, "size": size})
+            connection.execute(temptbl.insert().values(insert_grams))
+            connection.execute(text("UPDATE " + temptbl_name +
+                                    " SET id = ngrams.id FROM ngrams WHERE ngrams.gram = "
+                                    + temptbl_name + ".gram;"))
+            connection.execute(text("INSERT INTO ngrams (id, gram, size) SELECT " +
+                                    " distinct t.id, t.gram as gram, t.size FROM " +
+                                    temptbl_name + " t LEFT JOIN ngrams on ngrams.gram = " +
+                                    "t.gram WHERE ngrams.id is null and t.size is not null " + " ON CONFLICT DO NOTHING;"))
+            connection.execute(text("INSERT INTO document_ngrams(id, document_id, ngram_id) SELECT DISTINCT " +
+                                    "uuid_generate_v4() , '" + str(document_id) + "'::UUID, t.id FROM " + temptbl_name + " t;"))
+    except SQLAlchemyError as e:
+        # Handle exceptions
+        print("An error occurred:", e)
+        # Rollback transaction
+        connection.rollback()
     else:
-                            entries = dictionary[word]
+        # Commit transaction if no exceptions occurred
-                            entry = matching_urls[0]
+        connection.commit()
-                            entry["count"] += 1
+    finally:
-                            entries.sort(reverse=True, key=lambda entry: entry["count"])
+        connection.close()
-        index.write(json.dumps(dictionary))
+        # Drop table outside the transaction block
+        temptbl.drop(engine)


+def build_ngrams(size: int, corpus: str, document_id: str):
+    session = Session()
+    zip_ngrams(size, corpus, document_id)
+    return
+    i = 0
+    grams = []
+    while i < len(corpus):
+        if i + size >= len(corpus):
+            i = len(corpus)
+        gram = ''
+        for n in range(0, size):
+            if i + n >= len(corpus):
+                break
+            gram += corpus[i+n] + ' '
+        gram = gram.strip().lower()
+        if len(gram) > 1000 or gram in grams or not contains_latin(gram):
+            i += 1
+            continue
+        grams.append(gram)
+        if (len(gram) > 1):
+            ngram = session.query(NGrams).filter_by(
+                gram=gram).filter_by(size=size).first()
+            if ngram is None:
+                ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
+                session.add(ngram)
+            document_ngram = Document_NGrams(
+                document_id=document_id, ngram_id=ngram.id)
+            session.add(document_ngram)
+            session.commit()
+        i += 1
+        # print(str((time.time_ns() - start_time)//1_000_000))
+    session.close()


 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
+    parser.add_argument('-r',
+                        "--rebuild",
+                        action="store_true",
+                        help="Blow away the index and rebuild")
     args = parser.parse_args()
     if args.rebuild:
         build_index()
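The core of the new indexer is the sliding-window expansion in zip_ngrams: `zip(*[corpus[i:] for i in range(size)])` builds `size` progressively shifted copies of the token list and zips them, so each tuple is one n-gram. A small self-contained illustration of just that expression; the corpus below is a made-up sample, not project data:

```python
# Illustration of the sliding-window n-gram trick used in zip_ngrams.
corpus = ["the", "quick", "brown", "fox"]   # made-up sample token list
size = 2                                    # bigrams

grams = zip(*[corpus[i:] for i in range(size)])
print([' '.join(g).lower() for g in grams])
# prints: ['the quick', 'quick brown', 'brown fox']
```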
54 src/index.py.old (new file)
@@ -0,0 +1,54 @@
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
import json
# investigate ngrams for "multi word" matching
ignored_words = ['a', 'the','is']

def remove_punctuation(input_string):
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~?!'''
    for p in punc:
        input_string = input_string.replace(p, '')
    return input_string


def build_index():
    with open("data/index.json", "w") as index:
        # get a list of all content files
        # split on whitespace and add to index
        dictionary = {}
        pathlist = Path('data/content').rglob('*.txt')
        for path in pathlist:
            with open(str(path)) as content_file:
                url = content_file.readline()
                content = content_file.read()
                content_words = content.split()
                for word in content_words:
                    word = word.lower()
                    word = remove_punctuation(word)
                    if word not in ignored_words:
                        if word not in dictionary:
                            dictionary[word] = []
                        matching_urls = list(filter(lambda entry: entry["url"] == url.strip(), dictionary[word]))
                        if len(matching_urls) == 0:
                            # if not url.strip() in dictionary[word]:
                            entries = dictionary[word]
                            entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                            dictionary[word].append(entry)
                        else:
                            entries = dictionary[word]
                            entry = matching_urls[0]
                            entry["count"] += 1
                            entries.sort(reverse=True, key=lambda entry: entry["count"])
        index.write(json.dumps(dictionary))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', "--rebuild", action="store_true", help="Blow away the index and rebuild")
    args = parser.parse_args()
    if args.rebuild:
        build_index()
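For context, the retired indexer wrote a single JSON inverted index keyed by word, each entry holding per-URL hit counts. A sketch of the data/index.json shape it produced; the words, URLs, and counts below are invented for illustration:

```python
# Hypothetical example of the index.json structure written by src/index.py.old.
sample_index = {
    "search": [
        {"url": "https://example.com/a", "count": 4, "filename": "data/content/a.txt"},
        {"url": "https://example.com/b", "count": 1, "filename": "data/content/b.txt"},
    ],
}
```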
src/models.py
@@ -1,18 +1,72 @@
 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, Integer, String, DateTime
+from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer
 from sqlalchemy.dialects.postgresql import UUID
+from sqlalchemy.orm import relationship, mapped_column
 import uuid

 Base = declarative_base()

-class Website(Base):

-    __tablename__ = 'websites'
+class Documents(Base):
+    __tablename__ = 'documents'
     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
     url = Column(String)
     text_content = Column(String)
     html_content = Column(String)
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
+    last_index_date = Column(DateTime)
+    document_tokens = relationship(
+        "Document_Tokens", back_populates="document")
+    document_ngrams = relationship(
+        "Document_NGrams", back_populates="document")


+class Document_Tokens(Base):
+    __tablename__ = 'document_tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token_id = mapped_column(ForeignKey("tokens.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document = relationship(
+        "Documents", back_populates="document_tokens", uselist=False)
+    token = relationship("Tokens", back_populates="document_tokens")
+    __table_args__ = (
+        Index('idx_document_tokens_document_id_token_id', 'document_id',
+              'token_id', unique=True, postgresql_using='hash'),
+        Index('idx_document_tokens_clustered', 'document_id',
+              'token_id', postgresql_using='hash'),
+    )


+class Tokens(Base):
+    __tablename__ = 'tokens'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    token = Column(String, index=True)
+    document_tokens = relationship("Document_Tokens", back_populates="token")


+class NGrams(Base):
+    __tablename__ = 'ngrams'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    size = Column(Integer, index=True)
+    gram = Column(String, index=True)
+    document_ngrams = relationship("Document_NGrams", back_populates="ngram")


+class Document_NGrams(Base):
+    __tablename__ = 'document_ngrams'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    ngram_id = mapped_column(ForeignKey("ngrams.id"))
+    document = relationship(
+        "Documents", back_populates="document_ngrams", uselist=False)
+    ngram = relationship("NGrams", back_populates="document_ngrams")

+    __table_args__ = (
+        Index('idx_document_ngrams_document_id_ngram_id', 'document_id',
+              'ngram_id', unique=True, postgresql_using='hash'),
+        Index('idx_document_ngrams_clustered', 'document_id',
+              'ngram_id', postgresql_using='hash'),
+    )
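The schema above is plain declarative SQLAlchemy, so it can be created and populated the same way crawl.py does it. A minimal usage sketch, assuming DATABASE_URI in the project-local config module points at a reachable PostgreSQL instance (the UUID columns are PostgreSQL-specific); the URL and content values are sample data:

```python
# Minimal sketch of using the models, mirroring how crawl.py stores a crawled page.
import datetime
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI   # project-local module
from models import Base, Documents

engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)   # creates the documents/tokens/ngrams tables if missing
Session = sessionmaker(bind=engine)

session = Session()
session.add(Documents(url="https://example.com/",          # sample values
                      text_content="example text",
                      html_content="<html></html>",
                      first_crawl_date=datetime.datetime.now(),
                      last_crawl_date=datetime.datetime.now(),
                      last_index_date=None))
session.commit()
session.close()
```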
164 src/search.py
@@ -1,30 +1,146 @@
-#!/bin/bash
+#!/usr/bin/python3
+from sqlalchemy import create_engine, func, and_, or_, not_
+from config import DATABASE_URI
+from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy.sql.expression import distinct
+import time
 from flask import Flask
-from flask import Request
+from flask_cors import CORS
-import json
+from flask import send_from_directory
 from urllib.parse import unquote

-app = Flask(__name__)
+app = Flask(__name__, static_url_path='/static/')
-## Todo - Boolean search (AND/OR/NOT/"")
+CORS(app)
-@app.route("/search/<query>")
+engine = create_engine(DATABASE_URI)
-def search(query):
+Base.metadata.create_all(engine)
-    with open('data/index.json', 'r') as index_json:
+Session = sessionmaker(bind=engine)
-        index = json.load(index_json)
+# Todo - Boolean search (AND/OR/NOT/"")
-    query = unquote(query)
-    query_split = query.split()

-    result = []
+def split_query(query):
-    for q in query_split:
+    query = query.lower()
-        q = q.lower()
+    result = {'ands': [], 'ors': [], 'words': [],
-        if q in index:
+              'ngrams': [], 'exclusions': []}
-            for item in index[q]:
+    query_words = query.split()
-                matching_results = list(filter(lambda entry: entry['url'] == item["url"], result))
+    i = 0
-                if len(matching_results) == 0:
+    while i < len(query_words):
-                    result.append(item)
+        if i + 1 < len(query_words):
-                else:
+            if query_words[i + 1].lower() == "and":
-                    matching_results[0]["count"] += item["count"]
+                if i + 2 < len(query_words):
+                    result['ands'].append(
+                        query_words[i] + ',' + query_words[i+2])
+                    i = i + 3
+                    continue
+        if query_words[i][0] == '"':
+            n = 0
+            quoted_query = ""
+            while i+n < len(query_words):
+                quoted_query += query_words[i+n] + ' '
+                if query_words[i+n][len(query_words[i+n])-1] == '"':
+                    break
+                n += 1
+            result['ngrams'].append(
+                quoted_query[1:len(quoted_query)-2].rstrip())
+            i += n + 1
+            continue
+        elif query_words[i][0] == "-":
+            excluded_query = query_words[i][1: len(query_words[i])]
+            result['exclusions'].append(excluded_query)
+            i += 1
+            continue
+        result['ngrams'].append(query_words[i])
+        i += 1
     return result

-def handle_and():
-    pass

+@ app.route("/search/<query>")
+def search(query):
+    start_time = time.time_ns()
+    session = Session()
+    results = {}
+    query_words = split_query(unquote(query))
+    print(query_words)
+    if len(query_words['ands']) > 0:
+        print('entering ands: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        for a in query_words['ands']:
+            query = session.query(Documents.url, func.count(1)). \
+                join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
+                join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
+                filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
+                group_by(Documents.url). \
+                having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
+                order_by(func.count(1).desc())

+            # limit(100)
+            print(query)
+            for result in query.all():
+                if result[0] in results.keys():
+                    results[result[0]] += result[1]
+                else:
+                    results[result[0]] = result[1]
+        print('exiting ands: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    if len(query_words['ngrams']) > 0:
+        print('entering ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")

+        q = session.query(Documents.url, func.count(1)) \
+            .join(Document_NGrams, Documents.id == Document_NGrams.document_id) \
+            .join(NGrams, Document_NGrams.ngram_id == NGrams.id) \
+            .group_by(Documents.url)
+        conditions = []
+        for ngram in query_words['ngrams']:
+            conditions.append(
+                (NGrams.size == len(ngram.split(' ')), NGrams.gram == ngram))
+            # q = q.filter_by(size=len(ngram.split(' '))).filter_by(gram=ngram)
+        and_conditions = [and_(*condition_pair)
+                          for condition_pair in conditions]
+        q = q.filter(or_(*and_conditions))
+        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        print(q)
+        x = q.limit(100).all()
+        print('query executed: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+        print(x)
+        for result in x:
+            if result[0] in results.keys():
+                results[result[0]] += result[1]
+            else:
+                results[result[0]] = result[1]
+        # for y in x:
+        #     print(y)
+        #     for document_ngram in y.document_ngrams:
+        #         if document_ngram.document.url in results.keys():
+        #             results[document_ngram.document.url] += 1
+        #         else:
+        #             results[document_ngram.document.url] = 1
+        print('exiting ngrams: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")

+    print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+    session.close()
+    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())]


+# @app.route("/search/<query>")
+# def search(query):
+#     start_time = time.time_ns()
+#     session = Session()
+#     result = {}
+#     query_words = unquote(query).split()
+#     x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
+#     for word in query_words:
+#         word = word.lower()
+#         matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
+#
+#         if matching_ngram is None:
+#             continue
+#         for document_ngram in matching_ngram.document_ngrams:
+#             if document_ngram.document.url in result.keys():
+#                 result[document_ngram.document.url] += 1
+#             else:
+#                 result[document_ngram.document.url] = 1
+#     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
+#     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
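To make the query grammar concrete, here is how split_query appears to break up a couple of inputs, traced by hand from the code above; treat the expected values as a sketch of intent rather than tested output. The import assumes src/ is on the path and the database named in config.py is reachable, since the module creates its tables at import time.

```python
# Hand-traced examples of split_query's behaviour (values inferred from the code above).
from search import split_query

print(split_query("apple and banana"))
# expected: {'ands': ['apple,banana'], 'ors': [], 'words': [], 'ngrams': [], 'exclusions': []}

print(split_query('"climate change" -politics'))
# expected: {'ands': [], 'ors': [], 'words': [],
#            'ngrams': ['climate change'], 'exclusions': ['politics']}
```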
11 todo (new file)
@@ -0,0 +1,11 @@
[x] Refactor website table to generic document table (maybe using URN instead of URL?)
[x] Define tokens table FKed to document table
[x] Refactor index.py to tokenize input into tokens table
[x] Define N-Grams table
[x] Add N-Gram generation to index.py
[x] Add clustered index to document_ngrams table model
[x] Add clustered index to document_tokens table model
[ ] Add ddl command to create partition tables
[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
[x] Instead of starting from a random page on the site, go to root and find site map and crawl that