diff --git a/client/src/css/styles.css b/client/src/css/styles.css
new file mode 100644
index 0000000..37323ab
--- /dev/null
+++ b/client/src/css/styles.css
@@ -0,0 +1,33 @@
+html, body {
+ height: 100%;
+}
+body {
+ margin: 0;
+}
+input {
+ padding: 7px;
+ font-size: 1.1rem;
+}
+.search-container {
+ display: flex;
+ justify-content: center;
+ align-items: center;
+ text-align: center;
+ min-height: 25vh;
+}
+
+.flex-container {
+ padding: 0;
+ margin: 0;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ flex-direction: column;
+}
+.flex-item {
+}
+.result {
+ display: block;
+ max-width: 60vw;
+ overflow-x: hidden;
+}
diff --git a/client/src/index.html b/client/src/index.html
new file mode 100644
index 0000000..a748d6c
--- /dev/null
+++ b/client/src/index.html
@@ -0,0 +1,16 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <title>Search</title>
+    <link rel="stylesheet" href="css/styles.css">
+</head>
+<body>
+    <div class="search-container">
+        <input type="text" id="searchbox">
+    </div>
+    <div id="results" class="flex-container">
+    </div>
+    <script src="js/index.js"></script>
+</body>
+</html>
diff --git a/client/src/js/index.js b/client/src/js/index.js
new file mode 100644
index 0000000..09b0bb2
--- /dev/null
+++ b/client/src/js/index.js
@@ -0,0 +1,28 @@
+function debounce(func, timeout = 300){
+ let timer;
+ return (...args) => {
+ clearTimeout(timer);
+ timer = setTimeout(() => { func.apply(this, args); }, timeout);
+ };
+}
+async function search(searchBox){
+ const response = await fetch(`http://localhost:5000/search/${encodeURIComponent(searchBox.value)}`);
+ const results = await response.json();
+
+ const resultView = document.getElementById("results");
+ resultView.replaceChildren();
+ for (let i = 0; i < results.length; i++){
+ let result = results[i];
+ let resultElement = document.createElement("a");
+ resultElement.innerText = result[0];
+ resultElement.href = result[0];
+ resultElement.className = "flex-item result";
+ resultView.appendChild(resultElement);
+ }
+}
+
+const searchBox = document.getElementById("searchbox");
+
+const searchBoxKeyUp = debounce(() => search(searchBox));
+
+searchBox.addEventListener("keyup", searchBoxKeyUp);
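The fetch above assumes the Flask service from src/search.py is running on http://localhost:5000 with CORS enabled, and that /search/<query> responds with a JSON array of [url, score] pairs — index.js only reads result[0]. The route decorator itself is not part of this diff, so the following is only a minimal sketch of that contract with an illustrative stub body, not the real query logic:

    from flask import Flask, jsonify
    from flask_cors import CORS
    from urllib.parse import unquote

    app = Flask(__name__)
    CORS(app)  # lets the client page, served from another origin, call this API

    @app.route("/search/<query>")
    def search(query):
        query = unquote(query)  # the client sends a URL-encoded query string
        # Illustrative stub: the real version scores documents via the NGrams tables.
        results = {"https://example.com/a": 3, "https://example.com/b": 1}
        # (url, score) pairs sorted by score descending: the shape index.js expects.
        return jsonify(sorted(results.items(), key=lambda x: x[1], reverse=True))

    if __name__ == "__main__":
        app.run(port=5000)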
diff --git a/src/crawl.py b/src/crawl.py
index 0816e1b..1480b4e 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -2,7 +2,6 @@
import argparse
import requests
-import hashlib
from urllib.parse import urlparse, urljoin
import urllib.robotparser
import os
@@ -10,7 +9,7 @@ from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
-from models import Base, Documents, Document_Tokens
+from models import Base, Documents
from sqlalchemy.orm import sessionmaker
import datetime
import yt_dlp as youtube_dl
@@ -23,7 +22,7 @@ Session = sessionmaker(bind=engine)
excluded_domains = ['amazon.', 'news.ycombinator.',
'facebook.com', 'amzn', 'fb.com']
-excluded_filetypes = [".jpg", ".xml", ".mp4",
+excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
@@ -33,6 +32,7 @@ def get_html(url: str) -> str:
def parse_youtube(video_url: str) -> bool:
+ return  # short-circuit: YouTube subtitle parsing is disabled; the code below never runs
# Language preference for subtitles (set to None for auto-generated)
# Change this to 'en' for English subtitles, or None for auto-generated
subtitle_language = 'en'
@@ -91,6 +91,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
for domain in excluded_domains:
if domain in url:
return
+ if any(ext in url for ext in excluded_filetypes):
+ return
if "youtube.com" in url:
parse_youtube(url)
return
@@ -110,8 +112,6 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
return
soup = BeautifulSoup(html, 'html.parser')
- hash = hashlib.sha256()
- hash.update(url.encode('ascii'))
s = Session()
existing_website = s.query(Documents).filter_by(url=url).first()
@@ -151,13 +151,25 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
parse_html(link, link_html, r, traversed_links)
except:
pass
- elif link not in traversed_links:
- with open('data/links.txt', 'r+') as linksfile:
- while line := linksfile.readline():
- if line.strip() == link.strip():
- found = True
- if not found:
- linksfile.write(f'{link}\n')
+# elif link not in traversed_links:
+# with open('data/links.txt', 'r+') as linksfile:
+# while line := linksfile.readline():
+# if line.strip() == link.strip():
+# found = True
+# if not found:
+# linksfile.write(f'{link}\n')
+
+
+def parse_site_map(base_url):
+ map = BeautifulSoup(requests.get(base_url).content, 'xml')
+ print(map.find_all('loc'))
+ for loc in map.find_all('loc'):
+ if "xml" in loc.contents[0]:
+ parse_site_map(loc.contents[0])
+ else:
+ url = loc.contents[0]
+ html = get_html(url)
+ parse_html(url, html, max_recursion)
if __name__ == "__main__":
@@ -189,11 +201,7 @@ if __name__ == "__main__":
print("Robots prevents crawling url: " + args.url)
exit(0)
if len(rp.site_maps()) > 0:
- map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml')
- for loc in map.find_all('loc'):
- url = loc.contents[0]
- html = get_html(url)
- parse_html(url, html, max_recursion)
+ parse_site_map(rp.site_maps()[0])
else:
html = get_html(args.url)
parse_html(args.url, html, max_recursion)
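The new parse_site_map treats any <loc> entry containing "xml" as a nested sitemap and recurses into it; every other entry is fetched and handed to parse_html. A self-contained sketch of that branching, run against an inline sitemap document instead of a live request (the URLs are placeholders, and BeautifulSoup's 'xml' parser requires lxml):

    from bs4 import BeautifulSoup

    SITEMAP_INDEX = """<?xml version="1.0" encoding="UTF-8"?>
    <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <sitemap><loc>https://example.com/sitemap-posts.xml</loc></sitemap>
      <sitemap><loc>https://example.com/about</loc></sitemap>
    </sitemapindex>"""

    soup = BeautifulSoup(SITEMAP_INDEX, 'xml')
    for loc in soup.find_all('loc'):
        url = loc.contents[0]
        if "xml" in url:
            # In crawl.py this becomes a recursive parse_site_map(url) call.
            print("nested sitemap, would recurse:", url)
        else:
            # Otherwise the URL is fetched and passed to parse_html().
            print("page, would crawl:", url)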
diff --git a/src/search.py b/src/search.py
index d643eb2..fd013bc 100755
--- a/src/search.py
+++ b/src/search.py
@@ -1,14 +1,17 @@
#!/usr/bin/python3
-from sqlalchemy import create_engine, func, and_, or_
+from sqlalchemy import create_engine, func, and_, or_, not_
from config import DATABASE_URI
-from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams
+from models import Base, NGrams, Documents, Document_NGrams
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql.expression import distinct
import time
from flask import Flask
+from flask_cors import CORS
+from flask import send_from_directory
from urllib.parse import unquote
-app = Flask(__name__)
+app = Flask(__name__, static_url_path='/static/')
+CORS(app)
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
@@ -16,7 +19,9 @@ Session = sessionmaker(bind=engine)
def split_query(query):
- result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []}
+ query = query.lower()
+ result = {'ands': [], 'ors': [], 'words': [],
+ 'ngrams': [], 'exclusions': []}
query_words = query.split()
i = 0
while i < len(query_words):
@@ -39,6 +44,11 @@ def split_query(query):
quoted_query[1:len(quoted_query)-2].rstrip())
i += n + 1
continue
+ elif query_words[i][0] == "-":
+ excluded_query = query_words[i][1: len(query_words[i])]
+ result['exclusions'].append(excluded_query)
+ i += 1
+ continue
result['ngrams'].append(query_words[i])
i += 1
return result
@@ -52,19 +62,26 @@ def search(query):
query_words = split_query(unquote(query))
print(query_words)
if len(query_words['ands']) > 0:
+ print('entering ands: ' +
+ str((time.time_ns() - start_time) // 1_000_000) + "ms")
for a in query_words['ands']:
query = session.query(Documents.url, func.count(1)). \
- join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
- join(Tokens, Document_Tokens.token_id == Tokens.id).\
- filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
- group_by(Documents.url).\
- having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+ join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
+ join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
+ filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
+ group_by(Documents.url). \
+ having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
order_by(func.count(1).desc())
+
+# limit(100)
+ print(query)
for result in query.all():
if result[0] in results.keys():
results[result[0]] += result[1]
else:
results[result[0]] = result[1]
+ print('exiting ands: ' +
+ str((time.time_ns() - start_time) // 1_000_000) + "ms")
if len(query_words['ngrams']) > 0:
print('entering ngrams: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
@@ -83,7 +100,7 @@ def search(query):
q = q.filter(or_(*and_conditions))
print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
print(q)
- x = q.all()
+ x = q.limit(100).all()
print('query executed: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
print(x)
@@ -101,30 +118,11 @@ def search(query):
# results[document_ngram.document.url] = 1
print('exiting ngrams: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
- if len(query_words['words']) > 0:
- print('entering words: ' +
- str((time.time_ns() - start_time) // 1_000_000) + "ms")
- q = session.query(Documents.url, func.count(1)) \
- .join(Document_Tokens, Documents.id == Document_Tokens.document_id) \
- .join(Tokens, Document_Tokens.token_id == Tokens.id) \
- .group_by(Documents.url).filter(Tokens.token.in_(query_words['words']))
-
- print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
- print(q)
- x = q.all()
- print('query executed: ' +
- str((time.time_ns() - start_time) // 1_000_000) + "ms")
- for result in x:
- if result[0] in results.keys():
- results[result[0]] += result[1]
- else:
- results[result[0]] = result[1]
- print('exiting words: ' +
- str((time.time_ns() - start_time) // 1_000_000) + "ms")
print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
session.close()
- return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
+ return sorted(results.items(), key=lambda x: x[1], reverse=True)
+
# @app.route("/search/")
# def search(query):
@@ -132,17 +130,17 @@ def search(query):
# session = Session()
# result = {}
# query_words = unquote(query).split()
-# x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
+# x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
# for word in query_words:
# word = word.lower()
-# matching_token = session.query(Tokens).filter_by(token=word).first()
+# matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
#
-# if matching_token is None:
+# if matching_ngram is None:
# continue
-# for document_token in matching_token.document_tokens:
-# if document_token.document.url in result.keys():
-# result[document_token.document.url] += 1
+# for document_ngram in matching_ngram.document_ngrams:
+# if document_ngram.document.url in result.keys():
+# result[document_ngram.document.url] += 1
# else:
-# result[document_token.document.url] = 1
+# result[document_ngram.document.url] = 1
# print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
# return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
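For reference, the new -term syntax in split_query lowercases the whole query and moves any word with a leading hyphen into result['exclusions'], leaving the remaining words as plain n-gram terms. A standalone sketch of just that branch (it deliberately ignores the quoted-phrase and comma AND handling of the full function):

    def split_exclusions(query):
        # Mirrors the new exclusion branch in split_query: '-word' excludes 'word'.
        result = {'ngrams': [], 'exclusions': []}
        for word in query.lower().split():
            if word.startswith('-'):
                result['exclusions'].append(word[1:])
            else:
                result['ngrams'].append(word)
        return result

    print(split_exclusions("Python tutorial -video"))
    # -> {'ngrams': ['python', 'tutorial'], 'exclusions': ['video']}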
diff --git a/todo b/todo
index 2f5f3e5..ddda3bd 100644
--- a/todo
+++ b/todo
@@ -6,6 +6,6 @@
[x] Add clustered index to document_ngrams table model
[x] Add clustered index to document_tokens table model
[ ] Add ddl command to create partition tables
-[ ] Investigate whether or not robots.txt is as aggressive as I'm making ito ut to be
-[ ] Instead of starting from a random page on the site, go to root and find site map and crawl that
+[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
+[x] Instead of starting from a random page on the site, go to root and find site map and crawl that