Clean up site map scanning. Return all results instead of 10
parent 2a99a61dbe
commit bbba459480

6 changed files with 140 additions and 57 deletions
client/src/css/styles.css (new file, +33)

@@ -0,0 +1,33 @@
+html, body {
+    height: 100%;
+}
+body {
+    margin: 0;
+}
+input {
+    padding: 7px;
+    font-size: 1.1rem;
+}
+.search-container {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    text-align: center;
+    min-height: 25vh;
+}
+
+.flex-container {
+    padding: 0;
+    margin: 0;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    flex-direction: column;
+}
+.flex-item {
+}
+.result {
+    display:block;
+    max-width: 60vw;
+    overflow-x: hidden;
+}
client/src/index.html (new file, +16)

@@ -0,0 +1,16 @@
+<html>
+
+<head>
+    <link rel="stylesheet" href="css/styles.css">
+</head>
+<body>
+    <div class="search-container">
+        <input type="text" class="searchbox" id="searchbox">
+    </div>
+    <div class="flex-container">
+        <div class="flex-item" id="results">
+        </div>
+    </div>
+    <script src="js/index.js"></script>
+</body>
+</html>
client/src/js/index.js (new file, +28)

@@ -0,0 +1,28 @@
+function debounce(func, timeout = 300){
+    let timer;
+    return (...args) => {
+        clearTimeout(timer);
+        timer = setTimeout(() => { func.apply(this, args); }, timeout);
+    };
+}
+async function search(searchBox){
+    const response = await fetch(`http://localhost:5000/search/${searchBox.value}`);
+    const results = await response.json();
+
+    const resultView = document.getElementById("results");
+    resultView.replaceChildren();
+    for (let i = 0; i < results.length; i++){
+        let result = results[i];
+        let resultElement = document.createElement("a");
+        resultElement.innerText = result[0];
+        resultElement.href = result[0];
+        resultElement.className = "flex-item result";
+        resultView.appendChild(resultElement);
+    }
+}
+
+const searchBoxKeyUp = debounce(() => search())
+
+const searchBox = document.getElementById("searchbox");
+
+searchBox.addEventListener("keyup", debounce(() => search(searchBox)))
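index.js reads the URL for each hit out of result[0]. That lines up with the server change further down: the /search/<query> route now returns sorted(results.items(), ...), a list of (url, score) pairs that reaches the browser as a JSON array of two-element arrays. A minimal sketch of that assumed payload shape follows; the URLs and counts are invented for illustration.

# Hypothetical /search/<query> payload as index.js consumes it: a JSON array of
# [url, score] pairs, already sorted by descending score on the server side.
example_response = [
    ["https://example.com/some-page", 7],
    ["https://example.com/another-page", 3],
]

for result in example_response:
    url, score = result  # result[0] is what index.js puts into each <a> element
    print(url, score)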
src/crawl.py (42 changed lines)

@@ -2,7 +2,6 @@
 
 import argparse
 import requests
-import hashlib
 from urllib.parse import urlparse, urljoin
 import urllib.robotparser
 import os
@@ -10,7+9,7 @@ from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Documents, Document_Tokens
+from models import Base, Documents
 from sqlalchemy.orm import sessionmaker
 import datetime
 import yt_dlp as youtube_dl
@@ -23,7 +22,7 @@ Session = sessionmaker(bind=engine)
 excluded_domains = ['amazon.', 'news.ycombinator.',
                     'facebook.com', 'amzn', 'fb.com']
 
-excluded_filetypes = [".jpg", ".xml", ".mp4",
+excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
                       ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
 
 
@@ -33,6 +32,7 @@ def get_html(url: str) -> str:
 
 
 def parse_youtube(video_url: str) -> bool:
+    return
     # Language preference for subtitles (set to None for auto-generated)
     # Change this to 'en' for English subtitles, or None for auto-generated
    subtitle_language = 'en'
@@ -91,6 +91,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
     for domain in excluded_domains:
         if domain in url:
             return
+    if any(ext in url for ext in excluded_filetypes):
+        return
     if "youtube.com" in url:
         parse_youtube(url)
         return
@@ -110,8 +112,6 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
         return
 
     soup = BeautifulSoup(html, 'html.parser')
-    hash = hashlib.sha256()
-    hash.update(url.encode('ascii'))
 
     s = Session()
     existing_website = s.query(Documents).filter_by(url=url).first()
@@ -151,13 +151,25 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
                 parse_html(link, link_html, r, traversed_links)
             except:
                 pass
-        elif link not in traversed_links:
-            with open('data/links.txt', 'r+') as linksfile:
-                while line := linksfile.readline():
-                    if line.strip() == link.strip():
-                        found = True
-                if not found:
-                    linksfile.write(f'{link}\n')
+        # elif link not in traversed_links:
+        #     with open('data/links.txt', 'r+') as linksfile:
+        #         while line := linksfile.readline():
+        #             if line.strip() == link.strip():
+        #                 found = True
+        #         if not found:
+        #             linksfile.write(f'{link}\n')
+
+
+def parse_site_map(base_url):
+    map = BeautifulSoup(requests.get(base_url).content, 'xml')
+    print(map.find_all('loc'))
+    for loc in map.find_all('loc'):
+        if "xml" in loc.contents[0]:
+            parse_site_map(loc.contents[0])
+        else:
+            url = loc.contents[0]
+            html = get_html(url)
+            parse_html(url, html, max_recursion)
 
 
 if __name__ == "__main__":
@@ -189,11 +201,7 @@ if __name__ == "__main__":
         print("Robots prevents crawling url: " + args.url)
         exit(0)
     if len(rp.site_maps()) > 0:
-        map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml')
-        for loc in map.find_all('loc'):
-            url = loc.contents[0]
-            html = get_html(url)
-            parse_html(url, html, max_recursion)
+        parse_site_map(rp.site_maps()[0])
     else:
         html = get_html(args.url)
         parse_html(args.url, html, max_recursion)
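Net effect in src/crawl.py: the inline sitemap loop in __main__ is replaced by the recursive parse_site_map, which follows nested sitemap indexes (any <loc> whose text contains "xml") and feeds every page URL through get_html and parse_html, while URLs matching excluded_filetypes are now skipped early. A rough sketch of the resulting entry-point flow follows; the starting URL, the import line, and max_recursion being importable this way are assumptions for illustration, not part of the diff.

import urllib.robotparser

# Assumed wiring: these names exist in src/crawl.py per the hunks above, but
# importing them like this is an illustration, not how the script is invoked.
from crawl import parse_site_map, parse_html, get_html, max_recursion

start_url = "https://example.com/"  # hypothetical; the real script takes args.url

rp = urllib.robotparser.RobotFileParser()
rp.set_url(start_url.rstrip("/") + "/robots.txt")
rp.read()

site_maps = rp.site_maps() or []  # site_maps() returns None when robots.txt lists none
if len(site_maps) > 0:
    # Recursive walk: sitemap index -> child sitemaps -> page URLs -> parse_html
    parse_site_map(site_maps[0])
else:
    html = get_html(start_url)
    parse_html(start_url, html, max_recursion)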
(Flask search service; file path not shown in this view)

@@ -1,14 +1,17 @@
 #!/usr/bin/python3
-from sqlalchemy import create_engine, func, and_, or_
+from sqlalchemy import create_engine, func, and_, or_, not_
 from config import DATABASE_URI
-from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams
+from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.sql.expression import distinct
 import time
 from flask import Flask
+from flask_cors import CORS
+from flask import send_from_directory
 from urllib.parse import unquote
 
-app = Flask(__name__)
+app = Flask(__name__, static_url_path='/static/')
+CORS(app)
 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
@@ -16,7 +19,9 @@ Session = sessionmaker(bind=engine)
 
 
 def split_query(query):
-    result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []}
+    query = query.lower()
+    result = {'ands': [], 'ors': [], 'words': [],
+              'ngrams': [], 'exclusions': []}
     query_words = query.split()
     i = 0
     while i < len(query_words):
@@ -39,6 +44,11 @@ def split_query(query):
                 quoted_query[1:len(quoted_query)-2].rstrip())
             i += n + 1
             continue
+        elif query_words[i][0] == "-":
+            excluded_query = query_words[i][1: len(query_words[i])]
+            result['exclusions'].append(excluded_query)
+            i += 1
+            continue
         result['ngrams'].append(query_words[i])
         i += 1
     return result
@@ -52,19 +62,26 @@ def search(query):
     query_words = split_query(unquote(query))
     print(query_words)
     if len(query_words['ands']) > 0:
+        print('entering ands: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
         for a in query_words['ands']:
             query = session.query(Documents.url, func.count(1)). \
-                join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
-                join(Tokens, Document_Tokens.token_id == Tokens.id).\
-                filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
-                group_by(Documents.url).\
-                having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+                join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
+                join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
+                filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
+                group_by(Documents.url). \
+                having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
                 order_by(func.count(1).desc())
+
+            # limit(100)
+            print(query)
             for result in query.all():
                 if result[0] in results.keys():
                     results[result[0]] += result[1]
                 else:
                     results[result[0]] = result[1]
+        print('exiting ands: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
     if len(query_words['ngrams']) > 0:
         print('entering ngrams: ' +
               str((time.time_ns() - start_time) // 1_000_000) + "ms")
@@ -83,7 +100,7 @@ def search(query):
         q = q.filter(or_(*and_conditions))
         print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
         print(q)
-        x = q.all()
+        x = q.limit(100).all()
         print('query executed: ' +
               str((time.time_ns() - start_time) // 1_000_000) + "ms")
         print(x)
@@ -101,30 +118,11 @@ def search(query):
         # results[document_ngram.document.url] = 1
         print('exiting ngrams: ' +
               str((time.time_ns() - start_time) // 1_000_000) + "ms")
-    if len(query_words['words']) > 0:
-        print('entering words: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        q = session.query(Documents.url, func.count(1)) \
-            .join(Document_Tokens, Documents.id == Document_Tokens.document_id) \
-            .join(Tokens, Document_Tokens.token_id == Tokens.id) \
-            .group_by(Documents.url).filter(Tokens.token.in_(query_words['words']))
-
-        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        print(q)
-        x = q.all()
-        print('query executed: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        for result in x:
-            if result[0] in results.keys():
-                results[result[0]] += result[1]
-            else:
-                results[result[0]] = result[1]
-        print('exiting words: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
+
     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
     session.close()
-    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
+    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())]
 
 
 # @app.route("/search/<query>")
 # def search(query):
@@ -132,17 +130,17 @@ def search(query):
 #     session = Session()
 #     result = {}
 #     query_words = unquote(query).split()
-#     x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
+#     x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
 #     for word in query_words:
 #         word = word.lower()
-#         matching_token = session.query(Tokens).filter_by(token=word).first()
+#         matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
 #
-#         if matching_token is None:
+#         if matching_ngram is None:
 #             continue
-#         for document_token in matching_token.document_tokens:
-#             if document_token.document.url in result.keys():
-#                 result[document_token.document.url] += 1
+#         for document_ngram in matching_ngram.document_ngrams:
+#             if document_ngram.document.url in result.keys():
+#                 result[document_ngram.document.url] += 1
 #             else:
-#                 result[document_token.document.url] = 1
+#                 result[document_ngram.document.url] = 1
 #     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
 #     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
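One behavioural note on the split_query changes above: the query is now lower-cased up front, and terms prefixed with "-" land in a new 'exclusions' bucket instead of the n-gram list. A hedged illustration of the expected shape, based only on the branches visible in this diff (the quoted-phrase and AND paths are not fully shown here, so treat the exact output as an assumption):

# Illustration only: assumes plain words fall through to 'ngrams' and that
# "-java" takes the new exclusion branch, as the hunk above suggests.
print(split_query("Python tutorial -java"))
# expected (assumed) output:
# {'ands': [], 'ors': [], 'words': [], 'ngrams': ['python', 'tutorial'],
#  'exclusions': ['java']}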
todo (4 changed lines)

@@ -6,6 +6,6 @@
 [x] Add clustered index to document_ngrams table model
 [x] Add clustered index to document_tokens table model
 [ ] Add ddl command to create partition tables
-[ ] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
-[ ] Instead of starting from a random page on the site, go to root and find site map and crawl that
+[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
+[x] Instead of starting from a random page on the site, go to root and find site map and crawl that
 