Clean up site map scanning. Return all results instead of 10

This commit is contained in:
rmgr 2024-06-09 21:53:57 +09:30
parent 2a99a61dbe
commit bbba459480
6 changed files with 140 additions and 57 deletions
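Sitemap indexes often point at further .xml sitemaps from their <loc> entries rather than directly at pages, which is why the crawler now walks them recursively instead of reading only the first file. A minimal standalone sketch of that pattern (the name walk_site_map and the handle_page callback are illustrative, not part of this repo; assumes requests, beautifulsoup4 and an XML parser such as lxml are installed):

import requests
from bs4 import BeautifulSoup

def walk_site_map(sitemap_url, handle_page):
    # Visit every <loc> entry; recurse into nested sitemap files, hand page URLs to the callback.
    xml = requests.get(sitemap_url, timeout=10).content
    soup = BeautifulSoup(xml, 'xml')
    for loc in soup.find_all('loc'):
        target = loc.get_text(strip=True)
        if target.endswith('.xml'):
            walk_site_map(target, handle_page)   # nested sitemap index
        else:
            handle_page(target)                  # ordinary page URL

# usage: walk_site_map('https://example.com/sitemap.xml', print)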

33
client/src/css/styles.css Normal file
View file

@@ -0,0 +1,33 @@
html, body {
    height: 100%;
}
body {
    margin: 0;
}
input {
    padding: 7px;
    font-size: 1.1rem;
}
.search-container {
    display: flex;
    justify-content: center;
    align-items: center;
    text-align: center;
    min-height: 25vh;
}
.flex-container {
    padding: 0;
    margin: 0;
    display: flex;
    align-items: center;
    justify-content: center;
    flex-direction: column;
}
.flex-item {
}
.result {
    display: block;
    max-width: 60vw;
    overflow-x: hidden;
}

16
client/src/index.html Normal file
View file

@@ -0,0 +1,16 @@
<html>
  <head>
    <link rel="stylesheet" href="css/styles.css">
  </head>
  <body>
    <div class="search-container">
      <input type="text" class="searchbox" id="searchbox">
    </div>
    <div class="flex-container">
      <div class="flex-item" id="results">
      </div>
    </div>
    <script src="js/index.js"></script>
  </body>
</html>

28
client/src/js/index.js Normal file
View file

@@ -0,0 +1,28 @@
function debounce(func, timeout = 300){
    let timer;
    return (...args) => {
        clearTimeout(timer);
        timer = setTimeout(() => { func.apply(this, args); }, timeout);
    };
}

async function search(searchBox){
    const response = await fetch(`http://localhost:5000/search/${searchBox.value}`);
    const results = await response.json();
    const resultView = document.getElementById("results");
    resultView.replaceChildren();
    for (let i = 0; i < results.length; i++){
        let result = results[i];
        let resultElement = document.createElement("a");
        resultElement.innerText = result[0];
        resultElement.href = result[0];
        resultElement.className = "flex-item result";
        resultView.appendChild(resultElement);
    }
}
const searchBox = document.getElementById("searchbox");
const searchBoxKeyUp = debounce(() => search(searchBox));
searchBox.addEventListener("keyup", searchBoxKeyUp);

View file

@@ -2,7 +2,6 @@
 import argparse
 import requests
-import hashlib
 from urllib.parse import urlparse, urljoin
 import urllib.robotparser
 import os
@@ -10,7 +9,7 @@ from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Documents, Document_Tokens
+from models import Base, Documents
 from sqlalchemy.orm import sessionmaker
 import datetime
 import yt_dlp as youtube_dl
@@ -23,7 +22,7 @@ Session = sessionmaker(bind=engine)
 excluded_domains = ['amazon.', 'news.ycombinator.',
                     'facebook.com', 'amzn', 'fb.com']
-excluded_filetypes = [".jpg", ".xml", ".mp4",
+excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
                       ".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
@@ -33,6 +32,7 @@ def get_html(url: str) -> str:
 def parse_youtube(video_url: str) -> bool:
+    return
     # Language preference for subtitles (set to None for auto-generated)
     # Change this to 'en' for English subtitles, or None for auto-generated
     subtitle_language = 'en'
@@ -91,6 +91,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
     for domain in excluded_domains:
         if domain in url:
             return
+    if any(ext in url for ext in excluded_filetypes):
+        return
     if "youtube.com" in url:
         parse_youtube(url)
         return
@@ -110,8 +112,6 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
         return
     soup = BeautifulSoup(html, 'html.parser')
-    hash = hashlib.sha256()
-    hash.update(url.encode('ascii'))
     s = Session()
     existing_website = s.query(Documents).filter_by(url=url).first()
@@ -151,13 +151,25 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
                     parse_html(link, link_html, r, traversed_links)
                 except:
                     pass
-            elif link not in traversed_links:
-                with open('data/links.txt', 'r+') as linksfile:
-                    while line := linksfile.readline():
-                        if line.strip() == link.strip():
-                            found = True
-                    if not found:
-                        linksfile.write(f'{link}\n')
+            # elif link not in traversed_links:
+            #     with open('data/links.txt', 'r+') as linksfile:
+            #         while line := linksfile.readline():
+            #             if line.strip() == link.strip():
+            #                 found = True
+            #             if not found:
+            #                 linksfile.write(f'{link}\n')
+def parse_site_map(base_url):
+    map = BeautifulSoup(requests.get(base_url).content, 'xml')
+    print(map.find_all('loc'))
+    for loc in map.find_all('loc'):
+        if "xml" in loc.contents[0]:
+            parse_site_map(loc.contents[0])
+        else:
+            url = loc.contents[0]
+            html = get_html(url)
+            parse_html(url, html, max_recursion)
 if __name__ == "__main__":
@@ -189,11 +201,7 @@ if __name__ == "__main__":
         print("Robots prevents crawling url: " + args.url)
         exit(0)
     if len(rp.site_maps()) > 0:
-        map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml')
-        for loc in map.find_all('loc'):
-            url = loc.contents[0]
-            html = get_html(url)
-            parse_html(url, html, max_recursion)
+        parse_site_map(rp.site_maps()[0])
     else:
         html = get_html(args.url)
         parse_html(args.url, html, max_recursion)
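The crawler discovers the sitemap through urllib.robotparser, which exposes the Sitemap: lines from robots.txt via site_maps(). A small illustrative check of that API (example.com is a placeholder host); note that site_maps() returns None rather than an empty list when robots.txt declares no sitemaps, so callers usually guard for that:

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://example.com/robots.txt')
rp.read()
site_maps = rp.site_maps() or []   # site_maps() yields None when no Sitemap: lines exist
for sitemap_url in site_maps:
    print(sitemap_url)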

View file

@@ -1,14 +1,17 @@
 #!/usr/bin/python3
-from sqlalchemy import create_engine, func, and_, or_
+from sqlalchemy import create_engine, func, and_, or_, not_
 from config import DATABASE_URI
-from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams
+from models import Base, NGrams, Documents, Document_NGrams, NGrams, Document_NGrams
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.sql.expression import distinct
 import time
 from flask import Flask
+from flask_cors import CORS
+from flask import send_from_directory
 from urllib.parse import unquote
-app = Flask(__name__)
+app = Flask(__name__, static_url_path='/static/')
+CORS(app)
 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
@@ -16,7 +19,9 @@ Session = sessionmaker(bind=engine)
 def split_query(query):
-    result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []}
+    query = query.lower()
+    result = {'ands': [], 'ors': [], 'words': [],
+              'ngrams': [], 'exclusions': []}
     query_words = query.split()
     i = 0
     while i < len(query_words):
@@ -39,6 +44,11 @@ def split_query(query):
                 quoted_query[1:len(quoted_query)-2].rstrip())
             i += n + 1
             continue
+        elif query_words[i][0] == "-":
+            excluded_query = query_words[i][1: len(query_words[i])]
+            result['exclusions'].append(excluded_query)
+            i += 1
+            continue
         result['ngrams'].append(query_words[i])
         i += 1
     return result
@@ -52,19 +62,26 @@ def search(query):
     query_words = split_query(unquote(query))
     print(query_words)
     if len(query_words['ands']) > 0:
+        print('entering ands: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
         for a in query_words['ands']:
             query = session.query(Documents.url, func.count(1)). \
-                join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
-                join(Tokens, Document_Tokens.token_id == Tokens.id).\
-                filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
+                join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
+                join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
+                filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
                 group_by(Documents.url). \
-                having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+                having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
                 order_by(func.count(1).desc())
+            # limit(100)
+            print(query)
             for result in query.all():
                 if result[0] in results.keys():
                     results[result[0]] += result[1]
                 else:
                     results[result[0]] = result[1]
+        print('exiting ands: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
     if len(query_words['ngrams']) > 0:
         print('entering ngrams: ' +
               str((time.time_ns() - start_time) // 1_000_000) + "ms")
@@ -83,7 +100,7 @@ def search(query):
             q = q.filter(or_(*and_conditions))
         print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
         print(q)
-        x = q.all()
+        x = q.limit(100).all()
         print('query executed: ' +
              str((time.time_ns() - start_time) // 1_000_000) + "ms")
         print(x)
@@ -101,30 +118,11 @@ def search(query):
         #         results[document_ngram.document.url] = 1
         print('exiting ngrams: ' +
              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-    if len(query_words['words']) > 0:
-        print('entering words: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        q = session.query(Documents.url, func.count(1)) \
-            .join(Document_Tokens, Documents.id == Document_Tokens.document_id) \
-            .join(Tokens, Document_Tokens.token_id == Tokens.id) \
-            .group_by(Documents.url).filter(Tokens.token.in_(query_words['words']))
-        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        print(q)
-        x = q.all()
-        print('query executed: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        for result in x:
-            if result[0] in results.keys():
-                results[result[0]] += result[1]
-            else:
-                results[result[0]] = result[1]
-        print('exiting words: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
     session.close()
-    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
+    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())]
 # @app.route("/search/<query>")
 # def search(query):
@@ -132,17 +130,17 @@ def search(query):
 #     session = Session()
 #     result = {}
 #     query_words = unquote(query).split()
-#     x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
+#     x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
 #     for word in query_words:
 #         word = word.lower()
-#         matching_token = session.query(Tokens).filter_by(token=word).first()
+#         matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
 #
-#         if matching_token is None:
+#         if matching_ngram is None:
 #             continue
-#         for document_token in matching_token.document_tokens:
-#             if document_token.document.url in result.keys():
-#                 result[document_token.document.url] += 1
+#         for document_ngram in matching_ngram.document_ngrams:
+#             if document_ngram.document.url in result.keys():
+#                 result[document_ngram.document.url] += 1
 #             else:
-#                 result[document_token.document.url] = 1
+#                 result[document_ngram.document.url] = 1
 #     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
 #     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
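The new 'exclusions' bucket in split_query collects any term written with a leading '-'. A compact standalone sketch of just that tokenizing step (simplified: it ignores the AND/OR and quoted-phrase handling of the real function, and bucket_terms is an illustrative name, not from this repo):

def bucket_terms(query: str) -> dict:
    # Lowercase the query, then split plain terms from '-' prefixed exclusions.
    buckets = {'ngrams': [], 'exclusions': []}
    for word in query.lower().split():
        if word.startswith('-') and len(word) > 1:
            buckets['exclusions'].append(word[1:])
        else:
            buckets['ngrams'].append(word)
    return buckets

# bucket_terms('rust -crab tutorial')
# -> {'ngrams': ['rust', 'tutorial'], 'exclusions': ['crab']}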

4
todo
View file

@@ -6,6 +6,6 @@
 [x] Add clustered index to document_ngrams table model
 [x] Add clustered index to document_tokens table model
 [ ] Add ddl command to create partition tables
-[ ] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
-[ ] Instead of starting from a random page on the site, go to root and find site map and crawl that
+[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
+[x] Instead of starting from a random page on the site, go to root and find site map and crawl that