From bbba459480971d678880584efd7871d7acc5328d Mon Sep 17 00:00:00 2001
From: rmgr
Date: Sun, 9 Jun 2024 21:53:57 +0930
Subject: [PATCH] Clean up site map scanning. Return all results instead of 10

---
 client/src/css/styles.css | 33 +++++++++++++++++
 client/src/index.html     | 16 +++++++++
 client/src/js/index.js    | 28 +++++++++++++++
 src/crawl.py              | 42 +++++++++++++---------
 src/search.py             | 74 +++++++++++++++++++--------------------
 todo                      |  4 +--
 6 files changed, 140 insertions(+), 57 deletions(-)
 create mode 100644 client/src/css/styles.css
 create mode 100644 client/src/index.html
 create mode 100644 client/src/js/index.js

diff --git a/client/src/css/styles.css b/client/src/css/styles.css
new file mode 100644
index 0000000..37323ab
--- /dev/null
+++ b/client/src/css/styles.css
@@ -0,0 +1,33 @@
+html, body {
+    height: 100%;
+}
+body {
+    margin: 0;
+}
+input {
+    padding: 7px;
+    font-size: 1.1rem;
+}
+.search-container {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    text-align: center;
+    min-height: 25vh;
+}
+
+.flex-container {
+    padding: 0;
+    margin: 0;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    flex-direction: column;
+}
+.flex-item {
+}
+.result {
+    display:block;
+    max-width: 60vw;
+    overflow-x: hidden;
+}
diff --git a/client/src/index.html b/client/src/index.html
new file mode 100644
index 0000000..a748d6c
--- /dev/null
+++ b/client/src/index.html
@@ -0,0 +1,16 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>Search</title>
+    <link rel="stylesheet" href="css/styles.css">
+</head>
+<body>
+    <div class="search-container">
+        <input type="text" id="searchbox">
+    </div>
+    <div class="flex-container" id="results">
+    </div>
+    <script src="js/index.js"></script>
+</body>
+</html>
diff --git a/client/src/js/index.js b/client/src/js/index.js
new file mode 100644
index 0000000..09b0bb2
--- /dev/null
+++ b/client/src/js/index.js
@@ -0,0 +1,28 @@
+function debounce(func, timeout = 300){
+    let timer;
+    return (...args) => {
+        clearTimeout(timer);
+        timer = setTimeout(() => { func.apply(this, args); }, timeout);
+    };
+}
+async function search(searchBox){
+    const response = await fetch(`http://localhost:5000/search/${searchBox.value}`);
+    const results = await response.json();
+
+    const resultView = document.getElementById("results");
+    resultView.replaceChildren();
+    for (let i = 0; i < results.length; i++){
+        let result = results[i];
+        let resultElement = document.createElement("a");
+        resultElement.innerText = result[0];
+        resultElement.href = result[0];
+        resultElement.className = "flex-item result";
+        resultView.appendChild(resultElement);
+    }
+}
+
+const searchBoxKeyUp = debounce(() => search())
+
+const searchBox = document.getElementById("searchbox");
+
+searchBox.addEventListener("keyup", debounce(() => search(searchBox)))
diff --git a/src/crawl.py b/src/crawl.py
index 0816e1b..1480b4e 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -2,7 +2,6 @@
 import argparse
 import requests
-import hashlib
 from urllib.parse import urlparse, urljoin
 import urllib.robotparser
 import os
@@ -10,7 +9,7 @@ from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Documents, Document_Tokens
+from models import Base, Documents
 from sqlalchemy.orm import sessionmaker
 import datetime
 import yt_dlp as youtube_dl
@@ -23,7 +22,7 @@ Session = sessionmaker(bind=engine)
 excluded_domains = ['amazon.', 'news.ycombinator.',
                     'facebook.com', 'amzn', 'fb.com']
-excluded_filetypes = [".jpg", ".xml", ".mp4",
+excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db", ".mp3",
                       ".png", ".tiff", ".gif", ".webp", ".pdf"]
@@ -33,6 +32,7 @@ def get_html(url: str) -> str:
 
 
 def parse_youtube(video_url: str) -> bool:
+    return
     # Language preference for subtitles (set to None for auto-generated)
     # Change this to 'en' for English subtitles, or None for auto-generated
     subtitle_language = 'en'
@@ -91,6 +91,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
     for domain in excluded_domains:
         if domain in url:
             return
+    if any(ext in url for ext in excluded_filetypes):
+        return
     if "youtube.com" in url:
         parse_youtube(url)
         return
@@ -110,8 +112,6 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
             return
 
     soup = BeautifulSoup(html, 'html.parser')
-    hash = hashlib.sha256()
-    hash.update(url.encode('ascii'))
     s = Session()
 
     existing_website = s.query(Documents).filter_by(url=url).first()
@@ -151,13 +151,25 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
                         parse_html(link, link_html, r, traversed_links)
             except:
                 pass
-        elif link not in traversed_links:
-            with open('data/links.txt', 'r+') as linksfile:
-                while line := linksfile.readline():
-                    if line.strip() == link.strip():
-                        found = True
-                if not found:
-                    linksfile.write(f'{link}\n')
+#        elif link not in traversed_links:
+#            with open('data/links.txt', 'r+') as linksfile:
+#                while line := linksfile.readline():
+#                    if line.strip() == link.strip():
+#                        found = True
+#                if not found:
+#                    linksfile.write(f'{link}\n')
+
+
+def parse_site_map(base_url):
+    map = BeautifulSoup(requests.get(base_url).content, 'xml')
+    print(map.find_all('loc'))
+    for loc in map.find_all('loc'):
+        if "xml" in loc.contents[0]:
+            parse_site_map(loc.contents[0])
+        else:
+            url = loc.contents[0]
+            html = get_html(url)
+            parse_html(url, html, max_recursion)
 
 
 if __name__ == "__main__":
@@ -189,11 +201,7 @@
             print("Robots prevents crawling url: " + args.url)
             exit(0)
     if len(rp.site_maps()) > 0:
-        map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml')
-        for loc in map.find_all('loc'):
-            url = loc.contents[0]
-            html = get_html(url)
-            parse_html(url, html, max_recursion)
+        parse_site_map(rp.site_maps()[0])
     else:
         html = get_html(args.url)
         parse_html(args.url, html, max_recursion)
diff --git a/src/search.py b/src/search.py
index d643eb2..fd013bc 100755
--- a/src/search.py
+++ b/src/search.py
@@ -1,14 +1,17 @@
 #!/usr/bin/python3
-from sqlalchemy import create_engine, func, and_, or_
+from sqlalchemy import create_engine, func, and_, or_, not_
 from config import DATABASE_URI
-from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams
+from models import Base, NGrams, Documents, Document_NGrams
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.sql.expression import distinct
 import time
 from flask import Flask
+from flask_cors import CORS
+from flask import send_from_directory
 from urllib.parse import unquote
 
-app = Flask(__name__)
+app = Flask(__name__, static_url_path='/static/')
+CORS(app)
 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
@@ -16,7 +19,9 @@ Session = sessionmaker(bind=engine)
 
 
 def split_query(query):
-    result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []}
+    query = query.lower()
+    result = {'ands': [], 'ors': [], 'words': [],
+              'ngrams': [], 'exclusions': []}
     query_words = query.split()
     i = 0
     while i < len(query_words):
@@ -39,6 +44,11 @@
                 quoted_query[1:len(quoted_query)-2].rstrip())
             i += n + 1
             continue
+        elif query_words[i][0] == "-":
+            excluded_query = query_words[i][1: len(query_words[i])]
+            result['exclusions'].append(excluded_query)
+            i += 1
+            continue
         result['ngrams'].append(query_words[i])
         i += 1
     return result
@@ -52,19 +62,26 @@
     query_words = split_query(unquote(query))
     print(query_words)
     if len(query_words['ands']) > 0:
+        print('entering ands: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
         for a in query_words['ands']:
             query = session.query(Documents.url, func.count(1)). \
-                join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
-                join(Tokens, Document_Tokens.token_id == Tokens.id).\
-                filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
-                group_by(Documents.url).\
-                having(func.count(distinct(Document_Tokens.token_id)) == 2).\
+                join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
+                join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
+                filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
+                group_by(Documents.url). \
+                having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
                 order_by(func.count(1).desc())
+
+#                limit(100)
+            print(query)
             for result in query.all():
                 if result[0] in results.keys():
                     results[result[0]] += result[1]
                 else:
                     results[result[0]] = result[1]
+        print('exiting ands: ' +
+              str((time.time_ns() - start_time) // 1_000_000) + "ms")
     if len(query_words['ngrams']) > 0:
         print('entering ngrams: ' +
               str((time.time_ns() - start_time) // 1_000_000) + "ms")
@@ -83,7 +100,7 @@
             q = q.filter(or_(*and_conditions))
         print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
         print(q)
-        x = q.all()
+        x = q.limit(100).all()
         print('query executed: ' +
               str((time.time_ns() - start_time) // 1_000_000) + "ms")
         print(x)
@@ -101,30 +118,11 @@
 #                    results[document_ngram.document.url] = 1
         print('exiting ngrams: ' +
               str((time.time_ns() - start_time) // 1_000_000) + "ms")
-    if len(query_words['words']) > 0:
-        print('entering words: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        q = session.query(Documents.url, func.count(1)) \
-            .join(Document_Tokens, Documents.id == Document_Tokens.document_id) \
-            .join(Tokens, Document_Tokens.token_id == Tokens.id) \
-            .group_by(Documents.url).filter(Tokens.token.in_(query_words['words']))
-
-        print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        print(q)
-        x = q.all()
-        print('query executed: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
-        for result in x:
-            if result[0] in results.keys():
-                results[result[0]] += result[1]
-            else:
-                results[result[0]] = result[1]
-        print('exiting words: ' +
-              str((time.time_ns() - start_time) // 1_000_000) + "ms")
 
     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
     session.close()
-    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
+    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())]
+
 
 # @app.route("/search/<query>")
 # def search(query):
@@ -132,17 +130,17 @@
 #     session = Session()
 #     result = {}
 #     query_words = unquote(query).split()
-#     x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
+#     x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
 #     for word in query_words:
 #         word = word.lower()
-#         matching_token = session.query(Tokens).filter_by(token=word).first()
+#         matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
 #
-#         if matching_token is None:
+#         if matching_ngram is None:
 #             continue
-#         for document_token in matching_token.document_tokens:
-#             if document_token.document.url in result.keys():
-#                 result[document_token.document.url] += 1
+#         for document_ngram in matching_ngram.document_ngrams:
+#             if document_ngram.document.url in result.keys():
+#                 result[document_ngram.document.url] += 1
 #             else:
-#                 result[document_token.document.url] = 1
+#                 result[document_ngram.document.url] = 1
 #     print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
 #     return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
diff --git a/todo b/todo
index 2f5f3e5..ddda3bd 100644
--- a/todo
+++ b/todo
@@ -6,6 +6,6 @@
 [x] Add clustered index to document_ngrams table model
 [x] Add clustered index to document_tokens table model
 [ ] Add ddl command to create partition tables
-[ ] Investigate whether or not robots.txt is as aggressive as I'm making ito ut to be
-[ ] Instead of starting from a random page on the site, go to root and find site map and crawl that
+[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
+[x] Instead of starting from a random page on the site, go to root and find site map and crawl that