Clean up site map scanning. Return all results instead of 10

rmgr 2024-06-09 21:53:57 +09:30
parent 2a99a61dbe
commit bbba459480
6 changed files with 140 additions and 57 deletions
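For reference, a minimal sketch of the two behaviours this commit introduces (illustrative only; parse_site_map further down in this diff is the actual implementation): nested sitemap indexes are followed recursively instead of only the top-level sitemap being scanned, and the search endpoint no longer truncates its ranked results to ten entries.

    import requests
    from bs4 import BeautifulSoup

    def collect_sitemap_urls(sitemap_url):
        # A <loc> entry pointing at another .xml document is a nested
        # sitemap index; recurse into it, otherwise yield the page URL.
        soup = BeautifulSoup(requests.get(sitemap_url).content, 'xml')
        for loc in soup.find_all('loc'):
            target = loc.contents[0]
            if 'xml' in target:
                yield from collect_sitemap_urls(target)
            else:
                yield target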

client/src/css/styles.css Normal file

@@ -0,0 +1,33 @@
html, body {
    height: 100%;
}
body {
    margin: 0;
}
input {
    padding: 7px;
    font-size: 1.1rem;
}
.search-container {
    display: flex;
    justify-content: center;
    align-items: center;
    text-align: center;
    min-height: 25vh;
}
.flex-container {
    padding: 0;
    margin: 0;
    display: flex;
    align-items: center;
    justify-content: center;
    flex-direction: column;
}
.flex-item {
}
.result {
    display: block;
    max-width: 60vw;
    overflow-x: hidden;
}

client/src/index.html Normal file

@@ -0,0 +1,16 @@
<html>
  <head>
    <link rel="stylesheet" href="css/styles.css">
  </head>
  <body>
    <div class="search-container">
      <input type="text" class="searchbox" id="searchbox">
    </div>
    <div class="flex-container">
      <div class="flex-item" id="results">
      </div>
    </div>
    <script src="js/index.js"></script>
  </body>
</html>

client/src/js/index.js Normal file

@@ -0,0 +1,28 @@
// Return a wrapper that delays calls to func until `timeout` ms have
// passed without another invocation.
function debounce(func, timeout = 300){
    let timer;
    return (...args) => {
        clearTimeout(timer);
        timer = setTimeout(() => { func.apply(this, args); }, timeout);
    };
}
async function search(searchBox){
    // The server returns a JSON array of [url, score] pairs.
    const response = await fetch(`http://localhost:5000/search/${searchBox.value}`);
    const results = await response.json();
    const resultView = document.getElementById("results");
    resultView.replaceChildren();
    for (let i = 0; i < results.length; i++){
        let result = results[i];
        let resultElement = document.createElement("a");
        resultElement.innerText = result[0];
        resultElement.href = result[0];
        resultElement.className = "flex-item result";
        resultView.appendChild(resultElement);
    }
}
const searchBox = document.getElementById("searchbox");
// Debounce keystrokes so a request only fires once typing pauses.
const searchBoxKeyUp = debounce(() => search(searchBox));
searchBox.addEventListener("keyup", searchBoxKeyUp);


@@ -2,7 +2,6 @@
import argparse
import requests
import hashlib
from urllib.parse import urlparse, urljoin
import urllib.robotparser
import os
@@ -10,7 +9,7 @@ from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Documents, Document_Tokens
from models import Base, Documents
from sqlalchemy.orm import sessionmaker
import datetime
import yt_dlp as youtube_dl
@@ -23,7 +22,7 @@ Session = sessionmaker(bind=engine)
excluded_domains = ['amazon.', 'news.ycombinator.',
'facebook.com', 'amzn', 'fb.com']
excluded_filetypes = [".jpg", ".xml", ".mp4",
excluded_filetypes = [".jpg", ".xml", ".mp4", ".jpeg", ".db",
".mp3", ".png", ".tiff", ".gif", ".webp", ".pdf"]
@@ -33,6 +32,7 @@ def get_html(url: str) -> str:
def parse_youtube(video_url: str) -> bool:
return
# Language preference for subtitles (set to None for auto-generated)
# Change this to 'en' for English subtitles, or None for auto-generated
subtitle_language = 'en'
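The diff only shows the subtitle_language setting; how it reaches yt_dlp is not part of this hunk. As a hedged sketch of the documented yt-dlp options matching the comment above (the option dict and example URL are assumptions, not this project's code):

    import yt_dlp as youtube_dl

    subtitle_language = 'en'  # or None to prefer auto-generated captions
    ydl_opts = {
        'skip_download': True,                           # subtitles/metadata only
        'writesubtitles': True,                          # uploader-provided subtitles
        'writeautomaticsub': subtitle_language is None,  # fall back to auto-generated
        'subtitleslangs': [subtitle_language or 'en'],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download(['https://www.youtube.com/watch?v=EXAMPLE'])  # hypothetical URL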
@@ -91,6 +91,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
for domain in excluded_domains:
if domain in url:
return
if any(ext in url for ext in excluded_filetypes):
return
if "youtube.com" in url:
parse_youtube(url)
return
@@ -110,8 +112,6 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
return
soup = BeautifulSoup(html, 'html.parser')
hash = hashlib.sha256()
hash.update(url.encode('ascii'))
s = Session()
existing_website = s.query(Documents).filter_by(url=url).first()
@@ -151,13 +151,25 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robo
parse_html(link, link_html, r, traversed_links)
except:
pass
elif link not in traversed_links:
with open('data/links.txt', 'r+') as linksfile:
while line := linksfile.readline():
if line.strip() == link.strip():
found = True
if not found:
linksfile.write(f'{link}\n')
# elif link not in traversed_links:
# with open('data/links.txt', 'r+') as linksfile:
# while line := linksfile.readline():
# if line.strip() == link.strip():
# found = True
# if not found:
# linksfile.write(f'{link}\n')
def parse_site_map(base_url):
    # Sitemap indexes can nest: a <loc> pointing at another .xml document
    # is a child sitemap, so recurse; otherwise crawl the page URL.
    map = BeautifulSoup(requests.get(base_url).content, 'xml')
    print(map.find_all('loc'))
    for loc in map.find_all('loc'):
        if "xml" in loc.contents[0]:
            parse_site_map(loc.contents[0])
        else:
            url = loc.contents[0]
            html = get_html(url)
            parse_html(url, html, max_recursion)
if __name__ == "__main__":
@@ -189,11 +201,7 @@
print("Robots prevents crawling url: " + args.url)
exit(0)
if len(rp.site_maps()) > 0:
map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml')
for loc in map.find_all('loc'):
url = loc.contents[0]
html = get_html(url)
parse_html(url, html, max_recursion)
parse_site_map(rp.site_maps()[0])
else:
html = get_html(args.url)
parse_html(args.url, html, max_recursion)
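For context, the sitemap discovery above relies on the standard library's RobotFileParser.site_maps() (Python 3.8+). A minimal sketch, with an illustrative URL; note that site_maps() returns None rather than an empty list when robots.txt declares no Sitemap entries, which the truthiness check below also covers:

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url('https://example.com/robots.txt')  # illustrative URL
    rp.read()
    maps = rp.site_maps()  # list of Sitemap: URLs, or None
    if maps:
        parse_site_map(maps[0])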


@@ -1,14 +1,17 @@
#!/usr/bin/python3
from sqlalchemy import create_engine, func, and_, or_
from sqlalchemy import create_engine, func, and_, or_, not_
from config import DATABASE_URI
from models import Base, Tokens, Documents, Document_Tokens, NGrams, Document_NGrams
from models import Base, NGrams, Documents, Document_NGrams
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql.expression import distinct
import time
from flask import Flask
from flask_cors import CORS
from flask import send_from_directory
from urllib.parse import unquote
app = Flask(__name__)
app = Flask(__name__, static_url_path='/static/')
CORS(app)
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
@@ -16,7 +19,9 @@ Session = sessionmaker(bind=engine)
def split_query(query):
result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []}
query = query.lower()
result = {'ands': [], 'ors': [], 'words': [],
'ngrams': [], 'exclusions': []}
query_words = query.split()
i = 0
while i < len(query_words):
@@ -39,6 +44,11 @@ def split_query(query):
quoted_query[1:len(quoted_query)-2].rstrip())
i += n + 1
continue
elif query_words[i][0] == "-":
excluded_query = query_words[i][1: len(query_words[i])]
result['exclusions'].append(excluded_query)
i += 1
continue
result['ngrams'].append(query_words[i])
i += 1
return result
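To illustrate the grammar handled above, a hypothetical call (the '-term' branch added in this hunk strips the leading dash and records the word under 'exclusions'; unquoted plain words fall through to 'ngrams'):

    split_query('python -snake tutorial')
    # -> {'ands': [], 'ors': [], 'words': [],
    #     'ngrams': ['python', 'tutorial'], 'exclusions': ['snake']}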
@@ -52,19 +62,26 @@ def search(query):
query_words = split_query(unquote(query))
print(query_words)
if len(query_words['ands']) > 0:
print('entering ands: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
for a in query_words['ands']:
query = session.query(Documents.url, func.count(1)). \
join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
join(Tokens, Document_Tokens.token_id == Tokens.id).\
filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
join(Document_NGrams, Documents.id == Document_NGrams.document_id). \
join(NGrams, Document_NGrams.ngram_id == NGrams.id). \
filter(NGrams.gram.in_([a.split(',')[0], a.split(',')[1]])).\
group_by(Documents.url). \
having(func.count(distinct(Document_Tokens.token_id)) == 2).\
having(func.count(distinct(Document_NGrams.ngram_id)) == 2). \
order_by(func.count(1).desc())
# limit(100)
print(query)
for result in query.all():
if result[0] in results.keys():
results[result[0]] += result[1]
else:
results[result[0]] = result[1]
print('exiting ands: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
if len(query_words['ngrams']) > 0:
print('entering ngrams: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
@@ -83,7 +100,7 @@ def search(query):
q = q.filter(or_(*and_conditions))
print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
print(q)
x = q.all()
x = q.limit(100).all()
print('query executed: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
print(x)
@@ -101,30 +118,11 @@ def search(query):
# results[document_ngram.document.url] = 1
print('exiting ngrams: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
if len(query_words['words']) > 0:
print('entering words: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
q = session.query(Documents.url, func.count(1)) \
.join(Document_Tokens, Documents.id == Document_Tokens.document_id) \
.join(Tokens, Document_Tokens.token_id == Tokens.id) \
.group_by(Documents.url).filter(Tokens.token.in_(query_words['words']))
print('query built: ' + str((time.time_ns() - start_time) // 1_000_000) + "ms")
print(q)
x = q.all()
print('query executed: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
for result in x:
if result[0] in results.keys():
results[result[0]] += result[1]
else:
results[result[0]] = result[1]
print('exiting words: ' +
str((time.time_ns() - start_time) // 1_000_000) + "ms")
print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
session.close()
return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
return sorted(results.items(), key=lambda x: x[1], reverse=True)[:len(results.items())]
# @app.route("/search/<query>")
# def search(query):
@@ -132,17 +130,17 @@ def search(query):
# session = Session()
# result = {}
# query_words = unquote(query).split()
# x= session.query(Tokens).filter(Tokens.token.in_(query_words)).take(1000)
# x= session.query(NGrams).filter(NGrams.ngram.in_(query_words)).take(1000)
# for word in query_words:
# word = word.lower()
# matching_token = session.query(Tokens).filter_by(token=word).first()
# matching_ngram = session.query(NGrams).filter_by(ngram=word).first()
#
# if matching_token is None:
# if matching_ngram is None:
# continue
# for document_token in matching_token.document_tokens:
# if document_token.document.url in result.keys():
# result[document_token.document.url] += 1
# for document_ngram in matching_ngram.document_ngrams:
# if document_ngram.document.url in result.keys():
# result[document_ngram.document.url] += 1
# else:
# result[document_token.document.url] = 1
# result[document_ngram.document.url] = 1
# print(str((time.time_ns() - start_time) // 1_000_000) + "ms")
# return sorted(result.items(), key=lambda x: x[1], reverse=True)[:10]
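A minimal smoke test for the endpoint, assuming the app above is running locally on port 5000 (the URL the client-side fetch uses) and that Flask serialises the returned list of (url, score) pairs as a JSON array of arrays:

    import requests

    results = requests.get('http://localhost:5000/search/example').json()
    for url, score in results:
        print(score, url)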

todo

@@ -6,6 +6,6 @@
[x] Add clustered index to document_ngrams table model
[x] Add clustered index to document_tokens table model
[ ] Add ddl command to create partition tables
[ ] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
[ ] Instead of starting from a random page on the site, go to root and find site map and crawl that
[x] Investigate whether or not robots.txt is as aggressive as I'm making it out to be
[x] Instead of starting from a random page on the site, go to root and find site map and crawl that