Initial commit

rmgr 2023-11-28 20:51:54 +10:30
commit f36ab2fbfb
5 changed files with 144 additions and 0 deletions

requirements.txt Normal file (+21)

@@ -0,0 +1,21 @@
beautifulsoup4==4.12.2
blinker==1.7.0
bs4==0.0.1
certifi==2023.11.17
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
cssselect==1.2.0
flask==3.0.0
idna==3.6
importlib-metadata==6.8.0
itsdangerous==2.1.2
Jinja2==3.1.2
lxml==4.9.3
MarkupSafe==2.1.3
readability-lxml==0.8.1
requests==2.31.0
soupsieve==2.5
urllib3==2.1.0
werkzeug==3.0.1
zipp==3.17.0
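
A small optional sketch (not part of the commit) that compares the pins above against what is actually installed, using only the standard library:

from importlib.metadata import PackageNotFoundError, version

for line in open("requirements.txt"):
    name, _, pinned = line.strip().partition("==")
    if not name:
        continue
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not installed"
    print(f"{name}: pinned {pinned}, installed {installed}")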

Binary file not shown.

src/crawl.py Executable file (+63)

@@ -0,0 +1,63 @@
#!/usr/bin/python3
import argparse
import hashlib
import os

import requests
from bs4 import BeautifulSoup


def get_html(url: str) -> str:
    # Fetch the page and return its decoded body.
    response = requests.get(url)
    return response.text


def parse_html(url: str, html: str) -> None:
    soup = BeautifulSoup(html, 'html.parser')

    # Name the output files after the SHA-256 hash of the URL so every page
    # gets a stable, filesystem-safe filename.
    digest = hashlib.sha256(url.encode('utf-8')).hexdigest()
    filename_text = digest + '.txt'
    filename_html = digest + '.html'

    # Store the plain text and the prettified HTML, each prefixed with the URL.
    with open(f'data/content/{filename_text}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.get_text())
    with open(f'data/content/{filename_html}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.prettify())

    # Make sure the links file exists before it is opened for reading below.
    open('data/links.txt', 'a').close()

    # Collect outgoing links, resolving relative ones against the current URL,
    # and append each link to data/links.txt if it is not already recorded.
    for link in soup.find_all("a"):
        if not link.has_attr("href"):
            continue
        href = link["href"]
        if "http" not in href:
            href = url + "/" + href
        found = False
        with open('data/links.txt', 'r+') as linksfile:
            while line := linksfile.readline():
                if line.strip() == href.strip():
                    found = True
            if not found:
                linksfile.write(f'{href}\n')


if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)

    # check inputs
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true",
                        help="Also crawl the links recorded in data/links.txt")
    args = parser.parse_args()

    html = get_html(args.url)
    parse_html(args.url, html)

    if args.followlinks:
        with open('data/links.txt') as linksfile:
            while line := linksfile.readline():
                line = line.strip()
                if "http" in line:
                    try:
                        parse_html(line, get_html(line))
                    except Exception:
                        # Skip pages that fail to download or parse.
                        pass
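
A quick sketch (not part of the commit) of how the crawler names its output files: each URL is hashed with SHA-256 and the digest becomes the filename under data/content/. The URL below is hypothetical.

import hashlib

url = "https://example.com"
digest = hashlib.sha256(url.encode("utf-8")).hexdigest()
print(f"data/content/{digest}.txt")   # plain-text copy written by parse_html
print(f"data/content/{digest}.html")  # prettified HTML copy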

src/index.py Executable file (+36)

@@ -0,0 +1,36 @@
#!/usr/bin/python3
from pathlib import Path
import argparse
import json

# Very small stop-word list; these words are never added to the index.
ignored_words = ['a', 'the', 'is']


def build_index():
    # Build an inverted index: every word found in the crawled page text
    # maps to the list of URLs whose pages contained it.
    with open("data/index.json", "w") as index:
        dictionary = {}
        # The first line of each content file is the URL; the rest is the page text.
        pathlist = Path('data/content').rglob('*.txt')
        for path in pathlist:
            with open(path) as content_file:
                url = content_file.readline().strip()
                content = content_file.read()
                # Split the page text on whitespace and add each word to the index.
                for word in content.split():
                    word = word.lower()
                    if word in ignored_words:
                        continue
                    if word not in dictionary:
                        dictionary[word] = []
                    if url not in dictionary[word]:
                        dictionary[word].append(url)
        index.write(json.dumps(dictionary))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', "--rebuild", action="store_true",
                        help="Blow away the index and rebuild it from data/content")
    args = parser.parse_args()
    if args.rebuild:
        build_index()
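
A small sketch (not part of the commit) of reading the index back: data/index.json maps each lower-cased word to the list of URLs whose page text contained it. The query term below is hypothetical.

import json

with open("data/index.json") as f:
    index = json.load(f)
print(index.get("flask", []))  # URLs whose pages contained the word "flask"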

src/search.py Executable file (+24)

@@ -0,0 +1,24 @@
#!/usr/bin/python3
import json
from urllib.parse import unquote

from flask import Flask

app = Flask(__name__)


@app.route("/search/<query>")
def search(query):
    # Load the inverted index built by src/index.py.
    with open('data/index.json', 'r') as index_json:
        index = json.load(index_json)
    # Look up each (lower-cased) word of the query and collect its URL list.
    query = unquote(query)
    result = []
    for q in query.split():
        q = q.lower()
        if q in index:
            result.append(index[q])
    print(result)  # debug output
    return result  # Flask serializes the list as JSON
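
A usage sketch (not part of the commit), assuming the Flask app is served locally on the default port 5000; the host, port, and query term are assumptions.

import requests

# Assumes the app is running locally, e.g. via `flask --app src/search.py run`.
resp = requests.get("http://127.0.0.1:5000/search/flask")
print(resp.json())  # list of URL lists, one per query word found in the index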