Initial commit

rmgr 2023-11-28 20:51:54 +10:30
commit f36ab2fbfb
5 changed files with 144 additions and 0 deletions

requirements.txt Normal file (+21)

@@ -0,0 +1,21 @@
beautifulsoup4==4.12.2
blinker==1.7.0
bs4==0.0.1
certifi==2023.11.17
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
cssselect==1.2.0
flask==3.0.0
idna==3.6
importlib-metadata==6.8.0
itsdangerous==2.1.2
Jinja2==3.1.2
lxml==4.9.3
MarkupSafe==2.1.3
readability-lxml==0.8.1
requests==2.31.0
soupsieve==2.5
urllib3==2.1.0
werkzeug==3.0.1
zipp==3.17.0
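
A small optional sketch (not part of the commit) that compares the pins above against what is actually installed, using only the standard library:

from importlib.metadata import PackageNotFoundError, version

for line in open("requirements.txt"):
    name, _, pinned = line.strip().partition("==")
    if not name:
        continue
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not installed"
    print(f"{name}: pinned {pinned}, installed {installed}")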

Binary file not shown.

src/crawl.py Executable file (+63)

@@ -0,0 +1,63 @@
#!/usr/bin/python3
import argparse
import hashlib
import os

import requests
from bs4 import BeautifulSoup


def get_html(url: str) -> str:
    # Fetch the page and return its decoded body.
    response = requests.get(url)
    return response.text


def parse_html(url: str, html: str) -> None:
    soup = BeautifulSoup(html, 'html.parser')

    # Name the output files after the SHA-256 hash of the URL so every page
    # gets a stable, filesystem-safe filename.
    digest = hashlib.sha256(url.encode('utf-8')).hexdigest()
    filename_text = digest + '.txt'
    filename_html = digest + '.html'

    # Store the plain text and the prettified HTML, each prefixed with the URL.
    with open(f'data/content/{filename_text}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.get_text())
    with open(f'data/content/{filename_html}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.prettify())

    # Make sure the links file exists before it is opened for reading below.
    open('data/links.txt', 'a').close()

    # Collect outgoing links, resolving relative ones against the current URL,
    # and append each link to data/links.txt if it is not already recorded.
    for link in soup.find_all("a"):
        if not link.has_attr("href"):
            continue
        href = link["href"]
        if "http" not in href:
            href = url + "/" + href
        found = False
        with open('data/links.txt', 'r+') as linksfile:
            while line := linksfile.readline():
                if line.strip() == href.strip():
                    found = True
            if not found:
                linksfile.write(f'{href}\n')


if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)

    # check inputs
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true",
                        help="Also crawl the links recorded in data/links.txt")
    args = parser.parse_args()

    html = get_html(args.url)
    parse_html(args.url, html)

    if args.followlinks:
        with open('data/links.txt') as linksfile:
            while line := linksfile.readline():
                line = line.strip()
                if "http" in line:
                    try:
                        parse_html(line, get_html(line))
                    except Exception:
                        # Skip pages that fail to download or parse.
                        pass
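
A quick sketch (not part of the commit) of how the crawler names its output files: each URL is hashed with SHA-256 and the digest becomes the filename under data/content/. The URL below is hypothetical.

import hashlib

url = "https://example.com"
digest = hashlib.sha256(url.encode("utf-8")).hexdigest()
print(f"data/content/{digest}.txt")   # plain-text copy written by parse_html
print(f"data/content/{digest}.html")  # prettified HTML copy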

src/index.py Executable file (+36)

@@ -0,0 +1,36 @@
#!/usr/bin/python3
from pathlib import Path
import argparse
import json

# Very small stop-word list; these words are never added to the index.
ignored_words = ['a', 'the', 'is']


def build_index():
    # Build an inverted index: every word found in the crawled page text
    # maps to the list of URLs whose pages contained it.
    with open("data/index.json", "w") as index:
        dictionary = {}
        # The first line of each content file is the URL; the rest is the page text.
        pathlist = Path('data/content').rglob('*.txt')
        for path in pathlist:
            with open(path) as content_file:
                url = content_file.readline().strip()
                content = content_file.read()
                # Split the page text on whitespace and add each word to the index.
                for word in content.split():
                    word = word.lower()
                    if word in ignored_words:
                        continue
                    if word not in dictionary:
                        dictionary[word] = []
                    if url not in dictionary[word]:
                        dictionary[word].append(url)
        index.write(json.dumps(dictionary))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', "--rebuild", action="store_true",
                        help="Blow away the index and rebuild it from data/content")
    args = parser.parse_args()
    if args.rebuild:
        build_index()
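
A small sketch (not part of the commit) of reading the index back: data/index.json maps each lower-cased word to the list of URLs whose page text contained it. The query term below is hypothetical.

import json

with open("data/index.json") as f:
    index = json.load(f)
print(index.get("flask", []))  # URLs whose pages contained the word "flask"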

src/search.py Executable file (+24)

@@ -0,0 +1,24 @@
#!/usr/bin/python3
import json
from urllib.parse import unquote

from flask import Flask

app = Flask(__name__)


@app.route("/search/<query>")
def search(query):
    # Load the inverted index built by src/index.py.
    with open('data/index.json', 'r') as index_json:
        index = json.load(index_json)
    # Look up each (lower-cased) word of the query and collect its URL list.
    query = unquote(query)
    result = []
    for q in query.split():
        q = q.lower()
        if q in index:
            result.append(index[q])
    print(result)  # debug output
    return result  # Flask serializes the list as JSON
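
A usage sketch (not part of the commit), assuming the Flask app is served locally on the default port 5000; the host, port, and query term are assumptions.

import requests

# Assumes the app is running locally, e.g. via `flask --app src/search.py run`.
resp = requests.get("http://127.0.0.1:5000/search/flask")
print(resp.json())  # list of URL lists, one per query word found in the index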