Initial commit
commit f36ab2fbfb
5 changed files with 144 additions and 0 deletions
requirements.txt (Normal file, 21 additions)
@@ -0,0 +1,21 @@
beautifulsoup4==4.12.2
blinker==1.7.0
bs4==0.0.1
certifi==2023.11.17
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
cssselect==1.2.0
flask==3.0.0
idna==3.6
importlib-metadata==6.8.0
itsdangerous==2.1.2
Jinja2==3.1.2
lxml==4.9.3
MarkupSafe==2.1.3
readability-lxml==0.8.1
requests==2.31.0
soupsieve==2.5
urllib3==2.1.0
werkzeug==3.0.1
zipp==3.17.0
src/__pycache__/search.cpython-39.pyc (BIN, Normal file)
Binary file not shown.
src/crawl.py (Executable file, 63 additions)
@@ -0,0 +1,63 @@
#!/usr/bin/python3
import argparse
import hashlib
import os

import requests
from bs4 import BeautifulSoup


def get_html(url: str) -> str:
    response = requests.get(url)
    return response.text


def parse_html(url: str, html: str) -> None:
    soup = BeautifulSoup(html, 'html.parser')

    # Output files are named after the SHA-256 hash of the URL.
    url_hash = hashlib.sha256(url.encode('ascii'))
    filename_text = url_hash.hexdigest() + '.txt'
    filename_html = url_hash.hexdigest() + '.html'

    # First line of each file is the source URL, then the page content.
    with open(f'data/content/{filename_text}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.get_text())
    with open(f'data/content/{filename_html}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.prettify())

    # Make sure data/links.txt exists before opening it read/write below.
    open('data/links.txt', 'a').close()

    # Record every outgoing link, skipping ones already in data/links.txt.
    for link in soup.find_all("a"):
        if not link.has_attr("href"):
            continue
        href = link["href"]
        if "http" not in href:
            href = url + "/" + href
        with open('data/links.txt', 'r+') as linksfile:
            found = False
            while line := linksfile.readline():
                if line.strip() == href.strip():
                    found = True
            if not found:
                linksfile.write(f'{href}\n')


if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)

    # check inputs
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true",
                        help="also crawl every link recorded in data/links.txt")

    args = parser.parse_args()
    html = get_html(args.url)
    parse_html(args.url, html)

    if args.followlinks:
        with open('data/links.txt', 'r') as linksfile:
            while line := linksfile.readline():
                if "http" in line:
                    try:
                        url = line.strip()
                        parse_html(url, get_html(url))
                    except Exception:
                        # Ignore pages that fail to download or parse.
                        pass
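Note: parse_html names its output after the SHA-256 hash of the URL, so a given page's files can be located by recomputing that hash. A minimal sketch (the content_paths helper is hypothetical, and it assumes the scripts are run from the repository root so data/content/ is the output directory):

import hashlib

def content_paths(url: str):
    # Hypothetical helper mirroring parse_html's naming scheme:
    # SHA-256 of the URL, .txt for extracted text, .html for prettified HTML.
    digest = hashlib.sha256(url.encode('ascii')).hexdigest()
    return f'data/content/{digest}.txt', f'data/content/{digest}.html'

print(content_paths('https://example.com'))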
src/index.py (Executable file, 36 additions)
@@ -0,0 +1,36 @@
#!/usr/bin/python3

from pathlib import Path
import argparse
import json

# Words too common to be worth indexing.
ignored_words = ['a', 'the', 'is']


def build_index():
    with open("data/index.json", "w") as index:
        # get a list of all content files
        # split on whitespace and add to index
        dictionary = {}
        pathlist = Path('data/content').rglob('*.txt')
        for path in pathlist:
            with open(str(path)) as content_file:
                # The first line of each content file is the source URL.
                url = content_file.readline()
                content = content_file.read()
                for word in content.split():
                    word = word.lower()
                    if word in ignored_words:
                        continue
                    if word not in dictionary:
                        dictionary[word] = []
                    if url.strip() not in dictionary[word]:
                        dictionary[word].append(url.strip())

        index.write(json.dumps(dictionary))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', "--rebuild", action="store_true",
                        help="Blow away the index and rebuild")
    args = parser.parse_args()
    if args.rebuild:
        build_index()
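Note: the resulting data/index.json is a flat map from lowercased word to the list of URLs whose extracted text contained it. A quick way to inspect it (a sketch, assuming the index has already been built and the snippet is run from the repository root):

import json

with open('data/index.json') as f:
    index = json.load(f)

# Each key is a lowercased word; each value is a list of source URLs.
print(len(index), 'indexed words')
print(index.get('flask', []))  # 'flask' is just an example lookup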
src/search.py (Executable file, 24 additions)
@@ -0,0 +1,24 @@
#!/usr/bin/python3

from flask import Flask
import json
from urllib.parse import unquote

app = Flask(__name__)


@app.route("/search/<query>")
def search(query):
    # Load the inverted index built by src/index.py.
    with open('data/index.json', 'r') as index_json:
        index = json.load(index_json)
    query = unquote(query)
    result = []
    for q in query.split():
        q = q.lower()
        if q in index:
            result.append(index[q])
    print(result)
    # Flask serialises the list (of per-word URL lists) to JSON.
    return result
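Note: one way to exercise the /search/<query> route without binding a port is Flask's test client. A sketch, assuming it is run from the repository root (so data/index.json resolves) and that src/ is importable as a package:

from src.search import app  # assumes src/ resolves on the import path

with app.test_client() as client:
    # Multi-word queries are split on whitespace, one lookup per word.
    response = client.get('/search/web%20crawler')
    print(response.get_json())  # list of per-word URL lists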