Initial commit

rmgr 2023-11-28 20:51:54 +10:30
commit f36ab2fbfb
5 changed files with 144 additions and 0 deletions

src/crawl.py (new executable file, 63 lines)

@@ -0,0 +1,63 @@
#!/usr/bin/python3
import argparse
import hashlib
import os

import requests
from bs4 import BeautifulSoup


def get_html(url: str) -> str:
    # Fetch the page and return the decoded response body.
    response = requests.get(url)
    return response.text


def parse_html(url: str, html: str) -> None:
    soup = BeautifulSoup(html, 'html.parser')
    # Name the output files after the SHA-256 hash of the URL.
    url_hash = hashlib.sha256(url.encode('utf-8')).hexdigest()
    filename_text = url_hash + '.txt'
    filename_html = url_hash + '.html'
    # Save the extracted text and the prettified HTML, each prefixed
    # with the source URL on its own line.
    with open(f'data/content/{filename_text}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.get_text())
    with open(f'data/content/{filename_html}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.prettify())
    # Ensure the links file exists before opening it in r+ mode below.
    open('data/links.txt', 'a').close()
    links = soup.find_all("a")
    for link in links:
        found = False
        if link.has_attr("href"):
            link = link["href"]
        else:
            continue
        # Treat links without a scheme as relative to the page URL.
        if "http" not in link:
            link = url + "/" + link
        # Record the link only if it has not been seen before.
        with open('data/links.txt', 'r+') as linksfile:
            while line := linksfile.readline():
                if line.strip() == link.strip():
                    found = True
            if not found:
                linksfile.write(f'{link}\n')


if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)
    # check inputs
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true",
                        help="also crawl every link collected from the page")
    args = parser.parse_args()
    html = get_html(args.url)
    parse_html(args.url, html)
    if args.followlinks:
        # Crawl every link recorded in data/links.txt.
        with open('data/links.txt', 'r') as linksfile:
            while line := linksfile.readline():
                line = line.strip()
                if "http" in line:
                    try:
                        parse_html(line, get_html(line))
                    except Exception:
                        pass
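
A minimal usage sketch, assuming the script is executable and run from the repository root (the data/ directory is created on first run; example.com is a placeholder URL):

    ./src/crawl.py https://example.com
    ./src/crawl.py -f https://example.com

With -f/--followlinks, every link collected into data/links.txt is fetched and parsed as well; per-page output lands in data/content/ as <sha256-of-url>.txt and .html.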