Initial commit
commit f36ab2fbfb
5 changed files with 144 additions and 0 deletions
src/crawl.py · 63 · Executable file
@@ -0,0 +1,63 @@
#!/usr/bin/python3
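"""Crawl a web page: store its visible text and prettified HTML under
data/content/ (file names derived from the SHA-256 of the URL) and
append newly discovered links to data/links.txt. With --followlinks,
every absolute link recorded there is fetched and parsed in turn."""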

import argparse
import hashlib
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def get_html(url: str) -> str:
    """Fetch a URL and return the decoded response body."""
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def parse_html(url: str, html: str) -> None:
    soup = BeautifulSoup(html, 'html.parser')

    # Derive stable file names from the URL.
    url_hash = hashlib.sha256(url.encode('utf-8'))
    filename_text = url_hash.hexdigest() + '.txt'
    filename_html = url_hash.hexdigest() + '.html'

    # Store the visible text and the prettified HTML, each prefixed
    # with the source URL on the first line.
    with open(f'data/content/{filename_text}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.get_text())
    with open(f'data/content/{filename_html}', 'w') as outfile:
        outfile.write(url)
        outfile.write('\n')
        outfile.write(soup.prettify())

    # Make sure the links file exists before it is opened for reading.
    open('data/links.txt', 'a').close()

    for link in soup.find_all("a"):
        if not link.has_attr("href"):
            continue
        href = link["href"]
        if "http" not in href:
            # Resolve relative links against the page URL.
            href = urljoin(url, href)

        # Record the link unless it is already present; after the scan
        # the file position is at EOF, so the write appends.
        with open('data/links.txt', 'r+') as linksfile:
            found = False
            while line := linksfile.readline():
                if line.strip() == href.strip():
                    found = True
            if not found:
                linksfile.write(f'{href}\n')


if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)

    # Check inputs.
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true",
                        help="also crawl every link recorded in data/links.txt")

    args = parser.parse_args()
    html = get_html(args.url)
    parse_html(args.url, html)

    if args.followlinks:
        # parse_html() appends new links while this loop reads the same
        # file, so links discovered along the way are picked up as the
        # crawl proceeds.
        with open('data/links.txt', 'r') as linksfile:
            while line := linksfile.readline():
                link = line.strip()
                if link.startswith("http"):
                    try:
                        parse_html(link, get_html(link))
                    except Exception:
                        # Skip pages that fail to fetch or parse.
                        pass