Implement recursive page crawling

rmgr 2023-12-06 08:29:39 +10:30
parent 3d7b72e5ef
commit 9fc2e1af53
5 changed files with 47 additions and 26 deletions

.gitignore
@@ -1 +1,3 @@
-src/__pycache__
+src/__pycache__/
+data
+env


@@ -2,14 +2,22 @@
 import argparse
 import requests
 import hashlib
+from urllib.parse import urlparse, urljoin
 import os
+from time import sleep
 from bs4 import BeautifulSoup

+# TODO- Handle gemini/gopher links
+# TODO- Keep a list of traversed links and check before traversing again
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content

-def parse_html(url: str, html: str) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0) -> bool:
+    print(url)
+    print(recursion)
+    urlparts = urlparse(url)
+    baseurl = urlparts.scheme + "://" + urlparts.netloc
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
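
The two imports added here feed the link handling in the next hunk: urlparse splits a page URL into scheme and host (the same shape as baseurl above), and urljoin resolves relative hrefs against the page they came from. A quick illustration of that standard-library behaviour; the example.com URLs are placeholders, not taken from the repo:

from urllib.parse import urlparse, urljoin

page = "https://example.com/blog/post.html"      # placeholder URL for illustration
parts = urlparse(page)
print(parts.scheme + "://" + parts.netloc)        # https://example.com
print(urljoin(page, "about.html"))                # https://example.com/blog/about.html
print(urljoin(page, "/index.html"))               # https://example.com/index.html
print(urljoin(page, "https://other.example/x"))   # absolute hrefs come back unchanged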
@@ -28,18 +36,26 @@ def parse_html(url: str, html: str) -> bool:
     links = soup.find_all("a")
     for link in links:
         found = False
-        if 'href' in link:
-            link = link["href"]
-        else:
+        link = link["href"]
+        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
         if not "http" in link:
-            link = url + "/" + link
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if line.strip() == link.strip():
-                    found = True
-            if not found:
-                linksfile.write(f'{link}\n')
+            link = urljoin(url, link)
+        if (recursion > 0):
+            try:
+                link_html = get_html(link)
+                r = recursion -1
+                sleep(0.5)
+                parse_html(link, link_html)
+            except:
+                pass
+        # else:
+        #     with open(f'data/links.txt', 'r+') as linksfile:
+        #         while line := linksfile.readline():
+        #             if line.strip() == link.strip():
+        #                 found = True
+        #         if not found:
+        #             linksfile.write(f'{link}\n')

 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
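
The second TODO above (keep a list of traversed links) is not addressed in this commit, so the recursive branch can revisit pages and bounce between pages that link to each other until the depth runs out. A minimal, self-contained sketch of one way to handle it with an in-memory set instead of the old data/links.txt file; crawl and visited are hypothetical names, the fetch and politeness delay mirror the diff:

import requests
from time import sleep
from urllib.parse import urljoin
from bs4 import BeautifulSoup

visited = set()   # hypothetical: URLs already fetched in this run

def crawl(url: str, depth: int) -> None:
    # Stop on depth exhaustion or if this URL was already crawled.
    if depth <= 0 or url in visited:
        return
    visited.add(url)
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all("a", href=True):
        link = urljoin(url, a["href"])
        if link.startswith("http"):
            sleep(0.5)               # same politeness delay as the commit
            crawl(link, depth - 1)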
@@ -47,17 +63,20 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
+    max_recursion = 4

     args = parser.parse_args()
     html = get_html(args.url)
-    parse_html(args.url, html)
-    if (args.followlinks):
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if "http" in line:
-                    try:
-                        parse_html(line, get_html(line))
-                    except:
-                        pass
-    os.remove('data/links.txt')
+    parse_html(args.url, html, max_recursion)
+    # recursion = 0
+    # if (args.followlinks):
+    #     with open(f'data/links.txt', 'r+') as linksfile:
+    #         while line := linksfile.readline():
+    #             if recursion < max_recursion:
+    #                 if "http" in line:
+    #                     recursion += 1
+    #                     try:
+    #                         parse_html(line, get_html(line))
+    #                     except:
+    #                         pass
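
Two loose ends are visible in this hunk: -f/--followlinks is still registered but no longer read, and inside parse_html the decremented depth r is computed without being passed to the recursive call, so nested calls fall back to the default recursion=0. A hedged sketch of a driver that ties the flag to the depth, reusing the functions from the diff; start_depth is my name, and passing recursion - 1 down is an assumed change, not part of the commit:

# Sketch only, not the committed behaviour.
args = parser.parse_args()
start_depth = max_recursion if args.followlinks else 0
html = get_html(args.url)
parse_html(args.url, html, start_depth)
# ...and inside parse_html, recurse with: parse_html(link, link_html, recursion - 1)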


@@ -27,7 +27,7 @@ def build_index():
            if len(matching_urls) == 0:
                # if not url.strip() in dictionary[word]:
                entries = dictionary[word]
-                entry = {"url": url.strip(), "count": 1}
+                entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                dictionary[word].append(entry)
            else:
                entries = dictionary[word]
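
The only change in the indexer is that each posting now records the cached file it came from, next to the URL and the per-word count. A self-contained toy version of that update step, under my reading of the surrounding code; add_occurrence and the example values are hypothetical, and the increment branch is assumed:

from collections import defaultdict
from pathlib import Path

dictionary = defaultdict(list)   # word -> list of postings

def add_occurrence(word: str, url: str, path: Path) -> None:
    # Append a new posting for this URL, or bump the count on an existing one.
    matching_urls = [e for e in dictionary[word] if e["url"] == url.strip()]
    if len(matching_urls) == 0:
        dictionary[word].append({"url": url.strip(), "count": 1, "filename": str(path)})
    else:
        matching_urls[0]["count"] += 1

add_occurrence("crawler", "https://example.com/", Path("data/content/0f3a"))   # placeholder values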


@@ -23,8 +23,8 @@ def search(query):
                result.append(item)
            else:
                matching_results[0]["count"] += item["count"]

-    #result.append(index[q])
-    # result.sort(reverse= True,key=lambda entry: int(entry.count))
    return result

+def handle_and():
+    pass
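
The sort comment dropped here indexed the postings as objects (entry.count) even though they are plain dicts, and handle_and() is only a stub so far. A sketch of how ranking and AND queries could look, assuming search() keeps returning dicts shaped like the postings built in build_index; the handle_and signature here is my assumption, the repo's stub takes no arguments:

def rank(results):
    # Highest aggregated count first; postings are dicts, so index by key.
    return sorted(results, key=lambda entry: entry["count"], reverse=True)

def handle_and(per_term_results):
    # AND semantics: keep only URLs present in every term's result list,
    # summing their counts, then rank the merged postings.
    if not per_term_results:
        return []
    common = set.intersection(*[{e["url"] for e in results} for results in per_term_results])
    merged = {}
    for results in per_term_results:
        for e in results:
            if e["url"] in common:
                merged.setdefault(e["url"], {**e, "count": 0})
                merged[e["url"]]["count"] += e["count"]
    return rank(list(merged.values()))

# e.g. handle_and([search("web"), search("crawler")]) would return pages matching both terms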