Implement recursive page crawling

rmgr 2023-12-06 08:29:39 +10:30
parent 3d7b72e5ef
commit 9fc2e1af53
5 changed files with 47 additions and 26 deletions

View file

@@ -2,14 +2,22 @@
import argparse
import requests
import hashlib
from urllib.parse import urlparse, urljoin
import os
from time import sleep
from bs4 import BeautifulSoup
# TODO- Handle gemini/gopher links
# TODO- Keep a list of traversed links and check before traversing again
def get_html(url: str) -> str:
    response = requests.get(url)
    return response.content
def parse_html(url: str, html: str) -> bool:
def parse_html(url: str, html: str, recursion: int = 0) -> bool:
    print(url)
    print(recursion)
    urlparts = urlparse(url)
    baseurl = urlparts.scheme + "://" + urlparts.netloc
    soup = BeautifulSoup(html,'html.parser')
    hash = hashlib.sha256()
    hash.update(url.encode('ascii'))
@@ -28,18 +36,26 @@ def parse_html(url: str, html: str) -> bool:
    links = soup.find_all("a")
    for link in links:
        found = False
        if 'href' in link:
            link = link["href"]
        else:
            link = link["href"]
        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
            continue
        if not "http" in link:
            link = url + "/" + link
        with open(f'data/links.txt', 'r+') as linksfile:
            while line := linksfile.readline():
                if line.strip() == link.strip():
                    found = True
            if not found:
                linksfile.write(f'{link}\n')
        link = urljoin(url, link)
        if (recursion > 0):
            try:
                link_html = get_html(link)
                r = recursion -1
                sleep(0.5)
                parse_html(link, link_html)
            except:
                pass
        # else:
        #     with open(f'data/links.txt', 'r+') as linksfile:
        #         while line := linksfile.readline():
        #             if line.strip() == link.strip():
        #                 found = True
        #             if not found:
        #                 linksfile.write(f'{link}\n')
if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)
@@ -47,17 +63,20 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true")
    max_recursion = 4
    args = parser.parse_args()
    html = get_html(args.url)
    parse_html(args.url, html)
    parse_html(args.url, html, max_recursion)
    if (args.followlinks):
        with open(f'data/links.txt', 'r+') as linksfile:
            while line := linksfile.readline():
                if "http" in line:
                    try:
                        parse_html(line, get_html(line))
                    except:
                        pass
    # recursion = 0
    # if (args.followlinks):
    #     with open(f'data/links.txt', 'r+') as linksfile:
    #         while line := linksfile.readline():
    #             if recursion < max_recursion:
    #                 if "http" in line:
    #                     recursion += 1
    #                     try:
    #                         parse_html(line, get_html(line))
    #                     except:
    #                         pass
    os.remove('data/links.txt')
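
Note on the hunks above: the new recursive branch computes r = recursion - 1 but then calls parse_html(link, link_html) without passing it, so linked pages run with the default depth of 0 and recursion stops one level below the starting page. A minimal sketch (not part of this commit) of how the decremented depth and the visited-links TODO could be threaded together; the names crawl and visited are illustrative only:

import requests
from urllib.parse import urljoin, urlparse
from time import sleep
from bs4 import BeautifulSoup

def crawl(url: str, depth: int, visited: set) -> None:
    # Skip URLs we have already traversed (the second TODO above).
    if depth < 0 or url in visited:
        return
    visited.add(url)
    try:
        html = requests.get(url, timeout=10).content
    except requests.RequestException:
        return
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.find_all("a", href=True):
        link = urljoin(url, a["href"])
        if urlparse(link).scheme not in ("http", "https"):
            continue  # e.g. gemini:// or gopher:// links (the first TODO)
        sleep(0.5)                       # stay polite, as the commit does
        crawl(link, depth - 1, visited)  # pass the decremented depth down

# crawl("https://example.com", depth=4, visited=set())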

View file

@@ -27,7 +27,7 @@ def build_index():
            if len(matching_urls) == 0:
                # if not url.strip() in dictionary[word]:
                entries = dictionary[word]
                entry = {"url": url.strip(), "count": 1}
                entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                dictionary[word].append(entry)
            else:
                entries = dictionary[word]
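
For illustration only (the surrounding index structure is not shown in this diff): after this change each index entry records the on-disk file alongside the URL and count. A sketch with hypothetical values; only the three keys come from the hunk above:

from pathlib import Path

path = Path("data/content/example-page")  # hypothetical path where the crawler stored the page
entry = {"url": "https://example.com/page", "count": 1, "filename": str(path)}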

View file

@@ -23,8 +23,8 @@ def search(query):
                result.append(item)
            else:
                matching_results[0]["count"] += item["count"]
    #result.append(index[q])
    # result.sort(reverse= True,key=lambda entry: int(entry.count))
    return result
def handle_and():
    pass
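
The commented-out sort in this hunk reads entry.count, which would raise AttributeError on the dict entries built above. If ranking by accumulated count is wanted later, a dict-keyed sort along these lines would do it (a sketch, not part of this commit):

# Sketch only: rank merged results by their accumulated count, highest first.
result = [
    {"url": "https://example.com/a", "count": 3},
    {"url": "https://example.com/b", "count": 7},
]
result.sort(reverse=True, key=lambda entry: entry["count"])
# result[0] is now the entry with count == 7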