Implement recursive page crawling
parent 3d7b72e5ef
commit 9fc2e1af53
5 changed files with 47 additions and 26 deletions
.gitignore (vendored): 4 changes

@@ -1 +1,3 @@
-src/__pycache__
+src/__pycache__/
+data
+env
Binary file not shown.
src/crawl.py: 61 changes

@@ -2,14 +2,22 @@
 import argparse
 import requests
 import hashlib
+from urllib.parse import urlparse, urljoin
 import os
+from time import sleep
 from bs4 import BeautifulSoup
+# TODO- Handle gemini/gopher links
+# TODO- Keep a list of traversed links and check before traversing again
 
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0) -> bool:
+    print(url)
+    print(recursion)
+    urlparts = urlparse(url)
+    baseurl = urlparts.scheme + "://" + urlparts.netloc
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
@@ -28,18 +36,26 @@ def parse_html(url: str, html: str) -> bool:
     links = soup.find_all("a")
     for link in links:
         found = False
-        if 'href' in link:
-            link = link["href"]
-        else:
+        link = link["href"]
+        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
         if not "http" in link:
-            link = url + "/" + link
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if line.strip() == link.strip():
-                    found = True
-            if not found:
-                linksfile.write(f'{link}\n')
+            link = urljoin(url, link)
+        if (recursion > 0):
+            try:
+                link_html = get_html(link)
+                r = recursion -1
+                sleep(0.5)
+                parse_html(link, link_html)
+            except:
+                pass
+        # else:
+        #     with open(f'data/links.txt', 'r+') as linksfile:
+        #         while line := linksfile.readline():
+        #             if line.strip() == link.strip():
+        #                 found = True
+        #     if not found:
+        #         linksfile.write(f'{link}\n')
 
 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
@@ -47,17 +63,20 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
+    max_recursion = 4
     args = parser.parse_args()
     html = get_html(args.url)
-    parse_html(args.url, html)
+    parse_html(args.url, html, max_recursion)
 
-    if (args.followlinks):
-        with open(f'data/links.txt', 'r+') as linksfile:
-            while line := linksfile.readline():
-                if "http" in line:
-                    try:
-                        parse_html(line, get_html(line))
-                    except:
-                        pass
+    # recursion = 0
+    # if (args.followlinks):
+    #     with open(f'data/links.txt', 'r+') as linksfile:
+    #         while line := linksfile.readline():
+    #             if recursion < max_recursion:
+    #                 if "http" in line:
+    #                     recursion += 1
+    #                     try:
+    #                         parse_html(line, get_html(line))
+    #                     except:
+    #                         pass
+    os.remove('data/links.txt')
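For reference, a minimal sketch of the same crawl with the two TODOs folded in: the depth limit is threaded through the recursive call (the committed branch computes r = recursion -1 but never passes it on, so the crawl only descends one level regardless of max_recursion), and an in-memory set of visited URLs stands in for data/links.txt. The crawl function name, the visited parameter, and the request timeout are illustrative assumptions, not part of this commit.

import requests
from urllib.parse import urljoin, urlparse
from time import sleep
from bs4 import BeautifulSoup

def crawl(url: str, depth: int, visited: set[str]) -> None:
    # Stop when the depth budget is spent or the page was already crawled.
    if depth <= 0 or url in visited:
        return
    visited.add(url)
    try:
        html = requests.get(url, timeout=10).content
    except requests.RequestException:
        return
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.find_all("a", href=True):
        link = a["href"]
        if link.startswith("#") or "localhost" in link:
            continue
        link = urljoin(url, link)          # resolve relative links against the page URL
        if urlparse(link).scheme not in ("http", "https"):
            continue                       # skip gemini/gopher/mailto etc.
        sleep(0.5)                         # be polite between requests
        crawl(link, depth - 1, visited)    # pass the reduced depth down

# usage: crawl("https://example.com", depth=4, visited=set())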
@@ -27,7 +27,7 @@ def build_index():
             if len(matching_urls) == 0:
                 # if not url.strip() in dictionary[word]:
                 entries = dictionary[word]
-                entry = {"url": url.strip(), "count": 1}
+                entry = {"url": url.strip(), "count": 1, "filename": str(path)}
                 dictionary[word].append(entry)
             else:
                 entries = dictionary[word]
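The index entry now records the on-disk filename alongside the URL and the term count. As a rough sketch of the data structure this hunk implies, assuming dictionary maps each word to a list of such entries (the helper name add_occurrence and its call site are hypothetical, not shown in the commit):

from collections import defaultdict
from pathlib import Path

# dictionary: word -> list of {"url": str, "count": int, "filename": str}
dictionary: dict[str, list[dict]] = defaultdict(list)

def add_occurrence(word: str, url: str, path: Path) -> None:
    # Mirror the matching_urls check: one entry per (word, url) pair.
    matching = [e for e in dictionary[word] if e["url"] == url.strip()]
    if len(matching) == 0:
        dictionary[word].append({"url": url.strip(), "count": 1, "filename": str(path)})
    else:
        matching[0]["count"] += 1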
@@ -23,8 +23,8 @@ def search(query):
                 result.append(item)
             else:
                 matching_results[0]["count"] += item["count"]
-    #result.append(index[q])
-    # result.sort(reverse= True,key=lambda entry: int(entry.count))
     return result
 
+def handle_and():
+    pass
 
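This hunk sums the counts of duplicate URLs and drops the commented-out ranking line. A possible follow-up, sketched under the assumption that each result is one of the entry dicts above (the rank name is illustrative, not part of the commit):

def rank(results: list[dict]) -> list[dict]:
    # Highest aggregate count first; count is stored as an int, so no cast is needed.
    return sorted(results, key=lambda entry: entry["count"], reverse=True)

# e.g. return rank(result) instead of return result at the end of search()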