Fix crawling. Add initial linksfile crawling. Still need to remove records as they are processed.
parent f4ea8ad1d7
commit efe6dea1f5

1 changed file with 22 additions and 22 deletions

src/crawl.py
@@ -47,7 +47,7 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     links = soup.find_all("a")
     for link in links:
         found = False
-        if "href" not in link:
+        if not hasattr(link, "href"):
             continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
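For reference on the attribute check changed above: in BeautifulSoup, neither "href" in tag nor hasattr(tag, "href") inspects a tag's HTML attributes (both are routed to child lookups), so the library's own test is tag.has_attr("href") or "href" in tag.attrs. Below is a minimal sketch of the same filtering loop written that way; the HTML string is a stand-in, not the project's fetched page.

from bs4 import BeautifulSoup

page_html = '<a href="https://example.com/page">ok</a><a name="top">no href</a>'  # stand-in page
soup = BeautifulSoup(page_html, "html.parser")

for link in soup.find_all("a"):
    # has_attr() (equivalently: "href" in link.attrs) tests the HTML attribute;
    # "href" in link checks the tag's children, and hasattr(link, "href") looks
    # for a child <href> tag, so neither of those examines the attribute itself.
    if not link.has_attr("href"):
        continue
    href = link["href"]
    # skip in-page anchors and local links, mirroring the checks in the hunk above
    if href.startswith("#") or "localhost" in href:
        continue
    print(href)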
@@ -63,13 +63,13 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
                 parse_html(link, link_html, r, traversed_links)
             except:
                 pass
-        # else:
-        #     with open(f'data/links.txt', 'r+') as linksfile:
-        #         while line := linksfile.readline():
-        #             if line.strip() == link.strip():
-        #                 found = True
-        #         if not found:
-        #             linksfile.write(f'{link}\n')
+        elif link not in traversed_links:
+            with open(f'data/links.txt', 'r+') as linksfile:
+                while line := linksfile.readline():
+                    if line.strip() == link.strip():
+                        found = True
+                if not found:
+                    linksfile.write(f'{link}\n')
 
 if __name__ == "__main__":
     os.makedirs("data/content", exist_ok=True)
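The block uncommented in this hunk is the "initial linksfile crawling" bookkeeping from the commit message: a link that was not followed and has not been traversed is compared line by line against data/links.txt and appended only if it is not already there. The same idea as a standalone helper sketch; record_link is an illustrative name, not a function in the repo, and like the diff it assumes data/links.txt already exists, since 'r+' does not create files.

def record_link(link: str, path: str = "data/links.txt") -> None:
    # Scan the whole file for an identical entry before writing anything.
    found = False
    with open(path, "r+") as linksfile:
        while line := linksfile.readline():
            if line.strip() == link.strip():
                found = True
        # After the read loop the file position sits at end-of-file,
        # so this write appends a new record rather than overwriting one.
        if not found:
            linksfile.write(f"{link}\n")

Breaking out of the read loop on the first match would be a small optimisation; the sketch keeps the diff's full scan.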
@@ -77,21 +77,21 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-    max_recursion = 4
+    max_recursion = 2
     args = parser.parse_args()
-    html = get_html(args.url)
-    parse_html(args.url, html, max_recursion)
+    if args.url == "links":
+        with open(f'data/links.txt', 'r+') as linksfile:
+            while line := linksfile.readline():
+                if "http" in line:
+                    try:
+                        parse_html(line, get_html(line))
+                    except:
+                        pass
+
+    else:
+        html = get_html(args.url)
+        parse_html(args.url, html, max_recursion)
 
     # recursion = 0
     # if (args.followlinks):
-    #     with open(f'data/links.txt', 'r+') as linksfile:
-    #         while line := linksfile.readline():
-    #             if recursion < max_recursion:
-    #                 if "http" in line:
-    #                     recursion += 1
-    #                     try:
-    #                         parse_html(line, get_html(line))
-    #                     except:
-    #                         pass
-
-    os.remove('data/links.txt')
+    #     os.remove('data/links.txt')
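Taken together, the __main__ changes give src/crawl.py two modes: a normal URL argument crawls that page as before (with max_recursion lowered from 4 to 2), while the literal argument links replays the queue in data/links.txt recorded by earlier runs. A condensed sketch of that replay loop as it might sit in src/crawl.py, reusing its own get_html and parse_html; crawl_linksfile is an illustrative name only.

def crawl_linksfile(path: str = "data/links.txt") -> None:
    # Replay every recorded URL; lines not containing "http" are skipped.
    with open(path, "r") as linksfile:
        while line := linksfile.readline():
            if "http" not in line:
                continue
            try:
                parse_html(line, get_html(line))
            except Exception:
                # keep going if one page fails, as the blanket except in the diff does
                pass
    # Per the commit message, processed lines are not removed yet,
    # so running the "links" mode twice re-crawls the same URLs.

Invocation stays the same otherwise: python src/crawl.py https://example.com crawls one page and fills data/links.txt, and python src/crawl.py links works through that file afterwards.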