Tidy up crawling and implement boolean search

rmgr 2024-04-04 20:46:34 +10:30
parent d4bb3fb8dc
commit 7ee9d978b2
4 changed files with 91 additions and 30 deletions

@@ -40,21 +40,21 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
if not rp.can_fetch("*", url):
print("Robots prevents crawling url: " + url)
return
-soup = BeautifulSoup(html,'html.parser')
+soup = BeautifulSoup(html, 'html.parser')
hash = hashlib.sha256()
hash.update(url.encode('ascii'))
s = Session()
existing_website = s.query(Documents).filter_by(url=url).first()
print (existing_website)
-if existing_website == None:
+if existing_website is None:
website = Documents(
url=url,
text_content=soup.get_text(),
html_content=soup.prettify(),
first_crawl_date=datetime.datetime.now(),
-last_crawl_date = datetime.datetime.now()
+last_crawl_date=datetime.datetime.now(),
+last_index_date=None
)
s.add(website)
else:
@@ -64,44 +64,44 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
s.close()
x = open(f'data/links.txt', 'a')
x.close()
-links = soup.find_all("a")
+links = soup.find_all("a", href=True)
for link in links:
found = False
-if not hasattr(link, "href"):
-continue
link = link["href"]
if (len(link) > 0 and link[0] == "#") or "localhost" in link:
continue
if not "http" in link:
if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
continue
if "http" not in link:
link = urljoin(url, link)
if (recursion > 0 and link not in traversed_links):
try:
traversed_links.append(link)
link_html = get_html(link)
-r = recursion -1
-sleep(1)
+r = recursion -1
+sleep(0.5)
parse_html(link, link_html, r, traversed_links)
except:
pass
elif link not in traversed_links:
-with open(f'data/links.txt', 'r+') as linksfile:
+with open('data/links.txt', 'r+') as linksfile:
while line := linksfile.readline():
if line.strip() == link.strip():
found = True
if not found:
linksfile.write(f'{link}\n')
-if __name__ == "__main__":
+if __name__ == "__main__":
os.makedirs("data/content", exist_ok=True)
# check inputs
parser = argparse.ArgumentParser()
parser.add_argument("url", help="URL of the webpage to be crawled")
parser.add_argument('-f', "--followlinks", action="store_true")
-max_recursion = 2
+max_recursion = 4
args = parser.parse_args()
if args.url == "links":
-with open(f'data/links.txt', 'r+') as linksfile:
+with open('data/links.txt', 'r+') as linksfile:
while line := linksfile.readline():
if "http" in line:
try: