Tidy up crawling and implement boolean search

parent d4bb3fb8dc
commit 7ee9d978b2

4 changed files with 91 additions and 30 deletions

src/crawl.py
@@ -40,21 +40,21 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     if not rp.can_fetch("*", url):
         print("Robots prevents crawling url: " + url)
         return
 
-    soup = BeautifulSoup(html,'html.parser')
+    soup = BeautifulSoup(html, 'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
 
     s = Session()
     existing_website = s.query(Documents).filter_by(url=url).first()
-    print (existing_website)
-    if existing_website == None:
+    if existing_website is None:
         website = Documents(
             url=url,
             text_content=soup.get_text(),
             html_content=soup.prettify(),
             first_crawl_date=datetime.datetime.now(),
-            last_crawl_date = datetime.datetime.now()
+            last_crawl_date=datetime.datetime.now(),
+            last_index_date=None
         )
         s.add(website)
     else:
@@ -64,44 +64,44 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], ro
     s.close()
     x = open(f'data/links.txt', 'a')
     x.close()
-    links = soup.find_all("a")
+    links = soup.find_all("a", href=True)
     for link in links:
         found = False
         if not hasattr(link, "href"):
             continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
-        if not "http" in link:
+        if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
+            continue
+        if "http" not in link:
             link = urljoin(url, link)
         if (recursion > 0 and link not in traversed_links):
             try:
                 traversed_links.append(link)
                 link_html = get_html(link)
-                r = recursion -1
-                sleep(1)
+                r = recursion -1
+                sleep(0.5)
                 parse_html(link, link_html, r, traversed_links)
             except:
                 pass
         elif link not in traversed_links:
-            with open(f'data/links.txt', 'r+') as linksfile:
+            with open('data/links.txt', 'r+') as linksfile:
                 while line := linksfile.readline():
                     if line.strip() == link.strip():
                         found = True
                 if not found:
                     linksfile.write(f'{link}\n')
 
-if __name__ == "__main__":
+if __name__ == "__main__":
+    os.makedirs("data/content", exist_ok=True)
     # check inputs
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
-    max_recursion = 2
+    max_recursion = 4
     args = parser.parse_args()
     if args.url == "links":
-        with open(f'data/links.txt', 'r+') as linksfile:
+        with open('data/links.txt', 'r+') as linksfile:
             while line := linksfile.readline():
                 if "http" in line:
                     try:
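The boolean-search changes named in the commit title live in the other changed files, which are not part of the portion of the diff loaded above. Purely as an illustrative sketch -- assuming a query of plain terms joined by AND / OR / NOT is matched against each crawled document's text_content, and using a hypothetical matches() helper with an in-memory stand-in for the Documents rows -- the idea might look like this:

# Hypothetical sketch only, not code from this commit: evaluate a flat
# boolean query such as "python AND crawler NOT javascript" against the
# text of a crawled document.

def _term_matches(term: str, text: str) -> bool:
    # A "NOT x" term requires x to be absent; a plain term requires presence.
    if term.startswith("NOT "):
        return term[4:].lower() not in text
    return term.lower() in text

def matches(query: str, text: str) -> bool:
    text = text.lower()
    # OR has the lowest precedence: the document matches if any OR-group matches.
    for group in query.split(" OR "):
        # Within a group, "a AND b NOT c" means a and b present and c absent.
        terms = [t.strip() for t in group.replace(" NOT ", " AND NOT ").split(" AND ")]
        if all(_term_matches(t, text) for t in terms if t):
            return True
    return False

if __name__ == "__main__":
    # Stand-in for rows from the Documents table (url -> text_content).
    docs = {
        "https://example.com/a": "a small python web crawler",
        "https://example.com/b": "a javascript frontend framework",
    }
    for url, text in docs.items():
        if matches("python AND crawler NOT javascript", text):
            print(url)

In the actual repository the candidate texts would presumably come from the Documents rows written by parse_html above rather than a dict, but that wiring is not visible in the loaded part of the diff.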