Add site map crawl option

rmgr 2024-06-08 20:43:05 +09:30
parent e3c67b64e6
commit 2a99a61dbe


@@ -166,6 +166,7 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true")
    parser.add_argument('-s', "--crawl-sitemap", action="store_true")
    parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)
    args = parser.parse_args()
@@ -178,7 +179,21 @@ if __name__ == "__main__":
                parse_html(line, get_html(line))
            except:
                pass
    elif args.crawl_sitemap:
        rp = urllib.robotparser.RobotFileParser()
        urlparts = urlparse(args.url)
        baseurl = urlparts.scheme + "://" + urlparts.netloc
        rp.set_url(baseurl + "/robots.txt")
        rp.read()
        if not rp.can_fetch("*", args.url):
            print("Robots prevents crawling url: " + args.url)
            exit(0)
        if len(rp.site_maps()) > 0:
            map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml')
            for loc in map.find_all('loc'):
                url = loc.contents[0]
                html = get_html(url)
                parse_html(url, html, max_recursion)
    else:
        html = get_html(args.url)
        parse_html(args.url, html, max_recursion)
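
For context, a minimal standalone sketch of the flow the new --crawl-sitemap branch follows, under a few assumptions: crawl_sitemap and its fetch/parse callbacks are hypothetical stand-ins for the script's get_html/parse_html helpers, and requests plus BeautifulSoup with an XML parser (lxml) are available, as the diff already relies on. Unlike the committed lines, it guards against RobotFileParser.site_maps() returning None when robots.txt declares no Sitemap entries.

    # Sketch only; crawl_sitemap, fetch and parse are illustrative names, not part of the commit.
    import urllib.robotparser
    from urllib.parse import urlparse

    import requests
    from bs4 import BeautifulSoup

    def crawl_sitemap(start_url, fetch, parse):
        parts = urlparse(start_url)
        base = parts.scheme + "://" + parts.netloc

        # Check robots.txt before touching the site.
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(base + "/robots.txt")
        rp.read()
        if not rp.can_fetch("*", start_url):
            print("robots.txt disallows crawling: " + start_url)
            return

        sitemaps = rp.site_maps()  # returns None if robots.txt lists no Sitemap lines
        if not sitemaps:
            print("No sitemap declared in robots.txt")
            return

        # Parse the first declared sitemap and visit each <loc> entry.
        soup = BeautifulSoup(requests.get(sitemaps[0]).content, "xml")
        for loc in soup.find_all("loc"):
            url = loc.get_text(strip=True)
            parse(url, fetch(url))

From the command line (script name assumed for illustration), the new option would be invoked as: python crawler.py https://example.com -s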