diff --git a/src/crawl.py b/src/crawl.py
index 6966a25..0816e1b 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -166,6 +166,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
+    parser.add_argument('-s', "--crawl-sitemap", action="store_true")
     parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)

     args = parser.parse_args()
@@ -178,7 +179,21 @@ if __name__ == "__main__":
                 parse_html(line, get_html(line))
             except:
                 pass
-
+    elif args.crawl_sitemap:
+        rp = urllib.robotparser.RobotFileParser()
+        urlparts = urlparse(args.url)
+        baseurl = urlparts.scheme + "://" + urlparts.netloc
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        if not rp.can_fetch("*", args.url):
+            print("Robots prevents crawling url: " + args.url)
+            exit(0)
+        if rp.site_maps():  # site_maps() returns None if robots.txt lists no sitemap
+            sitemap = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml')
+            for loc in sitemap.find_all('loc'):
+                url = loc.contents[0]
+                html = get_html(url)
+                parse_html(url, html, max_recursion)
     else:
         html = get_html(args.url)
         parse_html(args.url, html, max_recursion)
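
For context, here is a minimal standalone sketch of the sitemap path this patch adds: discover the sitemap through robots.txt, then yield every `<loc>` URL it lists. It assumes `lxml` is installed for BeautifulSoup's `'xml'` parser, uses `https://example.com/` purely as a placeholder, and `sitemap_urls` is a hypothetical helper that is not part of `crawl.py`.

```python
# Sketch only: not part of crawl.py. Mirrors the --crawl-sitemap branch above.
import urllib.robotparser
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup  # the 'xml' parser below requires lxml


def sitemap_urls(start_url):
    """Yield page URLs from the first sitemap advertised in robots.txt."""
    parts = urlparse(start_url)
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(parts.scheme + "://" + parts.netloc + "/robots.txt")
    rp.read()

    if not rp.can_fetch("*", start_url):
        return  # robots.txt disallows crawling this URL

    sitemaps = rp.site_maps()  # None when robots.txt has no Sitemap entries
    if not sitemaps:
        return

    xml = BeautifulSoup(requests.get(sitemaps[0]).content, "xml")
    for loc in xml.find_all("loc"):
        yield loc.get_text(strip=True)


if __name__ == "__main__":
    # Placeholder URL; with the patch applied, the roughly equivalent CLI call
    # would be: ./src/crawl.py --crawl-sitemap https://example.com/
    for page_url in sitemap_urls("https://example.com/"):
        print(page_url)
```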