Add site map crawl option

rmgr 2024-06-08 20:43:05 +09:30
parent e3c67b64e6
commit 2a99a61dbe


@@ -166,6 +166,7 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true")
    parser.add_argument('-s', "--crawl-sitemap", action="store_true")
    parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)
    args = parser.parse_args()
@@ -178,7 +179,21 @@ if __name__ == "__main__":
                parse_html(line, get_html(line))
            except:
                pass
    elif args.crawl_sitemap:
        rp = urllib.robotparser.RobotFileParser()
        urlparts = urlparse(args.url)
        baseurl = urlparts.scheme + "://" + urlparts.netloc
        rp.set_url(baseurl + "/robots.txt")
        rp.read()
        if not rp.can_fetch("*", args.url):
            print("Robots prevents crawling url: " + args.url)
            exit(0)
        if len(rp.site_maps()) > 0:
            map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml')
            for loc in map.find_all('loc'):
                url = loc.contents[0]
                html = get_html(url)
                parse_html(url, html, max_recursion)
    else:
        html = get_html(args.url)
        parse_html(args.url, html, max_recursion)
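
For context, a minimal standalone sketch of the flow the new --crawl-sitemap branch follows, under a few assumptions: crawl_sitemap and its fetch/parse callbacks are hypothetical stand-ins for the script's get_html/parse_html helpers, and requests plus BeautifulSoup with an XML parser (lxml) are available, as the diff already relies on. Unlike the committed lines, it guards against RobotFileParser.site_maps() returning None when robots.txt declares no Sitemap entries.

    # Sketch only; crawl_sitemap, fetch and parse are illustrative names, not part of the commit.
    import urllib.robotparser
    from urllib.parse import urlparse

    import requests
    from bs4 import BeautifulSoup

    def crawl_sitemap(start_url, fetch, parse):
        parts = urlparse(start_url)
        base = parts.scheme + "://" + parts.netloc

        # Check robots.txt before touching the site.
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(base + "/robots.txt")
        rp.read()
        if not rp.can_fetch("*", start_url):
            print("robots.txt disallows crawling: " + start_url)
            return

        sitemaps = rp.site_maps()  # returns None if robots.txt lists no Sitemap lines
        if not sitemaps:
            print("No sitemap declared in robots.txt")
            return

        # Parse the first declared sitemap and visit each <loc> entry.
        soup = BeautifulSoup(requests.get(sitemaps[0]).content, "xml")
        for loc in soup.find_all("loc"):
            url = loc.get_text(strip=True)
            parse(url, fetch(url))

From the command line (script name assumed for illustration), the new option would be invoked as: python crawler.py https://example.com -s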