Add site map crawl option
parent e3c67b64e6
commit 2a99a61dbe
1 changed file with 16 additions and 1 deletion
src/crawl.py (+16 -1)
@@ -166,6 +166,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
+    parser.add_argument('-s', "--crawl-sitemap", action="store_true")
     parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)
 
     args = parser.parse_args()
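Note: argparse turns the dashes in "--crawl-sitemap" into an underscored attribute, which is what the new branch in the next hunk tests. A minimal sketch (the argument list passed to parse_args here is hypothetical, just to show the mapping):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("url", help="URL of the webpage to be crawled")
parser.add_argument('-s', "--crawl-sitemap", action="store_true")

# "--crawl-sitemap" is read back as args.crawl_sitemap.
args = parser.parse_args(["https://example.com", "-s"])
print(args.crawl_sitemap)  # True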
@@ -178,7 +179,21 @@ if __name__ == "__main__":
                 parse_html(line, get_html(line))
             except:
                 pass
+    elif args.crawl_sitemap:
+        rp = urllib.robotparser.RobotFileParser()
+        urlparts = urlparse(args.url)
+        baseurl = urlparts.scheme + "://" + urlparts.netloc
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        if not rp.can_fetch("*", args.url):
+            print("Robots prevents crawling url: " + args.url)
+            exit(0)
+        if len(rp.site_maps()) > 0:
+            map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml')
+            for loc in map.find_all('loc'):
+                url = loc.contents[0]
+                html = get_html(url)
+                parse_html(url, html, max_recursion)
     else:
         html = get_html(args.url)
         parse_html(args.url, html, max_recursion)
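For reference, a self-contained sketch of the sitemap path added above. Here get_html and parse_html are hypothetical stand-ins for the helpers defined elsewhere in src/crawl.py (their signatures are assumed from how the diff calls them), the crawl_sitemap wrapper and the example.com URL are illustrative only, and the one behavioural difference from the diff is that a robots.txt with no Sitemap entry (where site_maps() returns None) falls through to a plain crawl instead of raising.

import urllib.robotparser
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


# Stand-ins for helpers defined elsewhere in src/crawl.py (assumed signatures).
def get_html(url):
    return requests.get(url).text


def parse_html(url, html, max_recursion=1):
    print(url, len(html))


def crawl_sitemap(start_url, max_recursion=1):
    # Consult robots.txt: it both gates crawling and advertises sitemaps.
    rp = urllib.robotparser.RobotFileParser()
    parts = urlparse(start_url)
    rp.set_url(parts.scheme + "://" + parts.netloc + "/robots.txt")
    rp.read()
    if not rp.can_fetch("*", start_url):
        print("Robots prevents crawling url: " + start_url)
        return

    sitemaps = rp.site_maps() or []  # site_maps() returns None when robots.txt lists none
    if sitemaps:
        # A sitemap is plain XML; each <loc> element holds one page URL.
        # The 'xml' parser requires lxml to be installed.
        soup = BeautifulSoup(requests.get(sitemaps[0]).content, "xml")
        for loc in soup.find_all("loc"):
            url = loc.get_text()
            parse_html(url, get_html(url), max_recursion)
    else:
        parse_html(start_url, get_html(start_url), max_recursion)


if __name__ == "__main__":
    crawl_sitemap("https://example.com")

RobotFileParser.site_maps() requires Python 3.8 or newer; on older interpreters the sitemap URL would have to be read out of robots.txt by hand.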