Add site map crawl option
parent e3c67b64e6
commit 2a99a61dbe
1 changed file with 16 additions and 1 deletion
src/crawl.py (+16 -1)
@@ -166,6 +166,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
+    parser.add_argument('-s', "--crawl-sitemap", action="store_true")
     parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)
 
     args = parser.parse_args()
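Note: argparse turns the dashes in "--crawl-sitemap" into an underscored attribute, which is what the new branch in the next hunk tests. A minimal sketch (the argument list passed to parse_args here is hypothetical, just to show the mapping):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("url", help="URL of the webpage to be crawled")
parser.add_argument('-s', "--crawl-sitemap", action="store_true")

# "--crawl-sitemap" is read back as args.crawl_sitemap.
args = parser.parse_args(["https://example.com", "-s"])
print(args.crawl_sitemap)  # True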
@@ -178,7 +179,21 @@ if __name__ == "__main__":
                 parse_html(line, get_html(line))
             except:
                 pass
+    elif args.crawl_sitemap:
+        rp = urllib.robotparser.RobotFileParser()
+        urlparts = urlparse(args.url)
+        baseurl = urlparts.scheme + "://" + urlparts.netloc
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        if not rp.can_fetch("*", args.url):
+            print("Robots prevents crawling url: " + args.url)
+            exit(0)
+        if len(rp.site_maps()) > 0:
+            map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml')
+            for loc in map.find_all('loc'):
+                url = loc.contents[0]
+                html = get_html(url)
+                parse_html(url, html, max_recursion)
     else:
         html = get_html(args.url)
         parse_html(args.url, html, max_recursion)
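For reference, a self-contained sketch of the sitemap path added above. Here get_html and parse_html are hypothetical stand-ins for the helpers defined elsewhere in src/crawl.py (their signatures are assumed from how the diff calls them), the crawl_sitemap wrapper and the example.com URL are illustrative only, and the one behavioural difference from the diff is that a robots.txt with no Sitemap entry (where site_maps() returns None) falls through to a plain crawl instead of raising.

import urllib.robotparser
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


# Stand-ins for helpers defined elsewhere in src/crawl.py (assumed signatures).
def get_html(url):
    return requests.get(url).text


def parse_html(url, html, max_recursion=1):
    print(url, len(html))


def crawl_sitemap(start_url, max_recursion=1):
    # Consult robots.txt: it both gates crawling and advertises sitemaps.
    rp = urllib.robotparser.RobotFileParser()
    parts = urlparse(start_url)
    rp.set_url(parts.scheme + "://" + parts.netloc + "/robots.txt")
    rp.read()
    if not rp.can_fetch("*", start_url):
        print("Robots prevents crawling url: " + start_url)
        return

    sitemaps = rp.site_maps() or []  # site_maps() returns None when robots.txt lists none
    if sitemaps:
        # A sitemap is plain XML; each <loc> element holds one page URL.
        # The 'xml' parser requires lxml to be installed.
        soup = BeautifulSoup(requests.get(sitemaps[0]).content, "xml")
        for loc in soup.find_all("loc"):
            url = loc.get_text()
            parse_html(url, get_html(url), max_recursion)
    else:
        parse_html(start_url, get_html(start_url), max_recursion)


if __name__ == "__main__":
    crawl_sitemap("https://example.com")

RobotFileParser.site_maps() requires Python 3.8 or newer; on older interpreters the sitemap URL would have to be read out of robots.txt by hand.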