Add site map crawl option
parent e3c67b64e6
commit 2a99a61dbe
1 changed file with 16 additions and 1 deletion
src/crawl.py | 17 ++++++++++++++++-
@@ -166,6 +166,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("url", help="URL of the webpage to be crawled")
     parser.add_argument('-f', "--followlinks", action="store_true")
+    parser.add_argument('-s', "--crawl-sitemap", action="store_true")
     parser.add_argument('-r', "--max-recursion", help="", type=int, default=1)
 
     args = parser.parse_args()
@@ -178,7 +179,21 @@ if __name__ == "__main__":
                 parse_html(line, get_html(line))
             except:
                 pass
+    elif args.crawl_sitemap:
+        rp = urllib.robotparser.RobotFileParser()
+        urlparts = urlparse(args.url)
+        baseurl = urlparts.scheme + "://" + urlparts.netloc
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        if not rp.can_fetch("*", args.url):
+            print("Robots prevents crawling url: " + args.url)
+            exit(0)
+        if len(rp.site_maps()) > 0:
+            map = BeautifulSoup(requests.get(rp.site_maps()[0]).content, 'xml')
+            for loc in map.find_all('loc'):
+                url = loc.contents[0]
+                html = get_html(url)
+                parse_html(url, html, max_recursion)
     else:
         html = get_html(args.url)
         parse_html(args.url, html, max_recursion)
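For reference, the new branch relies on urllib.robotparser.RobotFileParser.site_maps() (Python 3.8+), which returns None when robots.txt declares no Sitemap: entries, so the len(...) check in the diff assumes at least one sitemap is listed. Below is a minimal standalone sketch of the same flow with that case guarded; it is not the repository's code, and the iter_sitemap_urls name, the user_agent parameter, and the example URL are placeholders for illustration.

import urllib.robotparser
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


def iter_sitemap_urls(start_url, user_agent="*"):
    """Yield page URLs listed in the first sitemap advertised by robots.txt."""
    parts = urlparse(start_url)
    base = parts.scheme + "://" + parts.netloc

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(base + "/robots.txt")
    rp.read()

    if not rp.can_fetch(user_agent, start_url):
        return  # robots.txt disallows crawling this URL

    sitemaps = rp.site_maps()  # None if robots.txt has no Sitemap: lines
    if not sitemaps:
        return

    # Sitemaps are XML; the 'xml' parser requires lxml to be installed.
    soup = BeautifulSoup(requests.get(sitemaps[0]).content, "xml")
    for loc in soup.find_all("loc"):
        yield loc.get_text(strip=True)


if __name__ == "__main__":
    # Hypothetical usage; crawl.py itself feeds each URL to get_html()/parse_html().
    for url in iter_sitemap_urls("https://example.com/"):
        print(url)

With the commit applied, the corresponding path in crawl.py is exercised via the new flag, e.g. python src/crawl.py -s https://example.com/ (example URL only).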