Respect robots.txt

rmgr 2024-01-01 19:53:22 +10:30
parent b43343e0ee
commit f4ea8ad1d7
4 changed files with 25 additions and 2 deletions


@@ -3,21 +3,32 @@ import argparse
import requests
import hashlib
from urllib.parse import urlparse, urljoin
import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
# TODO- Handle gemini/gopher links
# TODO- Keep a list of traversed links and check before traversing again
def get_html(url: str) -> str:
    response = requests.get(url)
    return response.content

def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
    rp = urllib.robotparser.RobotFileParser()
    print(url)
    print(recursion)
    urlparts = urlparse(url)
    baseurl = urlparts.scheme + "://" + urlparts.netloc
    if baseurl not in robots:
        rp.set_url(baseurl + "/robots.txt")
        rp.read()
        robots[baseurl] = rp
    else:
        rp = robots[baseurl]
    if not rp.can_fetch("*", url):
        print("Robots prevents crawling url: " + url)
        return False
    soup = BeautifulSoup(html, 'html.parser')
    hash = hashlib.sha256()
    hash.update(url.encode('ascii'))
@@ -36,6 +47,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
    links = soup.find_all("a")
    for link in links:
        found = False
if "href" not in link:
continue
        link = link["href"]
        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
            continue
@@ -80,4 +93,5 @@ if __name__ == "__main__":
    # parse_html(line, get_html(line))
    # except:
    # pass
    os.remove('data/links.txt')
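
For context, the core of this change is a per-host robots.txt cache that is consulted before every fetch. Below is a minimal standalone sketch of that pattern, assuming a generic fetch_if_allowed helper and https://example.com/ as a placeholder URL; neither name comes from the crawler itself.

import urllib.robotparser
from typing import Optional
from urllib.parse import urlparse

import requests

# One RobotFileParser per scheme://netloc, so each host's robots.txt
# is downloaded and parsed at most once per run.
robots_cache = {}

def robots_for(url: str) -> urllib.robotparser.RobotFileParser:
    parts = urlparse(url)
    base = parts.scheme + "://" + parts.netloc
    if base not in robots_cache:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(base + "/robots.txt")
        rp.read()  # fetch and parse robots.txt for this host
        robots_cache[base] = rp
    return robots_cache[base]

def fetch_if_allowed(url: str, user_agent: str = "*") -> Optional[bytes]:
    # Illustrative helper, not part of the crawler: skip URLs that the
    # host's robots.txt disallows for this user agent.
    if not robots_for(url).can_fetch(user_agent, url):
        print("Robots prevents crawling url: " + url)
        return None
    return requests.get(url).content

if __name__ == "__main__":
    page = fetch_if_allowed("https://example.com/")
    print(page is not None)

As in the commit, the wildcard user agent "*" is checked; a crawler with its own identity would pass that user-agent string to can_fetch and to the request instead.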