Respect robots.txt

2024-01-01 19:53:22 +10:30 · 2024-01-01 19:53:22 +10:30 · f4ea8ad1d7
commit f4ea8ad1d7
parent b43343e0ee
4 changed files with 25 additions and 2 deletions
--- a/src/__pycache__/search.cpython-310.pyc
+++ b/src/__pycache__/search.cpython-310.pyc
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -3,21 +3,32 @@ import argparse
 import requests
 import hashlib
 from urllib.parse import urlparse, urljoin
+import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
 # TODO- Handle gemini/gopher links
-# TODO- Keep a list of traversed links and check before traversing again

 def get_html(url: str) -> str:
    response = requests.get(url)
    return response.content

-def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
+    rp = urllib.robotparser.RobotFileParser()
    print(url)
    print(recursion)
    urlparts = urlparse(url)
    baseurl = urlparts.scheme + "://" + urlparts.netloc
+    if baseurl not in robots:
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        robots[baseurl] = rp
+    else:
+        rp = robots[baseurl]
+    if not rp.can_fetch("*", url):
+        print("Robots prevents crawling url: " + url)
+        return
+    
    soup = BeautifulSoup(html,'html.parser')
    hash = hashlib.sha256()
    hash.update(url.encode('ascii'))
@@ -36,6 +47,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
    links = soup.find_all("a")
    for link in links:
        found = False
+        if "href" not in link:
+            continue
        link = link["href"]
        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
            continue
@@ -80,4 +93,5 @@ if __name__ == "__main__":
 #                            parse_html(line, get_html(line))
 #                        except:
 #                            pass
+
    os.remove('data/links.txt')
--- a/src/index.py
+++ b/src/index.py
@@ -7,6 +7,13 @@ import json
 # investigate ngrams for "multi word" matching
 ignored_words = ['a', 'the','is']

+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
+
+
 def build_index():
    with open(f"data/index.json", "w") as index:
        # get a list of all content files
@@ -20,6 +27,7 @@ def build_index():
                content_words = content.split()
                for word in content_words:
                    word = word.lower()
+                    word = remove_punctuation(word)
                    if not word in ignored_words:
                        if not word in dictionary:
                            dictionary[word] = []