diff --git a/beehave.txt b/beehave.txt
new file mode 100644
index 0000000..e3415b6
--- /dev/null
+++ b/beehave.txt
@@ -0,0 +1 @@
+https://github.com/bitbrain/beehave
diff --git a/src/__pycache__/search.cpython-310.pyc b/src/__pycache__/search.cpython-310.pyc
index c740282..f3e8621 100644
Binary files a/src/__pycache__/search.cpython-310.pyc and b/src/__pycache__/search.cpython-310.pyc differ
diff --git a/src/crawl.py b/src/crawl.py
index dcac7ed..da6bffa 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -3,21 +3,32 @@ import argparse
 import requests
 import hashlib
 from urllib.parse import urlparse, urljoin
+import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
 
 # TODO- Handle gemini/gopher links
-# TODO- Keep a list of traversed links and check before traversing again
 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content
 
-def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
+    rp = urllib.robotparser.RobotFileParser()
     print(url)
     print(recursion)
     urlparts = urlparse(url)
     baseurl = urlparts.scheme + "://" + urlparts.netloc
+    if baseurl not in robots:
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        robots[baseurl] = rp
+    else:
+        rp = robots[baseurl]
+    if not rp.can_fetch("*", url):
+        print("Robots prevents crawling url: " + url)
+        return
+
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
@@ -36,6 +47,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
     links = soup.find_all("a")
     for link in links:
         found = False
+        if "href" not in link.attrs:
+            continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
@@ -80,4 +93,5 @@ if __name__ == "__main__":
 #        parse_html(line, get_html(line))
 #    except:
 #        pass
+    os.remove('data/links.txt')
 
diff --git a/src/index.py b/src/index.py
index f55a356..7532247 100755
--- a/src/index.py
+++ b/src/index.py
@@ -7,6 +7,13 @@ import json
 # investigate ngrams for "multi word" matching
 ignored_words = ['a', 'the','is']
 
+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
+
+
 def build_index():
     with open(f"data/index.json", "w") as index:
         # get a list of all content files
@@ -20,6 +27,7 @@ def build_index():
             content_words = content.split()
             for word in content_words:
                 word = word.lower()
+                word = remove_punctuation(word)
                 if not word in ignored_words:
                     if not word in dictionary:
                         dictionary[word] = []
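
For context, here is a minimal standalone sketch of the robots.txt gate that the crawl.py hunk introduces: one urllib.robotparser.RobotFileParser per host, cached in a dict keyed by scheme://netloc and consulted before a URL is crawled. The helper name "allowed" and the example.com URL are illustrative only; the crawler inlines this logic inside parse_html.

# Standalone sketch, not part of the repository; "allowed" and the
# example.com URL are made up for illustration.
import urllib.robotparser
from urllib.parse import urlparse

robots = {}  # cache: one parser per scheme://netloc, so robots.txt is fetched once per host

def allowed(url: str, agent: str = "*") -> bool:
    parts = urlparse(url)
    base = parts.scheme + "://" + parts.netloc
    if base not in robots:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(base + "/robots.txt")
        rp.read()  # download and parse robots.txt for this host
        robots[base] = rp
    return robots[base].can_fetch(agent, url)

if __name__ == "__main__":
    print(allowed("https://example.com/some/page"))

The remove_punctuation helper added to index.py strips characters one at a time with str.replace; a single-pass alternative would be input_string.translate(str.maketrans('', '', punc)), which produces the same result for this index.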