Respect robots.txt
commit f4ea8ad1d7
parent b43343e0ee

4 changed files with 25 additions and 2 deletions
beehave.txt (new file, +1)

@@ -0,0 +1 @@
+https://github.com/bitbrain/beehave

Binary file not shown.
src/crawl.py (18 lines changed)

@@ -3,21 +3,32 @@ import argparse
 import requests
 import hashlib
 from urllib.parse import urlparse, urljoin
+import urllib.robotparser
 import os
 from time import sleep
 from bs4 import BeautifulSoup
 # TODO- Handle gemini/gopher links
-# TODO- Keep a list of traversed links and check before traversing again

 def get_html(url: str) -> str:
     response = requests.get(url)
     return response.content

-def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) -> bool:
+def parse_html(url: str, html: str, recursion: int = 0, traversed_links = [], robots = {}) -> bool:
+    rp = urllib.robotparser.RobotFileParser()
     print(url)
     print(recursion)
     urlparts = urlparse(url)
     baseurl = urlparts.scheme + "://" + urlparts.netloc
+    if baseurl not in robots:
+        rp.set_url(baseurl + "/robots.txt")
+        rp.read()
+        robots[baseurl] = rp
+    else:
+        rp = robots[baseurl]
+    if not rp.can_fetch("*", url):
+        print("Robots prevents crawling url: " + url)
+        return
+
     soup = BeautifulSoup(html,'html.parser')
     hash = hashlib.sha256()
     hash.update(url.encode('ascii'))
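The robots dict keeps one RobotFileParser per host, so each robots.txt is fetched only once per crawl; the mutable default robots = {} is what lets the cache survive the recursive calls (normally a Python pitfall, here load-bearing). Two caveats worth noting: rp.read() performs a network fetch and raises urllib.error.URLError when the host is unreachable, and the bare return yields None even though parse_html is annotated -> bool. A minimal sketch of the same caching pattern with the fetch error handled explicitly (names such as can_crawl and ROBOTS_CACHE are illustrative, not from this commit):

import urllib.robotparser
from urllib.error import URLError
from urllib.parse import urlparse

ROBOTS_CACHE = {}  # baseurl -> RobotFileParser

def can_crawl(url: str, agent: str = "*") -> bool:
    parts = urlparse(url)
    baseurl = parts.scheme + "://" + parts.netloc
    if baseurl not in ROBOTS_CACHE:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(baseurl + "/robots.txt")
        try:
            rp.read()  # fetches and parses robots.txt over the network
        except URLError:
            return False  # policy choice: skip hosts whose robots.txt is unreachable
        ROBOTS_CACHE[baseurl] = rp
    return ROBOTS_CACHE[baseurl].can_fetch(agent, url)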
@@ -36,6 +47,8 @@ def parse_html(url: str, html: str, recursion: int = 0, traversed_links = []) ->
     links = soup.find_all("a")
     for link in links:
         found = False
+        if "href" not in link:
+            continue
         link = link["href"]
         if (len(link) > 0 and link[0] == "#") or "localhost" in link:
             continue
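One caveat on the new guard, hedged because it rests on bs4 internals: the in operator on a Tag tests membership in the tag's children (tag.contents), not in its attributes, so "href" not in link is effectively always true and would skip every link. The attribute-aware spellings are link.has_attr("href") or "href" in link.attrs. A small self-contained check:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<a href="https://example.com">x</a>', 'html.parser')
tag = soup.find("a")

print("href" in tag)         # False: tests children, not attributes
print(tag.has_attr("href"))  # True: the attribute check intended here
print(tag.get("href"))       # attribute access without KeyError risk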
@@ -80,4 +93,5 @@ if __name__ == "__main__":
 # parse_html(line, get_html(line))
 # except:
 # pass
+
 os.remove('data/links.txt')
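Unrelated to robots handling but visible in the context lines: os.remove('data/links.txt') raises FileNotFoundError on a fresh checkout where the file does not exist yet. A guarded variant (a sketch, not what the commit does):

import contextlib
import os

with contextlib.suppress(FileNotFoundError):
    os.remove('data/links.txt')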
(fourth changed file; name not shown in the rendered diff)

@@ -7,6 +7,13 @@ import json
 # investigate ngrams for "multi word" matching
 ignored_words = ['a', 'the','is']

+def remove_punctuation(input_string):
+    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+    for p in punc:
+        input_string = input_string.replace(p, '')
+    return input_string
+
+
 def build_index():
     with open(f"data/index.json", "w") as index:
         # get a list of all content files
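The loop of str.replace calls rescans the whole string once per punctuation character; str.translate performs the same deletion in a single pass. An equivalent sketch using the same punctuation set as the committed function:

PUNC = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
DELETE_PUNC = str.maketrans('', '', PUNC)  # map each PUNC char to None

def remove_punctuation(input_string: str) -> str:
    # delete every character named in PUNC in one pass
    return input_string.translate(DELETE_PUNC)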
@@ -20,6 +27,7 @@ def build_index():
 content_words = content.split()
 for word in content_words:
     word = word.lower()
+    word = remove_punctuation(word)
     if not word in ignored_words:
         if not word in dictionary:
             dictionary[word] = []
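The check-then-create pattern (if not word in dictionary: dictionary[word] = []) is what collections.defaultdict automates, and PEP 8 prefers word not in over not word in. A sketch of the inner loop on the assumption that the index maps each word to a list of documents containing it; doc_id stands in for whatever value the real build_index appends:

from collections import defaultdict

ignored_words = ['a', 'the', 'is']
dictionary = defaultdict(list)  # word -> list of documents containing it

def index_content(content: str, doc_id: str) -> None:
    for word in content.split():
        word = remove_punctuation(word.lower())  # helper from the hunk above
        if word and word not in ignored_words:   # also skips words emptied by punctuation removal
            dictionary[word].append(doc_id)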