#!/usr/bin/python3
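"""Small web crawler.

Fetches a page, stores its text and HTML content in the database configured by
config.DATABASE_URI (via the Documents model), and queues outgoing links in
data/links.txt for later crawling. YouTube URLs are handled separately by
fetching their subtitles with yt-dlp instead of the page HTML.

Invoke with a URL as the positional argument to crawl it (recursing a few
levels into its links), or with the literal argument "links" to crawl every
URL queued in data/links.txt. The -f/--followlinks flag is parsed but its
handling is currently commented out at the bottom of the file.
"""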
import argparse
import requests
import hashlib
from urllib.parse import urlparse, urljoin
import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import Base, Documents, Document_Tokens, Tokens
import datetime
import yt_dlp as youtube_dl

# TODO- Handle gemini/gopher links

# Create the tables if they do not exist yet and prepare a session factory.
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
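# Assumption: the Documents model imported from models.py exposes at least the
# columns used below (url, text_content, html_content, first_crawl_date,
# last_crawl_date, last_index_date); models.py itself is not part of this file.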


def get_html(url: str) -> bytes:
    # Fetch the raw response body; BeautifulSoup decodes the bytes itself.
    response = requests.get(url)
    return response.content


def parse_youtube(video_url: str) -> None:
    # Language preference for subtitles (set to None for auto-generated)
    # Change this to 'en' for English subtitles, or None for auto-generated
    subtitle_language = 'en'

    # Options for youtube_dl
    ydl_opts = {
        'writesubtitles': True,
        'allsubtitles': True,
        'skip_download': True,  # We only want to fetch metadata
        'subtitleslangs': [subtitle_language] if subtitle_language else None,
    }

    # Initialize youtube_dl object and download the video metadata
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(video_url, download=False)

    # Extract subtitles: walk the available subtitle tracks and keep the text
    # of the last 'srv1' (XML) track found for the preferred language.
    subtitles = info_dict.get('subtitles')
    subtitles_text = ""
    if subtitles:
        for subs in subtitles.values():
            for sub in subs:
                subtitle_url = sub['url']
                with youtube_dl.YoutubeDL({}) as ydl:
                    subtitle_info = ydl.extract_info(
                        subtitle_url, download=False)
                for subtitle in subtitle_info['subtitles'][subtitle_language]:
                    if subtitle["ext"] == "srv1":
                        soup = BeautifulSoup(
                            get_html(subtitle["url"]), 'html.parser')
                        subtitles_text = soup.get_text()

    # Store (or refresh) the document record, using the subtitles as its text.
    s = Session()
    existing_website = s.query(
        Documents).filter_by(url=video_url).first()
    if existing_website is None:
        website = Documents(
            url=video_url,
            text_content=subtitles_text,
            html_content=None,  # soup.prettify(),
            first_crawl_date=datetime.datetime.now(),
            last_crawl_date=datetime.datetime.now(),
            last_index_date=None
        )
        s.add(website)
    else:
        existing_website.last_crawl_date = datetime.datetime.now()
        s.add(existing_website)
    s.commit()
    s.close()


def parse_html(url: str, html: bytes, recursion: int = 0, traversed_links=[], robots={}) -> None:
    # Note: the mutable defaults are (ab)used as process-wide caches of the
    # links already visited and of the parsed robots.txt per site.
    if "youtube.com" in url:
        parse_youtube(url)
        return
    rp = urllib.robotparser.RobotFileParser()
    print(url)
    print(recursion)

    # Honour robots.txt, caching one parser per scheme://host.
    urlparts = urlparse(url)
    baseurl = urlparts.scheme + "://" + urlparts.netloc
    if baseurl not in robots:
        rp.set_url(baseurl + "/robots.txt")
        rp.read()
        robots[baseurl] = rp
    else:
        rp = robots[baseurl]
    if not rp.can_fetch("*", url):
        print("robots.txt prevents crawling url: " + url)
        return

    soup = BeautifulSoup(html, 'html.parser')
    hash = hashlib.sha256()
    hash.update(url.encode('ascii'))  # currently unused

    # Store (or refresh) the document record.
    s = Session()
    existing_website = s.query(Documents).filter_by(url=url).first()
    if existing_website is None:
        website = Documents(
            url=url,
            text_content=soup.get_text(),
            html_content=soup.prettify(),
            first_crawl_date=datetime.datetime.now(),
            last_crawl_date=datetime.datetime.now(),
            last_index_date=None
        )
        s.add(website)
    else:
        existing_website.last_crawl_date = datetime.datetime.now()
        s.add(existing_website)
    s.commit()
    s.close()

    # Make sure the link queue file exists before it is read below.
    x = open('data/links.txt', 'a')
    x.close()

    # Walk the outgoing links: recurse into them while the recursion budget
    # lasts, otherwise queue them in data/links.txt for a later run.
    links = soup.find_all("a", href=True)
    for link in links:
        found = False
        link = link["href"]
        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
            continue
        if ".webp" in link or ".jpeg" in link or ".png" in link or ".gif" in link or ".pdf" in link or ".jpg" in link:
            continue
        if "http" not in link:
            link = urljoin(url, link)
        if (recursion > 0 and link not in traversed_links):
            try:
                traversed_links.append(link)
                link_html = get_html(link)
                r = recursion - 1
                sleep(0.5)
                parse_html(link, link_html, r, traversed_links)
            except Exception:
                pass
        elif link not in traversed_links:
            with open('data/links.txt', 'r+') as linksfile:
                while line := linksfile.readline():
                    if line.strip() == link.strip():
                        found = True
                if not found:
                    linksfile.write(f'{link}\n')
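# Example of using the crawler programmatically rather than via the CLI
# (a sketch; it assumes the database behind config.DATABASE_URI is reachable):
#
#   html = get_html("https://example.com")
#   parse_html("https://example.com", html, recursion=1)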


if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)

    # check inputs
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true")
    max_recursion = 4
    args = parser.parse_args()

    if args.url == "links":
        # Re-crawl every URL previously queued in data/links.txt.
        with open('data/links.txt', 'r+') as linksfile:
            while line := linksfile.readline():
                if "http" in line:
                    try:
                        line = line.strip()
                        parse_html(line, get_html(line))
                    except Exception:
                        pass

    else:
        html = get_html(args.url)
        parse_html(args.url, html, max_recursion)

    # recursion = 0
    # if (args.followlinks):
    #     os.remove('data/links.txt')