#!/usr/bin/python3
"""Simple crawler: stores page text/HTML in the database, queues discovered
links in data/links.txt, and pulls YouTube subtitles via yt-dlp."""
import argparse
import datetime
import hashlib
import os
from time import sleep
from urllib.parse import urlparse, urljoin
import urllib.robotparser

import requests
import yt_dlp as youtube_dl
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from config import DATABASE_URI
from models import Base, Documents, Document_Tokens, Tokens

# TODO: Handle gemini/gopher links

engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)


def get_html(url: str) -> bytes:
    response = requests.get(url)
    return response.content


def parse_youtube(video_url: str) -> None:
    # Language preference for subtitles.
    # Set this to 'en' for English subtitles, or None for auto-generated ones.
    subtitle_language = 'en'

    # Options for yt-dlp
    ydl_opts = {
        'writesubtitles': True,
        'allsubtitles': True,
        'skip_download': True,  # We only want to fetch metadata
        'subtitleslangs': [subtitle_language] if subtitle_language else None,
    }

    # Initialize the yt-dlp object
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        # Download metadata
        info_dict = ydl.extract_info(video_url, download=False)

        # Extract subtitles
        subtitles = info_dict.get('subtitles')
        subtitles_text = ""

        # Fetch the available subtitles and extract their plain text
        if subtitles:
            for subs in subtitles.values():
                for sub in subs:
                    subtitle_url = sub['url']
                    with youtube_dl.YoutubeDL({}) as sub_ydl:
                        subtitle_info = sub_ydl.extract_info(
                            subtitle_url, download=False)
                        for subtitle in subtitle_info['subtitles'][subtitle_language]:
                            if subtitle["ext"] == "srv1":
                                soup = BeautifulSoup(
                                    get_html(subtitle["url"]), 'html.parser')
                                subtitles_text = soup.get_text()

        # Store the video as a new document, or update its crawl date
        s = Session()
        existing_website = s.query(
            Documents).filter_by(url=video_url).first()
        if existing_website is None:
            website = Documents(
                url=video_url,
                text_content=subtitles_text,
                html_content=None,  # soup.prettify(),
                first_crawl_date=datetime.datetime.now(),
                last_crawl_date=datetime.datetime.now(),
                last_index_date=None
            )
            s.add(website)
        else:
            existing_website.last_crawl_date = datetime.datetime.now()
            s.add(existing_website)
        s.commit()
        s.close()


# Note: traversed_links and robots are deliberately mutable defaults so they
# persist across calls, acting as a shared visited-set and robots.txt cache.
def parse_html(url: str, html: bytes, recursion: int = 0,
               traversed_links=[], robots={}) -> None:
    if "youtube.com" in url:
        parse_youtube(url)
        return

    rp = urllib.robotparser.RobotFileParser()
    print(url)
    print(recursion)
    urlparts = urlparse(url)
    baseurl = urlparts.scheme + "://" + urlparts.netloc
    if baseurl not in robots:
        rp.set_url(baseurl + "/robots.txt")
        rp.read()
        robots[baseurl] = rp
    else:
        rp = robots[baseurl]
    if not rp.can_fetch("*", url):
        print("robots.txt prevents crawling url: " + url)
        return

    soup = BeautifulSoup(html, 'html.parser')
    url_hash = hashlib.sha256(url.encode('ascii'))  # currently unused

    # Store the page as a new document, or update its crawl date
    s = Session()
    existing_website = s.query(Documents).filter_by(url=url).first()
    if existing_website is None:
        website = Documents(
            url=url,
            text_content=soup.get_text(),
            html_content=soup.prettify(),
            first_crawl_date=datetime.datetime.now(),
            last_crawl_date=datetime.datetime.now(),
            last_index_date=None
        )
        s.add(website)
    else:
        existing_website.last_crawl_date = datetime.datetime.now()
        s.add(existing_website)
    s.commit()
    s.close()

    # Make sure the links file exists before it is opened in 'r+' mode below
    open('data/links.txt', 'a').close()

    links = soup.find_all("a", href=True)
    for link in links:
        found = False
        link = link["href"]
        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
            continue
        if (".webp" in link or ".jpeg" in link or ".png" in link
                or ".gif" in link or ".pdf" in link or ".jpg" in link):
            continue
        if "http" not in link:
            link = urljoin(url, link)
        if recursion > 0 and link not in traversed_links:
            try:
                traversed_links.append(link)
                link_html = get_html(link)
                r = recursion - 1
                sleep(0.5)
                parse_html(link, link_html, r, traversed_links)
            except Exception:
                pass
        elif link not in traversed_links:
            # Out of recursion budget: queue the link for a later crawl
            with open('data/links.txt', 'r+') as linksfile:
                while line := linksfile.readline():
                    if line.strip() == link.strip():
                        found = True
                if not found:
                    linksfile.write(f'{link}\n')


if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)

    # check inputs
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL of the webpage to be crawled")
    parser.add_argument('-f', "--followlinks", action="store_true")  # currently unused
    max_recursion = 4
    args = parser.parse_args()

    if args.url == "links":
        # Re-crawl every URL queued in data/links.txt
        with open('data/links.txt', 'r+') as linksfile:
            while line := linksfile.readline():
                line = line.strip()
                if "http" in line:
                    try:
                        parse_html(line, get_html(line))
                    except Exception:
                        pass
    else:
        html = get_html(args.url)
        parse_html(args.url, html, max_recursion)
        # recursion = 0
        # if (args.followlinks):
        #     os.remove('data/links.txt')
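
# Example invocations (a sketch; the script's filename is not given in the
# source, so "crawl.py" below is an assumed name):
#
#   ./crawl.py https://example.com   # crawl the page and recurse into its links (max_recursion = 4)
#   ./crawl.py links                 # re-crawl every URL queued in data/links.txt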