search-engine/src/crawl.py

#!/usr/bin/python3
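"""Web crawler for the search engine.

Fetches pages, stores their text and HTML in the database configured by
DATABASE_URI, obeys robots.txt, and queues newly discovered links in
data/links.txt. YouTube URLs are handled specially: their subtitle
transcript is stored as the document text instead of the page HTML.

Example usage:
    ./crawl.py https://example.com   # crawl one URL and follow its links
    ./crawl.py links                 # re-crawl every URL queued in data/links.txt
"""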
import argparse
import datetime
import hashlib
import os
import urllib.robotparser
from time import sleep
from urllib.parse import urlparse, urljoin

import requests
import yt_dlp as youtube_dl
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from config import DATABASE_URI
from models import Base, Documents, Document_Tokens, Tokens

# TODO: Handle gemini/gopher links

# Create the tables defined in models.py (if missing) and a session factory
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)


def get_html(url: str) -> bytes:
    # Fetch the raw page body; the timeout keeps the crawler from hanging on dead hosts
    response = requests.get(url, timeout=30)
    return response.content


def parse_youtube(video_url: str) -> None:
    """Store a YouTube video as a document whose text content is its subtitle transcript."""
    # Language preference for subtitles: 'en' for English, or None for auto-generated
    subtitle_language = 'en'

    # Options for yt_dlp: we only want metadata, not the video itself
    ydl_opts = {
        'writesubtitles': True,
        'allsubtitles': True,
        'skip_download': True,  # We only want to fetch metadata
        'subtitleslangs': [subtitle_language] if subtitle_language else None,
    }

    # Fetch the video metadata
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(video_url, download=False)

    # Walk the available subtitle tracks and extract the srv1 transcript text
    subtitles = info_dict.get('subtitles')
    subtitles_text = ""
    if subtitles:
        for subs in subtitles.values():
            for sub in subs:
                subtitle_url = sub['url']
                with youtube_dl.YoutubeDL({}) as ydl:
                    subtitle_info = ydl.extract_info(
                        subtitle_url, download=False)
                    for subtitle in subtitle_info['subtitles'][subtitle_language]:
                        if subtitle["ext"] == "srv1":
                            soup = BeautifulSoup(
                                get_html(subtitle["url"]), 'html.parser')
                            subtitles_text = soup.get_text()

    # Insert the video as a new document, or update its crawl date if already known
    s = Session()
    existing_website = s.query(
        Documents).filter_by(url=video_url).first()
    if existing_website is None:
        website = Documents(
            url=video_url,
            text_content=subtitles_text,
            html_content=None,  # soup.prettify(),
            first_crawl_date=datetime.datetime.now(),
            last_crawl_date=datetime.datetime.now(),
            last_index_date=None
        )
        s.add(website)
    else:
        existing_website.last_crawl_date = datetime.datetime.now()
        s.add(existing_website)
    s.commit()
    s.close()


def parse_html(url: str, html: str, recursion: int = 0, traversed_links=[], robots={}) -> None:
    """Store a crawled page and either recurse into or queue the links it contains.

    Note: traversed_links and robots are mutable defaults shared across calls,
    serving as the visited-link set and a per-host robots.txt cache.
    """
    if "youtube.com" in url:
        parse_youtube(url)
        return

    rp = urllib.robotparser.RobotFileParser()
    print(url)
    print(recursion)
    urlparts = urlparse(url)
    baseurl = urlparts.scheme + "://" + urlparts.netloc
    # Fetch and cache one robots.txt parser per host
    if baseurl not in robots:
        rp.set_url(baseurl + "/robots.txt")
        rp.read()
        robots[baseurl] = rp
    else:
        rp = robots[baseurl]
    if not rp.can_fetch("*", url):
        print("Robots prevents crawling url: " + url)
        return

    soup = BeautifulSoup(html, 'html.parser')
    hash = hashlib.sha256()
    hash.update(url.encode('ascii'))  # SHA-256 of the URL (currently unused)

    # Insert the page as a new document, or update its crawl date if already known
    s = Session()
    existing_website = s.query(Documents).filter_by(url=url).first()
    if existing_website is None:
        website = Documents(
            url=url,
            text_content=soup.get_text(),
            html_content=soup.prettify(),
            first_crawl_date=datetime.datetime.now(),
            last_crawl_date=datetime.datetime.now(),
            last_index_date=None
        )
        s.add(website)
    else:
        existing_website.last_crawl_date = datetime.datetime.now()
        s.add(existing_website)
    s.commit()
    s.close()

    # Make sure data/links.txt exists before it is read below
    x = open('data/links.txt', 'a')
    x.close()

    links = soup.find_all("a", href=True)
    for link in links:
        found = False
        link = link["href"]
        # Skip in-page anchors, localhost links, and links to binary assets
        if (len(link) > 0 and link[0] == "#") or "localhost" in link:
            continue
        if any(ext in link for ext in (".webp", ".jpeg", ".png", ".gif", ".pdf", ".jpg")):
            continue
        # Resolve relative links against the current page
        if "http" not in link:
            link = urljoin(url, link)
        if recursion > 0 and link not in traversed_links:
            # Crawl the link now, one recursion level deeper
            try:
                traversed_links.append(link)
                link_html = get_html(link)
                r = recursion - 1
                sleep(0.5)  # be polite between requests
                parse_html(link, link_html, r, traversed_links, robots)
            except Exception:
                pass
        elif link not in traversed_links:
            # Out of recursion budget: queue the link in data/links.txt if not already listed
            with open('data/links.txt', 'r+') as linksfile:
                while line := linksfile.readline():
                    if line.strip() == link.strip():
                        found = True
                if not found:
                    linksfile.write(f'{link}\n')


if __name__ == "__main__":
    os.makedirs("data/content", exist_ok=True)

    # check inputs
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "url", help="URL of the webpage to be crawled, or 'links' to re-crawl the queued links")
    parser.add_argument('-f', "--followlinks", action="store_true")
    max_recursion = 4
    args = parser.parse_args()

    if args.url == "links":
        # Re-crawl every URL previously queued in data/links.txt
        with open('data/links.txt', 'r+') as linksfile:
            while line := linksfile.readline():
                if "http" in line:
                    try:
                        line = line.strip()  # drop the trailing newline before using it as a URL
                        parse_html(line, get_html(line))
                    except Exception:
                        pass
    else:
        html = get_html(args.url)
        parse_html(args.url, html, max_recursion)
    # recursion = 0
    # if (args.followlinks):
    #     os.remove('data/links.txt')