Tidy up index.py

This commit is contained in:
rmgr 2024-03-07 21:12:19 +10:30
parent 20d198e559
commit d4bb3fb8dc

View file

@ -1,18 +1,10 @@
#!/usr/bin/python3
import argparse
import requests
import hashlib
from urllib.parse import urlparse, urljoin
import urllib.robotparser
import os
from time import sleep
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Documents, Document_Tokens, Tokens
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import datetime
import uuid
engine = create_engine(DATABASE_URI)
Base.metadata.create_all(engine)
@ -30,17 +22,12 @@ def build_index():
word = word.lower()
token = session.query(Tokens).filter_by(token=word).first()
if token is None:
token = Tokens(token=word)
token = Tokens(token=word, id=uuid.uuid4())
session.add(token)
document_token = Document_Tokens(document_id=document.id, token_id=token.id)
session.add(document_token)
session.commit()
# For each document, break it into words
# Check whether each word already exists in the database
# Create the word if it does not exist
# Link the word to the document
if __name__ == "__main__":
parser = argparse.ArgumentParser()