Tidy up index.py
This commit is contained in:
parent
20d198e559
commit
d4bb3fb8dc
1 changed file with 2 additions and 15 deletions
17
src/index.py
17
src/index.py
|
|
@@ -1,18 +1,10 @@
|
|||
#!/usr/bin/python3
|
||||
import argparse
|
||||
import requests
|
||||
import hashlib
|
||||
from urllib.parse import urlparse, urljoin
|
||||
import urllib.robotparser
|
||||
import os
|
||||
from time import sleep
|
||||
from bs4 import BeautifulSoup
|
||||
from sqlalchemy import create_engine
|
||||
from config import DATABASE_URI
|
||||
from models import Base, Documents, Document_Tokens, Tokens
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from sqlalchemy import create_engine
|
||||
import datetime
|
||||
import uuid
|
||||
|
||||
engine = create_engine(DATABASE_URI)
|
||||
Base.metadata.create_all(engine)
|
||||
|
|
@@ -30,16 +22,11 @@ def build_index():
|
|||
word = word.lower()
|
||||
token = session.query(Tokens).filter_by(token=word).first()
|
||||
if token is None:
|
||||
token = Tokens(token=word)
|
||||
token = Tokens(token=word, id=uuid.uuid4())
|
||||
session.add(token)
|
||||
document_token = Document_Tokens(document_id=document.id, token_id=token.id)
|
||||
session.add(document_token)
|
||||
session.commit()
|
||||
|
||||
# Foreach document, break into words
|
||||
# Check if word exists in database
|
||||
# Create if not exist
|
||||
# Link to document
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue