Tidy up index.py
This commit is contained in:
parent
20d198e559
commit
d4bb3fb8dc
1 changed file with 2 additions and 15 deletions
17
src/index.py
17
src/index.py
|
|
@@ -1,18 +1,10 @@
|
||||||
#!/usr/bin/python3
|
#!/usr/bin/python3
|
||||||
import argparse
|
import argparse
|
||||||
import requests
|
|
||||||
import hashlib
|
|
||||||
from urllib.parse import urlparse, urljoin
|
|
||||||
import urllib.robotparser
|
|
||||||
import os
|
|
||||||
from time import sleep
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from sqlalchemy import create_engine
|
from sqlalchemy import create_engine
|
||||||
from config import DATABASE_URI
|
from config import DATABASE_URI
|
||||||
from models import Base, Documents, Document_Tokens, Tokens
|
from models import Base, Documents, Document_Tokens, Tokens
|
||||||
from sqlalchemy.orm import sessionmaker
|
from sqlalchemy.orm import sessionmaker
|
||||||
from sqlalchemy import create_engine
|
import uuid
|
||||||
import datetime
|
|
||||||
|
|
||||||
engine = create_engine(DATABASE_URI)
|
engine = create_engine(DATABASE_URI)
|
||||||
Base.metadata.create_all(engine)
|
Base.metadata.create_all(engine)
|
||||||
|
|
@@ -30,17 +22,12 @@ def build_index():
|
||||||
word = word.lower()
|
word = word.lower()
|
||||||
token = session.query(Tokens).filter_by(token=word).first()
|
token = session.query(Tokens).filter_by(token=word).first()
|
||||||
if token is None:
|
if token is None:
|
||||||
token = Tokens(token=word)
|
token = Tokens(token=word, id=uuid.uuid4())
|
||||||
session.add(token)
|
session.add(token)
|
||||||
document_token = Document_Tokens(document_id=document.id, token_id=token.id)
|
document_token = Document_Tokens(document_id=document.id, token_id=token.id)
|
||||||
session.add(document_token)
|
session.add(document_token)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
# Foreach document, break into words
|
|
||||||
# Check if word exists in database
|
|
||||||
# Create if not exist
|
|
||||||
# Link to document
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue