Begin adding PostgreSQL support instead of filesystem flat files
This commit is contained in:
parent
b43343e0ee
commit
24ee04c0ff
6 changed files with 80 additions and 13 deletions
14
src/index.py
Executable file → Normal file
14
src/index.py
Executable file → Normal file
|
|
@ -1,5 +1,6 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
from config import DATABASE_URI
|
||||
from models import Base, Website
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
import os
|
||||
|
|
@ -7,6 +8,13 @@ import json
|
|||
# investigate ngrams for "multi word" matching
|
||||
ignored_words = ['a', 'the','is']
|
||||
|
||||
# Characters stripped from words before they are indexed. The backslash is
# written as an explicit "\\" escape: the previous literal relied on the
# invalid escape sequence "\,", which Python only tolerates with a warning.
_PUNCTUATION = '!()-[]{};:\'"\\,<>./?@#$%^&*_~'

# Deletion table built once at import time so each call is a single pass.
_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION)


def remove_punctuation(input_string):
    """Return *input_string* with every punctuation character removed.

    The characters removed are exactly those listed in ``_PUNCTUATION``;
    anything not listed there (for example ``+`` or ``=``) is left untouched.

    Args:
        input_string: The text to clean.

    Returns:
        A new string with all listed punctuation characters deleted.
    """
    return input_string.translate(_PUNCTUATION_TABLE)
|
||||
|
||||
|
||||
def build_index():
|
||||
with open(f"data/index.json", "w") as index:
|
||||
# get a list of all content files
|
||||
|
|
@ -20,6 +28,7 @@ def build_index():
|
|||
content_words = content.split()
|
||||
for word in content_words:
|
||||
word = word.lower()
|
||||
word = remove_punctuation(word)
|
||||
if not word in ignored_words:
|
||||
if not word in dictionary:
|
||||
dictionary[word] = []
|
||||
|
|
@ -42,3 +51,4 @@ if __name__ == "__main__":
|
|||
args = parser.parse_args()
|
||||
if args.rebuild:
|
||||
build_index()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue