"""Build an inverted index (word -> page entries) from crawled content files."""
# Standard library imports, grouped ahead of third-party and local modules.
import argparse
import json
import os
from pathlib import Path

# Third-party / project imports.  The sqlalchemy engine and the models are
# kept for callers elsewhere in the project even though this chunk of the
# file does not reference them directly.
from sqlalchemy import create_engine

from config import DATABASE_URI
from models import Base, Website

# TODO: investigate ngrams for "multi word" matching

# Stop words too common to be worth indexing; a set gives O(1) membership
# tests (the original list made every lookup O(len(list))).
ignored_words = {'a', 'the', 'is'}
# Characters stripped from tokens before indexing.  Same set as before with
# the duplicated '?' and '!' removed; note this covers ASCII punctuation
# only, not unicode punctuation.
_PUNCTUATION = '!()-[]{};:\'"\\,<>./?@#$%^&*_~'
# Precomputed deletion table: built once at import time, reused per call.
_PUNCT_TABLE = str.maketrans('', '', _PUNCTUATION)


def remove_punctuation(input_string):
    """Return *input_string* with all punctuation characters removed.

    Uses a single C-level ``str.translate`` pass instead of one
    ``.replace()`` call (and full string copy) per punctuation character.
    """
    return input_string.translate(_PUNCT_TABLE)
def build_index():
    """Build an inverted index from every crawled page under data/content.

    Each ``data/content/**/*.txt`` file holds the page URL on its first
    line and the page text on the remaining lines.  The index maps each
    lowercased, punctuation-stripped word to a list of
    ``{"url", "count", "filename"}`` entries sorted by descending count,
    and is written as JSON to ``data/index.json``.
    """
    # word -> {url -> entry}; the inner dict replaces the original O(n)
    # linear scan (a filter over the whole entry list) with an O(1) lookup
    # per word occurrence.
    index_by_word = {}

    for path in Path('data/content').rglob('*.txt'):
        with open(path) as content_file:
            url = content_file.readline().strip()
            content = content_file.read()

        for raw_word in content.split():
            word = remove_punctuation(raw_word.lower())
            # Skip stop words AND tokens that were pure punctuation — the
            # original indexed those under the empty-string key.
            if not word or word in ignored_words:
                continue
            entries = index_by_word.setdefault(word, {})
            entry = entries.get(url)
            if entry is None:
                # First sighting of this url for this word; keep the file
                # it came from, as before.
                entries[url] = {"url": url, "count": 1, "filename": str(path)}
            else:
                entry["count"] += 1

    # Sort each word's entries once at the end (descending by count)
    # instead of re-sorting the list after every single occurrence.
    dictionary = {
        word: sorted(entries.values(), reverse=True,
                     key=lambda entry: entry["count"])
        for word, entries in index_by_word.items()
    }

    # Open the output only when the index is ready, rather than holding the
    # handle open for the entire build.
    with open("data/index.json", "w") as index:
        index.write(json.dumps(dictionary))
if __name__ == "__main__":
    # Command-line entry point: only rebuild the index when explicitly asked.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-r', "--rebuild",
        action="store_true",
        help="Blow away the index and rebuild",
    )
    options = arg_parser.parse_args()
    if options.rebuild:
        build_index()