from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
import json

# TODO: investigate n-grams for "multi word" matching

# very small stop-word list; these words are never indexed
ignored_words = ['a', 'the', 'is']


def remove_punctuation(input_string):
    # strip common punctuation from a word before it goes into the index
    punc = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
    for p in punc:
        input_string = input_string.replace(p, '')
    return input_string


def build_index():
    # Get a list of all content files, split each one on whitespace,
    # and add every word to the index with a per-URL occurrence count.
    with open("data/index.json", "w") as index:
        dictionary = {}
        pathlist = Path('data/content').rglob('*.txt')
        for path in pathlist:
            with open(str(path)) as content_file:
                # the first line of a content file is the page URL,
                # the rest is the page text
                url = content_file.readline().strip()
                content = content_file.read()
                for word in content.split():
                    word = remove_punctuation(word.lower())
                    if word in ignored_words:
                        continue
                    if word not in dictionary:
                        dictionary[word] = []
                    matching_urls = [entry for entry in dictionary[word]
                                     if entry["url"] == url]
                    if not matching_urls:
                        dictionary[word].append(
                            {"url": url, "count": 1, "filename": str(path)})
                    else:
                        matching_urls[0]["count"] += 1
        # sort each word's entries so the most-referenced pages come first
        for entries in dictionary.values():
            entries.sort(reverse=True, key=lambda entry: entry["count"])
        index.write(json.dumps(dictionary))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', "--rebuild", action="store_true",
                        help="Blow away the index and rebuild")
    args = parser.parse_args()
    if args.rebuild:
        build_index()
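

# A minimal sketch of how the JSON index written by build_index() could be
# queried, assuming the index has already been built. The function name
# `lookup` and its default index_path argument are illustrative assumptions,
# not part of the original script.
def lookup(word, index_path="data/index.json"):
    # normalise the query the same way words were normalised at index time
    word = remove_punctuation(word.lower())
    with open(index_path) as index_file:
        dictionary = json.load(index_file)
    # entries for each word are stored most-referenced page first
    return dictionary.get(word, [])

# Example usage:
#   for entry in lookup("python"):
#       print(entry["count"], entry["url"])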