# search-engine/src/index.py

from sqlalchemy import create_engine
from config import DATABASE_URI
from models import Base, Website
from pathlib import Path
import argparse
import os
import json
# TODO: investigate n-grams for multi-word matching.
# Words that are deliberately left out of the index; build_index() drops
# any token that, once lowercased and stripped of punctuation, is listed here.
ignored_words = [
    'a',
    'the',
    'is',
]

# Deletion table for remove_punctuation(). The literal is a raw string so the
# backslash is an ordinary character rather than an (invalid) "\," escape.
# Built once at import time so every call is a single str.translate pass.
_PUNCTUATION_TABLE = str.maketrans('', '', r'''!()-[]{};:'"\,<>./?@#$%^&*_~''')


def remove_punctuation(input_string):
    """Return *input_string* with every punctuation character deleted.

    The characters removed are ``!()-[]{};:'"\\,<>./?@#$%^&*_~``. Anything
    else -- letters, digits, whitespace, and symbols such as ``+``, ``=``,
    ``|`` or a backtick -- is left untouched.

    Args:
        input_string: The text to clean.

    Returns:
        A new string with the punctuation characters removed.
    """
    return input_string.translate(_PUNCTUATION_TABLE)


def build_index():
    """Rebuild ``data/index.json`` from the text files under ``data/content``.

    Every ``*.txt`` file below ``data/content`` (searched recursively) is read:
    its first line is taken as the page URL and the remainder as the page
    content. The content is split on whitespace; each token is lowercased and
    stripped of punctuation, and tokens that are empty afterwards or listed in
    ``ignored_words`` are skipped.

    The index written to disk is a JSON object mapping each word to a list of
    entries of the form ``{"url": ..., "count": ..., "filename": ...}``, with
    one entry per distinct URL, ordered by ``count`` descending. Entries with
    equal counts keep the order in which their URL was first encountered.
    When several files share a URL their counts are merged into one entry
    that carries the filename of the first such file.

    The index file is only opened once the whole index has been built in
    memory, so a failure while reading content does not truncate an existing
    index.
    """
    # word -> {url -> entry}. The inner dict gives O(1) lookup of "have we
    # already seen this word on this URL?" instead of scanning the list.
    postings = {}

    # rglob() yields paths in filesystem order; sorting makes the tie order
    # of equal-count entries reproducible between runs.
    for path in sorted(Path('data/content').rglob('*.txt')):
        filename = str(path)
        # NOTE(review): files are read with the platform default encoding,
        # as before -- confirm it matches whatever wrote data/content.
        with open(filename) as content_file:
            url = content_file.readline().strip()
            content = content_file.read()

        for raw_word in content.split():
            word = remove_punctuation(raw_word.lower())
            # A token made only of punctuation (e.g. "--") collapses to "",
            # which is not a searchable word.
            if not word or word in ignored_words:
                continue

            entries_by_url = postings.setdefault(word, {})
            entry = entries_by_url.get(url)
            if entry is None:
                entries_by_url[url] = {"url": url, "count": 1, "filename": filename}
            else:
                entry["count"] += 1

    # Sort each word's entries once at the end, not after every increment.
    # sorted() is stable, so ties keep first-seen order even with reverse=True.
    dictionary = {
        word: sorted(
            entries_by_url.values(),
            key=lambda entry: entry["count"],
            reverse=True,
        )
        for word, entries_by_url in postings.items()
    }

    with open("data/index.json", "w") as index:
        index.write(json.dumps(dictionary))

if __name__ == "__main__":
    # Command-line entry point: the index is regenerated only when the
    # -r/--rebuild flag is given; without it the script does nothing.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-r',
        "--rebuild",
        action="store_true",
        help="Blow away the index and rebuild",
    )
    options = cli.parse_args()
    if options.rebuild:
        build_index()