Initial commit

This commit is contained in:
rmgr 2023-11-28 20:51:54 +10:30
commit f36ab2fbfb
5 changed files with 144 additions and 0 deletions

36
src/index.py Executable file
View file

@@ -0,0 +1,36 @@
#!/usr/bin/python3
from pathlib import Path
import argparse
import os
import json
# Words that build_index leaves out of the index.
ignored_words = [
    "a",
    "the",
    "is",
]
def build_index(content_dir="data/content", index_path="data/index.json", ignored=None):
    """Rebuild the inverted word index from the stored content files.

    Every ``*.txt`` file found recursively under *content_dir* is read: its
    first line is taken as the URL and the rest of the file is split on
    whitespace into words. Each lower-cased word that is not ignored is
    mapped to the list of distinct URLs whose content contains it, and the
    resulting mapping is written to *index_path* as one JSON object.

    Args:
        content_dir: Directory searched recursively for ``*.txt`` files.
        index_path: File the JSON index is written to (overwritten).
        ignored: Iterable of lower-case words to leave out of the index.
            Defaults to the module-level ``ignored_words``.

    Raises:
        OSError: If a content file cannot be read or the index file cannot
            be written.
    """
    if ignored is None:
        # Looked up at call time so the module-level list stays the default.
        ignored = ignored_words
    # Assemble the whole index before opening the output file: opening it
    # first would truncate the existing index even if reading content fails.
    index = _collect_index(Path(content_dir), frozenset(ignored))
    with open(index_path, "w") as index_file:
        index_file.write(json.dumps(index))


def _collect_index(content_dir, ignored):
    """Return a ``{word: [url, ...]}`` mapping for the files under *content_dir*."""
    index = {}
    # Sorted so the key/URL order in the index is reproducible instead of
    # depending on directory listing order.
    for path in sorted(content_dir.rglob("*.txt")):
        if not path.is_file():
            # A directory whose name ends in ".txt" also matches the glob.
            continue
        # NOTE(review): encoding is left at the platform default, as before;
        # pin it once the encoding used by the writer of these files is known.
        with open(path) as content_file:
            url = content_file.readline().strip()
            content = content_file.read()
        # dict.fromkeys de-duplicates the words of this file while keeping
        # first-occurrence order, so each word is handled once per file.
        for word in dict.fromkeys(raw.lower() for raw in content.split()):
            if word in ignored:
                continue
            urls = index.setdefault(word, [])
            # Still needed: two different files may carry the same URL.
            if url not in urls:
                urls.append(url)
    return index
if __name__ == "__main__":
    # Command-line entry point: the index is rebuilt only when -r/--rebuild
    # is given; with no flag the script parses its arguments and exits.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-r",
        "--rebuild",
        action="store_true",
        help="Blow away the index and rebuild",
    )
    options = cli.parse_args()
    if options.rebuild:
        build_index()