diff --git a/src/crawl.py b/src/crawl.py
index bf814e2..c62f4a9 100755
--- a/src/crawl.py
+++ b/src/crawl.py
@@ -1,4 +1,5 @@
 #!/usr/bin/python3
+
 import argparse
 import requests
 import hashlib
@@ -9,9 +10,8 @@ from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Documents, Document_Tokens, Tokens
+from models import Base, Documents, Document_Tokens
 from sqlalchemy.orm import sessionmaker
-from sqlalchemy import create_engine
 import datetime
 import yt_dlp as youtube_dl
 # TODO- Handle gemini/gopher links
diff --git a/src/index.py b/src/index.py
index d7259ce..e73c93d 100644
--- a/src/index.py
+++ b/src/index.py
@@ -1,24 +1,29 @@
 #!/usr/bin/python3
+
 import argparse
 from sqlalchemy import create_engine, or_
 from config import DATABASE_URI
-from models import Base, Documents, Document_Tokens, Tokens
+from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams
 from sqlalchemy.orm import sessionmaker
 import uuid
 import datetime
+import re
+from multiprocessing import Pool

 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)


-def build_index():
+def build_index_chunk(document_chunk):
     session = Session()
-    # Read list of 1000 documents from db
-    documents = session.query(Documents).filter(or_(Documents.last_index_date.is_(None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000)
-    for document in documents:
+    for document in document_chunk:
         words = document.plain_text.split()
         for word in words:
             if len(word) > 50:
@@ -27,11 +32,52 @@ def build_index():
             if token is None:
                 token = Tokens(token=word, id=uuid.uuid4())
                 session.add(token)
-            document_token = Document_Tokens(document_id=document.id, token_id=token.id)
+            document_token = Document_Tokens(
+                document_id=document.id, token_id=token.id)
             session.add(document_token)
         document.last_index_date = datetime.datetime.now()
         session.add(document)
         session.commit()
+    session.close()
+
+
+def build_index():
+    session = Session()
+    documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
+        None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000)
+    session.close()
+
+    documents = list(documents_query)  # Execute the query to get the result set
+
+    chunk_size = 100
+    document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)]
+
+    with Pool() as pool:
+        pool.map(build_index_chunk, document_chunks)
+
+
+def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str):
+    i = 0
+    while i < len(corpus):
+        if i + size >= len(corpus):
+            i = len(corpus)
+        gram = ''
+        for n in range(0, size):
+            if i + n >= len(corpus):
+                break
+            gram += corpus[i+n] + ' '
+        gram = gram.rstrip().lower()
+        print(gram)
+
+        ngram = session.query(NGrams).filter_by(gram=gram).first()
+        if ngram is None:
+            ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
+            session.add(ngram)
+        document_ngram = Document_NGrams(
+            document_id=document_id, ngram_id=ngram.id)
+        session.add(document_ngram)
+        session.commit()
+        i += 1


 if __name__ == "__main__":
diff --git a/src/models.py b/src/models.py
index de7e7a9..c73ea7d 100644
--- a/src/models.py
+++ b/src/models.py
@@ -1,5 +1,5 @@
 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, String, DateTime, ForeignKey, Index
+from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer
 from sqlalchemy.dialects.postgresql import UUID
 from sqlalchemy.orm import relationship, mapped_column
 import uuid
@@ -16,7 +16,10 @@ class Documents(Base):
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
     last_index_date = Column(DateTime)
-    document_tokens = relationship("Document_Tokens", back_populates="document")
+    document_tokens = relationship(
+        "Document_Tokens", back_populates="document")
+    document_ngrams = relationship(
+        "Document_NGrams", back_populates="document")


 class Document_Tokens(Base):
@@ -25,8 +28,9 @@ class Document_Tokens(Base):
     document_id = mapped_column(ForeignKey("documents.id"))
     # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
     token_id = mapped_column(ForeignKey("tokens.id"))
-    #Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    document = relationship("Documents", back_populates="document_tokens", uselist=False)
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document = relationship(
+        "Documents", back_populates="document_tokens", uselist=False)
     token = relationship("Tokens", back_populates="document_tokens")


@@ -35,3 +39,23 @@ class Tokens(Base):
     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
     token = Column(String, index=True)
     document_tokens = relationship("Document_Tokens", back_populates="token")
+
+
+class NGrams(Base):
+    __tablename__ = 'ngrams'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    size = Column(Integer, index=True)
+    gram = Column(String, index=True)
+    document_ngrams = relationship("Document_NGrams", back_populates="ngram")
+
+
+class Document_NGrams(Base):
+    __tablename__ = 'document_ngrams'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    ngram_id = mapped_column(ForeignKey("ngrams.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document = relationship(
+        "Documents", back_populates="document_ngrams", uselist=False)
+    ngram = relationship("NGrams", back_populates="document_ngrams")
diff --git a/src/search.py b/src/search.py
index c5c233e..f77927b 100755
--- a/src/search.py
+++ b/src/search.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python3
 from sqlalchemy import create_engine, func
 from config import DATABASE_URI
-from models import Base, Tokens, Documents, Document_Tokens
+from models import Base, Tokens, Documents, Document_Tokens, NGrams
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.sql.expression import distinct
 import time
@@ -16,7 +16,7 @@ Session = sessionmaker(bind=engine)


 def split_query(query):
-    result = {'ands': [], 'ors': [], 'words': []}
+    result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []}
     query_words = query.split()
     i = 0
     while i < len(query_words):
@@ -27,19 +27,31 @@
                 query_words[i] + ',' + query_words[i+2])
             i = i + 3
             continue
+        if query_words[i][0] == '"':
+            n = 0
+            quoted_query = ""
+            while i+n < len(query_words):
+                quoted_query += query_words[i+n] + ' '
+                if query_words[i+n][len(query_words[i+n])-1] == '"':
+                    break
+                n += 1
+            result['ngrams'].append(
+                quoted_query[1:len(quoted_query)-2].rstrip())
+            i += n
+            continue
         result['words'].append(query_words[i])
         i += 1
     return result


-@app.route("/search/")
+@ app.route("/search/")
 def search(query):
     start_time = time.time_ns()
     session = Session()
     results = {}
     query_words = split_query(unquote(query))
     for a in query_words['ands']:
-        query = session.query(Documents.url, func.count(1)).\
+        query = session.query(Documents.url, func.count(1)). \
             join(Document_Tokens, Documents.id == Document_Tokens.document_id).\
             join(Tokens, Document_Tokens.token_id == Tokens.id).\
             filter(Tokens.token.in_([a.split(',')[0], a.split(',')[1]])).\
@@ -51,6 +63,17 @@
                 results[result[0]] += result[1]
             else:
                 results[result[0]] = result[1]
+    x = session.query(NGrams).filter(
+        NGrams.gram.in_(query_words['ngrams'])).all()
+
+    for y in x:
+        print(y.gram)
+        for document_ngram in y.document_ngrams:
+            if document_ngram.document.url in results.keys():
+                results[document_ngram.document.url] += 1
+            else:
+                results[document_ngram.document.url] = 1
+
     x = session.query(Tokens).filter(
         Tokens.token.in_(query_words['words'])).limit(1000)
     for y in x: