Add beginnings of ngram search capability

Author: rmgr
Date: 2024-04-05 21:36:15 +10:30
parent 343410e62f
commit 9d57f66cd7
4 changed files with 110 additions and 17 deletions

View file

@@ -1,4 +1,5 @@
 #!/usr/bin/python3
 import argparse
 import requests
 import hashlib
@@ -9,9 +10,8 @@ from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Documents, Document_Tokens, Tokens
+from models import Base, Documents, Document_Tokens
 from sqlalchemy.orm import sessionmaker
-from sqlalchemy import create_engine
 import datetime
 import yt_dlp as youtube_dl
 # TODO- Handle gemini/gopher links

View file

@@ -1,24 +1,29 @@
 #!/usr/bin/python3
 import argparse
 from sqlalchemy import create_engine, or_
 from config import DATABASE_URI
-from models import Base, Documents, Document_Tokens, Tokens
+from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams
 from sqlalchemy.orm import sessionmaker
 import uuid
 import datetime
+import re
+from multiprocessing import Pool
 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
-def build_index():
+def build_index_chunk(document_chunk):
     session = Session()
-    # Read list of 1000 documents from db
-    documents = session.query(Documents).filter(or_(Documents.last_index_date.is_(None), Documents.last_index_date<Documents.last_crawl_date)).limit(1000)
-    for document in documents:
+    for document in document_chunk:
         print(document.url)
-        content_words = document.text_content.split()
+        content = re.sub(r'[^\w\s]', '', str(document.text_content))
+        content_words = content.split()
+        build_ngrams(3, content_words, session, document.id)
+        build_ngrams(4, content_words, session, document.id)
+        build_ngrams(5, content_words, session, document.id)
         for word in content_words:
             word = word.lower()
             if len(word) > 50:
@@ -27,11 +32,52 @@ def build_index():
             if token is None:
                 token = Tokens(token=word, id=uuid.uuid4())
                 session.add(token)
-            document_token = Document_Tokens(document_id=document.id, token_id=token.id)
+            document_token = Document_Tokens(
+                document_id=document.id, token_id=token.id)
             session.add(document_token)
         document.last_index_date = datetime.datetime.now()
         session.add(document)
         session.commit()
+    session.close()
+
+
+def build_index():
+    session = Session()
+    documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
+        None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000)
+    session.close()
+    documents = list(documents_query)  # Execute the query to get the result set
+    chunk_size = 100
+    document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)]
+    with Pool() as pool:
+        pool.map(build_index_chunk, document_chunks)
+
+
+def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str):
+    i = 0
+    while i < len(corpus):
+        if i + size >= len(corpus):
+            i = len(corpus)
+        gram = ''
+        for n in range(0, size):
+            if i + n >= len(corpus):
+                break
+            gram += corpus[i+n] + ' '
+        gram = gram.rstrip().lower()
+        print(gram)
+        ngram = session.query(NGrams).filter_by(gram=gram).first()
+        if ngram is None:
+            ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
+            session.add(ngram)
+        document_ngram = Document_NGrams(
+            document_id=document_id, ngram_id=ngram.id)
+        session.add(document_ngram)
+        session.commit()
+        i += 1
+
+
 if __name__ == "__main__":
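For reference, a minimal standalone sketch of the sliding-window n-gram extraction the indexer now performs per document. It is independent of the database layer; extract_ngrams and the sample sentence are illustrative only, and it emits only full windows rather than reproducing build_ngrams' end-of-list handling.

# Illustrative sketch: sliding-window n-grams over a word list, no SQLAlchemy involved.
def extract_ngrams(words, size):
    # one lowercased gram per full window of `size` consecutive words
    return [' '.join(words[i:i + size]).lower()
            for i in range(len(words) - size + 1)]

words = "The quick brown fox jumps".split()
print(extract_ngrams(words, 3))
# ['the quick brown', 'quick brown fox', 'brown fox jumps']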

View file

@@ -1,5 +1,5 @@
 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, String, DateTime, ForeignKey, Index
+from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer
 from sqlalchemy.dialects.postgresql import UUID
 from sqlalchemy.orm import relationship, mapped_column
 import uuid
@@ -16,7 +16,10 @@ class Documents(Base):
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
     last_index_date = Column(DateTime)
-    document_tokens = relationship("Document_Tokens", back_populates="document")
+    document_tokens = relationship(
+        "Document_Tokens", back_populates="document")
+    document_ngrams = relationship(
+        "Document_NGrams", back_populates="document")


 class Document_Tokens(Base):
@@ -26,7 +29,8 @@ class Document_Tokens(Base):
     # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
     token_id = mapped_column(ForeignKey("tokens.id"))
     # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    document = relationship("Documents", back_populates="document_tokens", uselist=False)
+    document = relationship(
+        "Documents", back_populates="document_tokens", uselist=False)
     token = relationship("Tokens", back_populates="document_tokens")
@@ -35,3 +39,23 @@ class Tokens(Base):
     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
     token = Column(String, index=True)
     document_tokens = relationship("Document_Tokens", back_populates="token")
+
+
+class NGrams(Base):
+    __tablename__ = 'ngrams'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    size = Column(Integer, index=True)
+    gram = Column(String, index=True)
+    document_ngrams = relationship("Document_NGrams", back_populates="ngram")
+
+
+class Document_NGrams(Base):
+    __tablename__ = 'document_ngrams'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    ngram_id = mapped_column(ForeignKey("ngrams.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document = relationship(
+        "Documents", back_populates="document_ngrams", uselist=False)
+    ngram = relationship("NGrams", back_populates="document_ngrams")
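The two new tables form a many-to-many link between documents and grams, mirroring the existing Documents/Document_Tokens/Tokens pairing. A hedged usage sketch follows; the URL and gram text are made up, and it assumes the crawler has already stored a document with that URL.

# Hypothetical usage of the new models: attach a 3-gram to an existing document.
import uuid
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import Documents, NGrams, Document_NGrams

engine = create_engine(DATABASE_URI)
session = sessionmaker(bind=engine)()

document = session.query(Documents).filter_by(url="https://example.com/").first()
gram = NGrams(id=uuid.uuid4(), size=3, gram="quick brown fox")
session.add(gram)
session.add(Document_NGrams(document_id=document.id, ngram_id=gram.id))
session.commit()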

View file

@@ -1,7 +1,7 @@
 #!/usr/bin/python3
 from sqlalchemy import create_engine, func
 from config import DATABASE_URI
-from models import Base, Tokens, Documents, Document_Tokens
+from models import Base, Tokens, Documents, Document_Tokens, NGrams
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.sql.expression import distinct
 import time
@@ -16,7 +16,7 @@ Session = sessionmaker(bind=engine)
 def split_query(query):
-    result = {'ands': [], 'ors': [], 'words': []}
+    result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []}
     query_words = query.split()
     i = 0
     while i < len(query_words):
@@ -27,6 +27,18 @@ def split_query(query):
                 query_words[i] + ',' + query_words[i+2])
             i = i + 3
             continue
+        if query_words[i][0] == '"':
+            n = 0
+            quoted_query = ""
+            while i+n < len(query_words):
+                quoted_query += query_words[i+n] + ' '
+                if query_words[i+n][len(query_words[i+n])-1] == '"':
+                    break
+                n += 1
+            result['ngrams'].append(
+                quoted_query[1:len(quoted_query)-2].rstrip())
+            i += n
+            continue
         result['words'].append(query_words[i])
         i += 1
     return result
@@ -51,6 +63,17 @@ def search(query):
             results[result[0]] += result[1]
         else:
             results[result[0]] = result[1]
+    x = session.query(NGrams).filter(
+        NGrams.gram.in_(query_words['ngrams'])).all()
+    for y in x:
+        print(y.gram)
+        for document_ngram in y.document_ngrams:
+            if document_ngram.document.url in results.keys():
+                results[document_ngram.document.url] += 1
+            else:
+                results[document_ngram.document.url] = 1
 x = session.query(Tokens).filter(
     Tokens.token.in_(query_words['words'])).limit(1000)
 for y in x:
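A quoted phrase in the query is collected word by word until a trailing quote is seen, stored under result['ngrams'], and later matched against NGrams.gram in search(). As a rough standalone illustration of the same phrase-splitting idea (using a regex instead of the word-by-word scan above; extract_phrases and the sample query are hypothetical, not part of this commit):

# Illustrative sketch: separate quoted phrases from the remaining query words.
import re

def extract_phrases(query):
    # returns (list of quoted phrases, remaining unquoted words)
    phrases = re.findall(r'"([^"]+)"', query)
    remainder = re.sub(r'"[^"]+"', ' ', query).split()
    return phrases, remainder

print(extract_phrases('"quick brown fox" gemini'))
# (['quick brown fox'], ['gemini'])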