Rework ngram generation. Greatly improve performance of indexer. Commit horrendous sql sins

This commit is contained in:
rmgr 2024-05-04 21:10:46 +09:30
parent 9f0e7e6b29
commit bdb4064acc
5 changed files with 155 additions and 57 deletions

View file

@ -32,6 +32,12 @@ class Document_Tokens(Base):
document = relationship(
"Documents", back_populates="document_tokens", uselist=False)
token = relationship("Tokens", back_populates="document_tokens")
__table_args__ = (
Index('idx_document_tokens_document_id_token_id', 'document_id',
'token_id', unique=True, postgresql_using='hash'),
Index('idx_document_tokens_clustered', 'document_id',
'token_id', postgresql_using='hash'),
)
class Tokens(Base):
@ -53,9 +59,14 @@ class Document_NGrams(Base):
__tablename__ = 'document_ngrams'
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
document_id = mapped_column(ForeignKey("documents.id"))
# Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
ngram_id = mapped_column(ForeignKey("ngrams.id"))
# Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
document = relationship(
"Documents", back_populates="document_ngrams", uselist=False)
ngram = relationship("NGrams", back_populates="document_ngrams")
__table_args__ = (
Index('idx_document_ngrams_document_id_ngram_id', 'document_id',
'ngram_id', unique=True, postgresql_using='hash'),
Index('idx_document_ngrams_clustered', 'document_id',
'ngram_id', postgresql_using='hash'),
)