Add beginnings of ngram search capability
parent 343410e62f
commit 9d57f66cd7
4 changed files with 110 additions and 17 deletions
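In outline, the indexer now stores overlapping 3-, 4- and 5-word grams for each crawled document, and the search side matches quoted phrases against that table. A minimal, database-free sketch of the idea (the helper name and sample text are illustrative, not part of the commit):

def word_ngrams(words, size):
    # Every contiguous run of `size` words, lowercased and space-joined.
    return {' '.join(words[i:i + size]).lower()
            for i in range(len(words) - size + 1)}

doc = "The quick brown fox jumps over the lazy dog".split()
index = word_ngrams(doc, 3) | word_ngrams(doc, 4) | word_ngrams(doc, 5)

phrase = "quick brown fox"        # what a quoted query term would contribute
print(phrase in index)            # True

Since only 3-, 4- and 5-grams are written at index time, a quoted phrase of more than five words cannot match under this scheme.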
@@ -1,4 +1,5 @@
 #!/usr/bin/python3
+
 import argparse
 import requests
 import hashlib
@@ -9,9 +10,8 @@ from time import sleep
 from bs4 import BeautifulSoup
 from sqlalchemy import create_engine
 from config import DATABASE_URI
-from models import Base, Documents, Document_Tokens, Tokens
+from models import Base, Documents, Document_Tokens
 from sqlalchemy.orm import sessionmaker
-from sqlalchemy import create_engine
 import datetime
 import yt_dlp as youtube_dl
 # TODO- Handle gemini/gopher links
src/index.py (60 changed lines)
@@ -1,24 +1,29 @@
 #!/usr/bin/python3
 
 import argparse
 from sqlalchemy import create_engine, or_
 from config import DATABASE_URI
-from models import Base, Documents, Document_Tokens, Tokens
+from models import Base, Documents, Document_Tokens, Tokens, NGrams, Document_NGrams
 from sqlalchemy.orm import sessionmaker
 import uuid
 import datetime
+import re
+from multiprocessing import Pool
+
 engine = create_engine(DATABASE_URI)
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
 
 
-def build_index():
+def build_index_chunk(document_chunk):
     session = Session()
-    # Read list of 1000 documents from db
-    documents = session.query(Documents).filter(or_(Documents.last_index_date.is_(None), Documents.last_index_date<Documents.last_crawl_date)).limit(1000)
-    for document in documents:
+    for document in document_chunk:
         print(document.url)
-        content_words = document.text_content.split()
+        content = re.sub(r'[^\w\s]', '', str(document.text_content))
+        content_words = content.split()
+        build_ngrams(3, content_words, session, document.id)
+        build_ngrams(4, content_words, session, document.id)
+        build_ngrams(5, content_words, session, document.id)
         for word in content_words:
             word = word.lower()
             if len(word) > 50:
@@ -27,11 +32,52 @@ def build_index():
             if token is None:
                 token = Tokens(token=word, id=uuid.uuid4())
                 session.add(token)
-            document_token = Document_Tokens(document_id=document.id, token_id=token.id)
+            document_token = Document_Tokens(
+                document_id=document.id, token_id=token.id)
             session.add(document_token)
         document.last_index_date = datetime.datetime.now()
         session.add(document)
         session.commit()
+    session.close()
+
+
+def build_index():
+    session = Session()
+    documents_query = session.query(Documents).filter(or_(Documents.last_index_date.is_(
+        None), Documents.last_index_date < Documents.last_crawl_date)).limit(1000)
+    session.close()
+
+    documents = list(documents_query)  # Execute the query to get the result set
+
+    chunk_size = 100
+    document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)]
+
+    with Pool() as pool:
+        pool.map(build_index_chunk, document_chunks)
+
+
+def build_ngrams(size: int, corpus: str, session: sessionmaker, document_id: str):
+    i = 0
+    while i < len(corpus):
+        if i + size >= len(corpus):
+            i = len(corpus)
+        gram = ''
+        for n in range(0, size):
+            if i + n >= len(corpus):
+                break
+            gram += corpus[i+n] + ' '
+        gram = gram.rstrip().lower()
+        print(gram)
+
+        ngram = session.query(NGrams).filter_by(gram=gram).first()
+        if ngram is None:
+            ngram = NGrams(id=uuid.uuid4(), size=size, gram=gram)
+            session.add(ngram)
+        document_ngram = Document_NGrams(
+            document_id=document_id, ngram_id=ngram.id)
+        session.add(document_ngram)
+        session.commit()
+        i += 1
+
+
 
 
 if __name__ == "__main__":
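A note on build_ngrams as committed: i only ever advances by one, so grams overlap at every word position, but the guard (if i + size >= len(corpus)) jumps i to the end of the corpus before the gram is assembled, which skips the last full window and stores an empty string as a gram. A standalone reproduction of the windowing with the database calls stripped out (the function name here is made up for illustration):

def grams_as_committed(corpus, size):
    grams, i = [], 0
    while i < len(corpus):
        if i + size >= len(corpus):   # fires one window too early...
            i = len(corpus)           # ...so the loop below emits ''
        gram = ''
        for n in range(size):
            if i + n >= len(corpus):
                break
            gram += corpus[i + n] + ' '
        grams.append(gram.rstrip().lower())
        i += 1
    return grams

print(grams_as_committed("one two three four".split(), 3))
# ['one two three', ''] -- 'two three four' is never produced

Using > instead of >= in the guard, and breaking instead of emitting, would keep the final window and avoid writing empty grams to the ngrams table.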
@@ -1,5 +1,5 @@
 from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy import Column, String, DateTime, ForeignKey, Index
+from sqlalchemy import Column, String, DateTime, ForeignKey, Index, Integer
 from sqlalchemy.dialects.postgresql import UUID
 from sqlalchemy.orm import relationship, mapped_column
 import uuid
@@ -16,7 +16,10 @@ class Documents(Base):
     first_crawl_date = Column(DateTime)
     last_crawl_date = Column(DateTime)
     last_index_date = Column(DateTime)
-    document_tokens = relationship("Document_Tokens", back_populates="document")
+    document_tokens = relationship(
+        "Document_Tokens", back_populates="document")
+    document_ngrams = relationship(
+        "Document_NGrams", back_populates="document")
 
 
 class Document_Tokens(Base):
@@ -26,7 +29,8 @@ class Document_Tokens(Base):
     # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
     token_id = mapped_column(ForeignKey("tokens.id"))
     # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
-    document = relationship("Documents", back_populates="document_tokens", uselist=False)
+    document = relationship(
+        "Documents", back_populates="document_tokens", uselist=False)
     token = relationship("Tokens", back_populates="document_tokens")
@@ -35,3 +39,23 @@ class Tokens(Base):
     id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
     token = Column(String, index=True)
     document_tokens = relationship("Document_Tokens", back_populates="token")
+
+
+class NGrams(Base):
+    __tablename__ = 'ngrams'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    size = Column(Integer, index=True)
+    gram = Column(String, index=True)
+    document_ngrams = relationship("Document_NGrams", back_populates="ngram")
+
+
+class Document_NGrams(Base):
+    __tablename__ = 'document_ngrams'
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document_id = mapped_column(ForeignKey("documents.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    ngram_id = mapped_column(ForeignKey("ngrams.id"))
+    # Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
+    document = relationship(
+        "Documents", back_populates="document_ngrams", uselist=False)
+    ngram = relationship("NGrams", back_populates="document_ngrams")
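With the NGrams and Document_NGrams tables above, a stored phrase resolves to documents through the association rows and the relationships just defined. A sketch of such a lookup (it assumes the same DATABASE_URI/Session setup used by the other modules, and the phrase is only an example):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from config import DATABASE_URI
from models import NGrams

engine = create_engine(DATABASE_URI)
Session = sessionmaker(bind=engine)

session = Session()
# Grams are stored lowercased and space-joined, so query them the same way.
ngram = session.query(NGrams).filter_by(gram="open source software").first()
if ngram is not None:
    for document_ngram in ngram.document_ngrams:
        print(document_ngram.document.url)
session.close()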
@@ -1,7 +1,7 @@
 #!/usr/bin/python3
 from sqlalchemy import create_engine, func
 from config import DATABASE_URI
-from models import Base, Tokens, Documents, Document_Tokens
+from models import Base, Tokens, Documents, Document_Tokens, NGrams
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.sql.expression import distinct
 import time
@@ -16,7 +16,7 @@ Session = sessionmaker(bind=engine)
 
 
 def split_query(query):
-    result = {'ands': [], 'ors': [], 'words': []}
+    result = {'ands': [], 'ors': [], 'words': [], 'ngrams': []}
     query_words = query.split()
     i = 0
     while i < len(query_words):
@@ -27,6 +27,18 @@ def split_query(query):
                 query_words[i] + ',' + query_words[i+2])
             i = i + 3
             continue
+        if query_words[i][0] == '"':
+            n = 0
+            quoted_query = ""
+            while i+n < len(query_words):
+                quoted_query += query_words[i+n] + ' '
+                if query_words[i+n][len(query_words[i+n])-1] == '"':
+                    break
+                n += 1
+            result['ngrams'].append(
+                quoted_query[1:len(quoted_query)-2].rstrip())
+            i += n
+            continue
         result['words'].append(query_words[i])
         i += 1
     return result
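Tracing the quoted-phrase branch above on a hypothetical query (and assuming the AND/OR branch earlier in the loop does not fire for these words): the phrase lands in 'ngrams' with the surrounding quotes sliced off, but because the index advances by n rather than n + 1 after the break, the word that closes the phrase is visited again and also ends up in 'words'.

split_query('python "static site generator" blog')
# => {'ands': [], 'ors': [],
#     'words': ['python', 'generator"', 'blog'],
#     'ngrams': ['static site generator']}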
@@ -51,6 +63,17 @@ def search(query):
             results[result[0]] += result[1]
         else:
             results[result[0]] = result[1]
+    x = session.query(NGrams).filter(
+        NGrams.gram.in_(query_words['ngrams'])).all()
+
+    for y in x:
+        print(y.gram)
+        for document_ngram in y.document_ngrams:
+            if document_ngram.document.url in results.keys():
+                results[document_ngram.document.url] += 1
+            else:
+                results[document_ngram.document.url] = 1
+
     x = session.query(Tokens).filter(
         Tokens.token.in_(query_words['words'])).limit(1000)
     for y in x:
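The n-gram block above folds phrase matches into the same results dict used for single-word scores: every document linked to a matching gram gets one extra point per gram. A toy illustration of that merge with plain dicts (URLs are made up):

results = {'https://example.org/a': 3}          # scores from the word matches
phrase_hit_urls = ['https://example.org/a', 'https://example.org/b']

for url in phrase_hit_urls:                      # one entry per matching gram
    results[url] = results.get(url, 0) + 1

print(results)
# {'https://example.org/a': 4, 'https://example.org/b': 1}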