Count the number of tokens in documents [COG-1071] (#476)
* Count the number of tokens in documents
* Save token count to relational db

---------

Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
parent d900060e2b
commit edae2771a5

5 changed files with 31 additions and 2 deletions

@@ -46,6 +46,7 @@ class TextChunker:
                     id=chunk_data["chunk_id"],
                     text=chunk_data["text"],
                     word_count=chunk_data["word_count"],
+                    token_count=chunk_data["token_count"],
                     is_part_of=self.document,
                     chunk_index=self.chunk_index,
                     cut_type=chunk_data["cut_type"],

@@ -65,6 +66,7 @@ class TextChunker:
                     ),
                     text=chunk_text,
                     word_count=self.chunk_size,
+                    token_count=self.token_count,
                     is_part_of=self.document,
                     chunk_index=self.chunk_index,
                     cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],

@@ -87,6 +89,7 @@ class TextChunker:
                     id=uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
                     text=" ".join(chunk["text"] for chunk in paragraph_chunks),
                     word_count=self.chunk_size,
+                    token_count=self.token_count,
                     is_part_of=self.document,
                     chunk_index=self.chunk_index,
                     cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
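
The hunks above pass chunk_data["token_count"] and self.token_count into the chunk object, but the code that computes those values sits outside this diff. As a rough sketch only, a per-chunk token count could be derived with a tokenizer such as tiktoken; the encoder name below is an assumption, not necessarily what cognee uses.

    import tiktoken

    # Assumed encoder; the actual tokenizer used by the chunker is not shown in this diff.
    encoding = tiktoken.get_encoding("cl100k_base")


    def count_tokens(text: str) -> int:
        # Number of tokens the encoder produces for this piece of text.
        return len(encoding.encode(text))


    chunk_data = {"text": "Example chunk text.", "word_count": 3}
    chunk_data["token_count"] = count_tokens(chunk_data["text"])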

@@ -9,6 +9,7 @@ class DocumentChunk(DataPoint):
     __tablename__ = "document_chunk"
     text: str
     word_count: int
+    token_count: int
     chunk_index: int
     cut_type: str
     is_part_of: Document

@@ -1,6 +1,6 @@
 from datetime import datetime, timezone
 from uuid import uuid4
-from sqlalchemy import UUID, Column, DateTime, String, JSON
+from sqlalchemy import UUID, Column, DateTime, String, JSON, Integer
 from sqlalchemy.orm import relationship

 from cognee.infrastructure.databases.relational import Base

@@ -20,6 +20,7 @@ class Data(Base):
     owner_id = Column(UUID, index=True)
     content_hash = Column(String)
     external_metadata = Column(JSON)
+    token_count = Column(Integer)
     created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
     updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
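
token_count is a new column on an existing table, so databases created before this change would need a schema migration. The sketch below is illustrative only: it assumes an Alembic-style migration and that the table is named "data", neither of which is shown in this diff.

    # Hypothetical Alembic migration, not included in this commit.
    import sqlalchemy as sa
    from alembic import op


    def upgrade() -> None:
        # Nullable so existing rows remain valid until a count is backfilled.
        op.add_column("data", sa.Column("token_count", sa.Integer(), nullable=True))


    def downgrade() -> None:
        op.drop_column("data", "token_count")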

@@ -1,6 +1,25 @@
-from typing import Optional, AsyncGenerator
+from typing import AsyncGenerator

 from cognee.modules.data.processing.document_types.Document import Document
+from sqlalchemy import select
+from cognee.modules.data.models import Data
+from cognee.infrastructure.databases.relational import get_relational_engine
+from uuid import UUID
+
+
+async def update_document_token_count(document_id: UUID, token_count: int) -> None:
+    db_engine = get_relational_engine()
+    async with db_engine.get_async_session() as session:
+        document_data_point = (
+            await session.execute(select(Data).filter(Data.id == document_id))
+        ).scalar_one_or_none()
+
+        if document_data_point:
+            document_data_point.token_count = token_count
+            await session.merge(document_data_point)
+            await session.commit()
+        else:
+            raise ValueError(f"Document with id {document_id} not found.")


 async def extract_chunks_from_documents(

@@ -17,7 +36,11 @@ async def extract_chunks_from_documents(
     - The `chunker` parameter determines the chunking logic and should align with the document type.
     """
     for document in documents:
+        document_token_count = 0
         for document_chunk in document.read(
             chunk_size=chunk_size, chunker=chunker, max_chunk_tokens=max_chunk_tokens
         ):
+            document_token_count += document_chunk.token_count
             yield document_chunk
+
+        await update_document_token_count(document.id, document_token_count)
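
For reference, reading the persisted count back uses the same engine and model that update_document_token_count relies on. The helper below is a sketch and not part of this commit; read_document_token_count is a made-up name.

    from typing import Optional
    from uuid import UUID

    from sqlalchemy import select

    from cognee.infrastructure.databases.relational import get_relational_engine
    from cognee.modules.data.models import Data


    # Illustrative helper, not part of this commit.
    async def read_document_token_count(document_id: UUID) -> Optional[int]:
        db_engine = get_relational_engine()
        async with db_engine.get_async_session() as session:
            data_point = (
                await session.execute(select(Data).filter(Data.id == document_id))
            ).scalar_one_or_none()
            # None means the document row does not exist; -1 means it has not been counted yet.
            return data_point.token_count if data_point else None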

@@ -107,6 +107,7 @@ async def ingest_data(data: Any, dataset_name: str, user: User):
                 owner_id=user.id,
                 content_hash=file_metadata["content_hash"],
                 external_metadata=get_external_metadata_dict(data_item),
+                token_count=-1,
             )

             # Check if data is already in dataset
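
ingest_data writes token_count=-1, which appears to act as a placeholder until extract_chunks_from_documents stores the real total via update_document_token_count. If it helps to locate documents that have not been counted yet, a query along these lines would work; it is a sketch and not part of the commit.

    from sqlalchemy import select

    from cognee.infrastructure.databases.relational import get_relational_engine
    from cognee.modules.data.models import Data


    # Illustrative query, not part of this commit: documents still carrying the -1 placeholder.
    async def find_uncounted_documents():
        db_engine = get_relational_engine()
        async with db_engine.get_async_session() as session:
            result = await session.execute(select(Data).filter(Data.token_count == -1))
            return result.scalars().all()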