diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py index 6567e5825..9fa512906 100644 --- a/cognee/tasks/documents/classify_documents.py +++ b/cognee/tasks/documents/classify_documents.py @@ -117,7 +117,6 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]: documents = [] for data_item in data_documents: - document = EXTENSION_TO_DOCUMENT_CLASS[data_item.extension]( id=data_item.id, title=f"{data_item.name}.{data_item.extension}", diff --git a/cognee/tasks/documents/exceptions/__init__.py b/cognee/tasks/documents/exceptions/__init__.py index cdd50c6b0..a8602d6f5 100644 --- a/cognee/tasks/documents/exceptions/__init__.py +++ b/cognee/tasks/documents/exceptions/__init__.py @@ -6,4 +6,6 @@ This module defines a set of exceptions for handling various data errors from .exceptions import ( WrongDataDocumentInputError, + InvalidChunkSizeError, + InvalidChunkerError, ) diff --git a/cognee/tasks/documents/exceptions/exceptions.py b/cognee/tasks/documents/exceptions/exceptions.py index a1fcb1d4d..27907aaf1 100644 --- a/cognee/tasks/documents/exceptions/exceptions.py +++ b/cognee/tasks/documents/exceptions/exceptions.py @@ -7,6 +7,7 @@ from fastapi import status class WrongDataDocumentInputError(CogneeValidationError): """Raised when a wrong data document is provided.""" + def __init__( self, field: str, @@ -14,4 +15,22 @@ class WrongDataDocumentInputError(CogneeValidationError): status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY, ): message = f"Missing of invalid parameter: '{field}'." 
- super().__init__(message, name, status_code) \ No newline at end of file + super().__init__(message, name, status_code) + + +class InvalidChunkSizeError(CogneeValidationError): + def __init__(self, value): + super().__init__( + message=f"max_chunk_size must be a positive integer (got {value}).", + name="InvalidChunkSizeError", + status_code=status.HTTP_400_BAD_REQUEST, + ) + + +class InvalidChunkerError(CogneeValidationError): + def __init__(self): + super().__init__( + message="chunker must be a valid Chunker class.", + name="InvalidChunkerError", + status_code=status.HTTP_400_BAD_REQUEST, + ) diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py index 216185495..d52380c8d 100644 --- a/cognee/tasks/documents/extract_chunks_from_documents.py +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -8,6 +8,7 @@ from cognee.modules.data.models import Data from cognee.infrastructure.databases.relational import get_relational_engine from cognee.modules.chunking.TextChunker import TextChunker from cognee.modules.chunking.Chunker import Chunker +from cognee.tasks.documents.exceptions import InvalidChunkSizeError, InvalidChunkerError async def update_document_token_count(document_id: UUID, token_count: int) -> None: @@ -37,6 +38,13 @@ async def extract_chunks_from_documents( - The `read` method of the `Document` class must be implemented to support the chunking operation. - The `chunker` parameter determines the chunking logic and should align with the document type. 
""" + if not isinstance(max_chunk_size, int) or isinstance(max_chunk_size, bool) or max_chunk_size <= 0: + raise InvalidChunkSizeError(max_chunk_size) + if not isinstance(chunker, type): + raise InvalidChunkerError() + if not issubclass(chunker, Chunker): + raise InvalidChunkerError() + for document in documents: document_token_count = 0 @@ -48,5 +56,3 @@ async def extract_chunks_from_documents( yield document_chunk await update_document_token_count(document.id, document_token_count) - - # todo rita