feat: adds errors to classify, and chunking top level

2025-08-14 13:12:08 +02:00 · 2025-08-14 13:12:08 +02:00 · df3a3df117
commit df3a3df117
parent c99b453d96
4 changed files with 30 additions and 4 deletions
--- a/cognee/tasks/documents/classify_documents.py
+++ b/cognee/tasks/documents/classify_documents.py
@ -117,7 +117,6 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]:

    documents = []
    for data_item in data_documents:
-
        document = EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](
            id=data_item.id,
            title=f"{data_item.name}.{data_item.extension}",
--- a/cognee/tasks/documents/exceptions/init.py
+++ b/cognee/tasks/documents/exceptions/init.py
@ -6,4 +6,6 @@ This module defines a set of exceptions for handling various data errors

 from .exceptions import (
    WrongDataDocumentInputError,
+    InvalidChunkSizeError,
+    InvalidChunkerError,
 )
--- a/cognee/tasks/documents/exceptions/exceptions.py
+++ b/cognee/tasks/documents/exceptions/exceptions.py
@ -7,6 +7,7 @@ from fastapi import status

 class WrongDataDocumentInputError(CogneeValidationError):
    """Raised when a wrong data document is provided."""
+
    def __init__(
        self,
        field: str,
@ -14,4 +15,22 @@ class WrongDataDocumentInputError(CogneeValidationError):
        status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
    ):
        message = f"Missing of invalid parameter: '{field}'."
-        super().__init__(message, name, status_code)
+        super().__init__(message, name, status_code)
+
+
+class InvalidChunkSizeError(CogneeValidationError):
+    """Raised when ``max_chunk_size`` is not a positive integer.
+
+    The rejected value is embedded in the error message and the error is
+    reported with HTTP 400 (Bad Request).
+    """
+
+    def __init__(self, value):
+        # ``!r`` keeps the offending value's type visible: the string "10" is
+        # reported as '10' instead of a bare 10 that looks like a valid int.
+        # Numeric values and None render exactly as before.
+        super().__init__(
+            message=f"max_chunk_size must be a positive integer (got {value!r}).",
+            name="InvalidChunkSizeError",
+            status_code=status.HTTP_400_BAD_REQUEST,
+        )
+
+
+class InvalidChunkerError(CogneeValidationError):
+    """Raised when the supplied ``chunker`` is not a valid Chunker class.
+
+    Takes no arguments; the message is fixed and the error is reported with
+    HTTP 400 (Bad Request).
+    """
+
+    def __init__(self):
+        super().__init__(
+            # Plain literal: the message has no placeholders, so no f-prefix.
+            message="chunker must be a valid Chunker class.",
+            name="InvalidChunkerError",
+            status_code=status.HTTP_400_BAD_REQUEST,
+        )
--- a/cognee/tasks/documents/extract_chunks_from_documents.py
+++ b/cognee/tasks/documents/extract_chunks_from_documents.py
@ -8,6 +8,7 @@ from cognee.modules.data.models import Data
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.chunking.Chunker import Chunker
+from cognee.tasks.documents.exceptions import InvalidChunkSizeError, InvalidChunkerError


 async def update_document_token_count(document_id: UUID, token_count: int) -> None:
@ -37,6 +38,13 @@ async def extract_chunks_from_documents(
        - The `read` method of the `Document` class must be implemented to support the chunking operation.
        - The `chunker` parameter determines the chunking logic and should align with the document type.
    """
+    if not isinstance(max_chunk_size, int) or max_chunk_size <= 0:
+        raise InvalidChunkSizeError(max_chunk_size)
+    if not isinstance(chunker, type):
+        raise InvalidChunkerError()
+    if not hasattr(chunker, "read"):
+        raise InvalidChunkerError()
+
    for document in documents:
        document_token_count = 0

@ -48,5 +56,3 @@ async def extract_chunks_from_documents(
            yield document_chunk

        await update_document_token_count(document.id, document_token_count)
-
-        # todo rita