diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
index 7ddbc0885..e0e3136fc 100644
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -2,6 +2,10 @@ from pypdf import PdfReader
 from cognee.modules.chunking.Chunker import Chunker
 from .open_data_file import open_data_file
 from .Document import Document
+from cognee.shared.logging_utils import get_logger
+from cognee.modules.data.processing.document_types.exceptions.exceptions import PyPdfInternalError
+
+logger = get_logger("PDFDocument")
 
 
 class PdfDocument(Document):
@@ -9,12 +13,19 @@ class PdfDocument(Document):
 
     def read(self, chunker_cls: Chunker, max_chunk_size: int):
         with open_data_file(self.raw_data_location, mode="rb") as stream:
-            file = PdfReader(stream)
+            logger.info(f"Reading PDF:{self.raw_data_location}")
+            try:
+                file = PdfReader(stream, strict=False)
+            except Exception:
+                raise PyPdfInternalError()
 
             def get_text():
-                for page in file.pages:
-                    page_text = page.extract_text()
-                    yield page_text
+                try:
+                    for page in file.pages:
+                        page_text = page.extract_text()
+                        yield page_text
+                except Exception:
+                    raise PyPdfInternalError()
 
             chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
 
diff --git a/cognee/modules/data/processing/document_types/exceptions/__init__.py b/cognee/modules/data/processing/document_types/exceptions/__init__.py
new file mode 100644
index 000000000..2e7237201
--- /dev/null
+++ b/cognee/modules/data/processing/document_types/exceptions/__init__.py
@@ -0,0 +1,7 @@
+"""
+Custom exceptions for the Cognee API.
+
+This module defines a set of exceptions for the different document types
+"""
+
+from .exceptions import PyPdfInternalError
diff --git a/cognee/modules/data/processing/document_types/exceptions/exceptions.py b/cognee/modules/data/processing/document_types/exceptions/exceptions.py
new file mode 100644
index 000000000..b5126a8a7
--- /dev/null
+++ b/cognee/modules/data/processing/document_types/exceptions/exceptions.py
@@ -0,0 +1,14 @@
+from cognee.exceptions import CogneeApiError
+from fastapi import status
+
+
+class PyPdfInternalError(CogneeApiError):
+    """Internal pypdf error"""
+
+    def __init__(
+        self,
+        message: str = "Error during PyPdf processing. Pdf is damaged or cannot be processed.",
+        name: str = "PyPdfInternalError",
+        status_code=status.WS_1011_INTERNAL_ERROR,
+    ):
+        super().__init__(message, name, status_code)
diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py
index 08df7fa57..d0a698061 100644
--- a/cognee/tasks/documents/extract_chunks_from_documents.py
+++ b/cognee/tasks/documents/extract_chunks_from_documents.py
@@ -1,5 +1,6 @@
 from typing import AsyncGenerator
 
+from cognee.shared.logging_utils import get_logger
 from cognee.modules.data.processing.document_types.Document import Document
 from sqlalchemy import select
 from cognee.modules.data.models import Data
@@ -7,6 +8,7 @@ from cognee.infrastructure.databases.relational import get_relational_engine
 from uuid import UUID
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.chunking.Chunker import Chunker
+from cognee.modules.data.processing.document_types.exceptions.exceptions import PyPdfInternalError
 
 
 async def update_document_token_count(document_id: UUID, token_count: int) -> None:
@@ -38,10 +40,13 @@ async def extract_chunks_from_documents(
     """
     for document in documents:
         document_token_count = 0
-        for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
-            document_token_count += document_chunk.chunk_size
-            document_chunk.belongs_to_set = document.belongs_to_set
-            yield document_chunk
+        try:
+            for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
+                document_token_count += document_chunk.chunk_size
+                document_chunk.belongs_to_set = document.belongs_to_set
+                yield document_chunk
 
-        await update_document_token_count(document.id, document_token_count)
+            await update_document_token_count(document.id, document_token_count)
+        except PyPdfInternalError:
+            pass  # todo rita
 