fix: Add a quick fix that gracefully handles damaged PDF files (#1047)

<!-- .github/pull_request_template.md -->

## Description
fix: Add a quick fix that gracefully handles damaged PDF files

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
This commit is contained in:
hajdul88 2025-07-06 13:09:42 +02:00 committed by GitHub
parent c936f5e0a3
commit 3c3c89a140
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 46 additions and 9 deletions

View file

@ -2,6 +2,10 @@ from pypdf import PdfReader
from cognee.modules.chunking.Chunker import Chunker
from .open_data_file import open_data_file
from .Document import Document
from cognee.shared.logging_utils import get_logger
from cognee.modules.data.processing.document_types.exceptions.exceptions import PyPdfInternalError
logger = get_logger("PDFDocument")
class PdfDocument(Document):
@ -9,12 +13,19 @@ class PdfDocument(Document):
def read(self, chunker_cls: Chunker, max_chunk_size: int):
with open_data_file(self.raw_data_location, mode="rb") as stream:
file = PdfReader(stream)
logger.info(f"Reading PDF:{self.raw_data_location}")
try:
file = PdfReader(stream, strict=False)
except Exception:
raise PyPdfInternalError()
def get_text():
for page in file.pages:
page_text = page.extract_text()
yield page_text
try:
for page in file.pages:
page_text = page.extract_text()
yield page_text
except Exception:
raise PyPdfInternalError()
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)

View file

@ -0,0 +1,7 @@
"""
Custom exceptions for the Cognee API.
This module defines a set of exceptions for the different document types.
"""
from .exceptions import PyPdfInternalError

View file

@ -0,0 +1,14 @@
from cognee.exceptions import CogneeApiError
from fastapi import status
class PyPdfInternalError(CogneeApiError):
    """Error signalling that pypdf could not process a PDF file.

    Intended for cases where the PDF is damaged or otherwise unreadable.

    Args:
        message: Human-readable description of the failure.
        name: Short identifier for the error type.
        status_code: Status code forwarded to ``CogneeApiError``.
    """

    def __init__(
        self,
        message: str = "Error during PyPdf processing. Pdf is damaged or cannot be processed.",
        name: str = "PyPdfInternalError",
        # The previous default, WS_1011_INTERNAL_ERROR, is a WebSocket close
        # code (1011) and lies outside the valid HTTP status range (100-599).
        # NOTE(review): assumes CogneeApiError's status_code ends up as an HTTP
        # response status -- confirm against the API exception handler.
        status_code: int = status.HTTP_500_INTERNAL_SERVER_ERROR,
    ):
        super().__init__(message, name, status_code)

View file

@ -1,5 +1,6 @@
from typing import AsyncGenerator
from cognee.shared.logging_utils import get_logger
from cognee.modules.data.processing.document_types.Document import Document
from sqlalchemy import select
from cognee.modules.data.models import Data
@ -7,6 +8,7 @@ from cognee.infrastructure.databases.relational import get_relational_engine
from uuid import UUID
from cognee.modules.chunking.TextChunker import TextChunker
from cognee.modules.chunking.Chunker import Chunker
from cognee.modules.data.processing.document_types.exceptions.exceptions import PyPdfInternalError
async def update_document_token_count(document_id: UUID, token_count: int) -> None:
@ -38,10 +40,13 @@ async def extract_chunks_from_documents(
"""
for document in documents:
document_token_count = 0
for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
document_token_count += document_chunk.chunk_size
document_chunk.belongs_to_set = document.belongs_to_set
yield document_chunk
try:
for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
document_token_count += document_chunk.chunk_size
document_chunk.belongs_to_set = document.belongs_to_set
yield document_chunk
await update_document_token_count(document.id, document_token_count)
await update_document_token_count(document.id, document_token_count)
except PyPdfInternalError:
pass
# todo rita