From 278ede9dbed614845cb65efc7f1f46a28e58a3db Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 2 Jul 2025 19:29:23 +0200 Subject: [PATCH] Fix: fixes damaged pdf file errors --- .../processing/document_types/PdfDocument.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 7ddbc0885..5d6857e40 100644 --- a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -1,7 +1,11 @@ from pypdf import PdfReader +from pypdf.errors import PdfReadError from cognee.modules.chunking.Chunker import Chunker from .open_data_file import open_data_file from .Document import Document +from cognee.shared.logging_utils import get_logger + +logger = get_logger("PDFDocument") class PdfDocument(Document): @@ -9,12 +13,21 @@ class PdfDocument(Document): def read(self, chunker_cls: Chunker, max_chunk_size: int): with open_data_file(self.raw_data_location, mode="rb") as stream: - file = PdfReader(stream) + logger.info(f"Reading PDF:{self.raw_data_location}") + try: + file = PdfReader(stream, strict=False) + except PdfReadError: + logger.warning(f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location}") + return def get_text(): - for page in file.pages: - page_text = page.extract_text() - yield page_text + try: + for page in file.pages: + page_text = page.extract_text() + yield page_text + except PdfReadError: + logger.warning(f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location}") + return chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)