From 278ede9dbed614845cb65efc7f1f46a28e58a3db Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Wed, 2 Jul 2025 19:29:23 +0200
Subject: [PATCH] Fix: handle errors from damaged PDF files

---
 .../processing/document_types/PdfDocument.py  | 21 +++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
index 7ddbc0885..5d6857e40 100644
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -1,7 +1,11 @@
 from pypdf import PdfReader
+from pypdf.errors import PdfReadError
 from cognee.modules.chunking.Chunker import Chunker
 from .open_data_file import open_data_file
 from .Document import Document
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger("PDFDocument")
 
 
 class PdfDocument(Document):
@@ -9,12 +13,21 @@ class PdfDocument(Document):
 
     def read(self, chunker_cls: Chunker, max_chunk_size: int):
         with open_data_file(self.raw_data_location, mode="rb") as stream:
-            file = PdfReader(stream)
+            logger.info(f"Reading PDF:{self.raw_data_location}")
+            try:
+                file = PdfReader(stream, strict=False)
+            except PdfReadError:
+                logger.warning(f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location}")
+                return
 
             def get_text():
-                for page in file.pages:
-                    page_text = page.extract_text()
-                    yield page_text
+                try:
+                    for page in file.pages:
+                        page_text = page.extract_text()
+                        yield page_text
+                except PdfReadError:
+                    logger.warning(f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location}")
+                    return
 
             chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)