diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 7ddbc0885..5d6857e40 100644 --- a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -1,7 +1,11 @@ from pypdf import PdfReader +from pypdf.errors import PdfReadError from cognee.modules.chunking.Chunker import Chunker from .open_data_file import open_data_file from .Document import Document +from cognee.shared.logging_utils import get_logger + +logger = get_logger("PDFDocument") class PdfDocument(Document): @@ -9,12 +13,21 @@ class PdfDocument(Document): def read(self, chunker_cls: Chunker, max_chunk_size: int): with open_data_file(self.raw_data_location, mode="rb") as stream: - file = PdfReader(stream) + logger.info(f"Reading PDF:{self.raw_data_location}") + try: + file = PdfReader(stream, strict=False) + except PdfReadError: + logger.warning(f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location}") + return def get_text(): - for page in file.pages: - page_text = page.extract_text() - yield page_text + try: + for page in file.pages: + page_text = page.extract_text() + yield page_text + except PdfReadError: + logger.warning(f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location}") + return chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)