Fix: fixes damaged pdf file errors

This commit is contained in:
hajdul88 2025-07-02 19:29:23 +02:00
parent e68b19ec83
commit 278ede9dbe

View file

@ -1,7 +1,11 @@
from pypdf import PdfReader
from pypdf.errors import PdfReadError
from cognee.modules.chunking.Chunker import Chunker
from .open_data_file import open_data_file
from .Document import Document
from cognee.shared.logging_utils import get_logger
logger = get_logger("PDFDocument")
class PdfDocument(Document):
@ -9,12 +13,21 @@ class PdfDocument(Document):
def read(self, chunker_cls: Chunker, max_chunk_size: int):
with open_data_file(self.raw_data_location, mode="rb") as stream:
file = PdfReader(stream)
logger.info(f"Reading PDF:{self.raw_data_location}")
try:
file = PdfReader(stream, strict=False)
except PdfReadError:
logger.warning(f"PyPDF couldnt open PDF—skipping: {self.raw_data_location}")
return
def get_text():
for page in file.pages:
page_text = page.extract_text()
yield page_text
try:
for page in file.pages:
page_text = page.extract_text()
yield page_text
except PdfReadError:
logger.warning(f"PyPDF couldnt open PDF—skipping: {self.raw_data_location}")
return
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)