Fix: fixes damaged pdf file errors
This commit is contained in:
parent
e68b19ec83
commit
278ede9dbe
1 changed files with 17 additions and 4 deletions
|
|
@ -1,7 +1,11 @@
|
|||
from pypdf import PdfReader
|
||||
from pypdf.errors import PdfReadError
|
||||
from cognee.modules.chunking.Chunker import Chunker
|
||||
from .open_data_file import open_data_file
|
||||
from .Document import Document
|
||||
from cognee.shared.logging_utils import get_logger
|
||||
|
||||
logger = get_logger("PDFDocument")
|
||||
|
||||
|
||||
class PdfDocument(Document):
|
||||
|
|
@ -9,12 +13,21 @@ class PdfDocument(Document):
|
|||
|
||||
def read(self, chunker_cls: Chunker, max_chunk_size: int):
|
||||
with open_data_file(self.raw_data_location, mode="rb") as stream:
|
||||
file = PdfReader(stream)
|
||||
logger.info(f"Reading PDF:{self.raw_data_location}")
|
||||
try:
|
||||
file = PdfReader(stream, strict=False)
|
||||
except PdfReadError:
|
||||
logger.warning(f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location}")
|
||||
return
|
||||
|
||||
def get_text():
|
||||
for page in file.pages:
|
||||
page_text = page.extract_text()
|
||||
yield page_text
|
||||
try:
|
||||
for page in file.pages:
|
||||
page_text = page.extract_text()
|
||||
yield page_text
|
||||
except PdfReadError:
|
||||
logger.warning(f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location}")
|
||||
return
|
||||
|
||||
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue