Fix: fixes damaged pdf file errors
This commit is contained in:
parent
e68b19ec83
commit
278ede9dbe
1 changed files with 17 additions and 4 deletions
|
|
@ -1,7 +1,11 @@
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
|
from pypdf.errors import PdfReadError
|
||||||
from cognee.modules.chunking.Chunker import Chunker
|
from cognee.modules.chunking.Chunker import Chunker
|
||||||
from .open_data_file import open_data_file
|
from .open_data_file import open_data_file
|
||||||
from .Document import Document
|
from .Document import Document
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger("PDFDocument")
|
||||||
|
|
||||||
|
|
||||||
class PdfDocument(Document):
|
class PdfDocument(Document):
|
||||||
|
|
@ -9,12 +13,21 @@ class PdfDocument(Document):
|
||||||
|
|
||||||
def read(self, chunker_cls: Chunker, max_chunk_size: int):
|
def read(self, chunker_cls: Chunker, max_chunk_size: int):
|
||||||
with open_data_file(self.raw_data_location, mode="rb") as stream:
|
with open_data_file(self.raw_data_location, mode="rb") as stream:
|
||||||
file = PdfReader(stream)
|
logger.info(f"Reading PDF:{self.raw_data_location}")
|
||||||
|
try:
|
||||||
|
file = PdfReader(stream, strict=False)
|
||||||
|
except PdfReadError:
|
||||||
|
logger.warning(f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location}")
|
||||||
|
return
|
||||||
|
|
||||||
def get_text():
|
def get_text():
|
||||||
for page in file.pages:
|
try:
|
||||||
page_text = page.extract_text()
|
for page in file.pages:
|
||||||
yield page_text
|
page_text = page.extract_text()
|
||||||
|
yield page_text
|
||||||
|
except PdfReadError:
|
||||||
|
logger.warning(f"PyPDF couldn’t open PDF—skipping: {self.raw_data_location}")
|
||||||
|
return
|
||||||
|
|
||||||
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
|
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue