Merge remote-tracking branch 'origin/dev' into feat/modal-parallelization
This commit is contained in:
commit
fa5ea44345
4 changed files with 46 additions and 9 deletions
|
|
@ -2,6 +2,10 @@ from pypdf import PdfReader
|
||||||
from cognee.modules.chunking.Chunker import Chunker
|
from cognee.modules.chunking.Chunker import Chunker
|
||||||
from .open_data_file import open_data_file
|
from .open_data_file import open_data_file
|
||||||
from .Document import Document
|
from .Document import Document
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
from cognee.modules.data.processing.document_types.exceptions.exceptions import PyPdfInternalError
|
||||||
|
|
||||||
|
logger = get_logger("PDFDocument")
|
||||||
|
|
||||||
|
|
||||||
class PdfDocument(Document):
|
class PdfDocument(Document):
|
||||||
|
|
@ -9,12 +13,19 @@ class PdfDocument(Document):
|
||||||
|
|
||||||
def read(self, chunker_cls: Chunker, max_chunk_size: int):
|
def read(self, chunker_cls: Chunker, max_chunk_size: int):
|
||||||
with open_data_file(self.raw_data_location, mode="rb") as stream:
|
with open_data_file(self.raw_data_location, mode="rb") as stream:
|
||||||
file = PdfReader(stream)
|
logger.info(f"Reading PDF:{self.raw_data_location}")
|
||||||
|
try:
|
||||||
|
file = PdfReader(stream, strict=False)
|
||||||
|
except Exception:
|
||||||
|
raise PyPdfInternalError()
|
||||||
|
|
||||||
def get_text():
|
def get_text():
|
||||||
for page in file.pages:
|
try:
|
||||||
page_text = page.extract_text()
|
for page in file.pages:
|
||||||
yield page_text
|
page_text = page.extract_text()
|
||||||
|
yield page_text
|
||||||
|
except Exception:
|
||||||
|
raise PyPdfInternalError()
|
||||||
|
|
||||||
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
|
chunker = chunker_cls(self, get_text=get_text, max_chunk_size=max_chunk_size)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
"""
|
||||||
|
Custom exceptions for the Cognee API.
|
||||||
|
|
||||||
|
This module defines a set of exceptions for the different document types
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .exceptions import PyPdfInternalError
|
||||||
|
|
@ -0,0 +1,14 @@
|
||||||
|
from cognee.exceptions import CogneeApiError
|
||||||
|
from fastapi import status
|
||||||
|
|
||||||
|
|
||||||
|
class PyPdfInternalError(CogneeApiError):
    """Raised when pypdf cannot process a PDF (the file is damaged or unreadable)."""

    def __init__(
        self,
        message: str = "Error during PyPdf processing. Pdf is damaged or cannot be processed.",
        name: str = "PyPdfInternalError",
        status_code=status.WS_1011_INTERNAL_ERROR,
    ):
        # NOTE(review): `status.WS_1011_INTERNAL_ERROR` is a WebSocket close code
        # (1011), not an HTTP status — confirm this is intentional for an API error;
        # `status.HTTP_500_INTERNAL_SERVER_ERROR` may have been meant.
        # Delegate to the shared CogneeApiError constructor (message, name, status).
        super().__init__(message, name, status_code)
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
from typing import AsyncGenerator
|
from typing import AsyncGenerator
|
||||||
|
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
from cognee.modules.data.processing.document_types.Document import Document
|
from cognee.modules.data.processing.document_types.Document import Document
|
||||||
from sqlalchemy import select
|
from sqlalchemy import select
|
||||||
from cognee.modules.data.models import Data
|
from cognee.modules.data.models import Data
|
||||||
|
|
@ -7,6 +8,7 @@ from cognee.infrastructure.databases.relational import get_relational_engine
|
||||||
from uuid import UUID
|
from uuid import UUID
|
||||||
from cognee.modules.chunking.TextChunker import TextChunker
|
from cognee.modules.chunking.TextChunker import TextChunker
|
||||||
from cognee.modules.chunking.Chunker import Chunker
|
from cognee.modules.chunking.Chunker import Chunker
|
||||||
|
from cognee.modules.data.processing.document_types.exceptions.exceptions import PyPdfInternalError
|
||||||
|
|
||||||
|
|
||||||
async def update_document_token_count(document_id: UUID, token_count: int) -> None:
|
async def update_document_token_count(document_id: UUID, token_count: int) -> None:
|
||||||
|
|
@ -38,10 +40,13 @@ async def extract_chunks_from_documents(
|
||||||
"""
|
"""
|
||||||
for document in documents:
|
for document in documents:
|
||||||
document_token_count = 0
|
document_token_count = 0
|
||||||
for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
|
try:
|
||||||
document_token_count += document_chunk.chunk_size
|
for document_chunk in document.read(max_chunk_size=max_chunk_size, chunker_cls=chunker):
|
||||||
document_chunk.belongs_to_set = document.belongs_to_set
|
document_token_count += document_chunk.chunk_size
|
||||||
yield document_chunk
|
document_chunk.belongs_to_set = document.belongs_to_set
|
||||||
|
yield document_chunk
|
||||||
|
|
||||||
await update_document_token_count(document.id, document_token_count)
|
await update_document_token_count(document.id, document_token_count)
|
||||||
|
except PyPdfInternalError:
|
||||||
|
pass
|
||||||
# TODO(rita): document is silently skipped when pypdf fails — decide whether to
# log a warning or surface the error to the caller instead of `pass`.
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue