# cognee/cognee/tests/integration/documents/PdfDocument_test.py
# 2025-08-27 12:41:13 +02:00
#
# 51 lines
# 1.8 KiB
# Python

import os
import sys
import uuid
import pytest
import pathlib
from unittest.mock import patch
from cognee.modules.chunking.TextChunker import TextChunker
from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
from cognee.tests.integration.documents.async_gen_zip import async_gen_zip
# Module object whose `get_embedding_engine` attribute is patched by the test
# below. It is looked up in `sys.modules` rather than imported directly, so
# this is None unless an earlier import has already loaded the module.
chunk_by_sentence_module = sys.modules.get("cognee.tasks.chunks.chunk_by_sentence")

# Expected properties of each chunk read from the sample PDF, in order:
# (word_count, len_text) per chunk; every chunk is expected to end on a
# sentence boundary.
GROUND_TRUTH = [
    {"word_count": word_count, "len_text": len_text, "cut_type": "sentence_end"}
    for word_count, len_text in (
        (879, 5697),
        (953, 6473),
    )
]
@patch.object(
    chunk_by_sentence_module, "get_embedding_engine", side_effect=mock_get_embedding_engine
)
@pytest.mark.asyncio
async def test_PdfDocument(mock_engine):
    """Check that chunking the sample PDF matches the recorded ground truth.

    Reads ``test_data/artificial-intelligence.pdf`` (three directories above
    this file) through ``PdfDocument.read`` with ``TextChunker`` and a
    ``max_chunk_size`` of 1024, then compares each produced chunk against the
    corresponding ``GROUND_TRUTH`` entry on ``chunk_size``, text length and
    ``cut_type``.

    While the test runs, ``get_embedding_engine`` on
    ``chunk_by_sentence_module`` is replaced by a mock whose side effect is
    ``mock_get_embedding_engine``.

    Args:
        mock_engine: The mock injected by ``patch.object``; it is not used
            directly in the test body.
    """
    test_file_path = os.path.join(
        pathlib.Path(__file__).parent.parent.parent,
        "test_data",
        "artificial-intelligence.pdf",
    )
    document = PdfDocument(
        id=uuid.uuid4(),
        name="Test document.pdf",
        raw_data_location=test_file_path,
        external_metadata="",
        mime_type="",
    )

    # Count the pairs actually compared. All per-chunk assertions live inside
    # the loop, so without this count a run that yields fewer chunks than
    # expected (or none at all) would pass without verifying anything.
    # NOTE(review): async_gen_zip is defined elsewhere; this assumes it stops
    # at the shorter of its two inputs -- confirm against its implementation.
    compared_chunks = 0
    async for ground_truth, paragraph_data in async_gen_zip(
        GROUND_TRUTH, document.read(chunker_cls=TextChunker, max_chunk_size=1024)
    ):
        compared_chunks += 1
        assert ground_truth["word_count"] == paragraph_data.chunk_size, (
            f'{ground_truth["word_count"] = } != {paragraph_data.chunk_size = }'
        )
        assert ground_truth["len_text"] == len(paragraph_data.text), (
            f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
        )
        assert ground_truth["cut_type"] == paragraph_data.cut_type, (
            f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'
        )

    assert compared_chunks == len(GROUND_TRUTH), (
        f"{compared_chunks = } != {len(GROUND_TRUTH) = }"
    )