cognee/cognee/tests/tasks/translation/translate_content_test.py
andikarachman 7542f5b360 test: add comprehensive translation module tests
- Add unit tests for translation configuration, language detection, providers, and translate_content task
- Add integration tests for full cognify pipeline with translation
- All 40 tests passing (32 unit + 8 integration)
- Tests use asyncio.run() pattern matching project style
- Tests named with *_test.py suffix per project convention
- Update README with test documentation

Formatting changes:
- Apply ruff format to cognify.py (bracket placement style)

Signed-off-by: andikarachman <andika.rachman.y@gmail.com>
2026-01-02 18:56:53 +07:00

256 lines
8.1 KiB
Python

"""
Unit tests for translate_content task
"""
import asyncio
import os
from uuid import uuid4
from cognee.modules.chunking.models import DocumentChunk
from cognee.modules.data.processing.document_types import TextDocument
from cognee.tasks.translation import translate_content
from cognee.tasks.translation.models import TranslatedContent, LanguageMetadata
def has_openai_key():
"""Check if OpenAI API key is available"""
return bool(os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY"))
def create_test_chunk(text: str, chunk_index: int = 0):
"""Helper to create a DocumentChunk with all required fields"""
# Create a minimal Document for the is_part_of field
doc = TextDocument(
id=uuid4(),
name="test_doc",
raw_data_location="/tmp/test.txt",
external_metadata=None,
mime_type="text/plain",
)
return DocumentChunk(
id=uuid4(),
text=text,
chunk_index=chunk_index,
chunk_size=len(text),
cut_type="sentence",
is_part_of=doc,
)
async def test_translate_content_basic():
"""Test basic content translation"""
if not has_openai_key():
print(" (skipped - no API key)")
return
# Create test chunk with Spanish text
original_text = "Hola mundo, esta es una prueba."
chunk = create_test_chunk(original_text)
result = await translate_content(
data_chunks=[chunk], target_language="en", translation_provider="openai"
)
assert len(result) == 1
# The chunk's text should now be translated (different from original Spanish)
assert result[0].text != original_text # Text should be translated to English
assert result[0].contains is not None
# Check for TranslatedContent in contains
has_translated_content = any(isinstance(item, TranslatedContent) for item in result[0].contains)
assert has_translated_content
async def test_translate_content_preserves_original():
"""Test that original text is preserved"""
if not has_openai_key():
print(" (skipped - no API key)")
return
original_text = "Bonjour le monde"
chunk = create_test_chunk(original_text)
result = await translate_content(
data_chunks=[chunk], target_language="en", preserve_original=True
)
# Find TranslatedContent in contains
translated_content = None
for item in result[0].contains:
if isinstance(item, TranslatedContent):
translated_content = item
break
assert translated_content is not None
assert translated_content.original_text == original_text
assert translated_content.translated_text != original_text
async def test_translate_content_skip_english():
"""Test skipping translation for English text"""
# This test doesn't require API call since English text is skipped
chunk = create_test_chunk("Hello world, this is a test.")
result = await translate_content(
data_chunks=[chunk], target_language="en", skip_if_target_language=True
)
# Text should remain unchanged
assert result[0].text == chunk.text
# Should have LanguageMetadata but not TranslatedContent
has_language_metadata = any(
isinstance(item, LanguageMetadata) for item in (result[0].contains or [])
)
has_translated_content = any(
isinstance(item, TranslatedContent) for item in (result[0].contains or [])
)
assert has_language_metadata
assert not has_translated_content
async def test_translate_content_multiple_chunks():
"""Test translation of multiple chunks"""
if not has_openai_key():
print(" (skipped - no API key)")
return
# Use longer texts to ensure reliable language detection
original_texts = [
"Hola mundo, esta es una prueba de traducción.",
"Bonjour le monde, ceci est un test de traduction.",
"Ciao mondo, questo è un test di traduzione.",
]
chunks = [create_test_chunk(text, i) for i, text in enumerate(original_texts)]
result = await translate_content(data_chunks=chunks, target_language="en")
assert len(result) == 3
# Check that at least some chunks were translated
translated_count = sum(
1
for chunk in result
if any(isinstance(item, TranslatedContent) for item in (chunk.contains or []))
)
assert translated_count >= 2 # At least 2 chunks should be translated
async def test_translate_content_empty_list():
"""Test with empty chunk list"""
result = await translate_content(data_chunks=[], target_language="en")
assert result == []
async def test_translate_content_empty_text():
"""Test with chunk containing empty text"""
chunk = create_test_chunk("")
result = await translate_content(data_chunks=[chunk], target_language="en")
assert len(result) == 1
assert result[0].text == ""
async def test_translate_content_language_metadata():
"""Test that LanguageMetadata is created correctly"""
if not has_openai_key():
print(" (skipped - no API key)")
return
# Use a longer, distinctly Spanish text to ensure reliable detection
chunk = create_test_chunk(
"La inteligencia artificial está cambiando el mundo de manera significativa"
)
result = await translate_content(data_chunks=[chunk], target_language="en")
# Find LanguageMetadata
language_metadata = None
for item in result[0].contains:
if isinstance(item, LanguageMetadata):
language_metadata = item
break
assert language_metadata is not None
# Just check that a language was detected (short texts can be ambiguous)
assert language_metadata.detected_language is not None
assert language_metadata.requires_translation is True
assert language_metadata.language_confidence > 0.0
async def test_translate_content_confidence_threshold():
"""Test with custom confidence threshold"""
if not has_openai_key():
print(" (skipped - no API key)")
return
# Use longer text for more reliable detection
chunk = create_test_chunk("Hola mundo, esta es una frase más larga para mejor detección")
result = await translate_content(
data_chunks=[chunk], target_language="en", confidence_threshold=0.5
)
assert len(result) == 1
async def test_translate_content_no_preserve_original():
"""Test translation without preserving original"""
if not has_openai_key():
print(" (skipped - no API key)")
return
# Use longer text for more reliable detection
chunk = create_test_chunk("Bonjour le monde, comment allez-vous aujourd'hui")
result = await translate_content(
data_chunks=[chunk], target_language="en", preserve_original=False
)
# Find TranslatedContent
translated_content = None
for item in result[0].contains:
if isinstance(item, TranslatedContent):
translated_content = item
break
assert translated_content is not None
assert translated_content.original_text == "" # Should be empty
async def main():
"""Run all translate_content tests"""
await test_translate_content_basic()
print("✓ test_translate_content_basic passed")
await test_translate_content_preserves_original()
print("✓ test_translate_content_preserves_original passed")
await test_translate_content_skip_english()
print("✓ test_translate_content_skip_english passed")
await test_translate_content_multiple_chunks()
print("✓ test_translate_content_multiple_chunks passed")
await test_translate_content_empty_list()
print("✓ test_translate_content_empty_list passed")
await test_translate_content_empty_text()
print("✓ test_translate_content_empty_text passed")
await test_translate_content_language_metadata()
print("✓ test_translate_content_language_metadata passed")
await test_translate_content_confidence_threshold()
print("✓ test_translate_content_confidence_threshold passed")
await test_translate_content_no_preserve_original()
print("✓ test_translate_content_no_preserve_original passed")
print("\nAll translate_content tests passed!")
if __name__ == "__main__":
asyncio.run(main())