From c9b1767050acc8f3530216263b627e7d1527a24e Mon Sep 17 00:00:00 2001 From: andikarachman Date: Thu, 1 Jan 2026 15:46:53 +0700 Subject: [PATCH 01/16] feat(translation): implement multilingual content translation task - Add translation module with OpenAI, Google, Azure provider support - Implement language detection using langdetect - Add TranslatedContent and LanguageMetadata models - Integrate translation task into cognify pipeline - Add auto_translate parameter to cognify() function - Preserve original text alongside translations - Support custom translation providers and target languages --- cognee/api/v1/cognify/cognify.py | 89 +++++- .../llm/prompts/translate_content.txt | 19 ++ cognee/tasks/translation/__init__.py | 96 +++++++ cognee/tasks/translation/config.py | 63 +++++ cognee/tasks/translation/detect_language.py | 190 +++++++++++++ cognee/tasks/translation/exceptions.py | 53 ++++ cognee/tasks/translation/models.py | 72 +++++ .../tasks/translation/providers/__init__.py | 40 +++ .../translation/providers/azure_provider.py | 182 ++++++++++++ cognee/tasks/translation/providers/base.py | 69 +++++ .../translation/providers/google_provider.py | 159 +++++++++++ .../translation/providers/openai_provider.py | 107 +++++++ cognee/tasks/translation/translate_content.py | 265 ++++++++++++++++++ 13 files changed, 1397 insertions(+), 7 deletions(-) create mode 100644 cognee/infrastructure/llm/prompts/translate_content.txt create mode 100644 cognee/tasks/translation/__init__.py create mode 100644 cognee/tasks/translation/config.py create mode 100644 cognee/tasks/translation/detect_language.py create mode 100644 cognee/tasks/translation/exceptions.py create mode 100644 cognee/tasks/translation/models.py create mode 100644 cognee/tasks/translation/providers/__init__.py create mode 100644 cognee/tasks/translation/providers/azure_provider.py create mode 100644 cognee/tasks/translation/providers/base.py create mode 100644 cognee/tasks/translation/providers/google_provider.py 
create mode 100644 cognee/tasks/translation/providers/openai_provider.py create mode 100644 cognee/tasks/translation/translate_content.py diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index ffc903d68..50071caef 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -26,6 +26,8 @@ from cognee.tasks.documents import ( from cognee.tasks.graph import extract_graph_from_data from cognee.tasks.storage import add_data_points from cognee.tasks.summarization import summarize_text +from cognee.tasks.translation import translate_content +from cognee.tasks.translation.config import TranslationProviderType from cognee.modules.pipelines.layers.pipeline_execution_mode import get_pipeline_executor from cognee.tasks.temporal_graph.extract_events_and_entities import extract_events_and_timestamps from cognee.tasks.temporal_graph.extract_knowledge_graph_from_events import ( @@ -53,6 +55,9 @@ async def cognify( custom_prompt: Optional[str] = None, temporal_cognify: bool = False, data_per_batch: int = 20, + auto_translate: bool = False, + target_language: str = "en", + translation_provider: TranslationProviderType = None, **kwargs, ): """ @@ -118,6 +123,15 @@ async def cognify( If provided, this prompt will be used instead of the default prompts for knowledge graph extraction. The prompt should guide the LLM on how to extract entities and relationships from the text content. + auto_translate: If True, automatically detect and translate non-English content to the + target language before processing. Uses language detection to identify + content that needs translation. Defaults to False. + target_language: Target language code for translation (e.g., "en", "es", "fr"). + Only used when auto_translate=True. Defaults to "en" (English). + translation_provider: Translation service to use ("openai", "google", "azure"). 
+ OpenAI uses the existing LLM infrastructure, Google requires + GOOGLE_TRANSLATE_API_KEY, Azure requires AZURE_TRANSLATOR_KEY. + If not specified, uses TRANSLATION_PROVIDER env var or defaults to "openai". Returns: Union[dict, list[PipelineRunInfo]]: @@ -182,6 +196,14 @@ async def cognify( run_in_background=True ) # Check status later with run_info.pipeline_run_id + + # Auto-translate multilingual content to English + await cognee.add("document_spanish.pdf") + await cognee.cognify( + auto_translate=True, + target_language="en", + translation_provider="openai" # or "google", "azure" + ) ``` @@ -193,6 +215,9 @@ async def cognify( - LLM_PROVIDER, LLM_MODEL, VECTOR_DB_PROVIDER, GRAPH_DATABASE_PROVIDER - LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False) - LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60) + - TRANSLATION_PROVIDER: Default translation provider ("openai", "google", "azure") + - GOOGLE_TRANSLATE_API_KEY: API key for Google Translate + - AZURE_TRANSLATOR_KEY: API key for Azure Translator """ if config is None: ontology_config = get_ontology_env_config() @@ -213,7 +238,13 @@ async def cognify( if temporal_cognify: tasks = await get_temporal_tasks( - user=user, chunker=chunker, chunk_size=chunk_size, chunks_per_batch=chunks_per_batch + user=user, + chunker=chunker, + chunk_size=chunk_size, + chunks_per_batch=chunks_per_batch, + auto_translate=auto_translate, + target_language=target_language, + translation_provider=translation_provider, ) else: tasks = await get_default_tasks( @@ -224,6 +255,9 @@ async def cognify( config=config, custom_prompt=custom_prompt, chunks_per_batch=chunks_per_batch, + auto_translate=auto_translate, + target_language=target_language, + translation_provider=translation_provider, **kwargs, ) @@ -253,6 +287,9 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's config: Config = None, custom_prompt: Optional[str] = None, chunks_per_batch: int = 100, + auto_translate: bool = False, + 
target_language: str = "en", + translation_provider: TranslationProviderType = None, **kwargs, ) -> list[Task]: if config is None: @@ -285,6 +322,20 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's max_chunk_size=chunk_size or get_max_chunk_tokens(), chunker=chunker, ), # Extract text chunks based on the document type. + ] + + # Add translation task if auto_translate is enabled + if auto_translate: + default_tasks.append( + Task( + translate_content, + target_language=target_language, + translation_provider=translation_provider, + task_config={"batch_size": chunks_per_batch}, + ) + ) + + default_tasks.extend([ Task( extract_graph_from_data, graph_model=graph_model, @@ -302,13 +353,19 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's embed_triplets=embed_triplets, task_config={"batch_size": chunks_per_batch}, ), - ] + ]) return default_tasks async def get_temporal_tasks( - user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = 10 + user: User = None, + chunker=TextChunker, + chunk_size: int = None, + chunks_per_batch: int = 10, + auto_translate: bool = False, + target_language: str = "en", + translation_provider: TranslationProviderType = None, ) -> list[Task]: """ Builds and returns a list of temporal processing tasks to be executed in sequence. @@ -316,15 +373,19 @@ async def get_temporal_tasks( The pipeline includes: 1. Document classification. 2. Document chunking with a specified or default chunk size. - 3. Event and timestamp extraction from chunks. - 4. Knowledge graph extraction from events. - 5. Batched insertion of data points. + 3. (Optional) Translation of non-English content to target language. + 4. Event and timestamp extraction from chunks. + 5. Knowledge graph extraction from events. + 6. Batched insertion of data points. Args: user (User, optional): The user requesting task execution. 
chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker. chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default. chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify + auto_translate (bool, optional): If True, translate non-English content. Defaults to False. + target_language (str, optional): Target language for translation. Defaults to "en". + translation_provider (str, optional): Translation provider to use ("openai", "google", "azure"). Returns: list[Task]: A list of Task objects representing the temporal processing pipeline. @@ -339,9 +400,23 @@ async def get_temporal_tasks( max_chunk_size=chunk_size or get_max_chunk_tokens(), chunker=chunker, ), + ] + + # Add translation task if auto_translate is enabled + if auto_translate: + temporal_tasks.append( + Task( + translate_content, + target_language=target_language, + translation_provider=translation_provider, + task_config={"batch_size": chunks_per_batch}, + ) + ) + + temporal_tasks.extend([ Task(extract_events_and_timestamps, task_config={"batch_size": chunks_per_batch}), Task(extract_knowledge_graph_from_events), Task(add_data_points, task_config={"batch_size": chunks_per_batch}), - ] + ]) return temporal_tasks diff --git a/cognee/infrastructure/llm/prompts/translate_content.txt b/cognee/infrastructure/llm/prompts/translate_content.txt new file mode 100644 index 000000000..759e83f31 --- /dev/null +++ b/cognee/infrastructure/llm/prompts/translate_content.txt @@ -0,0 +1,19 @@ +You are an expert translator with deep knowledge of languages, cultures, and linguistics. + +Your task is to: +1. Detect the source language of the provided text if not specified +2. Translate the text accurately to the target language +3. Preserve the original meaning, tone, and intent +4. 
Maintain proper grammar and natural phrasing in the target language + +Guidelines: +- Preserve technical terms, proper nouns, and specialized vocabulary appropriately +- Maintain formatting such as paragraphs, lists, and emphasis where applicable +- If the text contains code, URLs, or other non-translatable content, preserve them as-is +- Handle idioms and cultural references thoughtfully, adapting when necessary +- Ensure the translation reads naturally to a native speaker of the target language + +Provide the translation in a structured format with: +- The translated text +- The detected source language (ISO 639-1 code like "en", "es", "fr", "de", etc.) +- Any notes about the translation (optional, for ambiguous terms or cultural adaptations) diff --git a/cognee/tasks/translation/__init__.py b/cognee/tasks/translation/__init__.py new file mode 100644 index 000000000..b9836160c --- /dev/null +++ b/cognee/tasks/translation/__init__.py @@ -0,0 +1,96 @@ +""" +Translation task for Cognee. + +This module provides multilingual content translation capabilities, +allowing automatic detection and translation of non-English content +to a target language while preserving original text and metadata. 
+ +Main Components: +- translate_content: Main task function for translating document chunks +- translate_text: Convenience function for translating single texts +- batch_translate_texts: Batch translation for multiple texts +- detect_language: Language detection utility +- TranslatedContent: DataPoint model for translated content +- LanguageMetadata: DataPoint model for language information + +Supported Translation Providers: +- OpenAI (default): Uses GPT models via existing LLM infrastructure +- Google Translate: Requires google-cloud-translate package +- Azure Translator: Requires Azure Translator API key + +Example Usage: + ```python + from cognee.tasks.translation import translate_content, translate_text + + # Translate document chunks in a pipeline + translated_chunks = await translate_content( + chunks, + target_language="en", + translation_provider="openai" + ) + + # Translate a single text + result = await translate_text("Bonjour le monde!") + print(result.translated_text) # "Hello world!" 
class TranslationConfig(BaseSettings):
    """
    Configuration settings for the translation task.

    Values are loaded by pydantic-settings from the process environment and
    from a local ``.env`` file (see ``model_config`` below); unknown keys are
    accepted because ``extra="allow"``.

    Environment variables intended to configure these settings:
    - TRANSLATION_PROVIDER: The translation service to use
    - TRANSLATION_TARGET_LANGUAGE: Default target language
    - TRANSLATION_CONFIDENCE_THRESHOLD: Minimum confidence for language detection
    - GOOGLE_TRANSLATE_API_KEY: API key for Google Translate
    - AZURE_TRANSLATOR_KEY: API key for Azure Translator
    - AZURE_TRANSLATOR_REGION: Region for Azure Translator

    NOTE(review): no ``env_prefix`` or field alias is configured here, so the
    fields ``target_language`` and ``confidence_threshold`` are presumably
    populated from ``TARGET_LANGUAGE`` / ``CONFIDENCE_THRESHOLD`` rather than
    the ``TRANSLATION_``-prefixed names listed above -- confirm against the
    pydantic-settings naming rules and either add aliases or fix this list.
    """

    # Translation provider settings
    translation_provider: TranslationProviderType = "openai"
    target_language: str = "en"
    # Detections below this probability are not treated as reliable.
    confidence_threshold: float = 0.8

    # Google Translate settings
    google_translate_api_key: Optional[str] = None
    google_project_id: Optional[str] = None

    # Azure Translator settings
    azure_translator_key: Optional[str] = None
    azure_translator_region: Optional[str] = None
    azure_translator_endpoint: str = "https://api.cognitive.microsofttranslator.com"

    # OpenAI uses the existing LLM configuration

    # Performance settings
    batch_size: int = 10
    max_retries: int = 3
    timeout_seconds: int = 30

    # Language detection settings
    min_text_length_for_detection: int = 10
    skip_detection_for_short_text: bool = True

    model_config = SettingsConfigDict(env_file=".env", extra="allow")

    def to_dict(self) -> dict:
        """Return a plain dict with a subset of the settings.

        Only the provider, target language, confidence threshold, batch size
        and retry count are included; API keys, endpoints, timeouts and the
        detection settings are left out.
        """
        return {
            "translation_provider": self.translation_provider,
            "target_language": self.target_language,
            "confidence_threshold": self.confidence_threshold,
            "batch_size": self.batch_size,
            "max_retries": self.max_retries,
        }


# lru_cache on a zero-argument function means the first TranslationConfig
# built is reused for every later call; environment changes made after that
# first call are not picked up unless the cache is cleared.
@lru_cache
def get_translation_config() -> TranslationConfig:
    """Get the translation configuration singleton."""
    return TranslationConfig()
# ISO 639-1 language code to name mapping
LANGUAGE_NAMES = {
    "af": "Afrikaans",
    "ar": "Arabic",
    "bg": "Bulgarian",
    "bn": "Bengali",
    "ca": "Catalan",
    "cs": "Czech",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "es": "Spanish",
    "et": "Estonian",
    "fa": "Persian",
    "fi": "Finnish",
    "fr": "French",
    "gu": "Gujarati",
    "he": "Hebrew",
    "hi": "Hindi",
    "hr": "Croatian",
    "hu": "Hungarian",
    "id": "Indonesian",
    "it": "Italian",
    "ja": "Japanese",
    "kn": "Kannada",
    "ko": "Korean",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mr": "Marathi",
    "ne": "Nepali",
    "nl": "Dutch",
    "no": "Norwegian",
    "pa": "Punjabi",
    "pl": "Polish",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sk": "Slovak",
    "sl": "Slovenian",
    "so": "Somali",
    "sq": "Albanian",
    "sv": "Swedish",
    "sw": "Swahili",
    "ta": "Tamil",
    "te": "Telugu",
    "th": "Thai",
    "tl": "Tagalog",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "zh-cn": "Chinese (Simplified)",
    "zh-tw": "Chinese (Traditional)",
}


@dataclass
class LanguageDetectionResult:
    """Result of language detection."""

    language_code: str
    language_name: str
    confidence: float
    requires_translation: bool
    character_count: int


def get_language_name(language_code: str) -> str:
    """Get the human-readable name for a language code.

    The lookup is case-insensitive; codes missing from ``LANGUAGE_NAMES`` are
    returned unchanged.
    """
    return LANGUAGE_NAMES.get(language_code.lower(), language_code)


def detect_language(
    text: str,
    target_language: str = "en",
    confidence_threshold: Optional[float] = None,
) -> LanguageDetectionResult:
    """
    Detect the language of the given text.

    Detection is delegated to ``langdetect.detect_langs``; the highest-ranked
    candidate is reported.

    Args:
        text: The text to analyze
        target_language: The target language for translation comparison
        confidence_threshold: Minimum confidence to consider detection reliable.
            ``None`` falls back to the configured default; an explicit ``0.0``
            is honored.

    Returns:
        LanguageDetectionResult with language info and translation requirement.
        Empty or too-short text yields an "unknown" result with
        ``requires_translation=False`` when the configuration allows skipping.

    Raises:
        LanguageDetectionError: If langdetect is not installed, the text is too
            short and skipping is disabled, or detection itself fails.
    """
    config = get_translation_config()
    # Compare against None instead of using `or`, so a caller-supplied 0.0
    # is not silently replaced by the configured default.
    threshold = (
        config.confidence_threshold if confidence_threshold is None else confidence_threshold
    )

    # Computed once so the error path below never calls len() on None.
    character_count = len(text) if text else 0

    # Handle empty or very short text
    if not text or len(text.strip()) < config.min_text_length_for_detection:
        if config.skip_detection_for_short_text:
            return LanguageDetectionResult(
                language_code="unknown",
                language_name="Unknown",
                confidence=0.0,
                requires_translation=False,
                character_count=character_count,
            )
        raise LanguageDetectionError(
            f"Text too short for reliable language detection: {character_count} characters"
        )

    try:
        from langdetect import detect_langs, LangDetectException
    except ImportError as e:
        raise LanguageDetectionError(
            "langdetect is required for language detection. Install it with: pip install langdetect",
            original_error=e,
        ) from e

    # Only the library call is guarded, so our own LanguageDetectionError
    # below is not re-wrapped as an "unexpected" failure.
    # NOTE(review): langdetect documents its results as non-deterministic
    # unless DetectorFactory.seed is set -- confirm whether that matters here.
    try:
        # Get detection results with probabilities
        detections = detect_langs(text)
    except LangDetectException as e:
        logger.warning(f"Language detection failed: {e}")
        raise LanguageDetectionError(f"Language detection failed: {e}", original_error=e) from e
    except Exception as e:
        logger.error(f"Unexpected error during language detection: {e}")
        raise LanguageDetectionError(
            f"Unexpected error during language detection: {e}", original_error=e
        ) from e

    if not detections:
        raise LanguageDetectionError("No language detected")

    # Get the most likely language
    best_detection = detections[0]
    language_code = best_detection.lang
    confidence = best_detection.prob

    # Translation is requested only for a confident detection of a language
    # other than the target.
    requires_translation = (
        language_code.lower() != target_language.lower() and confidence >= threshold
    )

    return LanguageDetectionResult(
        language_code=language_code,
        language_name=get_language_name(language_code),
        confidence=confidence,
        requires_translation=requires_translation,
        character_count=len(text),
    )
class TranslationError(Exception):
    """Base exception for translation errors.

    Attributes:
        message: Human-readable description, also used as ``str(exc)``.
        original_error: The underlying exception that triggered this one, if any.
    """

    def __init__(self, message: str, original_error: Optional[Exception] = None):
        self.message = message
        self.original_error = original_error
        super().__init__(self.message)


class LanguageDetectionError(TranslationError):
    """Exception raised when language detection fails."""

    def __init__(
        self,
        message: str = "Failed to detect language",
        original_error: Optional[Exception] = None,
    ):
        super().__init__(message, original_error)


class TranslationProviderError(TranslationError):
    """Exception raised when the translation provider encounters an error.

    The provider name is stored on ``provider`` and prefixed to the message
    as ``[provider] message``.
    """

    def __init__(
        self,
        provider: str,
        message: str = "Translation provider error",
        original_error: Optional[Exception] = None,
    ):
        self.provider = provider
        full_message = f"[{provider}] {message}"
        super().__init__(full_message, original_error)


class UnsupportedLanguageError(TranslationError):
    """Exception raised when the language is not supported.

    When no ``message`` is given, one is built from ``language`` and, if
    present, ``provider``. A caller-supplied message is used as-is.
    """

    def __init__(
        self,
        language: str,
        provider: Optional[str] = None,
        message: Optional[str] = None,
        original_error: Optional[Exception] = None,
    ):
        self.language = language
        self.provider = provider
        if message is None:
            message = f"Language '{language}' is not supported"
            if provider:
                message += f" by {provider}"
        # original_error is accepted here so every error in the hierarchy can
        # carry its cause the same way.
        super().__init__(message, original_error)


class TranslationConfigError(TranslationError):
    """Exception raised when translation configuration is invalid."""

    def __init__(
        self,
        message: str = "Invalid translation configuration",
        original_error: Optional[Exception] = None,
    ):
        super().__init__(message, original_error)
def get_translation_provider(provider_name: str) -> "TranslationProvider":
    """
    Factory function to get the appropriate translation provider.

    The name is matched case-insensitively and surrounding whitespace is
    ignored. A new provider instance is created on every call.

    Args:
        provider_name: Name of the provider ("openai", "google", or "azure")

    Returns:
        TranslationProvider instance

    Raises:
        ValueError: If the provider name is missing, not a string, or not
            recognized
    """
    # Validate first so None / non-string input raises the documented
    # ValueError instead of an AttributeError from `.lower()`.
    if not isinstance(provider_name, str) or not provider_name.strip():
        raise ValueError(
            f"Translation provider name must be a non-empty string, got: {provider_name!r}"
        )

    providers = {
        "openai": OpenAITranslationProvider,
        "google": GoogleTranslationProvider,
        "azure": AzureTranslationProvider,
    }

    provider_class = providers.get(provider_name.strip().lower())
    if provider_class is None:
        raise ValueError(
            f"Unknown translation provider: {provider_name}. "
            f"Available providers: {list(providers.keys())}"
        )

    return provider_class()
+ + Args: + text: The text to translate + target_language: Target language code (default: "en") + source_language: Source language code (optional) + + Returns: + TranslationResult with translated text and metadata + """ + if not self.is_available(): + raise ValueError( + "Azure Translator API key not configured. " + "Set AZURE_TRANSLATOR_KEY environment variable." + ) + + endpoint = f"{self._config.azure_translator_endpoint}/translate" + + params = { + "api-version": "3.0", + "to": target_language, + } + if source_language: + params["from"] = source_language + + headers = { + "Ocp-Apim-Subscription-Key": self._config.azure_translator_key, + "Content-Type": "application/json", + } + if self._config.azure_translator_region: + headers["Ocp-Apim-Subscription-Region"] = self._config.azure_translator_region + + body = [{"text": text}] + + try: + async with aiohttp.ClientSession() as session: + async with session.post( + endpoint, + params=params, + headers=headers, + json=body, + timeout=aiohttp.ClientTimeout(total=self._config.timeout_seconds), + ) as response: + response.raise_for_status() + result = await response.json() + + translation = result[0]["translations"][0] + detected_language = result[0].get("detectedLanguage", {}) + + return TranslationResult( + translated_text=translation["text"], + source_language=source_language + or detected_language.get("language", "unknown"), + target_language=target_language, + confidence_score=detected_language.get("score", 0.9), + provider=self.provider_name, + raw_response=result[0], + ) + + except Exception as e: + logger.error(f"Azure translation failed: {e}") + raise + + async def translate_batch( + self, + texts: list[str], + target_language: str = "en", + source_language: Optional[str] = None, + ) -> list[TranslationResult]: + """ + Translate multiple texts using Azure Translator API. + + Azure Translator supports up to 100 texts per request. 
+ + Args: + texts: List of texts to translate + target_language: Target language code + source_language: Source language code (optional) + + Returns: + List of TranslationResult objects + """ + if not self.is_available(): + raise ValueError("Azure Translator API key not configured.") + + endpoint = f"{self._config.azure_translator_endpoint}/translate" + + params = { + "api-version": "3.0", + "to": target_language, + } + if source_language: + params["from"] = source_language + + headers = { + "Ocp-Apim-Subscription-Key": self._config.azure_translator_key, + "Content-Type": "application/json", + } + if self._config.azure_translator_region: + headers["Ocp-Apim-Subscription-Region"] = self._config.azure_translator_region + + # Azure supports up to 100 texts per request + batch_size = min(100, self._config.batch_size) + all_results = [] + + for i in range(0, len(texts), batch_size): + batch = texts[i : i + batch_size] + body = [{"text": text} for text in batch] + + try: + async with aiohttp.ClientSession() as session: + async with session.post( + endpoint, + params=params, + headers=headers, + json=body, + timeout=aiohttp.ClientTimeout(total=self._config.timeout_seconds), + ) as response: + response.raise_for_status() + results = await response.json() + + for result in results: + translation = result["translations"][0] + detected_language = result.get("detectedLanguage", {}) + + all_results.append( + TranslationResult( + translated_text=translation["text"], + source_language=source_language + or detected_language.get("language", "unknown"), + target_language=target_language, + confidence_score=detected_language.get("score", 0.9), + provider=self.provider_name, + raw_response=result, + ) + ) + + except Exception as e: + logger.error(f"Azure batch translation failed: {e}") + raise + + return all_results diff --git a/cognee/tasks/translation/providers/base.py b/cognee/tasks/translation/providers/base.py new file mode 100644 index 000000000..d8e5e981e --- /dev/null +++ 
b/cognee/tasks/translation/providers/base.py @@ -0,0 +1,69 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class TranslationResult: + """Result of a translation operation.""" + + translated_text: str + source_language: str + target_language: str + confidence_score: float + provider: str + raw_response: Optional[dict] = None + + +class TranslationProvider(ABC): + """Abstract base class for translation providers.""" + + @property + @abstractmethod + def provider_name(self) -> str: + """Return the name of this translation provider.""" + pass + + @abstractmethod + async def translate( + self, + text: str, + target_language: str = "en", + source_language: Optional[str] = None, + ) -> TranslationResult: + """ + Translate text to the target language. + + Args: + text: The text to translate + target_language: Target language code (default: "en") + source_language: Source language code (optional, will be auto-detected if not provided) + + Returns: + TranslationResult with translated text and metadata + """ + pass + + @abstractmethod + async def translate_batch( + self, + texts: list[str], + target_language: str = "en", + source_language: Optional[str] = None, + ) -> list[TranslationResult]: + """ + Translate multiple texts to the target language. 
class GoogleTranslationProvider(TranslationProvider):
    """
    Translation provider backed by the Google Cloud Translation API (v2 client).

    Requires the ``google-cloud-translate`` package. The client is created
    lazily via ``translate.Client()`` the first time it is needed.

    NOTE(review): the translation config is loaded in ``__init__`` but nothing
    from it is handed to the client here -- presumably credentials are picked
    up from the environment by the Google client itself; confirm.
    """

    def __init__(self):
        self._client = None
        self._config = get_translation_config()

    @property
    def provider_name(self) -> str:
        """Return the identifier of this provider."""
        return "google"

    def _get_client(self):
        """Create the Google Translate client on first use and cache it.

        Raises:
            ImportError: If ``google-cloud-translate`` is not installed.
            Exception: Whatever the client constructor raises (logged first).
        """
        if self._client is None:
            try:
                from google.cloud import translate_v2 as translate

                self._client = translate.Client()
            except ImportError as e:
                # Chain the cause so the original import failure stays visible.
                raise ImportError(
                    "google-cloud-translate is required for Google translation. "
                    "Install it with: pip install google-cloud-translate"
                ) from e
            except Exception as e:
                logger.error(f"Failed to initialize Google Translate client: {e}")
                raise
        return self._client

    def is_available(self) -> bool:
        """Return True when the client can be created, False otherwise."""
        try:
            self._get_client()
            return True
        except Exception:
            # Best-effort probe: any failure simply means "not available".
            return False

    async def _request(self, values, target_language: str, source_language: Optional[str]):
        """Call the synchronous client off the event loop and return its payload.

        ``values`` may be a single string or a list of strings; it is passed to
        ``client.translate`` unchanged. ``source_language`` is only forwarded
        when it is truthy.
        """
        client = self._get_client()

        options = {"target_language": target_language}
        if source_language:
            options["source_language"] = source_language

        # google-cloud-translate is synchronous, so run it in the default
        # executor. We are inside a coroutine, hence get_running_loop().
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, lambda: client.translate(values, **options))

    def _build_result(
        self, payload: dict, target_language: str, source_language: Optional[str]
    ) -> TranslationResult:
        """Convert one response item from the client into a TranslationResult."""
        return TranslationResult(
            translated_text=payload["translatedText"],
            source_language=payload.get("detectedSourceLanguage", source_language or "unknown"),
            target_language=target_language,
            # Fixed value: no confidence is read from the response here.
            confidence_score=0.9,
            provider=self.provider_name,
            raw_response=payload,
        )

    async def translate(
        self,
        text: str,
        target_language: str = "en",
        source_language: Optional[str] = None,
    ) -> TranslationResult:
        """
        Translate text using Google Translate API.

        Args:
            text: The text to translate
            target_language: Target language code (default: "en")
            source_language: Source language code (optional)

        Returns:
            TranslationResult with translated text and metadata

        Raises:
            Exception: Any client or transport error, re-raised after logging.
        """
        try:
            payload = await self._request(text, target_language, source_language)
            return self._build_result(payload, target_language, source_language)
        except Exception as e:
            logger.error(f"Google translation failed: {e}")
            raise

    async def translate_batch(
        self,
        texts: list[str],
        target_language: str = "en",
        source_language: Optional[str] = None,
    ) -> list[TranslationResult]:
        """
        Translate multiple texts using Google Translate API.

        The whole list is sent to the client in a single call.

        Args:
            texts: List of texts to translate
            target_language: Target language code
            source_language: Source language code (optional)

        Returns:
            List of TranslationResult objects, one per response item.
            An empty input yields an empty list without contacting the API.

        Raises:
            Exception: Any client or transport error, re-raised after logging.
        """
        if not texts:
            return []

        try:
            payloads = await self._request(texts, target_language, source_language)
            return [
                self._build_result(payload, target_language, source_language)
                for payload in payloads
            ]
        except Exception as e:
            logger.error(f"Google batch translation failed: {e}")
            raise
async def translate(
    self,
    text: str,
    target_language: str = "en",
    source_language: Optional[str] = None,
) -> TranslationResult:
    """
    Translate ``text`` through the LLM gateway using the translation prompt.

    Args:
        text: The text to translate
        target_language: Target language code (default: "en")
        source_language: Source language code; when omitted the model is
            asked to detect it and its answer is reported in the result

    Returns:
        TranslationResult with translated text and metadata

    Raises:
        Exception: Any failure while reading the prompt or calling the LLM,
            re-raised after being logged.
    """
    try:
        prompt = read_query_prompt("translate_content.txt")

        # The instruction differs only in its first sentence; the payload
        # section that follows is identical for both cases.
        instruction = (
            f"Translate the following text from {source_language} to {target_language}."
            if source_language
            else (
                f"Translate the following text to {target_language}. "
                f"First detect the source language."
            )
        )
        request_text = f"{instruction}\n\nText to translate:\n{text}"

        output = await LLMGateway.acreate_structured_output(
            text_input=request_text,
            system_prompt=prompt,
            response_model=TranslationOutput,
        )

        # A caller-supplied source language wins over the model's detection.
        resolved_source = source_language or output.detected_source_language

        return TranslationResult(
            translated_text=output.translated_text,
            source_language=resolved_source,
            target_language=target_language,
            confidence_score=0.95,  # fixed score assigned to LLM translations
            provider=self.provider_name,
            raw_response={"notes": output.translation_notes},
        )

    except Exception as e:
        logger.error(f"OpenAI translation failed: {e}")
        raise
async def translate_content(
    data_chunks: List[DocumentChunk],
    target_language: str = "en",
    translation_provider: Optional[TranslationProviderType] = None,
    confidence_threshold: Optional[float] = None,
    skip_if_target_language: bool = True,
    preserve_original: bool = True,
) -> List[DocumentChunk]:
    """
    Translate chunks that are not in the target language.

    For every chunk with text, the language is detected; chunks that need
    translation are sent to the selected provider one at a time. A translated
    chunk is modified in place: ``chunk.text`` is replaced by the translation
    and a ``LanguageMetadata`` plus a ``TranslatedContent`` data point are
    appended to ``chunk.contains``. Chunks that are skipped only receive the
    ``LanguageMetadata``.

    Failures are handled per chunk: the error is logged and the chunk is
    returned unchanged, so one bad chunk never aborts the batch.

    Args:
        data_chunks: List of DocumentChunk objects to process
        target_language: Target language code (default: "en" for English)
        translation_provider: Translation service to use ("openai", "google", "azure").
            If not provided, uses config default
        confidence_threshold: Minimum confidence for language detection (0.0 to 1.0).
            If None, uses config default; an explicit 0.0 is respected
        skip_if_target_language: If True, skip chunks already in target language
        preserve_original: If True, store original text in TranslatedContent

    Returns:
        The same chunks, in input order. Chunks that required translation
        have TranslatedContent objects in their 'contains' list.

    Raises:
        TranslationError: If ``data_chunks`` is not a list.

    Example:
        ```python
        from cognee.tasks.translation import translate_content

        # Translate chunks using default settings
        translated_chunks = await translate_content(chunks)

        # Translate with specific provider
        translated_chunks = await translate_content(
            chunks,
            translation_provider="openai",
            confidence_threshold=0.9
        )
        ```
    """
    if not isinstance(data_chunks, list):
        raise TranslationError("data_chunks must be a list")

    if not data_chunks:
        return data_chunks

    # Get configuration
    config = get_translation_config()
    provider_name = translation_provider or config.translation_provider
    # Compare against None: `or` would silently replace an explicit 0.0
    # threshold with the configured default.
    threshold = (
        config.confidence_threshold if confidence_threshold is None else confidence_threshold
    )

    logger.info(
        f"Starting translation task for {len(data_chunks)} chunks "
        f"using {provider_name} provider, target language: {target_language}"
    )

    # Get the translation provider
    provider = get_translation_provider(provider_name)

    processed_chunks = []

    for chunk in data_chunks:
        # Nothing to detect or translate for chunks without text.
        if not getattr(chunk, "text", None):
            processed_chunks.append(chunk)
            continue

        try:
            detection = await detect_language_async(chunk.text, target_language, threshold)

            language_metadata = LanguageMetadata(
                id=uuid5(chunk.id, "LanguageMetadata"),
                content_id=chunk.id,
                detected_language=detection.language_code,
                language_confidence=detection.confidence,
                requires_translation=detection.requires_translation,
                character_count=detection.character_count,
                language_name=detection.language_name,
            )

            # Skip if already in target language
            if not detection.requires_translation and skip_if_target_language:
                logger.debug(
                    f"Skipping chunk {chunk.id}: already in target language "
                    f"({detection.language_code})"
                )
                _add_to_chunk_contains(chunk, language_metadata)
                processed_chunks.append(chunk)
                continue

            logger.debug(
                f"Translating chunk {chunk.id} from {detection.language_code} to {target_language}"
            )

            translation_result = await provider.translate(
                text=chunk.text,
                target_language=target_language,
                source_language=detection.language_code,
            )

            # An empty translation of non-empty text would wipe the chunk's
            # content; keep the original text and record only the detection.
            if not translation_result.translated_text:
                logger.warning(
                    f"Provider returned empty translation for chunk {chunk.id}; "
                    f"keeping original text"
                )
                _add_to_chunk_contains(chunk, language_metadata)
                processed_chunks.append(chunk)
                continue

            # Built before chunk.text is overwritten so original_text still
            # holds the untranslated content.
            translated_content = TranslatedContent(
                id=uuid5(chunk.id, "TranslatedContent"),
                original_chunk_id=chunk.id,
                original_text=chunk.text if preserve_original else "",
                translated_text=translation_result.translated_text,
                source_language=translation_result.source_language,
                target_language=translation_result.target_language,
                translation_provider=translation_result.provider,
                confidence_score=translation_result.confidence_score,
                translated_from=chunk,
            )

            # Downstream tasks read chunk.text, so they see the translation.
            chunk.text = translation_result.translated_text

            _add_to_chunk_contains(chunk, language_metadata)
            _add_to_chunk_contains(chunk, translated_content)

            processed_chunks.append(chunk)

            logger.debug(
                f"Successfully translated chunk {chunk.id}: "
                f"{detection.language_code} -> {target_language}"
            )

        except LanguageDetectionError as e:
            logger.warning(f"Language detection failed for chunk {chunk.id}: {e}")
            processed_chunks.append(chunk)
        except TranslationError as e:
            logger.error(f"Translation failed for chunk {chunk.id}: {e}")
            processed_chunks.append(chunk)
        except Exception as e:
            # Deliberate best-effort: keep the pipeline going with the
            # untranslated chunk rather than failing the whole batch.
            logger.error(f"Unexpected error processing chunk {chunk.id}: {e}")
            processed_chunks.append(chunk)

    logger.info(f"Translation task completed for {len(processed_chunks)} chunks")
    return processed_chunks
async def batch_translate_texts(
    texts: List[str],
    target_language: str = "en",
    translation_provider: TranslationProviderType = None,
    source_language: Optional[str] = None,
) -> List[TranslationResult]:
    """
    Translate several strings with one call to the provider's batch method.

    The provider is the one named by ``translation_provider`` or, when that
    is not given, the one named in the translation config. The texts are
    handed to its ``translate_batch`` unchanged.

    Args:
        texts: List of texts to translate
        target_language: Target language code (default: "en")
        translation_provider: Translation service to use
        source_language: Source language code (optional)

    Returns:
        List of TranslationResult objects

    Example:
        ```python
        from cognee.tasks.translation import batch_translate_texts

        results = await batch_translate_texts(
            ["Hola", "¿Cómo estás?", "Adiós"],
            target_language="en"
        )
        for result in results:
            print(f"{result.source_language}: {result.translated_text}")
        ```
    """
    settings = get_translation_config()
    chosen_name = translation_provider if translation_provider else settings.translation_provider
    backend = get_translation_provider(chosen_name)

    results = await backend.translate_batch(
        texts=texts,
        target_language=target_language,
        source_language=source_language,
    )
    return results
relationships, only removed from index_fields metadata --- cognee/tasks/translation/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cognee/tasks/translation/models.py b/cognee/tasks/translation/models.py index 12854c965..da5007312 100644 --- a/cognee/tasks/translation/models.py +++ b/cognee/tasks/translation/models.py @@ -37,7 +37,7 @@ class TranslatedContent(DataPoint): translation_timestamp: datetime = None translated_from: Optional[DocumentChunk] = None - metadata: dict = {"index_fields": ["source_language", "original_chunk_id", "translated_text"]} + metadata: dict = {"index_fields": ["source_language", "translated_text"]} def __init__(self, **data): if data.get("translation_timestamp") is None: @@ -69,4 +69,4 @@ class LanguageMetadata(DataPoint): character_count: int language_name: Optional[str] = None - metadata: dict = {"index_fields": ["detected_language", "content_id"]} + metadata: dict = {"index_fields": ["detected_language"]} From 00e318b3ed2aedcac8ef8fc5dd5f6e686b709cff Mon Sep 17 00:00:00 2001 From: andikarachman Date: Fri, 2 Jan 2026 18:56:53 +0700 Subject: [PATCH 03/16] test: add comprehensive translation module tests - Add unit tests for translation configuration, language detection, providers, and translate_content task - Add integration tests for full cognify pipeline with translation - All 40 tests passing (32 unit + 8 integration) - Tests use asyncio.run() pattern matching project style - Tests named with *_test.py suffix per project convention - Update README with test documentation Formatting changes: - Apply ruff format to cognify.py (bracket placement style) Signed-off-by: andikarachman --- cognee/api/v1/cognify/cognify.py | 52 ++-- cognee/tasks/translation/exceptions.py | 4 +- .../translation/providers/azure_provider.py | 3 +- .../translation/providers/openai_provider.py | 4 +- cognee/tasks/translation/translate_content.py | 7 +- cognee/tests/tasks/translation/README.md | 126 +++++++++ 
cognee/tests/tasks/translation/__init__.py | 1 + cognee/tests/tasks/translation/config_test.py | 66 +++++ .../tasks/translation/detect_language_test.py | 147 ++++++++++ .../tasks/translation/integration_test.py | 255 +++++++++++++++++ .../tests/tasks/translation/providers_test.py | 201 ++++++++++++++ .../translation/translate_content_test.py | 256 ++++++++++++++++++ 12 files changed, 1087 insertions(+), 35 deletions(-) create mode 100644 cognee/tests/tasks/translation/README.md create mode 100644 cognee/tests/tasks/translation/__init__.py create mode 100644 cognee/tests/tasks/translation/config_test.py create mode 100644 cognee/tests/tasks/translation/detect_language_test.py create mode 100644 cognee/tests/tasks/translation/integration_test.py create mode 100644 cognee/tests/tasks/translation/providers_test.py create mode 100644 cognee/tests/tasks/translation/translate_content_test.py diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 50071caef..1b50b6d2f 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -335,25 +335,27 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's ) ) - default_tasks.extend([ - Task( - extract_graph_from_data, - graph_model=graph_model, - config=config, - custom_prompt=custom_prompt, - task_config={"batch_size": chunks_per_batch}, - **kwargs, - ), # Generate knowledge graphs from the document chunks. - Task( - summarize_text, - task_config={"batch_size": chunks_per_batch}, - ), - Task( - add_data_points, - embed_triplets=embed_triplets, - task_config={"batch_size": chunks_per_batch}, - ), - ]) + default_tasks.extend( + [ + Task( + extract_graph_from_data, + graph_model=graph_model, + config=config, + custom_prompt=custom_prompt, + task_config={"batch_size": chunks_per_batch}, + **kwargs, + ), # Generate knowledge graphs from the document chunks. 
+ Task( + summarize_text, + task_config={"batch_size": chunks_per_batch}, + ), + Task( + add_data_points, + embed_triplets=embed_triplets, + task_config={"batch_size": chunks_per_batch}, + ), + ] + ) return default_tasks @@ -413,10 +415,12 @@ async def get_temporal_tasks( ) ) - temporal_tasks.extend([ - Task(extract_events_and_timestamps, task_config={"batch_size": chunks_per_batch}), - Task(extract_knowledge_graph_from_events), - Task(add_data_points, task_config={"batch_size": chunks_per_batch}), - ]) + temporal_tasks.extend( + [ + Task(extract_events_and_timestamps, task_config={"batch_size": chunks_per_batch}), + Task(extract_knowledge_graph_from_events), + Task(add_data_points, task_config={"batch_size": chunks_per_batch}), + ] + ) return temporal_tasks diff --git a/cognee/tasks/translation/exceptions.py b/cognee/tasks/translation/exceptions.py index 322e00c7a..ba5e74510 100644 --- a/cognee/tasks/translation/exceptions.py +++ b/cognee/tasks/translation/exceptions.py @@ -10,7 +10,9 @@ class TranslationError(Exception): class LanguageDetectionError(TranslationError): """Exception raised when language detection fails.""" - def __init__(self, message: str = "Failed to detect language", original_error: Exception = None): + def __init__( + self, message: str = "Failed to detect language", original_error: Exception = None + ): super().__init__(message, original_error) diff --git a/cognee/tasks/translation/providers/azure_provider.py b/cognee/tasks/translation/providers/azure_provider.py index 4618834ff..2ee1f45d7 100644 --- a/cognee/tasks/translation/providers/azure_provider.py +++ b/cognee/tasks/translation/providers/azure_provider.py @@ -89,8 +89,7 @@ class AzureTranslationProvider(TranslationProvider): return TranslationResult( translated_text=translation["text"], - source_language=source_language - or detected_language.get("language", "unknown"), + source_language=source_language or detected_language.get("language", "unknown"), target_language=target_language, 
confidence_score=detected_language.get("score", 0.9), provider=self.provider_name, diff --git a/cognee/tasks/translation/providers/openai_provider.py b/cognee/tasks/translation/providers/openai_provider.py index 2c70c6edb..a888d688e 100644 --- a/cognee/tasks/translation/providers/openai_provider.py +++ b/cognee/tasks/translation/providers/openai_provider.py @@ -101,7 +101,5 @@ class OpenAITranslationProvider(TranslationProvider): Returns: List of TranslationResult objects """ - tasks = [ - self.translate(text, target_language, source_language) for text in texts - ] + tasks = [self.translate(text, target_language, source_language) for text in texts] return await asyncio.gather(*tasks) diff --git a/cognee/tasks/translation/translate_content.py b/cognee/tasks/translation/translate_content.py index e200f659d..1c869b132 100644 --- a/cognee/tasks/translation/translate_content.py +++ b/cognee/tasks/translation/translate_content.py @@ -88,9 +88,7 @@ async def translate_content( try: # Detect language - detection = await detect_language_async( - chunk.text, target_language, threshold - ) + detection = await detect_language_async(chunk.text, target_language, threshold) # Create language metadata language_metadata = LanguageMetadata( @@ -117,8 +115,7 @@ async def translate_content( # Translate the content logger.debug( - f"Translating chunk {chunk.id} from {detection.language_code} " - f"to {target_language}" + f"Translating chunk {chunk.id} from {detection.language_code} to {target_language}" ) translation_result = await provider.translate( diff --git a/cognee/tests/tasks/translation/README.md b/cognee/tests/tasks/translation/README.md new file mode 100644 index 000000000..cb56bf18a --- /dev/null +++ b/cognee/tests/tasks/translation/README.md @@ -0,0 +1,126 @@ +# Translation Task Tests + +Unit and integration tests for the multilingual content translation feature. 
+ +## Test Files + +- **config_test.py** - Tests for translation configuration + - Default configuration + - Provider type validation + - Confidence threshold bounds + - Multiple provider API keys + +- **detect_language_test.py** - Tests for language detection functionality + - English, Spanish, French, German, Chinese detection + - Confidence thresholds + - Edge cases (empty text, short text, mixed languages) + +- **providers_test.py** - Tests for translation provider implementations + - OpenAI provider basic translation + - Auto-detection of source language + - Batch translation + - Special characters and formatting preservation + - Error handling + +- **translate_content_test.py** - Tests for the main translate_content task + - Basic translation workflow + - Original text preservation + - Multiple chunks processing + - Language metadata creation + - Skip translation for target language + - Confidence threshold customization + +- **integration_test.py** - End-to-end integration tests + - Full cognify pipeline with translation + - Spanish/French to English translation + - Mixed language datasets + - Search functionality after translation + - Translation disabled mode + +## Running Tests + +### Run all translation tests +```bash +uv run pytest cognee/tests/tasks/translation/ -v +``` + +### Run specific test file +```bash +uv run pytest cognee/tests/tasks/translation/detect_language_test.py -v +``` + +### Run tests directly (without pytest) +```bash +uv run python cognee/tests/tasks/translation/config_test.py +uv run python cognee/tests/tasks/translation/detect_language_test.py +uv run python cognee/tests/tasks/translation/providers_test.py +uv run python cognee/tests/tasks/translation/translate_content_test.py +uv run python cognee/tests/tasks/translation/integration_test.py +``` + +### Run all tests at once +```bash +for f in cognee/tests/tasks/translation/*_test.py; do uv run python "$f"; done +``` + +### Run with coverage +```bash +uv run pytest 
cognee/tests/tasks/translation/ --cov=cognee.tasks.translation --cov-report=html +``` + +## Prerequisites + +- LLM API key set in environment: `LLM_API_KEY=your_key` +- Tests will be skipped if no API key is available + +## Test Summary + +| Test File | Tests | Description | +|-----------|-------|-------------| +| config_test.py | 4 | Configuration validation | +| detect_language_test.py | 10 | Language detection | +| providers_test.py | 9 | Translation providers | +| translate_content_test.py | 9 | Content translation task | +| integration_test.py | 8 | End-to-end pipeline | +| **Total** | **40** | | + +## Test Categories + +### Configuration (4 tests) +- ✅ Default configuration values +- ✅ Provider type literal validation +- ✅ Confidence threshold bounds +- ✅ Multiple provider API keys + +### Language Detection (10 tests) +- ✅ Multiple language detection (EN, ES, FR, DE, ZH) +- ✅ Confidence scoring +- ✅ Target language matching +- ✅ Short and empty text handling +- ✅ Mixed language detection + +### Translation Providers (9 tests) +- ✅ Provider factory function +- ✅ OpenAI translation +- ✅ Batch operations +- ✅ Auto source language detection +- ✅ Long text handling +- ✅ Special characters preservation +- ✅ Error handling + +### Content Translation (9 tests) +- ✅ DocumentChunk processing +- ✅ Metadata creation (LanguageMetadata, TranslatedContent) +- ✅ Original text preservation +- ✅ Multiple chunk handling +- ✅ Empty text/list handling +- ✅ Confidence threshold customization + +### Integration (8 tests) +- ✅ Full cognify pipeline with auto_translate=True +- ✅ Spanish to English translation +- ✅ French to English translation +- ✅ Mixed language datasets +- ✅ Translation disabled mode +- ✅ Direct translate_text function +- ✅ Search after translation diff --git a/cognee/tests/tasks/translation/__init__.py b/cognee/tests/tasks/translation/__init__.py new file mode 100644 index 000000000..7284dcfa5 --- /dev/null +++ b/cognee/tests/tasks/translation/__init__.py @@ -0,0 
+1 @@ +"""Translation task tests""" diff --git a/cognee/tests/tasks/translation/config_test.py b/cognee/tests/tasks/translation/config_test.py new file mode 100644 index 000000000..ee8d6019c --- /dev/null +++ b/cognee/tests/tasks/translation/config_test.py @@ -0,0 +1,66 @@ +""" +Unit tests for translation configuration +""" + +import os +from typing import get_args +from cognee.tasks.translation.config import ( + get_translation_config, + TranslationConfig, + TranslationProviderType, +) + + +def test_default_translation_config(): + """Test default translation configuration""" + config = get_translation_config() + + assert isinstance(config, TranslationConfig) + assert config.translation_provider in ["openai", "google", "azure"] + assert 0.0 <= config.confidence_threshold <= 1.0 + + +def test_translation_provider_type_literal(): + """Test TranslationProviderType Literal type values""" + # Get the allowed values from the Literal type + allowed_values = get_args(TranslationProviderType) + + assert "openai" in allowed_values + assert "google" in allowed_values + assert "azure" in allowed_values + assert len(allowed_values) == 3 + + +def test_confidence_threshold_bounds(): + """Test confidence threshold validation""" + config = TranslationConfig(translation_provider="openai", confidence_threshold=0.9) + + assert 0.0 <= config.confidence_threshold <= 1.0 + + +def test_multiple_provider_keys(): + """Test configuration with multiple provider API keys""" + config = TranslationConfig( + translation_provider="openai", + google_translate_api_key="google_key", + azure_translator_key="azure_key", + ) + + assert config.google_translate_api_key == "google_key" + assert config.azure_translator_key == "azure_key" + + +if __name__ == "__main__": + test_default_translation_config() + print("✓ test_default_translation_config passed") + + test_translation_provider_type_literal() + print("✓ test_translation_provider_type_literal passed") + + test_confidence_threshold_bounds() + print("✓ 
test_confidence_threshold_bounds passed") + + test_multiple_provider_keys() + print("✓ test_multiple_provider_keys passed") + + print("\nAll config tests passed!") diff --git a/cognee/tests/tasks/translation/detect_language_test.py b/cognee/tests/tasks/translation/detect_language_test.py new file mode 100644 index 000000000..907c94df8 --- /dev/null +++ b/cognee/tests/tasks/translation/detect_language_test.py @@ -0,0 +1,147 @@ +""" +Unit tests for language detection functionality +""" + +import asyncio +from cognee.tasks.translation.detect_language import ( + detect_language_async, + LanguageDetectionResult, +) +from cognee.tasks.translation.exceptions import LanguageDetectionError + + +async def test_detect_english(): + """Test detection of English text""" + result = await detect_language_async("Hello world, this is a test.", target_language="en") + + assert result.language_code == "en" + assert result.requires_translation is False + assert result.confidence > 0.9 + assert result.language_name == "English" + + +async def test_detect_spanish(): + """Test detection of Spanish text""" + result = await detect_language_async("Hola mundo, esta es una prueba.", target_language="en") + + assert result.language_code == "es" + assert result.requires_translation is True + assert result.confidence > 0.9 + assert result.language_name == "Spanish" + + +async def test_detect_french(): + """Test detection of French text""" + result = await detect_language_async( + "Bonjour le monde, ceci est un test.", target_language="en" + ) + + assert result.language_code == "fr" + assert result.requires_translation is True + assert result.confidence > 0.9 + assert result.language_name == "French" + + +async def test_detect_german(): + """Test detection of German text""" + result = await detect_language_async("Hallo Welt, das ist ein Test.", target_language="en") + + assert result.language_code == "de" + assert result.requires_translation is True + assert result.confidence > 0.9 + + +async def 
test_detect_chinese(): + """Test detection of Chinese text""" + result = await detect_language_async("你好世界,这是一个测试。", target_language="en") + + assert result.language_code == "zh-cn" + assert result.requires_translation is True + assert result.confidence > 0.9 + + +async def test_already_target_language(): + """Test when text is already in target language""" + result = await detect_language_async("This text is already in English.", target_language="en") + + assert result.requires_translation is False + + +async def test_short_text(): + """Test detection with very short text""" + result = await detect_language_async("Hi", target_language="es") + + # Short text may return 'unknown' if langdetect can't reliably detect + assert result.language_code in ["en", "unknown"] + assert result.character_count == 2 + + +async def test_empty_text(): + """Test detection with empty text - returns unknown by default""" + result = await detect_language_async("", target_language="en") + + # With skip_detection_for_short_text=True (default), returns unknown + assert result.language_code == "unknown" + assert result.language_name == "Unknown" + assert result.confidence == 0.0 + assert result.requires_translation is False + assert result.character_count == 0 + + +async def test_confidence_threshold(): + """Test detection respects confidence threshold""" + result = await detect_language_async( + "Hello world", target_language="es", confidence_threshold=0.5 + ) + + assert result.confidence >= 0.5 + + +async def test_mixed_language_text(): + """Test detection with mixed language text (predominantly one language)""" + # Predominantly Spanish with English word + result = await detect_language_async( + "La inteligencia artificial es muy importante en technology moderna.", target_language="en" + ) + + assert result.language_code == "es" # Should detect as Spanish + assert result.requires_translation is True + + +async def main(): + """Run all language detection tests""" + await 
test_detect_english() + print("✓ test_detect_english passed") + + await test_detect_spanish() + print("✓ test_detect_spanish passed") + + await test_detect_french() + print("✓ test_detect_french passed") + + await test_detect_german() + print("✓ test_detect_german passed") + + await test_detect_chinese() + print("✓ test_detect_chinese passed") + + await test_already_target_language() + print("✓ test_already_target_language passed") + + await test_short_text() + print("✓ test_short_text passed") + + await test_empty_text() + print("✓ test_empty_text passed") + + await test_confidence_threshold() + print("✓ test_confidence_threshold passed") + + await test_mixed_language_text() + print("✓ test_mixed_language_text passed") + + print("\nAll language detection tests passed!") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/tasks/translation/integration_test.py b/cognee/tests/tasks/translation/integration_test.py new file mode 100644 index 000000000..98fcae5d5 --- /dev/null +++ b/cognee/tests/tasks/translation/integration_test.py @@ -0,0 +1,255 @@ +""" +Integration tests for multilingual content translation feature. + +Tests the full cognify pipeline with translation enabled. +""" + +import asyncio +import os + +from cognee import add, cognify, prune, search, SearchType +from cognee.tasks.translation import translate_text +from cognee.tasks.translation.detect_language import detect_language_async + + +def has_openai_key(): + """Check if OpenAI API key is available""" + return bool(os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY")) + + +async def test_quick_translation(): + """Quick smoke test for translation feature""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + await prune.prune_data() + await prune.prune_system(metadata=True) + + spanish_text = "La inteligencia artificial está transformando el mundo de la tecnología." 
+ await add(spanish_text, dataset_name="spanish_test") + + result = await cognify( + datasets=["spanish_test"], + auto_translate=True, + target_language="en", + translation_provider="openai", + ) + + assert result is not None + + +async def test_translation_basic(): + """Test basic translation functionality with English text""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + await prune.prune_data() + await prune.prune_system(metadata=True) + + english_text = "Hello, this is a test document about artificial intelligence." + await add(english_text, dataset_name="test_english") + + result = await cognify( + datasets=["test_english"], + auto_translate=True, + target_language="en", + translation_provider="openai", + ) + + assert result is not None + + search_results = await search( + query_text="What is this document about?", + query_type=SearchType.SUMMARIES, + ) + assert search_results is not None + + +async def test_translation_spanish(): + """Test translation with Spanish text""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + await prune.prune_data() + await prune.prune_system(metadata=True) + + spanish_text = """ + La inteligencia artificial es una rama de la informática que se centra en + crear sistemas capaces de realizar tareas que normalmente requieren inteligencia humana. + Estos sistemas pueden aprender, razonar y resolver problemas complejos. 
+ """ + + await add(spanish_text, dataset_name="test_spanish") + + result = await cognify( + datasets=["test_spanish"], + auto_translate=True, + target_language="en", + translation_provider="openai", + ) + + assert result is not None + + search_results = await search( + query_text="What is artificial intelligence?", + query_type=SearchType.SUMMARIES, + ) + assert search_results is not None + + +async def test_translation_french(): + """Test translation with French text""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + await prune.prune_data() + await prune.prune_system(metadata=True) + + french_text = """ + L'apprentissage automatique est une méthode d'analyse de données qui + automatise la construction de modèles analytiques. C'est une branche + de l'intelligence artificielle basée sur l'idée que les systèmes peuvent + apprendre à partir de données, identifier des modèles et prendre des décisions. + """ + + await add(french_text, dataset_name="test_french") + + result = await cognify( + datasets=["test_french"], + auto_translate=True, + target_language="en", + ) + + assert result is not None + + search_results = await search( + query_text="What is machine learning?", + query_type=SearchType.SUMMARIES, + ) + assert search_results is not None + + +async def test_translation_disabled(): + """Test that cognify works without translation""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + await prune.prune_data() + await prune.prune_system(metadata=True) + + text = "This is a baseline test without translation enabled." 
+ await add(text, dataset_name="test_baseline") + + result = await cognify( + datasets=["test_baseline"], + auto_translate=False, + ) + + assert result is not None + + +async def test_translation_mixed_languages(): + """Test with multiple documents in different languages""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + await prune.prune_data() + await prune.prune_system(metadata=True) + + texts = [ + "Artificial intelligence is transforming the world.", + "La tecnología está cambiando nuestras vidas.", + "Les ordinateurs deviennent de plus en plus puissants.", + ] + + for text in texts: + await add(text, dataset_name="test_mixed") + + result = await cognify( + datasets=["test_mixed"], + auto_translate=True, + target_language="en", + ) + + assert result is not None + + search_results = await search( + query_text="What topics are discussed?", + query_type=SearchType.SUMMARIES, + ) + assert search_results is not None + + +async def test_direct_translation_function(): + """Test the translate_text convenience function directly""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + result = await translate_text( + text="Hola, ¿cómo estás? 
Espero que tengas un buen día.", + target_language="en", + translation_provider="openai", + ) + + assert result.translated_text is not None + assert result.translated_text != "" + assert result.target_language == "en" + assert result.provider == "openai" + + +async def test_language_detection(): + """Test language detection directly""" + test_texts = [ + ("Hello world, how are you doing today?", "en", False), + ("Bonjour le monde, comment allez-vous aujourd'hui?", "en", True), + ("Hola mundo, cómo estás hoy?", "en", True), + ("This is already in English language", "en", False), + ] + + for text, target_lang, should_translate in test_texts: + result = await detect_language_async(text, target_lang) + assert result.language_code is not None + assert result.confidence > 0.0 + # Only check requires_translation for high-confidence detections + if result.confidence > 0.8: + assert result.requires_translation == should_translate + + +async def main(): + """Run all translation integration tests""" + await test_quick_translation() + print("✓ test_quick_translation passed") + + await test_language_detection() + print("✓ test_language_detection passed") + + await test_direct_translation_function() + print("✓ test_direct_translation_function passed") + + await test_translation_basic() + print("✓ test_translation_basic passed") + + await test_translation_spanish() + print("✓ test_translation_spanish passed") + + await test_translation_french() + print("✓ test_translation_french passed") + + await test_translation_disabled() + print("✓ test_translation_disabled passed") + + await test_translation_mixed_languages() + print("✓ test_translation_mixed_languages passed") + + print("\nAll translation integration tests passed!") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/tasks/translation/providers_test.py b/cognee/tests/tasks/translation/providers_test.py new file mode 100644 index 000000000..5be88a5ed --- /dev/null +++ 
b/cognee/tests/tasks/translation/providers_test.py @@ -0,0 +1,201 @@ +""" +Unit tests for translation providers +""" + +import asyncio +import os +from cognee.tasks.translation.providers import ( + get_translation_provider, + OpenAITranslationProvider, + TranslationResult, +) +from cognee.tasks.translation.exceptions import TranslationError + + +def has_openai_key(): + """Check if OpenAI API key is available""" + return bool(os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY")) + + +async def test_openai_provider_basic_translation(): + """Test basic translation with OpenAI provider""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + provider = OpenAITranslationProvider() + + result = await provider.translate(text="Hola mundo", target_language="en", source_language="es") + + assert isinstance(result, TranslationResult) + assert result.translated_text is not None + assert len(result.translated_text) > 0 + assert result.source_language == "es" + assert result.target_language == "en" + assert result.provider == "openai" + + +async def test_openai_provider_auto_detect_source(): + """Test translation with automatic source language detection""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + provider = OpenAITranslationProvider() + + result = await provider.translate( + text="Bonjour le monde", + target_language="en", + # source_language not provided - should auto-detect + ) + + assert result.translated_text is not None + assert result.target_language == "en" + + +async def test_openai_provider_long_text(): + """Test translation of longer text""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + provider = OpenAITranslationProvider() + + long_text = """ + La inteligencia artificial es una rama de la informática que se centra en + crear sistemas capaces de realizar tareas que normalmente requieren inteligencia humana. 
+ Estos sistemas pueden aprender, razonar y resolver problemas complejos. + """ + + result = await provider.translate(text=long_text, target_language="en", source_language="es") + + assert len(result.translated_text) > 0 + assert result.source_language == "es" + + +def test_get_translation_provider_factory(): + """Test provider factory function""" + provider = get_translation_provider("openai") + assert isinstance(provider, OpenAITranslationProvider) + + +def test_get_translation_provider_invalid(): + """Test provider factory with invalid provider name""" + try: + get_translation_provider("invalid_provider") + assert False, "Expected TranslationError or ValueError" + except (TranslationError, ValueError): + pass + + +async def test_openai_batch_translation(): + """Test batch translation with OpenAI provider""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + provider = OpenAITranslationProvider() + + texts = ["Hola", "¿Cómo estás?", "Adiós"] + + results = await provider.translate_batch( + texts=texts, target_language="en", source_language="es" + ) + + assert len(results) == len(texts) + for result in results: + assert isinstance(result, TranslationResult) + assert result.translated_text is not None + assert result.source_language == "es" + assert result.target_language == "en" + + +async def test_translation_preserves_formatting(): + """Test that translation preserves basic formatting""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + provider = OpenAITranslationProvider() + + text_with_newlines = "Primera línea.\nSegunda línea." 
+ + result = await provider.translate( + text=text_with_newlines, target_language="en", source_language="es" + ) + + # Should preserve structure (though exact newlines may vary) + assert result.translated_text is not None + assert len(result.translated_text) > 0 + + +async def test_translation_special_characters(): + """Test translation with special characters""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + provider = OpenAITranslationProvider() + + text = "¡Hola! ¿Cómo estás? Está bien." + + result = await provider.translate(text=text, target_language="en", source_language="es") + + assert result.translated_text is not None + assert len(result.translated_text) > 0 + + +async def test_empty_text_translation(): + """Test translation with empty text - should return empty or handle gracefully""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + provider = OpenAITranslationProvider() + + # Empty text may either raise an error or return an empty result + try: + result = await provider.translate(text="", target_language="en", source_language="es") + # If no error, should return a TranslationResult (possibly with empty text) + assert isinstance(result, TranslationResult) + except TranslationError: + # This is also acceptable behavior + pass + + +async def main(): + """Run all provider tests""" + # Sync tests + test_get_translation_provider_factory() + print("✓ test_get_translation_provider_factory passed") + + test_get_translation_provider_invalid() + print("✓ test_get_translation_provider_invalid passed") + + # Async tests + await test_openai_provider_basic_translation() + print("✓ test_openai_provider_basic_translation passed") + + await test_openai_provider_auto_detect_source() + print("✓ test_openai_provider_auto_detect_source passed") + + await test_openai_provider_long_text() + print("✓ test_openai_provider_long_text passed") + + await test_openai_batch_translation() + print("✓ test_openai_batch_translation 
passed") + + await test_translation_preserves_formatting() + print("✓ test_translation_preserves_formatting passed") + + await test_translation_special_characters() + print("✓ test_translation_special_characters passed") + + await test_empty_text_translation() + print("✓ test_empty_text_translation passed") + + print("\nAll provider tests passed!") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/cognee/tests/tasks/translation/translate_content_test.py b/cognee/tests/tasks/translation/translate_content_test.py new file mode 100644 index 000000000..0d92e339e --- /dev/null +++ b/cognee/tests/tasks/translation/translate_content_test.py @@ -0,0 +1,256 @@ +""" +Unit tests for translate_content task +""" + +import asyncio +import os +from uuid import uuid4 +from cognee.modules.chunking.models import DocumentChunk +from cognee.modules.data.processing.document_types import TextDocument +from cognee.tasks.translation import translate_content +from cognee.tasks.translation.models import TranslatedContent, LanguageMetadata + + +def has_openai_key(): + """Check if OpenAI API key is available""" + return bool(os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY")) + + +def create_test_chunk(text: str, chunk_index: int = 0): + """Helper to create a DocumentChunk with all required fields""" + # Create a minimal Document for the is_part_of field + doc = TextDocument( + id=uuid4(), + name="test_doc", + raw_data_location="/tmp/test.txt", + external_metadata=None, + mime_type="text/plain", + ) + + return DocumentChunk( + id=uuid4(), + text=text, + chunk_index=chunk_index, + chunk_size=len(text), + cut_type="sentence", + is_part_of=doc, + ) + + +async def test_translate_content_basic(): + """Test basic content translation""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + # Create test chunk with Spanish text + original_text = "Hola mundo, esta es una prueba." 
+ chunk = create_test_chunk(original_text) + + result = await translate_content( + data_chunks=[chunk], target_language="en", translation_provider="openai" + ) + + assert len(result) == 1 + # The chunk's text should now be translated (different from original Spanish) + assert result[0].text != original_text # Text should be translated to English + assert result[0].contains is not None + + # Check for TranslatedContent in contains + has_translated_content = any(isinstance(item, TranslatedContent) for item in result[0].contains) + assert has_translated_content + + +async def test_translate_content_preserves_original(): + """Test that original text is preserved""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + original_text = "Bonjour le monde" + chunk = create_test_chunk(original_text) + + result = await translate_content( + data_chunks=[chunk], target_language="en", preserve_original=True + ) + + # Find TranslatedContent in contains + translated_content = None + for item in result[0].contains: + if isinstance(item, TranslatedContent): + translated_content = item + break + + assert translated_content is not None + assert translated_content.original_text == original_text + assert translated_content.translated_text != original_text + + +async def test_translate_content_skip_english(): + """Test skipping translation for English text""" + # This test doesn't require API call since English text is skipped + chunk = create_test_chunk("Hello world, this is a test.") + + result = await translate_content( + data_chunks=[chunk], target_language="en", skip_if_target_language=True + ) + + # Text should remain unchanged + assert result[0].text == chunk.text + + # Should have LanguageMetadata but not TranslatedContent + has_language_metadata = any( + isinstance(item, LanguageMetadata) for item in (result[0].contains or []) + ) + has_translated_content = any( + isinstance(item, TranslatedContent) for item in (result[0].contains or []) + ) + + assert 
has_language_metadata + assert not has_translated_content + + +async def test_translate_content_multiple_chunks(): + """Test translation of multiple chunks""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + # Use longer texts to ensure reliable language detection + original_texts = [ + "Hola mundo, esta es una prueba de traducción.", + "Bonjour le monde, ceci est un test de traduction.", + "Ciao mondo, questo è un test di traduzione.", + ] + chunks = [create_test_chunk(text, i) for i, text in enumerate(original_texts)] + + result = await translate_content(data_chunks=chunks, target_language="en") + + assert len(result) == 3 + # Check that at least some chunks were translated + translated_count = sum( + 1 + for chunk in result + if any(isinstance(item, TranslatedContent) for item in (chunk.contains or [])) + ) + assert translated_count >= 2 # At least 2 chunks should be translated + + +async def test_translate_content_empty_list(): + """Test with empty chunk list""" + result = await translate_content(data_chunks=[], target_language="en") + + assert result == [] + + +async def test_translate_content_empty_text(): + """Test with chunk containing empty text""" + chunk = create_test_chunk("") + + result = await translate_content(data_chunks=[chunk], target_language="en") + + assert len(result) == 1 + assert result[0].text == "" + + +async def test_translate_content_language_metadata(): + """Test that LanguageMetadata is created correctly""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + # Use a longer, distinctly Spanish text to ensure reliable detection + chunk = create_test_chunk( + "La inteligencia artificial está cambiando el mundo de manera significativa" + ) + + result = await translate_content(data_chunks=[chunk], target_language="en") + + # Find LanguageMetadata + language_metadata = None + for item in result[0].contains: + if isinstance(item, LanguageMetadata): + language_metadata = item + break + + assert 
language_metadata is not None + # Just check that a language was detected (short texts can be ambiguous) + assert language_metadata.detected_language is not None + assert language_metadata.requires_translation is True + assert language_metadata.language_confidence > 0.0 + + +async def test_translate_content_confidence_threshold(): + """Test with custom confidence threshold""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + # Use longer text for more reliable detection + chunk = create_test_chunk("Hola mundo, esta es una frase más larga para mejor detección") + + result = await translate_content( + data_chunks=[chunk], target_language="en", confidence_threshold=0.5 + ) + + assert len(result) == 1 + + +async def test_translate_content_no_preserve_original(): + """Test translation without preserving original""" + if not has_openai_key(): + print(" (skipped - no API key)") + return + + # Use longer text for more reliable detection + chunk = create_test_chunk("Bonjour le monde, comment allez-vous aujourd'hui") + + result = await translate_content( + data_chunks=[chunk], target_language="en", preserve_original=False + ) + + # Find TranslatedContent + translated_content = None + for item in result[0].contains: + if isinstance(item, TranslatedContent): + translated_content = item + break + + assert translated_content is not None + assert translated_content.original_text == "" # Should be empty + + +async def main(): + """Run all translate_content tests""" + await test_translate_content_basic() + print("✓ test_translate_content_basic passed") + + await test_translate_content_preserves_original() + print("✓ test_translate_content_preserves_original passed") + + await test_translate_content_skip_english() + print("✓ test_translate_content_skip_english passed") + + await test_translate_content_multiple_chunks() + print("✓ test_translate_content_multiple_chunks passed") + + await test_translate_content_empty_list() + print("✓ 
test_translate_content_empty_list passed") + + await test_translate_content_empty_text() + print("✓ test_translate_content_empty_text passed") + + await test_translate_content_language_metadata() + print("✓ test_translate_content_language_metadata passed") + + await test_translate_content_confidence_threshold() + print("✓ test_translate_content_confidence_threshold passed") + + await test_translate_content_no_preserve_original() + print("✓ test_translate_content_no_preserve_original passed") + + print("\nAll translate_content tests passed!") + + +if __name__ == "__main__": + asyncio.run(main()) From 9e226e2dae1f490b2964c89ceb3405ad72f6fdc2 Mon Sep 17 00:00:00 2001 From: andikarachman Date: Sat, 3 Jan 2026 09:22:10 +0700 Subject: [PATCH 04/16] refactor: address code review feedback - Made is_available() abstract in base.py with proper implementation in providers - Added original_error parameter to UnsupportedLanguageError and TranslationConfigError - Added Field validation for confidence_threshold bounds (0.0-1.0) - Changed @lru_cache to @lru_cache() for explicit style - Added get_translation_provider to __all__ in providers/__init__.py - Replaced deprecated asyncio.get_event_loop() with get_running_loop() - Added debug logging to is_available() in GoogleTranslationProvider - Added TODO comment for confidence score improvement in OpenAIProvider - Added None check for read_query_prompt() with fallback default prompt - Moved ClientSession outside batch loop in AzureTranslationProvider - Fixed Optional[float] type annotation in detect_language() - Added Note section documenting in-place mutation in translate_content() - Added test_confidence_threshold_validation() for bounds testing - Added descriptive assertion messages to config tests - Converted all async tests to use @pytest.mark.asyncio decorators - Replaced manual skip checks with @pytest.mark.skipif - Removed manual main() blocks, tests now pytest-only - Changed Chinese language assertion to use startswith('zh') 
for flexibility --- cognee/tasks/translation/config.py | 5 +- cognee/tasks/translation/detect_language.py | 4 +- cognee/tasks/translation/exceptions.py | 11 ++- .../tasks/translation/providers/__init__.py | 1 + .../translation/providers/azure_provider.py | 43 +++++----- cognee/tasks/translation/providers/base.py | 11 ++- .../translation/providers/google_provider.py | 7 +- .../translation/providers/openai_provider.py | 17 ++++ cognee/tasks/translation/translate_content.py | 7 ++ cognee/tests/tasks/translation/config_test.py | 78 +++++++++++------ .../tasks/translation/detect_language_test.py | 53 +++--------- .../tasks/translation/integration_test.py | 79 ++++------------- .../tests/tasks/translation/providers_test.py | 84 ++++--------------- .../translation/translate_content_test.py | 79 ++++------------- 14 files changed, 187 insertions(+), 292 deletions(-) diff --git a/cognee/tasks/translation/config.py b/cognee/tasks/translation/config.py index 99ed560de..db8a23870 100644 --- a/cognee/tasks/translation/config.py +++ b/cognee/tasks/translation/config.py @@ -1,6 +1,7 @@ from functools import lru_cache from typing import Literal, Optional +from pydantic import Field from pydantic_settings import BaseSettings, SettingsConfigDict @@ -23,7 +24,7 @@ class TranslationConfig(BaseSettings): # Translation provider settings translation_provider: TranslationProviderType = "openai" target_language: str = "en" - confidence_threshold: float = 0.8 + confidence_threshold: float = Field(default=0.8, ge=0.0, le=1.0) # Google Translate settings google_translate_api_key: Optional[str] = None @@ -57,7 +58,7 @@ class TranslationConfig(BaseSettings): } -@lru_cache +@lru_cache() def get_translation_config() -> TranslationConfig: """Get the translation configuration singleton.""" return TranslationConfig() diff --git a/cognee/tasks/translation/detect_language.py b/cognee/tasks/translation/detect_language.py index e223083c0..00b0bf012 100644 --- 
a/cognee/tasks/translation/detect_language.py +++ b/cognee/tasks/translation/detect_language.py @@ -88,7 +88,7 @@ def get_language_name(language_code: str) -> str: def detect_language( text: str, target_language: str = "en", - confidence_threshold: float = None, + confidence_threshold: Optional[float] = None, ) -> LanguageDetectionResult: """ Detect the language of the given text. @@ -184,7 +184,7 @@ async def detect_language_async( """ import asyncio - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() return await loop.run_in_executor( None, detect_language, text, target_language, confidence_threshold ) diff --git a/cognee/tasks/translation/exceptions.py b/cognee/tasks/translation/exceptions.py index ba5e74510..d5db128de 100644 --- a/cognee/tasks/translation/exceptions.py +++ b/cognee/tasks/translation/exceptions.py @@ -38,6 +38,7 @@ class UnsupportedLanguageError(TranslationError): language: str, provider: str = None, message: str = None, + original_error: Exception = None, ): self.language = language self.provider = provider @@ -45,11 +46,15 @@ class UnsupportedLanguageError(TranslationError): message = f"Language '{language}' is not supported" if provider: message += f" by {provider}" - super().__init__(message) + super().__init__(message, original_error) class TranslationConfigError(TranslationError): """Exception raised when translation configuration is invalid.""" - def __init__(self, message: str = "Invalid translation configuration"): - super().__init__(message) + def __init__( + self, + message: str = "Invalid translation configuration", + original_error: Exception = None, + ): + super().__init__(message, original_error) diff --git a/cognee/tasks/translation/providers/__init__.py b/cognee/tasks/translation/providers/__init__.py index 79a28a586..2fb8480ef 100644 --- a/cognee/tasks/translation/providers/__init__.py +++ b/cognee/tasks/translation/providers/__init__.py @@ -9,6 +9,7 @@ __all__ = [ "OpenAITranslationProvider", 
"GoogleTranslationProvider", "AzureTranslationProvider", + "get_translation_provider", ] diff --git a/cognee/tasks/translation/providers/azure_provider.py b/cognee/tasks/translation/providers/azure_provider.py index 2ee1f45d7..349445ca1 100644 --- a/cognee/tasks/translation/providers/azure_provider.py +++ b/cognee/tasks/translation/providers/azure_provider.py @@ -1,4 +1,3 @@ -import asyncio from typing import Optional import aiohttp @@ -142,12 +141,12 @@ class AzureTranslationProvider(TranslationProvider): batch_size = min(100, self._config.batch_size) all_results = [] - for i in range(0, len(texts), batch_size): - batch = texts[i : i + batch_size] - body = [{"text": text} for text in batch] + try: + async with aiohttp.ClientSession() as session: + for i in range(0, len(texts), batch_size): + batch = texts[i : i + batch_size] + body = [{"text": text} for text in batch] - try: - async with aiohttp.ClientSession() as session: async with session.post( endpoint, params=params, @@ -158,24 +157,24 @@ class AzureTranslationProvider(TranslationProvider): response.raise_for_status() results = await response.json() - for result in results: - translation = result["translations"][0] - detected_language = result.get("detectedLanguage", {}) + for result in results: + translation = result["translations"][0] + detected_language = result.get("detectedLanguage", {}) - all_results.append( - TranslationResult( - translated_text=translation["text"], - source_language=source_language - or detected_language.get("language", "unknown"), - target_language=target_language, - confidence_score=detected_language.get("score", 0.9), - provider=self.provider_name, - raw_response=result, + all_results.append( + TranslationResult( + translated_text=translation["text"], + source_language=source_language + or detected_language.get("language", "unknown"), + target_language=target_language, + confidence_score=detected_language.get("score", 0.9), + provider=self.provider_name, + raw_response=result, + ) 
) - ) - except Exception as e: - logger.error(f"Azure batch translation failed: {e}") - raise + except Exception as e: + logger.error(f"Azure batch translation failed: {e}") + raise return all_results diff --git a/cognee/tasks/translation/providers/base.py b/cognee/tasks/translation/providers/base.py index d8e5e981e..c92f2f552 100644 --- a/cognee/tasks/translation/providers/base.py +++ b/cognee/tasks/translation/providers/base.py @@ -64,6 +64,13 @@ class TranslationProvider(ABC): """ pass + @abstractmethod def is_available(self) -> bool: - """Check if this provider is available (has required credentials).""" - return True + """Check if this provider is available (has required credentials). + + All providers must implement this method to validate their credentials. + + Returns: + True if the provider has valid credentials and is ready to use. + """ + pass diff --git a/cognee/tasks/translation/providers/google_provider.py b/cognee/tasks/translation/providers/google_provider.py index 0a7373b54..f007575cd 100644 --- a/cognee/tasks/translation/providers/google_provider.py +++ b/cognee/tasks/translation/providers/google_provider.py @@ -48,7 +48,8 @@ class GoogleTranslationProvider(TranslationProvider): try: self._get_client() return True - except Exception: + except Exception as e: + logger.debug(f"Google Translate not available: {e}") return False async def translate( @@ -72,7 +73,7 @@ class GoogleTranslationProvider(TranslationProvider): client = self._get_client() # Run in thread pool since google-cloud-translate is synchronous - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() if source_language: result = await loop.run_in_executor( @@ -122,7 +123,7 @@ class GoogleTranslationProvider(TranslationProvider): """ try: client = self._get_client() - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() if source_language: results = await loop.run_in_executor( diff --git a/cognee/tasks/translation/providers/openai_provider.py 
b/cognee/tasks/translation/providers/openai_provider.py index a888d688e..95597e368 100644 --- a/cognee/tasks/translation/providers/openai_provider.py +++ b/cognee/tasks/translation/providers/openai_provider.py @@ -52,6 +52,15 @@ class OpenAITranslationProvider(TranslationProvider): try: system_prompt = read_query_prompt("translate_content.txt") + # Validate system prompt was loaded successfully + if system_prompt is None: + logger.warning("translate_content.txt prompt file not found, using default prompt") + system_prompt = ( + "You are a professional translator. Translate the given text accurately " + "while preserving the original meaning, tone, and style. " + "Detect the source language if not provided." + ) + # Build the input with context if source_language: input_text = ( @@ -75,6 +84,8 @@ class OpenAITranslationProvider(TranslationProvider): translated_text=result.translated_text, source_language=source_language or result.detected_source_language, target_language=target_language, + # TODO: Consider deriving confidence from LLM response metadata + # or making configurable via TranslationConfig confidence_score=0.95, # LLM translations are generally high quality provider=self.provider_name, raw_response={"notes": result.translation_notes}, @@ -103,3 +114,9 @@ class OpenAITranslationProvider(TranslationProvider): """ tasks = [self.translate(text, target_language, source_language) for text in texts] return await asyncio.gather(*tasks) + + def is_available(self) -> bool: + """Check if OpenAI provider is available (has required credentials).""" + import os + + return bool(os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY")) diff --git a/cognee/tasks/translation/translate_content.py b/cognee/tasks/translation/translate_content.py index 1c869b132..fcf6ae430 100644 --- a/cognee/tasks/translation/translate_content.py +++ b/cognee/tasks/translation/translate_content.py @@ -44,6 +44,13 @@ async def translate_content( Chunks that required translation will 
have TranslatedContent objects in their 'contains' list. + Note: + This function mutates the input chunks in-place. Specifically: + - chunk.text is replaced with the translated text + - chunk.contains is updated with LanguageMetadata and TranslatedContent + The original text is preserved in TranslatedContent.original_text + if preserve_original=True. + Example: ```python from cognee.tasks.translation import translate_content diff --git a/cognee/tests/tasks/translation/config_test.py b/cognee/tests/tasks/translation/config_test.py index ee8d6019c..80f76a5f0 100644 --- a/cognee/tests/tasks/translation/config_test.py +++ b/cognee/tests/tasks/translation/config_test.py @@ -2,7 +2,6 @@ Unit tests for translation configuration """ -import os from typing import get_args from cognee.tasks.translation.config import ( get_translation_config, @@ -15,9 +14,15 @@ def test_default_translation_config(): """Test default translation configuration""" config = get_translation_config() - assert isinstance(config, TranslationConfig) - assert config.translation_provider in ["openai", "google", "azure"] - assert 0.0 <= config.confidence_threshold <= 1.0 + assert isinstance(config, TranslationConfig), "Config should be TranslationConfig instance" + assert config.translation_provider in [ + "openai", + "google", + "azure", + ], f"Invalid provider: {config.translation_provider}" + assert 0.0 <= config.confidence_threshold <= 1.0, ( + f"Confidence threshold {config.confidence_threshold} out of bounds [0.0, 1.0]" + ) def test_translation_provider_type_literal(): @@ -25,17 +30,52 @@ def test_translation_provider_type_literal(): # Get the allowed values from the Literal type allowed_values = get_args(TranslationProviderType) - assert "openai" in allowed_values - assert "google" in allowed_values - assert "azure" in allowed_values - assert len(allowed_values) == 3 + assert "openai" in allowed_values, "openai should be an allowed provider" + assert "google" in allowed_values, "google should be an 
allowed provider" + assert "azure" in allowed_values, "azure should be an allowed provider" + assert len(allowed_values) == 3, f"Expected 3 providers, got {len(allowed_values)}" def test_confidence_threshold_bounds(): """Test confidence threshold validation""" config = TranslationConfig(translation_provider="openai", confidence_threshold=0.9) - assert 0.0 <= config.confidence_threshold <= 1.0 + assert 0.0 <= config.confidence_threshold <= 1.0, ( + f"Confidence threshold {config.confidence_threshold} out of bounds [0.0, 1.0]" + ) + + +def test_confidence_threshold_validation(): + """Test that invalid confidence thresholds are rejected or clamped""" + # Test boundary values - these should work + config_min = TranslationConfig(translation_provider="openai", confidence_threshold=0.0) + assert config_min.confidence_threshold == 0.0, "Minimum bound (0.0) should be valid" + + config_max = TranslationConfig(translation_provider="openai", confidence_threshold=1.0) + assert config_max.confidence_threshold == 1.0, "Maximum bound (1.0) should be valid" + + # Test invalid values - these should either raise ValidationError or be clamped + try: + config_invalid_low = TranslationConfig( + translation_provider="openai", confidence_threshold=-0.1 + ) + # If no error, verify it was clamped to valid range + assert 0.0 <= config_invalid_low.confidence_threshold <= 1.0, ( + f"Invalid low value should be clamped, got {config_invalid_low.confidence_threshold}" + ) + except Exception: + pass # Expected validation error + + try: + config_invalid_high = TranslationConfig( + translation_provider="openai", confidence_threshold=1.5 + ) + # If no error, verify it was clamped to valid range + assert 0.0 <= config_invalid_high.confidence_threshold <= 1.0, ( + f"Invalid high value should be clamped, got {config_invalid_high.confidence_threshold}" + ) + except Exception: + pass # Expected validation error def test_multiple_provider_keys(): @@ -46,21 +86,5 @@ def test_multiple_provider_keys(): 
azure_translator_key="azure_key", ) - assert config.google_translate_api_key == "google_key" - assert config.azure_translator_key == "azure_key" - - -if __name__ == "__main__": - test_default_translation_config() - print("✓ test_default_translation_config passed") - - test_translation_provider_type_literal() - print("✓ test_translation_provider_type_literal passed") - - test_confidence_threshold_bounds() - print("✓ test_confidence_threshold_bounds passed") - - test_multiple_provider_keys() - print("✓ test_multiple_provider_keys passed") - - print("\nAll config tests passed!") + assert config.google_translate_api_key == "google_key", "Google API key not set correctly" + assert config.azure_translator_key == "azure_key", "Azure API key not set correctly" diff --git a/cognee/tests/tasks/translation/detect_language_test.py b/cognee/tests/tasks/translation/detect_language_test.py index 907c94df8..3845777ba 100644 --- a/cognee/tests/tasks/translation/detect_language_test.py +++ b/cognee/tests/tasks/translation/detect_language_test.py @@ -2,7 +2,7 @@ Unit tests for language detection functionality """ -import asyncio +import pytest from cognee.tasks.translation.detect_language import ( detect_language_async, LanguageDetectionResult, @@ -10,6 +10,7 @@ from cognee.tasks.translation.detect_language import ( from cognee.tasks.translation.exceptions import LanguageDetectionError +@pytest.mark.asyncio async def test_detect_english(): """Test detection of English text""" result = await detect_language_async("Hello world, this is a test.", target_language="en") @@ -20,6 +21,7 @@ async def test_detect_english(): assert result.language_name == "English" +@pytest.mark.asyncio async def test_detect_spanish(): """Test detection of Spanish text""" result = await detect_language_async("Hola mundo, esta es una prueba.", target_language="en") @@ -30,6 +32,7 @@ async def test_detect_spanish(): assert result.language_name == "Spanish" +@pytest.mark.asyncio async def test_detect_french(): 
"""Test detection of French text""" result = await detect_language_async( @@ -42,6 +45,7 @@ async def test_detect_french(): assert result.language_name == "French" +@pytest.mark.asyncio async def test_detect_german(): """Test detection of German text""" result = await detect_language_async("Hallo Welt, das ist ein Test.", target_language="en") @@ -51,15 +55,17 @@ async def test_detect_german(): assert result.confidence > 0.9 +@pytest.mark.asyncio async def test_detect_chinese(): """Test detection of Chinese text""" result = await detect_language_async("你好世界,这是一个测试。", target_language="en") - assert result.language_code == "zh-cn" + assert result.language_code.startswith("zh"), f"Expected Chinese, got {result.language_code}" assert result.requires_translation is True assert result.confidence > 0.9 +@pytest.mark.asyncio async def test_already_target_language(): """Test when text is already in target language""" result = await detect_language_async("This text is already in English.", target_language="en") @@ -67,6 +73,7 @@ async def test_already_target_language(): assert result.requires_translation is False +@pytest.mark.asyncio async def test_short_text(): """Test detection with very short text""" result = await detect_language_async("Hi", target_language="es") @@ -76,6 +83,7 @@ async def test_short_text(): assert result.character_count == 2 +@pytest.mark.asyncio async def test_empty_text(): """Test detection with empty text - returns unknown by default""" result = await detect_language_async("", target_language="en") @@ -88,6 +96,7 @@ async def test_empty_text(): assert result.character_count == 0 +@pytest.mark.asyncio async def test_confidence_threshold(): """Test detection respects confidence threshold""" result = await detect_language_async( @@ -97,6 +106,7 @@ async def test_confidence_threshold(): assert result.confidence >= 0.5 +@pytest.mark.asyncio async def test_mixed_language_text(): """Test detection with mixed language text (predominantly one language)""" # 
Predominantly Spanish with English word @@ -106,42 +116,3 @@ async def test_mixed_language_text(): assert result.language_code == "es" # Should detect as Spanish assert result.requires_translation is True - - -async def main(): - """Run all language detection tests""" - await test_detect_english() - print("✓ test_detect_english passed") - - await test_detect_spanish() - print("✓ test_detect_spanish passed") - - await test_detect_french() - print("✓ test_detect_french passed") - - await test_detect_german() - print("✓ test_detect_german passed") - - await test_detect_chinese() - print("✓ test_detect_chinese passed") - - await test_already_target_language() - print("✓ test_already_target_language passed") - - await test_short_text() - print("✓ test_short_text passed") - - await test_empty_text() - print("✓ test_empty_text passed") - - await test_confidence_threshold() - print("✓ test_confidence_threshold passed") - - await test_mixed_language_text() - print("✓ test_mixed_language_text passed") - - print("\nAll language detection tests passed!") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/tasks/translation/integration_test.py b/cognee/tests/tasks/translation/integration_test.py index 98fcae5d5..ff2877959 100644 --- a/cognee/tests/tasks/translation/integration_test.py +++ b/cognee/tests/tasks/translation/integration_test.py @@ -4,9 +4,10 @@ Integration tests for multilingual content translation feature. Tests the full cognify pipeline with translation enabled. 
""" -import asyncio import os +import pytest + from cognee import add, cognify, prune, search, SearchType from cognee.tasks.translation import translate_text from cognee.tasks.translation.detect_language import detect_language_async @@ -17,12 +18,10 @@ def has_openai_key(): return bool(os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY")) +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_quick_translation(): """Quick smoke test for translation feature""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - await prune.prune_data() await prune.prune_system(metadata=True) @@ -39,12 +38,10 @@ async def test_quick_translation(): assert result is not None +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_translation_basic(): """Test basic translation functionality with English text""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - await prune.prune_data() await prune.prune_system(metadata=True) @@ -67,12 +64,10 @@ async def test_translation_basic(): assert search_results is not None +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_translation_spanish(): """Test translation with Spanish text""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - await prune.prune_data() await prune.prune_system(metadata=True) @@ -100,12 +95,10 @@ async def test_translation_spanish(): assert search_results is not None +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_translation_french(): """Test translation with French text""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - await prune.prune_data() await prune.prune_system(metadata=True) @@ -133,12 +126,10 @@ async def test_translation_french(): assert search_results 
is not None +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_translation_disabled(): """Test that cognify works without translation""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - await prune.prune_data() await prune.prune_system(metadata=True) @@ -153,12 +144,10 @@ async def test_translation_disabled(): assert result is not None +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_translation_mixed_languages(): """Test with multiple documents in different languages""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - await prune.prune_data() await prune.prune_system(metadata=True) @@ -186,12 +175,10 @@ async def test_translation_mixed_languages(): assert search_results is not None +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_direct_translation_function(): """Test the translate_text convenience function directly""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - result = await translate_text( text="Hola, ¿cómo estás? 
Espero que tengas un buen día.", target_language="en", @@ -204,6 +191,7 @@ async def test_direct_translation_function(): assert result.provider == "openai" +@pytest.mark.asyncio async def test_language_detection(): """Test language detection directly""" test_texts = [ @@ -220,36 +208,3 @@ async def test_language_detection(): # Only check requires_translation for high-confidence detections if result.confidence > 0.8: assert result.requires_translation == should_translate - - -async def main(): - """Run all translation integration tests""" - await test_quick_translation() - print("✓ test_quick_translation passed") - - await test_language_detection() - print("✓ test_language_detection passed") - - await test_direct_translation_function() - print("✓ test_direct_translation_function passed") - - await test_translation_basic() - print("✓ test_translation_basic passed") - - await test_translation_spanish() - print("✓ test_translation_spanish passed") - - await test_translation_french() - print("✓ test_translation_french passed") - - await test_translation_disabled() - print("✓ test_translation_disabled passed") - - await test_translation_mixed_languages() - print("✓ test_translation_mixed_languages passed") - - print("\nAll translation integration tests passed!") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/tasks/translation/providers_test.py b/cognee/tests/tasks/translation/providers_test.py index 5be88a5ed..243a66fe8 100644 --- a/cognee/tests/tasks/translation/providers_test.py +++ b/cognee/tests/tasks/translation/providers_test.py @@ -2,8 +2,10 @@ Unit tests for translation providers """ -import asyncio import os + +import pytest + from cognee.tasks.translation.providers import ( get_translation_provider, OpenAITranslationProvider, @@ -17,12 +19,10 @@ def has_openai_key(): return bool(os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY")) +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI 
API key available") async def test_openai_provider_basic_translation(): """Test basic translation with OpenAI provider""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - provider = OpenAITranslationProvider() result = await provider.translate(text="Hola mundo", target_language="en", source_language="es") @@ -35,12 +35,10 @@ async def test_openai_provider_basic_translation(): assert result.provider == "openai" +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_openai_provider_auto_detect_source(): """Test translation with automatic source language detection""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - provider = OpenAITranslationProvider() result = await provider.translate( @@ -53,12 +51,10 @@ async def test_openai_provider_auto_detect_source(): assert result.target_language == "en" +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_openai_provider_long_text(): """Test translation of longer text""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - provider = OpenAITranslationProvider() long_text = """ @@ -88,12 +84,10 @@ def test_get_translation_provider_invalid(): pass +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_openai_batch_translation(): """Test batch translation with OpenAI provider""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - provider = OpenAITranslationProvider() texts = ["Hola", "¿Cómo estás?", "Adiós"] @@ -110,12 +104,10 @@ async def test_openai_batch_translation(): assert result.target_language == "en" +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_translation_preserves_formatting(): """Test that translation preserves basic formatting""" - if not has_openai_key(): - print(" 
(skipped - no API key)") - return - provider = OpenAITranslationProvider() text_with_newlines = "Primera línea.\nSegunda línea." @@ -129,12 +121,10 @@ async def test_translation_preserves_formatting(): assert len(result.translated_text) > 0 +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_translation_special_characters(): """Test translation with special characters""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - provider = OpenAITranslationProvider() text = "¡Hola! ¿Cómo estás? Está bien." @@ -145,12 +135,10 @@ async def test_translation_special_characters(): assert len(result.translated_text) > 0 +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_empty_text_translation(): """Test translation with empty text - should return empty or handle gracefully""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - provider = OpenAITranslationProvider() # Empty text may either raise an error or return an empty result @@ -161,41 +149,3 @@ async def test_empty_text_translation(): except TranslationError: # This is also acceptable behavior pass - - -async def main(): - """Run all provider tests""" - # Sync tests - test_get_translation_provider_factory() - print("✓ test_get_translation_provider_factory passed") - - test_get_translation_provider_invalid() - print("✓ test_get_translation_provider_invalid passed") - - # Async tests - await test_openai_provider_basic_translation() - print("✓ test_openai_provider_basic_translation passed") - - await test_openai_provider_auto_detect_source() - print("✓ test_openai_provider_auto_detect_source passed") - - await test_openai_provider_long_text() - print("✓ test_openai_provider_long_text passed") - - await test_openai_batch_translation() - print("✓ test_openai_batch_translation passed") - - await test_translation_preserves_formatting() - print("✓ 
test_translation_preserves_formatting passed") - - await test_translation_special_characters() - print("✓ test_translation_special_characters passed") - - await test_empty_text_translation() - print("✓ test_empty_text_translation passed") - - print("\nAll provider tests passed!") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/cognee/tests/tasks/translation/translate_content_test.py b/cognee/tests/tasks/translation/translate_content_test.py index 0d92e339e..35b5e60b3 100644 --- a/cognee/tests/tasks/translation/translate_content_test.py +++ b/cognee/tests/tasks/translation/translate_content_test.py @@ -2,9 +2,11 @@ Unit tests for translate_content task """ -import asyncio import os from uuid import uuid4 + +import pytest + from cognee.modules.chunking.models import DocumentChunk from cognee.modules.data.processing.document_types import TextDocument from cognee.tasks.translation import translate_content @@ -37,12 +39,10 @@ def create_test_chunk(text: str, chunk_index: int = 0): ) +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_translate_content_basic(): """Test basic content translation""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - # Create test chunk with Spanish text original_text = "Hola mundo, esta es una prueba." 
chunk = create_test_chunk(original_text) @@ -61,12 +61,10 @@ async def test_translate_content_basic(): assert has_translated_content +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_translate_content_preserves_original(): """Test that original text is preserved""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - original_text = "Bonjour le monde" chunk = create_test_chunk(original_text) @@ -86,6 +84,7 @@ async def test_translate_content_preserves_original(): assert translated_content.translated_text != original_text +@pytest.mark.asyncio async def test_translate_content_skip_english(): """Test skipping translation for English text""" # This test doesn't require API call since English text is skipped @@ -110,12 +109,10 @@ async def test_translate_content_skip_english(): assert not has_translated_content +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_translate_content_multiple_chunks(): """Test translation of multiple chunks""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - # Use longer texts to ensure reliable language detection original_texts = [ "Hola mundo, esta es una prueba de traducción.", @@ -136,6 +133,7 @@ async def test_translate_content_multiple_chunks(): assert translated_count >= 2 # At least 2 chunks should be translated +@pytest.mark.asyncio async def test_translate_content_empty_list(): """Test with empty chunk list""" result = await translate_content(data_chunks=[], target_language="en") @@ -143,6 +141,7 @@ async def test_translate_content_empty_list(): assert result == [] +@pytest.mark.asyncio async def test_translate_content_empty_text(): """Test with chunk containing empty text""" chunk = create_test_chunk("") @@ -153,12 +152,10 @@ async def test_translate_content_empty_text(): assert result[0].text == "" +@pytest.mark.asyncio +@pytest.mark.skipif(not 
has_openai_key(), reason="No OpenAI API key available") async def test_translate_content_language_metadata(): """Test that LanguageMetadata is created correctly""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - # Use a longer, distinctly Spanish text to ensure reliable detection chunk = create_test_chunk( "La inteligencia artificial está cambiando el mundo de manera significativa" @@ -180,12 +177,10 @@ async def test_translate_content_language_metadata(): assert language_metadata.language_confidence > 0.0 +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_translate_content_confidence_threshold(): """Test with custom confidence threshold""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - # Use longer text for more reliable detection chunk = create_test_chunk("Hola mundo, esta es una frase más larga para mejor detección") @@ -196,12 +191,10 @@ async def test_translate_content_confidence_threshold(): assert len(result) == 1 +@pytest.mark.asyncio +@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") async def test_translate_content_no_preserve_original(): """Test translation without preserving original""" - if not has_openai_key(): - print(" (skipped - no API key)") - return - # Use longer text for more reliable detection chunk = create_test_chunk("Bonjour le monde, comment allez-vous aujourd'hui") @@ -218,39 +211,3 @@ async def test_translate_content_no_preserve_original(): assert translated_content is not None assert translated_content.original_text == "" # Should be empty - - -async def main(): - """Run all translate_content tests""" - await test_translate_content_basic() - print("✓ test_translate_content_basic passed") - - await test_translate_content_preserves_original() - print("✓ test_translate_content_preserves_original passed") - - await test_translate_content_skip_english() - print("✓ 
test_translate_content_skip_english passed") - - await test_translate_content_multiple_chunks() - print("✓ test_translate_content_multiple_chunks passed") - - await test_translate_content_empty_list() - print("✓ test_translate_content_empty_list passed") - - await test_translate_content_empty_text() - print("✓ test_translate_content_empty_text passed") - - await test_translate_content_language_metadata() - print("✓ test_translate_content_language_metadata passed") - - await test_translate_content_confidence_threshold() - print("✓ test_translate_content_confidence_threshold passed") - - await test_translate_content_no_preserve_original() - print("✓ test_translate_content_no_preserve_original passed") - - print("\nAll translate_content tests passed!") - - -if __name__ == "__main__": - asyncio.run(main()) From 5d03366dad9abfebe9bfc2c59d2e3ad54ff057a3 Mon Sep 17 00:00:00 2001 From: andikarachman Date: Sat, 3 Jan 2026 09:32:55 +0700 Subject: [PATCH 05/16] docs: add module docstring to base.py --- cognee/tasks/translation/providers/base.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cognee/tasks/translation/providers/base.py b/cognee/tasks/translation/providers/base.py index c92f2f552..f7862d3c1 100644 --- a/cognee/tasks/translation/providers/base.py +++ b/cognee/tasks/translation/providers/base.py @@ -1,3 +1,11 @@ +""" +Base classes for translation providers. + +This module defines the abstract interface that all translation providers must implement. +Providers handle the actual translation of text using external services like OpenAI, +Google Translate, or Azure Translator. 
+""" + from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Optional From 6b36d9093da56766d35654682a6732b6fdc48698 Mon Sep 17 00:00:00 2001 From: andikarachman Date: Sat, 3 Jan 2026 09:37:26 +0700 Subject: [PATCH 06/16] feat: add timeout_seconds to to_dict() for debugging context --- cognee/tasks/translation/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cognee/tasks/translation/config.py b/cognee/tasks/translation/config.py index db8a23870..b4fe7902d 100644 --- a/cognee/tasks/translation/config.py +++ b/cognee/tasks/translation/config.py @@ -55,6 +55,7 @@ class TranslationConfig(BaseSettings): "confidence_threshold": self.confidence_threshold, "batch_size": self.batch_size, "max_retries": self.max_retries, + "timeout_seconds": self.timeout_seconds, } From 79980293ed2205df354b0d4cecea3de6d9eee24f Mon Sep 17 00:00:00 2001 From: andikarachman Date: Sat, 3 Jan 2026 09:40:58 +0700 Subject: [PATCH 07/16] feat: add rate limiting with asyncio.Semaphore to batch translations --- .../tasks/translation/providers/openai_provider.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cognee/tasks/translation/providers/openai_provider.py b/cognee/tasks/translation/providers/openai_provider.py index 95597e368..b0508974a 100644 --- a/cognee/tasks/translation/providers/openai_provider.py +++ b/cognee/tasks/translation/providers/openai_provider.py @@ -100,19 +100,29 @@ class OpenAITranslationProvider(TranslationProvider): texts: list[str], target_language: str = "en", source_language: Optional[str] = None, + max_concurrent: int = 5, ) -> list[TranslationResult]: """ Translate multiple texts using OpenAI's LLM. + Uses a semaphore to limit concurrent requests and avoid API rate limits. 
+ Args: texts: List of texts to translate target_language: Target language code source_language: Source language code (optional) + max_concurrent: Maximum concurrent translation requests (default: 5) Returns: List of TranslationResult objects """ - tasks = [self.translate(text, target_language, source_language) for text in texts] + semaphore = asyncio.Semaphore(max_concurrent) + + async def limited_translate(text: str) -> TranslationResult: + async with semaphore: + return await self.translate(text, target_language, source_language) + + tasks = [limited_translate(text) for text in texts] return await asyncio.gather(*tasks) def is_available(self) -> bool: From 82e4c451183850ddf78d1839436f3756f90fa625 Mon Sep 17 00:00:00 2001 From: andikarachman Date: Sat, 3 Jan 2026 09:43:56 +0700 Subject: [PATCH 08/16] refactor: use TranslationProviderError instead of ValueError for consistency --- cognee/tasks/translation/providers/azure_provider.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cognee/tasks/translation/providers/azure_provider.py b/cognee/tasks/translation/providers/azure_provider.py index 349445ca1..455f57e3a 100644 --- a/cognee/tasks/translation/providers/azure_provider.py +++ b/cognee/tasks/translation/providers/azure_provider.py @@ -6,6 +6,7 @@ from cognee.shared.logging_utils import get_logger from .base import TranslationProvider, TranslationResult from ..config import get_translation_config +from ..exceptions import TranslationProviderError logger = get_logger(__name__) @@ -48,9 +49,9 @@ class AzureTranslationProvider(TranslationProvider): TranslationResult with translated text and metadata """ if not self.is_available(): - raise ValueError( - "Azure Translator API key not configured. " - "Set AZURE_TRANSLATOR_KEY environment variable." + raise TranslationProviderError( + provider=self.provider_name, + message="Azure Translator API key not configured. 
Set AZURE_TRANSLATOR_KEY environment variable.", ) endpoint = f"{self._config.azure_translator_endpoint}/translate" From 2a9d795723c9d9953b0a1ec14260b846e37758a7 Mon Sep 17 00:00:00 2001 From: andikarachman Date: Sat, 3 Jan 2026 09:51:33 +0700 Subject: [PATCH 09/16] feat(translation): address PR review feedback - Add langdetect>=1.0.9 as direct dependency in pyproject.toml - Wrap exceptions with TranslationProviderError in azure_provider.py - Add progress logging for large batch translations (every 100 chunks) - Add clear_translation_config_cache helper for testing - Set __cause__ on exceptions for proper exception chaining - Change TranslationResult.confidence_score to Optional[float] - Google provider: set confidence_score=None (API doesn't provide it) - Google provider: simplify translate methods with kwargs dict - Add assertion for result length in integration test --- cognee/tasks/translation/config.py | 5 +++ cognee/tasks/translation/detect_language.py | 2 +- cognee/tasks/translation/exceptions.py | 2 + .../translation/providers/azure_provider.py | 12 +++++- cognee/tasks/translation/providers/base.py | 3 +- .../translation/providers/google_provider.py | 42 +++++++++---------- cognee/tasks/translation/translate_content.py | 7 +++- .../tasks/translation/integration_test.py | 1 + pyproject.toml | 4 +- uv.lock | 2 + 10 files changed, 51 insertions(+), 29 deletions(-) diff --git a/cognee/tasks/translation/config.py b/cognee/tasks/translation/config.py index b4fe7902d..b8c3da485 100644 --- a/cognee/tasks/translation/config.py +++ b/cognee/tasks/translation/config.py @@ -63,3 +63,8 @@ class TranslationConfig(BaseSettings): def get_translation_config() -> TranslationConfig: """Get the translation configuration singleton.""" return TranslationConfig() + + +def clear_translation_config_cache(): + """Clear the cached config for testing purposes.""" + get_translation_config.cache_clear() diff --git a/cognee/tasks/translation/detect_language.py 
b/cognee/tasks/translation/detect_language.py index 00b0bf012..a474f7144 100644 --- a/cognee/tasks/translation/detect_language.py +++ b/cognee/tasks/translation/detect_language.py @@ -169,7 +169,7 @@ def detect_language( async def detect_language_async( text: str, target_language: str = "en", - confidence_threshold: float = None, + confidence_threshold: Optional[float] = None, ) -> LanguageDetectionResult: """ Async wrapper for language detection. diff --git a/cognee/tasks/translation/exceptions.py b/cognee/tasks/translation/exceptions.py index d5db128de..3ab197fce 100644 --- a/cognee/tasks/translation/exceptions.py +++ b/cognee/tasks/translation/exceptions.py @@ -5,6 +5,8 @@ class TranslationError(Exception): self.message = message self.original_error = original_error super().__init__(self.message) + if original_error: + self.__cause__ = original_error class LanguageDetectionError(TranslationError): diff --git a/cognee/tasks/translation/providers/azure_provider.py b/cognee/tasks/translation/providers/azure_provider.py index 455f57e3a..8a3fc2b01 100644 --- a/cognee/tasks/translation/providers/azure_provider.py +++ b/cognee/tasks/translation/providers/azure_provider.py @@ -98,7 +98,11 @@ class AzureTranslationProvider(TranslationProvider): except Exception as e: logger.error(f"Azure translation failed: {e}") - raise + raise TranslationProviderError( + provider=self.provider_name, + message=f"Translation failed: {e}", + original_error=e, + ) async def translate_batch( self, @@ -176,6 +180,10 @@ class AzureTranslationProvider(TranslationProvider): except Exception as e: logger.error(f"Azure batch translation failed: {e}") - raise + raise TranslationProviderError( + provider=self.provider_name, + message=f"Batch translation failed: {e}", + original_error=e, + ) return all_results diff --git a/cognee/tasks/translation/providers/base.py b/cognee/tasks/translation/providers/base.py index f7862d3c1..37c6744b4 100644 --- a/cognee/tasks/translation/providers/base.py +++ 
b/cognee/tasks/translation/providers/base.py @@ -18,7 +18,8 @@ class TranslationResult: translated_text: str source_language: str target_language: str - confidence_score: float + # Confidence score from the provider, or None if not available (e.g., Google Translate) + confidence_score: Optional[float] provider: str raw_response: Optional[dict] = None diff --git a/cognee/tasks/translation/providers/google_provider.py b/cognee/tasks/translation/providers/google_provider.py index f007575cd..d6b16545c 100644 --- a/cognee/tasks/translation/providers/google_provider.py +++ b/cognee/tasks/translation/providers/google_provider.py @@ -75,17 +75,15 @@ class GoogleTranslationProvider(TranslationProvider): # Run in thread pool since google-cloud-translate is synchronous loop = asyncio.get_running_loop() + # Build kwargs for translate call + translate_kwargs = {"target_language": target_language} if source_language: - result = await loop.run_in_executor( - None, - lambda: client.translate( - text, target_language=target_language, source_language=source_language - ), - ) - else: - result = await loop.run_in_executor( - None, lambda: client.translate(text, target_language=target_language) - ) + translate_kwargs["source_language"] = source_language + + result = await loop.run_in_executor( + None, + lambda: client.translate(text, **translate_kwargs), + ) detected_language = result.get("detectedSourceLanguage", source_language or "unknown") @@ -93,7 +91,8 @@ class GoogleTranslationProvider(TranslationProvider): translated_text=result["translatedText"], source_language=detected_language, target_language=target_language, - confidence_score=0.9, # Google Translate is generally reliable + # Google Translate API does not provide confidence scores + confidence_score=None, provider=self.provider_name, raw_response=result, ) @@ -125,17 +124,15 @@ class GoogleTranslationProvider(TranslationProvider): client = self._get_client() loop = asyncio.get_running_loop() + # Build kwargs for translate 
call + translate_kwargs = {"target_language": target_language} if source_language: - results = await loop.run_in_executor( - None, - lambda: client.translate( - texts, target_language=target_language, source_language=source_language - ), - ) - else: - results = await loop.run_in_executor( - None, lambda: client.translate(texts, target_language=target_language) - ) + translate_kwargs["source_language"] = source_language + + results = await loop.run_in_executor( + None, + lambda: client.translate(texts, **translate_kwargs), + ) translation_results = [] for result in results: @@ -147,7 +144,8 @@ class GoogleTranslationProvider(TranslationProvider): translated_text=result["translatedText"], source_language=detected_language, target_language=target_language, - confidence_score=0.9, + # Google Translate API does not provide confidence scores + confidence_score=None, provider=self.provider_name, raw_response=result, ) diff --git a/cognee/tasks/translation/translate_content.py b/cognee/tasks/translation/translate_content.py index fcf6ae430..aa26306c5 100644 --- a/cognee/tasks/translation/translate_content.py +++ b/cognee/tasks/translation/translate_content.py @@ -87,8 +87,13 @@ async def translate_content( # Process chunks processed_chunks = [] + total_chunks = len(data_chunks) + + for chunk_index, chunk in enumerate(data_chunks): + # Log progress for large batches + if chunk_index > 0 and chunk_index % 100 == 0: + logger.info(f"Translation progress: {chunk_index}/{total_chunks} chunks processed") - for chunk in data_chunks: if not hasattr(chunk, "text") or not chunk.text: processed_chunks.append(chunk) continue diff --git a/cognee/tests/tasks/translation/integration_test.py b/cognee/tests/tasks/translation/integration_test.py index ff2877959..18e20ab4b 100644 --- a/cognee/tests/tasks/translation/integration_test.py +++ b/cognee/tests/tasks/translation/integration_test.py @@ -36,6 +36,7 @@ async def test_quick_translation(): ) assert result is not None + assert len(result) 
> 0 @pytest.mark.asyncio diff --git a/pyproject.toml b/pyproject.toml index c536a526e..0685ebed0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,8 +61,8 @@ dependencies = [ "diskcache>=5.6.3", "aiolimiter>=1.2.1", "urllib3>=2.6.0", - "cbor2>=5.8.0" - + "cbor2>=5.8.0", + "langdetect>=1.0.9", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index 23363f332..d514f62b4 100644 --- a/uv.lock +++ b/uv.lock @@ -961,6 +961,7 @@ dependencies = [ { name = "jinja2" }, { name = "kuzu" }, { name = "lancedb" }, + { name = "langdetect" }, { name = "limits" }, { name = "litellm" }, { name = "mistralai" }, @@ -1160,6 +1161,7 @@ requires-dist = [ { name = "langchain-aws", marker = "extra == 'neptune'", specifier = ">=0.2.22" }, { name = "langchain-core", marker = "extra == 'langchain'", specifier = ">=1.2.5" }, { name = "langchain-text-splitters", marker = "extra == 'langchain'", specifier = ">=0.3.2,<1.0.0" }, + { name = "langdetect", specifier = ">=1.0.9" }, { name = "langfuse", marker = "extra == 'monitoring'", specifier = ">=2.32.0,<3" }, { name = "langsmith", marker = "extra == 'langchain'", specifier = ">=0.2.3,<1.0.0" }, { name = "limits", specifier = ">=4.4.1,<5" }, From b6aa33f343fb2ddbd34250286cc3abc7f81f9fdb Mon Sep 17 00:00:00 2001 From: andikarachman Date: Sat, 3 Jan 2026 11:29:26 +0700 Subject: [PATCH 10/16] refactor: rename OpenAI translation provider to LLM provider - Rename OpenAITranslationProvider to LLMTranslationProvider - Rename openai_provider.py to llm_provider.py - Change provider type from 'openai' to 'llm' in TranslationProviderType - Update all test files to use 'llm' provider and has_llm_api_key() - Add AliasChoices for explicit env var mapping in TranslationConfig - Update translate_content.py to fallback to config.target_language - Update cognify.py docstrings to reference 'llm' provider - Update .env.template and test README documentation The LLM provider now uses whatever LLM is configured in cognee (OpenAI, Azure, Ollama, 
Anthropic, etc.) instead of being tied to OpenAI. --- .env.template | 29 +++++++ cognee/api/v1/cognify/cognify.py | 12 +-- cognee/api/v1/config/config.py | 27 +++++++ cognee/tasks/translation/__init__.py | 8 +- cognee/tasks/translation/config.py | 76 ++++++++++++++----- .../tasks/translation/providers/__init__.py | 11 ++- .../{openai_provider.py => llm_provider.py} | 33 +++++--- cognee/tasks/translation/translate_content.py | 36 +++++---- cognee/tests/tasks/translation/README.md | 44 ++++++++++- cognee/tests/tasks/translation/config_test.py | 16 ++-- .../tasks/translation/integration_test.py | 30 ++++---- .../tests/tasks/translation/providers_test.py | 54 ++++++------- .../translation/translate_content_test.py | 20 ++--- 13 files changed, 277 insertions(+), 119 deletions(-) rename cognee/tasks/translation/providers/{openai_provider.py => llm_provider.py} (79%) diff --git a/.env.template b/.env.template index fe168cf91..ae14fc720 100644 --- a/.env.template +++ b/.env.template @@ -145,6 +145,35 @@ VECTOR_DATASET_DATABASE_HANDLER="lancedb" # ONTOLOGY_FILE_PATH=YOUR_FULL_FULE_PATH # Default: empty # To add ontology resolvers, either set them as it is set in ontology_example or add full_path and settings as envs. +################################################################################ +# 🌐 Translation Settings +################################################################################ + +# Translation provider: llm (uses configured LLM), google, or azure +# "llm" uses whichever LLM is configured above (OpenAI, Azure, Ollama, Anthropic, etc.) 
+# "google" and "azure" use dedicated translation APIs +TRANSLATION_PROVIDER="llm" + +# Default target language for translations (ISO 639-1 code, e.g., en, es, fr, de) +TARGET_LANGUAGE="en" + +# Minimum confidence threshold for language detection (0.0 to 1.0) +CONFIDENCE_THRESHOLD=0.8 + +# -- Google Translate settings (required if using google provider) ----------- +# GOOGLE_TRANSLATE_API_KEY="your-google-api-key" +# GOOGLE_PROJECT_ID="your-google-project-id" + +# -- Azure Translator settings (required if using azure provider) ------------ +# AZURE_TRANSLATOR_KEY="your-azure-translator-key" +# AZURE_TRANSLATOR_REGION="westeurope" +# AZURE_TRANSLATOR_ENDPOINT="https://api.cognitive.microsofttranslator.com" + +# -- Performance settings ---------------------------------------------------- +# TRANSLATION_BATCH_SIZE=10 +# TRANSLATION_MAX_RETRIES=3 +# TRANSLATION_TIMEOUT_SECONDS=30 + ################################################################################ # 🔄 MIGRATION (RELATIONAL → GRAPH) SETTINGS ################################################################################ diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index 1b50b6d2f..f9e01084c 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -128,10 +128,10 @@ async def cognify( content that needs translation. Defaults to False. target_language: Target language code for translation (e.g., "en", "es", "fr"). Only used when auto_translate=True. Defaults to "en" (English). - translation_provider: Translation service to use ("openai", "google", "azure"). - OpenAI uses the existing LLM infrastructure, Google requires + translation_provider: Translation service to use ("llm", "google", "azure"). + LLM uses the existing LLM infrastructure, Google requires GOOGLE_TRANSLATE_API_KEY, Azure requires AZURE_TRANSLATOR_KEY. - If not specified, uses TRANSLATION_PROVIDER env var or defaults to "openai". 
+ If not specified, uses TRANSLATION_PROVIDER env var or defaults to "llm". Returns: Union[dict, list[PipelineRunInfo]]: @@ -202,7 +202,7 @@ async def cognify( await cognee.cognify( auto_translate=True, target_language="en", - translation_provider="openai" # or "google", "azure" + translation_provider="llm" # or "google", "azure" ) ``` @@ -215,7 +215,7 @@ async def cognify( - LLM_PROVIDER, LLM_MODEL, VECTOR_DB_PROVIDER, GRAPH_DATABASE_PROVIDER - LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False) - LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60) - - TRANSLATION_PROVIDER: Default translation provider ("openai", "google", "azure") + - TRANSLATION_PROVIDER: Default translation provider ("llm", "google", "azure") - GOOGLE_TRANSLATE_API_KEY: API key for Google Translate - AZURE_TRANSLATOR_KEY: API key for Azure Translator """ @@ -387,7 +387,7 @@ async def get_temporal_tasks( chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify auto_translate (bool, optional): If True, translate non-English content. Defaults to False. target_language (str, optional): Target language for translation. Defaults to "en". - translation_provider (str, optional): Translation provider to use ("openai", "google", "azure"). + translation_provider (str, optional): Translation provider to use ("llm", "google", "azure"). Returns: list[Task]: A list of Task objects representing the temporal processing pipeline. 
diff --git a/cognee/api/v1/config/config.py b/cognee/api/v1/config/config.py index 464753438..490032e2d 100644 --- a/cognee/api/v1/config/config.py +++ b/cognee/api/v1/config/config.py @@ -10,6 +10,7 @@ from cognee.infrastructure.llm.config import ( get_llm_config, ) from cognee.infrastructure.databases.relational import get_relational_config, get_migration_config +from cognee.tasks.translation.config import get_translation_config from cognee.api.v1.exceptions.exceptions import InvalidConfigAttributeError @@ -176,3 +177,29 @@ class config: def set_vector_db_url(db_url: str): vector_db_config = get_vectordb_config() vector_db_config.vector_db_url = db_url + + # Translation configuration methods + + @staticmethod + def set_translation_provider(provider: str): + """Set the translation provider (llm, google, azure).""" + translation_config = get_translation_config() + translation_config.translation_provider = provider + + @staticmethod + def set_translation_target_language(target_language: str): + """Set the default target language for translations.""" + translation_config = get_translation_config() + translation_config.target_language = target_language + + @staticmethod + def set_translation_config(config_dict: dict): + """ + Updates the translation config with values from config_dict. 
+ """ + translation_config = get_translation_config() + for key, value in config_dict.items(): + if hasattr(translation_config, key): + object.__setattr__(translation_config, key, value) + else: + raise InvalidConfigAttributeError(attribute=key) diff --git a/cognee/tasks/translation/__init__.py b/cognee/tasks/translation/__init__.py index b9836160c..ed2ec6e58 100644 --- a/cognee/tasks/translation/__init__.py +++ b/cognee/tasks/translation/__init__.py @@ -14,7 +14,7 @@ Main Components: - LanguageMetadata: DataPoint model for language information Supported Translation Providers: -- OpenAI (default): Uses GPT models via existing LLM infrastructure +- LLM (default): Uses the configured LLM via existing infrastructure - Google Translate: Requires google-cloud-translate package - Azure Translator: Requires Azure Translator API key @@ -26,7 +26,7 @@ Example Usage: translated_chunks = await translate_content( chunks, target_language="en", - translation_provider="openai" + translation_provider="llm" ) # Translate a single text @@ -54,7 +54,7 @@ from .providers import ( TranslationProvider, TranslationResult, get_translation_provider, - OpenAITranslationProvider, + LLMTranslationProvider, GoogleTranslationProvider, AzureTranslationProvider, ) @@ -84,7 +84,7 @@ __all__ = [ "TranslationProvider", "TranslationResult", "get_translation_provider", - "OpenAITranslationProvider", + "LLMTranslationProvider", "GoogleTranslationProvider", "AzureTranslationProvider", # Exceptions diff --git a/cognee/tasks/translation/config.py b/cognee/tasks/translation/config.py index b8c3da485..cf52dbdb7 100644 --- a/cognee/tasks/translation/config.py +++ b/cognee/tasks/translation/config.py @@ -1,11 +1,11 @@ from functools import lru_cache from typing import Literal, Optional -from pydantic import Field +from pydantic import AliasChoices, Field from pydantic_settings import BaseSettings, SettingsConfigDict -TranslationProviderType = Literal["openai", "google", "azure"] +TranslationProviderType = 
Literal["llm", "google", "azure"] class TranslationConfig(BaseSettings): @@ -13,34 +13,74 @@ class TranslationConfig(BaseSettings): Configuration settings for the translation task. Environment variables can be used to configure these settings: - - TRANSLATION_PROVIDER: The translation service to use - - TRANSLATION_TARGET_LANGUAGE: Default target language - - TRANSLATION_CONFIDENCE_THRESHOLD: Minimum confidence for language detection + - TRANSLATION_PROVIDER: The translation service to use ("llm", "google", "azure") + - TARGET_LANGUAGE: Default target language (ISO 639-1 code, e.g., "en", "es", "fr") + - CONFIDENCE_THRESHOLD: Minimum confidence for language detection (0.0 to 1.0) - GOOGLE_TRANSLATE_API_KEY: API key for Google Translate + - GOOGLE_PROJECT_ID: Google Cloud project ID - AZURE_TRANSLATOR_KEY: API key for Azure Translator - AZURE_TRANSLATOR_REGION: Region for Azure Translator + - AZURE_TRANSLATOR_ENDPOINT: Endpoint URL for Azure Translator + - TRANSLATION_BATCH_SIZE: Number of texts to translate per batch + - TRANSLATION_MAX_RETRIES: Maximum retry attempts on failure + - TRANSLATION_TIMEOUT_SECONDS: Request timeout in seconds """ # Translation provider settings - translation_provider: TranslationProviderType = "openai" - target_language: str = "en" - confidence_threshold: float = Field(default=0.8, ge=0.0, le=1.0) + translation_provider: TranslationProviderType = Field( + default="llm", + validation_alias=AliasChoices("TRANSLATION_PROVIDER", "translation_provider"), + ) + target_language: str = Field( + default="en", + validation_alias=AliasChoices("TARGET_LANGUAGE", "target_language"), + ) + confidence_threshold: float = Field( + default=0.8, + ge=0.0, + le=1.0, + validation_alias=AliasChoices("CONFIDENCE_THRESHOLD", "confidence_threshold"), + ) # Google Translate settings - google_translate_api_key: Optional[str] = None - google_project_id: Optional[str] = None + google_translate_api_key: Optional[str] = Field( + default=None, + 
validation_alias=AliasChoices("GOOGLE_TRANSLATE_API_KEY", "google_translate_api_key"), + ) + google_project_id: Optional[str] = Field( + default=None, + validation_alias=AliasChoices("GOOGLE_PROJECT_ID", "google_project_id"), + ) # Azure Translator settings - azure_translator_key: Optional[str] = None - azure_translator_region: Optional[str] = None - azure_translator_endpoint: str = "https://api.cognitive.microsofttranslator.com" + azure_translator_key: Optional[str] = Field( + default=None, + validation_alias=AliasChoices("AZURE_TRANSLATOR_KEY", "azure_translator_key"), + ) + azure_translator_region: Optional[str] = Field( + default=None, + validation_alias=AliasChoices("AZURE_TRANSLATOR_REGION", "azure_translator_region"), + ) + azure_translator_endpoint: str = Field( + default="https://api.cognitive.microsofttranslator.com", + validation_alias=AliasChoices("AZURE_TRANSLATOR_ENDPOINT", "azure_translator_endpoint"), + ) - # OpenAI uses the existing LLM configuration + # LLM provider uses the existing LLM configuration - # Performance settings - batch_size: int = 10 - max_retries: int = 3 - timeout_seconds: int = 30 + # Performance settings (with TRANSLATION_ prefix for env vars) + batch_size: int = Field( + default=10, + validation_alias=AliasChoices("TRANSLATION_BATCH_SIZE", "batch_size"), + ) + max_retries: int = Field( + default=3, + validation_alias=AliasChoices("TRANSLATION_MAX_RETRIES", "max_retries"), + ) + timeout_seconds: int = Field( + default=30, + validation_alias=AliasChoices("TRANSLATION_TIMEOUT_SECONDS", "timeout_seconds"), + ) # Language detection settings min_text_length_for_detection: int = 10 diff --git a/cognee/tasks/translation/providers/__init__.py b/cognee/tasks/translation/providers/__init__.py index 2fb8480ef..f76023022 100644 --- a/cognee/tasks/translation/providers/__init__.py +++ b/cognee/tasks/translation/providers/__init__.py @@ -1,12 +1,12 @@ from .base import TranslationProvider, TranslationResult -from .openai_provider import 
OpenAITranslationProvider +from .llm_provider import LLMTranslationProvider from .google_provider import GoogleTranslationProvider from .azure_provider import AzureTranslationProvider __all__ = [ "TranslationProvider", "TranslationResult", - "OpenAITranslationProvider", + "LLMTranslationProvider", "GoogleTranslationProvider", "AzureTranslationProvider", "get_translation_provider", @@ -18,7 +18,10 @@ def get_translation_provider(provider_name: str) -> TranslationProvider: Factory function to get the appropriate translation provider. Args: - provider_name: Name of the provider ("openai", "google", or "azure") + provider_name: Name of the provider: + - "llm": Uses the configured LLM (OpenAI, Azure, Ollama, Anthropic, etc.) + - "google": Uses Google Cloud Translation API + - "azure": Uses Azure Translator API Returns: TranslationProvider instance @@ -27,7 +30,7 @@ def get_translation_provider(provider_name: str) -> TranslationProvider: ValueError: If the provider name is not recognized """ providers = { - "openai": OpenAITranslationProvider, + "llm": LLMTranslationProvider, "google": GoogleTranslationProvider, "azure": AzureTranslationProvider, } diff --git a/cognee/tasks/translation/providers/openai_provider.py b/cognee/tasks/translation/providers/llm_provider.py similarity index 79% rename from cognee/tasks/translation/providers/openai_provider.py rename to cognee/tasks/translation/providers/llm_provider.py index b0508974a..2e92811ee 100644 --- a/cognee/tasks/translation/providers/openai_provider.py +++ b/cognee/tasks/translation/providers/llm_provider.py @@ -4,6 +4,7 @@ from typing import Optional from pydantic import BaseModel from cognee.infrastructure.llm.LLMGateway import LLMGateway +from cognee.infrastructure.llm.config import get_llm_config from cognee.infrastructure.llm.prompts import read_query_prompt from cognee.shared.logging_utils import get_logger @@ -20,17 +21,24 @@ class TranslationOutput(BaseModel): translation_notes: Optional[str] = None -class 
OpenAITranslationProvider(TranslationProvider): +class LLMTranslationProvider(TranslationProvider): """ - Translation provider using OpenAI's LLM for translation. + Translation provider using the configured LLM for translation. This provider leverages the existing LLM infrastructure in Cognee - to perform translations using GPT models. + to perform translations using any LLM configured via LLM_PROVIDER + (OpenAI, Azure, Ollama, Anthropic, etc.). + + The LLM used is determined by the cognee LLM configuration settings: + - LLM_PROVIDER: The LLM provider (openai, azure, ollama, etc.) + - LLM_MODEL: The model to use + - LLM_API_KEY: API key for the provider """ @property def provider_name(self) -> str: - return "openai" + """Return 'llm' as the provider name.""" + return "llm" async def translate( self, @@ -39,7 +47,7 @@ class OpenAITranslationProvider(TranslationProvider): source_language: Optional[str] = None, ) -> TranslationResult: """ - Translate text using OpenAI's LLM. + Translate text using the configured LLM. Args: text: The text to translate @@ -92,7 +100,7 @@ class OpenAITranslationProvider(TranslationProvider): ) except Exception as e: - logger.error(f"OpenAI translation failed: {e}") + logger.error(f"LLM translation failed: {e}") raise async def translate_batch( @@ -103,7 +111,7 @@ class OpenAITranslationProvider(TranslationProvider): max_concurrent: int = 5, ) -> list[TranslationResult]: """ - Translate multiple texts using OpenAI's LLM. + Translate multiple texts using the configured LLM. Uses a semaphore to limit concurrent requests and avoid API rate limits. 
@@ -126,7 +134,10 @@ class OpenAITranslationProvider(TranslationProvider): return await asyncio.gather(*tasks) def is_available(self) -> bool: - """Check if OpenAI provider is available (has required credentials).""" - import os - - return bool(os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY")) + """Check if LLM provider is available (has required credentials).""" + try: + llm_config = get_llm_config() + # Check if API key is configured (required for most providers) + return bool(llm_config.llm_api_key) + except Exception: + return False diff --git a/cognee/tasks/translation/translate_content.py b/cognee/tasks/translation/translate_content.py index aa26306c5..984082469 100644 --- a/cognee/tasks/translation/translate_content.py +++ b/cognee/tasks/translation/translate_content.py @@ -16,7 +16,7 @@ logger = get_logger(__name__) async def translate_content( data_chunks: List[DocumentChunk], - target_language: str = "en", + target_language: str = None, translation_provider: TranslationProviderType = None, confidence_threshold: float = None, skip_if_target_language: bool = True, @@ -32,7 +32,8 @@ async def translate_content( Args: data_chunks: List of DocumentChunk objects to process target_language: Target language code (default: "en" for English) - translation_provider: Translation service to use ("openai", "google", "azure") + If not provided, uses config default + translation_provider: Translation service to use ("llm", "google", "azure") If not provided, uses config default confidence_threshold: Minimum confidence for language detection (0.0 to 1.0) If not provided, uses config default @@ -61,7 +62,7 @@ async def translate_content( # Translate with specific provider translated_chunks = await translate_content( chunks, - translation_provider="openai", + translation_provider="llm", confidence_threshold=0.9 ) ``` @@ -75,11 +76,12 @@ async def translate_content( # Get configuration config = get_translation_config() provider_name = translation_provider or 
config.translation_provider + target_lang = target_language or config.target_language threshold = confidence_threshold or config.confidence_threshold logger.info( f"Starting translation task for {len(data_chunks)} chunks " - f"using {provider_name} provider, target language: {target_language}" + f"using {provider_name} provider, target language: {target_lang}" ) # Get the translation provider @@ -100,7 +102,7 @@ async def translate_content( try: # Detect language - detection = await detect_language_async(chunk.text, target_language, threshold) + detection = await detect_language_async(chunk.text, target_lang, threshold) # Create language metadata language_metadata = LanguageMetadata( @@ -127,12 +129,12 @@ async def translate_content( # Translate the content logger.debug( - f"Translating chunk {chunk.id} from {detection.language_code} to {target_language}" + f"Translating chunk {chunk.id} from {detection.language_code} to {target_lang}" ) translation_result = await provider.translate( text=chunk.text, - target_language=target_language, + target_language=target_lang, source_language=detection.language_code, ) @@ -160,7 +162,7 @@ async def translate_content( logger.debug( f"Successfully translated chunk {chunk.id}: " - f"{detection.language_code} -> {target_language}" + f"{detection.language_code} -> {target_lang}" ) except LanguageDetectionError as e: @@ -186,7 +188,7 @@ def _add_to_chunk_contains(chunk: DocumentChunk, item) -> None: async def translate_text( text: str, - target_language: str = "en", + target_language: str = None, translation_provider: TranslationProviderType = None, source_language: Optional[str] = None, ) -> TranslationResult: @@ -198,8 +200,10 @@ async def translate_text( Args: text: The text to translate - target_language: Target language code (default: "en") + target_language: Target language code (default: uses config, typically "en") + If not provided, uses config default translation_provider: Translation service to use + If not provided, uses 
config default source_language: Source language code (optional, auto-detected if not provided) Returns: @@ -219,19 +223,20 @@ async def translate_text( """ config = get_translation_config() provider_name = translation_provider or config.translation_provider + target_lang = target_language or config.target_language provider = get_translation_provider(provider_name) return await provider.translate( text=text, - target_language=target_language, + target_language=target_lang, source_language=source_language, ) async def batch_translate_texts( texts: List[str], - target_language: str = "en", + target_language: str = None, translation_provider: TranslationProviderType = None, source_language: Optional[str] = None, ) -> List[TranslationResult]: @@ -243,8 +248,10 @@ async def batch_translate_texts( Args: texts: List of texts to translate - target_language: Target language code (default: "en") + target_language: Target language code (default: uses config, typically "en") + If not provided, uses config default translation_provider: Translation service to use + If not provided, uses config default source_language: Source language code (optional) Returns: @@ -264,11 +271,12 @@ async def batch_translate_texts( """ config = get_translation_config() provider_name = translation_provider or config.translation_provider + target_lang = target_language or config.target_language provider = get_translation_provider(provider_name) return await provider.translate_batch( texts=texts, - target_language=target_language, + target_language=target_lang, source_language=source_language, ) diff --git a/cognee/tests/tasks/translation/README.md b/cognee/tests/tasks/translation/README.md index cb56bf18a..50b589ea0 100644 --- a/cognee/tests/tasks/translation/README.md +++ b/cognee/tests/tasks/translation/README.md @@ -16,7 +16,7 @@ Unit and integration tests for the multilingual content translation feature. 
- Edge cases (empty text, short text, mixed languages) - **providers_test.py** - Tests for translation provider implementations - - OpenAI provider basic translation + - LLM provider basic translation - Auto-detection of source language - Batch translation - Special characters and formatting preservation @@ -73,6 +73,46 @@ uv run pytest cognee/tests/tasks/translation/ --cov=cognee.tasks.translation --c - LLM API key set in environment: `LLM_API_KEY=your_key` - Tests will be skipped if no API key is available +**Note:** The translation feature uses the same LLM model configured for other cognee tasks (via `LLM_MODEL` and `LLM_PROVIDER` environment variables). This means any LLM provider supported by cognee (OpenAI, Azure, Anthropic, Ollama, etc.) can be used for translation. + +## Usage Example + +```python +import cognee +from cognee.tasks.translation import translate_text + +# Configure translation (optional - defaults to LLM provider) +cognee.config.set_translation_config({ + "translation_provider": "llm", # Uses configured LLM (default) + "target_language": "en", # Target language code + "confidence_threshold": 0.7, # Minimum confidence for language detection +}) + +# Translate text directly +result = await translate_text( + text="Bonjour le monde", + target_language="en" +) +print(result.translated_text) # "Hello world" + +# Or use auto-translation in the cognify pipeline +await cognee.add("Hola, ¿cómo estás?") +await cognee.cognify(auto_translate=True) + +# Search works on translated content +results = await cognee.search("how are you") +``` + +### Alternative Translation Providers + +```python +# Use Google Cloud Translate (requires GOOGLE_TRANSLATE_API_KEY) +cognee.config.set_translation_provider("google") + +# Use Azure Translator (requires AZURE_TRANSLATOR_KEY and AZURE_TRANSLATOR_REGION) +cognee.config.set_translation_provider("azure") +``` + ## Test Summary | Test File | Tests | Description | @@ -101,7 +141,7 @@ uv run pytest cognee/tests/tasks/translation/
--cov=cognee.tasks.translation --c ### Translation Providers (9 tests) - ✅ Provider factory function -- ✅ OpenAI translation +- ✅ LLM translation - ✅ Batch operations - ✅ Auto source language detection - ✅ Long text handling diff --git a/cognee/tests/tasks/translation/config_test.py b/cognee/tests/tasks/translation/config_test.py index 80f76a5f0..d5cf4971c 100644 --- a/cognee/tests/tasks/translation/config_test.py +++ b/cognee/tests/tasks/translation/config_test.py @@ -16,7 +16,7 @@ def test_default_translation_config(): assert isinstance(config, TranslationConfig), "Config should be TranslationConfig instance" assert config.translation_provider in [ - "openai", + "llm", "google", "azure", ], f"Invalid provider: {config.translation_provider}" @@ -30,7 +30,7 @@ def test_translation_provider_type_literal(): # Get the allowed values from the Literal type allowed_values = get_args(TranslationProviderType) - assert "openai" in allowed_values, "openai should be an allowed provider" + assert "llm" in allowed_values, "llm should be an allowed provider" assert "google" in allowed_values, "google should be an allowed provider" assert "azure" in allowed_values, "azure should be an allowed provider" assert len(allowed_values) == 3, f"Expected 3 providers, got {len(allowed_values)}" @@ -38,7 +38,7 @@ def test_translation_provider_type_literal(): def test_confidence_threshold_bounds(): """Test confidence threshold validation""" - config = TranslationConfig(translation_provider="openai", confidence_threshold=0.9) + config = TranslationConfig(translation_provider="llm", confidence_threshold=0.9) assert 0.0 <= config.confidence_threshold <= 1.0, ( f"Confidence threshold {config.confidence_threshold} out of bounds [0.0, 1.0]" @@ -48,16 +48,16 @@ def test_confidence_threshold_bounds(): def test_confidence_threshold_validation(): """Test that invalid confidence thresholds are rejected or clamped""" # Test boundary values - these should work - config_min = 
TranslationConfig(translation_provider="openai", confidence_threshold=0.0) + config_min = TranslationConfig(translation_provider="llm", confidence_threshold=0.0) assert config_min.confidence_threshold == 0.0, "Minimum bound (0.0) should be valid" - config_max = TranslationConfig(translation_provider="openai", confidence_threshold=1.0) + config_max = TranslationConfig(translation_provider="llm", confidence_threshold=1.0) assert config_max.confidence_threshold == 1.0, "Maximum bound (1.0) should be valid" # Test invalid values - these should either raise ValidationError or be clamped try: config_invalid_low = TranslationConfig( - translation_provider="openai", confidence_threshold=-0.1 + translation_provider="llm", confidence_threshold=-0.1 ) # If no error, verify it was clamped to valid range assert 0.0 <= config_invalid_low.confidence_threshold <= 1.0, ( @@ -68,7 +68,7 @@ def test_confidence_threshold_validation(): try: config_invalid_high = TranslationConfig( - translation_provider="openai", confidence_threshold=1.5 + translation_provider="llm", confidence_threshold=1.5 ) # If no error, verify it was clamped to valid range assert 0.0 <= config_invalid_high.confidence_threshold <= 1.0, ( @@ -81,7 +81,7 @@ def test_confidence_threshold_validation(): def test_multiple_provider_keys(): """Test configuration with multiple provider API keys""" config = TranslationConfig( - translation_provider="openai", + translation_provider="llm", google_translate_api_key="google_key", azure_translator_key="azure_key", ) diff --git a/cognee/tests/tasks/translation/integration_test.py b/cognee/tests/tasks/translation/integration_test.py index 18e20ab4b..c62904744 100644 --- a/cognee/tests/tasks/translation/integration_test.py +++ b/cognee/tests/tasks/translation/integration_test.py @@ -13,13 +13,13 @@ from cognee.tasks.translation import translate_text from cognee.tasks.translation.detect_language import detect_language_async -def has_openai_key(): - """Check if OpenAI API key is 
available""" - return bool(os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY")) +def has_llm_api_key(): + """Check if LLM API key is available""" + return bool(os.environ.get("LLM_API_KEY")) @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_quick_translation(): """Quick smoke test for translation feature""" await prune.prune_data() @@ -32,7 +32,7 @@ async def test_quick_translation(): datasets=["spanish_test"], auto_translate=True, target_language="en", - translation_provider="openai", + translation_provider="llm", ) assert result is not None @@ -40,7 +40,7 @@ async def test_quick_translation(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translation_basic(): """Test basic translation functionality with English text""" await prune.prune_data() @@ -53,7 +53,7 @@ async def test_translation_basic(): datasets=["test_english"], auto_translate=True, target_language="en", - translation_provider="openai", + translation_provider="llm", ) assert result is not None @@ -66,7 +66,7 @@ async def test_translation_basic(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translation_spanish(): """Test translation with Spanish text""" await prune.prune_data() @@ -84,7 +84,7 @@ async def test_translation_spanish(): datasets=["test_spanish"], auto_translate=True, target_language="en", - translation_provider="openai", + translation_provider="llm", ) assert result is not None @@ -97,7 +97,7 @@ async def test_translation_spanish(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") 
+@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translation_french(): """Test translation with French text""" await prune.prune_data() @@ -128,7 +128,7 @@ async def test_translation_french(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translation_disabled(): """Test that cognify works without translation""" await prune.prune_data() @@ -146,7 +146,7 @@ async def test_translation_disabled(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translation_mixed_languages(): """Test with multiple documents in different languages""" await prune.prune_data() @@ -177,19 +177,19 @@ async def test_translation_mixed_languages(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_direct_translation_function(): """Test the translate_text convenience function directly""" result = await translate_text( text="Hola, ¿cómo estás? 
Espero que tengas un buen día.", target_language="en", - translation_provider="openai", + translation_provider="llm", ) assert result.translated_text is not None assert result.translated_text != "" assert result.target_language == "en" - assert result.provider == "openai" + assert result.provider == "llm" @pytest.mark.asyncio diff --git a/cognee/tests/tasks/translation/providers_test.py b/cognee/tests/tasks/translation/providers_test.py index 243a66fe8..0573a974f 100644 --- a/cognee/tests/tasks/translation/providers_test.py +++ b/cognee/tests/tasks/translation/providers_test.py @@ -8,22 +8,22 @@ import pytest from cognee.tasks.translation.providers import ( get_translation_provider, - OpenAITranslationProvider, + LLMTranslationProvider, TranslationResult, ) from cognee.tasks.translation.exceptions import TranslationError -def has_openai_key(): - """Check if OpenAI API key is available""" - return bool(os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY")) +def has_llm_api_key(): + """Check if LLM API key is available""" + return bool(os.environ.get("LLM_API_KEY")) @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") -async def test_openai_provider_basic_translation(): - """Test basic translation with OpenAI provider""" - provider = OpenAITranslationProvider() +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") +async def test_llm_provider_basic_translation(): + """Test basic translation with LLM provider (uses configured LLM)""" + provider = LLMTranslationProvider() result = await provider.translate(text="Hola mundo", target_language="en", source_language="es") @@ -32,14 +32,14 @@ async def test_openai_provider_basic_translation(): assert len(result.translated_text) > 0 assert result.source_language == "es" assert result.target_language == "en" - assert result.provider == "openai" + assert result.provider == "llm" @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), 
reason="No OpenAI API key available") -async def test_openai_provider_auto_detect_source(): +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") +async def test_llm_provider_auto_detect_source(): """Test translation with automatic source language detection""" - provider = OpenAITranslationProvider() + provider = LLMTranslationProvider() result = await provider.translate( text="Bonjour le monde", @@ -52,10 +52,10 @@ async def test_openai_provider_auto_detect_source(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") -async def test_openai_provider_long_text(): +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") +async def test_llm_provider_long_text(): """Test translation of longer text""" - provider = OpenAITranslationProvider() + provider = LLMTranslationProvider() long_text = """ La inteligencia artificial es una rama de la informática que se centra en @@ -71,8 +71,8 @@ async def test_openai_provider_long_text(): def test_get_translation_provider_factory(): """Test provider factory function""" - provider = get_translation_provider("openai") - assert isinstance(provider, OpenAITranslationProvider) + provider = get_translation_provider("llm") + assert isinstance(provider, LLMTranslationProvider) def test_get_translation_provider_invalid(): @@ -85,10 +85,10 @@ def test_get_translation_provider_invalid(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") -async def test_openai_batch_translation(): - """Test batch translation with OpenAI provider""" - provider = OpenAITranslationProvider() +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") +async def test_llm_batch_translation(): + """Test batch translation with LLM provider""" + provider = LLMTranslationProvider() texts = ["Hola", "¿Cómo estás?", "Adiós"] @@ -105,10 +105,10 @@ async def test_openai_batch_translation(): 
@pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translation_preserves_formatting(): """Test that translation preserves basic formatting""" - provider = OpenAITranslationProvider() + provider = LLMTranslationProvider() text_with_newlines = "Primera línea.\nSegunda línea." @@ -122,10 +122,10 @@ async def test_translation_preserves_formatting(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translation_special_characters(): """Test translation with special characters""" - provider = OpenAITranslationProvider() + provider = LLMTranslationProvider() text = "¡Hola! ¿Cómo estás? Está bien." @@ -136,10 +136,10 @@ async def test_translation_special_characters(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_empty_text_translation(): """Test translation with empty text - should return empty or handle gracefully""" - provider = OpenAITranslationProvider() + provider = LLMTranslationProvider() # Empty text may either raise an error or return an empty result try: diff --git a/cognee/tests/tasks/translation/translate_content_test.py b/cognee/tests/tasks/translation/translate_content_test.py index 35b5e60b3..87fa5b67c 100644 --- a/cognee/tests/tasks/translation/translate_content_test.py +++ b/cognee/tests/tasks/translation/translate_content_test.py @@ -13,9 +13,9 @@ from cognee.tasks.translation import translate_content from cognee.tasks.translation.models import TranslatedContent, LanguageMetadata -def has_openai_key(): - """Check if OpenAI API key is available""" - return bool(os.environ.get("LLM_API_KEY") or 
os.environ.get("OPENAI_API_KEY")) +def has_llm_api_key(): + """Check if LLM API key is available""" + return bool(os.environ.get("LLM_API_KEY")) def create_test_chunk(text: str, chunk_index: int = 0): @@ -40,7 +40,7 @@ def create_test_chunk(text: str, chunk_index: int = 0): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translate_content_basic(): """Test basic content translation""" # Create test chunk with Spanish text @@ -48,7 +48,7 @@ async def test_translate_content_basic(): chunk = create_test_chunk(original_text) result = await translate_content( - data_chunks=[chunk], target_language="en", translation_provider="openai" + data_chunks=[chunk], target_language="en", translation_provider="llm" ) assert len(result) == 1 @@ -62,7 +62,7 @@ async def test_translate_content_basic(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translate_content_preserves_original(): """Test that original text is preserved""" original_text = "Bonjour le monde" @@ -110,7 +110,7 @@ async def test_translate_content_skip_english(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translate_content_multiple_chunks(): """Test translation of multiple chunks""" # Use longer texts to ensure reliable language detection @@ -153,7 +153,7 @@ async def test_translate_content_empty_text(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translate_content_language_metadata(): """Test that LanguageMetadata is created 
correctly""" # Use a longer, distinctly Spanish text to ensure reliable detection @@ -178,7 +178,7 @@ async def test_translate_content_language_metadata(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translate_content_confidence_threshold(): """Test with custom confidence threshold""" # Use longer text for more reliable detection @@ -192,7 +192,7 @@ async def test_translate_content_confidence_threshold(): @pytest.mark.asyncio -@pytest.mark.skipif(not has_openai_key(), reason="No OpenAI API key available") +@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translate_content_no_preserve_original(): """Test translation without preserving original""" # Use longer text for more reliable detection From d70957978e2f2c2a5efbdef5e1f628f5578e7e35 Mon Sep 17 00:00:00 2001 From: andikarachman Date: Sun, 4 Jan 2026 12:07:15 +0700 Subject: [PATCH 11/16] fix: improve error handling consistency - Use TranslationProviderError instead of ValueError in azure_provider.py batch translation - Replace bare except blocks with specific ValidationError in config_test.py --- cognee/tasks/translation/providers/azure_provider.py | 5 ++++- cognee/tests/tasks/translation/config_test.py | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cognee/tasks/translation/providers/azure_provider.py b/cognee/tasks/translation/providers/azure_provider.py index 8a3fc2b01..368585ffc 100644 --- a/cognee/tasks/translation/providers/azure_provider.py +++ b/cognee/tasks/translation/providers/azure_provider.py @@ -124,7 +124,10 @@ class AzureTranslationProvider(TranslationProvider): List of TranslationResult objects """ if not self.is_available(): - raise ValueError("Azure Translator API key not configured.") + raise TranslationProviderError( + provider=self.provider_name, + message="Azure Translator 
API key not configured. Set AZURE_TRANSLATOR_KEY environment variable.", + ) endpoint = f"{self._config.azure_translator_endpoint}/translate" diff --git a/cognee/tests/tasks/translation/config_test.py b/cognee/tests/tasks/translation/config_test.py index d5cf4971c..248bf70f3 100644 --- a/cognee/tests/tasks/translation/config_test.py +++ b/cognee/tests/tasks/translation/config_test.py @@ -3,6 +3,9 @@ Unit tests for translation configuration """ from typing import get_args + +from pydantic import ValidationError + from cognee.tasks.translation.config import ( get_translation_config, TranslationConfig, @@ -63,7 +66,7 @@ def test_confidence_threshold_validation(): assert 0.0 <= config_invalid_low.confidence_threshold <= 1.0, ( f"Invalid low value should be clamped, got {config_invalid_low.confidence_threshold}" ) - except Exception: + except ValidationError: pass # Expected validation error try: @@ -74,7 +77,7 @@ def test_confidence_threshold_validation(): assert 0.0 <= config_invalid_high.confidence_threshold <= 1.0, ( f"Invalid high value should be clamped, got {config_invalid_high.confidence_threshold}" ) - except Exception: + except ValidationError: pass # Expected validation error From aac115cc8461213568de0d1ea03fb20a8625f637 Mon Sep 17 00:00:00 2001 From: andikarachman Date: Sun, 4 Jan 2026 13:07:43 +0700 Subject: [PATCH 12/16] fix(tests): resolve asyncio event loop issue in translation integration tests --- .gitignore | 3 +++ .../tasks/translation/integration_test.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/.gitignore b/.gitignore index 7c3095d08..8db408a7b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ cognee/.data/ code_pipeline_output*/ +# Test output files +test_outputs/ + *.lance/ .DS_Store # Byte-compiled / optimized / DLL files diff --git a/cognee/tests/tasks/translation/integration_test.py b/cognee/tests/tasks/translation/integration_test.py index c62904744..54d9b6418 100644 --- 
a/cognee/tests/tasks/translation/integration_test.py +++ b/cognee/tests/tasks/translation/integration_test.py @@ -18,10 +18,24 @@ def has_llm_api_key(): return bool(os.environ.get("LLM_API_KEY")) +async def reset_engines(): + """Reset cached engines to avoid event loop issues between tests.""" + from cognee.infrastructure.databases.graph.get_graph_engine import create_graph_engine + from cognee.infrastructure.databases.relational.create_relational_engine import ( + create_relational_engine, + ) + from cognee.infrastructure.databases.vector.create_vector_engine import create_vector_engine + + create_graph_engine.cache_clear() + create_vector_engine.cache_clear() + create_relational_engine.cache_clear() + + @pytest.mark.asyncio @pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_quick_translation(): """Quick smoke test for translation feature""" + await reset_engines() await prune.prune_data() await prune.prune_system(metadata=True) @@ -43,6 +57,7 @@ async def test_quick_translation(): @pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translation_basic(): """Test basic translation functionality with English text""" + await reset_engines() await prune.prune_data() await prune.prune_system(metadata=True) @@ -69,6 +84,7 @@ async def test_translation_basic(): @pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translation_spanish(): """Test translation with Spanish text""" + await reset_engines() await prune.prune_data() await prune.prune_system(metadata=True) @@ -100,6 +116,7 @@ async def test_translation_spanish(): @pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translation_french(): """Test translation with French text""" + await reset_engines() await prune.prune_data() await prune.prune_system(metadata=True) @@ -131,6 +148,7 @@ async def test_translation_french(): @pytest.mark.skipif(not 
has_llm_api_key(), reason="No LLM API key available") async def test_translation_disabled(): """Test that cognify works without translation""" + await reset_engines() await prune.prune_data() await prune.prune_system(metadata=True) @@ -149,6 +167,7 @@ async def test_translation_disabled(): @pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_translation_mixed_languages(): """Test with multiple documents in different languages""" + await reset_engines() await prune.prune_data() await prune.prune_system(metadata=True) From d2f98fe8805b5d88d6acbdc9b76a5e025870c517 Mon Sep 17 00:00:00 2001 From: andikarachman Date: Tue, 13 Jan 2026 19:08:15 +0700 Subject: [PATCH 13/16] refactor: remove auto-translate from cognify pipeline The translation module is kept as a standalone task that can be used independently. Users can manually add translate_content to their custom pipelines if needed. --- cognee/api/v1/cognify/cognify.py | 129 ++++++------------------------- 1 file changed, 25 insertions(+), 104 deletions(-) diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index f9e01084c..ffc903d68 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -26,8 +26,6 @@ from cognee.tasks.documents import ( from cognee.tasks.graph import extract_graph_from_data from cognee.tasks.storage import add_data_points from cognee.tasks.summarization import summarize_text -from cognee.tasks.translation import translate_content -from cognee.tasks.translation.config import TranslationProviderType from cognee.modules.pipelines.layers.pipeline_execution_mode import get_pipeline_executor from cognee.tasks.temporal_graph.extract_events_and_entities import extract_events_and_timestamps from cognee.tasks.temporal_graph.extract_knowledge_graph_from_events import ( @@ -55,9 +53,6 @@ async def cognify( custom_prompt: Optional[str] = None, temporal_cognify: bool = False, data_per_batch: int = 20, - 
auto_translate: bool = False, - target_language: str = "en", - translation_provider: TranslationProviderType = None, **kwargs, ): """ @@ -123,15 +118,6 @@ async def cognify( If provided, this prompt will be used instead of the default prompts for knowledge graph extraction. The prompt should guide the LLM on how to extract entities and relationships from the text content. - auto_translate: If True, automatically detect and translate non-English content to the - target language before processing. Uses language detection to identify - content that needs translation. Defaults to False. - target_language: Target language code for translation (e.g., "en", "es", "fr"). - Only used when auto_translate=True. Defaults to "en" (English). - translation_provider: Translation service to use ("llm", "google", "azure"). - LLM uses the existing LLM infrastructure, Google requires - GOOGLE_TRANSLATE_API_KEY, Azure requires AZURE_TRANSLATOR_KEY. - If not specified, uses TRANSLATION_PROVIDER env var or defaults to "llm". 
Returns: Union[dict, list[PipelineRunInfo]]: @@ -196,14 +182,6 @@ async def cognify( run_in_background=True ) # Check status later with run_info.pipeline_run_id - - # Auto-translate multilingual content to English - await cognee.add("document_spanish.pdf") - await cognee.cognify( - auto_translate=True, - target_language="en", - translation_provider="llm" # or "google", "azure" - ) ``` @@ -215,9 +193,6 @@ async def cognify( - LLM_PROVIDER, LLM_MODEL, VECTOR_DB_PROVIDER, GRAPH_DATABASE_PROVIDER - LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False) - LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60) - - TRANSLATION_PROVIDER: Default translation provider ("llm", "google", "azure") - - GOOGLE_TRANSLATE_API_KEY: API key for Google Translate - - AZURE_TRANSLATOR_KEY: API key for Azure Translator """ if config is None: ontology_config = get_ontology_env_config() @@ -238,13 +213,7 @@ async def cognify( if temporal_cognify: tasks = await get_temporal_tasks( - user=user, - chunker=chunker, - chunk_size=chunk_size, - chunks_per_batch=chunks_per_batch, - auto_translate=auto_translate, - target_language=target_language, - translation_provider=translation_provider, + user=user, chunker=chunker, chunk_size=chunk_size, chunks_per_batch=chunks_per_batch ) else: tasks = await get_default_tasks( @@ -255,9 +224,6 @@ async def cognify( config=config, custom_prompt=custom_prompt, chunks_per_batch=chunks_per_batch, - auto_translate=auto_translate, - target_language=target_language, - translation_provider=translation_provider, **kwargs, ) @@ -287,9 +253,6 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's config: Config = None, custom_prompt: Optional[str] = None, chunks_per_batch: int = 100, - auto_translate: bool = False, - target_language: str = "en", - translation_provider: TranslationProviderType = None, **kwargs, ) -> list[Task]: if config is None: @@ -322,52 +285,30 @@ async def get_default_tasks( # TODO: Find out a better way 
to do this (Boris's max_chunk_size=chunk_size or get_max_chunk_tokens(), chunker=chunker, ), # Extract text chunks based on the document type. + Task( + extract_graph_from_data, + graph_model=graph_model, + config=config, + custom_prompt=custom_prompt, + task_config={"batch_size": chunks_per_batch}, + **kwargs, + ), # Generate knowledge graphs from the document chunks. + Task( + summarize_text, + task_config={"batch_size": chunks_per_batch}, + ), + Task( + add_data_points, + embed_triplets=embed_triplets, + task_config={"batch_size": chunks_per_batch}, + ), ] - # Add translation task if auto_translate is enabled - if auto_translate: - default_tasks.append( - Task( - translate_content, - target_language=target_language, - translation_provider=translation_provider, - task_config={"batch_size": chunks_per_batch}, - ) - ) - - default_tasks.extend( - [ - Task( - extract_graph_from_data, - graph_model=graph_model, - config=config, - custom_prompt=custom_prompt, - task_config={"batch_size": chunks_per_batch}, - **kwargs, - ), # Generate knowledge graphs from the document chunks. - Task( - summarize_text, - task_config={"batch_size": chunks_per_batch}, - ), - Task( - add_data_points, - embed_triplets=embed_triplets, - task_config={"batch_size": chunks_per_batch}, - ), - ] - ) - return default_tasks async def get_temporal_tasks( - user: User = None, - chunker=TextChunker, - chunk_size: int = None, - chunks_per_batch: int = 10, - auto_translate: bool = False, - target_language: str = "en", - translation_provider: TranslationProviderType = None, + user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = 10 ) -> list[Task]: """ Builds and returns a list of temporal processing tasks to be executed in sequence. @@ -375,19 +316,15 @@ async def get_temporal_tasks( The pipeline includes: 1. Document classification. 2. Document chunking with a specified or default chunk size. - 3. (Optional) Translation of non-English content to target language. - 4. 
Event and timestamp extraction from chunks. - 5. Knowledge graph extraction from events. - 6. Batched insertion of data points. + 3. Event and timestamp extraction from chunks. + 4. Knowledge graph extraction from events. + 5. Batched insertion of data points. Args: user (User, optional): The user requesting task execution. chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker. chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default. chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify - auto_translate (bool, optional): If True, translate non-English content. Defaults to False. - target_language (str, optional): Target language for translation. Defaults to "en". - translation_provider (str, optional): Translation provider to use ("llm", "google", "azure"). Returns: list[Task]: A list of Task objects representing the temporal processing pipeline. @@ -402,25 +339,9 @@ async def get_temporal_tasks( max_chunk_size=chunk_size or get_max_chunk_tokens(), chunker=chunker, ), + Task(extract_events_and_timestamps, task_config={"batch_size": chunks_per_batch}), + Task(extract_knowledge_graph_from_events), + Task(add_data_points, task_config={"batch_size": chunks_per_batch}), ] - # Add translation task if auto_translate is enabled - if auto_translate: - temporal_tasks.append( - Task( - translate_content, - target_language=target_language, - translation_provider=translation_provider, - task_config={"batch_size": chunks_per_batch}, - ) - ) - - temporal_tasks.extend( - [ - Task(extract_events_and_timestamps, task_config={"batch_size": chunks_per_batch}), - Task(extract_knowledge_graph_from_events), - Task(add_data_points, task_config={"batch_size": chunks_per_batch}), - ] - ) - return temporal_tasks From eecebb9e2d96dbaa6965f859243794f20645bb25 Mon Sep 17 00:00:00 2001 From: andikarachman Date: Tue, 13 Jan 2026 19:10:52 +0700 Subject: [PATCH 14/16] test: 
remove cognify pipeline tests from translation integration tests Keep only standalone translation module tests (translate_text and language detection) that don't depend on the cognify pipeline. --- .../tasks/translation/integration_test.py | 180 +----------------- 1 file changed, 1 insertion(+), 179 deletions(-) diff --git a/cognee/tests/tasks/translation/integration_test.py b/cognee/tests/tasks/translation/integration_test.py index 54d9b6418..21791b272 100644 --- a/cognee/tests/tasks/translation/integration_test.py +++ b/cognee/tests/tasks/translation/integration_test.py @@ -1,14 +1,13 @@ """ Integration tests for multilingual content translation feature. -Tests the full cognify pipeline with translation enabled. +Tests the translation module standalone functionality. """ import os import pytest -from cognee import add, cognify, prune, search, SearchType from cognee.tasks.translation import translate_text from cognee.tasks.translation.detect_language import detect_language_async @@ -18,183 +17,6 @@ def has_llm_api_key(): return bool(os.environ.get("LLM_API_KEY")) -async def reset_engines(): - """Reset cached engines to avoid event loop issues between tests.""" - from cognee.infrastructure.databases.graph.get_graph_engine import create_graph_engine - from cognee.infrastructure.databases.relational.create_relational_engine import ( - create_relational_engine, - ) - from cognee.infrastructure.databases.vector.create_vector_engine import create_vector_engine - - create_graph_engine.cache_clear() - create_vector_engine.cache_clear() - create_relational_engine.cache_clear() - - -@pytest.mark.asyncio -@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") -async def test_quick_translation(): - """Quick smoke test for translation feature""" - await reset_engines() - await prune.prune_data() - await prune.prune_system(metadata=True) - - spanish_text = "La inteligencia artificial está transformando el mundo de la tecnología." 
- await add(spanish_text, dataset_name="spanish_test") - - result = await cognify( - datasets=["spanish_test"], - auto_translate=True, - target_language="en", - translation_provider="llm", - ) - - assert result is not None - assert len(result) > 0 - - -@pytest.mark.asyncio -@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") -async def test_translation_basic(): - """Test basic translation functionality with English text""" - await reset_engines() - await prune.prune_data() - await prune.prune_system(metadata=True) - - english_text = "Hello, this is a test document about artificial intelligence." - await add(english_text, dataset_name="test_english") - - result = await cognify( - datasets=["test_english"], - auto_translate=True, - target_language="en", - translation_provider="llm", - ) - - assert result is not None - - search_results = await search( - query_text="What is this document about?", - query_type=SearchType.SUMMARIES, - ) - assert search_results is not None - - -@pytest.mark.asyncio -@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") -async def test_translation_spanish(): - """Test translation with Spanish text""" - await reset_engines() - await prune.prune_data() - await prune.prune_system(metadata=True) - - spanish_text = """ - La inteligencia artificial es una rama de la informática que se centra en - crear sistemas capaces de realizar tareas que normalmente requieren inteligencia humana. - Estos sistemas pueden aprender, razonar y resolver problemas complejos. 
- """ - - await add(spanish_text, dataset_name="test_spanish") - - result = await cognify( - datasets=["test_spanish"], - auto_translate=True, - target_language="en", - translation_provider="llm", - ) - - assert result is not None - - search_results = await search( - query_text="What is artificial intelligence?", - query_type=SearchType.SUMMARIES, - ) - assert search_results is not None - - -@pytest.mark.asyncio -@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") -async def test_translation_french(): - """Test translation with French text""" - await reset_engines() - await prune.prune_data() - await prune.prune_system(metadata=True) - - french_text = """ - L'apprentissage automatique est une méthode d'analyse de données qui - automatise la construction de modèles analytiques. C'est une branche - de l'intelligence artificielle basée sur l'idée que les systèmes peuvent - apprendre à partir de données, identifier des modèles et prendre des décisions. - """ - - await add(french_text, dataset_name="test_french") - - result = await cognify( - datasets=["test_french"], - auto_translate=True, - target_language="en", - ) - - assert result is not None - - search_results = await search( - query_text="What is machine learning?", - query_type=SearchType.SUMMARIES, - ) - assert search_results is not None - - -@pytest.mark.asyncio -@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") -async def test_translation_disabled(): - """Test that cognify works without translation""" - await reset_engines() - await prune.prune_data() - await prune.prune_system(metadata=True) - - text = "This is a baseline test without translation enabled." 
- await add(text, dataset_name="test_baseline") - - result = await cognify( - datasets=["test_baseline"], - auto_translate=False, - ) - - assert result is not None - - -@pytest.mark.asyncio -@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") -async def test_translation_mixed_languages(): - """Test with multiple documents in different languages""" - await reset_engines() - await prune.prune_data() - await prune.prune_system(metadata=True) - - texts = [ - "Artificial intelligence is transforming the world.", - "La tecnología está cambiando nuestras vidas.", - "Les ordinateurs deviennent de plus en plus puissants.", - ] - - for text in texts: - await add(text, dataset_name="test_mixed") - - result = await cognify( - datasets=["test_mixed"], - auto_translate=True, - target_language="en", - ) - - assert result is not None - - search_results = await search( - query_text="What topics are discussed?", - query_type=SearchType.SUMMARIES, - ) - assert search_results is not None - - @pytest.mark.asyncio @pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") async def test_direct_translation_function(): From ce57451403bcd1ba5d54a9e34a31b7032c0dc366 Mon Sep 17 00:00:00 2001 From: andikarachman Date: Tue, 13 Jan 2026 19:13:27 +0700 Subject: [PATCH 15/16] docs: update README to reflect removed cognify pipeline tests --- cognee/tests/tasks/translation/README.md | 29 ++++++------------------ 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/cognee/tests/tasks/translation/README.md b/cognee/tests/tasks/translation/README.md index 50b589ea0..6962822cc 100644 --- a/cognee/tests/tasks/translation/README.md +++ b/cognee/tests/tasks/translation/README.md @@ -30,12 +30,9 @@ Unit and integration tests for the multilingual content translation feature. 
- Skip translation for target language - Confidence threshold customization -- **integration_test.py** - End-to-end integration tests - - Full cognify pipeline with translation - - Spanish/French to English translation - - Mixed language datasets - - Search functionality after translation - - Translation disabled mode +- **integration_test.py** - Standalone translation module tests + - Direct translate_text function + - Language detection functionality ## Running Tests @@ -94,13 +91,6 @@ result = await translate_text( target_language="en" ) print(result.translated_text) # "Hello world" - -# Or use auto-translation in the cognify pipeline -await cognee.add("Hola, ¿cómo estás?") -await cognee.cognify(auto_translate=True) - -# Search works on translated content -results = await cognee.search("how are you") ``` ### Alternative Translation Providers @@ -121,8 +111,8 @@ cognee.config.set_translation_provider("azure") | detect_language_test.py | 10 | Language detection | | providers_test.py | 9 | Translation providers | | translate_content_test.py | 9 | Content translation task | -| integration_test.py | 8 | End-to-end pipeline | -| **Total** | **40** | | +| integration_test.py | 2 | Standalone translation tests | +| **Total** | **34** | | ## Test Categories @@ -156,11 +146,6 @@ cognee.config.set_translation_provider("azure") - ✅ Empty text/list handling - ✅ Confidence threshold customization -### Integration (8 tests) -- ✅ Full cognify pipeline with auto_translate=True -- ✅ Spanish to English translation -- ✅ French to English translation -- ✅ Mixed language datasets -- ✅ Translation disabled mode +### Integration (2 tests) - ✅ Direct translate_text function -- ✅ Search after translation +- ✅ Language detection functionality From 3dca104bdfaab23470e118d8a1efe52e09aa58bb Mon Sep 17 00:00:00 2001 From: andikarachman Date: Tue, 13 Jan 2026 19:16:06 +0700 Subject: [PATCH 16/16] refactor: remove integration_test.py and update README --- 
cognee/tests/tasks/translation/README.md | 4 -- .../tasks/translation/integration_test.py | 52 ------------------- 2 files changed, 56 deletions(-) delete mode 100644 cognee/tests/tasks/translation/integration_test.py diff --git a/cognee/tests/tasks/translation/README.md b/cognee/tests/tasks/translation/README.md index 6962822cc..075dc71db 100644 --- a/cognee/tests/tasks/translation/README.md +++ b/cognee/tests/tasks/translation/README.md @@ -30,10 +30,6 @@ Unit and integration tests for the multilingual content translation feature. - Skip translation for target language - Confidence threshold customization -- **integration_test.py** - Standalone translation module tests - - Direct translate_text function - - Language detection functionality - ## Running Tests ### Run all translation tests diff --git a/cognee/tests/tasks/translation/integration_test.py b/cognee/tests/tasks/translation/integration_test.py deleted file mode 100644 index 21791b272..000000000 --- a/cognee/tests/tasks/translation/integration_test.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Integration tests for multilingual content translation feature. - -Tests the translation module standalone functionality. -""" - -import os - -import pytest - -from cognee.tasks.translation import translate_text -from cognee.tasks.translation.detect_language import detect_language_async - - -def has_llm_api_key(): - """Check if LLM API key is available""" - return bool(os.environ.get("LLM_API_KEY")) - - -@pytest.mark.asyncio -@pytest.mark.skipif(not has_llm_api_key(), reason="No LLM API key available") -async def test_direct_translation_function(): - """Test the translate_text convenience function directly""" - result = await translate_text( - text="Hola, ¿cómo estás? 
Espero que tengas un buen día.", - target_language="en", - translation_provider="llm", - ) - - assert result.translated_text is not None - assert result.translated_text != "" - assert result.target_language == "en" - assert result.provider == "llm" - - -@pytest.mark.asyncio -async def test_language_detection(): - """Test language detection directly""" - test_texts = [ - ("Hello world, how are you doing today?", "en", False), - ("Bonjour le monde, comment allez-vous aujourd'hui?", "en", True), - ("Hola mundo, cómo estás hoy?", "en", True), - ("This is already in English language", "en", False), - ] - - for text, target_lang, should_translate in test_texts: - result = await detect_language_async(text, target_lang) - assert result.language_code is not None - assert result.confidence > 0.0 - # Only check requires_translation for high-confidence detections - if result.confidence > 0.8: - assert result.requires_translation == should_translate