diff --git a/cognee/tasks/translation/config.py b/cognee/tasks/translation/config.py index b4fe7902d..b8c3da485 100644 --- a/cognee/tasks/translation/config.py +++ b/cognee/tasks/translation/config.py @@ -63,3 +63,8 @@ class TranslationConfig(BaseSettings): def get_translation_config() -> TranslationConfig: """Get the translation configuration singleton.""" return TranslationConfig() + + +def clear_translation_config_cache(): + """Clear the cached config for testing purposes.""" + get_translation_config.cache_clear() diff --git a/cognee/tasks/translation/detect_language.py b/cognee/tasks/translation/detect_language.py index 00b0bf012..a474f7144 100644 --- a/cognee/tasks/translation/detect_language.py +++ b/cognee/tasks/translation/detect_language.py @@ -169,7 +169,7 @@ def detect_language( async def detect_language_async( text: str, target_language: str = "en", - confidence_threshold: float = None, + confidence_threshold: Optional[float] = None, ) -> LanguageDetectionResult: """ Async wrapper for language detection. diff --git a/cognee/tasks/translation/exceptions.py b/cognee/tasks/translation/exceptions.py index d5db128de..3ab197fce 100644 --- a/cognee/tasks/translation/exceptions.py +++ b/cognee/tasks/translation/exceptions.py @@ -5,6 +5,8 @@ class TranslationError(Exception): self.message = message self.original_error = original_error super().__init__(self.message) + if original_error: + self.__cause__ = original_error class LanguageDetectionError(TranslationError): diff --git a/cognee/tasks/translation/providers/azure_provider.py b/cognee/tasks/translation/providers/azure_provider.py index 455f57e3a..8a3fc2b01 100644 --- a/cognee/tasks/translation/providers/azure_provider.py +++ b/cognee/tasks/translation/providers/azure_provider.py @@ -98,7 +98,11 @@ class AzureTranslationProvider(TranslationProvider): except Exception as e: logger.error(f"Azure translation failed: {e}") - raise + raise TranslationProviderError( + provider=self.provider_name, + message=f"Translation failed: {e}", + original_error=e, + ) async def translate_batch( self, @@ -176,6 +180,10 @@ class AzureTranslationProvider(TranslationProvider): except Exception as e: logger.error(f"Azure batch translation failed: {e}") - raise + raise TranslationProviderError( + provider=self.provider_name, + message=f"Batch translation failed: {e}", + original_error=e, + ) return all_results diff --git a/cognee/tasks/translation/providers/base.py b/cognee/tasks/translation/providers/base.py index f7862d3c1..37c6744b4 100644 --- a/cognee/tasks/translation/providers/base.py +++ b/cognee/tasks/translation/providers/base.py @@ -18,7 +18,8 @@ class TranslationResult: translated_text: str source_language: str target_language: str - confidence_score: float + # Confidence score from the provider, or None if not available (e.g., Google Translate) + confidence_score: Optional[float] provider: str raw_response: Optional[dict] = None diff --git a/cognee/tasks/translation/providers/google_provider.py b/cognee/tasks/translation/providers/google_provider.py index f007575cd..d6b16545c 100644 --- a/cognee/tasks/translation/providers/google_provider.py +++ b/cognee/tasks/translation/providers/google_provider.py @@ -75,17 +75,15 @@ class GoogleTranslationProvider(TranslationProvider): # Run in thread pool since google-cloud-translate is synchronous loop = asyncio.get_running_loop() + # Build kwargs for translate call + translate_kwargs = {"target_language": target_language} if source_language: - result = await loop.run_in_executor( - None, - lambda: client.translate( - text, target_language=target_language, source_language=source_language - ), - ) - else: - result = await loop.run_in_executor( - None, lambda: client.translate(text, target_language=target_language) - ) + translate_kwargs["source_language"] = source_language + + result = await loop.run_in_executor( + None, + lambda: client.translate(text, **translate_kwargs), + ) detected_language = result.get("detectedSourceLanguage", source_language or "unknown") @@ -93,7 +91,8 @@ class GoogleTranslationProvider(TranslationProvider): translated_text=result["translatedText"], source_language=detected_language, target_language=target_language, - confidence_score=0.9, # Google Translate is generally reliable + # Google Translate API does not provide confidence scores + confidence_score=None, provider=self.provider_name, raw_response=result, ) @@ -125,17 +124,15 @@ class GoogleTranslationProvider(TranslationProvider): client = self._get_client() loop = asyncio.get_running_loop() + # Build kwargs for translate call + translate_kwargs = {"target_language": target_language} if source_language: - results = await loop.run_in_executor( - None, - lambda: client.translate( - texts, target_language=target_language, source_language=source_language - ), - ) - else: - results = await loop.run_in_executor( - None, lambda: client.translate(texts, target_language=target_language) - ) + translate_kwargs["source_language"] = source_language + + results = await loop.run_in_executor( + None, + lambda: client.translate(texts, **translate_kwargs), + ) translation_results = [] for result in results: @@ -147,7 +144,8 @@ class GoogleTranslationProvider(TranslationProvider): translated_text=result["translatedText"], source_language=detected_language, target_language=target_language, - confidence_score=0.9, + # Google Translate API does not provide confidence scores + confidence_score=None, provider=self.provider_name, raw_response=result, ) diff --git a/cognee/tasks/translation/translate_content.py b/cognee/tasks/translation/translate_content.py index fcf6ae430..aa26306c5 100644 --- a/cognee/tasks/translation/translate_content.py +++ b/cognee/tasks/translation/translate_content.py @@ -87,8 +87,13 @@ async def translate_content( # Process chunks processed_chunks = [] + total_chunks = len(data_chunks) + + for chunk_index, chunk in enumerate(data_chunks): + # Log progress for large batches + if chunk_index > 0 and chunk_index % 100 == 0: + logger.info(f"Translation progress: {chunk_index}/{total_chunks} chunks processed") - for chunk in data_chunks: if not hasattr(chunk, "text") or not chunk.text: processed_chunks.append(chunk) continue diff --git a/cognee/tests/tasks/translation/integration_test.py b/cognee/tests/tasks/translation/integration_test.py index ff2877959..18e20ab4b 100644 --- a/cognee/tests/tasks/translation/integration_test.py +++ b/cognee/tests/tasks/translation/integration_test.py @@ -36,6 +36,7 @@ async def test_quick_translation(): ) assert result is not None + assert len(result) > 0 @pytest.mark.asyncio diff --git a/pyproject.toml b/pyproject.toml index 3b37c27c3..078935dc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ dependencies = [ "fakeredis[lua]>=2.32.0", "diskcache>=5.6.3", "aiolimiter>=1.2.1", + "langdetect>=1.0.9", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index c2d97aa12..09d31be74 100644 --- a/uv.lock +++ b/uv.lock @@ -965,6 +965,7 @@ dependencies = [ { name = "jinja2" }, { name = "kuzu" }, { name = "lancedb" }, + { name = "langdetect" }, { name = "limits" }, { name = "litellm" }, { name = "mistralai" }, @@ -1160,6 +1161,7 @@ requires-dist = [ { name = "lancedb", specifier = ">=0.24.0,<1.0.0" }, { name = "langchain-aws", marker = "extra == 'neptune'", specifier = ">=0.2.22" }, { name = "langchain-text-splitters", marker = "extra == 'langchain'", specifier = ">=0.3.2,<1.0.0" }, + { name = "langdetect", specifier = ">=1.0.9" }, { name = "langfuse", marker = "extra == 'monitoring'", specifier = ">=2.32.0,<3" }, { name = "langsmith", marker = "extra == 'langchain'", specifier = ">=0.2.3,<1.0.0" }, { name = "limits", specifier = ">=4.4.1,<5" },