feat(translation): implement multilingual content translation task
- Add translation module with OpenAI, Google, Azure provider support - Implement language detection using langdetect - Add TranslatedContent and LanguageMetadata models - Integrate translation task into cognify pipeline - Add auto_translate parameter to cognify() function - Preserve original text alongside translations - Support custom translation providers and target languages
This commit is contained in:
parent
5b42b21af5
commit
db0818cd33
13 changed files with 1397 additions and 7 deletions
|
|
@ -26,6 +26,8 @@ from cognee.tasks.documents import (
|
||||||
from cognee.tasks.graph import extract_graph_from_data
|
from cognee.tasks.graph import extract_graph_from_data
|
||||||
from cognee.tasks.storage import add_data_points
|
from cognee.tasks.storage import add_data_points
|
||||||
from cognee.tasks.summarization import summarize_text
|
from cognee.tasks.summarization import summarize_text
|
||||||
|
from cognee.tasks.translation import translate_content
|
||||||
|
from cognee.tasks.translation.config import TranslationProviderType
|
||||||
from cognee.modules.pipelines.layers.pipeline_execution_mode import get_pipeline_executor
|
from cognee.modules.pipelines.layers.pipeline_execution_mode import get_pipeline_executor
|
||||||
from cognee.tasks.temporal_graph.extract_events_and_entities import extract_events_and_timestamps
|
from cognee.tasks.temporal_graph.extract_events_and_entities import extract_events_and_timestamps
|
||||||
from cognee.tasks.temporal_graph.extract_knowledge_graph_from_events import (
|
from cognee.tasks.temporal_graph.extract_knowledge_graph_from_events import (
|
||||||
|
|
@ -53,6 +55,9 @@ async def cognify(
|
||||||
custom_prompt: Optional[str] = None,
|
custom_prompt: Optional[str] = None,
|
||||||
temporal_cognify: bool = False,
|
temporal_cognify: bool = False,
|
||||||
data_per_batch: int = 20,
|
data_per_batch: int = 20,
|
||||||
|
auto_translate: bool = False,
|
||||||
|
target_language: str = "en",
|
||||||
|
translation_provider: TranslationProviderType = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
|
@ -118,6 +123,15 @@ async def cognify(
|
||||||
If provided, this prompt will be used instead of the default prompts for
|
If provided, this prompt will be used instead of the default prompts for
|
||||||
knowledge graph extraction. The prompt should guide the LLM on how to
|
knowledge graph extraction. The prompt should guide the LLM on how to
|
||||||
extract entities and relationships from the text content.
|
extract entities and relationships from the text content.
|
||||||
|
auto_translate: If True, automatically detect and translate non-English content to the
|
||||||
|
target language before processing. Uses language detection to identify
|
||||||
|
content that needs translation. Defaults to False.
|
||||||
|
target_language: Target language code for translation (e.g., "en", "es", "fr").
|
||||||
|
Only used when auto_translate=True. Defaults to "en" (English).
|
||||||
|
translation_provider: Translation service to use ("openai", "google", "azure").
|
||||||
|
OpenAI uses the existing LLM infrastructure, Google requires
|
||||||
|
GOOGLE_TRANSLATE_API_KEY, Azure requires AZURE_TRANSLATOR_KEY.
|
||||||
|
If not specified, uses TRANSLATION_PROVIDER env var or defaults to "openai".
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Union[dict, list[PipelineRunInfo]]:
|
Union[dict, list[PipelineRunInfo]]:
|
||||||
|
|
@ -182,6 +196,14 @@ async def cognify(
|
||||||
run_in_background=True
|
run_in_background=True
|
||||||
)
|
)
|
||||||
# Check status later with run_info.pipeline_run_id
|
# Check status later with run_info.pipeline_run_id
|
||||||
|
|
||||||
|
# Auto-translate multilingual content to English
|
||||||
|
await cognee.add("document_spanish.pdf")
|
||||||
|
await cognee.cognify(
|
||||||
|
auto_translate=True,
|
||||||
|
target_language="en",
|
||||||
|
translation_provider="openai" # or "google", "azure"
|
||||||
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -193,6 +215,9 @@ async def cognify(
|
||||||
- LLM_PROVIDER, LLM_MODEL, VECTOR_DB_PROVIDER, GRAPH_DATABASE_PROVIDER
|
- LLM_PROVIDER, LLM_MODEL, VECTOR_DB_PROVIDER, GRAPH_DATABASE_PROVIDER
|
||||||
- LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False)
|
- LLM_RATE_LIMIT_ENABLED: Enable rate limiting (default: False)
|
||||||
- LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60)
|
- LLM_RATE_LIMIT_REQUESTS: Max requests per interval (default: 60)
|
||||||
|
- TRANSLATION_PROVIDER: Default translation provider ("openai", "google", "azure")
|
||||||
|
- GOOGLE_TRANSLATE_API_KEY: API key for Google Translate
|
||||||
|
- AZURE_TRANSLATOR_KEY: API key for Azure Translator
|
||||||
"""
|
"""
|
||||||
if config is None:
|
if config is None:
|
||||||
ontology_config = get_ontology_env_config()
|
ontology_config = get_ontology_env_config()
|
||||||
|
|
@ -213,7 +238,13 @@ async def cognify(
|
||||||
|
|
||||||
if temporal_cognify:
|
if temporal_cognify:
|
||||||
tasks = await get_temporal_tasks(
|
tasks = await get_temporal_tasks(
|
||||||
user=user, chunker=chunker, chunk_size=chunk_size, chunks_per_batch=chunks_per_batch
|
user=user,
|
||||||
|
chunker=chunker,
|
||||||
|
chunk_size=chunk_size,
|
||||||
|
chunks_per_batch=chunks_per_batch,
|
||||||
|
auto_translate=auto_translate,
|
||||||
|
target_language=target_language,
|
||||||
|
translation_provider=translation_provider,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
tasks = await get_default_tasks(
|
tasks = await get_default_tasks(
|
||||||
|
|
@ -224,6 +255,9 @@ async def cognify(
|
||||||
config=config,
|
config=config,
|
||||||
custom_prompt=custom_prompt,
|
custom_prompt=custom_prompt,
|
||||||
chunks_per_batch=chunks_per_batch,
|
chunks_per_batch=chunks_per_batch,
|
||||||
|
auto_translate=auto_translate,
|
||||||
|
target_language=target_language,
|
||||||
|
translation_provider=translation_provider,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -253,6 +287,9 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
|
||||||
config: Config = None,
|
config: Config = None,
|
||||||
custom_prompt: Optional[str] = None,
|
custom_prompt: Optional[str] = None,
|
||||||
chunks_per_batch: int = 100,
|
chunks_per_batch: int = 100,
|
||||||
|
auto_translate: bool = False,
|
||||||
|
target_language: str = "en",
|
||||||
|
translation_provider: TranslationProviderType = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> list[Task]:
|
) -> list[Task]:
|
||||||
if config is None:
|
if config is None:
|
||||||
|
|
@ -285,6 +322,20 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
|
||||||
max_chunk_size=chunk_size or get_max_chunk_tokens(),
|
max_chunk_size=chunk_size or get_max_chunk_tokens(),
|
||||||
chunker=chunker,
|
chunker=chunker,
|
||||||
), # Extract text chunks based on the document type.
|
), # Extract text chunks based on the document type.
|
||||||
|
]
|
||||||
|
|
||||||
|
# Add translation task if auto_translate is enabled
|
||||||
|
if auto_translate:
|
||||||
|
default_tasks.append(
|
||||||
|
Task(
|
||||||
|
translate_content,
|
||||||
|
target_language=target_language,
|
||||||
|
translation_provider=translation_provider,
|
||||||
|
task_config={"batch_size": chunks_per_batch},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
default_tasks.extend([
|
||||||
Task(
|
Task(
|
||||||
extract_graph_from_data,
|
extract_graph_from_data,
|
||||||
graph_model=graph_model,
|
graph_model=graph_model,
|
||||||
|
|
@ -302,13 +353,19 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
|
||||||
embed_triplets=embed_triplets,
|
embed_triplets=embed_triplets,
|
||||||
task_config={"batch_size": chunks_per_batch},
|
task_config={"batch_size": chunks_per_batch},
|
||||||
),
|
),
|
||||||
]
|
])
|
||||||
|
|
||||||
return default_tasks
|
return default_tasks
|
||||||
|
|
||||||
|
|
||||||
async def get_temporal_tasks(
|
async def get_temporal_tasks(
|
||||||
user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = 10
|
user: User = None,
|
||||||
|
chunker=TextChunker,
|
||||||
|
chunk_size: int = None,
|
||||||
|
chunks_per_batch: int = 10,
|
||||||
|
auto_translate: bool = False,
|
||||||
|
target_language: str = "en",
|
||||||
|
translation_provider: TranslationProviderType = None,
|
||||||
) -> list[Task]:
|
) -> list[Task]:
|
||||||
"""
|
"""
|
||||||
Builds and returns a list of temporal processing tasks to be executed in sequence.
|
Builds and returns a list of temporal processing tasks to be executed in sequence.
|
||||||
|
|
@ -316,15 +373,19 @@ async def get_temporal_tasks(
|
||||||
The pipeline includes:
|
The pipeline includes:
|
||||||
1. Document classification.
|
1. Document classification.
|
||||||
2. Document chunking with a specified or default chunk size.
|
2. Document chunking with a specified or default chunk size.
|
||||||
3. Event and timestamp extraction from chunks.
|
3. (Optional) Translation of non-English content to target language.
|
||||||
4. Knowledge graph extraction from events.
|
4. Event and timestamp extraction from chunks.
|
||||||
5. Batched insertion of data points.
|
5. Knowledge graph extraction from events.
|
||||||
|
6. Batched insertion of data points.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
user (User, optional): The user requesting task execution.
|
user (User, optional): The user requesting task execution.
|
||||||
chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker.
|
chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker.
|
||||||
chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default.
|
chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default.
|
||||||
chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify
|
chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify
|
||||||
|
auto_translate (bool, optional): If True, translate non-English content. Defaults to False.
|
||||||
|
target_language (str, optional): Target language for translation. Defaults to "en".
|
||||||
|
translation_provider (str, optional): Translation provider to use ("openai", "google", "azure").
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list[Task]: A list of Task objects representing the temporal processing pipeline.
|
list[Task]: A list of Task objects representing the temporal processing pipeline.
|
||||||
|
|
@ -339,9 +400,23 @@ async def get_temporal_tasks(
|
||||||
max_chunk_size=chunk_size or get_max_chunk_tokens(),
|
max_chunk_size=chunk_size or get_max_chunk_tokens(),
|
||||||
chunker=chunker,
|
chunker=chunker,
|
||||||
),
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Add translation task if auto_translate is enabled
|
||||||
|
if auto_translate:
|
||||||
|
temporal_tasks.append(
|
||||||
|
Task(
|
||||||
|
translate_content,
|
||||||
|
target_language=target_language,
|
||||||
|
translation_provider=translation_provider,
|
||||||
|
task_config={"batch_size": chunks_per_batch},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
temporal_tasks.extend([
|
||||||
Task(extract_events_and_timestamps, task_config={"batch_size": chunks_per_batch}),
|
Task(extract_events_and_timestamps, task_config={"batch_size": chunks_per_batch}),
|
||||||
Task(extract_knowledge_graph_from_events),
|
Task(extract_knowledge_graph_from_events),
|
||||||
Task(add_data_points, task_config={"batch_size": chunks_per_batch}),
|
Task(add_data_points, task_config={"batch_size": chunks_per_batch}),
|
||||||
]
|
])
|
||||||
|
|
||||||
return temporal_tasks
|
return temporal_tasks
|
||||||
|
|
|
||||||
19
cognee/infrastructure/llm/prompts/translate_content.txt
Normal file
19
cognee/infrastructure/llm/prompts/translate_content.txt
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
You are an expert translator with deep knowledge of languages, cultures, and linguistics.
|
||||||
|
|
||||||
|
Your task is to:
|
||||||
|
1. Detect the source language of the provided text if not specified
|
||||||
|
2. Translate the text accurately to the target language
|
||||||
|
3. Preserve the original meaning, tone, and intent
|
||||||
|
4. Maintain proper grammar and natural phrasing in the target language
|
||||||
|
|
||||||
|
Guidelines:
|
||||||
|
- Preserve technical terms, proper nouns, and specialized vocabulary appropriately
|
||||||
|
- Maintain formatting such as paragraphs, lists, and emphasis where applicable
|
||||||
|
- If the text contains code, URLs, or other non-translatable content, preserve them as-is
|
||||||
|
- Handle idioms and cultural references thoughtfully, adapting when necessary
|
||||||
|
- Ensure the translation reads naturally to a native speaker of the target language
|
||||||
|
|
||||||
|
Provide the translation in a structured format with:
|
||||||
|
- The translated text
|
||||||
|
- The detected source language (ISO 639-1 code like "en", "es", "fr", "de", etc.)
|
||||||
|
- Any notes about the translation (optional, for ambiguous terms or cultural adaptations)
|
||||||
96
cognee/tasks/translation/__init__.py
Normal file
96
cognee/tasks/translation/__init__.py
Normal file
|
|
@ -0,0 +1,96 @@
|
||||||
|
"""
|
||||||
|
Translation task for Cognee.
|
||||||
|
|
||||||
|
This module provides multilingual content translation capabilities,
|
||||||
|
allowing automatic detection and translation of non-English content
|
||||||
|
to a target language while preserving original text and metadata.
|
||||||
|
|
||||||
|
Main Components:
|
||||||
|
- translate_content: Main task function for translating document chunks
|
||||||
|
- translate_text: Convenience function for translating single texts
|
||||||
|
- batch_translate_texts: Batch translation for multiple texts
|
||||||
|
- detect_language: Language detection utility
|
||||||
|
- TranslatedContent: DataPoint model for translated content
|
||||||
|
- LanguageMetadata: DataPoint model for language information
|
||||||
|
|
||||||
|
Supported Translation Providers:
|
||||||
|
- OpenAI (default): Uses GPT models via existing LLM infrastructure
|
||||||
|
- Google Translate: Requires google-cloud-translate package
|
||||||
|
- Azure Translator: Requires Azure Translator API key
|
||||||
|
|
||||||
|
Example Usage:
|
||||||
|
```python
|
||||||
|
from cognee.tasks.translation import translate_content, translate_text
|
||||||
|
|
||||||
|
# Translate document chunks in a pipeline
|
||||||
|
translated_chunks = await translate_content(
|
||||||
|
chunks,
|
||||||
|
target_language="en",
|
||||||
|
translation_provider="openai"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Translate a single text
|
||||||
|
result = await translate_text("Bonjour le monde!")
|
||||||
|
print(result.translated_text) # "Hello world!"
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .config import get_translation_config, TranslationConfig
|
||||||
|
from .detect_language import (
|
||||||
|
detect_language,
|
||||||
|
detect_language_async,
|
||||||
|
LanguageDetectionResult,
|
||||||
|
get_language_name,
|
||||||
|
)
|
||||||
|
from .exceptions import (
|
||||||
|
TranslationError,
|
||||||
|
LanguageDetectionError,
|
||||||
|
TranslationProviderError,
|
||||||
|
UnsupportedLanguageError,
|
||||||
|
TranslationConfigError,
|
||||||
|
)
|
||||||
|
from .models import TranslatedContent, LanguageMetadata
|
||||||
|
from .providers import (
|
||||||
|
TranslationProvider,
|
||||||
|
TranslationResult,
|
||||||
|
get_translation_provider,
|
||||||
|
OpenAITranslationProvider,
|
||||||
|
GoogleTranslationProvider,
|
||||||
|
AzureTranslationProvider,
|
||||||
|
)
|
||||||
|
from .translate_content import (
|
||||||
|
translate_content,
|
||||||
|
translate_text,
|
||||||
|
batch_translate_texts,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Main task functions
|
||||||
|
"translate_content",
|
||||||
|
"translate_text",
|
||||||
|
"batch_translate_texts",
|
||||||
|
# Language detection
|
||||||
|
"detect_language",
|
||||||
|
"detect_language_async",
|
||||||
|
"LanguageDetectionResult",
|
||||||
|
"get_language_name",
|
||||||
|
# Models
|
||||||
|
"TranslatedContent",
|
||||||
|
"LanguageMetadata",
|
||||||
|
# Configuration
|
||||||
|
"get_translation_config",
|
||||||
|
"TranslationConfig",
|
||||||
|
# Providers
|
||||||
|
"TranslationProvider",
|
||||||
|
"TranslationResult",
|
||||||
|
"get_translation_provider",
|
||||||
|
"OpenAITranslationProvider",
|
||||||
|
"GoogleTranslationProvider",
|
||||||
|
"AzureTranslationProvider",
|
||||||
|
# Exceptions
|
||||||
|
"TranslationError",
|
||||||
|
"LanguageDetectionError",
|
||||||
|
"TranslationProviderError",
|
||||||
|
"UnsupportedLanguageError",
|
||||||
|
"TranslationConfigError",
|
||||||
|
]
|
||||||
63
cognee/tasks/translation/config.py
Normal file
63
cognee/tasks/translation/config.py
Normal file
|
|
@ -0,0 +1,63 @@
|
||||||
|
from functools import lru_cache
|
||||||
|
from typing import Literal, Optional
|
||||||
|
|
||||||
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
|
||||||
|
TranslationProviderType = Literal["openai", "google", "azure"]
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationConfig(BaseSettings):
    """
    Configuration settings for the translation task.

    Environment variables can be used to configure these settings:
    - TRANSLATION_PROVIDER: The translation service to use
    - TRANSLATION_TARGET_LANGUAGE: Default target language
    - TRANSLATION_CONFIDENCE_THRESHOLD: Minimum confidence for language detection
    - GOOGLE_TRANSLATE_API_KEY: API key for Google Translate
    - AZURE_TRANSLATOR_KEY: API key for Azure Translator
    - AZURE_TRANSLATOR_REGION: Region for Azure Translator
    """

    # Translation provider settings
    translation_provider: TranslationProviderType = "openai"  # "openai" | "google" | "azure"
    target_language: str = "en"  # ISO 639-1 code content is translated into
    confidence_threshold: float = 0.8  # minimum language-detection confidence

    # Google Translate settings
    google_translate_api_key: Optional[str] = None
    google_project_id: Optional[str] = None

    # Azure Translator settings
    azure_translator_key: Optional[str] = None
    azure_translator_region: Optional[str] = None
    azure_translator_endpoint: str = "https://api.cognitive.microsofttranslator.com"

    # OpenAI uses the existing LLM configuration

    # Performance settings
    batch_size: int = 10  # NOTE(review): presumably items per translation batch — confirm against providers
    max_retries: int = 3
    timeout_seconds: int = 30

    # Language detection settings
    min_text_length_for_detection: int = 10  # texts shorter than this are unreliable to detect
    skip_detection_for_short_text: bool = True  # skip (not fail) detection for short text

    # extra="allow" lets unrelated .env entries pass through without validation errors.
    model_config = SettingsConfigDict(env_file=".env", extra="allow")

    def to_dict(self) -> dict:
        """Return the non-secret settings as a plain dict (API keys are excluded)."""
        return {
            "translation_provider": self.translation_provider,
            "target_language": self.target_language,
            "confidence_threshold": self.confidence_threshold,
            "batch_size": self.batch_size,
            "max_retries": self.max_retries,
        }
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache
def get_translation_config() -> TranslationConfig:
    """Get the translation configuration singleton.

    @lru_cache on a zero-argument function means TranslationConfig (and its
    environment/.env lookup) is constructed once per process and reused.
    """
    return TranslationConfig()
|
||||||
190
cognee/tasks/translation/detect_language.py
Normal file
190
cognee/tasks/translation/detect_language.py
Normal file
|
|
@ -0,0 +1,190 @@
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
from .config import get_translation_config
|
||||||
|
from .exceptions import LanguageDetectionError
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ISO 639-1 language code -> human-readable name table.
# Kept as a plain literal: it is pure data with no behavior attached.
LANGUAGE_NAMES = {
    "af": "Afrikaans",
    "ar": "Arabic",
    "bg": "Bulgarian",
    "bn": "Bengali",
    "ca": "Catalan",
    "cs": "Czech",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "es": "Spanish",
    "et": "Estonian",
    "fa": "Persian",
    "fi": "Finnish",
    "fr": "French",
    "gu": "Gujarati",
    "he": "Hebrew",
    "hi": "Hindi",
    "hr": "Croatian",
    "hu": "Hungarian",
    "id": "Indonesian",
    "it": "Italian",
    "ja": "Japanese",
    "kn": "Kannada",
    "ko": "Korean",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mr": "Marathi",
    "ne": "Nepali",
    "nl": "Dutch",
    "no": "Norwegian",
    "pa": "Punjabi",
    "pl": "Polish",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sk": "Slovak",
    "sl": "Slovenian",
    "so": "Somali",
    "sq": "Albanian",
    "sv": "Swedish",
    "sw": "Swahili",
    "ta": "Tamil",
    "te": "Telugu",
    "th": "Thai",
    "tl": "Tagalog",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "zh-cn": "Chinese (Simplified)",
    "zh-tw": "Chinese (Traditional)",
}


@dataclass
class LanguageDetectionResult:
    """Outcome of a language-detection pass over a piece of text."""

    language_code: str  # ISO 639-1 code reported by the detector (or "unknown")
    language_name: str  # human-readable name resolved via LANGUAGE_NAMES
    confidence: float  # detector probability for language_code
    requires_translation: bool  # True when the text should be translated
    character_count: int  # length of the analyzed text


def get_language_name(language_code: str) -> str:
    """Resolve a language code to its human-readable name.

    Unknown codes fall back to the code itself so callers always get a
    displayable string.
    """
    normalized_code = language_code.lower()
    try:
        return LANGUAGE_NAMES[normalized_code]
    except KeyError:
        return language_code
|
||||||
|
|
||||||
|
|
||||||
|
def detect_language(
    text: str,
    target_language: str = "en",
    confidence_threshold: Optional[float] = None,
) -> LanguageDetectionResult:
    """
    Detect the language of the given text.

    Uses the langdetect library which is already a dependency of cognee.

    Args:
        text: The text to analyze
        target_language: The target language for translation comparison
        confidence_threshold: Minimum confidence to consider detection reliable.
            Falls back to the configured threshold when None.

    Returns:
        LanguageDetectionResult with language info and translation requirement

    Raises:
        LanguageDetectionError: If language detection fails
    """
    config = get_translation_config()
    # Explicit None check: a caller-supplied threshold of 0.0 is a legitimate
    # value and must not be replaced by the configured default (which an
    # `or` fallback would do, since 0.0 is falsy).
    threshold = (
        config.confidence_threshold if confidence_threshold is None else confidence_threshold
    )

    # Handle empty or very short text: detection on tiny inputs is unreliable.
    if not text or len(text.strip()) < config.min_text_length_for_detection:
        if config.skip_detection_for_short_text:
            # Report "unknown" rather than guessing; no translation is attempted.
            return LanguageDetectionResult(
                language_code="unknown",
                language_name="Unknown",
                confidence=0.0,
                requires_translation=False,
                character_count=len(text) if text else 0,
            )
        else:
            raise LanguageDetectionError(
                f"Text too short for reliable language detection: {len(text)} characters"
            )

    # Imported lazily so the module stays importable without langdetect.
    try:
        from langdetect import detect_langs, LangDetectException
    except ImportError:
        raise LanguageDetectionError(
            "langdetect is required for language detection. Install it with: pip install langdetect"
        )

    try:
        # Get detection results with probabilities (most likely first).
        detections = detect_langs(text)

        # NOTE: this raise is itself caught by the `except Exception` below and
        # re-wrapped with the "Unexpected error" prefix — preserved behavior.
        if not detections:
            raise LanguageDetectionError("No language detected")

        # Get the most likely language
        best_detection = detections[0]
        language_code = best_detection.lang
        confidence = best_detection.prob

        # Translation is needed only when the detected language differs from
        # the target AND the detection is confident enough to act on.
        requires_translation = (
            language_code.lower() != target_language.lower() and confidence >= threshold
        )

        return LanguageDetectionResult(
            language_code=language_code,
            language_name=get_language_name(language_code),
            confidence=confidence,
            requires_translation=requires_translation,
            character_count=len(text),
        )

    except LangDetectException as e:
        logger.warning(f"Language detection failed: {e}")
        raise LanguageDetectionError(f"Language detection failed: {e}", original_error=e)
    except Exception as e:
        logger.error(f"Unexpected error during language detection: {e}")
        raise LanguageDetectionError(
            f"Unexpected error during language detection: {e}", original_error=e
        )
|
||||||
|
|
||||||
|
|
||||||
|
async def detect_language_async(
    text: str,
    target_language: str = "en",
    confidence_threshold: Optional[float] = None,
) -> LanguageDetectionResult:
    """
    Async wrapper for language detection.

    Runs the synchronous (blocking) detect_language call in the default
    thread-pool executor so it does not block the event loop.

    Args:
        text: The text to analyze
        target_language: The target language for translation comparison
        confidence_threshold: Minimum confidence to consider detection reliable

    Returns:
        LanguageDetectionResult with language info and translation requirement
    """
    import asyncio

    # get_running_loop() is the supported way to obtain the loop from inside a
    # coroutine; get_event_loop() is deprecated for this use since Python 3.10.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None, detect_language, text, target_language, confidence_threshold
    )
|
||||||
53
cognee/tasks/translation/exceptions.py
Normal file
53
cognee/tasks/translation/exceptions.py
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
class TranslationError(Exception):
    """Root of the translation exception hierarchy.

    Stores the human-readable message and, when available, the underlying
    exception that caused the failure.
    """

    def __init__(self, message: str, original_error: Exception = None):
        super().__init__(message)
        self.message = message
        self.original_error = original_error
|
||||||
|
|
||||||
|
|
||||||
|
class LanguageDetectionError(TranslationError):
    """Exception raised when language detection fails.

    Raised for text too short to detect reliably (when not skipped), a missing
    langdetect dependency, or a failure inside the detection library itself.
    """

    def __init__(self, message: str = "Failed to detect language", original_error: Exception = None):
        super().__init__(message, original_error)
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationProviderError(TranslationError):
    """Raised when a translation provider call fails.

    The provider name is recorded on the exception and prefixed onto the
    message in "[provider] message" form.
    """

    def __init__(
        self,
        provider: str,
        message: str = "Translation provider error",
        original_error: Exception = None,
    ):
        self.provider = provider
        super().__init__(f"[{provider}] {message}", original_error)
|
||||||
|
|
||||||
|
|
||||||
|
class UnsupportedLanguageError(TranslationError):
    """Raised when a language is not supported, optionally by a named provider."""

    def __init__(
        self,
        language: str,
        provider: str = None,
        message: str = None,
    ):
        self.language = language
        self.provider = provider
        if message is None:
            # Default message names the provider when one was supplied.
            provider_suffix = f" by {provider}" if provider else ""
            message = f"Language '{language}' is not supported{provider_suffix}"
        super().__init__(message)
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationConfigError(TranslationError):
    """Exception raised when translation configuration is invalid.

    Accepts an optional underlying exception for consistency with the other
    TranslationError subclasses; the new parameter defaults to None, so
    existing call sites are unaffected.
    """

    def __init__(
        self,
        message: str = "Invalid translation configuration",
        original_error: Exception = None,
    ):
        super().__init__(message, original_error)
|
||||||
72
cognee/tasks/translation/models.py
Normal file
72
cognee/tasks/translation/models.py
Normal file
|
|
@ -0,0 +1,72 @@
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from cognee.infrastructure.engine import DataPoint
|
||||||
|
from cognee.modules.chunking.models import DocumentChunk
|
||||||
|
|
||||||
|
|
||||||
|
class TranslatedContent(DataPoint):
    """
    Represents translated content with quality metrics.

    This class stores both the original and translated versions of content,
    along with metadata about the translation process including source and
    target languages, translation provider used, and confidence scores.

    Instance variables include:

    - original_chunk_id: UUID of the original document chunk
    - original_text: The original text before translation
    - translated_text: The translated text content
    - source_language: Detected or specified source language code (e.g., "es", "fr", "de")
    - target_language: Target language code for translation (default: "en")
    - translation_provider: Name of the translation service used
    - confidence_score: Translation quality/confidence score (0.0 to 1.0)
    - translation_timestamp: When the translation was performed
    - translated_from: Reference to the original DocumentChunk
    """

    original_chunk_id: UUID
    original_text: str
    translated_text: str
    source_language: str
    target_language: str = "en"
    translation_provider: str
    confidence_score: float
    # Optional on input; __init__ fills in the current UTC time when omitted.
    translation_timestamp: Optional[datetime] = None
    translated_from: Optional[DocumentChunk] = None

    # NOTE(review): index_fields is assumed to select the fields indexed for
    # search, mirroring other DataPoint models — confirm against DataPoint.
    metadata: dict = {"index_fields": ["source_language", "original_chunk_id", "translated_text"]}

    def __init__(self, **data):
        # Default the timestamp per instance to a timezone-aware "now" in UTC.
        # (A class-level default would be evaluated only once at import time.)
        if data.get("translation_timestamp") is None:
            data["translation_timestamp"] = datetime.now(timezone.utc)
        super().__init__(**data)
|
||||||
|
|
||||||
|
|
||||||
|
class LanguageMetadata(DataPoint):
    """
    Language information for content.

    This class stores metadata about the detected language of content,
    including confidence scores and whether translation is required.

    Instance variables include:

    - content_id: UUID of the associated content
    - detected_language: ISO 639-1 language code (e.g., "en", "es", "fr")
    - language_confidence: Confidence score for language detection (0.0 to 1.0)
    - requires_translation: Whether the content needs translation
    - character_count: Number of characters in the content
    - language_name: Human-readable language name (e.g., "English", "Spanish")
    """

    content_id: UUID  # UUID of the content this metadata describes
    detected_language: str  # ISO 639-1 code reported by detection
    language_confidence: float  # detection confidence in [0.0, 1.0]
    requires_translation: bool  # True when the content should be translated
    character_count: int  # number of characters in the analyzed content
    language_name: Optional[str] = None  # human-readable name, e.g. "English"

    # NOTE(review): index_fields is assumed to select the fields indexed for
    # lookup, mirroring other DataPoint models — confirm against DataPoint.
    metadata: dict = {"index_fields": ["detected_language", "content_id"]}
|
||||||
40
cognee/tasks/translation/providers/__init__.py
Normal file
40
cognee/tasks/translation/providers/__init__.py
Normal file
|
|
@ -0,0 +1,40 @@
|
||||||
|
from .base import TranslationProvider, TranslationResult
|
||||||
|
from .openai_provider import OpenAITranslationProvider
|
||||||
|
from .google_provider import GoogleTranslationProvider
|
||||||
|
from .azure_provider import AzureTranslationProvider
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"TranslationProvider",
|
||||||
|
"TranslationResult",
|
||||||
|
"OpenAITranslationProvider",
|
||||||
|
"GoogleTranslationProvider",
|
||||||
|
"AzureTranslationProvider",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def get_translation_provider(provider_name: str) -> TranslationProvider:
    """
    Factory function to get the appropriate translation provider.

    Args:
        provider_name: Name of the provider ("openai", "google", or "azure").
            Matching is case-insensitive.

    Returns:
        TranslationProvider instance

    Raises:
        ValueError: If the provider name is not recognized
    """
    providers = {
        "openai": OpenAITranslationProvider,
        "google": GoogleTranslationProvider,
        "azure": AzureTranslationProvider,
    }

    # Normalize once so membership test and lookup agree on the same key
    # (the original lowered the name twice and looked the dict up twice).
    key = provider_name.lower()
    provider_class = providers.get(key)
    if provider_class is None:
        raise ValueError(
            f"Unknown translation provider: {provider_name}. "
            f"Available providers: {list(providers.keys())}"
        )

    return provider_class()
|
||||||
182
cognee/tasks/translation/providers/azure_provider.py
Normal file
182
cognee/tasks/translation/providers/azure_provider.py
Normal file
|
|
@ -0,0 +1,182 @@
|
||||||
|
import asyncio
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
from .base import TranslationProvider, TranslationResult
|
||||||
|
from ..config import get_translation_config
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class AzureTranslationProvider(TranslationProvider):
    """
    Translation provider using Azure Translator API.

    Requires:
    - AZURE_TRANSLATOR_KEY environment variable
    - AZURE_TRANSLATOR_REGION environment variable (optional)
    """

    def __init__(self):
        # Key, region, endpoint and timeouts all come from the shared translation config.
        self._config = get_translation_config()

    @property
    def provider_name(self) -> str:
        return "azure"

    def is_available(self) -> bool:
        """Check if Azure Translator is available (an API key is configured)."""
        return self._config.azure_translator_key is not None

    def _build_request(self, target_language: str, source_language: Optional[str]):
        """Build the (endpoint, params, headers) triple shared by single and batch calls."""
        endpoint = f"{self._config.azure_translator_endpoint}/translate"

        params = {
            "api-version": "3.0",
            "to": target_language,
        }
        if source_language:
            params["from"] = source_language

        headers = {
            "Ocp-Apim-Subscription-Key": self._config.azure_translator_key,
            "Content-Type": "application/json",
        }
        if self._config.azure_translator_region:
            headers["Ocp-Apim-Subscription-Region"] = self._config.azure_translator_region

        return endpoint, params, headers

    def _parse_item(
        self, item: dict, target_language: str, source_language: Optional[str]
    ) -> TranslationResult:
        """Convert one Azure response item into a TranslationResult."""
        translation = item["translations"][0]
        detected_language = item.get("detectedLanguage", {})
        return TranslationResult(
            translated_text=translation["text"],
            source_language=source_language or detected_language.get("language", "unknown"),
            target_language=target_language,
            # Azure reports a detection score only; default to 0.9 when absent.
            confidence_score=detected_language.get("score", 0.9),
            provider=self.provider_name,
            raw_response=item,
        )

    async def translate(
        self,
        text: str,
        target_language: str = "en",
        source_language: Optional[str] = None,
    ) -> TranslationResult:
        """
        Translate text using Azure Translator API.

        Args:
            text: The text to translate
            target_language: Target language code (default: "en")
            source_language: Source language code (optional)

        Returns:
            TranslationResult with translated text and metadata

        Raises:
            ValueError: If the API key is not configured.
        """
        if not self.is_available():
            raise ValueError(
                "Azure Translator API key not configured. "
                "Set AZURE_TRANSLATOR_KEY environment variable."
            )

        endpoint, params, headers = self._build_request(target_language, source_language)

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    endpoint,
                    params=params,
                    headers=headers,
                    json=[{"text": text}],
                    timeout=aiohttp.ClientTimeout(total=self._config.timeout_seconds),
                ) as response:
                    response.raise_for_status()
                    result = await response.json()

            return self._parse_item(result[0], target_language, source_language)

        except Exception as e:
            logger.error(f"Azure translation failed: {e}")
            raise

    async def translate_batch(
        self,
        texts: list[str],
        target_language: str = "en",
        source_language: Optional[str] = None,
    ) -> list[TranslationResult]:
        """
        Translate multiple texts using Azure Translator API.

        Azure Translator supports up to 100 texts per request.

        Args:
            texts: List of texts to translate
            target_language: Target language code
            source_language: Source language code (optional)

        Returns:
            List of TranslationResult objects

        Raises:
            ValueError: If the API key is not configured.
        """
        if not self.is_available():
            raise ValueError("Azure Translator API key not configured.")

        endpoint, params, headers = self._build_request(target_language, source_language)

        # Azure supports up to 100 texts per request
        batch_size = min(100, self._config.batch_size)
        all_results = []

        try:
            # Reuse one HTTP session across all batches instead of opening a
            # new ClientSession on every loop iteration.
            async with aiohttp.ClientSession() as session:
                for i in range(0, len(texts), batch_size):
                    body = [{"text": text} for text in texts[i : i + batch_size]]

                    async with session.post(
                        endpoint,
                        params=params,
                        headers=headers,
                        json=body,
                        timeout=aiohttp.ClientTimeout(total=self._config.timeout_seconds),
                    ) as response:
                        response.raise_for_status()
                        results = await response.json()

                    all_results.extend(
                        self._parse_item(item, target_language, source_language)
                        for item in results
                    )

        except Exception as e:
            logger.error(f"Azure batch translation failed: {e}")
            raise

        return all_results
|
||||||
69
cognee/tasks/translation/providers/base.py
Normal file
69
cognee/tasks/translation/providers/base.py
Normal file
|
|
@ -0,0 +1,69 @@
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TranslationResult:
    """Result of a translation operation."""

    # The text after translation into target_language.
    translated_text: str
    # ISO 639-1 code of the source language (detected or caller-supplied).
    source_language: str
    # ISO 639-1 code of the language translated into.
    target_language: str
    # Provider-reported (or defaulted) confidence, 0.0 to 1.0.
    confidence_score: float
    # Name of the provider that produced this result (e.g. "openai").
    provider: str
    # Raw provider response payload, when the provider exposes one.
    raw_response: Optional[dict] = None
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationProvider(ABC):
    """Abstract interface implemented by every translation backend."""

    @property
    @abstractmethod
    def provider_name(self) -> str:
        """Short name identifying this provider (e.g. "openai", "google", "azure")."""
        ...

    @abstractmethod
    async def translate(
        self,
        text: str,
        target_language: str = "en",
        source_language: Optional[str] = None,
    ) -> TranslationResult:
        """
        Translate a single text into the target language.

        Args:
            text: The text to translate
            target_language: Target language code (default: "en")
            source_language: Source language code (optional, will be auto-detected if not provided)

        Returns:
            TranslationResult carrying the translated text and metadata
        """
        ...

    @abstractmethod
    async def translate_batch(
        self,
        texts: list[str],
        target_language: str = "en",
        source_language: Optional[str] = None,
    ) -> list[TranslationResult]:
        """
        Translate several texts into the target language.

        Args:
            texts: List of texts to translate
            target_language: Target language code (default: "en")
            source_language: Source language code (optional)

        Returns:
            One TranslationResult per input text
        """
        ...

    def is_available(self) -> bool:
        """Report whether required credentials are configured; available by default."""
        return True
|
||||||
159
cognee/tasks/translation/providers/google_provider.py
Normal file
159
cognee/tasks/translation/providers/google_provider.py
Normal file
|
|
@ -0,0 +1,159 @@
|
||||||
|
import asyncio
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
from .base import TranslationProvider, TranslationResult
|
||||||
|
from ..config import get_translation_config
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class GoogleTranslationProvider(TranslationProvider):
    """
    Translation provider using Google Cloud Translation API.

    Requires:
    - google-cloud-translate package
    - GOOGLE_TRANSLATE_API_KEY or GOOGLE_PROJECT_ID environment variable
    """

    def __init__(self):
        # Client is created lazily so importing this module never needs credentials.
        self._client = None
        self._config = get_translation_config()

    @property
    def provider_name(self) -> str:
        return "google"

    def _get_client(self):
        """Lazy initialization of Google Translate client."""
        if self._client is None:
            try:
                from google.cloud import translate_v2 as translate

                self._client = translate.Client()
            except ImportError:
                raise ImportError(
                    "google-cloud-translate is required for Google translation. "
                    "Install it with: pip install google-cloud-translate"
                )
            except Exception as e:
                logger.error(f"Failed to initialize Google Translate client: {e}")
                raise
        return self._client

    def is_available(self) -> bool:
        """Check if Google Translate is available (client can be constructed)."""
        try:
            self._get_client()
            return True
        except Exception:
            return False

    @staticmethod
    def _translate_kwargs(target_language: str, source_language: Optional[str]) -> dict:
        """Build keyword arguments for client.translate, omitting the source when unknown."""
        kwargs = {"target_language": target_language}
        if source_language:
            kwargs["source_language"] = source_language
        return kwargs

    async def translate(
        self,
        text: str,
        target_language: str = "en",
        source_language: Optional[str] = None,
    ) -> TranslationResult:
        """
        Translate text using Google Translate API.

        Args:
            text: The text to translate
            target_language: Target language code (default: "en")
            source_language: Source language code (optional)

        Returns:
            TranslationResult with translated text and metadata
        """
        try:
            client = self._get_client()
            kwargs = self._translate_kwargs(target_language, source_language)

            # google-cloud-translate is synchronous; off-load it to the default
            # executor. get_running_loop() is the correct call inside a coroutine
            # (get_event_loop() here is deprecated since Python 3.10).
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
                None, lambda: client.translate(text, **kwargs)
            )

            detected_language = result.get("detectedSourceLanguage", source_language or "unknown")

            return TranslationResult(
                translated_text=result["translatedText"],
                source_language=detected_language,
                target_language=target_language,
                confidence_score=0.9,  # Google Translate is generally reliable
                provider=self.provider_name,
                raw_response=result,
            )

        except Exception as e:
            logger.error(f"Google translation failed: {e}")
            raise

    async def translate_batch(
        self,
        texts: list[str],
        target_language: str = "en",
        source_language: Optional[str] = None,
    ) -> list[TranslationResult]:
        """
        Translate multiple texts using Google Translate API.

        Google Translate supports batch translation natively.

        Args:
            texts: List of texts to translate
            target_language: Target language code
            source_language: Source language code (optional)

        Returns:
            List of TranslationResult objects
        """
        try:
            client = self._get_client()
            kwargs = self._translate_kwargs(target_language, source_language)

            loop = asyncio.get_running_loop()
            results = await loop.run_in_executor(
                None, lambda: client.translate(texts, **kwargs)
            )

            translation_results = []
            for result in results:
                detected_language = result.get(
                    "detectedSourceLanguage", source_language or "unknown"
                )
                translation_results.append(
                    TranslationResult(
                        translated_text=result["translatedText"],
                        source_language=detected_language,
                        target_language=target_language,
                        confidence_score=0.9,
                        provider=self.provider_name,
                        raw_response=result,
                    )
                )

            return translation_results

        except Exception as e:
            logger.error(f"Google batch translation failed: {e}")
            raise
|
||||||
107
cognee/tasks/translation/providers/openai_provider.py
Normal file
107
cognee/tasks/translation/providers/openai_provider.py
Normal file
|
|
@ -0,0 +1,107 @@
|
||||||
|
import asyncio
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from cognee.infrastructure.llm.LLMGateway import LLMGateway
|
||||||
|
from cognee.infrastructure.llm.prompts import read_query_prompt
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
from .base import TranslationProvider, TranslationResult
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationOutput(BaseModel):
    """Pydantic model for structured translation output from LLM."""

    # The translated text produced by the model.
    translated_text: str
    # Language code the model detected for the input text.
    detected_source_language: str
    # Optional free-form notes from the model about the translation.
    translation_notes: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class OpenAITranslationProvider(TranslationProvider):
    """
    Translation provider backed by OpenAI's LLM.

    Reuses Cognee's existing LLM infrastructure to perform translations
    with GPT models via structured output.
    """

    @property
    def provider_name(self) -> str:
        return "openai"

    async def translate(
        self,
        text: str,
        target_language: str = "en",
        source_language: Optional[str] = None,
    ) -> TranslationResult:
        """
        Translate a single text with the LLM.

        Args:
            text: The text to translate
            target_language: Target language code (default: "en")
            source_language: Source language code (optional)

        Returns:
            TranslationResult with translated text and metadata
        """
        try:
            system_prompt = read_query_prompt("translate_content.txt")

            # Ask the model to detect the source language only when the
            # caller did not supply one.
            if source_language:
                input_text = (
                    f"Translate the following text from {source_language} to {target_language}.\n\n"
                    f"Text to translate:\n{text}"
                )
            else:
                input_text = (
                    f"Translate the following text to {target_language}. "
                    f"First detect the source language.\n\n"
                    f"Text to translate:\n{text}"
                )

            output = await LLMGateway.acreate_structured_output(
                text_input=input_text,
                system_prompt=system_prompt,
                response_model=TranslationOutput,
            )

            resolved_source = source_language or output.detected_source_language
            return TranslationResult(
                translated_text=output.translated_text,
                source_language=resolved_source,
                target_language=target_language,
                confidence_score=0.95,  # LLM translations are generally high quality
                provider=self.provider_name,
                raw_response={"notes": output.translation_notes},
            )

        except Exception as e:
            logger.error(f"OpenAI translation failed: {e}")
            raise

    async def translate_batch(
        self,
        texts: list[str],
        target_language: str = "en",
        source_language: Optional[str] = None,
    ) -> list[TranslationResult]:
        """
        Translate several texts concurrently via translate().

        Args:
            texts: List of texts to translate
            target_language: Target language code
            source_language: Source language code (optional)

        Returns:
            List of TranslationResult objects, in input order
        """
        pending = [self.translate(item, target_language, source_language) for item in texts]
        return await asyncio.gather(*pending)
|
||||||
265
cognee/tasks/translation/translate_content.py
Normal file
265
cognee/tasks/translation/translate_content.py
Normal file
|
|
@ -0,0 +1,265 @@
|
||||||
|
import asyncio
|
||||||
|
from typing import List, Optional
|
||||||
|
from uuid import uuid5
|
||||||
|
|
||||||
|
from cognee.modules.chunking.models import DocumentChunk
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
from .config import get_translation_config, TranslationProviderType
|
||||||
|
from .detect_language import detect_language_async, LanguageDetectionResult
|
||||||
|
from .exceptions import TranslationError, LanguageDetectionError
|
||||||
|
from .models import TranslatedContent, LanguageMetadata
|
||||||
|
from .providers import get_translation_provider, TranslationResult
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def translate_content(
    data_chunks: List[DocumentChunk],
    target_language: str = "en",
    translation_provider: Optional[TranslationProviderType] = None,
    confidence_threshold: Optional[float] = None,
    skip_if_target_language: bool = True,
    preserve_original: bool = True,
) -> List[DocumentChunk]:
    """
    Translate non-English content to the target language.

    This task detects the language of each document chunk and translates
    non-target-language content using the specified translation provider.
    Original text is preserved alongside translated versions.

    Args:
        data_chunks: List of DocumentChunk objects to process
        target_language: Target language code (default: "en" for English)
        translation_provider: Translation service to use ("openai", "google", "azure")
            If not provided, uses config default
        confidence_threshold: Minimum confidence for language detection (0.0 to 1.0)
            If not provided, uses config default
        skip_if_target_language: If True, skip chunks already in target language
        preserve_original: If True, store original text in TranslatedContent

    Returns:
        List of DocumentChunk objects with translated content.
        Chunks that required translation will have TranslatedContent
        objects in their 'contains' list.

    Raises:
        TranslationError: If data_chunks is not a list.

    Example:
        ```python
        from cognee.tasks.translation import translate_content

        # Translate chunks using default settings
        translated_chunks = await translate_content(chunks)

        # Translate with specific provider
        translated_chunks = await translate_content(
            chunks,
            translation_provider="openai",
            confidence_threshold=0.9
        )
        ```
    """
    if not isinstance(data_chunks, list):
        raise TranslationError("data_chunks must be a list")

    if len(data_chunks) == 0:
        return data_chunks

    # Resolve configuration with `is None` checks (not `or`) so explicit
    # falsy arguments — e.g. confidence_threshold=0.0 — are honored
    # instead of being silently replaced by the config defaults.
    config = get_translation_config()
    provider_name = (
        translation_provider if translation_provider is not None else config.translation_provider
    )
    threshold = (
        confidence_threshold if confidence_threshold is not None else config.confidence_threshold
    )

    logger.info(
        f"Starting translation task for {len(data_chunks)} chunks "
        f"using {provider_name} provider, target language: {target_language}"
    )

    # Get the translation provider
    provider = get_translation_provider(provider_name)

    processed_chunks = []

    for chunk in data_chunks:
        # Chunks without text cannot be translated; pass them through untouched.
        if not hasattr(chunk, "text") or not chunk.text:
            processed_chunks.append(chunk)
            continue

        try:
            # Detect language
            detection = await detect_language_async(chunk.text, target_language, threshold)

            # Record detection results as a data point linked to this chunk.
            language_metadata = LanguageMetadata(
                id=uuid5(chunk.id, "LanguageMetadata"),
                content_id=chunk.id,
                detected_language=detection.language_code,
                language_confidence=detection.confidence,
                requires_translation=detection.requires_translation,
                character_count=detection.character_count,
                language_name=detection.language_name,
            )

            # Skip if already in target language
            if not detection.requires_translation:
                if skip_if_target_language:
                    logger.debug(
                        f"Skipping chunk {chunk.id}: already in target language "
                        f"({detection.language_code})"
                    )
                    # Add language metadata to chunk
                    _add_to_chunk_contains(chunk, language_metadata)
                    processed_chunks.append(chunk)
                    continue

            # Translate the content
            logger.debug(
                f"Translating chunk {chunk.id} from {detection.language_code} "
                f"to {target_language}"
            )

            translation_result = await provider.translate(
                text=chunk.text,
                target_language=target_language,
                source_language=detection.language_code,
            )

            # Create TranslatedContent data point
            translated_content = TranslatedContent(
                id=uuid5(chunk.id, "TranslatedContent"),
                original_chunk_id=chunk.id,
                original_text=chunk.text if preserve_original else "",
                translated_text=translation_result.translated_text,
                source_language=translation_result.source_language,
                target_language=translation_result.target_language,
                translation_provider=translation_result.provider,
                confidence_score=translation_result.confidence_score,
                translated_from=chunk,
            )

            # Replace the chunk text so downstream tasks work on the translation.
            chunk.text = translation_result.translated_text

            # Add metadata to chunk's contains list
            _add_to_chunk_contains(chunk, language_metadata)
            _add_to_chunk_contains(chunk, translated_content)

            processed_chunks.append(chunk)

            logger.debug(
                f"Successfully translated chunk {chunk.id}: "
                f"{detection.language_code} -> {target_language}"
            )

        except LanguageDetectionError as e:
            # Best-effort: keep the untranslated chunk rather than failing the pipeline.
            logger.warning(f"Language detection failed for chunk {chunk.id}: {e}")
            processed_chunks.append(chunk)
        except TranslationError as e:
            logger.error(f"Translation failed for chunk {chunk.id}: {e}")
            processed_chunks.append(chunk)
        except Exception as e:
            logger.error(f"Unexpected error processing chunk {chunk.id}: {e}")
            processed_chunks.append(chunk)

    logger.info(f"Translation task completed for {len(processed_chunks)} chunks")
    return processed_chunks
|
||||||
|
|
||||||
|
|
||||||
|
def _add_to_chunk_contains(chunk: DocumentChunk, item) -> None:
    """Append *item* to the chunk's `contains` list, initializing it on first use."""
    current = chunk.contains
    if current is None:
        chunk.contains = [item]
    else:
        current.append(item)
|
||||||
|
|
||||||
|
|
||||||
|
async def translate_text(
    text: str,
    target_language: str = "en",
    translation_provider: Optional[TranslationProviderType] = None,
    source_language: Optional[str] = None,
) -> TranslationResult:
    """
    Translate a single text string.

    This is a convenience function for translating individual texts
    without creating DocumentChunk objects.

    Args:
        text: The text to translate
        target_language: Target language code (default: "en")
        translation_provider: Translation service to use; falls back to the
            configured default when omitted
        source_language: Source language code (optional, auto-detected if not provided)

    Returns:
        TranslationResult with translated text and metadata

    Example:
        ```python
        from cognee.tasks.translation import translate_text

        result = await translate_text(
            "Bonjour le monde!",
            target_language="en"
        )
        print(result.translated_text)  # "Hello world!"
        print(result.source_language)  # "fr"
        ```
    """
    config = get_translation_config()
    # Explicit `is None` check so any caller-supplied provider is honored.
    provider_name = (
        translation_provider if translation_provider is not None else config.translation_provider
    )

    provider = get_translation_provider(provider_name)

    return await provider.translate(
        text=text,
        target_language=target_language,
        source_language=source_language,
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def batch_translate_texts(
    texts: List[str],
    target_language: str = "en",
    translation_provider: Optional[TranslationProviderType] = None,
    source_language: Optional[str] = None,
) -> List[TranslationResult]:
    """
    Translate multiple text strings in batch.

    This is more efficient than translating texts individually,
    especially for providers that support native batch operations.

    Args:
        texts: List of texts to translate
        target_language: Target language code (default: "en")
        translation_provider: Translation service to use; falls back to the
            configured default when omitted
        source_language: Source language code (optional)

    Returns:
        List of TranslationResult objects

    Example:
        ```python
        from cognee.tasks.translation import batch_translate_texts

        results = await batch_translate_texts(
            ["Hola", "¿Cómo estás?", "Adiós"],
            target_language="en"
        )
        for result in results:
            print(f"{result.source_language}: {result.translated_text}")
        ```
    """
    config = get_translation_config()
    # Explicit `is None` check so any caller-supplied provider is honored.
    provider_name = (
        translation_provider if translation_provider is not None else config.translation_provider
    )

    provider = get_translation_provider(provider_name)

    return await provider.translate_batch(
        texts=texts,
        target_language=target_language,
        source_language=source_language,
    )
|
||||||
Loading…
Add table
Reference in a new issue