cognee/cognee/tasks/translation/detect_language.py
andikarachman 8a794ea85d feat(translation): implement multilingual content translation task
- Add translation module with OpenAI, Google, Azure provider support
- Implement language detection using langdetect
- Add TranslatedContent and LanguageMetadata models
- Integrate translation task into cognify pipeline
- Add auto_translate parameter to cognify() function
- Preserve original text alongside translations
- Support custom translation providers and target languages
2026-01-04 12:06:50 +07:00

190 lines
5.3 KiB
Python

from dataclasses import dataclass
from typing import Optional
from cognee.shared.logging_utils import get_logger
from .config import get_translation_config
from .exceptions import LanguageDetectionError
logger = get_logger(__name__)
# Mapping from lowercase language codes to human-readable English names.
# Most keys are two-letter ISO 639-1 codes; the two Chinese entries ("zh-cn",
# "zh-tw") carry a region suffix to tell Simplified from Traditional apart.
# get_language_name() lowercases its argument before looking it up here, so
# keys must stay lowercase.
LANGUAGE_NAMES = {
    "af": "Afrikaans",
    "ar": "Arabic",
    "bg": "Bulgarian",
    "bn": "Bengali",
    "ca": "Catalan",
    "cs": "Czech",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "es": "Spanish",
    "et": "Estonian",
    "fa": "Persian",
    "fi": "Finnish",
    "fr": "French",
    "gu": "Gujarati",
    "he": "Hebrew",
    "hi": "Hindi",
    "hr": "Croatian",
    "hu": "Hungarian",
    "id": "Indonesian",
    "it": "Italian",
    "ja": "Japanese",
    "kn": "Kannada",
    "ko": "Korean",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mr": "Marathi",
    "ne": "Nepali",
    "nl": "Dutch",
    "no": "Norwegian",
    "pa": "Punjabi",
    "pl": "Polish",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sk": "Slovak",
    "sl": "Slovenian",
    "so": "Somali",
    "sq": "Albanian",
    "sv": "Swedish",
    "sw": "Swahili",
    "ta": "Tamil",
    "te": "Telugu",
    "th": "Thai",
    "tl": "Tagalog",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "zh-cn": "Chinese (Simplified)",
    "zh-tw": "Chinese (Traditional)",
}
@dataclass
class LanguageDetectionResult:
    """Result of language detection.

    Returned by detect_language() both for real detections and for the
    "skipped" case, where the text was empty or too short to analyze.
    """

    # Code of the most likely language as reported by the detector, or the
    # literal "unknown" when detection was skipped.
    language_code: str
    # Human-readable name for language_code ("Unknown" when detection was skipped).
    language_name: str
    # Probability the detector assigned to language_code; 0.0 when skipped.
    confidence: float
    # True only when the detected code differs from the target language AND
    # the confidence reached the threshold; always False when skipped.
    requires_translation: bool
    # len() of the analyzed text (not stripped); 0 for empty/None text.
    character_count: int
def get_language_name(language_code: str) -> str:
    """Return the display name for a language code.

    The lookup is case-insensitive. A code that has no entry in
    LANGUAGE_NAMES is handed back unchanged, in its original casing.
    """
    lookup_key = language_code.lower()
    try:
        return LANGUAGE_NAMES[lookup_key]
    except KeyError:
        # Unknown code: fall back to whatever the caller passed in.
        return language_code
def detect_language(
    text: str,
    target_language: str = "en",
    confidence_threshold: Optional[float] = None,
) -> LanguageDetectionResult:
    """
    Detect the language of the given text.

    Uses the langdetect library, imported lazily so that a missing install
    surfaces as a LanguageDetectionError instead of an import-time failure.

    Args:
        text: The text to analyze. Empty or None text is treated as too short.
        target_language: The target language for translation comparison;
            compared case-insensitively against the detected language code.
        confidence_threshold: Minimum confidence to consider detection reliable.
            None falls back to the configured threshold; an explicit 0.0 is
            honored rather than replaced by the configured value.

    Returns:
        LanguageDetectionResult with language info and translation requirement.
        When the text is shorter than the configured minimum and the config
        asks to skip short text, an "unknown" result with confidence 0.0 and
        requires_translation=False is returned instead of running detection.

    Raises:
        LanguageDetectionError: If the text is too short (and skipping is
            disabled), langdetect is not installed, no language is detected,
            or detection fails for any other reason.
    """
    config = get_translation_config()

    # Compare against None explicitly: `or` would discard a caller-supplied 0.0.
    threshold = (
        config.confidence_threshold if confidence_threshold is None else confidence_threshold
    )

    # Computed once so both the result and the error message are safe for None.
    character_count = len(text) if text else 0

    # Handle empty or very short text
    if not text or len(text.strip()) < config.min_text_length_for_detection:
        if config.skip_detection_for_short_text:
            return LanguageDetectionResult(
                language_code="unknown",
                language_name="Unknown",
                confidence=0.0,
                requires_translation=False,
                character_count=character_count,
            )
        raise LanguageDetectionError(
            f"Text too short for reliable language detection: {character_count} characters"
        )

    try:
        from langdetect import detect_langs, LangDetectException
    except ImportError as e:
        raise LanguageDetectionError(
            "langdetect is required for language detection. Install it with: pip install langdetect"
        ) from e

    try:
        # Get detection results with probabilities
        detections = detect_langs(text)
        if not detections:
            raise LanguageDetectionError("No language detected")

        # Get the most likely language
        best_detection = detections[0]
        language_code = best_detection.lang
        confidence = best_detection.prob

        # Translation is only requested when the detection is both different
        # from the target and confident enough to be trusted.
        requires_translation = (
            language_code.lower() != target_language.lower() and confidence >= threshold
        )

        return LanguageDetectionResult(
            language_code=language_code,
            language_name=get_language_name(language_code),
            confidence=confidence,
            requires_translation=requires_translation,
            character_count=character_count,
        )
    except LanguageDetectionError:
        # Already a domain error raised above; re-raise untouched so it is not
        # re-wrapped and logged as an "unexpected" failure by the handler below.
        raise
    except LangDetectException as e:
        logger.warning(f"Language detection failed: {e}")
        raise LanguageDetectionError(f"Language detection failed: {e}", original_error=e) from e
    except Exception as e:
        # Boundary handler: callers only need to catch LanguageDetectionError.
        logger.error(f"Unexpected error during language detection: {e}")
        raise LanguageDetectionError(
            f"Unexpected error during language detection: {e}", original_error=e
        ) from e
async def detect_language_async(
    text: str,
    target_language: str = "en",
    confidence_threshold: Optional[float] = None,
) -> LanguageDetectionResult:
    """
    Async wrapper for language detection.

    Runs the synchronous detect_language() in the event loop's default
    executor so the detection work does not block the loop.

    Args:
        text: The text to analyze
        target_language: The target language for translation comparison
        confidence_threshold: Minimum confidence to consider detection reliable
            (None defers to detect_language's default handling)

    Returns:
        LanguageDetectionResult with language info and translation requirement

    Raises:
        LanguageDetectionError: Propagated from detect_language().
    """
    import asyncio

    # Inside a coroutine a loop is always running; get_running_loop() is the
    # documented way to obtain it and never creates a new loop implicitly.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None, detect_language, text, target_language, confidence_threshold
    )