- Add translation module with OpenAI, Google, Azure provider support - Implement language detection using langdetect - Add TranslatedContent and LanguageMetadata models - Integrate translation task into cognify pipeline - Add auto_translate parameter to cognify() function - Preserve original text alongside translations - Support custom translation providers and target languages
190 lines
5.3 KiB
Python
190 lines
5.3 KiB
Python
from dataclasses import dataclass
|
|
from typing import Optional
|
|
|
|
from cognee.shared.logging_utils import get_logger
|
|
|
|
from .config import get_translation_config
|
|
from .exceptions import LanguageDetectionError
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
# ISO 639-1 language code to name mapping
|
|
LANGUAGE_NAMES = {
|
|
"af": "Afrikaans",
|
|
"ar": "Arabic",
|
|
"bg": "Bulgarian",
|
|
"bn": "Bengali",
|
|
"ca": "Catalan",
|
|
"cs": "Czech",
|
|
"cy": "Welsh",
|
|
"da": "Danish",
|
|
"de": "German",
|
|
"el": "Greek",
|
|
"en": "English",
|
|
"es": "Spanish",
|
|
"et": "Estonian",
|
|
"fa": "Persian",
|
|
"fi": "Finnish",
|
|
"fr": "French",
|
|
"gu": "Gujarati",
|
|
"he": "Hebrew",
|
|
"hi": "Hindi",
|
|
"hr": "Croatian",
|
|
"hu": "Hungarian",
|
|
"id": "Indonesian",
|
|
"it": "Italian",
|
|
"ja": "Japanese",
|
|
"kn": "Kannada",
|
|
"ko": "Korean",
|
|
"lt": "Lithuanian",
|
|
"lv": "Latvian",
|
|
"mk": "Macedonian",
|
|
"ml": "Malayalam",
|
|
"mr": "Marathi",
|
|
"ne": "Nepali",
|
|
"nl": "Dutch",
|
|
"no": "Norwegian",
|
|
"pa": "Punjabi",
|
|
"pl": "Polish",
|
|
"pt": "Portuguese",
|
|
"ro": "Romanian",
|
|
"ru": "Russian",
|
|
"sk": "Slovak",
|
|
"sl": "Slovenian",
|
|
"so": "Somali",
|
|
"sq": "Albanian",
|
|
"sv": "Swedish",
|
|
"sw": "Swahili",
|
|
"ta": "Tamil",
|
|
"te": "Telugu",
|
|
"th": "Thai",
|
|
"tl": "Tagalog",
|
|
"tr": "Turkish",
|
|
"uk": "Ukrainian",
|
|
"ur": "Urdu",
|
|
"vi": "Vietnamese",
|
|
"zh-cn": "Chinese (Simplified)",
|
|
"zh-tw": "Chinese (Traditional)",
|
|
}
|
|
|
|
|
|
@dataclass
|
|
class LanguageDetectionResult:
|
|
"""Result of language detection."""
|
|
|
|
language_code: str
|
|
language_name: str
|
|
confidence: float
|
|
requires_translation: bool
|
|
character_count: int
|
|
|
|
|
|
def get_language_name(language_code: str) -> str:
|
|
"""Get the human-readable name for a language code."""
|
|
return LANGUAGE_NAMES.get(language_code.lower(), language_code)
|
|
|
|
|
|
def detect_language(
|
|
text: str,
|
|
target_language: str = "en",
|
|
confidence_threshold: float = None,
|
|
) -> LanguageDetectionResult:
|
|
"""
|
|
Detect the language of the given text.
|
|
|
|
Uses the langdetect library which is already a dependency of cognee.
|
|
|
|
Args:
|
|
text: The text to analyze
|
|
target_language: The target language for translation comparison
|
|
confidence_threshold: Minimum confidence to consider detection reliable
|
|
|
|
Returns:
|
|
LanguageDetectionResult with language info and translation requirement
|
|
|
|
Raises:
|
|
LanguageDetectionError: If language detection fails
|
|
"""
|
|
config = get_translation_config()
|
|
threshold = confidence_threshold or config.confidence_threshold
|
|
|
|
# Handle empty or very short text
|
|
if not text or len(text.strip()) < config.min_text_length_for_detection:
|
|
if config.skip_detection_for_short_text:
|
|
return LanguageDetectionResult(
|
|
language_code="unknown",
|
|
language_name="Unknown",
|
|
confidence=0.0,
|
|
requires_translation=False,
|
|
character_count=len(text) if text else 0,
|
|
)
|
|
else:
|
|
raise LanguageDetectionError(
|
|
f"Text too short for reliable language detection: {len(text)} characters"
|
|
)
|
|
|
|
try:
|
|
from langdetect import detect_langs, LangDetectException
|
|
except ImportError:
|
|
raise LanguageDetectionError(
|
|
"langdetect is required for language detection. Install it with: pip install langdetect"
|
|
)
|
|
|
|
try:
|
|
# Get detection results with probabilities
|
|
detections = detect_langs(text)
|
|
|
|
if not detections:
|
|
raise LanguageDetectionError("No language detected")
|
|
|
|
# Get the most likely language
|
|
best_detection = detections[0]
|
|
language_code = best_detection.lang
|
|
confidence = best_detection.prob
|
|
|
|
# Check if translation is needed
|
|
requires_translation = (
|
|
language_code.lower() != target_language.lower() and confidence >= threshold
|
|
)
|
|
|
|
return LanguageDetectionResult(
|
|
language_code=language_code,
|
|
language_name=get_language_name(language_code),
|
|
confidence=confidence,
|
|
requires_translation=requires_translation,
|
|
character_count=len(text),
|
|
)
|
|
|
|
except LangDetectException as e:
|
|
logger.warning(f"Language detection failed: {e}")
|
|
raise LanguageDetectionError(f"Language detection failed: {e}", original_error=e)
|
|
except Exception as e:
|
|
logger.error(f"Unexpected error during language detection: {e}")
|
|
raise LanguageDetectionError(
|
|
f"Unexpected error during language detection: {e}", original_error=e
|
|
)
|
|
|
|
|
|
async def detect_language_async(
|
|
text: str,
|
|
target_language: str = "en",
|
|
confidence_threshold: float = None,
|
|
) -> LanguageDetectionResult:
|
|
"""
|
|
Async wrapper for language detection.
|
|
|
|
Args:
|
|
text: The text to analyze
|
|
target_language: The target language for translation comparison
|
|
confidence_threshold: Minimum confidence to consider detection reliable
|
|
|
|
Returns:
|
|
LanguageDetectionResult with language info and translation requirement
|
|
"""
|
|
import asyncio
|
|
|
|
loop = asyncio.get_event_loop()
|
|
return await loop.run_in_executor(
|
|
None, detect_language, text, target_language, confidence_threshold
|
|
)
|