cognee/cognee/tasks/documents/detect_language.py
Boris ebf1f81b35
fix: code cleanup [COG-781] (#667)
<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin
2025-03-26 18:32:43 +01:00

39 lines
1.2 KiB
Python

from cognee.shared.logging_utils import get_logger, ERROR
# Module-level logger used by detect_language; created with level=ERROR
# (presumably so only error-level records are emitted — confirm in get_logger).
logger = get_logger(level=ERROR)
async def detect_language(text: str):
    """
    Detect the language of the given text and return its ISO 639-1 language code.

    If the detected language is Croatian ("hr"), it is reported as Serbian ("sr").
    Only the first 100 characters of the text are passed to the detector.

    Parameters:
        text (str): The text for language detection.

    Returns:
        str | None: The ISO 639-1 code of the detected language, or None (the
        object, not the string "None") when langdetect raises a
        LangDetectException; that error is logged, not propagated.

    Raises:
        Exception: Any error from detection other than LangDetectException is
        logged and then re-raised unchanged.
    """
    # Function-scope import: langdetect is only needed when this function runs.
    from langdetect import detect, LangDetectException

    # Only a short prefix of the input is analysed.
    trimmed_text = text[:100]

    # Keep the try body limited to the one call that is expected to fail.
    try:
        detected_lang_iso639_1 = detect(trimmed_text)
    except LangDetectException as e:
        # Detection failure is treated as "unknown language", not as a crash.
        logger.error(f"Language detection error: {e}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        # Bare `raise` re-raises the active exception as-is.
        raise

    # Special case: Croatian ("hr") is reported as Serbian ("sr"); both are
    # ISO 639-1 two-letter codes.
    if detected_lang_iso639_1 == "hr":
        return "sr"

    return detected_lang_iso639_1