refactor: simplify text encoding by removing redundant safe_encode_for_llm
This commit is contained in:
parent
806081645f
commit
ced3aef7cb
1 changed file with 2 additions and 47 deletions
|
|
@@ -1422,7 +1422,7 @@ async def use_llm_func_with_cache(
|
||||||
LLM response text
|
LLM response text
|
||||||
"""
|
"""
|
||||||
# Sanitize input text to prevent UTF-8 encoding errors for all LLM providers
|
# Sanitize input text to prevent UTF-8 encoding errors for all LLM providers
|
||||||
safe_input_text = safe_encode_for_llm(input_text, f"llm_input_{cache_type}")
|
safe_input_text = sanitize_text_for_encoding(input_text)
|
||||||
|
|
||||||
# Sanitize history messages if provided
|
# Sanitize history messages if provided
|
||||||
safe_history_messages = None
|
safe_history_messages = None
|
||||||
|
|
@@ -1431,9 +1431,7 @@ async def use_llm_func_with_cache(
|
||||||
for i, msg in enumerate(history_messages):
|
for i, msg in enumerate(history_messages):
|
||||||
safe_msg = msg.copy()
|
safe_msg = msg.copy()
|
||||||
if "content" in safe_msg:
|
if "content" in safe_msg:
|
||||||
safe_msg["content"] = safe_encode_for_llm(
|
safe_msg["content"] = sanitize_text_for_encoding(safe_msg["content"])
|
||||||
safe_msg["content"], f"history_message_{i}"
|
|
||||||
)
|
|
||||||
safe_history_messages.append(safe_msg)
|
safe_history_messages.append(safe_msg)
|
||||||
|
|
||||||
if llm_response_cache:
|
if llm_response_cache:
|
||||||
|
|
@@ -1668,49 +1666,6 @@ def sanitize_text_for_encoding(text: str, replacement_char: str = "") -> str:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def safe_encode_for_llm(content: str, context: str = "unknown") -> str:
|
|
||||||
"""Safely encode content for LLM API calls with comprehensive error handling.
|
|
||||||
|
|
||||||
This is the main function to use before sending text to LLM APIs to prevent
|
|
||||||
UTF-8 encoding errors.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
content: Text content to encode safely
|
|
||||||
context: Context description for logging (e.g., "document_chunk", "prompt")
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Safely encoded text that won't cause UTF-8 encoding errors
|
|
||||||
"""
|
|
||||||
if not content:
|
|
||||||
return content
|
|
||||||
|
|
||||||
original_length = len(content)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Apply text sanitization
|
|
||||||
sanitized = sanitize_text_for_encoding(content)
|
|
||||||
|
|
||||||
# Check if any changes were made
|
|
||||||
if len(sanitized) != original_length or sanitized != content:
|
|
||||||
# Count replaced characters (empty replacement chars)
|
|
||||||
replaced_count = original_length - len(sanitized)
|
|
||||||
logger.info(
|
|
||||||
f"Text encoding safety: Removed {replaced_count} problematic chars "
|
|
||||||
f"(original: {original_length} chars, sanitized: {len(sanitized)} chars)"
|
|
||||||
)
|
|
||||||
|
|
||||||
return sanitized
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(
|
|
||||||
f"Text encoding safety: Failed to sanitize {context} content: {str(e)}"
|
|
||||||
)
|
|
||||||
# Return a safe fallback
|
|
||||||
return (
|
|
||||||
f"[CONTENT_SANITIZATION_ERROR: {original_length} characters from {context}]"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def check_storage_env_vars(storage_name: str) -> None:
|
def check_storage_env_vars(storage_name: str) -> None:
|
||||||
"""Check if all required environment variables for storage implementation exist
|
"""Check if all required environment variables for storage implementation exist
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue