From 64058c771f5a31a1345ab803499f7fb93b8fbb30 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 19 Aug 2025 17:19:39 +0800 Subject: [PATCH] Refactor: Harden `compute_args_hash` against Unicode errors --- lightrag/utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index a9950f9b..bbafd9f1 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -265,7 +265,8 @@ class EmbeddingFunc: def compute_args_hash(*args: Any) -> str: - """Compute a hash for the given arguments. + """Compute a hash for the given arguments with safe Unicode handling. + Args: *args: Arguments to hash Returns: @@ -276,8 +277,14 @@ def compute_args_hash(*args: Any) -> str: # Convert all arguments to strings and join them args_str = "".join([str(arg) for arg in args]) - # Compute MD5 hash - return hashlib.md5(args_str.encode()).hexdigest() + # Use 'replace' error handling to safely encode problematic Unicode characters + # This replaces unencodable characters (e.g. lone surrogates) with '?' when encoding + try: + return hashlib.md5(args_str.encode("utf-8")).hexdigest() + except UnicodeEncodeError: + # Handle surrogate characters and other encoding issues + safe_bytes = args_str.encode("utf-8", errors="replace") + return hashlib.md5(safe_bytes).hexdigest() def generate_cache_key(mode: str, cache_type: str, hash_value: str) -> str: