From a7eba61067c30f93d9efdc8bdc79710925f30f41 Mon Sep 17 00:00:00 2001 From: gooodboyAo <107522555+gooodboyAo@users.noreply.github.com> Date: Wed, 6 Aug 2025 10:36:50 +0800 Subject: [PATCH] FIX: If chunk["content_with_weight"] contains one or more unpaired surrogate characters (such as incomplete emoji or other special characters), then calling .encode("utf-8") directly will raise a UnicodeEncodeError. (#9246) FIX: If chunk["content_with_weight"] contains one or more unpaired surrogate characters (such as incomplete emoji or other special characters), then calling .encode("utf-8") directly will raise a UnicodeEncodeError. This change passes the "surrogatepass" error handler to .encode() when computing the chunk id, so such text is hashed instead of raising. ### What problem does this PR solve? ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/svr/task_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 23a0dc214..f5799a964 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -284,7 +284,7 @@ async def build_chunks(task, progress_callback): try: d = copy.deepcopy(document) d.update(chunk) - d["id"] = xxhash.xxh64((chunk["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest() + d["id"] = xxhash.xxh64((chunk["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest() d["create_time"] = str(datetime.now()).replace("T", " ")[:19] d["create_timestamp_flt"] = datetime.now().timestamp() if not d.get("image"):