From cca0800ed404719b45f51e0382bfb8ccab22d52e Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 12 Nov 2025 16:16:28 +0800
Subject: [PATCH] Fix migration to reload sanitized data and prevent memory
 corruption
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Reload cleaned data after sanitization
• Update shared memory with clean data
• Add specific surrogate char tests
• Test migration sanitization flow
• Prevent dirty data in memory
---
 lightrag/kg/json_kv_impl.py           | 15 ++++-
 tests/test_write_json_optimization.py | 90 +++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py
index b3d9a34f..3f99dd4d 100644
--- a/lightrag/kg/json_kv_impl.py
+++ b/lightrag/kg/json_kv_impl.py
@@ -237,7 +237,7 @@ class JsonKVStorage(BaseKVStorage):
             data: Original data dictionary that may contain legacy structure
 
         Returns:
-            Migrated data dictionary with flattened cache keys
+            Migrated data dictionary with flattened cache keys (sanitized if needed)
         """
         from lightrag.utils import generate_cache_key
 
@@ -274,8 +274,17 @@ class JsonKVStorage(BaseKVStorage):
             logger.info(
                 f"[{self.workspace}] Migrated {migration_count} legacy cache entries to flattened structure"
             )
-            # Persist migrated data immediately
-            write_json(migrated_data, self._file_name)
+            # Persist migrated data immediately and check if sanitization was applied
+            needs_reload = write_json(migrated_data, self._file_name)
+
+            # If data was sanitized during write, reload cleaned data
+            if needs_reload:
+                logger.info(
+                    f"[{self.workspace}] Reloading sanitized migration data for {self.namespace}"
+                )
+                cleaned_data = load_json(self._file_name)
+                if cleaned_data:
+                    return cleaned_data  # Return cleaned data to update shared memory
 
         return migrated_data
 
diff --git a/tests/test_write_json_optimization.py b/tests/test_write_json_optimization.py
index ea555c50..9c4105b9 100644
--- a/tests/test_write_json_optimization.py
+++ b/tests/test_write_json_optimization.py
@@ -208,6 +208,88 @@ class TestWriteJsonOptimization:
         finally:
             os.unlink(temp_file)
 
+    def test_specific_surrogate_udc9a(self):
+        """Test specific surrogate character \\udc9a mentioned in the issue"""
+        # Test the exact surrogate character from the error message:
+        # UnicodeEncodeError: 'utf-8' codec can't encode character '\\udc9a'
+        data_with_udc9a = {
+            "text": "Some text with surrogate\udc9acharacter",
+            "position": 201,  # As mentioned in the error
+            "clean_field": "Normal text",
+        }
+
+        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
+            temp_file = f.name
+
+        try:
+            # Write data - should trigger sanitization
+            needs_reload = write_json(data_with_udc9a, temp_file)
+            assert needs_reload, "Data with \\udc9a should trigger sanitization"
+
+            # Verify surrogate was removed
+            loaded_data = load_json(temp_file)
+            assert loaded_data is not None
+            assert "\udc9a" not in loaded_data["text"], "\\udc9a should be removed"
+            assert (
+                loaded_data["clean_field"] == "Normal text"
+            ), "Clean fields should remain"
+        finally:
+            os.unlink(temp_file)
+
+    def test_migration_with_surrogate_sanitization(self):
+        """Test that migration process handles surrogate characters correctly
+
+        This test simulates the scenario where legacy cache contains surrogate
+        characters and ensures they are cleaned during migration.
+        """
+        # Simulate legacy cache data with surrogate characters
+        legacy_data_with_surrogates = {
+            "cache_entry_1": {
+                "return": "Result with\ud800surrogate",
+                "cache_type": "extract",
+                "original_prompt": "Some\udc9aprompt",
+            },
+            "cache_entry_2": {
+                "return": "Clean result",
+                "cache_type": "query",
+                "original_prompt": "Clean prompt",
+            },
+        }
+
+        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
+            temp_file = f.name
+
+        try:
+            # First write the dirty data directly (simulating legacy cache file)
+            # Use custom encoder to force write even with surrogates
+            with open(temp_file, "w", encoding="utf-8") as f:
+                json.dump(
+                    legacy_data_with_surrogates,
+                    f,
+                    cls=SanitizingJSONEncoder,
+                    ensure_ascii=False,
+                )
+
+            # Load and verify surrogates were cleaned during initial write
+            loaded_data = load_json(temp_file)
+            assert loaded_data is not None
+
+            # The data should be sanitized
+            assert (
+                "\ud800" not in loaded_data["cache_entry_1"]["return"]
+            ), "Surrogate in return should be removed"
+            assert (
+                "\udc9a" not in loaded_data["cache_entry_1"]["original_prompt"]
+            ), "Surrogate in prompt should be removed"
+
+            # Clean data should remain unchanged
+            assert (
+                loaded_data["cache_entry_2"]["return"] == "Clean result"
+            ), "Clean data should remain"
+
+        finally:
+            os.unlink(temp_file)
+
 
 if __name__ == "__main__":
     # Run tests
@@ -241,4 +323,12 @@ if __name__ == "__main__":
     test.test_empty_and_none_strings()
     print("✓ Passed")
 
+    print("Running test_specific_surrogate_udc9a...")
+    test.test_specific_surrogate_udc9a()
+    print("✓ Passed")
+
+    print("Running test_migration_with_surrogate_sanitization...")
+    test.test_migration_with_surrogate_sanitization()
+    print("✓ Passed")
+
     print("\n✅ All tests passed!")