Fix migration to reload sanitized data and prevent memory corruption
• Reload cleaned data after sanitization
• Update shared memory with clean data
• Add specific surrogate char tests
• Test migration sanitization flow
• Prevent dirty data in memory
parent 7f54f47093
commit cca0800ed4
2 changed files with 102 additions and 3 deletions
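
For context, the failure this commit guards against is that a lone UTF-16 surrogate cannot be encoded as UTF-8, so serializing an unsanitized cache entry to JSON raises. An illustrative interpreter session (not part of the diff):

>>> "bad\udc9atext".encode("utf-8")
Traceback (most recent call last):
  ...
UnicodeEncodeError: 'utf-8' codec can't encode character '\udc9a' in position 3: surrogates not allowed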
@@ -237,7 +237,7 @@ class JsonKVStorage(BaseKVStorage):
             data: Original data dictionary that may contain legacy structure
 
         Returns:
-            Migrated data dictionary with flattened cache keys
+            Migrated data dictionary with flattened cache keys (sanitized if needed)
         """
         from lightrag.utils import generate_cache_key
 
@@ -274,8 +274,17 @@ class JsonKVStorage(BaseKVStorage):
             logger.info(
                 f"[{self.workspace}] Migrated {migration_count} legacy cache entries to flattened structure"
             )
-            # Persist migrated data immediately
-            write_json(migrated_data, self._file_name)
+            # Persist migrated data immediately and check if sanitization was applied
+            needs_reload = write_json(migrated_data, self._file_name)
+
+            # If data was sanitized during write, reload cleaned data
+            if needs_reload:
+                logger.info(
+                    f"[{self.workspace}] Reloading sanitized migration data for {self.namespace}"
+                )
+                cleaned_data = load_json(self._file_name)
+                if cleaned_data:
+                    return cleaned_data  # Return cleaned data to update shared memory
 
         return migrated_data
 
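The hunk above relies on write_json returning a truthy flag when it had to sanitize values on the way to disk. A minimal sketch of that contract, assuming sanitization means dropping lone surrogates from strings (the real lightrag implementation may differ in detail):

import json


def sanitize_text(text: str) -> str:
    # Lone surrogates such as \udc9a cannot be encoded as UTF-8;
    # errors="ignore" silently drops them.
    return text.encode("utf-8", errors="ignore").decode("utf-8")


def write_json(data, file_name: str) -> bool:
    """Write data as JSON; return True if any value had to be sanitized."""
    changed = False

    def clean(obj):
        nonlocal changed
        if isinstance(obj, str):
            cleaned = sanitize_text(obj)
            changed = changed or cleaned != obj
            return cleaned
        if isinstance(obj, dict):
            return {clean(k): clean(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [clean(v) for v in obj]
        return obj

    cleaned = clean(data)
    with open(file_name, "w", encoding="utf-8") as f:
        json.dump(cleaned, f, ensure_ascii=False, indent=2)
    return changed

Under this contract, reloading from disk after a truthy return guarantees the in-memory dict matches the sanitized file, which is exactly what the migration change uses to keep shared memory clean.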
@@ -208,6 +208,88 @@ class TestWriteJsonOptimization:
         finally:
             os.unlink(temp_file)
 
+    def test_specific_surrogate_udc9a(self):
+        """Test specific surrogate character \\udc9a mentioned in the issue"""
+        # Test the exact surrogate character from the error message:
+        # UnicodeEncodeError: 'utf-8' codec can't encode character '\\udc9a'
+        data_with_udc9a = {
+            "text": "Some text with surrogate\udc9acharacter",
+            "position": 201,  # As mentioned in the error
+            "clean_field": "Normal text",
+        }
+
+        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
+            temp_file = f.name
+
+        try:
+            # Write data - should trigger sanitization
+            needs_reload = write_json(data_with_udc9a, temp_file)
+            assert needs_reload, "Data with \\udc9a should trigger sanitization"
+
+            # Verify surrogate was removed
+            loaded_data = load_json(temp_file)
+            assert loaded_data is not None
+            assert "\udc9a" not in loaded_data["text"], "\\udc9a should be removed"
+            assert (
+                loaded_data["clean_field"] == "Normal text"
+            ), "Clean fields should remain"
+        finally:
+            os.unlink(temp_file)
+
+    def test_migration_with_surrogate_sanitization(self):
+        """Test that migration process handles surrogate characters correctly
+
+        This test simulates the scenario where legacy cache contains surrogate
+        characters and ensures they are cleaned during migration.
+        """
+        # Simulate legacy cache data with surrogate characters
+        legacy_data_with_surrogates = {
+            "cache_entry_1": {
+                "return": "Result with\ud800surrogate",
+                "cache_type": "extract",
+                "original_prompt": "Some\udc9aprompt",
+            },
+            "cache_entry_2": {
+                "return": "Clean result",
+                "cache_type": "query",
+                "original_prompt": "Clean prompt",
+            },
+        }
+
+        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
+            temp_file = f.name
+
+        try:
+            # First write the dirty data directly (simulating legacy cache file)
+            # Use custom encoder to force write even with surrogates
+            with open(temp_file, "w", encoding="utf-8") as f:
+                json.dump(
+                    legacy_data_with_surrogates,
+                    f,
+                    cls=SanitizingJSONEncoder,
+                    ensure_ascii=False,
+                )
+
+            # Load and verify surrogates were cleaned during initial write
+            loaded_data = load_json(temp_file)
+            assert loaded_data is not None
+
+            # The data should be sanitized
+            assert (
+                "\ud800" not in loaded_data["cache_entry_1"]["return"]
+            ), "Surrogate in return should be removed"
+            assert (
+                "\udc9a" not in loaded_data["cache_entry_1"]["original_prompt"]
+            ), "Surrogate in prompt should be removed"
+
+            # Clean data should remain unchanged
+            assert (
+                loaded_data["cache_entry_2"]["return"] == "Clean result"
+            ), "Clean data should remain"
+
+        finally:
+            os.unlink(temp_file)
+
 
 if __name__ == "__main__":
     # Run tests
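The second test writes dirty data through SanitizingJSONEncoder. Only the class name appears in this diff, so the following is a hypothetical sketch of such an encoder, assuming it strips lone surrogates from strings before emitting JSON; the project's actual implementation may differ:

import json


class SanitizingJSONEncoder(json.JSONEncoder):
    """Hypothetical encoder that cleans lone surrogates before encoding."""

    def encode(self, o):
        # json.dumps() goes through encode()
        return super().encode(self._sanitize(o))

    def iterencode(self, o, _one_shot=False):
        # json.dump() goes through iterencode()
        return super().iterencode(self._sanitize(o), _one_shot)

    def _sanitize(self, obj):
        if isinstance(obj, str):
            # Drop lone surrogates (e.g. \ud800, \udc9a) that would
            # raise UnicodeEncodeError when written as UTF-8
            return obj.encode("utf-8", errors="ignore").decode("utf-8")
        if isinstance(obj, dict):
            return {self._sanitize(k): self._sanitize(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [self._sanitize(v) for v in obj]
        return obj

With an encoder like this, json.dump(..., cls=SanitizingJSONEncoder) succeeds even on dirty input, which is why the test can use it to stage a legacy cache file containing surrogates.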
@@ -241,4 +323,12 @@ if __name__ == "__main__":
     test.test_empty_and_none_strings()
     print("✓ Passed")
 
+    print("Running test_specific_surrogate_udc9a...")
+    test.test_specific_surrogate_udc9a()
+    print("✓ Passed")
+
+    print("Running test_migration_with_surrogate_sanitization...")
+    test.test_migration_with_surrogate_sanitization()
+    print("✓ Passed")
+
     print("\n✅ All tests passed!")