Fix migration to reload sanitized data and prevent memory corruption
• Reload cleaned data after sanitization
• Update shared memory with clean data
• Add specific surrogate char tests
• Test migration sanitization flow
• Prevent dirty data in memory
This commit is contained in:
parent
7f54f47093
commit
cca0800ed4
2 changed files with 102 additions and 3 deletions
|
|
@ -237,7 +237,7 @@ class JsonKVStorage(BaseKVStorage):
|
|||
data: Original data dictionary that may contain legacy structure
|
||||
|
||||
Returns:
|
||||
Migrated data dictionary with flattened cache keys
|
||||
Migrated data dictionary with flattened cache keys (sanitized if needed)
|
||||
"""
|
||||
from lightrag.utils import generate_cache_key
|
||||
|
||||
|
|
@ -274,8 +274,17 @@ class JsonKVStorage(BaseKVStorage):
|
|||
logger.info(
|
||||
f"[{self.workspace}] Migrated {migration_count} legacy cache entries to flattened structure"
|
||||
)
|
||||
# Persist migrated data immediately
|
||||
write_json(migrated_data, self._file_name)
|
||||
# Persist migrated data immediately and check if sanitization was applied
|
||||
needs_reload = write_json(migrated_data, self._file_name)
|
||||
|
||||
# If data was sanitized during write, reload cleaned data
|
||||
if needs_reload:
|
||||
logger.info(
|
||||
f"[{self.workspace}] Reloading sanitized migration data for {self.namespace}"
|
||||
)
|
||||
cleaned_data = load_json(self._file_name)
|
||||
if cleaned_data:
|
||||
return cleaned_data # Return cleaned data to update shared memory
|
||||
|
||||
return migrated_data
|
||||
|
||||
|
|
|
|||
|
|
@ -208,6 +208,88 @@ class TestWriteJsonOptimization:
|
|||
finally:
|
||||
os.unlink(temp_file)
|
||||
|
||||
def test_specific_surrogate_udc9a(self):
    """Verify the lone surrogate \\udc9a is stripped when JSON is written.

    Reproduces the character from the reported failure
    (UnicodeEncodeError: 'utf-8' codec can't encode character '\\udc9a').
    The test expects ``write_json`` to return a truthy flag for this
    payload and the reloaded file to contain no surrogate, while the
    untouched field round-trips unchanged.
    """
    payload = {
        "text": "Some text with surrogate\udc9acharacter",
        "position": 201,  # offset quoted in the original error message
        "clean_field": "Normal text",
    }

    # Only a unique path is needed; the handle is closed straight away.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".json"
    ) as handle:
        json_path = handle.name

    try:
        # Writing the dirty payload should report that sanitization ran.
        sanitized = write_json(payload, json_path)
        assert sanitized, "Data with \\udc9a should trigger sanitization"

        # What ends up on disk must no longer contain the surrogate.
        reloaded = load_json(json_path)
        assert reloaded is not None
        assert "\udc9a" not in reloaded["text"], "\\udc9a should be removed"
        assert reloaded["clean_field"] == "Normal text", "Clean fields should remain"
    finally:
        # delete=False above means the file must be removed by hand.
        os.unlink(json_path)
|
||||
|
||||
def test_migration_with_surrogate_sanitization(self):
    """Verify a legacy-style cache with surrogates is clean after reload.

    A two-entry cache dictionary (one entry carrying lone surrogates, one
    entirely clean) is dumped with ``SanitizingJSONEncoder`` to stand in
    for a legacy cache file, then read back through ``load_json``. The
    surrogate characters must be absent from the reloaded entry and the
    clean entry must be unchanged.
    """
    dirty_entry = {
        "return": "Result with\ud800surrogate",
        "cache_type": "extract",
        "original_prompt": "Some\udc9aprompt",
    }
    clean_entry = {
        "return": "Clean result",
        "cache_type": "query",
        "original_prompt": "Clean prompt",
    }
    legacy_cache = {
        "cache_entry_1": dirty_entry,
        "cache_entry_2": clean_entry,
    }

    # Only a unique path is needed; the handle is closed straight away.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".json"
    ) as handle:
        cache_path = handle.name

    try:
        # Dump the dirty data directly, using the sanitizing encoder so the
        # write goes through even though the strings hold surrogates.
        with open(cache_path, "w", encoding="utf-8") as out:
            json.dump(
                legacy_cache,
                out,
                cls=SanitizingJSONEncoder,
                ensure_ascii=False,
            )

        reloaded = load_json(cache_path)
        assert reloaded is not None

        # The entry that carried surrogates must come back without them.
        first_entry = reloaded["cache_entry_1"]
        assert "\ud800" not in first_entry["return"], "Surrogate in return should be removed"
        assert "\udc9a" not in first_entry["original_prompt"], "Surrogate in prompt should be removed"

        # The entry that was clean to begin with must be untouched.
        assert reloaded["cache_entry_2"]["return"] == "Clean result", "Clean data should remain"
    finally:
        # delete=False above means the file must be removed by hand.
        os.unlink(cache_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Run tests
|
||||
|
|
@ -241,4 +323,12 @@ if __name__ == "__main__":
|
|||
test.test_empty_and_none_strings()
|
||||
print("✓ Passed")
|
||||
|
||||
print("Running test_specific_surrogate_udc9a...")
|
||||
test.test_specific_surrogate_udc9a()
|
||||
print("✓ Passed")
|
||||
|
||||
print("Running test_migration_with_surrogate_sanitization...")
|
||||
test.test_migration_with_surrogate_sanitization()
|
||||
print("✓ Passed")
|
||||
|
||||
print("\n✅ All tests passed!")
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue