cherry-pick cca0800e
This commit is contained in:
parent
1368d3a1fe
commit
1a167fb7f7
2 changed files with 14 additions and 73 deletions
|
|
@ -13,7 +13,7 @@ from lightrag.utils import (
|
||||||
from lightrag.exceptions import StorageNotInitializedError
|
from lightrag.exceptions import StorageNotInitializedError
|
||||||
from .shared_storage import (
|
from .shared_storage import (
|
||||||
get_namespace_data,
|
get_namespace_data,
|
||||||
get_namespace_lock,
|
get_storage_lock,
|
||||||
get_data_init_lock,
|
get_data_init_lock,
|
||||||
get_update_flag,
|
get_update_flag,
|
||||||
set_all_update_flags,
|
set_all_update_flags,
|
||||||
|
|
@ -30,10 +30,12 @@ class JsonKVStorage(BaseKVStorage):
|
||||||
if self.workspace:
|
if self.workspace:
|
||||||
# Include workspace in the file path for data isolation
|
# Include workspace in the file path for data isolation
|
||||||
workspace_dir = os.path.join(working_dir, self.workspace)
|
workspace_dir = os.path.join(working_dir, self.workspace)
|
||||||
|
self.final_namespace = f"{self.workspace}_{self.namespace}"
|
||||||
else:
|
else:
|
||||||
# Default behavior when workspace is empty
|
# Default behavior when workspace is empty
|
||||||
workspace_dir = working_dir
|
workspace_dir = working_dir
|
||||||
self.workspace = ""
|
self.final_namespace = self.namespace
|
||||||
|
self.workspace = "_"
|
||||||
|
|
||||||
os.makedirs(workspace_dir, exist_ok=True)
|
os.makedirs(workspace_dir, exist_ok=True)
|
||||||
self._file_name = os.path.join(workspace_dir, f"kv_store_{self.namespace}.json")
|
self._file_name = os.path.join(workspace_dir, f"kv_store_{self.namespace}.json")
|
||||||
|
|
@ -44,20 +46,12 @@ class JsonKVStorage(BaseKVStorage):
|
||||||
|
|
||||||
async def initialize(self):
|
async def initialize(self):
|
||||||
"""Initialize storage data"""
|
"""Initialize storage data"""
|
||||||
self._storage_lock = get_namespace_lock(
|
self._storage_lock = get_storage_lock()
|
||||||
self.namespace, workspace=self.workspace
|
self.storage_updated = await get_update_flag(self.final_namespace)
|
||||||
)
|
|
||||||
self.storage_updated = await get_update_flag(
|
|
||||||
self.namespace, workspace=self.workspace
|
|
||||||
)
|
|
||||||
async with get_data_init_lock():
|
async with get_data_init_lock():
|
||||||
# check need_init must before get_namespace_data
|
# check need_init must before get_namespace_data
|
||||||
need_init = await try_initialize_namespace(
|
need_init = await try_initialize_namespace(self.final_namespace)
|
||||||
self.namespace, workspace=self.workspace
|
self._data = await get_namespace_data(self.final_namespace)
|
||||||
)
|
|
||||||
self._data = await get_namespace_data(
|
|
||||||
self.namespace, workspace=self.workspace
|
|
||||||
)
|
|
||||||
if need_init:
|
if need_init:
|
||||||
loaded_data = load_json(self._file_name) or {}
|
loaded_data = load_json(self._file_name) or {}
|
||||||
async with self._storage_lock:
|
async with self._storage_lock:
|
||||||
|
|
@ -97,11 +91,11 @@ class JsonKVStorage(BaseKVStorage):
|
||||||
f"[{self.workspace}] Reloading sanitized data into shared memory for {self.namespace}"
|
f"[{self.workspace}] Reloading sanitized data into shared memory for {self.namespace}"
|
||||||
)
|
)
|
||||||
cleaned_data = load_json(self._file_name)
|
cleaned_data = load_json(self._file_name)
|
||||||
if cleaned_data is not None:
|
if cleaned_data:
|
||||||
self._data.clear()
|
self._data.clear()
|
||||||
self._data.update(cleaned_data)
|
self._data.update(cleaned_data)
|
||||||
|
|
||||||
await clear_all_update_flags(self.namespace, workspace=self.workspace)
|
await clear_all_update_flags(self.final_namespace)
|
||||||
|
|
||||||
async def get_by_id(self, id: str) -> dict[str, Any] | None:
|
async def get_by_id(self, id: str) -> dict[str, Any] | None:
|
||||||
async with self._storage_lock:
|
async with self._storage_lock:
|
||||||
|
|
@ -174,7 +168,7 @@ class JsonKVStorage(BaseKVStorage):
|
||||||
v["_id"] = k
|
v["_id"] = k
|
||||||
|
|
||||||
self._data.update(data)
|
self._data.update(data)
|
||||||
await set_all_update_flags(self.namespace, workspace=self.workspace)
|
await set_all_update_flags(self.final_namespace)
|
||||||
|
|
||||||
async def delete(self, ids: list[str]) -> None:
|
async def delete(self, ids: list[str]) -> None:
|
||||||
"""Delete specific records from storage by their IDs
|
"""Delete specific records from storage by their IDs
|
||||||
|
|
@ -197,7 +191,7 @@ class JsonKVStorage(BaseKVStorage):
|
||||||
any_deleted = True
|
any_deleted = True
|
||||||
|
|
||||||
if any_deleted:
|
if any_deleted:
|
||||||
await set_all_update_flags(self.namespace, workspace=self.workspace)
|
await set_all_update_flags(self.final_namespace)
|
||||||
|
|
||||||
async def is_empty(self) -> bool:
|
async def is_empty(self) -> bool:
|
||||||
"""Check if the storage is empty
|
"""Check if the storage is empty
|
||||||
|
|
@ -225,7 +219,7 @@ class JsonKVStorage(BaseKVStorage):
|
||||||
try:
|
try:
|
||||||
async with self._storage_lock:
|
async with self._storage_lock:
|
||||||
self._data.clear()
|
self._data.clear()
|
||||||
await set_all_update_flags(self.namespace, workspace=self.workspace)
|
await set_all_update_flags(self.final_namespace)
|
||||||
|
|
||||||
await self.index_done_callback()
|
await self.index_done_callback()
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|
@ -289,7 +283,7 @@ class JsonKVStorage(BaseKVStorage):
|
||||||
f"[{self.workspace}] Reloading sanitized migration data for {self.namespace}"
|
f"[{self.workspace}] Reloading sanitized migration data for {self.namespace}"
|
||||||
)
|
)
|
||||||
cleaned_data = load_json(self._file_name)
|
cleaned_data = load_json(self._file_name)
|
||||||
if cleaned_data is not None:
|
if cleaned_data:
|
||||||
return cleaned_data # Return cleaned data to update shared memory
|
return cleaned_data # Return cleaned data to update shared memory
|
||||||
|
|
||||||
return migrated_data
|
return migrated_data
|
||||||
|
|
|
||||||
|
|
@ -290,55 +290,6 @@ class TestWriteJsonOptimization:
|
||||||
finally:
|
finally:
|
||||||
os.unlink(temp_file)
|
os.unlink(temp_file)
|
||||||
|
|
||||||
def test_empty_values_after_sanitization(self):
|
|
||||||
"""Test that data with empty values after sanitization is properly handled
|
|
||||||
|
|
||||||
Critical edge case: When sanitization results in data with empty string values,
|
|
||||||
we must use 'if cleaned_data is not None' instead of 'if cleaned_data' to ensure
|
|
||||||
proper reload, since truthy check on dict depends on content, not just existence.
|
|
||||||
"""
|
|
||||||
# Create data where ALL values are only surrogate characters
|
|
||||||
all_dirty_data = {
|
|
||||||
"key1": "\ud800\udc00\ud801",
|
|
||||||
"key2": "\ud802\ud803",
|
|
||||||
}
|
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
|
|
||||||
temp_file = f.name
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Write dirty data - should trigger sanitization
|
|
||||||
needs_reload = write_json(all_dirty_data, temp_file)
|
|
||||||
assert needs_reload, "All-dirty data should trigger sanitization"
|
|
||||||
|
|
||||||
# Load the sanitized data
|
|
||||||
cleaned_data = load_json(temp_file)
|
|
||||||
|
|
||||||
# Critical assertions for the edge case
|
|
||||||
assert cleaned_data is not None, "Cleaned data should not be None"
|
|
||||||
# Sanitization removes surrogates but preserves keys with empty values
|
|
||||||
assert cleaned_data == {
|
|
||||||
"key1": "",
|
|
||||||
"key2": "",
|
|
||||||
}, "Surrogates should be removed, keys preserved"
|
|
||||||
# This dict is truthy because it has keys (even with empty values)
|
|
||||||
assert cleaned_data, "Dict with keys is truthy"
|
|
||||||
|
|
||||||
# Test the actual edge case: empty dict
|
|
||||||
empty_data = {}
|
|
||||||
needs_reload2 = write_json(empty_data, temp_file)
|
|
||||||
assert not needs_reload2, "Empty dict is clean"
|
|
||||||
|
|
||||||
reloaded_empty = load_json(temp_file)
|
|
||||||
assert reloaded_empty is not None, "Empty dict should not be None"
|
|
||||||
assert reloaded_empty == {}, "Empty dict should remain empty"
|
|
||||||
assert (
|
|
||||||
not reloaded_empty
|
|
||||||
), "Empty dict evaluates to False (the critical check)"
|
|
||||||
|
|
||||||
finally:
|
|
||||||
os.unlink(temp_file)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Run tests
|
# Run tests
|
||||||
|
|
@ -380,8 +331,4 @@ if __name__ == "__main__":
|
||||||
test.test_migration_with_surrogate_sanitization()
|
test.test_migration_with_surrogate_sanitization()
|
||||||
print("✓ Passed")
|
print("✓ Passed")
|
||||||
|
|
||||||
print("Running test_empty_values_after_sanitization...")
|
|
||||||
test.test_empty_values_after_sanitization()
|
|
||||||
print("✓ Passed")
|
|
||||||
|
|
||||||
print("\n✅ All tests passed!")
|
print("\n✅ All tests passed!")
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue