diff --git a/lightrag/kg/json_doc_status_impl.py b/lightrag/kg/json_doc_status_impl.py index 3a36f58c..bf6e7b17 100644 --- a/lightrag/kg/json_doc_status_impl.py +++ b/lightrag/kg/json_doc_status_impl.py @@ -171,7 +171,7 @@ class JsonDocStatusStorage(DocStatusStorage): f"[{self.workspace}] Reloading sanitized data into shared memory for {self.namespace}" ) cleaned_data = load_json(self._file_name) - if cleaned_data: + if cleaned_data is not None: self._data.clear() self._data.update(cleaned_data) diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 3f99dd4d..f9adb20f 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -91,7 +91,7 @@ class JsonKVStorage(BaseKVStorage): f"[{self.workspace}] Reloading sanitized data into shared memory for {self.namespace}" ) cleaned_data = load_json(self._file_name) - if cleaned_data: + if cleaned_data is not None: self._data.clear() self._data.update(cleaned_data) @@ -283,7 +283,7 @@ class JsonKVStorage(BaseKVStorage): f"[{self.workspace}] Reloading sanitized migration data for {self.namespace}" ) cleaned_data = load_json(self._file_name) - if cleaned_data: + if cleaned_data is not None: return cleaned_data # Return cleaned data to update shared memory return migrated_data diff --git a/tests/test_write_json_optimization.py b/tests/test_write_json_optimization.py index 9c4105b9..0a92904f 100644 --- a/tests/test_write_json_optimization.py +++ b/tests/test_write_json_optimization.py @@ -290,6 +290,55 @@ class TestWriteJsonOptimization: finally: os.unlink(temp_file) + def test_empty_values_after_sanitization(self): + """Test that data with empty values after sanitization is properly handled + + Critical edge case: When sanitization results in data with empty string values, + we must use 'if cleaned_data is not None' instead of 'if cleaned_data' to ensure + proper reload, since truthy check on dict depends on content, not just existence. + """ + # Create data where ALL values are only surrogate characters + all_dirty_data = { + "key1": "\ud800\udc00\ud801", + "key2": "\ud802\ud803", + } + + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: + temp_file = f.name + + try: + # Write dirty data - should trigger sanitization + needs_reload = write_json(all_dirty_data, temp_file) + assert needs_reload, "All-dirty data should trigger sanitization" + + # Load the sanitized data + cleaned_data = load_json(temp_file) + + # Critical assertions for the edge case + assert cleaned_data is not None, "Cleaned data should not be None" + # Sanitization removes surrogates but preserves keys with empty values + assert cleaned_data == { + "key1": "", + "key2": "", + }, "Surrogates should be removed, keys preserved" + # This dict is truthy because it has keys (even with empty values) + assert cleaned_data, "Dict with keys is truthy" + + # Test the actual edge case: empty dict + empty_data = {} + needs_reload2 = write_json(empty_data, temp_file) + assert not needs_reload2, "Empty dict is clean" + + reloaded_empty = load_json(temp_file) + assert reloaded_empty is not None, "Empty dict should not be None" + assert reloaded_empty == {}, "Empty dict should remain empty" + assert ( + not reloaded_empty + ), "Empty dict evaluates to False (the critical check)" + + finally: + os.unlink(temp_file) + if __name__ == "__main__": # Run tests @@ -331,4 +380,8 @@ if __name__ == "__main__": test.test_migration_with_surrogate_sanitization() print("āœ“ Passed") + print("Running test_empty_values_after_sanitization...") + test.test_empty_values_after_sanitization() + print("āœ“ Passed") + print("\nāœ… All tests passed!")