test: add unit test for Case 1 sequential workspace migration bug

Add test_case1_sequential_workspace_migration to verify the fix for
the multi-tenant data loss bug in PostgreSQL Case 1 migration.

Problem:
- When workspace_a migrates first (Case 4: only legacy table exists)
- Then workspace_b initializes later (Case 1: both tables exist)
- Bug: Case 1 only checked if legacy table was globally empty
- Result: workspace_b's data was not migrated, causing data loss

Test Scenario:
1. Legacy table contains data from both workspace_a (3 records) and
   workspace_b (3 records)
2. workspace_a initializes first → triggers Case 4 migration
3. workspace_b initializes second → triggers Case 1 migration
4. Verify workspace_b's data is correctly migrated to new table
5. Verify workspace_b's data is deleted from legacy table
6. Verify legacy table is dropped when empty

This test uses mock tracking of inserted records to verify migration
behavior without requiring a real PostgreSQL database.

Related: GitHub PR #2391 comment #2553973066
This commit is contained in:
BukeLy 2025-11-26 01:32:07 +08:00
parent a8f5c9bd33
commit 0fb7c5bc3b
2 changed files with 233 additions and 2 deletions

View file

@@ -574,3 +574,232 @@ async def test_case1_nonempty_legacy_warning(
print(
f"✅ Case 1b: Legacy table '{dropped_table}' with data preserved (warning only)"
)
@pytest.mark.asyncio
async def test_case1_sequential_workspace_migration(
    mock_client_manager, mock_pg_db, mock_embedding_func
):
    """
    Case 1c: Sequential workspace migration (multi-tenant scenario).

    Critical bug-fix verification. Timeline:
    1. Legacy table has workspace_a (3 records) + workspace_b (3 records).
    2. Workspace A initializes first → Case 4 (only legacy exists) migrates A's data.
    3. Workspace B initializes later → Case 1 (both tables exist) should migrate B's data.
    4. Verify workspace B's data is correctly migrated to the new table.
    5. Verify the legacy table is cleaned up after both workspaces migrate.

    This test verifies the fix where Case 1 now checks and migrates the current
    workspace's data instead of just checking if the legacy table is empty globally.
    No real PostgreSQL is used: table existence, queries and executes are all mocked,
    and migration progress is tracked via shared mutable dicts captured by closures.
    """
    config = {
        "embedding_batch_num": 10,
        "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8},
    }
    embedding_func = EmbeddingFunc(
        embedding_dim=1536,
        func=mock_embedding_func.func,
        model_name="test-model",
    )
    # Mock data: legacy table holds 6 records total (3 from workspace_a, 3 from workspace_b)
    mock_rows_a = [
        {"id": f"a_{i}", "content": f"A content {i}", "workspace": "workspace_a"}
        for i in range(3)
    ]
    mock_rows_b = [
        {"id": f"b_{i}", "content": f"B content {i}", "workspace": "workspace_b"}
        for i in range(3)
    ]
    # Shared migration state, mutated as the two workspaces initialize in turn.
    # (workspace_a_migrated is recorded for readability; only new_table_exists
    # is actually read back, by mock_table_exists_a.)
    migration_state = {"new_table_exists": False, "workspace_a_migrated": False}

    # ---- Step 1: simulate workspace_a initialization (Case 4) ----
    # CRITICAL: the db-level workspace must match the storage's workspace,
    # since the migration code reads it from the db handle.
    mock_pg_db.workspace = "workspace_a"
    storage_a = PGVectorStorage(
        namespace=NameSpace.VECTOR_STORE_CHUNKS,
        global_config=config,
        embedding_func=embedding_func,
        workspace="workspace_a",
    )

    # Table-existence mock for workspace_a: only the legacy table exists at
    # first; the new table "appears" once migration_state flips it on.
    async def mock_table_exists_a(db, table_name):
        if table_name == storage_a.legacy_table_name:
            return True
        if table_name == storage_a.table_name:
            return migration_state["new_table_exists"]
        return False

    # Count of rows inserted into the new table, used by the query mock to
    # report shrinking legacy counts / growing new-table counts.
    inserted_count = {"workspace_a": 0}

    # Execute mock: tracks INSERTs so count queries stay consistent.
    async def mock_execute_a(sql, data=None, **kwargs):
        if sql and "INSERT INTO" in sql.upper():
            inserted_count["workspace_a"] += 1
        return None

    # Query mock for workspace_a (Case 4). Dispatches on the SQL text:
    # COUNT(*) queries return simulated row counts; SELECT * queries return
    # the mock rows being migrated.
    async def mock_query_a(sql, params=None, multirows=False, **kwargs):
        sql_upper = sql.upper()
        base_name = storage_a.legacy_table_name.upper()
        # NOTE(review): COUNT(*)/WHERE matches are done on the raw sql
        # (case-sensitive) while the table-name checks use sql_upper —
        # assumes production SQL uses exactly these casings; confirm.
        if "COUNT(*)" in sql:
            # New-table names carry a model/dimension suffix; legacy names do not.
            has_model_suffix = "TEST_MODEL_1536D" in sql_upper
            is_legacy = base_name in sql_upper and not has_model_suffix
            has_workspace_filter = "WHERE workspace" in sql
            if is_legacy and has_workspace_filter:
                workspace = params[0] if params and len(params) > 0 else None
                if workspace == "workspace_a":
                    # Legacy count shrinks as rows are "migrated" (inserted).
                    return {"count": 3 - inserted_count["workspace_a"]}
                elif workspace == "workspace_b":
                    return {"count": 3}
            elif is_legacy and not has_workspace_filter:
                # Global count in legacy table.
                remaining = 6 - inserted_count["workspace_a"]
                return {"count": remaining}
            elif has_model_suffix:
                # New table count (for verification).
                return {"count": inserted_count["workspace_a"]}
        elif multirows and "SELECT *" in sql:
            if "WHERE workspace" in sql:
                workspace = params[0] if params and len(params) > 0 else None
                if workspace == "workspace_a":
                    # Paginated read of the rows to migrate.
                    offset = params[1] if len(params) > 1 else 0
                    limit = params[2] if len(params) > 2 else 500
                    return mock_rows_a[offset : offset + limit]
        # Fallback for any unrecognized query shape.
        return {}

    mock_pg_db.query = AsyncMock(side_effect=mock_query_a)
    mock_pg_db.execute = AsyncMock(side_effect=mock_execute_a)
    # Initialize workspace_a (Case 4): patch table existence and creation so
    # no real DDL runs.
    with (
        patch(
            "lightrag.kg.postgres_impl._pg_table_exists",
            side_effect=mock_table_exists_a,
        ),
        patch("lightrag.kg.postgres_impl._pg_create_table", AsyncMock()),
    ):
        await storage_a.initialize()
    migration_state["new_table_exists"] = True
    migration_state["workspace_a_migrated"] = True
    print("✅ Step 1: Workspace A initialized (Case 4)")
    # Expect at least 3 execute calls (one INSERT per migrated row).
    assert mock_pg_db.execute.call_count >= 3
    print(f"✅ Step 1: {mock_pg_db.execute.call_count} execute calls")

    # ---- Step 2: simulate workspace_b initialization (Case 1) ----
    # CRITICAL: switch the db-level workspace to workspace_b.
    mock_pg_db.workspace = "workspace_b"
    storage_b = PGVectorStorage(
        namespace=NameSpace.VECTOR_STORE_CHUNKS,
        global_config=config,
        embedding_func=embedding_func,
        workspace="workspace_b",
    )
    # Clear call counts/history from step 1 so step-2 assertions only see
    # workspace_b activity.
    mock_pg_db.reset_mock()
    migration_state["workspace_b_migrated"] = False

    # Table-existence mock for workspace_b: both legacy and new tables exist,
    # which is exactly the Case 1 precondition.
    async def mock_table_exists_b(db, table_name):
        return True

    # Track inserted records count for workspace_b (reuses the step-1 dict).
    inserted_count["workspace_b"] = 0

    # Execute mock for workspace_b: tracks INSERTs as in step 1.
    async def mock_execute_b(sql, data=None, **kwargs):
        if sql and "INSERT INTO" in sql.upper():
            inserted_count["workspace_b"] += 1
        return None

    # Query mock for workspace_b (Case 1). Same dispatch scheme as step 1,
    # but workspace_a's legacy rows now report as already migrated (count 0).
    async def mock_query_b(sql, params=None, multirows=False, **kwargs):
        sql_upper = sql.upper()
        base_name = storage_b.legacy_table_name.upper()
        if "COUNT(*)" in sql:
            has_model_suffix = "TEST_MODEL_1536D" in sql_upper
            is_legacy = base_name in sql_upper and not has_model_suffix
            has_workspace_filter = "WHERE workspace" in sql
            if is_legacy and has_workspace_filter:
                workspace = params[0] if params and len(params) > 0 else None
                if workspace == "workspace_b":
                    # Legacy count shrinks as workspace_b rows are migrated.
                    return {"count": 3 - inserted_count["workspace_b"]}
                elif workspace == "workspace_a":
                    return {"count": 0}  # Already migrated
            elif is_legacy and not has_workspace_filter:
                # Global count: only workspace_b data remains in legacy.
                return {"count": 3 - inserted_count["workspace_b"]}
            elif has_model_suffix:
                # New table counts (workspace_a: 3 already there, plus
                # however many workspace_b rows have been inserted).
                if has_workspace_filter:
                    workspace = params[0] if params and len(params) > 0 else None
                    if workspace == "workspace_b":
                        return {"count": inserted_count["workspace_b"]}
                    elif workspace == "workspace_a":
                        return {"count": 3}
                else:
                    # Total count in new table (for verification).
                    return {"count": 3 + inserted_count["workspace_b"]}
        elif multirows and "SELECT *" in sql:
            if "WHERE workspace" in sql:
                workspace = params[0] if params and len(params) > 0 else None
                if workspace == "workspace_b":
                    offset = params[1] if len(params) > 1 else 0
                    limit = params[2] if len(params) > 2 else 500
                    return mock_rows_b[offset : offset + limit]
        return {}

    mock_pg_db.query = AsyncMock(side_effect=mock_query_b)
    mock_pg_db.execute = AsyncMock(side_effect=mock_execute_b)
    # Initialize workspace_b (Case 1). Only table existence needs patching:
    # both tables "exist", so no table creation should occur.
    with patch(
        "lightrag.kg.postgres_impl._pg_table_exists", side_effect=mock_table_exists_b
    ):
        await storage_b.initialize()
    migration_state["workspace_b_migrated"] = True
    print("✅ Step 2: Workspace B initialized (Case 1)")

    # Verify workspace_b migration happened: at least 3 INSERTs into the new table.
    execute_calls = mock_pg_db.execute.call_args_list
    insert_calls = [
        call for call in execute_calls if call[0][0] and "INSERT INTO" in call[0][0]
    ]
    assert len(insert_calls) >= 3, f"Expected >= 3 inserts, got {len(insert_calls)}"
    print(f"✅ Step 2: {len(insert_calls)} insert calls")
    # Verify workspace-scoped DELETE from the legacy table, then DROP TABLE
    # once the legacy table is globally empty.
    delete_calls = [
        call
        for call in execute_calls
        if call[0][0]
        and "DELETE FROM" in call[0][0]
        and "WHERE workspace" in call[0][0]
    ]
    assert len(delete_calls) >= 1, "Expected DELETE workspace_b data"
    print("✅ Step 2: DELETE workspace_b from legacy")
    drop_calls = [
        call for call in execute_calls if call[0][0] and "DROP TABLE" in call[0][0]
    ]
    assert len(drop_calls) >= 1, "Expected DROP TABLE"
    print("✅ Step 2: Legacy table dropped")
    print("\n🎉 Case 1c: Sequential workspace migration verified!")

6
uv.lock generated
View file

@@ -1,5 +1,5 @@
version = 1
revision = 3
revision = 2
requires-python = ">=3.10"
resolution-markers = [
"python_full_version >= '3.14' and python_full_version < '4' and platform_machine == 'x86_64' and sys_platform == 'darwin'",
@@ -2735,7 +2735,6 @@ requires-dist = [
{ name = "json-repair", marker = "extra == 'api'" },
{ name = "langfuse", marker = "extra == 'observability'", specifier = ">=3.8.1" },
{ name = "lightrag-hku", extras = ["api", "offline-llm", "offline-storage"], marker = "extra == 'offline'" },
{ name = "lightrag-hku", extras = ["pytest"], marker = "extra == 'evaluation'" },
{ name = "llama-index", marker = "extra == 'offline-llm'", specifier = ">=0.9.0,<1.0.0" },
{ name = "nano-vectordb" },
{ name = "nano-vectordb", marker = "extra == 'api'" },
@@ -2753,6 +2752,7 @@ requires-dist = [
{ name = "passlib", extras = ["bcrypt"], marker = "extra == 'api'" },
{ name = "pipmaster" },
{ name = "pipmaster", marker = "extra == 'api'" },
{ name = "pre-commit", marker = "extra == 'evaluation'" },
{ name = "pre-commit", marker = "extra == 'pytest'" },
{ name = "psutil", marker = "extra == 'api'" },
{ name = "pycryptodome", marker = "extra == 'api'", specifier = ">=3.0.0,<4.0.0" },
@@ -2764,7 +2764,9 @@ requires-dist = [
{ name = "pypdf", marker = "extra == 'api'", specifier = ">=6.1.0" },
{ name = "pypinyin" },
{ name = "pypinyin", marker = "extra == 'api'" },
{ name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" },
{ name = "pytest", marker = "extra == 'pytest'", specifier = ">=8.4.2" },
{ name = "pytest-asyncio", marker = "extra == 'evaluation'", specifier = ">=1.2.0" },
{ name = "pytest-asyncio", marker = "extra == 'pytest'", specifier = ">=1.2.0" },
{ name = "python-docx", marker = "extra == 'api'", specifier = ">=0.8.11,<2.0.0" },
{ name = "python-dotenv" },