fix: prevent data loss in PostgreSQL migration and add doc_status table creation

This commit fixes two critical issues in PostgreSQL storage:

BUG 1: Legacy table cleanup causing data loss across workspaces
---------------------------------------------------------------
PROBLEM:
- After migrating workspace_a's data out of the legacy table, the ENTIRE
  legacy table was dropped (DROP TABLE)
- This caused workspace_b's data (still in the legacy table) to be lost
- Multi-tenant data isolation was violated

FIX:
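- Implement workspace-aware cleanup: delete only the migrated workspace's
  rows (DELETE ... WHERE workspace = $1)
- Count the rows remaining in the legacy table before dropping it
- Drop the legacy table only when it is completely empty
- If other workspaces still have data, preserve the legacy table with the
  remaining records
- If no workspace is set, the whole legacy table is still dropped (previous
  behavior, kept for backward compatibility)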

Location: postgres_impl.py PGVectorStorage.setup_table() lines 2510-2567

Test verification:
- test_workspace_migration_isolation_e2e_postgres validates this fix

BUG 2: PGDocStatusStorage missing table initialization
-------------------------------------------------------
PROBLEM:
- PGDocStatusStorage.initialize() only set the workspace and never created
  the table
- This caused "relation 'lightrag_doc_status' does not exist" errors
- Document insertion (ainsert) failed immediately

FIX:
- Add table creation to initialize(): check _pg_table_exists() and call
  _pg_create_table() only when the table is missing
- Consistent with other storage implementations:
  * MongoDocStatusStorage creates collections
  * JsonDocStatusStorage creates directories
  * PGDocStatusStorage now creates tables ✓

Location: postgres_impl.py PGDocStatusStorage.initialize() lines 2965-2971

Test Results:
- Unit tests: 13/13 passed (test_unified_lock_safety,
  test_workspace_migration_isolation, test_dimension_mismatch)
- E2E tests require PostgreSQL server

Related: PR #2391 (Vector Storage Model Isolation)
This commit is contained in:
BukeLy 2025-11-23 16:43:49 +08:00
parent 204a2535c8
commit 16fff353d9

View file

@@ -2507,23 +2507,63 @@ class PGVectorStorage(BaseVectorStorage):
# Create vector index after successful migration
await db._create_vector_index(table_name, embedding_dim)
# Delete legacy table after successful migration
# Data has been verified to match, so legacy table is no longer needed
# and keeping it would cause Case 1 warnings on next startup
# Clean up migrated data from legacy table
# CRITICAL: Only delete current workspace's data, not the entire table!
# Other workspaces may still have data in the legacy table.
try:
logger.info(
f"PostgreSQL: Deleting legacy table '{legacy_table_name}'..."
)
drop_query = f"DROP TABLE {legacy_table_name}"
await db.execute(drop_query, None)
logger.info(
f"PostgreSQL: Legacy table '{legacy_table_name}' deleted successfully"
)
if workspace:
# Delete only current workspace's migrated data
logger.info(
f"PostgreSQL: Deleting migrated workspace '{workspace}' data from legacy table '{legacy_table_name}'..."
)
delete_query = (
f"DELETE FROM {legacy_table_name} WHERE workspace = $1"
)
await db.execute(delete_query, [workspace])
logger.info(
f"PostgreSQL: Deleted workspace '{workspace}' data from legacy table"
)
# Check if legacy table still has data from other workspaces
remaining_query = (
f"SELECT COUNT(*) as count FROM {legacy_table_name}"
)
remaining_result = await db.query(remaining_query, [])
remaining_count = (
remaining_result.get("count", 0) if remaining_result else 0
)
if remaining_count == 0:
# Table is now empty, safe to drop
logger.info(
f"PostgreSQL: Legacy table '{legacy_table_name}' is empty, deleting..."
)
drop_query = f"DROP TABLE {legacy_table_name}"
await db.execute(drop_query, None)
logger.info(
f"PostgreSQL: Legacy table '{legacy_table_name}' deleted successfully"
)
else:
# Table still has data from other workspaces, preserve it
logger.info(
f"PostgreSQL: Legacy table '{legacy_table_name}' preserved ({remaining_count} records from other workspaces remain)"
)
else:
# No workspace specified - delete entire table (legacy behavior for backward compatibility)
logger.warning(
f"PostgreSQL: No workspace specified, deleting entire legacy table '{legacy_table_name}'..."
)
drop_query = f"DROP TABLE {legacy_table_name}"
await db.execute(drop_query, None)
logger.info(
f"PostgreSQL: Legacy table '{legacy_table_name}' deleted"
)
except Exception as delete_error:
# If deletion fails, user will see Case 1 warning on next startup
# If cleanup fails, log warning but don't fail migration
logger.warning(
f"PostgreSQL: Failed to delete legacy table '{legacy_table_name}': {delete_error}. "
"You may need to delete it manually."
f"PostgreSQL: Failed to clean up legacy table '{legacy_table_name}': {delete_error}. "
"Migration succeeded, but manual cleanup may be needed."
)
except PostgreSQLMigrationError:
@@ -2922,6 +2962,12 @@ class PGDocStatusStorage(DocStatusStorage):
# Use "default" for compatibility (lowest priority)
self.workspace = "default"
# Create table if not exists
table_name = namespace_to_table_name(self.namespace)
table_exists = await _pg_table_exists(self.db, table_name)
if not table_exists:
await _pg_create_table(self.db, table_name, table_name)
async def finalize(self):
if self.db is not None:
await ClientManager.release_client(self.db)