From 16fff353d9ebfb53f9d97c7bb594479f3ef9c07c Mon Sep 17 00:00:00 2001
From: BukeLy
Date: Sun, 23 Nov 2025 16:43:49 +0800
Subject: [PATCH] fix: prevent data loss in PostgreSQL migration and add doc_status table creation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit fixes two critical issues in PostgreSQL storage:

BUG 1: Legacy table cleanup causing data loss across workspaces
---------------------------------------------------------------
PROBLEM:
- After migrating workspace_a data from legacy table, the ENTIRE legacy table was deleted
- This caused workspace_b's data (still in legacy table) to be lost
- Multi-tenant data isolation was violated

FIX:
- Implement workspace-aware cleanup: only delete migrated workspace's data
- Check if other workspaces still have data before dropping table
- Only drop legacy table when it becomes completely empty
- If other workspace data exists, preserve legacy table with remaining records

Location: postgres_impl.py PGVectorStorage.setup_table() lines 2510-2567

Test verification:
- test_workspace_migration_isolation_e2e_postgres validates this fix

BUG 2: PGDocStatusStorage missing table initialization
-------------------------------------------------------
PROBLEM:
- PGDocStatusStorage.initialize() only set workspace, never created table
- Caused "relation 'lightrag_doc_status' does not exist" errors
- Document insertion (ainsert) failed immediately

FIX:
- Add table creation to initialize() method using _pg_create_table()
- Consistent with other storage implementations:
  * MongoDocStatusStorage creates collections
  * JsonDocStatusStorage creates directories
  * PGDocStatusStorage now creates tables ✓

Location: postgres_impl.py PGDocStatusStorage.initialize() lines 2965-2971

Test Results:
- Unit tests: 13/13 passed (test_unified_lock_safety, test_workspace_migration_isolation, test_dimension_mismatch)
- E2E tests require PostgreSQL server

Related: PR #2391 (Vector Storage Model
Isolation) --- lightrag/kg/postgres_impl.py | 74 +++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 4780d728..3c63f9b8 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -2507,23 +2507,63 @@ class PGVectorStorage(BaseVectorStorage): # Create vector index after successful migration await db._create_vector_index(table_name, embedding_dim) - # Delete legacy table after successful migration - # Data has been verified to match, so legacy table is no longer needed - # and keeping it would cause Case 1 warnings on next startup + # Clean up migrated data from legacy table + # CRITICAL: Only delete current workspace's data, not the entire table! + # Other workspaces may still have data in the legacy table. try: - logger.info( - f"PostgreSQL: Deleting legacy table '{legacy_table_name}'..." - ) - drop_query = f"DROP TABLE {legacy_table_name}" - await db.execute(drop_query, None) - logger.info( - f"PostgreSQL: Legacy table '{legacy_table_name}' deleted successfully" - ) + if workspace: + # Delete only current workspace's migrated data + logger.info( + f"PostgreSQL: Deleting migrated workspace '{workspace}' data from legacy table '{legacy_table_name}'..." + ) + delete_query = ( + f"DELETE FROM {legacy_table_name} WHERE workspace = $1" + ) + await db.execute(delete_query, [workspace]) + logger.info( + f"PostgreSQL: Deleted workspace '{workspace}' data from legacy table" + ) + + # Check if legacy table still has data from other workspaces + remaining_query = ( + f"SELECT COUNT(*) as count FROM {legacy_table_name}" + ) + remaining_result = await db.query(remaining_query, []) + remaining_count = ( + remaining_result.get("count", 0) if remaining_result else 0 + ) + + if remaining_count == 0: + # Table is now empty, safe to drop + logger.info( + f"PostgreSQL: Legacy table '{legacy_table_name}' is empty, deleting..." 
+ ) + drop_query = f"DROP TABLE {legacy_table_name}" + await db.execute(drop_query, None) + logger.info( + f"PostgreSQL: Legacy table '{legacy_table_name}' deleted successfully" + ) + else: + # Table still has data from other workspaces, preserve it + logger.info( + f"PostgreSQL: Legacy table '{legacy_table_name}' preserved ({remaining_count} records from other workspaces remain)" + ) + else: + # No workspace specified - delete entire table (legacy behavior for backward compatibility) + logger.warning( + f"PostgreSQL: No workspace specified, deleting entire legacy table '{legacy_table_name}'..." + ) + drop_query = f"DROP TABLE {legacy_table_name}" + await db.execute(drop_query, None) + logger.info( + f"PostgreSQL: Legacy table '{legacy_table_name}' deleted" + ) + except Exception as delete_error: - # If deletion fails, user will see Case 1 warning on next startup + # If cleanup fails, log warning but don't fail migration logger.warning( - f"PostgreSQL: Failed to delete legacy table '{legacy_table_name}': {delete_error}. " - "You may need to delete it manually." + f"PostgreSQL: Failed to clean up legacy table '{legacy_table_name}': {delete_error}. " + "Migration succeeded, but manual cleanup may be needed." ) except PostgreSQLMigrationError: @@ -2922,6 +2962,12 @@ class PGDocStatusStorage(DocStatusStorage): # Use "default" for compatibility (lowest priority) self.workspace = "default" + # Create table if not exists + table_name = namespace_to_table_name(self.namespace) + table_exists = await _pg_table_exists(self.db, table_name) + if not table_exists: + await _pg_create_table(self.db, table_name, table_name) + async def finalize(self): if self.db is not None: await ClientManager.release_client(self.db)