""" Tests for dimension mismatch handling during migration. This test module verifies that both PostgreSQL and Qdrant storage backends properly detect and handle vector dimension mismatches when migrating from legacy collections/tables to new ones with different embedding models. """ import pytest from unittest.mock import MagicMock, AsyncMock, patch from lightrag.kg.qdrant_impl import QdrantVectorDBStorage from lightrag.kg.postgres_impl import PGVectorStorage # Note: Tests should use proper table names that have DDL templates # Valid base tables: LIGHTRAG_VDB_CHUNKS, LIGHTRAG_VDB_ENTITIES, LIGHTRAG_VDB_RELATIONSHIPS, # LIGHTRAG_DOC_CHUNKS, LIGHTRAG_DOC_FULL_DOCS, LIGHTRAG_DOC_TEXT_CHUNKS class TestQdrantDimensionMismatch: """Test suite for Qdrant dimension mismatch handling.""" def test_qdrant_dimension_mismatch_skip_migration(self): """ Test that Qdrant skips migration when dimensions don't match. Scenario: Legacy collection has 1536d vectors, new model expects 3072d. Expected: Migration skipped, new empty collection created, legacy preserved. """ from qdrant_client import models # Setup mock client client = MagicMock() # Mock legacy collection with 1536d vectors legacy_collection_info = MagicMock() legacy_collection_info.config.params.vectors.size = 1536 # Setup collection existence checks def collection_exists_side_effect(name): if name == "lightrag_chunks": # legacy return True elif name == "lightrag_chunks_model_3072d": # new return False return False client.collection_exists.side_effect = collection_exists_side_effect client.get_collection.return_value = legacy_collection_info client.count.return_value.count = 100 # Legacy has data # Call setup_collection with 3072d (different from legacy 1536d) QdrantVectorDBStorage.setup_collection( client, "lightrag_chunks_model_3072d", namespace="chunks", workspace="test", vectors_config=models.VectorParams( size=3072, distance=models.Distance.COSINE ), ) # Verify new collection was created client.create_collection.assert_called_once() # Verify migration was NOT attempted (no scroll/upsert calls) client.scroll.assert_not_called() client.upsert.assert_not_called() def test_qdrant_dimension_match_proceed_migration(self): """ Test that Qdrant proceeds with migration when dimensions match. Scenario: Legacy collection has 1536d vectors, new model also expects 1536d. Expected: Migration proceeds normally. """ from qdrant_client import models client = MagicMock() # Mock legacy collection with 1536d vectors (matching new) legacy_collection_info = MagicMock() legacy_collection_info.config.params.vectors.size = 1536 def collection_exists_side_effect(name): if name == "lightrag_chunks": # legacy return True elif name == "lightrag_chunks_model_1536d": # new return False return False client.collection_exists.side_effect = collection_exists_side_effect client.get_collection.return_value = legacy_collection_info client.count.return_value.count = 100 # Legacy has data # Mock scroll to return sample data sample_point = MagicMock() sample_point.id = "test_id" sample_point.vector = [0.1] * 1536 sample_point.payload = {"id": "test"} client.scroll.return_value = ([sample_point], None) # Mock _find_legacy_collection to return the legacy collection name with patch( "lightrag.kg.qdrant_impl._find_legacy_collection", return_value="lightrag_chunks", ): # Call setup_collection with matching 1536d QdrantVectorDBStorage.setup_collection( client, "lightrag_chunks_model_1536d", namespace="chunks", workspace="test", vectors_config=models.VectorParams( size=1536, distance=models.Distance.COSINE ), ) # Verify migration WAS attempted client.create_collection.assert_called_once() client.scroll.assert_called() client.upsert.assert_called() class TestPostgresDimensionMismatch: """Test suite for PostgreSQL dimension mismatch handling.""" @pytest.mark.asyncio async def test_postgres_dimension_mismatch_skip_migration_metadata(self): """ Test that PostgreSQL skips migration when dimensions don't match (via metadata). Scenario: Legacy table has 1536d vectors (detected via pg_attribute), new model expects 3072d. Expected: Migration skipped, new empty table created, legacy preserved. """ # Setup mock database db = AsyncMock() # Mock table existence and dimension checks async def query_side_effect(query, params, **kwargs): if "information_schema.tables" in query: if params[0] == "LIGHTRAG_DOC_CHUNKS": # legacy return {"exists": True} elif params[0] == "LIGHTRAG_DOC_CHUNKS_model_3072d": # new return {"exists": False} elif "COUNT(*)" in query: return {"count": 100} # Legacy has data elif "pg_attribute" in query: return {"vector_dim": 1536} # Legacy has 1536d vectors return {} db.query.side_effect = query_side_effect db.execute = AsyncMock() db._create_vector_index = AsyncMock() # Call setup_table with 3072d (different from legacy 1536d) await PGVectorStorage.setup_table( db, "LIGHTRAG_DOC_CHUNKS_model_3072d", legacy_table_name="LIGHTRAG_DOC_CHUNKS", base_table="LIGHTRAG_DOC_CHUNKS", embedding_dim=3072, workspace="test", ) # Verify migration was NOT attempted (no INSERT calls) # Note: _pg_create_table is mocked, so we check INSERT calls to verify migration was skipped insert_calls = [ call for call in db.execute.call_args_list if call[0][0] and "INSERT INTO" in call[0][0] ] assert ( len(insert_calls) == 0 ), "Migration should be skipped due to dimension mismatch" @pytest.mark.asyncio async def test_postgres_dimension_mismatch_skip_migration_sampling(self): """ Test that PostgreSQL skips migration when dimensions don't match (via sampling). Scenario: Legacy table dimension detection fails via metadata, falls back to vector sampling, detects 1536d vs expected 3072d. Expected: Migration skipped, new empty table created, legacy preserved. """ db = AsyncMock() # Mock table existence and dimension checks async def query_side_effect(query, params, **kwargs): if "information_schema.tables" in query: if params[0] == "LIGHTRAG_DOC_CHUNKS": # legacy return {"exists": True} elif params[0] == "LIGHTRAG_DOC_CHUNKS_model_3072d": # new return {"exists": False} elif "COUNT(*)" in query: return {"count": 100} # Legacy has data elif "pg_attribute" in query: return {"vector_dim": -1} # Metadata check fails elif "SELECT content_vector FROM" in query: # Return sample vector with 1536 dimensions return {"content_vector": [0.1] * 1536} return {} db.query.side_effect = query_side_effect db.execute = AsyncMock() db._create_vector_index = AsyncMock() # Call setup_table with 3072d (different from legacy 1536d) await PGVectorStorage.setup_table( db, "LIGHTRAG_DOC_CHUNKS_model_3072d", legacy_table_name="LIGHTRAG_DOC_CHUNKS", base_table="LIGHTRAG_DOC_CHUNKS", embedding_dim=3072, workspace="test", ) # Verify new table was created create_table_calls = [ call for call in db.execute.call_args_list if call[0][0] and "CREATE TABLE" in call[0][0] ] assert len(create_table_calls) > 0, "New table should be created" # Verify migration was NOT attempted insert_calls = [ call for call in db.execute.call_args_list if call[0][0] and "INSERT INTO" in call[0][0] ] assert len(insert_calls) == 0, "Migration should be skipped" @pytest.mark.asyncio async def test_postgres_dimension_match_proceed_migration(self): """ Test that PostgreSQL proceeds with migration when dimensions match. Scenario: Legacy table has 1536d vectors, new model also expects 1536d. Expected: Migration proceeds normally. """ db = AsyncMock() async def query_side_effect(query, params, **kwargs): multirows = kwargs.get("multirows", False) if "information_schema.tables" in query: if params[0] == "LIGHTRAG_DOC_CHUNKS": # legacy return {"exists": True} elif params[0] == "LIGHTRAG_DOC_CHUNKS_model_1536d": # new return {"exists": False} elif "COUNT(*)" in query: return {"count": 100} # Legacy has data elif "pg_attribute" in query: return {"vector_dim": 1536} # Legacy has matching 1536d elif "SELECT * FROM" in query and multirows: # Return sample data for migration (first batch) # Handle workspace filtering: params = [workspace, offset, limit] if "WHERE workspace" in query: offset = params[1] if len(params) > 1 else 0 else: offset = params[0] if params else 0 if offset == 0: # First batch return [ { "id": "test1", "content_vector": [0.1] * 1536, "workspace": "test", }, { "id": "test2", "content_vector": [0.2] * 1536, "workspace": "test", }, ] else: # offset > 0 return [] # No more data return {} db.query.side_effect = query_side_effect db.execute = AsyncMock() db._create_vector_index = AsyncMock() # Mock _pg_table_exists async def mock_table_exists(db_inst, name): if name == "LIGHTRAG_DOC_CHUNKS": # legacy exists return True elif name == "LIGHTRAG_DOC_CHUNKS_model_1536d": # new doesn't exist return False return False with patch( "lightrag.kg.postgres_impl._pg_table_exists", side_effect=mock_table_exists, ): # Call setup_table with matching 1536d await PGVectorStorage.setup_table( db, "LIGHTRAG_DOC_CHUNKS_model_1536d", legacy_table_name="LIGHTRAG_DOC_CHUNKS", base_table="LIGHTRAG_DOC_CHUNKS", embedding_dim=1536, workspace="test", ) # Verify migration WAS attempted (INSERT calls made) insert_calls = [ call for call in db.execute.call_args_list if call[0][0] and "INSERT INTO" in call[0][0] ] assert ( len(insert_calls) > 0 ), "Migration should proceed with matching dimensions"