LightRAG/tests/test_dimension_mismatch.py
BukeLy 3b8a1e64b7 style: apply ruff formatting fixes to test files
Apply ruff-format fixes to 6 test files to pass pre-commit checks:
- test_dimension_mismatch.py
- test_e2e_multi_instance.py
- test_no_model_suffix_safety.py
- test_postgres_migration.py
- test_unified_lock_safety.py
- test_workspace_migration_isolation.py

Changes are primarily assert statement reformatting to match ruff style guide.
2025-11-23 16:59:02 +08:00

316 lines
12 KiB
Python

"""
Tests for dimension mismatch handling during migration.
This test module verifies that both PostgreSQL and Qdrant storage backends
properly detect and handle vector dimension mismatches when migrating from
legacy collections/tables to new ones with different embedding models.
"""
import pytest
from unittest.mock import MagicMock, AsyncMock, patch
from lightrag.kg.qdrant_impl import QdrantVectorDBStorage
from lightrag.kg.postgres_impl import PGVectorStorage
# Note: Tests should use proper table names that have DDL templates
# Valid base tables: LIGHTRAG_VDB_CHUNKS, LIGHTRAG_VDB_ENTITIES, LIGHTRAG_VDB_RELATIONSHIPS,
# LIGHTRAG_DOC_CHUNKS, LIGHTRAG_DOC_FULL_DOCS, LIGHTRAG_DOC_TEXT_CHUNKS
class TestQdrantDimensionMismatch:
    """Checks how Qdrant collection setup reacts to legacy vector dimensions."""

    def test_qdrant_dimension_mismatch_skip_migration(self):
        """
        A legacy collection with a different vector size must not be migrated.

        Scenario: the legacy collection reports 1536d vectors while the new
        collection is configured for 3072d.
        Expected: the new collection is created exactly once and the client
        receives no scroll/upsert calls.
        """
        from qdrant_client import models

        # Describe the legacy collection: 1536d vectors.
        legacy_info = MagicMock()
        legacy_info.config.params.vectors.size = 1536

        client = MagicMock()
        # Only the legacy collection exists; the suffixed target
        # ("lightrag_chunks_model_3072d") and every other name do not.
        client.collection_exists.side_effect = lambda name: name == "lightrag_chunks"
        client.get_collection.return_value = legacy_info
        client.count.return_value.count = 100  # legacy collection holds data

        # Ask for a 3072d collection, which disagrees with the legacy 1536d.
        target_config = models.VectorParams(
            size=3072, distance=models.Distance.COSINE
        )
        QdrantVectorDBStorage.setup_collection(
            client,
            "lightrag_chunks_model_3072d",
            namespace="chunks",
            workspace="test",
            vectors_config=target_config,
        )

        # The new collection must have been created ...
        client.create_collection.assert_called_once()
        # ... while no data was read from or written to any collection.
        client.scroll.assert_not_called()
        client.upsert.assert_not_called()

    def test_qdrant_dimension_match_proceed_migration(self):
        """
        A legacy collection with the same vector size is migrated.

        Scenario: the legacy collection reports 1536d vectors and the new
        collection is configured for 1536d as well.
        Expected: the new collection is created exactly once and the client
        receives scroll and upsert calls.
        """
        from qdrant_client import models

        # Describe the legacy collection: 1536d vectors, same as the target.
        legacy_info = MagicMock()
        legacy_info.config.params.vectors.size = 1536

        # A single point served by the first (and only) scroll page.
        stored_point = MagicMock(
            id="test_id",
            vector=[0.1] * 1536,
            payload={"id": "test"},
        )

        client = MagicMock()
        # Only the legacy collection exists; the suffixed target
        # ("lightrag_chunks_model_1536d") and every other name do not.
        client.collection_exists.side_effect = lambda name: name == "lightrag_chunks"
        client.get_collection.return_value = legacy_info
        client.count.return_value.count = 100  # legacy collection holds data
        client.scroll.return_value = ([stored_point], None)

        target_config = models.VectorParams(
            size=1536, distance=models.Distance.COSINE
        )

        # Force legacy discovery to resolve to the legacy collection name.
        legacy_lookup = patch(
            "lightrag.kg.qdrant_impl._find_legacy_collection",
            return_value="lightrag_chunks",
        )
        with legacy_lookup:
            QdrantVectorDBStorage.setup_collection(
                client,
                "lightrag_chunks_model_1536d",
                namespace="chunks",
                workspace="test",
                vectors_config=target_config,
            )

        # Creation plus a read/write round trip shows migration was attempted.
        client.create_collection.assert_called_once()
        client.scroll.assert_called()
        client.upsert.assert_called()
class TestPostgresDimensionMismatch:
    """Test suite for PostgreSQL dimension mismatch handling."""

    @staticmethod
    def _execute_calls_containing(db, fragment):
        """
        Return the recorded ``db.execute`` calls whose SQL contains *fragment*.

        The SQL text is taken to be the first string argument of each call,
        looking at positional arguments first and keyword arguments second,
        so a call recorded without positional arguments is inspected instead
        of raising ``IndexError``.

        Args:
            db: Mock whose ``execute`` attribute recorded the calls.
            fragment: Substring to look for, e.g. ``"INSERT INTO"``.

        Returns:
            List of matching ``unittest.mock.call`` objects, in call order.
        """
        matching = []
        for call in db.execute.call_args_list:
            candidates = (*call.args, *call.kwargs.values())
            sql = next((arg for arg in candidates if isinstance(arg, str)), None)
            if sql and fragment in sql:
                matching.append(call)
        return matching

    @pytest.mark.asyncio
    async def test_postgres_dimension_mismatch_skip_migration_metadata(self):
        """
        Test that PostgreSQL skips migration when dimensions don't match (via metadata).

        Scenario: the pg_attribute lookup reports 1536d for the legacy table
                  while the new table is requested with 3072d.
        Expected: no INSERT statement is sent through ``db.execute``.
        """
        db = AsyncMock()

        # Answer the table-existence, row-count and dimension queries.
        async def query_side_effect(query, params, **kwargs):
            if "information_schema.tables" in query:
                if params[0] == "LIGHTRAG_DOC_CHUNKS":  # legacy
                    return {"exists": True}
                elif params[0] == "LIGHTRAG_DOC_CHUNKS_model_3072d":  # new
                    return {"exists": False}
            elif "COUNT(*)" in query:
                return {"count": 100}  # Legacy has data
            elif "pg_attribute" in query:
                return {"vector_dim": 1536}  # Legacy has 1536d vectors
            return {}

        db.query.side_effect = query_side_effect
        db.execute = AsyncMock()
        db._create_vector_index = AsyncMock()

        # Call setup_table with 3072d (different from legacy 1536d)
        await PGVectorStorage.setup_table(
            db,
            "LIGHTRAG_DOC_CHUNKS_model_3072d",
            legacy_table_name="LIGHTRAG_DOC_CHUNKS",
            base_table="LIGHTRAG_DOC_CHUNKS",
            embedding_dim=3072,
            workspace="test",
        )

        # Copied rows would appear as INSERT statements on db.execute, so
        # their absence shows that the migration was skipped.
        insert_calls = self._execute_calls_containing(db, "INSERT INTO")
        assert not insert_calls, "Migration should be skipped due to dimension mismatch"

    @pytest.mark.asyncio
    async def test_postgres_dimension_mismatch_skip_migration_sampling(self):
        """
        Test that PostgreSQL skips migration when dimensions don't match (via sampling).

        Scenario: the pg_attribute lookup reports -1, and a sampled
                  ``content_vector`` row carries 1536 values while the new
                  table is requested with 3072d.
        Expected: a CREATE TABLE statement is sent through ``db.execute`` and
                  no INSERT statement is.
        """
        db = AsyncMock()

        # Answer the table-existence, row-count, dimension and sample queries.
        async def query_side_effect(query, params, **kwargs):
            if "information_schema.tables" in query:
                if params[0] == "LIGHTRAG_DOC_CHUNKS":  # legacy
                    return {"exists": True}
                elif params[0] == "LIGHTRAG_DOC_CHUNKS_model_3072d":  # new
                    return {"exists": False}
            elif "COUNT(*)" in query:
                return {"count": 100}  # Legacy has data
            elif "pg_attribute" in query:
                return {"vector_dim": -1}  # Metadata check fails
            elif "SELECT content_vector FROM" in query:
                # Return sample vector with 1536 dimensions
                return {"content_vector": [0.1] * 1536}
            return {}

        db.query.side_effect = query_side_effect
        db.execute = AsyncMock()
        db._create_vector_index = AsyncMock()

        # Call setup_table with 3072d (different from legacy 1536d)
        await PGVectorStorage.setup_table(
            db,
            "LIGHTRAG_DOC_CHUNKS_model_3072d",
            legacy_table_name="LIGHTRAG_DOC_CHUNKS",
            base_table="LIGHTRAG_DOC_CHUNKS",
            embedding_dim=3072,
            workspace="test",
        )

        # Verify new table was created
        create_table_calls = self._execute_calls_containing(db, "CREATE TABLE")
        assert create_table_calls, "New table should be created"

        # Verify migration was NOT attempted
        insert_calls = self._execute_calls_containing(db, "INSERT INTO")
        assert not insert_calls, "Migration should be skipped"

    @pytest.mark.asyncio
    async def test_postgres_dimension_match_proceed_migration(self):
        """
        Test that PostgreSQL proceeds with migration when dimensions match.

        Scenario: the pg_attribute lookup reports 1536d for the legacy table
                  and the new table is also requested with 1536d.
        Expected: at least one INSERT statement is sent through ``db.execute``.
        """
        db = AsyncMock()

        async def query_side_effect(query, params, **kwargs):
            multirows = kwargs.get("multirows", False)
            if "information_schema.tables" in query:
                if params[0] == "LIGHTRAG_DOC_CHUNKS":  # legacy
                    return {"exists": True}
                elif params[0] == "LIGHTRAG_DOC_CHUNKS_model_1536d":  # new
                    return {"exists": False}
            elif "COUNT(*)" in query:
                return {"count": 100}  # Legacy has data
            elif "pg_attribute" in query:
                return {"vector_dim": 1536}  # Legacy has matching 1536d
            elif "SELECT * FROM" in query and multirows:
                # Batched read of legacy rows. With workspace filtering the
                # params are [workspace, offset, limit]; otherwise the offset
                # comes first.
                if "WHERE workspace" in query:
                    offset = params[1] if len(params) > 1 else 0
                else:
                    offset = params[0] if params else 0

                if offset != 0:
                    return []  # No more data after the first batch

                return [
                    {
                        "id": "test1",
                        "content_vector": [0.1] * 1536,
                        "workspace": "test",
                    },
                    {
                        "id": "test2",
                        "content_vector": [0.2] * 1536,
                        "workspace": "test",
                    },
                ]
            return {}

        db.query.side_effect = query_side_effect
        db.execute = AsyncMock()
        db._create_vector_index = AsyncMock()

        # Stand-in for _pg_table_exists: only the legacy table exists; the new
        # table ("LIGHTRAG_DOC_CHUNKS_model_1536d") and any other name do not.
        async def mock_table_exists(db_inst, name):
            return name == "LIGHTRAG_DOC_CHUNKS"

        with patch(
            "lightrag.kg.postgres_impl._pg_table_exists",
            side_effect=mock_table_exists,
        ):
            # Call setup_table with matching 1536d
            await PGVectorStorage.setup_table(
                db,
                "LIGHTRAG_DOC_CHUNKS_model_1536d",
                legacy_table_name="LIGHTRAG_DOC_CHUNKS",
                base_table="LIGHTRAG_DOC_CHUNKS",
                embedding_dim=1536,
                workspace="test",
            )

        # Verify migration WAS attempted (INSERT calls made)
        insert_calls = self._execute_calls_containing(db, "INSERT INTO")
        assert insert_calls, "Migration should proceed with matching dimensions"