Why this change is needed:
CI reported 5 lint errors that needed to be fixed:
- Unused import of 'patch' in test_dimension_mismatch.py
- Unnecessary f-string prefixes without placeholders
- Bare except clauses without an exception type

How it solves it:
- Removed the unused 'patch' import (auto-fixed by ruff)
- Removed the unnecessary f-string prefixes (auto-fixed by ruff)
- Changed bare 'except:' to 'except Exception:' for proper exception handling

Impact:
- Code now passes all ruff lint checks
- Better exception handling practices (no longer catches SystemExit/KeyboardInterrupt)
- Cleaner, more maintainable test code

Testing:
Verified with: uv run ruff check tests/
Result: All checks passed!
290 lines
11 KiB
Python
290 lines
11 KiB
Python
"""
|
|
Tests for dimension mismatch handling during migration.
|
|
|
|
This test module verifies that both PostgreSQL and Qdrant storage backends
|
|
properly detect and handle vector dimension mismatches when migrating from
|
|
legacy collections/tables to new ones with different embedding models.
|
|
"""
|
|
|
|
import pytest
|
|
from unittest.mock import MagicMock, AsyncMock
|
|
|
|
from lightrag.kg.qdrant_impl import QdrantVectorDBStorage
|
|
from lightrag.kg.postgres_impl import PGVectorStorage
|
|
|
|
|
|
class TestQdrantDimensionMismatch:
    """Tests for QdrantVectorDBStorage.setup_collection with a legacy collection.

    Every test drives ``setup_collection`` with a fully mocked client and then
    inspects which client methods were invoked.
    """

    @staticmethod
    def _legacy_only_client(legacy_dim):
        """Build a MagicMock client in which only "lightrag_chunks" exists.

        Args:
            legacy_dim: Value exposed as ``config.params.vectors.size`` on the
                object returned by ``get_collection``.

        Returns:
            A MagicMock whose ``collection_exists`` is True solely for
            "lightrag_chunks" and whose ``count`` result reports 100 points.
        """

        def only_legacy_exists(name):
            # Any other name, including the new target collection, is absent.
            return name == "lightrag_chunks"

        legacy_info = MagicMock()
        legacy_info.config.params.vectors.size = legacy_dim

        mock_client = MagicMock()
        mock_client.collection_exists.side_effect = only_legacy_exists
        mock_client.get_collection.return_value = legacy_info
        mock_client.count.return_value.count = 100  # legacy holds data
        return mock_client

    def test_qdrant_dimension_mismatch_skip_migration(self):
        """
        Mismatched dimensions: the legacy data must not be copied over.

        Setup: the legacy collection reports 1536d vectors while
        setup_collection is asked for a 3072d collection.
        Checks: create_collection is called exactly once, and neither scroll
        nor upsert is called at all.
        """
        from qdrant_client import models

        mock_client = self._legacy_only_client(legacy_dim=1536)
        target_config = models.VectorParams(
            size=3072, distance=models.Distance.COSINE
        )

        # Requested size (3072) differs from the legacy size (1536).
        QdrantVectorDBStorage.setup_collection(
            mock_client,
            "lightrag_chunks_model_3072d",
            namespace="chunks",
            workspace="test",
            vectors_config=target_config,
        )

        # The new collection is created ...
        mock_client.create_collection.assert_called_once()

        # ... but nothing is read from or written for migration.
        mock_client.scroll.assert_not_called()
        mock_client.upsert.assert_not_called()

    def test_qdrant_dimension_match_proceed_migration(self):
        """
        Matching dimensions: the legacy data is migrated.

        Setup: the legacy collection reports 1536d vectors and
        setup_collection is asked for a 1536d collection; scroll yields a
        single point followed by no further page.
        Checks: create_collection is called exactly once, and both scroll and
        upsert are called.
        """
        from qdrant_client import models

        mock_client = self._legacy_only_client(legacy_dim=1536)

        # One page containing one point; a None offset ends the scroll.
        legacy_point = MagicMock()
        legacy_point.id = "test_id"
        legacy_point.vector = [0.1] * 1536
        legacy_point.payload = {"id": "test"}
        mock_client.scroll.return_value = ([legacy_point], None)

        target_config = models.VectorParams(
            size=1536, distance=models.Distance.COSINE
        )

        # Requested size equals the legacy size.
        QdrantVectorDBStorage.setup_collection(
            mock_client,
            "lightrag_chunks_model_1536d",
            namespace="chunks",
            workspace="test",
            vectors_config=target_config,
        )

        # Creation plus the read/write halves of the migration all happened.
        mock_client.create_collection.assert_called_once()
        mock_client.scroll.assert_called()
        mock_client.upsert.assert_called()
|
|
|
|
|
|
class TestPostgresDimensionMismatch:
    """Tests for PGVectorStorage.setup_table with a legacy table present.

    Every test feeds ``setup_table`` a mocked database whose ``query`` replies
    are scripted per SQL fragment, then inspects the SQL passed to
    ``execute``.
    """

    @staticmethod
    def _catalog_reply(query, params, new_table, legacy_dim):
        """Script the replies shared by all tests in this class.

        Args:
            query: SQL text handed to the mocked ``db.query``.
            params: Query parameters; only ``params[0]`` is read, and only for
                table-existence queries.
            new_table: Name of the target table, reported as not existing.
            legacy_dim: Value returned under "vector_dim" for pg_attribute
                queries.

        Returns:
            The scripted reply dict, or None when ``query`` is none of the
            shared query kinds so the caller can apply its own rules.
        """
        if "information_schema.tables" in query:
            if params[0] == "lightrag_doc_chunks":  # legacy table
                return {"exists": True}
            if params[0] == new_table:  # target table
                return {"exists": False}
            # An existence query for any other table gets an empty reply.
            return {}
        if "COUNT(*)" in query:
            return {"count": 100}  # legacy holds data
        if "pg_attribute" in query:
            return {"vector_dim": legacy_dim}
        return None

    @staticmethod
    def _mock_db(query_handler):
        """Return an AsyncMock database whose ``query`` runs ``query_handler``.

        ``execute`` and ``_create_vector_index`` are replaced by fresh
        AsyncMock instances.
        """
        mock_db = AsyncMock()
        mock_db.query.side_effect = query_handler
        mock_db.execute = AsyncMock()
        mock_db._create_vector_index = AsyncMock()
        return mock_db

    @staticmethod
    def _executed_sql(mock_db, fragment):
        """List the recorded ``execute`` calls whose first positional argument
        is truthy and contains ``fragment``."""
        return [
            recorded
            for recorded in mock_db.execute.call_args_list
            if recorded.args[0] and fragment in recorded.args[0]
        ]

    @pytest.mark.asyncio
    async def test_postgres_dimension_mismatch_skip_migration_metadata(self):
        """
        Mismatched dimensions reported by the pg_attribute query.

        Setup: the pg_attribute query returns a vector_dim of 1536 for the
        legacy table while setup_table is asked for embedding_dim=3072.
        Checks: at least one CREATE TABLE statement was executed and no
        INSERT INTO statement was.
        """

        async def fake_query(query, params, **kwargs):
            reply = self._catalog_reply(
                query, params, "lightrag_doc_chunks_model_3072d", 1536
            )
            return {} if reply is None else reply

        mock_db = self._mock_db(fake_query)

        # Requested dimension (3072) differs from the legacy one (1536).
        await PGVectorStorage.setup_table(
            mock_db,
            "lightrag_doc_chunks_model_3072d",
            legacy_table_name="lightrag_doc_chunks",
            base_table="lightrag_doc_chunks",
            embedding_dim=3072,
        )

        # DDL for the new table was issued.
        assert self._executed_sql(
            mock_db, "CREATE TABLE"
        ), "New table should be created"

        # No rows were copied.
        assert not self._executed_sql(
            mock_db, "INSERT INTO"
        ), "Migration should be skipped"

    @pytest.mark.asyncio
    async def test_postgres_dimension_mismatch_skip_migration_sampling(self):
        """
        Mismatched dimensions when only a sampled vector carries the size.

        Setup: the pg_attribute query returns a vector_dim of -1, a
        "SELECT content_vector FROM" query returns a 1536-element vector, and
        setup_table is asked for embedding_dim=3072.
        Checks: at least one CREATE TABLE statement was executed and no
        INSERT INTO statement was.
        """

        async def fake_query(query, params, **kwargs):
            reply = self._catalog_reply(
                query, params, "lightrag_doc_chunks_model_3072d", -1
            )
            if reply is not None:
                return reply
            if "SELECT content_vector FROM" in query:
                # Sampled row: a vector with 1536 components.
                return {"content_vector": [0.1] * 1536}
            return {}

        mock_db = self._mock_db(fake_query)

        # Requested dimension (3072) differs from the sampled one (1536).
        await PGVectorStorage.setup_table(
            mock_db,
            "lightrag_doc_chunks_model_3072d",
            legacy_table_name="lightrag_doc_chunks",
            base_table="lightrag_doc_chunks",
            embedding_dim=3072,
        )

        # DDL for the new table was issued.
        assert self._executed_sql(
            mock_db, "CREATE TABLE"
        ), "New table should be created"

        # No rows were copied.
        assert not self._executed_sql(
            mock_db, "INSERT INTO"
        ), "Migration should be skipped"

    @pytest.mark.asyncio
    async def test_postgres_dimension_match_proceed_migration(self):
        """
        Matching dimensions: rows are copied from the legacy table.

        Setup: the pg_attribute query returns a vector_dim of 1536 and
        setup_table is asked for embedding_dim=1536; a multirow
        "SELECT * FROM" query returns two rows when params[0] is 0 and an
        empty list otherwise.
        Checks: at least one INSERT INTO statement was executed.
        """

        async def fake_query(query, params, **kwargs):
            reply = self._catalog_reply(
                query, params, "lightrag_doc_chunks_model_1536d", 1536
            )
            if reply is not None:
                return reply
            if "SELECT * FROM" in query and kwargs.get("multirows", False):
                if params[0] != 0:
                    return []  # past the first batch: nothing left
                # First batch (params[0] == 0).
                return [
                    {
                        "id": "test1",
                        "content_vector": [0.1] * 1536,
                        "workspace": "test",
                    },
                    {
                        "id": "test2",
                        "content_vector": [0.2] * 1536,
                        "workspace": "test",
                    },
                ]
            return {}

        mock_db = self._mock_db(fake_query)

        # Requested dimension equals the legacy one.
        await PGVectorStorage.setup_table(
            mock_db,
            "lightrag_doc_chunks_model_1536d",
            legacy_table_name="lightrag_doc_chunks",
            base_table="lightrag_doc_chunks",
            embedding_dim=1536,
        )

        # Rows were written into the new table.
        assert self._executed_sql(
            mock_db, "INSERT INTO"
        ), "Migration should proceed with matching dimensions"
|