feat: implement dimension compatibility checks for PostgreSQL and Qdrant migrations
This update introduces checks for vector dimension compatibility before migrating legacy data in both PostgreSQL and Qdrant storage implementations. If a dimension mismatch is detected, the migration is skipped to prevent data loss, and a new empty table or collection is created for the new embedding model. Key changes include: - Added dimension checks in `PGVectorStorage` and `QdrantVectorDBStorage` classes. - Enhanced logging to inform users about dimension mismatches and the creation of new storage. - Updated E2E tests to validate the new behavior, ensuring legacy data is preserved and new structures are created correctly. Impact: - Prevents potential data corruption during migrations with mismatched dimensions. - Improves user experience by providing clear logging and maintaining legacy data integrity. Testing: - New tests confirm that the system behaves as expected when encountering dimension mismatches.
This commit is contained in:
parent
e0767b1a47
commit
5180c1e395
5 changed files with 527 additions and 80 deletions
|
|
@ -2344,6 +2344,71 @@ class PGVectorStorage(BaseVectorStorage):
|
||||||
await db._create_vector_index(table_name, embedding_dim)
|
await db._create_vector_index(table_name, embedding_dim)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Check vector dimension compatibility before migration
|
||||||
|
legacy_dim = None
|
||||||
|
try:
|
||||||
|
# Try to get vector dimension from pg_attribute metadata
|
||||||
|
dim_query = """
|
||||||
|
SELECT
|
||||||
|
CASE
|
||||||
|
WHEN typname = 'vector' THEN
|
||||||
|
COALESCE(atttypmod, -1)
|
||||||
|
ELSE -1
|
||||||
|
END as vector_dim
|
||||||
|
FROM pg_attribute a
|
||||||
|
JOIN pg_type t ON a.atttypid = t.oid
|
||||||
|
WHERE a.attrelid = $1::regclass
|
||||||
|
AND a.attname = 'content_vector'
|
||||||
|
"""
|
||||||
|
dim_result = await db.query(dim_query, [legacy_table_name])
|
||||||
|
legacy_dim = dim_result.get("vector_dim", -1) if dim_result else -1
|
||||||
|
|
||||||
|
if legacy_dim <= 0:
|
||||||
|
# Alternative: Try to detect by sampling a vector
|
||||||
|
logger.info(
|
||||||
|
"PostgreSQL: Metadata dimension check failed, trying vector sampling..."
|
||||||
|
)
|
||||||
|
sample_query = (
|
||||||
|
f"SELECT content_vector FROM {legacy_table_name} LIMIT 1"
|
||||||
|
)
|
||||||
|
sample_result = await db.query(sample_query, [])
|
||||||
|
if sample_result and sample_result.get("content_vector"):
|
||||||
|
vector_data = sample_result["content_vector"]
|
||||||
|
# pgvector returns list directly
|
||||||
|
if isinstance(vector_data, (list, tuple)):
|
||||||
|
legacy_dim = len(vector_data)
|
||||||
|
elif isinstance(vector_data, str):
|
||||||
|
import json
|
||||||
|
|
||||||
|
vector_list = json.loads(vector_data)
|
||||||
|
legacy_dim = len(vector_list)
|
||||||
|
|
||||||
|
if legacy_dim > 0 and embedding_dim and legacy_dim != embedding_dim:
|
||||||
|
logger.warning(
|
||||||
|
f"PostgreSQL: Dimension mismatch detected! "
|
||||||
|
f"Legacy table '{legacy_table_name}' has {legacy_dim}d vectors, "
|
||||||
|
f"but new embedding model expects {embedding_dim}d. "
|
||||||
|
f"Migration skipped to prevent data loss. "
|
||||||
|
f"Legacy table preserved as '{legacy_table_name}'. "
|
||||||
|
f"Creating new empty table '{table_name}' for new data."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create new table but skip migration
|
||||||
|
await _pg_create_table(db, table_name, base_table, embedding_dim)
|
||||||
|
await db._create_vector_index(table_name, embedding_dim)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"PostgreSQL: New table '{table_name}' created. "
|
||||||
|
f"To query legacy data, please use a {legacy_dim}d embedding model."
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"PostgreSQL: Could not verify legacy table vector dimension: {e}. "
|
||||||
|
f"Proceeding with caution..."
|
||||||
|
)
|
||||||
|
|
||||||
# Create new table first
|
# Create new table first
|
||||||
logger.info(f"PostgreSQL: Creating new table '{table_name}'")
|
logger.info(f"PostgreSQL: Creating new table '{table_name}'")
|
||||||
await _pg_create_table(db, table_name, base_table, embedding_dim)
|
await _pg_create_table(db, table_name, base_table, embedding_dim)
|
||||||
|
|
|
||||||
|
|
@ -251,6 +251,51 @@ class QdrantVectorDBStorage(BaseVectorStorage):
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Check vector dimension compatibility before migration
|
||||||
|
try:
|
||||||
|
legacy_info = client.get_collection(legacy_collection)
|
||||||
|
legacy_dim = legacy_info.config.params.vectors.size
|
||||||
|
|
||||||
|
# Get expected dimension from kwargs
|
||||||
|
new_dim = (
|
||||||
|
kwargs.get("vectors_config").size
|
||||||
|
if "vectors_config" in kwargs
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
if new_dim and legacy_dim != new_dim:
|
||||||
|
logger.warning(
|
||||||
|
f"Qdrant: Dimension mismatch detected! "
|
||||||
|
f"Legacy collection '{legacy_collection}' has {legacy_dim}d vectors, "
|
||||||
|
f"but new embedding model expects {new_dim}d. "
|
||||||
|
f"Migration skipped to prevent data loss. "
|
||||||
|
f"Legacy collection preserved as '{legacy_collection}'. "
|
||||||
|
f"Creating new empty collection '{collection_name}' for new data."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create new collection but skip migration
|
||||||
|
client.create_collection(collection_name, **kwargs)
|
||||||
|
client.create_payload_index(
|
||||||
|
collection_name=collection_name,
|
||||||
|
field_name=WORKSPACE_ID_FIELD,
|
||||||
|
field_schema=models.KeywordIndexParams(
|
||||||
|
type=models.KeywordIndexType.KEYWORD,
|
||||||
|
is_tenant=True,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Qdrant: New collection '{collection_name}' created. "
|
||||||
|
f"To query legacy data, please use a {legacy_dim}d embedding model."
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Qdrant: Could not verify legacy collection dimension: {e}. "
|
||||||
|
f"Proceeding with caution..."
|
||||||
|
)
|
||||||
|
|
||||||
# Create new collection first
|
# Create new collection first
|
||||||
logger.info(f"Qdrant: Creating new collection '{collection_name}'")
|
logger.info(f"Qdrant: Creating new collection '{collection_name}'")
|
||||||
client.create_collection(collection_name, **kwargs)
|
client.create_collection(collection_name, **kwargs)
|
||||||
|
|
|
||||||
290
tests/test_dimension_mismatch.py
Normal file
290
tests/test_dimension_mismatch.py
Normal file
|
|
@ -0,0 +1,290 @@
|
||||||
|
"""
|
||||||
|
Tests for dimension mismatch handling during migration.
|
||||||
|
|
||||||
|
This test module verifies that both PostgreSQL and Qdrant storage backends
|
||||||
|
properly detect and handle vector dimension mismatches when migrating from
|
||||||
|
legacy collections/tables to new ones with different embedding models.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import MagicMock, AsyncMock, patch
|
||||||
|
|
||||||
|
from lightrag.kg.qdrant_impl import QdrantVectorDBStorage
|
||||||
|
from lightrag.kg.postgres_impl import PGVectorStorage
|
||||||
|
|
||||||
|
|
||||||
|
class TestQdrantDimensionMismatch:
|
||||||
|
"""Test suite for Qdrant dimension mismatch handling."""
|
||||||
|
|
||||||
|
def test_qdrant_dimension_mismatch_skip_migration(self):
|
||||||
|
"""
|
||||||
|
Test that Qdrant skips migration when dimensions don't match.
|
||||||
|
|
||||||
|
Scenario: Legacy collection has 1536d vectors, new model expects 3072d.
|
||||||
|
Expected: Migration skipped, new empty collection created, legacy preserved.
|
||||||
|
"""
|
||||||
|
from qdrant_client import models
|
||||||
|
|
||||||
|
# Setup mock client
|
||||||
|
client = MagicMock()
|
||||||
|
|
||||||
|
# Mock legacy collection with 1536d vectors
|
||||||
|
legacy_collection_info = MagicMock()
|
||||||
|
legacy_collection_info.config.params.vectors.size = 1536
|
||||||
|
|
||||||
|
# Setup collection existence checks
|
||||||
|
def collection_exists_side_effect(name):
|
||||||
|
if name == "lightrag_chunks": # legacy
|
||||||
|
return True
|
||||||
|
elif name == "lightrag_chunks_model_3072d": # new
|
||||||
|
return False
|
||||||
|
return False
|
||||||
|
|
||||||
|
client.collection_exists.side_effect = collection_exists_side_effect
|
||||||
|
client.get_collection.return_value = legacy_collection_info
|
||||||
|
client.count.return_value.count = 100 # Legacy has data
|
||||||
|
|
||||||
|
# Call setup_collection with 3072d (different from legacy 1536d)
|
||||||
|
QdrantVectorDBStorage.setup_collection(
|
||||||
|
client,
|
||||||
|
"lightrag_chunks_model_3072d",
|
||||||
|
namespace="chunks",
|
||||||
|
workspace="test",
|
||||||
|
vectors_config=models.VectorParams(
|
||||||
|
size=3072, distance=models.Distance.COSINE
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify new collection was created
|
||||||
|
client.create_collection.assert_called_once()
|
||||||
|
|
||||||
|
# Verify migration was NOT attempted (no scroll/upsert calls)
|
||||||
|
client.scroll.assert_not_called()
|
||||||
|
client.upsert.assert_not_called()
|
||||||
|
|
||||||
|
def test_qdrant_dimension_match_proceed_migration(self):
|
||||||
|
"""
|
||||||
|
Test that Qdrant proceeds with migration when dimensions match.
|
||||||
|
|
||||||
|
Scenario: Legacy collection has 1536d vectors, new model also expects 1536d.
|
||||||
|
Expected: Migration proceeds normally.
|
||||||
|
"""
|
||||||
|
from qdrant_client import models
|
||||||
|
|
||||||
|
client = MagicMock()
|
||||||
|
|
||||||
|
# Mock legacy collection with 1536d vectors (matching new)
|
||||||
|
legacy_collection_info = MagicMock()
|
||||||
|
legacy_collection_info.config.params.vectors.size = 1536
|
||||||
|
|
||||||
|
def collection_exists_side_effect(name):
|
||||||
|
if name == "lightrag_chunks": # legacy
|
||||||
|
return True
|
||||||
|
elif name == "lightrag_chunks_model_1536d": # new
|
||||||
|
return False
|
||||||
|
return False
|
||||||
|
|
||||||
|
client.collection_exists.side_effect = collection_exists_side_effect
|
||||||
|
client.get_collection.return_value = legacy_collection_info
|
||||||
|
client.count.return_value.count = 100 # Legacy has data
|
||||||
|
|
||||||
|
# Mock scroll to return sample data
|
||||||
|
sample_point = MagicMock()
|
||||||
|
sample_point.id = "test_id"
|
||||||
|
sample_point.vector = [0.1] * 1536
|
||||||
|
sample_point.payload = {"id": "test"}
|
||||||
|
client.scroll.return_value = ([sample_point], None)
|
||||||
|
|
||||||
|
# Call setup_collection with matching 1536d
|
||||||
|
QdrantVectorDBStorage.setup_collection(
|
||||||
|
client,
|
||||||
|
"lightrag_chunks_model_1536d",
|
||||||
|
namespace="chunks",
|
||||||
|
workspace="test",
|
||||||
|
vectors_config=models.VectorParams(
|
||||||
|
size=1536, distance=models.Distance.COSINE
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify migration WAS attempted
|
||||||
|
client.create_collection.assert_called_once()
|
||||||
|
client.scroll.assert_called()
|
||||||
|
client.upsert.assert_called()
|
||||||
|
|
||||||
|
|
||||||
|
class TestPostgresDimensionMismatch:
|
||||||
|
"""Test suite for PostgreSQL dimension mismatch handling."""
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_postgres_dimension_mismatch_skip_migration_metadata(self):
|
||||||
|
"""
|
||||||
|
Test that PostgreSQL skips migration when dimensions don't match (via metadata).
|
||||||
|
|
||||||
|
Scenario: Legacy table has 1536d vectors (detected via pg_attribute),
|
||||||
|
new model expects 3072d.
|
||||||
|
Expected: Migration skipped, new empty table created, legacy preserved.
|
||||||
|
"""
|
||||||
|
# Setup mock database
|
||||||
|
db = AsyncMock()
|
||||||
|
|
||||||
|
# Mock table existence and dimension checks
|
||||||
|
async def query_side_effect(query, params, **kwargs):
|
||||||
|
if "information_schema.tables" in query:
|
||||||
|
if params[0] == "lightrag_doc_chunks": # legacy
|
||||||
|
return {"exists": True}
|
||||||
|
elif params[0] == "lightrag_doc_chunks_model_3072d": # new
|
||||||
|
return {"exists": False}
|
||||||
|
elif "COUNT(*)" in query:
|
||||||
|
return {"count": 100} # Legacy has data
|
||||||
|
elif "pg_attribute" in query:
|
||||||
|
return {"vector_dim": 1536} # Legacy has 1536d vectors
|
||||||
|
return {}
|
||||||
|
|
||||||
|
db.query.side_effect = query_side_effect
|
||||||
|
db.execute = AsyncMock()
|
||||||
|
db._create_vector_index = AsyncMock()
|
||||||
|
|
||||||
|
# Call setup_table with 3072d (different from legacy 1536d)
|
||||||
|
await PGVectorStorage.setup_table(
|
||||||
|
db,
|
||||||
|
"lightrag_doc_chunks_model_3072d",
|
||||||
|
legacy_table_name="lightrag_doc_chunks",
|
||||||
|
base_table="lightrag_doc_chunks",
|
||||||
|
embedding_dim=3072,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify new table was created (DDL executed)
|
||||||
|
create_table_calls = [
|
||||||
|
call
|
||||||
|
for call in db.execute.call_args_list
|
||||||
|
if call[0][0] and "CREATE TABLE" in call[0][0]
|
||||||
|
]
|
||||||
|
assert len(create_table_calls) > 0, "New table should be created"
|
||||||
|
|
||||||
|
# Verify migration was NOT attempted (no INSERT calls)
|
||||||
|
insert_calls = [
|
||||||
|
call
|
||||||
|
for call in db.execute.call_args_list
|
||||||
|
if call[0][0] and "INSERT INTO" in call[0][0]
|
||||||
|
]
|
||||||
|
assert len(insert_calls) == 0, "Migration should be skipped"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_postgres_dimension_mismatch_skip_migration_sampling(self):
|
||||||
|
"""
|
||||||
|
Test that PostgreSQL skips migration when dimensions don't match (via sampling).
|
||||||
|
|
||||||
|
Scenario: Legacy table dimension detection fails via metadata,
|
||||||
|
falls back to vector sampling, detects 1536d vs expected 3072d.
|
||||||
|
Expected: Migration skipped, new empty table created, legacy preserved.
|
||||||
|
"""
|
||||||
|
db = AsyncMock()
|
||||||
|
|
||||||
|
# Mock table existence and dimension checks
|
||||||
|
async def query_side_effect(query, params, **kwargs):
|
||||||
|
if "information_schema.tables" in query:
|
||||||
|
if params[0] == "lightrag_doc_chunks": # legacy
|
||||||
|
return {"exists": True}
|
||||||
|
elif params[0] == "lightrag_doc_chunks_model_3072d": # new
|
||||||
|
return {"exists": False}
|
||||||
|
elif "COUNT(*)" in query:
|
||||||
|
return {"count": 100} # Legacy has data
|
||||||
|
elif "pg_attribute" in query:
|
||||||
|
return {"vector_dim": -1} # Metadata check fails
|
||||||
|
elif "SELECT content_vector FROM" in query:
|
||||||
|
# Return sample vector with 1536 dimensions
|
||||||
|
return {"content_vector": [0.1] * 1536}
|
||||||
|
return {}
|
||||||
|
|
||||||
|
db.query.side_effect = query_side_effect
|
||||||
|
db.execute = AsyncMock()
|
||||||
|
db._create_vector_index = AsyncMock()
|
||||||
|
|
||||||
|
# Call setup_table with 3072d (different from legacy 1536d)
|
||||||
|
await PGVectorStorage.setup_table(
|
||||||
|
db,
|
||||||
|
"lightrag_doc_chunks_model_3072d",
|
||||||
|
legacy_table_name="lightrag_doc_chunks",
|
||||||
|
base_table="lightrag_doc_chunks",
|
||||||
|
embedding_dim=3072,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify new table was created
|
||||||
|
create_table_calls = [
|
||||||
|
call
|
||||||
|
for call in db.execute.call_args_list
|
||||||
|
if call[0][0] and "CREATE TABLE" in call[0][0]
|
||||||
|
]
|
||||||
|
assert len(create_table_calls) > 0, "New table should be created"
|
||||||
|
|
||||||
|
# Verify migration was NOT attempted
|
||||||
|
insert_calls = [
|
||||||
|
call
|
||||||
|
for call in db.execute.call_args_list
|
||||||
|
if call[0][0] and "INSERT INTO" in call[0][0]
|
||||||
|
]
|
||||||
|
assert len(insert_calls) == 0, "Migration should be skipped"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_postgres_dimension_match_proceed_migration(self):
|
||||||
|
"""
|
||||||
|
Test that PostgreSQL proceeds with migration when dimensions match.
|
||||||
|
|
||||||
|
Scenario: Legacy table has 1536d vectors, new model also expects 1536d.
|
||||||
|
Expected: Migration proceeds normally.
|
||||||
|
"""
|
||||||
|
db = AsyncMock()
|
||||||
|
|
||||||
|
async def query_side_effect(query, params, **kwargs):
|
||||||
|
multirows = kwargs.get("multirows", False)
|
||||||
|
|
||||||
|
if "information_schema.tables" in query:
|
||||||
|
if params[0] == "lightrag_doc_chunks": # legacy
|
||||||
|
return {"exists": True}
|
||||||
|
elif params[0] == "lightrag_doc_chunks_model_1536d": # new
|
||||||
|
return {"exists": False}
|
||||||
|
elif "COUNT(*)" in query:
|
||||||
|
return {"count": 100} # Legacy has data
|
||||||
|
elif "pg_attribute" in query:
|
||||||
|
return {"vector_dim": 1536} # Legacy has matching 1536d
|
||||||
|
elif "SELECT * FROM" in query and multirows:
|
||||||
|
# Return sample data for migration (first batch)
|
||||||
|
if params[0] == 0: # offset = 0
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"id": "test1",
|
||||||
|
"content_vector": [0.1] * 1536,
|
||||||
|
"workspace": "test",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "test2",
|
||||||
|
"content_vector": [0.2] * 1536,
|
||||||
|
"workspace": "test",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
else: # offset > 0
|
||||||
|
return [] # No more data
|
||||||
|
return {}
|
||||||
|
|
||||||
|
db.query.side_effect = query_side_effect
|
||||||
|
db.execute = AsyncMock()
|
||||||
|
db._create_vector_index = AsyncMock()
|
||||||
|
|
||||||
|
# Call setup_table with matching 1536d
|
||||||
|
await PGVectorStorage.setup_table(
|
||||||
|
db,
|
||||||
|
"lightrag_doc_chunks_model_1536d",
|
||||||
|
legacy_table_name="lightrag_doc_chunks",
|
||||||
|
base_table="lightrag_doc_chunks",
|
||||||
|
embedding_dim=1536,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify migration WAS attempted (INSERT calls made)
|
||||||
|
insert_calls = [
|
||||||
|
call
|
||||||
|
for call in db.execute.call_args_list
|
||||||
|
if call[0][0] and "INSERT INTO" in call[0][0]
|
||||||
|
]
|
||||||
|
assert (
|
||||||
|
len(insert_calls) > 0
|
||||||
|
), "Migration should proceed with matching dimensions"
|
||||||
|
|
@ -1169,56 +1169,73 @@ async def test_dimension_mismatch_postgres(
|
||||||
|
|
||||||
print("📦 Initializing LightRAG with new model (3072d)...")
|
print("📦 Initializing LightRAG with new model (3072d)...")
|
||||||
|
|
||||||
# This should handle dimension mismatch gracefully
|
# With our fix, this should handle dimension mismatch gracefully:
|
||||||
# Either: 1) Create new table for new model, or 2) Raise clear error
|
# Expected behavior:
|
||||||
try:
|
# 1. Detect dimension mismatch (1536d legacy vs 3072d new)
|
||||||
rag = LightRAG(
|
# 2. Skip migration to prevent data corruption
|
||||||
working_dir=temp_dir,
|
# 3. Preserve legacy table with original data
|
||||||
llm_model_func=mock_llm_func,
|
# 4. Create new empty table for 3072d model
|
||||||
embedding_func=embedding_func_new,
|
# 5. System initializes successfully
|
||||||
tokenizer=mock_tokenizer,
|
|
||||||
kv_storage="PGKVStorage",
|
|
||||||
vector_storage="PGVectorStorage",
|
|
||||||
doc_status_storage="PGDocStatusStorage",
|
|
||||||
vector_db_storage_cls_kwargs={
|
|
||||||
**pg_config,
|
|
||||||
"cosine_better_than_threshold": 0.8,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
await rag.initialize_storages()
|
rag = LightRAG(
|
||||||
|
working_dir=temp_dir,
|
||||||
|
llm_model_func=mock_llm_func,
|
||||||
|
embedding_func=embedding_func_new,
|
||||||
|
tokenizer=mock_tokenizer,
|
||||||
|
kv_storage="PGKVStorage",
|
||||||
|
vector_storage="PGVectorStorage",
|
||||||
|
doc_status_storage="PGDocStatusStorage",
|
||||||
|
vector_db_storage_cls_kwargs={
|
||||||
|
**pg_config,
|
||||||
|
"cosine_better_than_threshold": 0.8,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
# Check what happened
|
await rag.initialize_storages()
|
||||||
new_table = rag.chunks_vdb.table_name
|
|
||||||
print(f"✅ Initialization succeeded, new table: {new_table}")
|
|
||||||
|
|
||||||
# Verify new table has correct dimension (3072d)
|
# Verify expected behavior
|
||||||
# Check if both tables exist
|
new_table = rag.chunks_vdb.table_name
|
||||||
check_legacy = f"SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = '{legacy_table}')"
|
print(f"✅ Initialization succeeded, new table: {new_table}")
|
||||||
check_new = f"SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = '{new_table.lower()}')"
|
|
||||||
|
|
||||||
legacy_exists = await pg_cleanup.query(check_legacy, [])
|
# 1. New table should exist and be created with model suffix
|
||||||
new_exists = await pg_cleanup.query(check_new, [])
|
assert "text_embedding_3_large_3072d" in new_table.lower()
|
||||||
|
check_new = f"SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = '{new_table.lower()}')"
|
||||||
|
new_exists = await pg_cleanup.query(check_new, [])
|
||||||
|
assert new_exists.get("exists") is True, "New table should exist"
|
||||||
|
print(f"✅ New table created: {new_table}")
|
||||||
|
|
||||||
print(f"✅ Legacy table exists: {legacy_exists.get('exists')}")
|
# 2. Legacy table should be preserved (not deleted)
|
||||||
print(f"✅ New table exists: {new_exists.get('exists')}")
|
check_legacy = f"SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = '{legacy_table}')"
|
||||||
|
legacy_exists = await pg_cleanup.query(check_legacy, [])
|
||||||
|
assert (
|
||||||
|
legacy_exists.get("exists") is True
|
||||||
|
), "Legacy table should be preserved when dimensions don't match"
|
||||||
|
print(f"✅ Legacy table preserved: {legacy_table}")
|
||||||
|
|
||||||
# Test should verify proper handling:
|
# 3. Legacy table should still have original data (not migrated)
|
||||||
# - New table created with 3072d
|
legacy_count_result = await pg_cleanup.query(
|
||||||
# - Legacy table preserved (or migrated to dimension-matched table)
|
f"SELECT COUNT(*) as count FROM {legacy_table}", []
|
||||||
# - System is operational
|
)
|
||||||
|
legacy_count = legacy_count_result.get("count", 0)
|
||||||
|
assert (
|
||||||
|
legacy_count == 3
|
||||||
|
), f"Legacy table should still have 3 records, got {legacy_count}"
|
||||||
|
print(f"✅ Legacy data preserved: {legacy_count} records")
|
||||||
|
|
||||||
await rag.finalize_storages()
|
# 4. New table should be empty (migration skipped)
|
||||||
|
new_count_result = await pg_cleanup.query(
|
||||||
|
f"SELECT COUNT(*) as count FROM {new_table}", []
|
||||||
|
)
|
||||||
|
new_count = new_count_result.get("count", 0)
|
||||||
|
assert (
|
||||||
|
new_count == 0
|
||||||
|
), f"New table should be empty (migration skipped), got {new_count}"
|
||||||
|
print(f"✅ New table is empty (migration correctly skipped): {new_count} records")
|
||||||
|
|
||||||
except Exception as e:
|
# 5. System should be operational
|
||||||
# If it raises an error, it should be a clear, actionable error
|
print("✅ System initialized successfully despite dimension mismatch")
|
||||||
print(f"⚠️ Initialization raised exception: {e}")
|
|
||||||
# Verify error message is clear and actionable
|
await rag.finalize_storages()
|
||||||
assert any(
|
|
||||||
keyword in str(e).lower()
|
|
||||||
for keyword in ["dimension", "mismatch", "1536", "3072"]
|
|
||||||
), f"Error message should mention dimension mismatch: {e}"
|
|
||||||
print("✅ Clear error message provided to user")
|
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
@ -1293,50 +1310,72 @@ async def test_dimension_mismatch_qdrant(
|
||||||
|
|
||||||
print("📦 Initializing LightRAG with new model (1024d)...")
|
print("📦 Initializing LightRAG with new model (1024d)...")
|
||||||
|
|
||||||
# This should handle dimension mismatch gracefully
|
# With our fix, this should handle dimension mismatch gracefully:
|
||||||
try:
|
# Expected behavior:
|
||||||
rag = LightRAG(
|
# 1. Detect dimension mismatch (768d legacy vs 1024d new)
|
||||||
working_dir=temp_dir,
|
# 2. Skip migration to prevent data corruption
|
||||||
llm_model_func=mock_llm_func,
|
# 3. Preserve legacy collection with original data
|
||||||
embedding_func=embedding_func_new,
|
# 4. Create new empty collection for 1024d model
|
||||||
tokenizer=mock_tokenizer,
|
# 5. System initializes successfully
|
||||||
vector_storage="QdrantVectorDBStorage",
|
|
||||||
vector_db_storage_cls_kwargs={
|
|
||||||
**qdrant_config,
|
|
||||||
"cosine_better_than_threshold": 0.8,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
await rag.initialize_storages()
|
rag = LightRAG(
|
||||||
|
working_dir=temp_dir,
|
||||||
|
llm_model_func=mock_llm_func,
|
||||||
|
embedding_func=embedding_func_new,
|
||||||
|
tokenizer=mock_tokenizer,
|
||||||
|
vector_storage="QdrantVectorDBStorage",
|
||||||
|
vector_db_storage_cls_kwargs={
|
||||||
|
**qdrant_config,
|
||||||
|
"cosine_better_than_threshold": 0.8,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
# Check what happened
|
await rag.initialize_storages()
|
||||||
new_collection = rag.chunks_vdb.final_namespace
|
|
||||||
print(f"✅ Initialization succeeded, new collection: {new_collection}")
|
|
||||||
|
|
||||||
# Verify collections
|
# Verify expected behavior
|
||||||
legacy_exists = client.collection_exists(legacy_collection)
|
new_collection = rag.chunks_vdb.final_namespace
|
||||||
new_exists = client.collection_exists(new_collection)
|
print(f"✅ Initialization succeeded, new collection: {new_collection}")
|
||||||
|
|
||||||
print(f"✅ Legacy collection exists: {legacy_exists}")
|
# 1. New collection should exist with model suffix
|
||||||
print(f"✅ New collection exists: {new_exists}")
|
assert "bge_large_1024d" in new_collection
|
||||||
|
assert client.collection_exists(
|
||||||
|
new_collection
|
||||||
|
), f"New collection {new_collection} should exist"
|
||||||
|
print(f"✅ New collection created: {new_collection}")
|
||||||
|
|
||||||
# Verify new collection has correct dimension
|
# 2. Legacy collection should be preserved (not deleted)
|
||||||
collection_info = client.get_collection(new_collection)
|
legacy_exists = client.collection_exists(legacy_collection)
|
||||||
new_dim = collection_info.config.params.vectors.size
|
assert (
|
||||||
print(f"✅ New collection dimension: {new_dim}d")
|
legacy_exists
|
||||||
assert new_dim == 1024, f"New collection should have 1024d, got {new_dim}d"
|
), "Legacy collection should be preserved when dimensions don't match"
|
||||||
|
print(f"✅ Legacy collection preserved: {legacy_collection}")
|
||||||
|
|
||||||
await rag.finalize_storages()
|
# 3. Legacy collection should still have original data (not migrated)
|
||||||
|
legacy_count = client.count(legacy_collection).count
|
||||||
|
assert (
|
||||||
|
legacy_count == 3
|
||||||
|
), f"Legacy collection should still have 3 vectors, got {legacy_count}"
|
||||||
|
print(f"✅ Legacy data preserved: {legacy_count} vectors")
|
||||||
|
|
||||||
except Exception as e:
|
# 4. New collection should be empty (migration skipped)
|
||||||
# If it raises an error, it should be a clear, actionable error
|
new_count = client.count(new_collection).count
|
||||||
print(f"⚠️ Initialization raised exception: {e}")
|
assert (
|
||||||
# Verify error message is clear and actionable
|
new_count == 0
|
||||||
assert any(
|
), f"New collection should be empty (migration skipped), got {new_count}"
|
||||||
keyword in str(e).lower()
|
print(
|
||||||
for keyword in ["dimension", "mismatch", "768", "1024"]
|
f"✅ New collection is empty (migration correctly skipped): {new_count} vectors"
|
||||||
), f"Error message should mention dimension mismatch: {e}"
|
)
|
||||||
print("✅ Clear error message provided to user")
|
|
||||||
|
# 5. Verify new collection has correct dimension
|
||||||
|
collection_info = client.get_collection(new_collection)
|
||||||
|
new_dim = collection_info.config.params.vectors.size
|
||||||
|
assert new_dim == 1024, f"New collection should have 1024d, got {new_dim}d"
|
||||||
|
print(f"✅ New collection dimension verified: {new_dim}d")
|
||||||
|
|
||||||
|
# 6. System should be operational
|
||||||
|
print("✅ System initialized successfully despite dimension mismatch")
|
||||||
|
|
||||||
|
await rag.finalize_storages()
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
|
||||||
|
|
@ -12,9 +12,11 @@ def mock_qdrant_client():
|
||||||
client = mock_client_cls.return_value
|
client = mock_client_cls.return_value
|
||||||
client.collection_exists.return_value = False
|
client.collection_exists.return_value = False
|
||||||
client.count.return_value.count = 0
|
client.count.return_value.count = 0
|
||||||
# Mock payload schema for get_collection
|
# Mock payload schema and vector config for get_collection
|
||||||
collection_info = MagicMock()
|
collection_info = MagicMock()
|
||||||
collection_info.payload_schema = {}
|
collection_info.payload_schema = {}
|
||||||
|
# Mock vector dimension to match mock_embedding_func (768d)
|
||||||
|
collection_info.config.params.vectors.size = 768
|
||||||
client.get_collection.return_value = collection_info
|
client.get_collection.return_value = collection_info
|
||||||
yield client
|
yield client
|
||||||
|
|
||||||
|
|
@ -254,6 +256,12 @@ async def test_scenario_2_legacy_upgrade_migration(
|
||||||
lambda name: name == legacy_collection
|
lambda name: name == legacy_collection
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Mock legacy collection info with 1536d vectors
|
||||||
|
legacy_collection_info = MagicMock()
|
||||||
|
legacy_collection_info.payload_schema = {}
|
||||||
|
legacy_collection_info.config.params.vectors.size = 1536
|
||||||
|
mock_qdrant_client.get_collection.return_value = legacy_collection_info
|
||||||
|
|
||||||
# Mock legacy data
|
# Mock legacy data
|
||||||
mock_qdrant_client.count.return_value.count = 150
|
mock_qdrant_client.count.return_value.count = 150
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue