Fix pytest fixture scope incompatibility with pytest-asyncio. Changed fixture scope from "module" to "function" to match pytest-asyncio's default event loop scope. Issue: ScopeMismatch error when accessing function-scoped event loop fixture from module-scoped fixtures. Testing: Fixes E2E test execution in GitHub Actions
346 lines
11 KiB
Python
346 lines
11 KiB
Python
"""
|
|
E2E Tests for Qdrant Vector Storage Model Isolation
|
|
|
|
These tests use a REAL Qdrant server.
|
|
Unlike unit tests, these verify actual collection operations, data migration,
|
|
and multi-model isolation scenarios.
|
|
|
|
Prerequisites:
|
|
- Qdrant server running
|
|
- Environment variables: QDRANT_URL (defaults to http://localhost:6333) and, optionally, QDRANT_API_KEY
|
|
"""
|
|
|
|
import os
|
|
import pytest
|
|
import asyncio
|
|
import numpy as np
|
|
from lightrag.utils import EmbeddingFunc
|
|
from lightrag.kg.qdrant_impl import QdrantVectorDBStorage
|
|
from lightrag.namespace import NameSpace
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.models import Distance, VectorParams
|
|
|
|
|
|
# E2E test configuration from environment
|
|
@pytest.fixture(scope="function")
def qdrant_config():
    """Collect the connection settings for the real Qdrant server.

    Returns:
        dict: ``url`` read from ``QDRANT_URL`` (falling back to
        ``http://localhost:6333``) and ``api_key`` read from
        ``QDRANT_API_KEY`` (``None`` when the variable is unset).
    """
    server_url = os.getenv("QDRANT_URL", "http://localhost:6333")
    # os.getenv already yields None for a variable that is not set.
    api_key = os.getenv("QDRANT_API_KEY")
    return {"url": server_url, "api_key": api_key}
|
|
|
|
|
|
@pytest.fixture(scope="function")
def qdrant_client(qdrant_config):
    """Provide a client connected to the real Qdrant server.

    Args:
        qdrant_config: Mapping holding the ``url`` and ``api_key`` to use.

    Yields:
        QdrantClient: Client created with ``timeout=60``; it is closed once
        the requesting test has finished.
    """
    client = QdrantClient(
        url=qdrant_config["url"],
        api_key=qdrant_config["api_key"],
        timeout=60,
    )
    yield client
    # Release the underlying connection explicitly instead of relying on
    # garbage collection to do it at some later point.
    client.close()
|
|
|
|
|
|
@pytest.fixture
def cleanup_collections(qdrant_client):
    """Delete the known test collections before and after each test.

    The fixture is deliberately synchronous: every call it makes on the
    client is a plain blocking call, so no event loop is needed, and the
    cleanup runs regardless of how pytest-asyncio is configured.

    Args:
        qdrant_client: Client used to look up and delete the collections.

    Yields:
        None: Control is handed to the test between the two cleanup passes.
    """
    collections_to_delete = [
        "lightrag_vdb_chunks",  # legacy
        "e2e_test_chunks",  # legacy with workspace
        "lightrag_vdb_chunks_test_model_768d",
        "lightrag_vdb_chunks_text_embedding_ada_002_1536d",
        "lightrag_vdb_chunks_bge_small_768d",
        "lightrag_vdb_chunks_bge_large_1024d",
    ]

    def drop_test_collections():
        """Remove every listed collection that currently exists."""
        for collection in collections_to_delete:
            try:
                if qdrant_client.collection_exists(collection):
                    qdrant_client.delete_collection(collection)
            except Exception as exc:
                # Cleanup stays best-effort (one failure must not block the
                # remaining deletions), but the failure is reported rather
                # than silently discarded.
                print(f"⚠️ Could not clean up collection {collection}: {exc}")

    # Cleanup before test
    drop_test_collections()

    yield

    # Cleanup after test
    drop_test_collections()
|
|
|
|
|
|
@pytest.fixture
def mock_embedding_func():
    """Build a stand-in embedding function for the tests.

    Returns:
        EmbeddingFunc: Wrapper declaring 768 dimensions and the model name
        ``test_model``; its coroutine returns one row of 768 values, all
        ``0.1``, per input text.
    """
    vector_size = 768

    async def embed_func(texts, **kwargs):
        # Every text maps to the same constant vector; kwargs are ignored.
        constant_row = [0.1] * vector_size
        return np.array([constant_row for _ in texts])

    wrapped = EmbeddingFunc(
        embedding_dim=vector_size,
        func=embed_func,
        model_name="test_model",
    )
    return wrapped
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_e2e_qdrant_fresh_installation(
    qdrant_client, cleanup_collections, mock_embedding_func, qdrant_config
):
    """
    E2E Test: Fresh Qdrant installation with model_name specified

    Scenario: New workspace, no legacy collection
    Expected: Create new collection with model suffix, no migration needed
    """
    print("\n[E2E Test] Fresh Qdrant installation with model_name")

    # Create storage with model_name
    storage = QdrantVectorDBStorage(
        namespace=NameSpace.VECTOR_STORE_CHUNKS,
        global_config={
            "embedding_batch_num": 10,
            "vector_db_storage_cls_kwargs": {
                "url": qdrant_config["url"],
                "api_key": qdrant_config["api_key"],
                "cosine_better_than_threshold": 0.8,
            },
        },
        embedding_func=mock_embedding_func,
        workspace="e2e_test",
    )

    # Initialize storage (should create new collection)
    await storage.initialize()

    # From here on the storage must be finalized even when an assertion fails.
    try:
        # Verify collection name
        assert "test_model_768d" in storage.final_namespace
        expected_collection = "lightrag_vdb_chunks_test_model_768d"
        assert storage.final_namespace == expected_collection

        # Verify collection exists
        assert qdrant_client.collection_exists(
            expected_collection
        ), f"Collection {expected_collection} should exist"

        # Verify collection properties. points_count is used because
        # vectors_count is a deprecated, optional field of Qdrant's
        # collection info and may not be populated.
        collection_info = qdrant_client.get_collection(expected_collection)
        assert collection_info.points_count == 0, "New collection should be empty"
        print(f"✅ Fresh installation successful: {expected_collection} created")

        # Verify legacy collection does NOT exist
        assert not qdrant_client.collection_exists(
            "lightrag_vdb_chunks"
        ), "Legacy collection should not exist"
        assert not qdrant_client.collection_exists(
            "e2e_test_chunks"
        ), "Legacy workspace collection should not exist"
    finally:
        await storage.finalize()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_e2e_qdrant_legacy_migration(
    qdrant_client, cleanup_collections, qdrant_config
):
    """
    E2E Test: Upgrade from legacy Qdrant collection with automatic migration

    Scenario:
    1. Create legacy collection (without model suffix)
    2. Insert test data
    3. Initialize with model_name (triggers migration)
    4. Verify data migrated to new collection
    """
    from qdrant_client.models import PointStruct

    print("\n[E2E Test] Legacy Qdrant collection migration")

    # Step 1: Create legacy collection and insert data
    legacy_collection = "e2e_test_chunks"  # workspace-prefixed legacy name

    qdrant_client.create_collection(
        collection_name=legacy_collection,
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

    # Insert test data into legacy collection
    test_points = [
        PointStruct(
            id=i,
            vector=[0.1] * 1536,
            payload={
                "workspace_id": "e2e_test",
                "content": f"Legacy content {i}",
                "id": f"legacy_doc_{i}",
            },
        )
        for i in range(10)
    ]

    qdrant_client.upsert(
        collection_name=legacy_collection,
        points=test_points,
        wait=True,
    )

    # Verify legacy data exists. points_count is used because vectors_count
    # is a deprecated, optional field of Qdrant's collection info and may
    # not be populated.
    legacy_info = qdrant_client.get_collection(legacy_collection)
    legacy_count = legacy_info.points_count
    assert (
        legacy_count == 10
    ), f"Expected 10 vectors in legacy collection, got {legacy_count}"
    print(f"✅ Legacy collection created with {legacy_count} vectors")

    # Step 2: Initialize storage with model_name (triggers migration)
    async def embed_func(texts, **kwargs):
        return np.array([[0.1] * 1536 for _ in texts])

    embedding_func = EmbeddingFunc(
        embedding_dim=1536,
        func=embed_func,
        model_name="text-embedding-ada-002",
    )

    storage = QdrantVectorDBStorage(
        namespace=NameSpace.VECTOR_STORE_CHUNKS,
        global_config={
            "embedding_batch_num": 10,
            "vector_db_storage_cls_kwargs": {
                "url": qdrant_config["url"],
                "api_key": qdrant_config["api_key"],
                "cosine_better_than_threshold": 0.8,
            },
        },
        embedding_func=embedding_func,
        workspace="e2e_test",
    )

    # Initialize (should trigger migration)
    print("🔄 Starting migration...")
    await storage.initialize()
    print("✅ Migration completed")

    # From here on the storage must be finalized even when an assertion fails.
    try:
        # Step 3: Verify migration
        new_collection = storage.final_namespace
        assert "text_embedding_ada_002_1536d" in new_collection

        # Verify new collection exists and has data
        assert qdrant_client.collection_exists(
            new_collection
        ), f"New collection {new_collection} should exist"

        new_info = qdrant_client.get_collection(new_collection)
        new_count = new_info.points_count

        assert (
            new_count == legacy_count
        ), f"Expected {legacy_count} vectors in new collection, got {new_count}"
        print(
            f"✅ Data migration verified: {new_count}/{legacy_count} vectors migrated"
        )

        # Verify data content; scroll() returns (points, next_offset) and
        # only the points are needed here.
        sample_points = qdrant_client.scroll(
            collection_name=new_collection,
            limit=1,
            with_payload=True,
        )[0]

        assert len(sample_points) > 0, "Should have at least one point"
        sample = sample_points[0]
        assert "Legacy content" in sample.payload.get("content", "")
        print(f"✅ Data integrity verified: {sample.payload.get('id')}")
    finally:
        await storage.finalize()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_e2e_qdrant_multi_model_coexistence(
    qdrant_client, cleanup_collections, qdrant_config
):
    """
    E2E Test: Multiple embedding models coexisting in Qdrant

    Scenario:
    1. Create storage with model A (768d)
    2. Create storage with model B (1024d)
    3. Verify separate collections created
    4. Verify data isolation
    """
    print("\n[E2E Test] Multi-model coexistence in Qdrant")

    def make_storage(embedding_dim, model_name):
        """Build a chunk storage in workspace ``e2e_test`` for one model."""

        async def embed_func(texts, **kwargs):
            return np.array([[0.1] * embedding_dim for _ in texts])

        return QdrantVectorDBStorage(
            namespace=NameSpace.VECTOR_STORE_CHUNKS,
            global_config={
                "embedding_batch_num": 10,
                "vector_db_storage_cls_kwargs": {
                    "url": qdrant_config["url"],
                    "api_key": qdrant_config["api_key"],
                    "cosine_better_than_threshold": 0.8,
                },
            },
            embedding_func=EmbeddingFunc(
                embedding_dim=embedding_dim,
                func=embed_func,
                model_name=model_name,
            ),
            workspace="e2e_test",
        )

    # Storages that were initialized successfully; each of them is finalized
    # in the ``finally`` clause even when a later step or assertion fails.
    initialized = []
    try:
        # Model A: 768 dimensions
        storage_a = make_storage(768, "bge-small")
        await storage_a.initialize()
        initialized.append(storage_a)
        collection_a = storage_a.final_namespace
        assert "bge_small_768d" in collection_a
        print(f"✅ Model A collection created: {collection_a}")

        # Model B: 1024 dimensions
        storage_b = make_storage(1024, "bge-large")
        await storage_b.initialize()
        initialized.append(storage_b)
        collection_b = storage_b.final_namespace
        assert "bge_large_1024d" in collection_b
        print(f"✅ Model B collection created: {collection_b}")

        # Verify collections are different
        assert collection_a != collection_b, "Collections should have different names"
        print(f"✅ Collection isolation verified: {collection_a} != {collection_b}")

        # Verify both collections exist
        assert qdrant_client.collection_exists(
            collection_a
        ), f"Collection {collection_a} should exist"
        assert qdrant_client.collection_exists(
            collection_b
        ), f"Collection {collection_b} should exist"
        print("✅ Both collections exist in Qdrant")

        # Verify vector dimensions
        info_a = qdrant_client.get_collection(collection_a)
        info_b = qdrant_client.get_collection(collection_b)

        # Qdrant stores vector config in config.params.vectors
        size_a = info_a.config.params.vectors.size
        size_b = info_b.config.params.vectors.size
        assert size_a == 768, "Model A should use 768 dimensions"
        assert size_b == 1024, "Model B should use 1024 dimensions"
        print(f"✅ Vector dimensions verified: {size_a}d vs {size_b}d")
    finally:
        for storage in initialized:
            await storage.finalize()
|
|
|
|
|
|
if __name__ == "__main__":
    # Run this file's tests with pytest (verbose, output capture disabled)
    # and hand pytest's exit code back to the caller, so that running the
    # file directly reports failures through the process exit status.
    raise SystemExit(pytest.main([__file__, "-v", "-s"]))
|