diff --git a/MIGRATION_ANALYSIS.md b/MIGRATION_ANALYSIS.md new file mode 100644 index 00000000..f7af17ab --- /dev/null +++ b/MIGRATION_ANALYSIS.md @@ -0,0 +1,376 @@ +# Vector Model Isolation - 迁移场景覆盖分析 + +## 执行日期 +2025-11-20 + +## 关键发现 + +### ⚠️ 严重问题:Qdrant Legacy 命名不兼容 + +#### 问题描述 + +**旧版本(main分支)的Qdrant命名**: +```python +# Get legacy namespace for data migration from old version +if effective_workspace: + self.legacy_namespace = f"{effective_workspace}_{self.namespace}" +else: + self.legacy_namespace = self.namespace + +self.final_namespace = f"lightrag_vdb_{self.namespace}" +``` + +示例: +- workspace="my_workspace", namespace="chunks" +- legacy_namespace = "my_workspace_chunks" +- final_namespace = "lightrag_vdb_chunks" + +**新版本(feature分支)的Qdrant命名**: +```python +# Legacy collection name (without model suffix, for migration) +self.legacy_namespace = f"lightrag_vdb_{self.namespace}" + +# New naming scheme with model isolation +self.final_namespace = f"lightrag_vdb_{self.namespace}_{model_suffix}" +``` + +示例: +- workspace="my_workspace", namespace="chunks" +- legacy_namespace = "lightrag_vdb_chunks" ❌ 与旧版不匹配! +- final_namespace = "lightrag_vdb_chunks_text_embedding_ada_002_1536d" + +#### 影响分析 + +1. **从旧版本升级时的迁移失败**: + - 旧版本用户的collection名称可能是:`my_workspace_chunks` 或 `chunks` + - 新版本尝试从 `lightrag_vdb_chunks` 迁移 + - 结果:找不到legacy collection,无法自动迁移! + +2. **数据丢失风险**: + - 用户升级后可能看不到旧数据 + - 需要手动迁移数据 + +### ✅ PostgreSQL 迁移逻辑正确 + +PostgreSQL的迁移逻辑比较清晰: + +**旧版本**: +- 表名直接使用 `lightrag_vdb_chunks` 等固定名称 + +**新版本**: +- legacy_table_name = `lightrag_vdb_chunks` +- table_name = `lightrag_vdb_chunks_{model}_{dim}d` + +这个逻辑是正确的,因为旧版PostgreSQL就是使用固定表名。 + +--- + +## 测试覆盖情况分析 + +### 当前E2E测试覆盖的场景 + +| 测试名称 | 数据库 | 测试场景 | 覆盖Case | +|---------|--------|---------|---------| +| `test_legacy_migration_postgres` | PostgreSQL | 从legacy表迁移 | Case 4: Legacy→New | +| `test_legacy_migration_qdrant` | Qdrant | 从legacy collection迁移 | Case 4: Legacy→New | +| `test_multi_instance_postgres` | PostgreSQL | 多模型共存 | Case 3: 创建新表 | +| `test_multi_instance_qdrant` | Qdrant | 多模型共存 | Case 3: 创建新collection | + +### 缺失的测试场景 + +#### 未覆盖的Case + +1. ❌ **Case 1: 新旧共存警告** + - 场景:legacy和new都存在 + - 预期:只输出警告,不迁移 + - 状态:未测试 + +2. ❌ **Case 2: 已迁移场景** + - 场景:只有new存在,legacy已删除 + - 预期:检查索引,正常使用 + - 状态:未测试 + +3. ❌ **从真实旧版本升级** + - 场景:用户从LightRAG旧版本升级 + - Qdrant: legacy名称是 `{workspace}_{namespace}` 或 `{namespace}` + - 预期:能正确识别并迁移 + - 状态:**未覆盖,存在兼容性问题!** + +#### 未覆盖的边界情况 + +1. ❌ **空数据迁移** + - 场景:legacy存在但为空 + - 预期:跳过迁移,创建新表/collection + - 状态:代码有逻辑,但未测试 + +2. ❌ **迁移失败回滚** + - 场景:迁移过程中断 + - 预期:抛出异常,数据一致性保证 + - 状态:未测试 + +3. ❌ **Workspace隔离验证** + - 场景:同一collection/table内多个workspace + - 预期:数据完全隔离 + - 状态:未明确测试 + +4. ❌ **模型切换场景** + - 场景:用户切换embedding模型 + - 预期:创建新表/collection,旧数据保留 + - 状态:未测试 + +--- + +## 向后兼容性分析 + +### ✅ PostgreSQL - 完全兼容 + +- 旧版本表名:`lightrag_vdb_chunks` +- 新版本识别:`legacy_table_name = "lightrag_vdb_chunks"` +- 结论:**完全兼容** + +### ❌ Qdrant - 不兼容! + +#### 兼容性问题详情 + +**场景1:使用workspace的旧版用户** +```python +# 旧版本 (main) +workspace = "prod" +legacy_namespace = "prod_chunks" # 旧版生成的名称 +final_namespace = "lightrag_vdb_chunks" + +# 新版本 (feature) +legacy_namespace = "lightrag_vdb_chunks" # 新版期望的legacy名称 +final_namespace = "lightrag_vdb_chunks_text_embedding_ada_002_1536d" + +# 结果:找不到 "prod_chunks" collection,迁移失败! +``` + +**场景2:不使用workspace的旧版用户** +```python +# 旧版本 (main) +workspace = None +legacy_namespace = "chunks" # 旧版生成的名称 +final_namespace = "lightrag_vdb_chunks" + +# 新版本 (feature) +legacy_namespace = "lightrag_vdb_chunks" # 新版期望的legacy名称 +final_namespace = "lightrag_vdb_chunks_text_embedding_ada_002_1536d" + +# 结果:找不到 "chunks" collection,迁移失败! +``` + +#### 影响范围 + +1. **所有使用workspace的Qdrant用户** - 升级后数据无法访问 +2. **所有不使用workspace的Qdrant用户** - 升级后数据无法访问 +3. **仅有旧版本使用 `lightrag_vdb_{namespace}` 作为collection名的用户不受影响** + +--- + +## 代码风格一致性检查 + +### ✅ 整体代码风格 + +1. **迁移逻辑模式统一**: + - PostgreSQL和Qdrant使用相同的4-Case逻辑 + - 两者都有 `setup_table/setup_collection` 静态方法 + - ✅ 一致性良好 + +2. **命名规范**: + - 都使用 `legacy_*` 和 `final_*` / `table_name` 命名 + - 都使用 `model_suffix` 生成逻辑 + - ✅ 一致性良好 + +3. **日志格式**: + - 都使用相同的日志格式和级别 + - 都输出清晰的迁移进度 + - ✅ 一致性良好 + +4. **错误处理**: + - 都定义了专门的迁移异常类 + - 都有迁移验证逻辑 + - ✅ 一致性良好 + +5. **批处理大小**: + - PostgreSQL: 500条/批 + - Qdrant: 500条/批 + - ✅ 一致性良好 + +### ⚠️ 需要改进的地方 + +1. **注释风格不统一**: + - 部分使用中文注释 + - 部分使用英文注释 + - 建议:统一为英文 + +2. **测试命名**: + - 部分测试有中文docstring + - 建议:保持中英双语 + +--- + +## 建议修复方案 + +### 1. 修复Qdrant兼容性问题 + +#### 方案A:支持多种legacy命名模式(推荐) + +```python +async def _find_legacy_collection( + client: QdrantClient, + workspace: str, + namespace: str +) -> str | None: + """ + Try to find legacy collection with various naming patterns + for backward compatibility. + + Returns: + Collection name if found, None otherwise + """ + # Pattern 1: New legacy format (from previous feature branch) + candidate1 = f"lightrag_vdb_{namespace}" + + # Pattern 2: Old format with workspace + candidate2 = f"{workspace}_{namespace}" if workspace else None + + # Pattern 3: Old format without workspace + candidate3 = namespace + + # Try each pattern + for candidate in [candidate1, candidate2, candidate3]: + if candidate and client.collection_exists(candidate): + logger.info(f"Found legacy collection: {candidate}") + return candidate + + return None +``` + +然后在`setup_collection`中使用: + +```python +# Find legacy collection with backward compatibility +legacy_collection = await _find_legacy_collection( + client, workspace, namespace +) + +legacy_exists = legacy_collection is not None + +# Case 4: Only legacy exists - Migrate data +if legacy_exists and not new_collection_exists: + logger.info( + f"Qdrant: Migrating data from legacy collection '{legacy_collection}'" + ) + # ... 迁移逻辑使用 legacy_collection +``` + +#### 方案B:文档化手动迁移步骤 + +如果不想支持自动识别,至少要提供清晰的手动迁移文档。 + +### 2. 补充缺失的测试 + +#### 高优先级测试 + +```python +@pytest.mark.asyncio +async def test_qdrant_legacy_workspace_migration(): + """Test migration from old workspace-based naming""" + # 创建旧格式collection: "workspace_chunks" + # 验证新代码能识别并迁移 + pass + +@pytest.mark.asyncio +async def test_case1_both_exist_warning(): + """Test Case 1: Both legacy and new exist""" + # 验证只输出警告,不迁移 + pass + +@pytest.mark.asyncio +async def test_case2_only_new_exists(): + """Test Case 2: Only new table/collection exists""" + # 验证跳过迁移,检查索引 + pass + +@pytest.mark.asyncio +async def test_empty_legacy_migration(): + """Test migration when legacy is empty""" + # 验证跳过数据迁移,只创建新表/collection + pass + +@pytest.mark.asyncio +async def test_workspace_isolation(): + """Test workspace isolation within same collection/table""" + # 验证不同workspace的数据完全隔离 + pass +``` + +#### 中等优先级测试 + +```python +@pytest.mark.asyncio +async def test_model_switch_scenario(): + """Test switching embedding models""" + # 验证切换模型后创建新表/collection + pass + +@pytest.mark.asyncio +async def test_migration_failure_handling(): + """Test migration error handling""" + # 验证迁移失败时的异常处理 + pass +``` + +### 3. 改进文档 + +需要在Migration Guide中明确说明: + +1. **Qdrant用户的特殊注意事项** +2. **如何手动迁移旧collection** +3. **升级前的备份建议** +4. **验证迁移成功的步骤** + +--- + +## 总结 + +### 关键问题 + +1. ❌ **Qdrant向后兼容性严重问题** - 必须修复! +2. ❌ **测试覆盖不足** - 缺少关键场景测试 +3. ✅ **PostgreSQL迁移逻辑正确** +4. ✅ **代码风格基本一致** + +### 建议优先级 + +1. **P0 - 立即修复**: + - 修复Qdrant向后兼容性问题 + - 添加兼容性测试 + +2. **P1 - PR合并前**: + - 补充Case 1、Case 2测试 + - 添加workspace隔离测试 + - 更新Migration Guide文档 + +3. **P2 - 后续改进**: + - 补充边界情况测试 + - 统一注释语言 + - 添加更详细的错误信息 + +### 风险评估 + +- **不修复Qdrant兼容性**: 🔴 高风险 - 用户升级后数据丢失 +- **测试覆盖不足**: 🟡 中风险 - 生产环境可能出现未预期的问题 +- **文档不完整**: 🟡 中风险 - 用户不知道如何正确升级 + +--- + +## 下一步行动 + +1. 与用户确认是否接受方案A(推荐)或方案B +2. 实施选定的修复方案 +3. 补充关键测试 +4. 更新文档 +5. 重新运行所有E2E测试 +6. 准备发布 diff --git a/examples/multi_model_demo.py b/examples/multi_model_demo.py index 5cde64a6..000c841c 100644 --- a/examples/multi_model_demo.py +++ b/examples/multi_model_demo.py @@ -15,11 +15,10 @@ Requirements: - Qdrant or PostgreSQL for vector storage (optional, defaults to NanoVectorDB) """ -import os import asyncio from lightrag import LightRAG, QueryParam from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed -from lightrag.utils import EmbeddingFunc, logger +from lightrag.utils import EmbeddingFunc # Set your API key # os.environ["OPENAI_API_KEY"] = "your-api-key-here" @@ -33,22 +32,19 @@ async def scenario_1_new_workspace_with_explicit_model(): - Qdrant: lightrag_vdb_chunks_text_embedding_3_large_3072d - PostgreSQL: LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d """ - print("\n" + "="*80) + print("\n" + "=" * 80) print("Scenario 1: New Workspace with Explicit Model Name") - print("="*80) + print("=" * 80) # Define custom embedding function with explicit model name async def my_embedding_func(texts: list[str]): - return await openai_embed( - texts, - model="text-embedding-3-large" - ) + return await openai_embed(texts, model="text-embedding-3-large") # Create EmbeddingFunc with model_name specified embedding_func = EmbeddingFunc( embedding_dim=3072, func=my_embedding_func, - model_name="text-embedding-3-large" # Explicit model name + model_name="text-embedding-3-large", # Explicit model name ) rag = LightRAG( @@ -64,8 +60,7 @@ async def scenario_1_new_workspace_with_explicit_model(): # Query result = await rag.aquery( - "What does LightRAG support?", - param=QueryParam(mode="hybrid") + "What does LightRAG support?", param=QueryParam(mode="hybrid") ) print(f"\nQuery Result: {result[:200]}...") @@ -85,9 +80,9 @@ async def scenario_2_legacy_migration(): - Old: lightrag_vdb_chunks (no suffix) - New: lightrag_vdb_chunks_text_embedding_ada_002_1536d (with suffix) """ - print("\n" + "="*80) + print("\n" + "=" * 80) print("Scenario 2: Automatic Migration from Legacy Format") - print("="*80) + print("=" * 80) # Step 1: Simulate legacy workspace (no model_name) print("\n[Step 1] Creating legacy workspace without model_name...") @@ -98,7 +93,7 @@ async def scenario_2_legacy_migration(): # Legacy: No model_name specified legacy_embedding = EmbeddingFunc( embedding_dim=1536, - func=legacy_embedding_func + func=legacy_embedding_func, # model_name not specified → uses "unknown" as fallback ) @@ -121,7 +116,7 @@ async def scenario_2_legacy_migration(): new_embedding = EmbeddingFunc( embedding_dim=1536, func=legacy_embedding_func, - model_name="text-embedding-ada-002" # Now explicitly specified + model_name="text-embedding-ada-002", # Now explicitly specified ) rag_new = LightRAG( @@ -138,8 +133,7 @@ async def scenario_2_legacy_migration(): # Verify data is still accessible result = await rag_new.aquery( - "What is the legacy data?", - param=QueryParam(mode="hybrid") + "What is the legacy data?", param=QueryParam(mode="hybrid") ) print(f"\nQuery Result: {result[:200] if result else 'No results'}...") @@ -160,9 +154,9 @@ async def scenario_3_multiple_models_coexistence(): - Workspace A: lightrag_vdb_chunks_bge_small_768d - Workspace B: lightrag_vdb_chunks_bge_large_1024d """ - print("\n" + "="*80) + print("\n" + "=" * 80) print("Scenario 3: Multiple Models Coexistence") - print("="*80) + print("=" * 80) # Workspace A: Small embedding model (768 dimensions) print("\n[Workspace A] Using bge-small model (768d)...") @@ -175,7 +169,7 @@ async def scenario_3_multiple_models_coexistence(): embedding_a = EmbeddingFunc( embedding_dim=1536, # text-embedding-3-small dimension func=embedding_func_small, - model_name="text-embedding-3-small" + model_name="text-embedding-3-small", ) rag_a = LightRAG( @@ -199,7 +193,7 @@ async def scenario_3_multiple_models_coexistence(): embedding_b = EmbeddingFunc( embedding_dim=3072, # text-embedding-3-large dimension func=embedding_func_large, - model_name="text-embedding-3-large" + model_name="text-embedding-3-large", ) rag_b = LightRAG( @@ -217,12 +211,10 @@ async def scenario_3_multiple_models_coexistence(): print("\n[Verification] Querying both workspaces...") result_a = await rag_a.aquery( - "What model does workspace use?", - param=QueryParam(mode="hybrid") + "What model does workspace use?", param=QueryParam(mode="hybrid") ) result_b = await rag_b.aquery( - "What model does workspace use?", - param=QueryParam(mode="hybrid") + "What model does workspace use?", param=QueryParam(mode="hybrid") ) print(f"\nWorkspace A Result: {result_a[:100] if result_a else 'No results'}...") @@ -238,9 +230,9 @@ async def main(): """ Run all scenarios to demonstrate model isolation features """ - print("\n" + "="*80) + print("\n" + "=" * 80) print("LightRAG Multi-Model Vector Storage Isolation Demo") - print("="*80) + print("=" * 80) print("\nThis demo shows how LightRAG automatically handles:") print("1. ✅ Automatic model suffix generation") print("2. ✅ Seamless data migration from legacy format") @@ -256,9 +248,9 @@ async def main(): # Scenario 3: Multiple models coexistence await scenario_3_multiple_models_coexistence() - print("\n" + "="*80) + print("\n" + "=" * 80) print("✅ All scenarios completed successfully!") - print("="*80) + print("=" * 80) print("\n📝 Key Takeaways:") print("- Always specify `model_name` in EmbeddingFunc for clear model tracking") @@ -271,6 +263,7 @@ async def main(): except Exception as e: print(f"\n❌ Error: {e}") import traceback + traceback.print_exc() diff --git a/lightrag/kg/qdrant_impl.py b/lightrag/kg/qdrant_impl.py index 6b0db51f..b63aadb5 100644 --- a/lightrag/kg/qdrant_impl.py +++ b/lightrag/kg/qdrant_impl.py @@ -66,6 +66,45 @@ def workspace_filter_condition(workspace: str) -> models.FieldCondition: ) +def _find_legacy_collection( + client: QdrantClient, namespace: str, workspace: str = None +) -> str | None: + """ + Find legacy collection with backward compatibility support. + + This function tries multiple naming patterns to locate legacy collections + created by older versions of LightRAG: + + 1. lightrag_vdb_{namespace} - Current legacy format + 2. {workspace}_{namespace} - Old format with workspace (pre-model-isolation) + 3. {namespace} - Old format without workspace (pre-model-isolation) + + Args: + client: QdrantClient instance + namespace: Base namespace (e.g., "chunks", "entities") + workspace: Optional workspace identifier + + Returns: + Collection name if found, None otherwise + """ + # Try multiple naming patterns for backward compatibility + candidates = [ + f"lightrag_vdb_{namespace}", # New legacy format + f"{workspace}_{namespace}" if workspace else None, # Old format with workspace + namespace, # Old format without workspace + ] + + for candidate in candidates: + if candidate and client.collection_exists(candidate): + logger.info( + f"Qdrant: Found legacy collection '{candidate}' " + f"(namespace={namespace}, workspace={workspace or 'none'})" + ) + return candidate + + return None + + @final @dataclass class QdrantVectorDBStorage(BaseVectorStorage): @@ -85,27 +124,37 @@ class QdrantVectorDBStorage(BaseVectorStorage): def setup_collection( client: QdrantClient, collection_name: str, - legacy_namespace: str = None, + namespace: str = None, workspace: str = None, **kwargs, ): """ Setup Qdrant collection with migration support from legacy collections. + This method now supports backward compatibility by automatically detecting + legacy collections created by older versions of LightRAG using multiple + naming patterns. + Args: client: QdrantClient instance collection_name: Name of the new collection - legacy_namespace: Name of the legacy collection (if exists) + namespace: Base namespace (e.g., "chunks", "entities") workspace: Workspace identifier for data isolation **kwargs: Additional arguments for collection creation (vectors_config, hnsw_config, etc.) """ new_collection_exists = client.collection_exists(collection_name) - legacy_exists = legacy_namespace and client.collection_exists(legacy_namespace) + + # Try to find legacy collection with backward compatibility + legacy_collection = ( + _find_legacy_collection(client, namespace, workspace) if namespace else None + ) + legacy_exists = legacy_collection is not None # Case 1: Both new and legacy collections exist - Warning only (no migration) if new_collection_exists and legacy_exists: logger.warning( - f"Qdrant: Legacy collection '{legacy_namespace}' still exist. Remove it if migration is complete." + f"Qdrant: Legacy collection '{legacy_collection}' still exists. " + f"Remove it if migration is complete." ) return @@ -149,13 +198,13 @@ class QdrantVectorDBStorage(BaseVectorStorage): # Case 4: Only legacy exists - Migrate data logger.info( - f"Qdrant: Migrating data from legacy collection '{legacy_namespace}'" + f"Qdrant: Migrating data from legacy collection '{legacy_collection}'" ) try: # Get legacy collection count legacy_count = client.count( - collection_name=legacy_namespace, exact=True + collection_name=legacy_collection, exact=True ).count logger.info(f"Qdrant: Found {legacy_count} records in legacy collection") @@ -185,7 +234,7 @@ class QdrantVectorDBStorage(BaseVectorStorage): while True: # Scroll through legacy data result = client.scroll( - collection_name=legacy_namespace, + collection_name=legacy_collection, limit=batch_size, offset=offset, with_vectors=True, @@ -258,7 +307,7 @@ class QdrantVectorDBStorage(BaseVectorStorage): ), ) logger.info( - f"Qdrant: Migration from '{legacy_namespace}' to '{collection_name}' completed successfully" + f"Qdrant: Migration from '{legacy_collection}' to '{collection_name}' completed successfully" ) except QdrantMigrationError: @@ -350,11 +399,11 @@ class QdrantVectorDBStorage(BaseVectorStorage): ) # Setup collection (create if not exists and configure indexes) - # Pass legacy_namespace and workspace for migration support + # Pass namespace and workspace for backward-compatible migration support QdrantVectorDBStorage.setup_collection( self._client, self.final_namespace, - legacy_namespace=self.legacy_namespace, + namespace=self.namespace, workspace=self.effective_workspace, vectors_config=models.VectorParams( size=self.embedding_func.embedding_dim, diff --git a/lightrag/kg/shared_storage.py b/lightrag/kg/shared_storage.py index d56e0be0..3eb92f3f 100644 --- a/lightrag/kg/shared_storage.py +++ b/lightrag/kg/shared_storage.py @@ -176,7 +176,7 @@ class UnifiedLock(Generic[T]): enable_output=self._enable_logging, ) else: - direct_log( + direct_log( f"== Lock == Process {self._pid}: Main lock {self._name} is None (async={self._is_async})", level="WARNING", enable_output=self._enable_logging, diff --git a/tests/test_base_storage_integrity.py b/tests/test_base_storage_integrity.py index b8b4f6f7..1bd24777 100644 --- a/tests/test_base_storage_integrity.py +++ b/tests/test_base_storage_integrity.py @@ -2,33 +2,54 @@ import pytest from lightrag.base import BaseVectorStorage from lightrag.utils import EmbeddingFunc + def test_base_vector_storage_integrity(): # Just checking if we can import and inspect the class - assert hasattr(BaseVectorStorage, '_generate_collection_suffix') - assert hasattr(BaseVectorStorage, '_get_legacy_collection_name') - assert hasattr(BaseVectorStorage, '_get_new_collection_name') + assert hasattr(BaseVectorStorage, "_generate_collection_suffix") + assert hasattr(BaseVectorStorage, "_get_legacy_collection_name") + assert hasattr(BaseVectorStorage, "_get_new_collection_name") # Verify methods raise NotImplementedError class ConcreteStorage(BaseVectorStorage): - async def query(self, *args, **kwargs): pass - async def upsert(self, *args, **kwargs): pass - async def delete_entity(self, *args, **kwargs): pass - async def delete_entity_relation(self, *args, **kwargs): pass - async def get_by_id(self, *args, **kwargs): pass - async def get_by_ids(self, *args, **kwargs): pass - async def delete(self, *args, **kwargs): pass - async def get_vectors_by_ids(self, *args, **kwargs): pass - async def index_done_callback(self): pass - async def drop(self): pass + async def query(self, *args, **kwargs): + pass + + async def upsert(self, *args, **kwargs): + pass + + async def delete_entity(self, *args, **kwargs): + pass + + async def delete_entity_relation(self, *args, **kwargs): + pass + + async def get_by_id(self, *args, **kwargs): + pass + + async def get_by_ids(self, *args, **kwargs): + pass + + async def delete(self, *args, **kwargs): + pass + + async def get_vectors_by_ids(self, *args, **kwargs): + pass + + async def index_done_callback(self): + pass + + async def drop(self): + pass func = EmbeddingFunc(embedding_dim=128, func=lambda x: x) - storage = ConcreteStorage(namespace="test", workspace="test", global_config={}, embedding_func=func) - + storage = ConcreteStorage( + namespace="test", workspace="test", global_config={}, embedding_func=func + ) + assert storage._generate_collection_suffix() == "unknown_128d" - + with pytest.raises(NotImplementedError): storage._get_legacy_collection_name() - + with pytest.raises(NotImplementedError): storage._get_new_collection_name() - diff --git a/tests/test_embedding_func.py b/tests/test_embedding_func.py index 357e5808..8997a13a 100644 --- a/tests/test_embedding_func.py +++ b/tests/test_embedding_func.py @@ -1,37 +1,31 @@ -import pytest from lightrag.utils import EmbeddingFunc + def dummy_func(*args, **kwargs): pass + def test_embedding_func_with_model_name(): func = EmbeddingFunc( - embedding_dim=1536, - func=dummy_func, - model_name="text-embedding-ada-002" + embedding_dim=1536, func=dummy_func, model_name="text-embedding-ada-002" ) assert func.get_model_identifier() == "text_embedding_ada_002_1536d" + def test_embedding_func_without_model_name(): - func = EmbeddingFunc( - embedding_dim=768, - func=dummy_func - ) + func = EmbeddingFunc(embedding_dim=768, func=dummy_func) assert func.get_model_identifier() == "unknown_768d" + def test_model_name_sanitization(): func = EmbeddingFunc( embedding_dim=1024, func=dummy_func, - model_name="models/text-embedding-004" # Contains special chars + model_name="models/text-embedding-004", # Contains special chars ) assert func.get_model_identifier() == "models_text_embedding_004_1024d" -def test_model_name_with_uppercase(): - func = EmbeddingFunc( - embedding_dim=512, - func=dummy_func, - model_name="My-Model-V1" - ) - assert func.get_model_identifier() == "my_model_v1_512d" +def test_model_name_with_uppercase(): + func = EmbeddingFunc(embedding_dim=512, func=dummy_func, model_name="My-Model-V1") + assert func.get_model_identifier() == "my_model_v1_512d" diff --git a/tests/test_postgres_migration.py b/tests/test_postgres_migration.py index 8569335d..84cec898 100644 --- a/tests/test_postgres_migration.py +++ b/tests/test_postgres_migration.py @@ -1,13 +1,9 @@ -import os import pytest -from unittest.mock import MagicMock, patch, AsyncMock, call +from unittest.mock import patch, AsyncMock import numpy as np from lightrag.utils import EmbeddingFunc from lightrag.kg.postgres_impl import ( PGVectorStorage, - _pg_table_exists, - _pg_create_table, - PostgreSQLMigrationError, ) from lightrag.namespace import NameSpace @@ -56,29 +52,25 @@ def mock_embedding_func(): async def embed_func(texts, **kwargs): return np.array([[0.1] * 768 for _ in texts]) - func = EmbeddingFunc( - embedding_dim=768, - func=embed_func, - model_name="test_model" - ) + func = EmbeddingFunc(embedding_dim=768, func=embed_func, model_name="test_model") return func @pytest.mark.asyncio -async def test_postgres_table_naming(mock_client_manager, mock_pg_db, mock_embedding_func): +async def test_postgres_table_naming( + mock_client_manager, mock_pg_db, mock_embedding_func +): """Test if table name is correctly generated with model suffix""" config = { "embedding_batch_num": 10, - "vector_db_storage_cls_kwargs": { - "cosine_better_than_threshold": 0.8 - } + "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8}, } storage = PGVectorStorage( namespace=NameSpace.VECTOR_STORE_CHUNKS, global_config=config, embedding_func=mock_embedding_func, - workspace="test_ws" + workspace="test_ws", ) # Verify table name contains model suffix @@ -91,20 +83,20 @@ async def test_postgres_table_naming(mock_client_manager, mock_pg_db, mock_embed @pytest.mark.asyncio -async def test_postgres_migration_trigger(mock_client_manager, mock_pg_db, mock_embedding_func): +async def test_postgres_migration_trigger( + mock_client_manager, mock_pg_db, mock_embedding_func +): """Test if migration logic is triggered correctly""" config = { "embedding_batch_num": 10, - "vector_db_storage_cls_kwargs": { - "cosine_better_than_threshold": 0.8 - } + "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8}, } storage = PGVectorStorage( namespace=NameSpace.VECTOR_STORE_CHUNKS, global_config=config, embedding_func=mock_embedding_func, - workspace="test_ws" + workspace="test_ws", ) # Setup mocks for migration scenario @@ -132,9 +124,12 @@ async def test_postgres_migration_trigger(mock_client_manager, mock_pg_db, mock_ mock_pg_db.query = AsyncMock(side_effect=mock_query) - with patch("lightrag.kg.postgres_impl._pg_table_exists", side_effect=mock_table_exists), \ - patch("lightrag.kg.postgres_impl._pg_create_table", AsyncMock()): - + with ( + patch( + "lightrag.kg.postgres_impl._pg_table_exists", side_effect=mock_table_exists + ), + patch("lightrag.kg.postgres_impl._pg_create_table", AsyncMock()), + ): # Initialize storage (should trigger migration) await storage.initialize() @@ -144,29 +139,32 @@ async def test_postgres_migration_trigger(mock_client_manager, mock_pg_db, mock_ @pytest.mark.asyncio -async def test_postgres_no_migration_needed(mock_client_manager, mock_pg_db, mock_embedding_func): +async def test_postgres_no_migration_needed( + mock_client_manager, mock_pg_db, mock_embedding_func +): """Test scenario where new table already exists (no migration needed)""" config = { "embedding_batch_num": 10, - "vector_db_storage_cls_kwargs": { - "cosine_better_than_threshold": 0.8 - } + "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8}, } storage = PGVectorStorage( namespace=NameSpace.VECTOR_STORE_CHUNKS, global_config=config, embedding_func=mock_embedding_func, - workspace="test_ws" + workspace="test_ws", ) # Mock: new table already exists async def mock_table_exists(db, table_name): return table_name == storage.table_name - with patch("lightrag.kg.postgres_impl._pg_table_exists", side_effect=mock_table_exists), \ - patch("lightrag.kg.postgres_impl._pg_create_table", AsyncMock()) as mock_create: - + with ( + patch( + "lightrag.kg.postgres_impl._pg_table_exists", side_effect=mock_table_exists + ), + patch("lightrag.kg.postgres_impl._pg_create_table", AsyncMock()) as mock_create, + ): await storage.initialize() # Verify no table creation was attempted @@ -174,7 +172,9 @@ async def test_postgres_no_migration_needed(mock_client_manager, mock_pg_db, moc @pytest.mark.asyncio -async def test_scenario_1_new_workspace_creation(mock_client_manager, mock_pg_db, mock_embedding_func): +async def test_scenario_1_new_workspace_creation( + mock_client_manager, mock_pg_db, mock_embedding_func +): """ Scenario 1: New workspace creation @@ -185,31 +185,32 @@ async def test_scenario_1_new_workspace_creation(mock_client_manager, mock_pg_db """ config = { "embedding_batch_num": 10, - "vector_db_storage_cls_kwargs": { - "cosine_better_than_threshold": 0.8 - } + "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8}, } embedding_func = EmbeddingFunc( embedding_dim=3072, func=mock_embedding_func.func, - model_name="text-embedding-3-large" + model_name="text-embedding-3-large", ) storage = PGVectorStorage( namespace=NameSpace.VECTOR_STORE_CHUNKS, global_config=config, embedding_func=embedding_func, - workspace="new_workspace" + workspace="new_workspace", ) # Mock: neither table exists async def mock_table_exists(db, table_name): return False - with patch("lightrag.kg.postgres_impl._pg_table_exists", side_effect=mock_table_exists), \ - patch("lightrag.kg.postgres_impl._pg_create_table", AsyncMock()) as mock_create: - + with ( + patch( + "lightrag.kg.postgres_impl._pg_table_exists", side_effect=mock_table_exists + ), + patch("lightrag.kg.postgres_impl._pg_create_table", AsyncMock()) as mock_create, + ): await storage.initialize() # Verify table name format @@ -218,11 +219,15 @@ async def test_scenario_1_new_workspace_creation(mock_client_manager, mock_pg_db # Verify new table creation was called mock_create.assert_called_once() call_args = mock_create.call_args - assert call_args[0][1] == storage.table_name # table_name is second positional arg + assert ( + call_args[0][1] == storage.table_name + ) # table_name is second positional arg @pytest.mark.asyncio -async def test_scenario_2_legacy_upgrade_migration(mock_client_manager, mock_pg_db, mock_embedding_func): +async def test_scenario_2_legacy_upgrade_migration( + mock_client_manager, mock_pg_db, mock_embedding_func +): """ Scenario 2: Upgrade from legacy version @@ -233,22 +238,20 @@ async def test_scenario_2_legacy_upgrade_migration(mock_client_manager, mock_pg_ """ config = { "embedding_batch_num": 10, - "vector_db_storage_cls_kwargs": { - "cosine_better_than_threshold": 0.8 - } + "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8}, } embedding_func = EmbeddingFunc( embedding_dim=1536, func=mock_embedding_func.func, - model_name="text-embedding-ada-002" + model_name="text-embedding-ada-002", ) storage = PGVectorStorage( namespace=NameSpace.VECTOR_STORE_CHUNKS, global_config=config, embedding_func=embedding_func, - workspace="legacy_workspace" + workspace="legacy_workspace", ) # Mock: only legacy table exists @@ -257,7 +260,11 @@ async def test_scenario_2_legacy_upgrade_migration(mock_client_manager, mock_pg_ # Mock: legacy table has 50 records mock_rows = [ - {"id": f"legacy_id_{i}", "content": f"legacy_content_{i}", "workspace": "legacy_workspace"} + { + "id": f"legacy_id_{i}", + "content": f"legacy_content_{i}", + "workspace": "legacy_workspace", + } for i in range(50) ] @@ -279,9 +286,12 @@ async def test_scenario_2_legacy_upgrade_migration(mock_client_manager, mock_pg_ mock_pg_db.query = AsyncMock(side_effect=mock_query) - with patch("lightrag.kg.postgres_impl._pg_table_exists", side_effect=mock_table_exists), \ - patch("lightrag.kg.postgres_impl._pg_create_table", AsyncMock()) as mock_create: - + with ( + patch( + "lightrag.kg.postgres_impl._pg_table_exists", side_effect=mock_table_exists + ), + patch("lightrag.kg.postgres_impl._pg_create_table", AsyncMock()) as mock_create, + ): await storage.initialize() # Verify table name contains ada-002 @@ -293,7 +303,9 @@ async def test_scenario_2_legacy_upgrade_migration(mock_client_manager, mock_pg_ @pytest.mark.asyncio -async def test_scenario_3_multi_model_coexistence(mock_client_manager, mock_pg_db, mock_embedding_func): +async def test_scenario_3_multi_model_coexistence( + mock_client_manager, mock_pg_db, mock_embedding_func +): """ Scenario 3: Multiple embedding models coexist @@ -304,23 +316,19 @@ async def test_scenario_3_multi_model_coexistence(mock_client_manager, mock_pg_d """ config = { "embedding_batch_num": 10, - "vector_db_storage_cls_kwargs": { - "cosine_better_than_threshold": 0.8 - } + "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8}, } # Workspace A: uses bge-small (768d) embedding_func_a = EmbeddingFunc( - embedding_dim=768, - func=mock_embedding_func.func, - model_name="bge-small" + embedding_dim=768, func=mock_embedding_func.func, model_name="bge-small" ) storage_a = PGVectorStorage( namespace=NameSpace.VECTOR_STORE_CHUNKS, global_config=config, embedding_func=embedding_func_a, - workspace="workspace_a" + workspace="workspace_a", ) # Workspace B: uses bge-large (1024d) @@ -328,16 +336,14 @@ async def test_scenario_3_multi_model_coexistence(mock_client_manager, mock_pg_d return np.array([[0.1] * 1024 for _ in texts]) embedding_func_b = EmbeddingFunc( - embedding_dim=1024, - func=embed_func_b, - model_name="bge-large" + embedding_dim=1024, func=embed_func_b, model_name="bge-large" ) storage_b = PGVectorStorage( namespace=NameSpace.VECTOR_STORE_CHUNKS, global_config=config, embedding_func=embedding_func_b, - workspace="workspace_b" + workspace="workspace_b", ) # Verify different table names @@ -349,9 +355,12 @@ async def test_scenario_3_multi_model_coexistence(mock_client_manager, mock_pg_d async def mock_table_exists(db, table_name): return False - with patch("lightrag.kg.postgres_impl._pg_table_exists", side_effect=mock_table_exists), \ - patch("lightrag.kg.postgres_impl._pg_create_table", AsyncMock()) as mock_create: - + with ( + patch( + "lightrag.kg.postgres_impl._pg_table_exists", side_effect=mock_table_exists + ), + patch("lightrag.kg.postgres_impl._pg_create_table", AsyncMock()) as mock_create, + ): # Initialize both storages await storage_a.initialize() await storage_b.initialize() diff --git a/tests/test_qdrant_migration.py b/tests/test_qdrant_migration.py index 403dff9b..12ff3845 100644 --- a/tests/test_qdrant_migration.py +++ b/tests/test_qdrant_migration.py @@ -1,9 +1,9 @@ -import os import pytest -from unittest.mock import MagicMock, patch, AsyncMock, call +from unittest.mock import MagicMock, patch, AsyncMock import numpy as np from lightrag.utils import EmbeddingFunc -from lightrag.kg.qdrant_impl import QdrantVectorDBStorage, compute_mdhash_id_for_qdrant +from lightrag.kg.qdrant_impl import QdrantVectorDBStorage + # Mock QdrantClient @pytest.fixture @@ -18,6 +18,7 @@ def mock_qdrant_client(): client.get_collection.return_value = collection_info yield client + # Mock get_data_init_lock to avoid async lock issues in tests @pytest.fixture(autouse=True) def mock_data_init_lock(): @@ -26,36 +27,32 @@ def mock_data_init_lock(): mock_lock.return_value = mock_lock_ctx yield mock_lock + # Mock Embedding function @pytest.fixture def mock_embedding_func(): async def embed_func(texts, **kwargs): return np.array([[0.1] * 768 for _ in texts]) - - func = EmbeddingFunc( - embedding_dim=768, - func=embed_func, - model_name="test-model" - ) + + func = EmbeddingFunc(embedding_dim=768, func=embed_func, model_name="test-model") return func + @pytest.mark.asyncio async def test_qdrant_collection_naming(mock_qdrant_client, mock_embedding_func): """Test if collection name is correctly generated with model suffix""" config = { "embedding_batch_num": 10, - "vector_db_storage_cls_kwargs": { - "cosine_better_than_threshold": 0.8 - } + "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8}, } - + storage = QdrantVectorDBStorage( namespace="chunks", global_config=config, embedding_func=mock_embedding_func, - workspace="test_ws" + workspace="test_ws", ) - + # Verify collection name contains model suffix expected_suffix = "test_model_768d" assert expected_suffix in storage.final_namespace @@ -64,99 +61,97 @@ async def test_qdrant_collection_naming(mock_qdrant_client, mock_embedding_func) # Verify legacy namespace (should not include workspace, just the base collection name) assert storage.legacy_namespace == "lightrag_vdb_chunks" + @pytest.mark.asyncio async def test_qdrant_migration_trigger(mock_qdrant_client, mock_embedding_func): """Test if migration logic is triggered correctly""" config = { "embedding_batch_num": 10, - "vector_db_storage_cls_kwargs": { - "cosine_better_than_threshold": 0.8 - } + "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8}, } - + storage = QdrantVectorDBStorage( namespace="chunks", global_config=config, embedding_func=mock_embedding_func, - workspace="test_ws" + workspace="test_ws", ) - + # Setup mocks for migration scenario # 1. New collection does not exist - mock_qdrant_client.collection_exists.side_effect = lambda name: name == storage.legacy_namespace - + mock_qdrant_client.collection_exists.side_effect = ( + lambda name: name == storage.legacy_namespace + ) + # 2. Legacy collection exists and has data mock_qdrant_client.count.return_value.count = 100 - + # 3. Mock scroll for data migration - from qdrant_client import models + mock_point = MagicMock() mock_point.id = "old_id" mock_point.vector = [0.1] * 768 mock_point.payload = {"content": "test"} - + # First call returns points, second call returns empty (end of scroll) - mock_qdrant_client.scroll.side_effect = [ - ([mock_point], "next_offset"), - ([], None) - ] - + mock_qdrant_client.scroll.side_effect = [([mock_point], "next_offset"), ([], None)] + # Initialize storage (triggers migration) await storage.initialize() - + # Verify migration steps # 1. Legacy count checked mock_qdrant_client.count.assert_any_call( - collection_name=storage.legacy_namespace, - exact=True + collection_name=storage.legacy_namespace, exact=True ) - + # 2. New collection created mock_qdrant_client.create_collection.assert_called() - + # 3. Data scrolled from legacy assert mock_qdrant_client.scroll.call_count >= 1 call_args = mock_qdrant_client.scroll.call_args_list[0] - assert call_args.kwargs['collection_name'] == storage.legacy_namespace - assert call_args.kwargs['limit'] == 500 - + assert call_args.kwargs["collection_name"] == storage.legacy_namespace + assert call_args.kwargs["limit"] == 500 + # 4. Data upserted to new mock_qdrant_client.upsert.assert_called() - + # 5. Payload index created mock_qdrant_client.create_payload_index.assert_called() + @pytest.mark.asyncio async def test_qdrant_no_migration_needed(mock_qdrant_client, mock_embedding_func): """Test scenario where new collection already exists""" config = { "embedding_batch_num": 10, - "vector_db_storage_cls_kwargs": { - "cosine_better_than_threshold": 0.8 - } + "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8}, } - + storage = QdrantVectorDBStorage( namespace="chunks", global_config=config, embedding_func=mock_embedding_func, - workspace="test_ws" + workspace="test_ws", ) - + # New collection exists and Legacy exists (warning case) # or New collection exists and Legacy does not exist (normal case) # Mocking case where both exist to test logic flow but without migration - - # Logic in code: + + # Logic in code: # Case 1: Both exist -> Warning only # Case 2: Only new exists -> Ensure index - + # Let's test Case 2: Only new collection exists - mock_qdrant_client.collection_exists.side_effect = lambda name: name == storage.final_namespace - + mock_qdrant_client.collection_exists.side_effect = ( + lambda name: name == storage.final_namespace + ) + # Initialize await storage.initialize() - + # Should check index but NOT migrate # In Qdrant implementation, Case 2 calls get_collection mock_qdrant_client.get_collection.assert_called_with(storage.final_namespace) @@ -167,8 +162,11 @@ async def test_qdrant_no_migration_needed(mock_qdrant_client, mock_embedding_fun # Tests for scenarios described in design document (Lines 606-649) # ============================================================================ + @pytest.mark.asyncio -async def test_scenario_1_new_workspace_creation(mock_qdrant_client, mock_embedding_func): +async def test_scenario_1_new_workspace_creation( + mock_qdrant_client, mock_embedding_func +): """ 场景1:新建workspace 预期:直接创建lightrag_vdb_chunks_text_embedding_3_large_3072d @@ -177,21 +175,19 @@ async def test_scenario_1_new_workspace_creation(mock_qdrant_client, mock_embedd large_model_func = EmbeddingFunc( embedding_dim=3072, func=mock_embedding_func.func, - model_name="text-embedding-3-large" + model_name="text-embedding-3-large", ) config = { "embedding_batch_num": 10, - "vector_db_storage_cls_kwargs": { - "cosine_better_than_threshold": 0.8 - } + "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8}, } storage = QdrantVectorDBStorage( namespace="chunks", global_config=config, embedding_func=large_model_func, - workspace="test_new" + workspace="test_new", ) # Case 3: Neither legacy nor new collection exists @@ -205,18 +201,27 @@ async def test_scenario_1_new_workspace_creation(mock_qdrant_client, mock_embedd assert storage.final_namespace == expected_collection # Verify create_collection was called with correct name - create_calls = [call for call in mock_qdrant_client.create_collection.call_args_list] + create_calls = [ + call for call in mock_qdrant_client.create_collection.call_args_list + ] assert len(create_calls) > 0 - assert create_calls[0][0][0] == expected_collection or create_calls[0].kwargs.get('collection_name') == expected_collection + assert ( + create_calls[0][0][0] == expected_collection + or create_calls[0].kwargs.get("collection_name") == expected_collection + ) # Verify no migration was attempted mock_qdrant_client.scroll.assert_not_called() - print(f"✅ Scenario 1: New workspace created with collection '{expected_collection}'") + print( + f"✅ Scenario 1: New workspace created with collection '{expected_collection}'" + ) @pytest.mark.asyncio -async def test_scenario_2_legacy_upgrade_migration(mock_qdrant_client, mock_embedding_func): +async def test_scenario_2_legacy_upgrade_migration( + mock_qdrant_client, mock_embedding_func +): """ 场景2:从旧版本升级 已存在lightrag_vdb_chunks(无后缀) @@ -226,34 +231,34 @@ async def test_scenario_2_legacy_upgrade_migration(mock_qdrant_client, mock_embe ada_func = EmbeddingFunc( embedding_dim=1536, func=mock_embedding_func.func, - model_name="text-embedding-ada-002" + model_name="text-embedding-ada-002", ) config = { "embedding_batch_num": 10, - "vector_db_storage_cls_kwargs": { - "cosine_better_than_threshold": 0.8 - } + "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8}, } storage = QdrantVectorDBStorage( namespace="chunks", global_config=config, embedding_func=ada_func, - workspace="test_legacy" + workspace="test_legacy", ) legacy_collection = storage.legacy_namespace new_collection = storage.final_namespace # Case 4: Only legacy collection exists - mock_qdrant_client.collection_exists.side_effect = lambda name: name == legacy_collection + mock_qdrant_client.collection_exists.side_effect = ( + lambda name: name == legacy_collection + ) # Mock legacy data mock_qdrant_client.count.return_value.count = 150 # Mock scroll results (simulate migration in batches) - from qdrant_client import models + mock_points = [] for i in range(10): point = MagicMock() @@ -263,10 +268,7 @@ async def test_scenario_2_legacy_upgrade_migration(mock_qdrant_client, mock_embe mock_points.append(point) # First batch returns points, second batch returns empty - mock_qdrant_client.scroll.side_effect = [ - (mock_points, "offset1"), - ([], None) - ] + mock_qdrant_client.scroll.side_effect = [(mock_points, "offset1"), ([], None)] # Initialize (triggers migration) await storage.initialize() @@ -278,8 +280,7 @@ async def test_scenario_2_legacy_upgrade_migration(mock_qdrant_client, mock_embe # Verify migration steps # 1. Check legacy count mock_qdrant_client.count.assert_any_call( - collection_name=legacy_collection, - exact=True + collection_name=legacy_collection, exact=True ) # 2. Create new collection @@ -288,14 +289,16 @@ async def test_scenario_2_legacy_upgrade_migration(mock_qdrant_client, mock_embe # 3. Scroll legacy data scroll_calls = [call for call in mock_qdrant_client.scroll.call_args_list] assert len(scroll_calls) >= 1 - assert scroll_calls[0].kwargs['collection_name'] == legacy_collection + assert scroll_calls[0].kwargs["collection_name"] == legacy_collection # 4. Upsert to new collection upsert_calls = [call for call in mock_qdrant_client.upsert.call_args_list] assert len(upsert_calls) >= 1 - assert upsert_calls[0].kwargs['collection_name'] == new_collection + assert upsert_calls[0].kwargs["collection_name"] == new_collection - print(f"✅ Scenario 2: Legacy data migrated from '{legacy_collection}' to '{expected_new_collection}'") + print( + f"✅ Scenario 2: Legacy data migrated from '{legacy_collection}' to '{expected_new_collection}'" + ) @pytest.mark.asyncio @@ -304,14 +307,13 @@ async def test_scenario_3_multi_model_coexistence(mock_qdrant_client): 场景3:多模型并存 预期:两个独立的collection,互不干扰 """ + # Model A: bge-small with 768d async def embed_func_a(texts, **kwargs): return np.array([[0.1] * 768 for _ in texts]) model_a_func = EmbeddingFunc( - embedding_dim=768, - func=embed_func_a, - model_name="bge-small" + embedding_dim=768, func=embed_func_a, model_name="bge-small" ) # Model B: bge-large with 1024d @@ -319,16 +321,12 @@ async def test_scenario_3_multi_model_coexistence(mock_qdrant_client): return np.array([[0.2] * 1024 for _ in texts]) model_b_func = EmbeddingFunc( - embedding_dim=1024, - func=embed_func_b, - model_name="bge-large" + embedding_dim=1024, func=embed_func_b, model_name="bge-large" ) config = { "embedding_batch_num": 10, - "vector_db_storage_cls_kwargs": { - "cosine_better_than_threshold": 0.8 - } + "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.8}, } # Create storage for workspace A with model A @@ -336,7 +334,7 @@ async def test_scenario_3_multi_model_coexistence(mock_qdrant_client): namespace="chunks", global_config=config, embedding_func=model_a_func, - workspace="workspace_a" + workspace="workspace_a", ) # Create storage for workspace B with model B @@ -344,7 +342,7 @@ async def test_scenario_3_multi_model_coexistence(mock_qdrant_client): namespace="chunks", global_config=config, embedding_func=model_b_func, - workspace="workspace_b" + workspace="workspace_b", ) # Verify: Collection names are different @@ -362,7 +360,7 @@ async def test_scenario_3_multi_model_coexistence(mock_qdrant_client): assert storage_a.embedding_func.embedding_dim == 768 assert storage_b.embedding_func.embedding_dim == 1024 - print(f"✅ Scenario 3: Multi-model coexistence verified") + print("✅ Scenario 3: Multi-model coexistence verified") print(f" - Workspace A: {expected_collection_a} (768d)") print(f" - Workspace B: {expected_collection_b} (1024d)") - print(f" - Collections are independent") + print(" - Collections are independent")