"""
|
||
Multi-Model Vector Storage Isolation Demo
|
||
|
||
This example demonstrates LightRAG's automatic model isolation feature for vector storage.
|
||
When using different embedding models, LightRAG automatically creates separate collections/tables,
|
||
preventing dimension mismatches and data pollution.
|
||
|
||
Key Features:
|
||
- Automatic model suffix generation: {model_name}_{dim}d
|
||
- Seamless migration from legacy (no-suffix) to new (with-suffix) collections
|
||
- Support for multiple workspaces with different embedding models
|
||
|
||
Requirements:
|
||
- OpenAI API key (or any OpenAI-compatible API)
|
||
- Qdrant or PostgreSQL for vector storage (optional, defaults to NanoVectorDB)
|
||
"""

import asyncio

from lightrag import LightRAG, QueryParam
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.utils import EmbeddingFunc

# Set your API key
# os.environ["OPENAI_API_KEY"] = "your-api-key-here"
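
# NOTE: The helper below is only an illustration of the naming convention described
# in the module docstring ({model_name}_{dim}d, with non-alphanumeric characters in
# the model name normalized to underscores). It is a sketch for this demo, not
# LightRAG's internal implementation, and is not used by the scenarios below.
import re


def illustrate_model_suffix(model_name: str, embedding_dim: int) -> str:
    """Illustrative only: build a '{model_name}_{dim}d' style collection suffix."""
    sanitized = re.sub(r"[^A-Za-z0-9]+", "_", model_name).strip("_").lower()
    return f"{sanitized}_{embedding_dim}d"


# Example: illustrate_model_suffix("text-embedding-3-large", 3072)
# -> "text_embedding_3_large_3072d"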

async def scenario_1_new_workspace_with_explicit_model():
    """
    Scenario 1: Creating a new workspace with an explicit model name

    Result: Creates a collection/table with a name like:
    - Qdrant: lightrag_vdb_chunks_text_embedding_3_large_3072d
    - PostgreSQL: LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d
    """
    print("\n" + "=" * 80)
    print("Scenario 1: New Workspace with Explicit Model Name")
    print("=" * 80)

    # Define a custom embedding function with an explicit model name
    async def my_embedding_func(texts: list[str]):
        return await openai_embed(texts, model="text-embedding-3-large")

    # Create EmbeddingFunc with model_name specified
    embedding_func = EmbeddingFunc(
        embedding_dim=3072,
        func=my_embedding_func,
        model_name="text-embedding-3-large",  # Explicit model name
    )

    rag = LightRAG(
        working_dir="./workspace_large_model",
        llm_model_func=gpt_4o_mini_complete,
        embedding_func=embedding_func,
    )

    await rag.initialize_storages()

    # Insert sample data
    await rag.ainsert("LightRAG supports automatic model isolation for vector storage.")

    # Query
    result = await rag.aquery(
        "What does LightRAG support?", param=QueryParam(mode="hybrid")
    )

    print(f"\nQuery Result: {result[:200] if result else 'No results'}...")
    print("\n✅ Collection/table created with suffix: text_embedding_3_large_3072d")

    await rag.close()
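
# Optional check for Scenario 1 when Qdrant is the vector backend. This is a hedged
# sketch: it assumes a local Qdrant instance at http://localhost:6333 and
# `pip install qdrant-client`; the collection names referenced in the comments come
# from the docstring above and may differ in your deployment.
#
# from qdrant_client import QdrantClient
#
# client = QdrantClient(url="http://localhost:6333")
# for collection in client.get_collections().collections:
#     print(collection.name)  # expect a name like lightrag_vdb_chunks_text_embedding_3_large_3072d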

async def scenario_2_legacy_migration():
    """
    Scenario 2: Upgrading from a legacy version (without model_name)

    If you previously used LightRAG without specifying model_name,
    the first run with model_name will automatically migrate your data.

    Result: Data is migrated
    - from the old collection: lightrag_vdb_chunks (no suffix)
    - to the new collection: lightrag_vdb_chunks_text_embedding_ada_002_1536d (with suffix)
    """
    print("\n" + "=" * 80)
    print("Scenario 2: Automatic Migration from Legacy Format")
    print("=" * 80)

    # Step 1: Simulate a legacy workspace (no model_name)
    print("\n[Step 1] Creating legacy workspace without model_name...")

    async def legacy_embedding_func(texts: list[str]):
        return await openai_embed(texts, model="text-embedding-ada-002")

    # Legacy: no model_name specified
    legacy_embedding = EmbeddingFunc(
        embedding_dim=1536,
        func=legacy_embedding_func,
        # model_name not specified → uses "unknown" as fallback
    )

    rag_legacy = LightRAG(
        working_dir="./workspace_legacy",
        llm_model_func=gpt_4o_mini_complete,
        embedding_func=legacy_embedding,
    )

    await rag_legacy.initialize_storages()
    await rag_legacy.ainsert("Legacy data without model isolation.")
    await rag_legacy.close()

    print("✅ Legacy workspace created with suffix: unknown_1536d")

    # Step 2: Upgrade to the new version with model_name
    print("\n[Step 2] Upgrading to new version with explicit model_name...")

    # New: model_name specified
    new_embedding = EmbeddingFunc(
        embedding_dim=1536,
        func=legacy_embedding_func,
        model_name="text-embedding-ada-002",  # Now explicitly specified
    )

    rag_new = LightRAG(
        working_dir="./workspace_legacy",  # Same working directory
        llm_model_func=gpt_4o_mini_complete,
        embedding_func=new_embedding,
    )

    # On first initialization, LightRAG will:
    # 1. Detect that the legacy collection exists
    # 2. Automatically migrate its data to the new collection with the model suffix
    # 3. Keep the legacy collection, which can be deleted after verification
    await rag_new.initialize_storages()

    # Verify the data is still accessible
    result = await rag_new.aquery(
        "What is the legacy data?", param=QueryParam(mode="hybrid")
    )

    print(f"\nQuery Result: {result[:200] if result else 'No results'}...")
    print("\n✅ Data migrated to: text_embedding_ada_002_1536d")
    print("ℹ️ Legacy collection can be manually deleted after verification")

    await rag_new.close()
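
# After verifying the migration in Scenario 2, the legacy (no-suffix) collection can
# be removed manually. Hedged sketch for a Qdrant backend, reusing the assumptions
# from the snippet after Scenario 1; the legacy collection name is taken from the
# scenario docstring and may differ in your deployment.
#
# from qdrant_client import QdrantClient
#
# client = QdrantClient(url="http://localhost:6333")
# client.delete_collection(collection_name="lightrag_vdb_chunks")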

async def scenario_3_multiple_models_coexistence():
    """
    Scenario 3: Multiple workspaces with different embedding models

    Different embedding models create completely isolated collections/tables,
    allowing safe coexistence without dimension conflicts or data pollution.

    Result:
    - Workspace A: lightrag_vdb_chunks_text_embedding_3_small_1536d
    - Workspace B: lightrag_vdb_chunks_text_embedding_3_large_3072d
    """
    print("\n" + "=" * 80)
    print("Scenario 3: Multiple Models Coexistence")
    print("=" * 80)

    # Workspace A: small embedding model (1536 dimensions)
    print("\n[Workspace A] Using text-embedding-3-small model (1536d)...")

    async def embedding_func_small(texts: list[str]):
        # Smaller embedding model; replace with any embedding callable you use
        return await openai_embed(texts, model="text-embedding-3-small")

    embedding_a = EmbeddingFunc(
        embedding_dim=1536,  # text-embedding-3-small dimension
        func=embedding_func_small,
        model_name="text-embedding-3-small",
    )

    rag_a = LightRAG(
        working_dir="./workspace_a",
        llm_model_func=gpt_4o_mini_complete,
        embedding_func=embedding_a,
    )

    await rag_a.initialize_storages()
    await rag_a.ainsert("Workspace A uses small embedding model for efficiency.")

    print("✅ Workspace A created with suffix: text_embedding_3_small_1536d")

    # Workspace B: large embedding model (3072 dimensions)
    print("\n[Workspace B] Using text-embedding-3-large model (3072d)...")

    async def embedding_func_large(texts: list[str]):
        # Larger embedding model for higher accuracy
        return await openai_embed(texts, model="text-embedding-3-large")

    embedding_b = EmbeddingFunc(
        embedding_dim=3072,  # text-embedding-3-large dimension
        func=embedding_func_large,
        model_name="text-embedding-3-large",
    )

    rag_b = LightRAG(
        working_dir="./workspace_b",
        llm_model_func=gpt_4o_mini_complete,
        embedding_func=embedding_b,
    )

    await rag_b.initialize_storages()
    await rag_b.ainsert("Workspace B uses large embedding model for better accuracy.")

    print("✅ Workspace B created with suffix: text_embedding_3_large_3072d")

    # Verify isolation: query each workspace
    print("\n[Verification] Querying both workspaces...")

    result_a = await rag_a.aquery(
        "What model does workspace use?", param=QueryParam(mode="hybrid")
    )
    result_b = await rag_b.aquery(
        "What model does workspace use?", param=QueryParam(mode="hybrid")
    )

    print(f"\nWorkspace A Result: {result_a[:100] if result_a else 'No results'}...")
    print(f"Workspace B Result: {result_b[:100] if result_b else 'No results'}...")

    print("\n✅ Both workspaces operate independently without interference")

    await rag_a.close()
    await rag_b.close()

async def main():
    """
    Run all scenarios to demonstrate the model isolation features.
    """
    print("\n" + "=" * 80)
    print("LightRAG Multi-Model Vector Storage Isolation Demo")
    print("=" * 80)
    print("\nThis demo shows how LightRAG automatically handles:")
    print("1. ✅ Automatic model suffix generation")
    print("2. ✅ Seamless data migration from the legacy format")
    print("3. ✅ Coexistence of multiple embedding models")

    try:
        # Scenario 1: New workspace with explicit model
        await scenario_1_new_workspace_with_explicit_model()

        # Scenario 2: Legacy migration
        await scenario_2_legacy_migration()

        # Scenario 3: Multiple models coexistence
        await scenario_3_multiple_models_coexistence()

        print("\n" + "=" * 80)
        print("✅ All scenarios completed successfully!")
        print("=" * 80)

        print("\n📝 Key Takeaways:")
        print("- Always specify `model_name` in EmbeddingFunc for clear model tracking")
        print("- LightRAG automatically migrates legacy data on first run")
        print("- Different embedding models create isolated collections/tables")
        print("- Collection names follow the pattern: {base_name}_{model_name}_{dim}d")
        print("\n📚 See the plan document for more details:")
        print("    .claude/plan/PR-vector-model-isolation.md")

    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback

        traceback.print_exc()

if __name__ == "__main__":
    asyncio.run(main())