LightRAG/examples/multi_model_demo.py

"""
Multi-Model Vector Storage Isolation Demo
This example demonstrates LightRAG's automatic model isolation feature for vector storage.
When using different embedding models, LightRAG automatically creates separate collections/tables,
preventing dimension mismatches and data pollution.
Key Features:
- Automatic model suffix generation: {model_name}_{dim}d
- Seamless migration from legacy (no-suffix) to new (with-suffix) collections
- Support for multiple workspaces with different embedding models
Requirements:
- OpenAI API key (or any OpenAI-compatible API)
- Qdrant or PostgreSQL for vector storage (optional, defaults to NanoVectorDB)
"""

import asyncio

from lightrag import LightRAG, QueryParam
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.utils import EmbeddingFunc

# Set your API key
# os.environ["OPENAI_API_KEY"] = "your-api-key-here"
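
# Illustrative only (not LightRAG's internal code): a rough sketch of how a
# collection/table suffix such as "text_embedding_3_large_3072d" can be derived
# from a model name and embedding dimension, following the {model_name}_{dim}d
# pattern described in the module docstring. The library computes this suffix
# internally, and its exact sanitization rules may differ.
def _example_model_suffix(model_name: str, embedding_dim: int) -> str:
    sanitized = model_name.replace("-", "_").replace(".", "_").replace("/", "_")
    return f"{sanitized}_{embedding_dim}d"

# e.g. _example_model_suffix("text-embedding-3-large", 3072)
# -> "text_embedding_3_large_3072d"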

async def scenario_1_new_workspace_with_explicit_model():
    """
    Scenario 1: Creating a new workspace with an explicit model name

    Result: Creates a collection/table with a name like:
    - Qdrant: lightrag_vdb_chunks_text_embedding_3_large_3072d
    - PostgreSQL: LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d
    """
    print("\n" + "=" * 80)
    print("Scenario 1: New Workspace with Explicit Model Name")
    print("=" * 80)

    # Define custom embedding function with explicit model name
    async def my_embedding_func(texts: list[str]):
        return await openai_embed(texts, model="text-embedding-3-large")

    # Create EmbeddingFunc with model_name specified
    embedding_func = EmbeddingFunc(
        embedding_dim=3072,
        func=my_embedding_func,
        model_name="text-embedding-3-large",  # Explicit model name
    )

    rag = LightRAG(
        working_dir="./workspace_large_model",
        llm_model_func=gpt_4o_mini_complete,
        embedding_func=embedding_func,
    )
    await rag.initialize_storages()

    # Insert sample data
    await rag.ainsert("LightRAG supports automatic model isolation for vector storage.")

    # Query
    result = await rag.aquery(
        "What does LightRAG support?", param=QueryParam(mode="hybrid")
    )
    print(f"\nQuery Result: {result[:200]}...")
    print("\n✅ Collection/table created with suffix: text_embedding_3_large_3072d")

    await rag.close()
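
# The scenarios in this demo use the default NanoVectorDB backend. To see the
# Qdrant or PostgreSQL collection/table naming shown in the docstrings, LightRAG
# can be pointed at another vector backend when constructing the instance. The
# storage class name and environment variable below are assumptions for
# illustration; check the LightRAG storage documentation for the exact values
# your version supports.
#
#   # os.environ["QDRANT_URL"] = "http://localhost:6333"
#   # rag = LightRAG(
#   #     working_dir="./workspace_large_model",
#   #     llm_model_func=gpt_4o_mini_complete,
#   #     embedding_func=embedding_func,
#   #     vector_storage="QdrantVectorDBStorage",  # assumed class name
#   # )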

async def scenario_2_legacy_migration():
    """
    Scenario 2: Upgrading from a legacy version (without model_name)

    If you previously used LightRAG without specifying model_name,
    the first run with model_name will automatically migrate your data.

    Result: Data is migrated from:
    - Old: lightrag_vdb_chunks (no suffix)
    - New: lightrag_vdb_chunks_text_embedding_ada_002_1536d (with suffix)
    """
    print("\n" + "=" * 80)
    print("Scenario 2: Automatic Migration from Legacy Format")
    print("=" * 80)

    # Step 1: Simulate legacy workspace (no model_name)
    print("\n[Step 1] Creating legacy workspace without model_name...")

    async def legacy_embedding_func(texts: list[str]):
        return await openai_embed(texts, model="text-embedding-ada-002")

    # Legacy: No model_name specified
    legacy_embedding = EmbeddingFunc(
        embedding_dim=1536,
        func=legacy_embedding_func,
        # model_name not specified → uses "unknown" as fallback
    )

    rag_legacy = LightRAG(
        working_dir="./workspace_legacy",
        llm_model_func=gpt_4o_mini_complete,
        embedding_func=legacy_embedding,
    )
    await rag_legacy.initialize_storages()
    await rag_legacy.ainsert("Legacy data without model isolation.")
    await rag_legacy.close()
    print("✅ Legacy workspace created with suffix: unknown_1536d")

    # Step 2: Upgrade to new version with model_name
    print("\n[Step 2] Upgrading to new version with explicit model_name...")

    # New: With model_name specified
    new_embedding = EmbeddingFunc(
        embedding_dim=1536,
        func=legacy_embedding_func,
        model_name="text-embedding-ada-002",  # Now explicitly specified
    )

    rag_new = LightRAG(
        working_dir="./workspace_legacy",  # Same working directory
        llm_model_func=gpt_4o_mini_complete,
        embedding_func=new_embedding,
    )

    # On first initialization, LightRAG will:
    # 1. Detect that the legacy collection exists
    # 2. Automatically migrate data to the new collection with the model suffix
    # 3. Keep the legacy collection, which can be deleted after verification
    await rag_new.initialize_storages()

    # Verify the data is still accessible
    result = await rag_new.aquery(
        "What is the legacy data?", param=QueryParam(mode="hybrid")
    )
    print(f"\nQuery Result: {result[:200] if result else 'No results'}...")
    print("\n✅ Data migrated to: text_embedding_ada_002_1536d")
    print("   Legacy collection can be manually deleted after verification")

    await rag_new.close()
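
# Optional cleanup (sketch only, not executed by this demo): once the migrated
# data has been verified, the old collection/table can be dropped manually.
# The names below come from the scenario docstrings; confirm the actual names
# in your deployment before deleting anything.
#
#   # Qdrant (qdrant-client):
#   # from qdrant_client import QdrantClient
#   # client = QdrantClient(url="http://localhost:6333")
#   # client.delete_collection(collection_name="lightrag_vdb_chunks")
#
#   # PostgreSQL: DROP TABLE LIGHTRAG_VDB_CHUNKS;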

async def scenario_3_multiple_models_coexistence():
    """
    Scenario 3: Multiple workspaces with different embedding models

    Different embedding models create completely isolated collections/tables,
    allowing safe coexistence without dimension conflicts or data pollution.

    Result:
    - Workspace A: lightrag_vdb_chunks_text_embedding_3_small_1536d
    - Workspace B: lightrag_vdb_chunks_text_embedding_3_large_3072d
    """
    print("\n" + "=" * 80)
    print("Scenario 3: Multiple Models Coexistence")
    print("=" * 80)

    # Workspace A: Small embedding model (1536 dimensions)
    print("\n[Workspace A] Using text-embedding-3-small model (1536d)...")

    async def embedding_func_small(texts: list[str]):
        # In real usage, replace with your own small embedding model call
        return await openai_embed(texts, model="text-embedding-3-small")

    embedding_a = EmbeddingFunc(
        embedding_dim=1536,  # text-embedding-3-small dimension
        func=embedding_func_small,
        model_name="text-embedding-3-small",
    )

    rag_a = LightRAG(
        working_dir="./workspace_a",
        llm_model_func=gpt_4o_mini_complete,
        embedding_func=embedding_a,
    )
    await rag_a.initialize_storages()
    await rag_a.ainsert("Workspace A uses small embedding model for efficiency.")
    print("✅ Workspace A created with suffix: text_embedding_3_small_1536d")

    # Workspace B: Large embedding model (3072 dimensions)
    print("\n[Workspace B] Using text-embedding-3-large model (3072d)...")

    async def embedding_func_large(texts: list[str]):
        # In real usage, replace with your own large embedding model call
        return await openai_embed(texts, model="text-embedding-3-large")

    embedding_b = EmbeddingFunc(
        embedding_dim=3072,  # text-embedding-3-large dimension
        func=embedding_func_large,
        model_name="text-embedding-3-large",
    )

    rag_b = LightRAG(
        working_dir="./workspace_b",
        llm_model_func=gpt_4o_mini_complete,
        embedding_func=embedding_b,
    )
    await rag_b.initialize_storages()
    await rag_b.ainsert("Workspace B uses large embedding model for better accuracy.")
    print("✅ Workspace B created with suffix: text_embedding_3_large_3072d")

    # Verify isolation: query each workspace
    print("\n[Verification] Querying both workspaces...")
    result_a = await rag_a.aquery(
        "What model does this workspace use?", param=QueryParam(mode="hybrid")
    )
    result_b = await rag_b.aquery(
        "What model does this workspace use?", param=QueryParam(mode="hybrid")
    )
    print(f"\nWorkspace A Result: {result_a[:100] if result_a else 'No results'}...")
    print(f"Workspace B Result: {result_b[:100] if result_b else 'No results'}...")
    print("\n✅ Both workspaces operate independently without interference")

    await rag_a.close()
    await rag_b.close()

async def main():
    """
    Run all scenarios to demonstrate model isolation features
    """
    print("\n" + "=" * 80)
    print("LightRAG Multi-Model Vector Storage Isolation Demo")
    print("=" * 80)
    print("\nThis demo shows how LightRAG automatically handles:")
    print("1. ✅ Automatic model suffix generation")
    print("2. ✅ Seamless data migration from legacy format")
    print("3. ✅ Multiple embedding models coexistence")

    try:
        # Scenario 1: New workspace with explicit model
        await scenario_1_new_workspace_with_explicit_model()

        # Scenario 2: Legacy migration
        await scenario_2_legacy_migration()

        # Scenario 3: Multiple models coexistence
        await scenario_3_multiple_models_coexistence()

        print("\n" + "=" * 80)
        print("✅ All scenarios completed successfully!")
        print("=" * 80)
        print("\n📝 Key Takeaways:")
        print("- Always specify `model_name` in EmbeddingFunc for clear model tracking")
        print("- LightRAG automatically migrates legacy data on first run")
        print("- Different embedding models create isolated collections/tables")
        print("- Collection names follow pattern: {base_name}_{model_name}_{dim}d")
        print("\n📚 See the plan document for more details:")
        print("   .claude/plan/PR-vector-model-isolation.md")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback

        traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(main())
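
# To run this demo (adjust to your environment):
#   export OPENAI_API_KEY="your-api-key-here"
#   python examples/multi_model_demo.py
#
# Each scenario creates its own working directory (./workspace_large_model,
# ./workspace_legacy, ./workspace_a, ./workspace_b); delete them to start fresh.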