LightRAG/tests/test_empty_model_suffix.py
BukeLy 42df825d30 fix: handle empty model_suffix in Qdrant collection naming
This change ensures that when the model_suffix is empty, the final_namespace falls back to the legacy_namespace, preventing potential naming issues. A warning is logged to inform users about the missing model suffix and the fallback to the legacy naming scheme.

Additionally, comprehensive tests have been added to verify the behavior of both PostgreSQL and Qdrant storage when model_suffix is empty, ensuring that the naming conventions are correctly applied and that no trailing underscores are present.

Impact:
- Prevents crashes due to empty model_suffix
- Provides clear feedback to users regarding configuration issues
- Maintains backward compatibility with existing setups

Testing:
All new tests pass, validating the handling of empty model_suffix scenarios.
2025-11-20 01:55:20 +08:00

294 lines
10 KiB
Python

"""
Tests for handling empty model suffix in PostgreSQL and Qdrant storage.
This test module verifies that both storage backends gracefully handle
the case when _generate_collection_suffix() returns an empty string.
"""
import pytest
from unittest.mock import Mock, patch, MagicMock
from lightrag.base import BaseVectorStorage
from lightrag.utils import EmbeddingFunc
def dummy_embedding_func(*args, **kwargs):
    """Placeholder embedding callable for tests.

    Accepts any positional and keyword arguments, ignores them, and
    returns ``None``. The tests in this module only pass it to
    ``EmbeddingFunc`` as the ``func`` argument; none of them call it.
    """
class TestEmptyModelSuffix:
    """Test suite for handling empty model suffix scenarios.

    Covers PostgreSQL table naming, Qdrant collection naming, and the
    fallback chain of ``BaseVectorStorage._generate_collection_suffix``.
    """

    def test_postgres_table_name_with_empty_suffix(self):
        """
        Test PostgreSQL table name generation when model_suffix is empty.

        Bug Fix Verification:
        - Before: table_name = "LIGHTRAG_VDB_CHUNKS_" (trailing underscore)
        - After: table_name = "LIGHTRAG_VDB_CHUNKS" (fallback to base name)
        """
        from lightrag.kg.postgres_impl import PostgresVectorDBStorage
        from lightrag.kg.shared_storage import namespace_to_table_name

        # Mock restricted to `embedding_dim`, so it exposes no
        # get_model_identifier attribute.
        mock_embedding_func = Mock(spec=["embedding_dim"])
        mock_embedding_func.embedding_dim = 1536

        # global_config deliberately carries no "embedding_func" entry.
        global_config = {
            "embedding_batch_num": 100,
            "pgvector_precision": "hybrid",
            "pg_host": "localhost",
            "pg_port": 5432,
            "pg_user": "user",
            "pg_password": "password",
            "pg_database": "lightrag",
        }

        storage = PostgresVectorDBStorage(
            namespace="chunks",
            workspace="test",
            global_config=global_config,
            embedding_func=mock_embedding_func,
        )

        # Verify that:
        # 1. model_suffix is empty
        # 2. table_name doesn't have trailing underscore
        # 3. table_name equals the base table name
        assert storage.model_suffix == "", "model_suffix should be empty"
        # Must be `assert cond, msg`: `assert (cond, msg)` tests a non-empty
        # tuple, which is always truthy and therefore can never fail.
        assert not storage.table_name.endswith("_"), (
            f"table_name should not have trailing underscore: {storage.table_name}"
        )

        expected_base = namespace_to_table_name("chunks")
        assert storage.table_name == expected_base, (
            f"table_name should fallback to base name when model_suffix is empty. "
            f"Expected: {expected_base}, Got: {storage.table_name}"
        )

    def test_qdrant_collection_name_with_empty_suffix(self):
        """
        Test Qdrant collection name generation when model_suffix is empty.

        Bug Fix Verification:
        - Before: final_namespace = "lightrag_vdb_chunks_" (trailing underscore)
        - After: final_namespace = "lightrag_vdb_chunks" (fallback to legacy name)
        """
        from lightrag.kg.qdrant_impl import QdrantVectorDBStorage

        # Mock restricted to `embedding_dim`, so it exposes no
        # get_model_identifier attribute.
        mock_embedding_func = Mock(spec=["embedding_dim"])
        mock_embedding_func.embedding_dim = 1536

        # global_config deliberately carries no "embedding_func" entry.
        global_config = {
            "embedding_batch_num": 100,
            "qdrant_url": "http://localhost:6333",
        }

        storage = QdrantVectorDBStorage(
            namespace="chunks",
            workspace="test",
            global_config=global_config,
            embedding_func=mock_embedding_func,
        )

        # Verify that:
        # 1. model_suffix is empty
        # 2. final_namespace doesn't have trailing underscore
        # 3. final_namespace equals the legacy namespace
        assert storage._generate_collection_suffix() == "", (
            "model_suffix should be empty"
        )
        # Must be `assert cond, msg`: `assert (cond, msg)` tests a non-empty
        # tuple, which is always truthy and therefore can never fail.
        assert not storage.final_namespace.endswith("_"), (
            f"final_namespace should not have trailing underscore: {storage.final_namespace}"
        )
        assert storage.final_namespace == storage.legacy_namespace, (
            f"final_namespace should fallback to legacy_namespace when model_suffix is empty. "
            f"Expected: {storage.legacy_namespace}, Got: {storage.final_namespace}"
        )

    def test_postgres_table_name_with_valid_suffix(self):
        """
        Test PostgreSQL table name generation with valid model suffix.

        Verification:
        - When embedding_func has get_model_identifier, use it
        - table_name has proper format: base_table_model_suffix
        """
        from lightrag.kg.postgres_impl import PostgresVectorDBStorage
        from lightrag.kg.shared_storage import namespace_to_table_name

        # A real EmbeddingFunc built with a model_name.
        embedding_func = EmbeddingFunc(
            embedding_dim=1536,
            func=dummy_embedding_func,
            model_name="text-embedding-ada-002",
        )

        global_config = {
            "embedding_batch_num": 100,
            "pgvector_precision": "hybrid",
            "pg_host": "localhost",
            "pg_port": 5432,
            "pg_user": "user",
            "pg_password": "password",
            "pg_database": "lightrag",
            "embedding_func": embedding_func,
        }

        storage = PostgresVectorDBStorage(
            namespace="chunks",
            workspace="test",
            global_config=global_config,
            embedding_func=embedding_func,
        )

        # Verify that:
        # 1. model_suffix is not empty
        # 2. table_name has correct format
        assert storage.model_suffix != "", "model_suffix should not be empty"
        assert "_" in storage.table_name, (
            "table_name should contain underscore as separator"
        )

        # Expected format: base_table_model_suffix
        expected_base = namespace_to_table_name("chunks")
        expected_model_id = embedding_func.get_model_identifier()
        expected_table_name = f"{expected_base}_{expected_model_id}"
        assert storage.table_name == expected_table_name, (
            f"table_name format incorrect. Expected: {expected_table_name}, Got: {storage.table_name}"
        )

    def test_qdrant_collection_name_with_valid_suffix(self):
        """
        Test Qdrant collection name generation with valid model suffix.

        Verification:
        - When embedding_func has get_model_identifier, use it
        - final_namespace has proper format: lightrag_vdb_namespace_model_suffix
        """
        from lightrag.kg.qdrant_impl import QdrantVectorDBStorage

        # A real EmbeddingFunc built with a model_name.
        embedding_func = EmbeddingFunc(
            embedding_dim=1536,
            func=dummy_embedding_func,
            model_name="text-embedding-ada-002",
        )

        global_config = {
            "embedding_batch_num": 100,
            "qdrant_url": "http://localhost:6333",
            "embedding_func": embedding_func,
        }

        storage = QdrantVectorDBStorage(
            namespace="chunks",
            workspace="test",
            global_config=global_config,
            embedding_func=embedding_func,
        )

        # Verify that:
        # 1. model_suffix is not empty
        # 2. final_namespace has correct format
        model_suffix = storage._generate_collection_suffix()
        assert model_suffix != "", "model_suffix should not be empty"
        assert "_" in storage.final_namespace, (
            "final_namespace should contain underscore as separator"
        )

        # Expected format: lightrag_vdb_namespace_model_suffix
        expected_model_id = embedding_func.get_model_identifier()
        expected_collection_name = f"lightrag_vdb_chunks_{expected_model_id}"
        assert storage.final_namespace == expected_collection_name, (
            f"final_namespace format incorrect. Expected: {expected_collection_name}, Got: {storage.final_namespace}"
        )

    def test_suffix_generation_fallback_chain(self):
        """
        Test the fallback chain in _generate_collection_suffix.

        Verification:
        1. Direct method: embedding_func.get_model_identifier()
        2. Global config fallback: global_config["embedding_func"].get_model_identifier()
        3. Final fallback: return empty string
        """

        # Minimal concrete subclass: every storage operation is a no-op so
        # that only the inherited _generate_collection_suffix is exercised.
        class _StubStorage(BaseVectorStorage):
            async def query(self, *args, **kwargs):
                pass

            async def upsert(self, *args, **kwargs):
                pass

            async def delete_entity(self, *args, **kwargs):
                pass

            async def delete_entity_relation(self, *args, **kwargs):
                pass

            async def get_by_id(self, *args, **kwargs):
                pass

            async def get_by_ids(self, *args, **kwargs):
                pass

            async def delete(self, *args, **kwargs):
                pass

            async def get_vectors_by_ids(self, *args, **kwargs):
                pass

            async def index_done_callback(self):
                pass

            async def drop(self):
                pass

        # Case 1: Direct method available
        embedding_func = EmbeddingFunc(
            embedding_dim=1536, func=dummy_embedding_func, model_name="test-model"
        )
        storage = _StubStorage(
            namespace="test",
            workspace="test",
            global_config={},
            embedding_func=embedding_func,
        )
        assert (
            storage._generate_collection_suffix() == "test_model_1536d"
        ), "Should use direct method when available"

        # Case 2: Global config fallback
        mock_embedding_func = Mock(spec=[])  # No get_model_identifier
        storage = _StubStorage(
            namespace="test",
            workspace="test",
            global_config={"embedding_func": embedding_func},
            embedding_func=mock_embedding_func,
        )
        assert (
            storage._generate_collection_suffix() == "test_model_1536d"
        ), "Should fallback to global_config embedding_func"

        # Case 3: Final fallback (no embedding_func anywhere)
        storage = _StubStorage(
            namespace="test",
            workspace="test",
            global_config={},
            embedding_func=mock_embedding_func,
        )
        assert storage._generate_collection_suffix() == "", (
            "Should return empty string when no model_identifier available"
        )