Introduces a full test suite under the tests/ directory, including API, service, connector, and utility tests, along with fixtures and documentation. Expands Makefile with granular test commands for unit, integration, API, service, connector, coverage, and quick tests. Adds configuration files for pytest and coverage reporting, and provides a quickstart guide for testing workflow.
182 lines
6.9 KiB
Python
182 lines
6.9 KiB
Python
"""
|
|
Tests for embeddings utility functions.
|
|
"""
|
|
|
|
import pytest
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add src to path
|
|
src_path = Path(__file__).parent.parent.parent / "src"
|
|
sys.path.insert(0, str(src_path))
|
|
|
|
from utils.embeddings import get_embedding_dimensions, create_dynamic_index_body
|
|
|
|
|
|
@pytest.mark.unit
class TestEmbeddingDimensions:
    """Unit tests covering ``get_embedding_dimensions`` model-name lookups."""

    def test_get_openai_embedding_dimensions(self):
        """Each common OpenAI embedding model reports a positive dimension."""
        openai_models = (
            "text-embedding-ada-002",
            "text-embedding-3-small",
            "text-embedding-3-large",
        )
        for model_name in openai_models:
            assert get_embedding_dimensions(model_name) > 0

    def test_get_ollama_embedding_dimensions(self):
        """A common Ollama embedding model reports a positive integer."""
        size = get_embedding_dimensions("nomic-embed-text")
        assert size > 0
        assert isinstance(size, int)

    def test_get_embedding_dimensions_with_version(self):
        """A version tag on the model name must not change the result."""
        tagged = get_embedding_dimensions("nomic-embed-text:latest")
        untagged = get_embedding_dimensions("nomic-embed-text")
        assert tagged == untagged

    def test_get_embedding_dimensions_case_insensitive(self):
        """Lower, upper and mixed-case spellings resolve to the same value."""
        spellings = ("nomic-embed-text", "NOMIC-EMBED-TEXT", "Nomic-Embed-Text")
        lower, upper, mixed = [get_embedding_dimensions(name) for name in spellings]
        assert lower == upper == mixed

    def test_get_embedding_dimensions_with_whitespace(self):
        """Surrounding whitespace in the model name is ignored."""
        bare, padded = [
            get_embedding_dimensions(name)
            for name in ("nomic-embed-text", " nomic-embed-text ")
        ]
        assert bare == padded

    def test_get_embedding_dimensions_unknown_model(self):
        """An unrecognised model still yields a positive integer dimension."""
        fallback = get_embedding_dimensions("unknown-model-xyz")
        assert isinstance(fallback, int)
        # The original test expects the default VECTOR_DIM to be returned here.
        assert fallback > 0

    def test_get_embedding_dimensions_empty_string(self):
        """An empty model name still yields a positive integer dimension."""
        fallback = get_embedding_dimensions("")
        assert isinstance(fallback, int)
        assert fallback > 0
|
|
|
|
|
|
@pytest.mark.unit
class TestCreateDynamicIndexBody:
    """Test suite for dynamic index body creation.

    Every test calls ``create_dynamic_index_body`` with an embedding model
    name and inspects the returned ``settings`` / ``mappings`` dictionary.
    """

    def test_create_index_body_structure(self):
        """Test that index body has correct structure."""
        body = create_dynamic_index_body("text-embedding-ada-002")

        assert "settings" in body
        assert "mappings" in body
        assert "index" in body["settings"]
        assert "knn" in body["settings"]["index"]
        assert body["settings"]["index"]["knn"] is True

    def test_create_index_body_mappings(self):
        """Test that index body has all required field mappings."""
        body = create_dynamic_index_body("nomic-embed-text")
        properties = body["mappings"]["properties"]

        # Check all required fields are present
        required_fields = [
            "document_id",
            "filename",
            "mimetype",
            "page",
            "text",
            "chunk_embedding",
            "source_url",
            "connector_type",
            "owner",
            "allowed_users",
            "allowed_groups",
            "user_permissions",
            "group_permissions",
            "created_time",
            "modified_time",
            "indexed_time",
            "metadata",
        ]

        for field in required_fields:
            assert field in properties, f"Field '{field}' missing from mappings"

    def test_create_index_body_embedding_dimensions(self):
        """Test that the embedding field is a sized knn_vector for several models."""
        models = [
            "text-embedding-ada-002",
            "nomic-embed-text",
            "text-embedding-3-small",
        ]

        for model in models:
            body = create_dynamic_index_body(model)
            embedding_config = body["mappings"]["properties"]["chunk_embedding"]

            assert "dimension" in embedding_config
            assert embedding_config["dimension"] > 0
            assert embedding_config["type"] == "knn_vector"

    def test_create_index_body_knn_method(self):
        """Test that KNN method configuration is correct."""
        body = create_dynamic_index_body("nomic-embed-text")
        knn_config = body["mappings"]["properties"]["chunk_embedding"]["method"]

        assert knn_config["name"] == "disk_ann"
        assert knn_config["engine"] == "jvector"
        assert knn_config["space_type"] == "l2"
        assert "ef_construction" in knn_config["parameters"]
        assert "m" in knn_config["parameters"]

    def test_create_index_body_field_types(self):
        """Test that field types are correctly set."""
        body = create_dynamic_index_body("nomic-embed-text")
        properties = body["mappings"]["properties"]

        # Test specific field types
        assert properties["document_id"]["type"] == "keyword"
        assert properties["filename"]["type"] == "keyword"
        assert properties["text"]["type"] == "text"
        assert properties["page"]["type"] == "integer"
        assert properties["created_time"]["type"] == "date"
        assert properties["metadata"]["type"] == "object"

    def test_create_index_body_shards_config(self):
        """Test that shard configuration is correct."""
        body = create_dynamic_index_body("nomic-embed-text")
        settings = body["settings"]

        assert settings["number_of_shards"] == 1
        assert settings["number_of_replicas"] == 1

    def test_create_index_body_different_models_different_dimensions(self):
        """Test that each model's index body carries that model's own dimension.

        Two models are allowed to share a vector size, so the test does not
        require the dimensions to differ. Instead it checks that the value in
        each body matches what ``get_embedding_dimensions`` reports for the
        same model, which fails if the model name is ignored when the body
        is built.
        """
        # NOTE(review): assumes create_dynamic_index_body sizes the vector
        # field from get_embedding_dimensions -- confirm against utils.embeddings.
        for model in ("text-embedding-ada-002", "text-embedding-3-large"):
            body = create_dynamic_index_body(model)
            dimension = body["mappings"]["properties"]["chunk_embedding"]["dimension"]

            assert isinstance(dimension, int)
            assert dimension == get_embedding_dimensions(model), (
                f"Index dimension for '{model}' does not match its embedding size"
            )

    def test_create_index_body_consistency(self):
        """Test that creating index body multiple times with same model is consistent."""
        model = "nomic-embed-text"

        body1 = create_dynamic_index_body(model)
        body2 = create_dynamic_index_body(model)

        assert body1 == body2
|