openrag/tests/conftest.py
Edwin Jose 13cc00043b Add robust retry and mocking for embedding API calls
Introduces exponential backoff and Retry-After header handling for embedding API rate limits in both processors and search service. Adds CI fixture to mock OpenAI embeddings, avoiding real API calls during tests. Updates Makefile to document and set MOCK_EMBEDDINGS for integration and CI test targets.
2025-11-26 15:54:38 -05:00

211 lines
No EOL
8 KiB
Python

import asyncio
import os
import tempfile
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
import pytest_asyncio
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()
# Force no-auth mode for testing by setting the Google OAuth credentials to
# empty strings, so that anonymous JWT tokens are created automatically.
# NOTE(review): these assignments sit before the `src.*` imports that follow,
# presumably because the settings are read at import time - keep this ordering.
os.environ['GOOGLE_OAUTH_CLIENT_ID'] = ''
os.environ['GOOGLE_OAUTH_CLIENT_SECRET'] = ''
from src.config.settings import clients
from src.session_manager import SessionManager
from src.main import generate_jwt_keys
# Mock embeddings for CI environment to avoid rate limits
@pytest.fixture(scope="session", autouse=True)
def mock_openai_embeddings():
    """Swap the OpenAI async client for an offline embeddings stub on request.

    The stub is installed when the ``CI`` environment variable is set to a
    non-empty value, or when ``MOCK_EMBEDDINGS`` is one of ``true``, ``1`` or
    ``yes`` (case-insensitive). Otherwise nothing is patched.

    Yields:
        The mock client while the patches are active, or ``None`` when the
        real API is left in place.
    """
    ci_value = os.getenv("CI")
    mock_flag = os.getenv("MOCK_EMBEDDINGS", "false").lower()
    if not (ci_value or mock_flag in ("true", "1", "yes")):
        # Outside CI (and without the opt-in flag) the real API is used.
        yield None
        return

    print("[DEBUG] Mocking OpenAI embeddings for CI environment")

    def build_embedding_response(texts, model="text-embedding-3-small", **kwargs):
        """Return a response-shaped mock holding one vector per input text."""
        import hashlib

        from src.config.settings import OPENAI_EMBEDDING_DIMENSIONS, WATSONX_EMBEDDING_DIMENSIONS

        # Vector length follows the model; unknown models fall back to 1536.
        watsonx_size = WATSONX_EMBEDDING_DIMENSIONS.get(model, 1536)
        vector_size = OPENAI_EMBEDDING_DIMENSIONS.get(model, watsonx_size)

        # A bare string is treated as a batch of one.
        batch = [texts] if isinstance(texts, str) else texts

        def vector_for(text):
            # The MD5 digest makes the vector a pure function of the text,
            # so repeated calls with the same input give the same embedding.
            digest = int(hashlib.md5(text.encode()).hexdigest(), 16)
            base = (digest % 1000) / 1000.0
            return [base + step / vector_size for step in range(vector_size)]

        response = MagicMock()
        response.data = [
            MagicMock(embedding=vector_for(text), index=position)
            for position, text in enumerate(batch)
        ]
        return response

    async def fake_embeddings_create(model, input, **kwargs):
        """Async adapter for the ``embeddings.create(model, input)`` call shape."""
        return build_embedding_response(input, model, **kwargs)

    # Assemble the stand-in client: awaitable ``embeddings.create`` and ``close``.
    stub_embeddings = MagicMock()
    stub_embeddings.create = AsyncMock(side_effect=fake_embeddings_create)
    stub_client = MagicMock()
    stub_client.embeddings = stub_embeddings
    stub_client.close = AsyncMock()

    # ``AsyncOpenAI()`` and the agentd patch helper both hand back the stub.
    with patch('openai.AsyncOpenAI', return_value=stub_client), \
            patch('agentd.patch.patch_openai_with_mcp', return_value=stub_client):
        yield stub_client
@pytest_asyncio.fixture(scope="session", autouse=True)
async def onboard_system(mock_openai_embeddings):
    """Perform initial onboarding once for all tests in the session.

    This ensures the OpenRAG config is marked as edited and properly
    initialized so that tests can use the /settings endpoint.

    Args:
        mock_openai_embeddings: Value yielded by the ``mock_openai_embeddings``
            fixture - a mock client, or ``None`` when the real API is used.

    Yields:
        Nothing; the fixture exists for its setup and teardown side effects.
    """
    import shutil

    import httpx

    from src.main import create_app, startup_tasks

    # Delete any existing config to ensure clean onboarding. ``missing_ok``
    # avoids the check-then-delete race of a separate ``exists()`` test.
    Path("config/config.yaml").unlink(missing_ok=True)

    # Clean up OpenSearch data directory to ensure fresh state for tests.
    # Best-effort: a failure here is reported but does not abort the session.
    opensearch_data_path = Path(os.getenv("OPENSEARCH_DATA_PATH", "./opensearch-data"))
    if opensearch_data_path.exists():
        try:
            shutil.rmtree(opensearch_data_path)
        except Exception as e:
            print(f"[DEBUG] Could not clean OpenSearch data directory: {e}")
        else:
            print(f"[DEBUG] Cleaned up OpenSearch data directory: {opensearch_data_path}")

    # If we're using mocks, make the shared clients object use the mock client.
    # NOTE(review): this is assigned before clients.initialize(); confirm that
    # initialize() does not reset ``_patched_async_client``.
    if mock_openai_embeddings is not None:
        print("[DEBUG] Using mock OpenAI embeddings client")
        clients._patched_async_client = mock_openai_embeddings

    # Initialize clients
    await clients.initialize()

    # Create app and perform onboarding via API (in-process, no real socket).
    app = await create_app()
    await startup_tasks(app.state.services)

    transport = httpx.ASGITransport(app=app)
    async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client:
        onboarding_payload = {
            "llm_provider": "openai",
            "embedding_provider": "openai",
            "embedding_model": "text-embedding-3-small",
            "llm_model": "gpt-4o-mini",
            "sample_data": False,
        }
        resp = await client.post("/onboarding", json=onboarding_payload)
        if resp.status_code not in (200, 204):
            # If it fails, it might already be onboarded, which is fine
            print(f"[DEBUG] Onboarding returned {resp.status_code}: {resp.text}")
        else:
            print("[DEBUG] Session onboarding completed successfully")

    yield

    # Cleanup after all tests. Still best-effort, but the failure is now
    # reported instead of being silently discarded.
    try:
        await clients.close()
    except Exception as exc:
        print(f"[DEBUG] Error while closing clients during teardown: {exc}")
@pytest.fixture(scope="session")
def event_loop():
    """Create one event loop shared by the whole test session.

    Yields:
        A new asyncio event loop, closed when the session ends.
    """
    # ``asyncio.new_event_loop()`` goes through the current policy itself, so
    # it is equivalent to ``get_event_loop_policy().new_event_loop()`` without
    # touching the policy API that is deprecated as of Python 3.14.
    loop = asyncio.new_event_loop()
    try:
        yield loop
    finally:
        # Close the loop even if the generator is torn down abnormally.
        loop.close()
@pytest_asyncio.fixture
async def opensearch_client():
    """Yield the shared OpenSearch client; a running OpenSearch is required.

    After the test, the ``test_documents`` index is deleted on a best-effort
    basis.
    """
    await clients.initialize()
    yield clients.opensearch

    # Teardown: drop the test index. Any failure (for example the index never
    # having been created) is deliberately ignored.
    try:
        indices = clients.opensearch.indices
        await indices.delete(index="test_documents")
    except Exception:
        pass
@pytest.fixture
def session_manager():
    """Return a ``SessionManager`` built after the JWT keys are generated."""
    # Key generation runs first, before the manager is constructed.
    generate_jwt_keys()

    manager = SessionManager("test-secret-key")
    private_path = manager.private_key_path
    public_path = manager.public_key_path
    print(f"[DEBUG] SessionManager created with keys: private={private_path}, public={public_path}")
    return manager
@pytest.fixture
def test_documents_dir():
    """Yield a temporary directory populated with markdown test documents.

    Three files are written at the top level and one inside ``subdir``. The
    whole tree is removed when the fixture is torn down.
    """
    # Top-level documents, written in this order.
    top_level_docs = (
        ("test1.md", "# Machine Learning Document\n\nThis is a test document about machine learning."),
        ("test2.md", "# AI Document\n\nAnother document discussing artificial intelligence."),
        ("test3.md", "# Data Science Document\n\nThis is a markdown file about data science."),
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)
        for filename, body in top_level_docs:
            (root / filename).write_text(body)

        # One nested document, for tests that walk subdirectories.
        nested_dir = root / "subdir"
        nested_dir.mkdir()
        (nested_dir / "nested.md").write_text("# Neural Networks\n\nNested document about neural networks.")

        # Yield inside the ``with`` so the directory outlives the test body.
        yield root
@pytest.fixture
def test_single_file():
    """Yield the path of a single temporary markdown document.

    The file is written and closed before the path is yielded, and removed
    afterwards if it still exists.
    """
    document_body = (
        "# Single Test Document\n\n"
        "This is a test document about OpenRAG testing framework. "
        "This document contains multiple sentences to ensure proper chunking. "
        "The content should be indexed and searchable in OpenSearch after processing."
    )

    # ``delete=False`` keeps the file on disk after the handle is closed.
    handle = tempfile.NamedTemporaryFile(mode='w', suffix='_test_document.md', delete=False)
    with handle:
        handle.write(document_body)
        temp_path = handle.name

    yield temp_path

    # Cleanup; a test may already have removed the file itself.
    Path(temp_path).unlink(missing_ok=True)