Introduces exponential backoff and Retry-After header handling for embedding API rate limits in both processors and search service. Adds CI fixture to mock OpenAI embeddings, avoiding real API calls during tests. Updates Makefile to document and set MOCK_EMBEDDINGS for integration and CI test targets.
211 lines
No EOL
8 KiB
Python
import asyncio
|
|
import os
|
|
import tempfile
|
|
from pathlib import Path
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
import pytest_asyncio
|
|
from dotenv import load_dotenv
|
|
|
|
# Load environment variables from a local .env file (if present) so tests
# run with the same configuration as a normal application start.
load_dotenv()

# Force no-auth mode for testing by setting OAuth credentials to empty strings
# BEFORE any src.* modules are imported, so their module-level configuration
# sees the test settings. This ensures anonymous JWT tokens are created
# automatically for every test session.
os.environ['GOOGLE_OAUTH_CLIENT_ID'] = ''
os.environ['GOOGLE_OAUTH_CLIENT_SECRET'] = ''

# NOTE: deliberately imported after the environment tweaks above.
from src.config.settings import clients
from src.session_manager import SessionManager
from src.main import generate_jwt_keys
|
|
|
|
|
|
# Mock embeddings for CI environment to avoid rate limits
@pytest.fixture(scope="session", autouse=True)
def mock_openai_embeddings():
    """Mock OpenAI embeddings API calls in CI environment to avoid rate limits.

    Mocking is active only when the ``CI`` env var is set, or when
    ``MOCK_EMBEDDINGS`` is truthy ("true"/"1"/"yes", case-insensitive).

    Yields:
        The mocked async OpenAI client instance while mocking is active,
        otherwise ``None`` (real API is used).
    """
    # Only mock in CI environment (or when explicitly requested).
    if os.getenv("CI") or os.getenv("MOCK_EMBEDDINGS", "false").lower() in ("true", "1", "yes"):
        print("[DEBUG] Mocking OpenAI embeddings for CI environment")

        # Hoisted out of the per-text loop below: import once per fixture.
        import hashlib

        def create_mock_embedding(texts, model="text-embedding-3-small", **kwargs):
            """Create mock embeddings with proper dimensions based on model."""
            # Look up the vector size for the requested model; fall back to
            # the WatsonX table, then to 1536 (OpenAI's common default).
            from src.config.settings import OPENAI_EMBEDDING_DIMENSIONS, WATSONX_EMBEDDING_DIMENSIONS

            dimensions = OPENAI_EMBEDDING_DIMENSIONS.get(
                model,
                WATSONX_EMBEDDING_DIMENSIONS.get(model, 1536)
            )

            # The real API accepts a single string or a list of strings.
            if isinstance(texts, str):
                texts = [texts]

            # Build one deterministic vector per input text, seeded from the
            # text's MD5 hash, so identical texts always embed identically
            # across runs (important for reproducible search tests).
            mock_data = []
            for idx, text in enumerate(texts):
                text_hash = int(hashlib.md5(text.encode()).hexdigest(), 16)
                embedding = [(text_hash % 1000) / 1000.0 + i / dimensions for i in range(dimensions)]
                mock_data.append(MagicMock(embedding=embedding, index=idx))

            mock_response = MagicMock()
            mock_response.data = mock_data
            return mock_response

        async def async_create_mock_embedding(model, input, **kwargs):
            """Async shim matching ``client.embeddings.create(model=..., input=...)``."""
            return create_mock_embedding(input, model, **kwargs)

        # Patch the OpenAI client class so any AsyncOpenAI() constructed
        # during the session is our mock instead of a real network client.
        with patch('openai.AsyncOpenAI') as mock_async_openai:
            # Create a mock client instance exposing only what tests touch:
            # an embeddings.create coroutine and a close() coroutine.
            mock_client_instance = MagicMock()
            mock_embeddings = MagicMock()
            mock_embeddings.create = AsyncMock(side_effect=async_create_mock_embedding)
            mock_client_instance.embeddings = mock_embeddings
            mock_client_instance.close = AsyncMock()

            # Make AsyncOpenAI() return our mock instance
            mock_async_openai.return_value = mock_client_instance

            # Also patch the agentd patch function to return the mock, since
            # the app may obtain its client through that helper instead.
            with patch('agentd.patch.patch_openai_with_mcp', return_value=mock_client_instance):
                yield mock_client_instance
    else:
        # In non-CI environments, don't mock - use real API
        yield None
|
|
|
|
|
|
@pytest_asyncio.fixture(scope="session", autouse=True)
async def onboard_system(mock_openai_embeddings):
    """Perform initial onboarding once for all tests in the session.

    This ensures the OpenRAG config is marked as edited and properly initialized
    so that tests can use the /settings endpoint.

    Args:
        mock_openai_embeddings: session fixture result — the mocked OpenAI
            client when mocking is active (CI), otherwise ``None``.
    """
    import shutil

    # Delete any existing config to ensure clean onboarding.
    config_file = Path("config/config.yaml")
    if config_file.exists():
        config_file.unlink()

    # Clean up the OpenSearch data directory for a fresh test state.
    # Best-effort: a locked or busy directory must not abort the session.
    opensearch_data_path = Path(os.getenv("OPENSEARCH_DATA_PATH", "./opensearch-data"))
    if opensearch_data_path.exists():
        try:
            shutil.rmtree(opensearch_data_path)
            print(f"[DEBUG] Cleaned up OpenSearch data directory: {opensearch_data_path}")
        except Exception as e:
            print(f"[DEBUG] Could not clean OpenSearch data directory: {e}")

    # If we're using mocks, inject the mock client BEFORE clients.initialize()
    # so every downstream embedding call goes through the mock.
    if mock_openai_embeddings is not None:
        print("[DEBUG] Using mock OpenAI embeddings client")
        # Replace the client's patched_async_client with our mock
        clients._patched_async_client = mock_openai_embeddings

    # Initialize shared service clients (OpenSearch, embeddings, ...).
    await clients.initialize()

    # Create the app and perform onboarding via the HTTP API, exactly as a
    # first-run user would, so config ends up in the "edited" state.
    from src.main import create_app, startup_tasks
    import httpx

    app = await create_app()
    await startup_tasks(app.state.services)

    transport = httpx.ASGITransport(app=app)
    async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client:
        onboarding_payload = {
            "llm_provider": "openai",
            "embedding_provider": "openai",
            "embedding_model": "text-embedding-3-small",
            "llm_model": "gpt-4o-mini",
            "sample_data": False,
        }
        resp = await client.post("/onboarding", json=onboarding_payload)
        if resp.status_code not in (200, 204):
            # If it fails, it might already be onboarded, which is fine
            print(f"[DEBUG] Onboarding returned {resp.status_code}: {resp.text}")
        else:
            # Plain string literal — was a needless f-string with no fields.
            print("[DEBUG] Session onboarding completed successfully")

    yield

    # Cleanup after all tests: best-effort close of shared clients.
    try:
        await clients.close()
    except Exception:
        pass
|
|
|
|
|
|
@pytest.fixture(scope="session")
def event_loop():
    """Provide one asyncio event loop shared by the whole test session."""
    policy = asyncio.get_event_loop_policy()
    session_loop = policy.new_event_loop()
    yield session_loop
    session_loop.close()
|
|
|
|
|
|
@pytest_asyncio.fixture
async def opensearch_client():
    """Yield the shared OpenSearch client; needs a running OpenSearch node."""
    await clients.initialize()
    client = clients.opensearch
    yield client
    # Teardown: drop the test index; ignore errors if it never existed.
    try:
        await clients.opensearch.indices.delete(index="test_documents")
    except Exception:
        pass
|
|
|
|
|
|
@pytest.fixture
def session_manager():
    """Build a SessionManager backed by freshly generated RSA JWT keys."""
    # Keys must exist on disk before SessionManager is constructed.
    generate_jwt_keys()
    manager = SessionManager("test-secret-key")
    print(f"[DEBUG] SessionManager created with keys: private={manager.private_key_path}, public={manager.public_key_path}")
    return manager
|
|
|
|
|
|
@pytest.fixture
def test_documents_dir():
    """Create a temporary directory populated with markdown test documents."""
    documents = {
        "test1.md": "# Machine Learning Document\n\nThis is a test document about machine learning.",
        "test2.md": "# AI Document\n\nAnother document discussing artificial intelligence.",
        "test3.md": "# Data Science Document\n\nThis is a markdown file about data science.",
    }
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)

        # Top-level documents in supported formats.
        for filename, text in documents.items():
            (root / filename).write_text(text)

        # One nested document to exercise recursive directory discovery.
        nested_dir = root / "subdir"
        nested_dir.mkdir()
        (nested_dir / "nested.md").write_text("# Neural Networks\n\nNested document about neural networks.")

        yield root
|
|
|
|
|
|
@pytest.fixture
def test_single_file():
    """Create a single markdown test file, deleting it after the test."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='_test_document.md', delete=False) as handle:
        handle.write("# Single Test Document\n\nThis is a test document about OpenRAG testing framework. This document contains multiple sentences to ensure proper chunking. The content should be indexed and searchable in OpenSearch after processing.")
        file_path = handle.name

    yield file_path

    # Teardown: remove the file; a test may already have deleted it.
    Path(file_path).unlink(missing_ok=True)