LightRAG/lightrag/evaluation/test_prompt_variants.py

#!/usr/bin/env python3
"""
Fast prompt A/B testing without a full backend restart.
Fetches context from the running LightRAG server (only_need_context mode),
calls the LLM directly with each prompt variant, and runs RAGAS eval on the
results. Roughly 30s per variant vs 5+ minutes for a full eval.
Usage:
# Test all variants on 3 queries
python test_prompt_variants.py
# Test specific number of queries
python test_prompt_variants.py --num-queries 5
# Test specific query indices
python test_prompt_variants.py --indices 0,5,9
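# Test only selected prompt variants
python test_prompt_variants.py --variants baseline,minimal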
"""
import argparse
import asyncio
import json
import os
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from dotenv import load_dotenv
# Load environment
load_dotenv()
# Ensure we can import lightrag
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from openai import AsyncOpenAI
# ============================================================================
# PROMPT VARIANTS - Edit these to test different prompts
# ============================================================================
BASELINE_PROMPT = """Answer based ONLY on this context:
{context_data}
---
GROUNDING RULES:
- Every claim must be directly supported by the context above
- Before writing each sentence, verify you can point to its source in context
- If information isn't in context, don't include it - even if it seems obvious
- Cover all relevant points from the context that address the question
DO NOT:
- Infer what "typically" happens in similar situations
- Add standard terminology not explicitly in the context
- Expand acronyms unless the expansion appears in context
- Fill in gaps with plausible-sounding details
RESPONSE FORMAT GUIDANCE:
Match your response structure to the question type:
- "What were the lessons learned / key takeaways / main themes..." → Enumerate distinct points (e.g., "Key lessons include: (1)..., (2)..., (3)...")
- "What challenges / issues / gaps..." → List specific challenges with brief explanations
- "What considerations / strategies / factors..." → Enumerate the specific items mentioned in context
- "How does X describe / explain / define..." → Provide a coherent summary of the description
- "What are the interdependencies / relationships..." → Describe the connections and their nature
ANSWER STRUCTURE:
- Lead with the direct answer to the question in your first sentence
- Use the question's key terms in your response (e.g., if asked about "lessons learned", say "lessons learned include...")
- For enumeration questions, provide numbered or bulleted items when context supports multiple points
Write naturally in flowing paragraphs. Do NOT include [1], [2] citations - they're added automatically.
Question: {user_prompt}
Answer (grounded only in context above):"""
# Variant 1: More direct but still grounded
VARIANT_1 = """Answer based ONLY on this context:
{context_data}
---
CRITICAL: Your answer will be evaluated on two dimensions:
1. FAITHFULNESS - Every claim must be directly from the context (no hallucination)
2. RELEVANCE - Your answer must directly address what the question asks
ANSWER RULES:
- First sentence: Directly answer the question using the question's key terms
- Only include facts explicitly stated in the context above
- If asked about specific tools/methods, name them from the context
- If the context doesn't cover something, say "The available information does not specify..."
FORMAT:
- "What were the X?""The key X were: (1)..., (2)..., (3)..."
- "How does Y describe Z?""According to Y, Z is described as..."
- "What challenges/issues?" → List the specific challenges mentioned
Question: {user_prompt}
Answer:"""
# Variant 2: Minimal, trusts the model more
VARIANT_2 = """Context:
{context_data}
---
Answer the question using ONLY information from the context above.
- Do not add anything not explicitly stated in the context
- If unsure, say "not specified in the available information"
Question: {user_prompt}
Answer:"""
# Variant 3: DSPy MIPRO-optimized (CoT + extraction focus)
VARIANT_MIPRO = """Context:
{context_data}
---
Given the context above and the question below, follow these steps:
1. **Extract** - Identify specific facts from the context relevant to the question
2. **Answer** - Produce a concise answer (2-4 key points) strictly based on extracted facts
RULES:
- Do NOT hallucinate or add information not present in the context
- If context lacks relevant information, state: "The available information does not address this"
- Use the question's key terms in your response
- For list questions, enumerate with (1), (2), (3)
Question: {user_prompt}
Answer:"""
# Variant 4: Hybrid - combines best of balanced + mipro insights
VARIANT_HYBRID = """Answer based ONLY on this context:
{context_data}
---
TASK: Answer the question using ONLY facts from the context above.
STEP 1 - Identify relevant facts:
Before answering, mentally note which specific facts from the context address the question.
STEP 2 - Structure your answer:
- Start with a direct answer using the question's key terms
- For "What were the X?" → List the X with brief explanations
- For "How does Y describe Z?" → Summarize Y's description of Z
- Use (1), (2), (3) for multiple points when appropriate
STEP 3 - Verify grounding:
- Every claim must come from the context
- If context doesn't cover something, say "The available information does not specify..."
- Don't add obvious-seeming details not explicitly stated
Question: {user_prompt}
Answer:"""
# Variant 5: Ultra-direct - focuses on answer structure only
VARIANT_DIRECT = """Context:
{context_data}
---
Answer the question below using ONLY the context above.
ANSWER FORMAT:
- First sentence: Direct answer using the question's key terms
- Then: Supporting details from context (use numbered points for lists)
- Do not include information not in the context
Question: {user_prompt}
Answer:"""
# Variant 6: Question-type aware
VARIANT_QTYPE = """Context:
{context_data}
---
Answer using ONLY the context above. Match your response to the question type:
IF "What were the lessons/challenges/considerations..." → Enumerate: "(1)..., (2)..., (3)..."
IF "How does X describe/explain..." → Summarize what X says about the topic
IF "What are the relationships/interdependencies..." → Describe the connections
RULES:
- Every fact must be from the context above
- Use the question's terminology in your answer
- If information is missing, acknowledge it
Question: {user_prompt}
Answer:"""
PROMPT_VARIANTS = {
'baseline': BASELINE_PROMPT,
'balanced': VARIANT_1,
'minimal': VARIANT_2,
'mipro': VARIANT_MIPRO,
'hybrid': VARIANT_HYBRID,
'direct': VARIANT_DIRECT,
'qtype': VARIANT_QTYPE,
}
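# Every template must expose {context_data} and {user_prompt} placeholders;
# test_variant() fills them with str.format().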
# ============================================================================
# Core Functions
# ============================================================================
@dataclass
class TestResult:
variant: str
query: str
answer: str
faithfulness: float
relevance: float
latency_ms: float
async def call_llm(prompt: str, client: AsyncOpenAI) -> tuple[str, float]:
"""Call LLM and return response + latency"""
start = time.perf_counter()
response = await client.chat.completions.create(
model=os.getenv('LLM_MODEL', 'gpt-4o-mini'),
messages=[{'role': 'user', 'content': prompt}],
temperature=0.1,
max_tokens=2000,
)
latency = (time.perf_counter() - start) * 1000
return response.choices[0].message.content, latency
async def get_context_from_server(query: str, server_url: str = 'http://localhost:9621') -> str:
"""Get context from running LightRAG server (only_need_context mode)"""
import httpx
async with httpx.AsyncClient(timeout=60) as client:
response = await client.post(
f'{server_url}/query',
json={
'query': query,
'mode': 'mix',
'only_need_context': True,
}
)
data = response.json()
return data.get('response', '')
def run_ragas_eval(question: str, answer: str, context: str, ground_truth: str) -> tuple[float, float]:
"""Run RAGAS evaluation and return (faithfulness, relevance)"""
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# Create dataset
data = {
'question': [question],
'answer': [answer],
'contexts': [[context]],
'ground_truth': [ground_truth],
}
dataset = Dataset.from_dict(data)
# Create evaluation LLM
llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)
embeddings = OpenAIEmbeddings(model='text-embedding-3-large')
# Run evaluation
result = evaluate(
dataset,
metrics=[faithfulness, answer_relevancy],
llm=llm,
embeddings=embeddings,
)
# Extract first element since we're evaluating single samples
faith = result['faithfulness'][0] if isinstance(result['faithfulness'], list) else result['faithfulness']
relevance = result['answer_relevancy'][0] if isinstance(result['answer_relevancy'], list) else result['answer_relevancy']
return float(faith), float(relevance)
async def test_variant(
variant_name: str,
prompt_template: str,
queries: list[dict],
client: AsyncOpenAI,
server_url: str,
) -> list[TestResult]:
"""Test a single prompt variant on all queries"""
results = []
for i, q in enumerate(queries):
question = q['question']
ground_truth = q['ground_truth']
print(f' [{variant_name}] Query {i+1}/{len(queries)}: {question[:50]}...')
# Get context from server
context = await get_context_from_server(question, server_url)
if not context or context == 'No relevant context found for the query.':
print(f' ⚠️ No context retrieved, skipping')
continue
# Build prompt and call LLM
prompt = prompt_template.format(
context_data=context,
user_prompt=question,
)
answer, latency = await call_llm(prompt, client)
# Run RAGAS eval
try:
faith, relevance = run_ragas_eval(question, answer, context, ground_truth)
except Exception as e:
print(f' ⚠️ RAGAS eval failed: {e}')
faith, relevance = 0.0, 0.0
results.append(TestResult(
variant=variant_name,
query=question[:60],
answer=answer[:200],
faithfulness=faith,
relevance=relevance,
latency_ms=latency,
))
print(f' Faith: {faith:.3f} | Relevance: {relevance:.3f} | {latency:.0f}ms')
return results
def print_summary(all_results: dict[str, list[TestResult]]):
"""Print comparison summary"""
print('\n' + '='*80)
print('📊 PROMPT VARIANT COMPARISON')
print('='*80)
print(f"\n{'Variant':<12} | {'Faithfulness':>12} | {'Relevance':>12} | {'Avg Latency':>12}")
print('-'*55)
for variant, results in all_results.items():
if not results:
continue
avg_faith = sum(r.faithfulness for r in results) / len(results)
avg_rel = sum(r.relevance for r in results) / len(results)
avg_lat = sum(r.latency_ms for r in results) / len(results)
print(f'{variant:<12} | {avg_faith:>12.3f} | {avg_rel:>12.3f} | {avg_lat:>10.0f}ms')
print('='*80)
# Per-query breakdown
print('\n📋 Per-Query Results:')
    first_results = list(all_results.values())[0] if all_results else []
    for i in range(len(first_results)):
        query = first_results[i].query
print(f'\nQ{i+1}: {query}')
for variant, results in all_results.items():
if i < len(results):
r = results[i]
print(f' {variant:<10}: Faith={r.faithfulness:.2f} Rel={r.relevance:.2f}')
async def main():
parser = argparse.ArgumentParser(description='Fast prompt A/B testing')
parser.add_argument('--num-queries', '-n', type=int, default=3, help='Number of queries to test')
parser.add_argument('--indices', '-i', type=str, help='Specific query indices (comma-separated)')
parser.add_argument('--server', '-s', type=str, default='http://localhost:9621', help='LightRAG server URL')
parser.add_argument('--variants', '-v', type=str, help='Variants to test (comma-separated)')
args = parser.parse_args()
# Load test dataset
dataset_path = Path(__file__).parent / 'pharma_test_dataset.json'
with open(dataset_path) as f:
data = json.load(f)
# Handle both formats: dict with 'test_cases' key or direct list
if isinstance(data, dict) and 'test_cases' in data:
dataset = data['test_cases']
else:
dataset = data
# Select queries
if args.indices:
indices = [int(i) for i in args.indices.split(',')]
queries = [dataset[i] for i in indices if i < len(dataset)]
else:
queries = dataset[:args.num_queries]
print(f'🧪 Testing {len(queries)} queries')
# Select variants
if args.variants:
variant_names = args.variants.split(',')
variants = {k: v for k, v in PROMPT_VARIANTS.items() if k in variant_names}
else:
variants = PROMPT_VARIANTS
print(f'📝 Testing variants: {list(variants.keys())}')
# Create OpenAI client
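    # LLM_BINDING_API_KEY takes precedence, falling back to OPENAI_API_KEY;
    # LLM_BINDING_HOST overrides the default OpenAI base URL.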
client = AsyncOpenAI(
api_key=os.getenv('LLM_BINDING_API_KEY') or os.getenv('OPENAI_API_KEY'),
base_url=os.getenv('LLM_BINDING_HOST', 'https://api.openai.com/v1'),
)
# Test each variant
all_results = {}
for variant_name, template in variants.items():
print(f'\n🔄 Testing variant: {variant_name}')
results = await test_variant(variant_name, template, queries, client, args.server)
all_results[variant_name] = results
# Print summary
print_summary(all_results)
if __name__ == '__main__':
asyncio.run(main())