Remove legacy storage implementations and deprecated examples:
- Delete FAISS, JSON, Memgraph, Milvus, MongoDB, Nano Vector DB, Neo4j, NetworkX, Qdrant, Redis storage backends
- Remove Kubernetes deployment manifests and installation scripts
- Delete unofficial examples for deprecated backends and offline deployment docs

Streamline core infrastructure:
- Consolidate storage layer to PostgreSQL-only implementation
- Add full-text search caching with FTS cache module
- Implement metrics collection and monitoring pipeline
- Add explain and metrics API routes

Modernize frontend and tooling:
- Switch web UI to Bun with bun.lock, remove npm and pnpm lockfiles
- Update Dockerfile for PostgreSQL-only deployment
- Add Makefile for common development tasks
- Update environment and configuration examples

Enhance evaluation and testing capabilities:
- Add prompt optimization with DSPy and auto-tuning (see the sketch below)
- Implement ground truth regeneration and variant testing
- Add prompt debugging and response comparison utilities
- Expand test coverage with new integration scenarios

Simplify dependencies and configuration:
- Remove offline-specific requirement files
- Update pyproject.toml with streamlined dependencies
- Add Python version pinning with .python-version
- Create project guidelines in CLAUDE.md and AGENTS.md
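
Of the changes above, the DSPy-based prompt auto-tuning is the least self-explanatory. The sketch below shows the general shape of such a tuning loop for the keyword-extraction prompt exercised by the accuracy tests that follow; it is illustrative only, and the signature, metric, and training example are hypothetical stand-ins rather than code from this repository.

```python
# Hypothetical sketch of DSPy auto-tuning for keyword extraction.
# All names below are illustrative; none come from the repo itself.
import dspy

dspy.configure(lm=dspy.LM('openai/gpt-4o-mini'))


class ExtractKeywords(dspy.Signature):
    """Extract high- and low-level keywords from a user query."""

    query: str = dspy.InputField()
    high_level_keywords: list[str] = dspy.OutputField()
    low_level_keywords: list[str] = dspy.OutputField()


def keyword_recall(example, pred, trace=None):
    """Fraction of expected keywords that the prediction recovered."""
    expected = set(example.high_level_keywords + example.low_level_keywords)
    got = set(pred.high_level_keywords + pred.low_level_keywords)
    return len(expected & got) / max(len(expected), 1)


trainset = [
    dspy.Example(
        query='What are the main causes of climate change?',
        high_level_keywords=['climate change', 'causes'],
        low_level_keywords=['greenhouse gases'],
    ).with_inputs('query'),
]

optimizer = dspy.BootstrapFewShot(metric=keyword_recall)
tuned = optimizer.compile(dspy.Predict(ExtractKeywords), trainset=trainset)
```

The test file below then serves as the acceptance gate for whichever prompt variant such tuning produces.
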
tests/test_prompt_accuracy.py · 373 lines · 12 KiB · Python
"""
|
|
Accuracy tests for optimized prompts.
|
|
Validates that optimized prompts produce correct, parseable outputs.
|
|
|
|
Run with: uv run --extra test python tests/test_prompt_accuracy.py
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from lightrag.prompt import PROMPTS
|
|
|
|
# =============================================================================
# Test Data
# =============================================================================

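# Conventions for the fixtures below (inferred from how the tests use them):
#   - 'expected_high' holds abstract themes, 'expected_low' holds concrete
#     entities/specifics; a match on either side counts toward a pass.
#   - Empty expected lists mark a trivial query that should yield no keywords.
#   - 'must_not_contain' flags artifacts (e.g. citation markers) that the
#     prompt is expected to suppress.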
KEYWORD_TEST_QUERIES = [
    {
        'query': 'What are the main causes of climate change and how do they affect polar ice caps?',
        'expected_high': ['climate change', 'causes', 'effects'],
        'expected_low': ['polar ice caps', 'greenhouse'],
    },
    {
        'query': "How did Apple's iPhone sales compare to Samsung Galaxy in Q3 2024?",
        'expected_high': ['sales comparison', 'smartphone'],
        'expected_low': ['Apple', 'iPhone', 'Samsung', 'Galaxy', 'Q3 2024'],
    },
    {
        'query': 'hello',  # Trivial - should return empty
        'expected_high': [],
        'expected_low': [],
    },
]

ORPHAN_TEST_CASES = [
    {
        'orphan': {
            'name': 'Pfizer',
            'type': 'organization',
            'desc': 'Pharmaceutical company that developed COVID-19 vaccine',
        },
        'candidate': {
            'name': 'Moderna',
            'type': 'organization',
            'desc': 'Biotechnology company that developed mRNA COVID-19 vaccine',
        },
        'should_connect': True,
        'reason': 'Both are COVID-19 vaccine developers',
    },
    {
        'orphan': {
            'name': 'Mount Everest',
            'type': 'location',
            'desc': 'Highest mountain in the world, located in the Himalayas',
        },
        'candidate': {
            'name': 'Python Programming',
            'type': 'concept',
            'desc': 'Popular programming language used for data science',
        },
        'should_connect': False,
        'reason': 'No logical connection between mountain and programming language',
    },
]

SUMMARIZATION_TEST_CASES = [
    {
        'name': 'Albert Einstein',
        'type': 'Entity',
        'descriptions': [
            '{"description": "Albert Einstein was a German-born theoretical physicist."}',
            '{"description": "Einstein developed the theory of relativity and won the Nobel Prize in Physics in 1921."}',
            '{"description": "He is widely regarded as one of the most influential scientists of the 20th century."}',
        ],
        'must_contain': ['physicist', 'relativity', 'Nobel Prize', 'influential'],
    },
]

RAG_TEST_CASES = [
    {
        'query': 'What is the capital of France?',
        'context': 'Paris is the capital and largest city of France. It has a population of over 2 million people.',
        'must_contain': ['Paris'],
        'must_not_contain': ['[1]', '[2]', 'References'],
    },
]


# =============================================================================
# Helper Functions
# =============================================================================


async def call_llm(prompt: str, model: str = 'gpt-4o-mini') -> str:
    """Call OpenAI API with a single prompt.

    Requires OPENAI_API_KEY in the environment; temperature is pinned to 0.0
    to keep outputs as reproducible as possible.
    """
    import openai

    client = openai.AsyncOpenAI()
    response = await client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0.0,
    )
    # message.content is Optional[str]; normalize None to '' to honor the
    # declared return type.
    return response.choices[0].message.content or ''


@dataclass
class TestResult:
    name: str
    passed: bool
    details: str
    raw_output: str = ''


# =============================================================================
# Test Functions
# =============================================================================


async def test_keywords_extraction() -> list[TestResult]:
    """Test keywords extraction prompt."""
    results = []

    examples = '\n'.join(PROMPTS['keywords_extraction_examples'])

    for case in KEYWORD_TEST_QUERIES:
        prompt = PROMPTS['keywords_extraction'].format(examples=examples, query=case['query'])

        output = await call_llm(prompt)

        # Try to parse JSON
        try:
            # Clean potential markdown
            clean = output.strip()
            if clean.startswith('```'):
                clean = clean.split('```')[1]
            if clean.startswith('json'):
                clean = clean[4:]

            parsed = json.loads(clean)

            has_high = 'high_level_keywords' in parsed
            has_low = 'low_level_keywords' in parsed
            is_list_high = isinstance(parsed.get('high_level_keywords'), list)
            is_list_low = isinstance(parsed.get('low_level_keywords'), list)

            if has_high and has_low and is_list_high and is_list_low:
                # Check if trivial query returns empty
                if case['expected_high'] == [] and case['expected_low'] == []:
                    passed = len(parsed['high_level_keywords']) == 0 and len(parsed['low_level_keywords']) == 0
                    details = 'Empty lists returned for trivial query' if passed else f'Non-empty for trivial: {parsed}'
                else:
                    # Check that some expected keywords are present (case-insensitive)
                    high_lower = [k.lower() for k in parsed['high_level_keywords']]
                    low_lower = [k.lower() for k in parsed['low_level_keywords']]
                    all_keywords = ' '.join(high_lower + low_lower)

                    found_high = sum(1 for exp in case['expected_high'] if exp.lower() in all_keywords)
                    found_low = sum(1 for exp in case['expected_low'] if exp.lower() in all_keywords)

                    passed = found_high > 0 or found_low > 0
                    details = f'Found {found_high}/{len(case["expected_high"])} high, {found_low}/{len(case["expected_low"])} low'
            else:
                passed = False
                details = f'Missing keys or wrong types: has_high={has_high}, has_low={has_low}'

        except json.JSONDecodeError as e:
            passed = False
            details = f'JSON parse error: {e}'

        results.append(
            TestResult(
                name=f'Keywords: {case["query"][:40]}...', passed=passed, details=details, raw_output=output[:200]
            )
        )

    return results


async def test_orphan_validation() -> list[TestResult]:
    """Test orphan connection validation prompt."""
    results = []

    for case in ORPHAN_TEST_CASES:
        prompt = PROMPTS['orphan_connection_validation'].format(
            orphan_name=case['orphan']['name'],
            orphan_type=case['orphan']['type'],
            orphan_description=case['orphan']['desc'],
            candidate_name=case['candidate']['name'],
            candidate_type=case['candidate']['type'],
            candidate_description=case['candidate']['desc'],
            similarity_score=0.85,
        )

        output = await call_llm(prompt)

        try:
            # Clean potential markdown
            clean = output.strip()
            if clean.startswith('```'):
                clean = clean.split('```')[1]
            if clean.startswith('json'):
                clean = clean[4:]

            parsed = json.loads(clean)

            has_should_connect = 'should_connect' in parsed
            has_confidence = 'confidence' in parsed
            has_reasoning = 'reasoning' in parsed

            if has_should_connect and has_confidence and has_reasoning:
                correct_decision = parsed['should_connect'] == case['should_connect']
                valid_confidence = 0.0 <= parsed['confidence'] <= 1.0

                passed = correct_decision and valid_confidence
                details = f'Decision: {parsed["should_connect"]} (expected {case["should_connect"]}), confidence: {parsed["confidence"]:.2f}'
            else:
                passed = False
                details = f'Missing keys: should_connect={has_should_connect}, confidence={has_confidence}, reasoning={has_reasoning}'

        except json.JSONDecodeError as e:
            passed = False
            details = f'JSON parse error: {e}'

        results.append(
            TestResult(
                name=f'Orphan: {case["orphan"]["name"]} ↔ {case["candidate"]["name"]}',
                passed=passed,
                details=details,
                raw_output=output[:200],
            )
        )

    return results


async def test_entity_summarization() -> list[TestResult]:
    """Test entity summarization prompt."""
    results = []

    for case in SUMMARIZATION_TEST_CASES:
        prompt = PROMPTS['summarize_entity_descriptions'].format(
            description_name=case['name'],
            description_type=case['type'],
            description_list='\n'.join(case['descriptions']),
            summary_length=200,
            language='English',
        )

        output = await call_llm(prompt)

        # Check if required terms are present
        output_lower = output.lower()
        found = [term for term in case['must_contain'] if term.lower() in output_lower]
        missing = [term for term in case['must_contain'] if term.lower() not in output_lower]

        # Check it's not empty and mentions the entity
        has_content = len(output.strip()) > 50
        mentions_entity = case['name'].lower() in output_lower

        passed = len(found) >= len(case['must_contain']) // 2 and has_content and mentions_entity
        details = f'Found {len(found)}/{len(case["must_contain"])} terms, mentions entity: {mentions_entity}'
        if missing:
            details += f', missing: {missing}'

        results.append(
            TestResult(name=f'Summarize: {case["name"]}', passed=passed, details=details, raw_output=output[:200])
        )

    return results


async def test_naive_rag_response() -> list[TestResult]:
    """Test naive RAG response prompt."""
    results = []

    for case in RAG_TEST_CASES:
        prompt = PROMPTS['naive_rag_response'].format(
            response_type='concise paragraph',
            user_prompt=case['query'],
            content_data=case['context'],
        )

        output = await call_llm(prompt)

        # Check must_contain
        output_lower = output.lower()
        found = [term for term in case['must_contain'] if term.lower() in output_lower]

        # Check must_not_contain (citation markers)
        violations = [term for term in case['must_not_contain'] if term in output]

        passed = len(found) == len(case['must_contain']) and len(violations) == 0
        details = f'Found {len(found)}/{len(case["must_contain"])} required terms'
        if violations:
            details += f', VIOLATIONS: {violations}'

        results.append(
            TestResult(name=f'RAG: {case["query"][:40]}', passed=passed, details=details, raw_output=output[:200])
        )

    return results


# =============================================================================
# Main
# =============================================================================


async def main() -> None:
    """Run all accuracy tests."""
    print('\n' + '=' * 70)
    print(' PROMPT ACCURACY TESTS')
    print('=' * 70)

    all_results = []

    # Run tests in parallel
    print('\nRunning tests...')

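    # The four suites run concurrently; each makes live OpenAI calls, so even
    # at temperature 0.0 outputs can vary slightly between runs and the
    # occasional flaky failure is expected.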
    keywords_results, orphan_results, summarize_results, rag_results = await asyncio.gather(
        test_keywords_extraction(),
        test_orphan_validation(),
        test_entity_summarization(),
        test_naive_rag_response(),
    )

    all_results.extend(keywords_results)
    all_results.extend(orphan_results)
    all_results.extend(summarize_results)
    all_results.extend(rag_results)

    # Print results
    print('\n' + '-' * 70)
    print(' RESULTS')
    print('-' * 70)

    passed = 0
    failed = 0

    for result in all_results:
        status = '✓ PASS' if result.passed else '✗ FAIL'
        print(f'\n{status}: {result.name}')
        print(f'  {result.details}')
        if not result.passed:
            print(f'  Output: {result.raw_output}...')

        if result.passed:
            passed += 1
        else:
            failed += 1

    # Summary
    print('\n' + '=' * 70)
    print(f' SUMMARY: {passed}/{passed + failed} tests passed')
    print('=' * 70)

    if failed > 0:
        print('\n⚠️ Some tests failed - review prompt changes')
        sys.exit(1)
    else:
        print('\n✓ All prompts producing correct outputs!')


if __name__ == '__main__':
    asyncio.run(main())