LightRAG/lightrag/evaluation/regenerate_ground_truths.py
clssck 59e89772de refactor: consolidate to PostgreSQL-only backend and modernize stack
Remove legacy storage implementations and deprecated examples:
- Delete FAISS, JSON, Memgraph, Milvus, MongoDB, Nano Vector DB, Neo4j, NetworkX, Qdrant, Redis storage backends
- Remove Kubernetes deployment manifests and installation scripts
- Delete unofficial examples for deprecated backends and offline deployment docs
Streamline core infrastructure:
- Consolidate storage layer to PostgreSQL-only implementation
- Add full-text search caching with FTS cache module
- Implement metrics collection and monitoring pipeline
- Add explain and metrics API routes
Modernize frontend and tooling:
- Switch web UI to Bun with bun.lock, remove npm and pnpm lockfiles
- Update Dockerfile for PostgreSQL-only deployment
- Add Makefile for common development tasks
- Update environment and configuration examples
Enhance evaluation and testing capabilities:
- Add prompt optimization with DSPy and auto-tuning
- Implement ground truth regeneration and variant testing
- Add prompt debugging and response comparison utilities
- Expand test coverage with new integration scenarios
Simplify dependencies and configuration:
- Remove offline-specific requirement files
- Update pyproject.toml with streamlined dependencies
- Add Python version pinning with .python-version
- Create project guidelines in CLAUDE.md and AGENTS.md
2025-12-12 16:28:49 +01:00

347 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Regenerate ground truths for pharma_test_dataset.json based on actual LightRAG context.
This script:
1. Reads each question from the test dataset
2. Queries LightRAG to get the actual retrieved context
3. Uses an LLM to generate ground truth ONLY from that context
4. Saves the updated dataset
This ensures ground truths match what LightRAG can actually retrieve,
making RAGAS evaluation meaningful.
Configuration is loaded from the project's .env file (same as LightRAG service).
"""
import asyncio
import json
import os
import re
from pathlib import Path
import httpx
from dotenv import load_dotenv
from openai import AsyncOpenAI
# Load .env from project root (same config as LightRAG service)
PROJECT_ROOT = Path(__file__).parent.parent.parent
load_dotenv(PROJECT_ROOT / '.env')
# Configuration - use same env vars as LightRAG service
LIGHTRAG_ENDPOINT = os.getenv('LIGHTRAG_ENDPOINT', 'http://localhost:9621')
LIGHTRAG_WORKSPACE = os.getenv('LIGHTRAG_WORKSPACE', 'default')
# LLM Configuration (same as LightRAG service)
LLM_BINDING = os.getenv('LLM_BINDING', 'openai')
LLM_MODEL = os.getenv('LLM_MODEL', 'gpt-4o-mini')
LLM_BINDING_HOST = os.getenv('LLM_BINDING_HOST', 'https://api.openai.com/v1')
LLM_BINDING_API_KEY = os.getenv('LLM_BINDING_API_KEY') or os.getenv('OPENAI_API_KEY')
# Create OpenAI client with same config as service
client = AsyncOpenAI(
api_key=LLM_BINDING_API_KEY,
base_url=LLM_BINDING_HOST,
)
GROUND_TRUTH_PROMPT = """You are generating a ground truth answer for RAG evaluation.
Based ONLY on the following retrieved context, write a factual answer to the question.
- Include ONLY information that appears explicitly in the context
- Do not add any information not present in the context
- Do not speculate or infer beyond what is stated
- If the context doesn't fully answer the question, only include what IS covered
- Write in a clear, factual style (2-4 sentences)
Question: {question}
Retrieved Context:
{context}
Ground Truth Answer:"""
SIMPLIFY_QUERY_PROMPT = """Extract key search terms from this question for a RAG system query.
Return only the most important nouns, proper nouns, and key phrases.
Separate terms with spaces. No punctuation, no articles, no filler words.
Question: {question}
Key search terms:"""
async def simplify_question(question: str) -> str:
"""Use LLM to simplify the question into key search terms."""
response = await client.chat.completions.create(
model=LLM_MODEL,
messages=[{'role': 'user', 'content': SIMPLIFY_QUERY_PROMPT.format(question=question)}],
temperature=0,
max_tokens=100,
)
content = response.choices[0].message.content
return content.strip() if content else question
def extract_key_terms(question: str) -> str:
"""Extract key terms from a question using simple heuristics.
This is a fast fallback when we don't want to call the LLM.
"""
# Remove common question words and punctuation
stopwords = {
'what',
'were',
'was',
'are',
'is',
'the',
'a',
'an',
'how',
'does',
'do',
'did',
'which',
'who',
'when',
'where',
'why',
'from',
'to',
'in',
'on',
'at',
'for',
'of',
'with',
'by',
'and',
'or',
'that',
'this',
'these',
'those',
'according',
'based',
'should',
'influence',
'main',
'key',
'critical',
'important',
}
# Clean and tokenize
words = re.findall(r'\b\w+\b', question.lower())
# Keep important terms (not stopwords, keep numbers and capitalized terms)
key_terms = []
for word in words:
# Check if original had capitalization (proper noun)
if word not in stopwords and (word.upper() == word or any(c.isupper() for c in question) or len(word) > 3):
key_terms.append(word)
return ' '.join(key_terms[:12]) # Limit to top 12 terms
async def query_lightrag(query: str, workspace: str = 'default', timeout: float = 120.0) -> dict:
"""Query LightRAG and get the response with context."""
async with httpx.AsyncClient(timeout=timeout) as http_client:
response = await http_client.post(
f'{LIGHTRAG_ENDPOINT}/query',
json={
'query': query,
'mode': 'mix',
'workspace': workspace,
'include_chunk_content': True,
'include_references': True,
},
)
response.raise_for_status()
return response.json()
async def generate_ground_truth(question: str, context: str) -> str:
"""Use LLM to generate ground truth from actual context."""
prompt = GROUND_TRUTH_PROMPT.format(question=question, context=context)
response = await client.chat.completions.create(
model=LLM_MODEL,
messages=[{'role': 'user', 'content': prompt}],
temperature=0.1, # Low temperature for factual output
max_tokens=500,
)
content = response.choices[0].message.content
return content.strip() if content else ''
def extract_context_text(rag_response: dict) -> tuple[str, int]:
"""Extract readable context from LightRAG response.
The API returns references like:
{
"references": [
{"reference_id": "1", "file_path": "...", "content": ["chunk1", "chunk2"]},
...
]
}
where 'content' is a list of chunk strings from that file.
Returns:
tuple: (context_string, chunk_count)
"""
context_parts = []
# Get references with embedded chunk content
if rag_response.get('references'):
for ref in rag_response['references']:
if isinstance(ref, dict):
content = ref.get('content')
if content:
# content is a list of chunk strings
if isinstance(content, list):
for chunk in content:
if isinstance(chunk, str) and chunk.strip():
context_parts.append(chunk.strip())
elif isinstance(content, str) and content.strip():
context_parts.append(content.strip())
# Limit to first 15 chunks with separator
limited_parts = context_parts[:15]
return '\n\n---\n\n'.join(limited_parts), len(context_parts)
async def query_with_retry(question: str, workspace: str = 'default', max_retries: int = 2) -> tuple[str, str, int]:
"""Query LightRAG with retry using simplified query if no context found.
Returns:
tuple: (context, query_used, chunk_count)
"""
# First attempt with original question
rag_result = await query_lightrag(question, workspace)
context, chunk_count = extract_context_text(rag_result)
if chunk_count > 0:
return context, question, chunk_count
# Retry with simplified key terms
for retry in range(max_retries):
if retry == 0:
# First retry: extract key terms with heuristics (fast)
simplified = extract_key_terms(question)
else:
# Second retry: use LLM to simplify (slower but smarter)
simplified = await simplify_question(question)
if simplified and simplified != question:
print(f' Retry {retry + 1}: Using simplified query: {simplified[:60]}...')
rag_result = await query_lightrag(simplified, workspace)
context, chunk_count = extract_context_text(rag_result)
if chunk_count > 0:
return context, simplified, chunk_count
# Fallback: use the LLM response as context (it's based on retrieved info)
response_text = rag_result.get('response', '')
if response_text and '[no-context]' not in response_text:
return f'[LLM Response]: {response_text}', question, 0
return '', question, 0
async def regenerate_dataset(input_path: str, output_path: str, workspace: str = 'default') -> None:
"""Regenerate all ground truths in the dataset."""
# Load original dataset
with open(input_path) as f:
dataset = json.load(f)
test_cases = dataset.get('test_cases', [])
print(f'Processing {len(test_cases)} test cases...')
updated_cases = []
stats = {'success': 0, 'retry_success': 0, 'no_context': 0, 'errors': 0}
for i, case in enumerate(test_cases):
question = case['question']
print(f'\n[{i + 1}/{len(test_cases)}] Processing: {question[:60]}...')
try:
# Query LightRAG for actual context with retry
context, query_used, chunk_count = await query_with_retry(question, workspace)
if chunk_count > 0:
if query_used == question:
stats['success'] += 1
else:
stats['retry_success'] += 1
print(f' ✓ Found {chunk_count} chunks')
else:
stats['no_context'] += 1
print(' ✗ No chunk content retrieved')
if not context:
# Generate a "no info available" ground truth
ground_truth = (
f'The retrieved context does not provide information '
f'to answer this question about {question[:50]}...'
)
else:
# Generate ground truth from actual context
ground_truth = await generate_ground_truth(question, context)
print(f' Original GT: {case["ground_truth"][:80]}...')
print(f' New GT: {ground_truth[:80]}...')
# Update the case
updated_case = {
'question': question,
'ground_truth': ground_truth,
'project': case.get('project', 'pharma_evaluation'),
}
updated_cases.append(updated_case)
except Exception as e:
stats['errors'] += 1
print(f' Error: {e}')
# Keep original on error
updated_cases.append(case)
# Save updated dataset
output_data = {'test_cases': updated_cases}
with open(output_path, 'w') as f:
json.dump(output_data, f, indent=2)
print(f'\n{"=" * 50}')
print(f'✅ Updated dataset saved to: {output_path}')
print('\nStats:')
print(f' - Direct success: {stats["success"]}')
print(f' - Retry success: {stats["retry_success"]}')
print(f' - No context: {stats["no_context"]}')
print(f' - Errors: {stats["errors"]}')
async def main():
script_dir = Path(__file__).parent
input_path = script_dir / 'pharma_test_dataset_original.json'
output_path = script_dir / 'pharma_test_dataset.json'
if not input_path.exists():
# If no backup exists, use current as input
input_path = output_path
print(f'{"=" * 60}')
print('Ground Truth Regeneration')
print(f'{"=" * 60}')
print(f'Input: {input_path}')
print(f'Output: {output_path}')
print(f'LightRAG: {LIGHTRAG_ENDPOINT}')
print(f'Workspace: {LIGHTRAG_WORKSPACE}')
print(f'LLM Host: {LLM_BINDING_HOST}')
print(f'LLM Model: {LLM_MODEL}')
print(f'{"=" * 60}')
print()
await regenerate_dataset(str(input_path), str(output_path), LIGHTRAG_WORKSPACE)
if __name__ == '__main__':
asyncio.run(main())