diff --git a/env.example b/env.example index 7606c8db..43bc759b 100644 --- a/env.example +++ b/env.example @@ -50,6 +50,8 @@ OLLAMA_EMULATING_MODEL_TAG=latest # JWT_ALGORITHM=HS256 ### API-Key to access LightRAG Server API +### Use this key in HTTP requests with the 'X-API-Key' header +### Example: curl -H "X-API-Key: your-secure-api-key-here" http://localhost:9621/query # LIGHTRAG_API_KEY=your-secure-api-key-here # WHITELIST_PATHS=/health,/api/* @@ -73,16 +75,6 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=8000 ### control the maximum tokens send to LLM (include entities, relations and chunks) # MAX_TOTAL_TOKENS=30000 -### control the maximum chunk_ids stored in vector and graph db -# MAX_SOURCE_IDS_PER_ENTITY=300 -# MAX_SOURCE_IDS_PER_RELATION=300 -### control chunk_ids limitation method: KEEP, FIFO (KEPP: Ingore New Chunks, FIFO: New chunks replace old chunks) -# SOURCE_IDS_LIMIT_METHOD=KEEP - -### maximum number of related chunks per source entity or relation -### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) -### Higher values increase re-ranking time -# RELATED_CHUNK_NUMBER=5 ### chunk selection strategies ### VECTOR: Pick KG chunks by vector similarity, delivered chunks to the LLM aligning more closely with naive retrieval @@ -110,9 +102,6 @@ RERANK_BINDING=null # RERANK_MODEL=rerank-v3.5 # RERANK_BINDING_HOST=https://api.cohere.com/v2/rerank # RERANK_BINDING_API_KEY=your_rerank_api_key_here -### Cohere rerank chunking configuration (useful for models with token limits like ColBERT) -# RERANK_ENABLE_CHUNKING=true -# RERANK_MAX_TOKENS_PER_DOC=480 ### Default value for Jina AI # RERANK_MODEL=jina-reranker-v2-base-multilingual @@ -132,6 +121,9 @@ ENABLE_LLM_CACHE_FOR_EXTRACT=true ### Document processing output language: English, Chinese, French, German ... 
SUMMARY_LANGUAGE=English +### PDF decryption password for protected PDF files +# PDF_DECRYPT_PASSWORD=your_pdf_password_here + ### Entity types that the LLM will attempt to recognize # ENTITY_TYPES='["Person", "Creature", "Organization", "Location", "Event", "Concept", "Method", "Content", "Data", "Artifact", "NaturalObject"]' @@ -148,6 +140,22 @@ SUMMARY_LANGUAGE=English ### Maximum context size sent to LLM for description summary # SUMMARY_CONTEXT_SIZE=12000 +### control the maximum chunk_ids stored in vector and graph db +# MAX_SOURCE_IDS_PER_ENTITY=300 +# MAX_SOURCE_IDS_PER_RELATION=300 +### control chunk_ids limitation method: FIFO, KEEP +### FIFO: First in first out +### KEEP: Keep oldest (fewer merge actions and faster) +# SOURCE_IDS_LIMIT_METHOD=FIFO + +# Maximum number of file paths stored in entity/relation file_path field (For display only, does not affect query performance) +# MAX_FILE_PATHS=100 + +### maximum number of related chunks per source entity or relation +### The chunk picker uses this value to determine the total number of chunks selected from KG(knowledge graph) +### Higher values increase re-ranking time +# RELATED_CHUNK_NUMBER=5 + ############################### ### Concurrency Configuration ############################### @@ -386,3 +394,35 @@ MEMGRAPH_USERNAME= MEMGRAPH_PASSWORD= MEMGRAPH_DATABASE=memgraph # MEMGRAPH_WORKSPACE=forced_workspace_name + +############################ +### Evaluation Configuration +############################ +### RAGAS evaluation models (used for RAG quality assessment) +### āš ļø IMPORTANT: Both LLM and Embedding endpoints MUST be OpenAI-compatible +### Default uses OpenAI models for evaluation + +### LLM Configuration for Evaluation +# EVAL_LLM_MODEL=gpt-4o-mini +### API key for LLM evaluation (fallback to OPENAI_API_KEY if not set) +# EVAL_LLM_BINDING_API_KEY=your_api_key +### Custom OpenAI-compatible endpoint for LLM evaluation (optional) +# EVAL_LLM_BINDING_HOST=https://api.openai.com/v1 + +### 
Embedding Configuration for Evaluation +# EVAL_EMBEDDING_MODEL=text-embedding-3-large +### API key for embeddings (fallback: EVAL_LLM_BINDING_API_KEY -> OPENAI_API_KEY) +# EVAL_EMBEDDING_BINDING_API_KEY=your_embedding_api_key +### Custom OpenAI-compatible endpoint for embeddings (fallback: EVAL_LLM_BINDING_HOST) +# EVAL_EMBEDDING_BINDING_HOST=https://api.openai.com/v1 + +### Performance Tuning +### Number of concurrent test case evaluations +### Lower values reduce API rate limit issues but increase evaluation time +# EVAL_MAX_CONCURRENT=2 +### TOP_K query parameter of LightRAG (default: 10) +### Number of entities or relations retrieved from KG +# EVAL_QUERY_TOP_K=10 +### LLM request retry and timeout settings for evaluation +# EVAL_LLM_MAX_RETRIES=5 +# EVAL_LLM_TIMEOUT=180 diff --git a/lightrag/evaluation/README.md b/lightrag/evaluation/README.md index 2294c027..de1845ef 100644 --- a/lightrag/evaluation/README.md +++ b/lightrag/evaluation/README.md @@ -1,12 +1,8 @@ -# šŸ“Š LightRAG Evaluation Framework - -RAGAS-based offline evaluation of your LightRAG system. +# šŸ“Š RAGAS-based Evaluation Framework ## What is RAGAS? -**RAGAS** (Retrieval Augmented Generation Assessment) is a framework for reference-free evaluation of RAG systems using LLMs. - -Instead of requiring human-annotated ground truth, RAGAS uses state-of-the-art evaluation metrics: +**RAGAS** (Retrieval Augmented Generation Assessment) is a framework for reference-free evaluation of RAG systems using LLMs. RAGAS uses state-of-the-art evaluation metrics: ### Core Metrics @@ -18,9 +14,7 @@ Instead of requiring human-annotated ground truth, RAGAS uses state-of-the-art e | **Context Precision** | Is retrieved context clean without irrelevant noise? 
| > 0.80 | | **RAGAS Score** | Overall quality metric (average of above) | > 0.80 | ---- - -## šŸ“ Structure +### šŸ“ LightRAG Evaluation Framework Directory Structure ``` lightrag/evaluation/ @@ -42,7 +36,7 @@ lightrag/evaluation/ **Quick Test:** Index files from `sample_documents/` into LightRAG, then run the evaluator to reproduce results (~89-100% RAGAS score per question). ---- + ## šŸš€ Quick Start @@ -55,20 +49,35 @@ pip install ragas datasets langfuse Or use your project dependencies (already included in pyproject.toml): ```bash -pip install -e ".[offline-llm]" +pip install -e ".[evaluation]" ``` ### 2. Run Evaluation +**Basic usage (uses defaults):** ```bash cd /path/to/LightRAG -python -m lightrag.evaluation.eval_rag_quality +python lightrag/evaluation/eval_rag_quality.py ``` -Or directly: - +**Specify custom dataset:** ```bash -python lightrag/evaluation/eval_rag_quality.py +python lightrag/evaluation/eval_rag_quality.py --dataset my_test.json +``` + +**Specify custom RAG endpoint:** ```bash +python lightrag/evaluation/eval_rag_quality.py --ragendpoint http://my-server.com:9621 +``` + +**Specify both (short form):** +```bash +python lightrag/evaluation/eval_rag_quality.py -d my_test.json -r http://localhost:9621 +``` + +**Get help:** +```bash +python lightrag/evaluation/eval_rag_quality.py --help ``` ### 3. 
View Results @@ -87,7 +96,179 @@ results/ - šŸ“‹ Individual test case results - šŸ“ˆ Performance breakdown by question ---- + + +## šŸ“‹ Command-Line Arguments + +The evaluation script supports command-line arguments for easy configuration: + +| Argument | Short | Default | Description | +|----------|-------|---------|-------------| +| `--dataset` | `-d` | `sample_dataset.json` | Path to test dataset JSON file | +| `--ragendpoint` | `-r` | `http://localhost:9621` or `$LIGHTRAG_API_URL` | LightRAG API endpoint URL | + +### Usage Examples + +**Use default dataset and endpoint:** +```bash +python lightrag/evaluation/eval_rag_quality.py +``` + +**Custom dataset with default endpoint:** +```bash +python lightrag/evaluation/eval_rag_quality.py --dataset path/to/my_dataset.json +``` + +**Default dataset with custom endpoint:** +```bash +python lightrag/evaluation/eval_rag_quality.py --ragendpoint http://my-server.com:9621 +``` + +**Custom dataset and endpoint:** +```bash +python lightrag/evaluation/eval_rag_quality.py -d my_dataset.json -r http://localhost:9621 +``` + +**Absolute path to dataset:** +```bash +python lightrag/evaluation/eval_rag_quality.py -d /path/to/custom_dataset.json +``` + +**Show help message:** +```bash +python lightrag/evaluation/eval_rag_quality.py --help +``` + + + +## āš™ļø Configuration + +### Environment Variables + +The evaluation framework supports customization through environment variables: + +**āš ļø IMPORTANT: Both LLM and Embedding endpoints MUST be OpenAI-compatible** +- The RAGAS framework requires OpenAI-compatible API interfaces +- Custom endpoints must implement the OpenAI API format (e.g., vLLM, SGLang, LocalAI) +- Non-compatible endpoints will cause evaluation failures + +| Variable | Default | Description | +|----------|---------|-------------| +| **LLM Configuration** | | | +| `EVAL_LLM_MODEL` | `gpt-4o-mini` | LLM model used for RAGAS evaluation | +| `EVAL_LLM_BINDING_API_KEY` | falls back to `OPENAI_API_KEY` | API key for 
LLM evaluation | +| `EVAL_LLM_BINDING_HOST` | (optional) | Custom OpenAI-compatible endpoint URL for LLM | +| **Embedding Configuration** | | | +| `EVAL_EMBEDDING_MODEL` | `text-embedding-3-large` | Embedding model for evaluation | +| `EVAL_EMBEDDING_BINDING_API_KEY` | falls back to `EVAL_LLM_BINDING_API_KEY` → `OPENAI_API_KEY` | API key for embeddings | +| `EVAL_EMBEDDING_BINDING_HOST` | falls back to `EVAL_LLM_BINDING_HOST` | Custom OpenAI-compatible endpoint URL for embeddings | +| **Performance Tuning** | | | +| `EVAL_MAX_CONCURRENT` | 2 | Number of concurrent test case evaluations (1=serial) | +| `EVAL_QUERY_TOP_K` | 10 | Number of documents to retrieve per query | +| `EVAL_LLM_MAX_RETRIES` | 5 | Maximum LLM request retries | +| `EVAL_LLM_TIMEOUT` | 180 | LLM request timeout in seconds | + +### Usage Examples + +**Example 1: Default Configuration (OpenAI Official API)** +```bash +export OPENAI_API_KEY=sk-xxx +python lightrag/evaluation/eval_rag_quality.py +``` +Both LLM and embeddings use OpenAI's official API with default models. + +**Example 2: Custom Models on OpenAI** +```bash +export OPENAI_API_KEY=sk-xxx +export EVAL_LLM_MODEL=gpt-4o-mini +export EVAL_EMBEDDING_MODEL=text-embedding-3-large +python lightrag/evaluation/eval_rag_quality.py +``` + +**Example 3: Same Custom OpenAI-Compatible Endpoint for Both** +```bash +# Both LLM and embeddings use the same custom endpoint +export EVAL_LLM_BINDING_API_KEY=your-custom-key +export EVAL_LLM_BINDING_HOST=http://localhost:8000/v1 +export EVAL_LLM_MODEL=qwen-plus +export EVAL_EMBEDDING_MODEL=BAAI/bge-m3 +python lightrag/evaluation/eval_rag_quality.py +``` +Embeddings automatically inherit LLM endpoint configuration. 
+ +**Example 4: Separate Endpoints (Cost Optimization)** +```bash +# Use OpenAI for LLM (high quality) +export EVAL_LLM_BINDING_API_KEY=sk-openai-key +export EVAL_LLM_MODEL=gpt-4o-mini +# No EVAL_LLM_BINDING_HOST means use OpenAI official API + +# Use local vLLM for embeddings (cost-effective) +export EVAL_EMBEDDING_BINDING_API_KEY=local-key +export EVAL_EMBEDDING_BINDING_HOST=http://localhost:8001/v1 +export EVAL_EMBEDDING_MODEL=BAAI/bge-m3 + +python lightrag/evaluation/eval_rag_quality.py +``` +LLM uses OpenAI official API, embeddings use local custom endpoint. + +**Example 5: Different Custom Endpoints for LLM and Embeddings** +```bash +# LLM on one OpenAI-compatible server +export EVAL_LLM_BINDING_API_KEY=key1 +export EVAL_LLM_BINDING_HOST=http://llm-server:8000/v1 +export EVAL_LLM_MODEL=custom-llm + +# Embeddings on another OpenAI-compatible server +export EVAL_EMBEDDING_BINDING_API_KEY=key2 +export EVAL_EMBEDDING_BINDING_HOST=http://embedding-server:8001/v1 +export EVAL_EMBEDDING_MODEL=custom-embedding + +python lightrag/evaluation/eval_rag_quality.py +``` +Both use different custom OpenAI-compatible endpoints. 
+ +**Example 6: Using Environment Variables from .env File** +```bash +# Create .env file in project root +cat > .env << EOF +EVAL_LLM_BINDING_API_KEY=your-key +EVAL_LLM_BINDING_HOST=http://localhost:8000/v1 +EVAL_LLM_MODEL=qwen-plus +EVAL_EMBEDDING_MODEL=BAAI/bge-m3 +EOF + +# Run evaluation (automatically loads .env) +python lightrag/evaluation/eval_rag_quality.py +``` + +### Concurrency Control & Rate Limiting + +The evaluation framework includes built-in concurrency control to prevent API rate limiting issues: + +**Why Concurrency Control Matters:** +- RAGAS internally makes many concurrent LLM calls for each test case +- Context Precision metric calls LLM once per retrieved document +- Without control, this can easily exceed API rate limits + +**Default Configuration (Conservative):** +```bash +EVAL_MAX_CONCURRENT=2 # Conservative concurrency (two test cases at a time) +EVAL_QUERY_TOP_K=10 # TOP_K query parameter of LightRAG +EVAL_LLM_MAX_RETRIES=5 # Retry failed requests 5 times +EVAL_LLM_TIMEOUT=180 # 3-minute timeout per request +``` + +**Common Issues and Solutions:** + +| Issue | Solution | +|-------|----------| +| **Warning: "LM returned 1 generations instead of 3"** | Reduce `EVAL_MAX_CONCURRENT` to 1 or decrease `EVAL_QUERY_TOP_K` | +| **Context Precision returns NaN** | Lower `EVAL_QUERY_TOP_K` to reduce LLM calls per test case | +| **Rate limit errors (429)** | Increase `EVAL_LLM_MAX_RETRIES` and decrease `EVAL_MAX_CONCURRENT` | +| **Request timeouts** | Increase `EVAL_LLM_TIMEOUT` to 180 or higher | + + ## šŸ“ Test Dataset @@ -101,7 +282,7 @@ results/ { "question": "Your question here", "ground_truth": "Expected answer from your data", - "context": "topic" + "project": "evaluation_project_name" } ] } @@ -166,6 +347,50 @@ results/ pip install ragas datasets ``` +### "Warning: LM returned 1 generations instead of requested 3" or Context Precision NaN + +**Cause**: This warning indicates API rate limiting or concurrent request overload: +- RAGAS makes multiple LLM 
calls per test case (faithfulness, relevancy, recall, precision) +- Context Precision calls LLM once per retrieved document (with `EVAL_QUERY_TOP_K=10`, that's 10 calls) +- Concurrent evaluation multiplies these calls: `EVAL_MAX_CONCURRENT Ɨ LLM calls per test` + +**Solutions** (in order of effectiveness): + +1. **Serial Evaluation** (Default): + ```bash + export EVAL_MAX_CONCURRENT=1 + python lightrag/evaluation/eval_rag_quality.py + ``` + +2. **Reduce Retrieved Documents**: + ```bash + export EVAL_QUERY_TOP_K=5 # Halves Context Precision LLM calls + python lightrag/evaluation/eval_rag_quality.py + ``` + +3. **Increase Retry & Timeout**: + ```bash + export EVAL_LLM_MAX_RETRIES=10 + export EVAL_LLM_TIMEOUT=180 + python lightrag/evaluation/eval_rag_quality.py + ``` + +4. **Use Higher Quota API** (if available): + - Upgrade to OpenAI Tier 2+ for higher RPM limits + - Use self-hosted OpenAI-compatible service with no rate limits + +### "AttributeError: 'InstructorLLM' object has no attribute 'agenerate_prompt'" or NaN results + +This error occurs with RAGAS 0.3.x when LLM and Embeddings are not explicitly configured. The evaluation framework now handles this automatically by: +- Using environment variables to configure evaluation models +- Creating proper LLM and Embeddings instances for RAGAS + +**Solution**: Ensure you have set one of the following: +- `OPENAI_API_KEY` environment variable (default) +- `EVAL_LLM_BINDING_API_KEY` for custom API key + +The framework will automatically configure the evaluation models. + ### "No sample_dataset.json found" Make sure you're running from the project root: @@ -175,11 +400,10 @@ cd /path/to/LightRAG python lightrag/evaluation/eval_rag_quality.py ``` -### "LLM API errors during evaluation" +### "LightRAG query API errors during evaluation" The evaluation uses your configured LLM (OpenAI by default). 
Ensure: - API keys are set in `.env` -- Have sufficient API quota - Network connection is stable ### Evaluation requires running LightRAG API @@ -189,15 +413,74 @@ The evaluator queries a running LightRAG API server at `http://localhost:9621`. 2. Documents are indexed in your LightRAG instance 3. API is accessible at the configured URL ---- + ## šŸ“ Next Steps -1. Index documents into LightRAG (WebUI or API) -2. Start LightRAG API server +1. Start LightRAG API server +2. Upload sample documents into LightRAG through the WebUI 3. Run `python lightrag/evaluation/eval_rag_quality.py` 4. Review results (JSON/CSV) in `results/` folder -5. Adjust entity extraction prompts or retrieval settings based on scores + +Evaluation Result Sample: + +``` +INFO: ====================================================================== +INFO: šŸ” RAGAS Evaluation - Using Real LightRAG API +INFO: ====================================================================== +INFO: Evaluation Models: +INFO: • LLM Model: gpt-4.1 +INFO: • Embedding Model: text-embedding-3-large +INFO: • Endpoint: OpenAI Official API +INFO: Concurrency & Rate Limiting: +INFO: • Query Top-K: 10 Entities/Relations +INFO: • LLM Max Retries: 5 +INFO: • LLM Timeout: 180 seconds +INFO: Test Configuration: +INFO: • Total Test Cases: 6 +INFO: • Test Dataset: sample_dataset.json +INFO: • LightRAG API: http://localhost:9621 +INFO: • Results Directory: results +INFO: ====================================================================== +INFO: šŸš€ Starting RAGAS Evaluation of LightRAG System +INFO: šŸ”§ RAGAS Evaluation (Stage 2): 2 concurrent +INFO: ====================================================================== +INFO: +INFO: =================================================================================================================== +INFO: šŸ“Š EVALUATION RESULTS SUMMARY +INFO: =================================================================================================================== +INFO: # | Question 
| Faith | AnswRel | CtxRec | CtxPrec | RAGAS | Status +INFO: ------------------------------------------------------------------------------------------------------------------- +INFO: 1 | How does LightRAG solve the hallucination probl... | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | āœ“ +INFO: 2 | What are the three main components required in ... | 0.8500 | 0.5790 | 1.0000 | 1.0000 | 0.8573 | āœ“ +INFO: 3 | How does LightRAG's retrieval performance compa... | 0.8056 | 1.0000 | 1.0000 | 1.0000 | 0.9514 | āœ“ +INFO: 4 | What vector databases does LightRAG support and... | 0.8182 | 0.9807 | 1.0000 | 1.0000 | 0.9497 | āœ“ +INFO: 5 | What are the four key metrics for evaluating RA... | 1.0000 | 0.7452 | 1.0000 | 1.0000 | 0.9363 | āœ“ +INFO: 6 | What are the core benefits of LightRAG and how ... | 0.9583 | 0.8829 | 1.0000 | 1.0000 | 0.9603 | āœ“ +INFO: =================================================================================================================== +INFO: +INFO: ====================================================================== +INFO: šŸ“Š EVALUATION COMPLETE +INFO: ====================================================================== +INFO: Total Tests: 6 +INFO: Successful: 6 +INFO: Failed: 0 +INFO: Success Rate: 100.00% +INFO: Elapsed Time: 161.10 seconds +INFO: Avg Time/Test: 26.85 seconds +INFO: +INFO: ====================================================================== +INFO: šŸ“ˆ BENCHMARK RESULTS (Average) +INFO: ====================================================================== +INFO: Average Faithfulness: 0.9053 +INFO: Average Answer Relevance: 0.8646 +INFO: Average Context Recall: 1.0000 +INFO: Average Context Precision: 1.0000 +INFO: Average RAGAS Score: 0.9425 +INFO: ---------------------------------------------------------------------- +INFO: Min RAGAS Score: 0.8573 +INFO: Max RAGAS Score: 1.0000 +``` --- diff --git a/lightrag/evaluation/eval_rag_quality.py b/lightrag/evaluation/eval_rag_quality.py index 0b5dff11..7b415090 
100644 --- a/lightrag/evaluation/eval_rag_quality.py +++ b/lightrag/evaluation/eval_rag_quality.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -RAGAS Evaluation Script for Portfolio RAG System +RAGAS Evaluation Script for LightRAG System Evaluates RAG response quality using RAGAS metrics: - Faithfulness: Is the answer factually accurate based on context? @@ -9,56 +9,107 @@ Evaluates RAG response quality using RAGAS metrics: - Context Precision: Is retrieved context clean without noise? Usage: + # Use defaults (sample_dataset.json, http://localhost:9621) python lightrag/evaluation/eval_rag_quality.py - python lightrag/evaluation/eval_rag_quality.py http://localhost:9621 - python lightrag/evaluation/eval_rag_quality.py http://your-rag-server.com:9621 + + # Specify custom dataset + python lightrag/evaluation/eval_rag_quality.py --dataset my_test.json + python lightrag/evaluation/eval_rag_quality.py -d my_test.json + + # Specify custom RAG endpoint + python lightrag/evaluation/eval_rag_quality.py --ragendpoint http://my-server.com:9621 + python lightrag/evaluation/eval_rag_quality.py -r http://my-server.com:9621 + + # Specify both + python lightrag/evaluation/eval_rag_quality.py -d my_test.json -r http://localhost:9621 + + # Get help + python lightrag/evaluation/eval_rag_quality.py --help Results are saved to: lightrag/evaluation/results/ - results_YYYYMMDD_HHMMSS.csv (CSV export for analysis) - results_YYYYMMDD_HHMMSS.json (Full results with details) + +Technical Notes: + - Uses stable RAGAS API (LangchainLLMWrapper) for maximum compatibility + - Supports custom OpenAI-compatible endpoints via EVAL_LLM_BINDING_HOST + - Enables bypass_n mode for endpoints that don't support 'n' parameter + - Deprecation warnings are suppressed for cleaner output """ +import argparse import asyncio import csv import json +import math import os import sys import time +import warnings from datetime import datetime from pathlib import Path from typing import Any, Dict, List import httpx 
from dotenv import load_dotenv +from lightrag.utils import logger + +# Suppress LangchainLLMWrapper deprecation warning +# We use LangchainLLMWrapper for stability and compatibility with all RAGAS versions +warnings.filterwarnings( + "ignore", + message=".*LangchainLLMWrapper is deprecated.*", + category=DeprecationWarning, +) + +# Suppress token usage warning for custom OpenAI-compatible endpoints +# Custom endpoints (vLLM, SGLang, etc.) often don't return usage information +# This is non-critical as token tracking is not required for RAGAS evaluation +warnings.filterwarnings( + "ignore", + message=".*Unexpected type for token usage.*", + category=UserWarning, +) # Add parent directory to path sys.path.insert(0, str(Path(__file__).parent.parent.parent)) -# Load .env from project root -project_root = Path(__file__).parent.parent.parent -load_dotenv(project_root / ".env") - -# Setup OpenAI API key (required for RAGAS evaluation) -# Use LLM_BINDING_API_KEY if OPENAI_API_KEY is not set -if "OPENAI_API_KEY" not in os.environ: - if "LLM_BINDING_API_KEY" in os.environ: - os.environ["OPENAI_API_KEY"] = os.environ["LLM_BINDING_API_KEY"] - else: - os.environ["OPENAI_API_KEY"] = input("Enter your OpenAI API key: ") +# use the .env that is inside the current folder +# allows to use different .env file for each lightrag instance +# the OS environment variables take precedence over the .env file +load_dotenv(dotenv_path=".env", override=False) +# Conditional imports - will raise ImportError if dependencies not installed try: from datasets import Dataset from ragas import evaluate from ragas.metrics import ( - answer_relevancy, - context_precision, - context_recall, - faithfulness, + AnswerRelevancy, + ContextPrecision, + ContextRecall, + Faithfulness, ) -except ImportError as e: - print(f"āŒ RAGAS import error: {e}") - print(" Install with: pip install ragas datasets") - sys.exit(1) + from ragas.llms import LangchainLLMWrapper + from langchain_openai import ChatOpenAI, 
OpenAIEmbeddings + from tqdm.auto import tqdm + + RAGAS_AVAILABLE = True + +except ImportError: + RAGAS_AVAILABLE = False + Dataset = None + evaluate = None + LangchainLLMWrapper = None + + +CONNECT_TIMEOUT_SECONDS = 180.0 +READ_TIMEOUT_SECONDS = 300.0 +TOTAL_TIMEOUT_SECONDS = 180.0 + + +def _is_nan(value: Any) -> bool: + """Return True when value is a float NaN.""" + return isinstance(value, float) and math.isnan(value) class RAGEvaluator: @@ -72,7 +123,94 @@ class RAGEvaluator: test_dataset_path: Path to test dataset JSON file rag_api_url: Base URL of LightRAG API (e.g., http://localhost:9621) If None, will try to read from environment or use default + + Environment Variables: + EVAL_LLM_MODEL: LLM model for evaluation (default: gpt-4o-mini) + EVAL_EMBEDDING_MODEL: Embedding model for evaluation (default: text-embedding-3-small) + EVAL_LLM_BINDING_API_KEY: API key for LLM (fallback to OPENAI_API_KEY) + EVAL_LLM_BINDING_HOST: Custom endpoint URL for LLM (optional) + EVAL_EMBEDDING_BINDING_API_KEY: API key for embeddings (fallback: EVAL_LLM_BINDING_API_KEY -> OPENAI_API_KEY) + EVAL_EMBEDDING_BINDING_HOST: Custom endpoint URL for embeddings (fallback: EVAL_LLM_BINDING_HOST) + + Raises: + ImportError: If ragas or datasets packages are not installed + EnvironmentError: If EVAL_LLM_BINDING_API_KEY and OPENAI_API_KEY are both not set """ + # Validate RAGAS dependencies are installed + if not RAGAS_AVAILABLE: + raise ImportError( + "RAGAS dependencies not installed. " + "Install with: pip install ragas datasets" + ) + + # Configure evaluation LLM (for RAGAS scoring) + eval_llm_api_key = os.getenv("EVAL_LLM_BINDING_API_KEY") or os.getenv( + "OPENAI_API_KEY" + ) + if not eval_llm_api_key: + raise EnvironmentError( + "EVAL_LLM_BINDING_API_KEY or OPENAI_API_KEY is required for evaluation. " + "Set EVAL_LLM_BINDING_API_KEY to use a custom API key, " + "or ensure OPENAI_API_KEY is set." 
+ ) + + eval_model = os.getenv("EVAL_LLM_MODEL", "gpt-4o-mini") + eval_llm_base_url = os.getenv("EVAL_LLM_BINDING_HOST") + + # Configure evaluation embeddings (for RAGAS scoring) + # Fallback chain: EVAL_EMBEDDING_BINDING_API_KEY -> EVAL_LLM_BINDING_API_KEY -> OPENAI_API_KEY + eval_embedding_api_key = ( + os.getenv("EVAL_EMBEDDING_BINDING_API_KEY") + or os.getenv("EVAL_LLM_BINDING_API_KEY") + or os.getenv("OPENAI_API_KEY") + ) + eval_embedding_model = os.getenv( + "EVAL_EMBEDDING_MODEL", "text-embedding-3-large" + ) + # Fallback chain: EVAL_EMBEDDING_BINDING_HOST -> EVAL_LLM_BINDING_HOST -> None + eval_embedding_base_url = os.getenv("EVAL_EMBEDDING_BINDING_HOST") or os.getenv( + "EVAL_LLM_BINDING_HOST" + ) + + # Create LLM and Embeddings instances for RAGAS + llm_kwargs = { + "model": eval_model, + "api_key": eval_llm_api_key, + "max_retries": int(os.getenv("EVAL_LLM_MAX_RETRIES", "5")), + "request_timeout": int(os.getenv("EVAL_LLM_TIMEOUT", "180")), + } + embedding_kwargs = { + "model": eval_embedding_model, + "api_key": eval_embedding_api_key, + } + + if eval_llm_base_url: + llm_kwargs["base_url"] = eval_llm_base_url + + if eval_embedding_base_url: + embedding_kwargs["base_url"] = eval_embedding_base_url + + # Create base LangChain LLM + base_llm = ChatOpenAI(**llm_kwargs) + self.eval_embeddings = OpenAIEmbeddings(**embedding_kwargs) + + # Wrap LLM with LangchainLLMWrapper and enable bypass_n mode for custom endpoints + # This ensures compatibility with endpoints that don't support the 'n' parameter + # by generating multiple outputs through repeated prompts instead of using 'n' parameter + try: + self.eval_llm = LangchainLLMWrapper( + langchain_llm=base_llm, + bypass_n=True, # Enable bypass_n to avoid passing 'n' to OpenAI API + ) + logger.debug("Successfully configured bypass_n mode for LLM wrapper") + except Exception as e: + logger.warning( + "Could not configure LangchainLLMWrapper with bypass_n: %s. 
" + "Using base LLM directly, which may cause warnings with custom endpoints.", + e, + ) + self.eval_llm = base_llm + if test_dataset_path is None: test_dataset_path = Path(__file__).parent / "sample_dataset.json" @@ -87,6 +225,58 @@ class RAGEvaluator: # Load test dataset self.test_cases = self._load_test_dataset() + # Store configuration values for display + self.eval_model = eval_model + self.eval_embedding_model = eval_embedding_model + self.eval_llm_base_url = eval_llm_base_url + self.eval_embedding_base_url = eval_embedding_base_url + self.eval_max_retries = llm_kwargs["max_retries"] + self.eval_timeout = llm_kwargs["request_timeout"] + + # Display configuration + self._display_configuration() + + def _display_configuration(self): + """Display all evaluation configuration settings""" + logger.info("Evaluation Models:") + logger.info(" • LLM Model: %s", self.eval_model) + logger.info(" • Embedding Model: %s", self.eval_embedding_model) + + # Display LLM endpoint + if self.eval_llm_base_url: + logger.info(" • LLM Endpoint: %s", self.eval_llm_base_url) + logger.info( + " • Bypass N-Parameter: Enabled (use LangchainLLMWrapper for compatibility)" + ) + else: + logger.info(" • LLM Endpoint: OpenAI Official API") + + # Display Embedding endpoint (only if different from LLM) + if self.eval_embedding_base_url: + if self.eval_embedding_base_url != self.eval_llm_base_url: + logger.info( + " • Embedding Endpoint: %s", self.eval_embedding_base_url + ) + # If same as LLM endpoint, no need to display separately + elif not self.eval_llm_base_url: + # Both using OpenAI - already displayed above + pass + else: + # LLM uses custom endpoint, but embeddings use OpenAI + logger.info(" • Embedding Endpoint: OpenAI Official API") + + logger.info("Concurrency & Rate Limiting:") + query_top_k = int(os.getenv("EVAL_QUERY_TOP_K", "10")) + logger.info(" • Query Top-K: %s Entities/Relations", query_top_k) + logger.info(" • LLM Max Retries: %s", self.eval_max_retries) + logger.info(" • LLM 
Timeout: %s seconds", self.eval_timeout) + + logger.info("Test Configuration:") + logger.info(" • Total Test Cases: %s", len(self.test_cases)) + logger.info(" • Test Dataset: %s", self.test_dataset_path.name) + logger.info(" • LightRAG API: %s", self.rag_api_url) + logger.info(" • Results Directory: %s", self.results_dir.name) + def _load_test_dataset(self) -> List[Dict[str, str]]: """Load test cases from JSON file""" if not self.test_dataset_path.exists(): @@ -123,13 +313,22 @@ class RAGEvaluator: "include_references": True, "include_chunk_content": True, # NEW: Request chunk content in references "response_type": "Multiple Paragraphs", - "top_k": 10, + "top_k": int(os.getenv("EVAL_QUERY_TOP_K", "10")), } + # Get API key from environment for authentication + api_key = os.getenv("LIGHTRAG_API_KEY") + + # Prepare headers with optional authentication + headers = {} + if api_key: + headers["X-API-Key"] = api_key + # Single optimized API call - gets both answer AND chunk content response = await client.post( f"{self.rag_api_url}/query", json=payload, + headers=headers if headers else None, ) response.raise_for_status() result = response.json() @@ -138,17 +337,31 @@ class RAGEvaluator: references = result.get("references", []) # DEBUG: Inspect the API response - print(f" šŸ” References Count: {len(references)}") + logger.debug("šŸ” References Count: %s", len(references)) if references: first_ref = references[0] - print(f" šŸ” First Reference Keys: {list(first_ref.keys())}") + logger.debug("šŸ” First Reference Keys: %s", list(first_ref.keys())) if "content" in first_ref: - print(f" šŸ” Content Preview: {first_ref['content'][:100]}...") + content_preview = first_ref["content"] + if isinstance(content_preview, list) and content_preview: + logger.debug( + "šŸ” Content Preview (first chunk): %s...", + content_preview[0][:100], + ) + elif isinstance(content_preview, str): + logger.debug("šŸ” Content Preview: %s...", content_preview[:100]) # Extract chunk content from 
enriched references - contexts = [ - ref.get("content", "") for ref in references if ref.get("content") - ] + # Note: content is now a list of chunks per reference (one file may have multiple chunks) + contexts = [] + for ref in references: + content = ref.get("content", []) + if isinstance(content, list): + # Flatten the list: each chunk becomes a separate context + contexts.extend(content) + elif isinstance(content, str): + # Backward compatibility: if content is still a string (shouldn't happen) + contexts.append(content) return { "answer": answer, @@ -179,42 +392,55 @@ class RAGEvaluator: self, idx: int, test_case: Dict[str, str], - semaphore: asyncio.Semaphore, + rag_semaphore: asyncio.Semaphore, + eval_semaphore: asyncio.Semaphore, client: httpx.AsyncClient, + progress_counter: Dict[str, int], + position_pool: asyncio.Queue, + pbar_creation_lock: asyncio.Lock, ) -> Dict[str, Any]: """ - Evaluate a single test case with concurrency control + Evaluate a single test case with two-stage pipeline concurrency control Args: idx: Test case index (1-based) test_case: Test case dictionary with question and ground_truth - semaphore: Semaphore to control concurrency + rag_semaphore: Semaphore to control overall concurrency (covers entire function) + eval_semaphore: Semaphore to control RAGAS evaluation concurrency (Stage 2) client: Shared httpx AsyncClient for connection pooling + progress_counter: Shared dictionary for progress tracking + position_pool: Queue of available tqdm position indices + pbar_creation_lock: Lock to serialize tqdm creation and prevent race conditions Returns: Evaluation result dictionary """ - async with semaphore: + # rag_semaphore controls the entire evaluation process to prevent + # all RAG responses from being generated at once when eval is slow + async with rag_semaphore: question = test_case["question"] ground_truth = test_case["ground_truth"] - print(f"[{idx}/{len(self.test_cases)}] Evaluating: {question[:60]}...") - - # Generate RAG 
response by calling actual LightRAG API - rag_response = await self.generate_rag_response( - question=question, client=client - ) + # Stage 1: Generate RAG response + try: + rag_response = await self.generate_rag_response( + question=question, client=client + ) + except Exception as e: + logger.error("Error generating response for test %s: %s", idx, str(e)) + progress_counter["completed"] += 1 + return { + "test_number": idx, + "question": question, + "error": str(e), + "metrics": {}, + "ragas_score": 0, + "timestamp": datetime.now().isoformat(), + } # *** CRITICAL FIX: Use actual retrieved contexts, NOT ground_truth *** retrieved_contexts = rag_response["contexts"] - # DEBUG: Print what was actually retrieved - print(f" šŸ“ Retrieved {len(retrieved_contexts)} contexts") - if retrieved_contexts: - print(f" šŸ“„ First context preview: {retrieved_contexts[0][:100]}...") - else: - print(" āš ļø WARNING: No contexts retrieved!") - # Prepare dataset for RAGAS evaluation with CORRECT contexts eval_dataset = Dataset.from_dict( { @@ -225,108 +451,171 @@ class RAGEvaluator: } ) - # Run RAGAS evaluation - try: - eval_results = evaluate( - dataset=eval_dataset, - metrics=[ - faithfulness, - answer_relevancy, - context_recall, - context_precision, - ], - ) + # Stage 2: Run RAGAS evaluation (controlled by eval_semaphore) + # IMPORTANT: Create fresh metric instances for each evaluation to avoid + # concurrent state conflicts when multiple tasks run in parallel + async with eval_semaphore: + pbar = None + position = None + try: + # Acquire a position from the pool for this tqdm progress bar + position = await position_pool.get() - # Convert to DataFrame (RAGAS v0.3+ API) - df = eval_results.to_pandas() + # Serialize tqdm creation to prevent race conditions + # Multiple tasks creating tqdm simultaneously can cause display conflicts + async with pbar_creation_lock: + # Create tqdm progress bar with assigned position to avoid overlapping + # leave=False ensures the progress bar is 
cleared after completion, + # preventing accumulation of completed bars and allowing position reuse + pbar = tqdm( + total=4, + desc=f"Eval-{idx:02d}", + position=position, + leave=False, + ) + # Give tqdm time to initialize and claim its screen position + await asyncio.sleep(0.05) - # Extract scores from first row - scores_row = df.iloc[0] + eval_results = evaluate( + dataset=eval_dataset, + metrics=[ + Faithfulness(), + AnswerRelevancy(), + ContextRecall(), + ContextPrecision(), + ], + llm=self.eval_llm, + embeddings=self.eval_embeddings, + _pbar=pbar, + ) - # Extract scores (RAGAS v0.3+ uses .to_pandas()) - result = { - "question": question, - "answer": rag_response["answer"][:200] + "..." - if len(rag_response["answer"]) > 200 - else rag_response["answer"], - "ground_truth": ground_truth[:200] + "..." - if len(ground_truth) > 200 - else ground_truth, - "project": test_case.get("project_context", "unknown"), - "metrics": { - "faithfulness": float(scores_row.get("faithfulness", 0)), - "answer_relevance": float( - scores_row.get("answer_relevancy", 0) - ), - "context_recall": float(scores_row.get("context_recall", 0)), - "context_precision": float( - scores_row.get("context_precision", 0) - ), - }, - "timestamp": datetime.now().isoformat(), - } + # Convert to DataFrame (RAGAS v0.3+ API) + df = eval_results.to_pandas() - # Calculate RAGAS score (average of all metrics) - metrics = result["metrics"] - ragas_score = sum(metrics.values()) / len(metrics) if metrics else 0 - result["ragas_score"] = round(ragas_score, 4) + # Extract scores from first row + scores_row = df.iloc[0] - # Print metrics - print(f" āœ… Faithfulness: {metrics['faithfulness']:.4f}") - print(f" āœ… Answer Relevance: {metrics['answer_relevance']:.4f}") - print(f" āœ… Context Recall: {metrics['context_recall']:.4f}") - print(f" āœ… Context Precision: {metrics['context_precision']:.4f}") - print(f" šŸ“Š RAGAS Score: {result['ragas_score']:.4f}\n") + # Extract scores (RAGAS v0.3+ uses .to_pandas()) + 
result = { + "test_number": idx, + "question": question, + "answer": rag_response["answer"][:200] + "..." + if len(rag_response["answer"]) > 200 + else rag_response["answer"], + "ground_truth": ground_truth[:200] + "..." + if len(ground_truth) > 200 + else ground_truth, + "project": test_case.get("project", "unknown"), + "metrics": { + "faithfulness": float(scores_row.get("faithfulness", 0)), + "answer_relevance": float( + scores_row.get("answer_relevancy", 0) + ), + "context_recall": float( + scores_row.get("context_recall", 0) + ), + "context_precision": float( + scores_row.get("context_precision", 0) + ), + }, + "timestamp": datetime.now().isoformat(), + } - return result + # Calculate RAGAS score (average of all metrics, excluding NaN values) + metrics = result["metrics"] + valid_metrics = [v for v in metrics.values() if not _is_nan(v)] + ragas_score = ( + sum(valid_metrics) / len(valid_metrics) if valid_metrics else 0 + ) + result["ragas_score"] = round(ragas_score, 4) - except Exception as e: - import traceback + # Update progress counter + progress_counter["completed"] += 1 - print(f" āŒ Error evaluating: {str(e)}") - print(f" šŸ” Full traceback:\n{traceback.format_exc()}\n") - return { - "question": question, - "error": str(e), - "metrics": {}, - "ragas_score": 0, - "timestamp": datetime.now().isoformat(), - } + return result + + except Exception as e: + logger.error("Error evaluating test %s: %s", idx, str(e)) + progress_counter["completed"] += 1 + return { + "test_number": idx, + "question": question, + "error": str(e), + "metrics": {}, + "ragas_score": 0, + "timestamp": datetime.now().isoformat(), + } + finally: + # Force close progress bar to ensure completion + if pbar is not None: + pbar.close() + # Release the position back to the pool for reuse + if position is not None: + await position_pool.put(position) async def evaluate_responses(self) -> List[Dict[str, Any]]: """ - Evaluate all test cases in parallel and return metrics + Evaluate all test 
cases in parallel with two-stage pipeline and return metrics Returns: List of evaluation results with metrics """ - # Get MAX_ASYNC from environment (default to 4 if not set) - max_async = int(os.getenv("MAX_ASYNC", "4")) + # Get evaluation concurrency from environment (default to 2 for parallel evaluation) + max_async = int(os.getenv("EVAL_MAX_CONCURRENT", "2")) - print("\n" + "=" * 70) - print("šŸš€ Starting RAGAS Evaluation of Portfolio RAG System") - print(f"šŸ”§ Parallel evaluations: {max_async}") - print("=" * 70 + "\n") + logger.info("%s", "=" * 70) + logger.info("šŸš€ Starting RAGAS Evaluation of LightRAG System") + logger.info("šŸ”§ RAGAS Evaluation (Stage 2): %s concurrent", max_async) + logger.info("%s", "=" * 70) - # Create semaphore to limit concurrent evaluations - semaphore = asyncio.Semaphore(max_async) + # Create two-stage pipeline semaphores + # Stage 1: RAG generation - allow x2 concurrency to keep evaluation fed + rag_semaphore = asyncio.Semaphore(max_async * 2) + # Stage 2: RAGAS evaluation - primary bottleneck + eval_semaphore = asyncio.Semaphore(max_async) + + # Create progress counter (shared across all tasks) + progress_counter = {"completed": 0} + + # Create position pool for tqdm progress bars + # Positions range from 0 to max_async-1, ensuring no overlapping displays + position_pool = asyncio.Queue() + for i in range(max_async): + await position_pool.put(i) + + # Create lock to serialize tqdm creation and prevent race conditions + # This ensures progress bars are created one at a time, avoiding display conflicts + pbar_creation_lock = asyncio.Lock() # Create shared HTTP client with connection pooling and proper timeouts # Timeout: 3 minutes for connect, 5 minutes for read (LLM can be slow) - timeout = httpx.Timeout(180.0, connect=180.0, read=300.0) + timeout = httpx.Timeout( + TOTAL_TIMEOUT_SECONDS, + connect=CONNECT_TIMEOUT_SECONDS, + read=READ_TIMEOUT_SECONDS, + ) limits = httpx.Limits( - max_connections=max_async * 2, # Allow some 
buffer - max_keepalive_connections=max_async, + max_connections=(max_async + 1) * 2, # Allow buffer for RAG stage + max_keepalive_connections=max_async + 1, ) async with httpx.AsyncClient(timeout=timeout, limits=limits) as client: # Create tasks for all test cases tasks = [ - self.evaluate_single_case(idx, test_case, semaphore, client) + self.evaluate_single_case( + idx, + test_case, + rag_semaphore, + eval_semaphore, + client, + progress_counter, + position_pool, + pbar_creation_lock, + ) for idx, test_case in enumerate(self.test_cases, 1) ] - # Run all evaluations in parallel (limited by semaphore) + # Run all evaluations in parallel (limited by two-stage semaphores) results = await asyncio.gather(*tasks) return list(results) @@ -391,6 +680,95 @@ class RAGEvaluator: return csv_path + def _format_metric(self, value: float, width: int = 6) -> str: + """ + Format a metric value for display, handling NaN gracefully + + Args: + value: The metric value to format + width: The width of the formatted string + + Returns: + Formatted string (e.g., "0.8523" or " N/A ") + """ + if _is_nan(value): + return "N/A".center(width) + return f"{value:.4f}".rjust(width) + + def _display_results_table(self, results: List[Dict[str, Any]]): + """ + Display evaluation results in a formatted table + + Args: + results: List of evaluation results + """ + logger.info("") + logger.info("%s", "=" * 115) + logger.info("šŸ“Š EVALUATION RESULTS SUMMARY") + logger.info("%s", "=" * 115) + + # Table header + logger.info( + "%-4s | %-50s | %6s | %7s | %6s | %7s | %6s | %6s", + "#", + "Question", + "Faith", + "AnswRel", + "CtxRec", + "CtxPrec", + "RAGAS", + "Status", + ) + logger.info("%s", "-" * 115) + + # Table rows + for result in results: + test_num = result.get("test_number", 0) + question = result.get("question", "") + # Truncate question to 50 chars + question_display = ( + (question[:47] + "...") if len(question) > 50 else question + ) + + metrics = result.get("metrics", {}) + if metrics: + # 
Success case - format each metric, handling NaN values + faith = metrics.get("faithfulness", 0) + ans_rel = metrics.get("answer_relevance", 0) + ctx_rec = metrics.get("context_recall", 0) + ctx_prec = metrics.get("context_precision", 0) + ragas = result.get("ragas_score", 0) + status = "āœ“" + + logger.info( + "%-4d | %-50s | %s | %s | %s | %s | %s | %6s", + test_num, + question_display, + self._format_metric(faith, 6), + self._format_metric(ans_rel, 7), + self._format_metric(ctx_rec, 6), + self._format_metric(ctx_prec, 7), + self._format_metric(ragas, 6), + status, + ) + else: + # Error case + error = result.get("error", "Unknown error") + error_display = (error[:20] + "...") if len(error) > 23 else error + logger.info( + "%-4d | %-50s | %6s | %7s | %6s | %7s | %6s | āœ— %s", + test_num, + question_display, + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + error_display, + ) + + logger.info("%s", "=" * 115) + def _calculate_benchmark_stats( self, results: List[Dict[str, Any]] ) -> Dict[str, Any]: @@ -417,69 +795,61 @@ class RAGEvaluator: "success_rate": 0.0, } - # Calculate averages for each metric (handling NaN values) - import math - - metrics_sum = { - "faithfulness": 0.0, - "answer_relevance": 0.0, - "context_recall": 0.0, - "context_precision": 0.0, - "ragas_score": 0.0, + # Calculate averages for each metric (handling NaN values correctly) + # Track both sum and count for each metric to handle NaN values properly + metrics_data = { + "faithfulness": {"sum": 0.0, "count": 0}, + "answer_relevance": {"sum": 0.0, "count": 0}, + "context_recall": {"sum": 0.0, "count": 0}, + "context_precision": {"sum": 0.0, "count": 0}, + "ragas_score": {"sum": 0.0, "count": 0}, } for result in valid_results: metrics = result.get("metrics", {}) - # Skip NaN values when summing + + # For each metric, sum non-NaN values and count them faithfulness = metrics.get("faithfulness", 0) - if ( - not math.isnan(faithfulness) - if isinstance(faithfulness, float) - else True - ): - 
metrics_sum["faithfulness"] += faithfulness + if not _is_nan(faithfulness): + metrics_data["faithfulness"]["sum"] += faithfulness + metrics_data["faithfulness"]["count"] += 1 answer_relevance = metrics.get("answer_relevance", 0) - if ( - not math.isnan(answer_relevance) - if isinstance(answer_relevance, float) - else True - ): - metrics_sum["answer_relevance"] += answer_relevance + if not _is_nan(answer_relevance): + metrics_data["answer_relevance"]["sum"] += answer_relevance + metrics_data["answer_relevance"]["count"] += 1 context_recall = metrics.get("context_recall", 0) - if ( - not math.isnan(context_recall) - if isinstance(context_recall, float) - else True - ): - metrics_sum["context_recall"] += context_recall + if not _is_nan(context_recall): + metrics_data["context_recall"]["sum"] += context_recall + metrics_data["context_recall"]["count"] += 1 context_precision = metrics.get("context_precision", 0) - if ( - not math.isnan(context_precision) - if isinstance(context_precision, float) - else True - ): - metrics_sum["context_precision"] += context_precision + if not _is_nan(context_precision): + metrics_data["context_precision"]["sum"] += context_precision + metrics_data["context_precision"]["count"] += 1 ragas_score = result.get("ragas_score", 0) - if not math.isnan(ragas_score) if isinstance(ragas_score, float) else True: - metrics_sum["ragas_score"] += ragas_score + if not _is_nan(ragas_score): + metrics_data["ragas_score"]["sum"] += ragas_score + metrics_data["ragas_score"]["count"] += 1 - # Calculate averages - n = len(valid_results) + # Calculate averages using actual counts for each metric avg_metrics = {} - for k, v in metrics_sum.items(): - avg_val = v / n if n > 0 else 0 - # Handle NaN in average - avg_metrics[k] = round(avg_val, 4) if not math.isnan(avg_val) else 0.0 + for metric_name, data in metrics_data.items(): + if data["count"] > 0: + avg_val = data["sum"] / data["count"] + avg_metrics[metric_name] = ( + round(avg_val, 4) if not 
_is_nan(avg_val) else 0.0 + ) + else: + avg_metrics[metric_name] = 0.0 # Find min and max RAGAS scores (filter out NaN) ragas_scores = [] for r in valid_results: score = r.get("ragas_score", 0) - if isinstance(score, float) and math.isnan(score): + if _is_nan(score): continue # Skip NaN values ragas_scores.append(score) @@ -518,6 +888,9 @@ class RAGEvaluator: "results": results, } + # Display results table + self._display_results_table(results) + # Save JSON results json_path = ( self.results_dir @@ -525,43 +898,51 @@ class RAGEvaluator: ) with open(json_path, "w") as f: json.dump(summary, f, indent=2) - print(f"āœ… JSON results saved to: {json_path}") # Export to CSV csv_path = self._export_to_csv(results) - print(f"āœ… CSV results saved to: {csv_path}") # Print summary - print("\n" + "=" * 70) - print("šŸ“Š EVALUATION COMPLETE") - print("=" * 70) - print(f"Total Tests: {len(results)}") - print(f"Successful: {benchmark_stats['successful_tests']}") - print(f"Failed: {benchmark_stats['failed_tests']}") - print(f"Success Rate: {benchmark_stats['success_rate']:.2f}%") - print(f"Elapsed Time: {elapsed_time:.2f} seconds") - print(f"Avg Time/Test: {elapsed_time / len(results):.2f} seconds") + logger.info("") + logger.info("%s", "=" * 70) + logger.info("šŸ“Š EVALUATION COMPLETE") + logger.info("%s", "=" * 70) + logger.info("Total Tests: %s", len(results)) + logger.info("Successful: %s", benchmark_stats["successful_tests"]) + logger.info("Failed: %s", benchmark_stats["failed_tests"]) + logger.info("Success Rate: %.2f%%", benchmark_stats["success_rate"]) + logger.info("Elapsed Time: %.2f seconds", elapsed_time) + logger.info("Avg Time/Test: %.2f seconds", elapsed_time / len(results)) # Print benchmark metrics - print("\n" + "=" * 70) - print("šŸ“ˆ BENCHMARK RESULTS (Moyennes)") - print("=" * 70) + logger.info("") + logger.info("%s", "=" * 70) + logger.info("šŸ“ˆ BENCHMARK RESULTS (Average)") + logger.info("%s", "=" * 70) avg = benchmark_stats["average_metrics"] - 
print(f"Moyenne Faithfulness: {avg['faithfulness']:.4f}") - print(f"Moyenne Answer Relevance: {avg['answer_relevance']:.4f}") - print(f"Moyenne Context Recall: {avg['context_recall']:.4f}") - print(f"Moyenne Context Precision: {avg['context_precision']:.4f}") - print(f"Moyenne RAGAS Score: {avg['ragas_score']:.4f}") - print(f"\nMin RAGAS Score: {benchmark_stats['min_ragas_score']:.4f}") - print(f"Max RAGAS Score: {benchmark_stats['max_ragas_score']:.4f}") + logger.info("Average Faithfulness: %.4f", avg["faithfulness"]) + logger.info("Average Answer Relevance: %.4f", avg["answer_relevance"]) + logger.info("Average Context Recall: %.4f", avg["context_recall"]) + logger.info("Average Context Precision: %.4f", avg["context_precision"]) + logger.info("Average RAGAS Score: %.4f", avg["ragas_score"]) + logger.info("%s", "-" * 70) + logger.info( + "Min RAGAS Score: %.4f", + benchmark_stats["min_ragas_score"], + ) + logger.info( + "Max RAGAS Score: %.4f", + benchmark_stats["max_ragas_score"], + ) - print("\n" + "=" * 70) - print("šŸ“ GENERATED FILES") - print("=" * 70) - print(f"Results Dir: {self.results_dir.absolute()}") - print(f" • CSV: {csv_path.name}") - print(f" • JSON: {json_path.name}") - print("=" * 70 + "\n") + logger.info("") + logger.info("%s", "=" * 70) + logger.info("šŸ“ GENERATED FILES") + logger.info("%s", "=" * 70) + logger.info("Results Dir: %s", self.results_dir.absolute()) + logger.info(" • CSV: %s", csv_path.name) + logger.info(" • JSON: %s", json_path.name) + logger.info("%s", "=" * 70) return summary @@ -570,30 +951,64 @@ async def main(): """ Main entry point for RAGAS evaluation + Command-line arguments: + --dataset, -d: Path to test dataset JSON file (default: sample_dataset.json) + --ragendpoint, -r: LightRAG API endpoint URL (default: http://localhost:9621 or $LIGHTRAG_API_URL) + Usage: python lightrag/evaluation/eval_rag_quality.py - python lightrag/evaluation/eval_rag_quality.py http://localhost:9621 - python 
lightrag/evaluation/eval_rag_quality.py http://your-server.com:9621 + python lightrag/evaluation/eval_rag_quality.py --dataset my_test.json + python lightrag/evaluation/eval_rag_quality.py -d my_test.json -r http://localhost:9621 """ try: - # Get RAG API URL from command line or environment - rag_api_url = None - if len(sys.argv) > 1: - rag_api_url = sys.argv[1] + # Parse command-line arguments + parser = argparse.ArgumentParser( + description="RAGAS Evaluation Script for LightRAG System", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Use defaults + python lightrag/evaluation/eval_rag_quality.py - print("\n" + "=" * 70) - print("šŸ” RAGAS Evaluation - Using Real LightRAG API") - print("=" * 70) - if rag_api_url: - print(f"šŸ“” RAG API URL: {rag_api_url}") - else: - print("šŸ“” RAG API URL: http://localhost:9621 (default)") - print("=" * 70 + "\n") + # Specify custom dataset + python lightrag/evaluation/eval_rag_quality.py --dataset my_test.json - evaluator = RAGEvaluator(rag_api_url=rag_api_url) + # Specify custom RAG endpoint + python lightrag/evaluation/eval_rag_quality.py --ragendpoint http://my-server.com:9621 + + # Specify both + python lightrag/evaluation/eval_rag_quality.py -d my_test.json -r http://localhost:9621 + """, + ) + + parser.add_argument( + "--dataset", + "-d", + type=str, + default=None, + help="Path to test dataset JSON file (default: sample_dataset.json in evaluation directory)", + ) + + parser.add_argument( + "--ragendpoint", + "-r", + type=str, + default=None, + help="LightRAG API endpoint URL (default: http://localhost:9621 or $LIGHTRAG_API_URL environment variable)", + ) + + args = parser.parse_args() + + logger.info("%s", "=" * 70) + logger.info("šŸ” RAGAS Evaluation - Using Real LightRAG API") + logger.info("%s", "=" * 70) + + evaluator = RAGEvaluator( + test_dataset_path=args.dataset, rag_api_url=args.ragendpoint + ) await evaluator.run() except Exception as e: - print(f"\nāŒ Error: {str(e)}\n") + 
logger.exception("āŒ Error: %s", e) sys.exit(1)