Update RAGAS evaluation to use gpt-4o-mini and improve compatibility
- Change default model to gpt-4o-mini
- Add deprecation warning suppression
- Update docs and comments for LightRAG
- Improve output formatting and timing
parent 6d61f70b92
commit d4b8a229b9

3 changed files with 26 additions and 21 deletions
@@ -400,7 +400,7 @@ MEMGRAPH_DATABASE=memgraph
 ############################
 ### RAGAS evaluation models (used for RAG quality assessment)
 ### Default uses OpenAI models for evaluation
-# EVAL_LLM_MODEL=gpt-4.1
+# EVAL_LLM_MODEL=gpt-4o-mini
 # EVAL_EMBEDDING_MODEL=text-embedding-3-large
 ### API key for evaluation (fallback to OPENAI_API_KEY if not set)
 # EVAL_LLM_BINDING_API_KEY=your_api_key
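With the new default, the corresponding .env entries would look roughly like this once uncommented (example values only, not required settings):

```bash
# Example values only; EVAL_LLM_BINDING_API_KEY falls back to OPENAI_API_KEY if unset
EVAL_LLM_MODEL=gpt-4o-mini
EVAL_EMBEDDING_MODEL=text-embedding-3-large
EVAL_LLM_BINDING_API_KEY=your_api_key
```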
@@ -117,7 +117,7 @@ python lightrag/evaluation/eval_rag_quality.py
 **Custom Model:**
 ```bash
 export OPENAI_API_KEY=sk-xxx
-export EVAL_LLM_MODEL=gpt-4.1
+export EVAL_LLM_MODEL=gpt-4o-mini
 export EVAL_EMBEDDING_MODEL=text-embedding-3-large
 python lightrag/evaluation/eval_rag_quality.py
 ```
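For a custom OpenAI-compatible endpoint, the same run can be pointed at a different host; a rough sketch (the host URL is a placeholder, and EVAL_LLM_BINDING_HOST is the variable named in the script's technical notes below):

```bash
export EVAL_LLM_BINDING_HOST=https://your-gateway.example.com/v1   # placeholder URL
export EVAL_LLM_BINDING_API_KEY=sk-xxx
export EVAL_LLM_MODEL=gpt-4o-mini
export EVAL_EMBEDDING_MODEL=text-embedding-3-large
python lightrag/evaluation/eval_rag_quality.py
```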
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-RAGAS Evaluation Script for Portfolio RAG System
+RAGAS Evaluation Script for LightRAG System
 
 Evaluates RAG response quality using RAGAS metrics:
 - Faithfulness: Is the answer factually accurate based on context?
@@ -17,11 +17,11 @@ Results are saved to: lightrag/evaluation/results/
 - results_YYYYMMDD_HHMMSS.csv (CSV export for analysis)
 - results_YYYYMMDD_HHMMSS.json (Full results with details)
 
-Note on Custom OpenAI-Compatible Endpoints:
-This script uses bypass_n=True mode for answer_relevancy metric to ensure
-compatibility with custom endpoints that may not support OpenAI's 'n' parameter
-for multiple completions. This generates multiple outputs through repeated prompts
-instead, maintaining evaluation quality while supporting broader endpoint compatibility.
+Technical Notes:
+- Uses stable RAGAS API (LangchainLLMWrapper) for maximum compatibility
+- Supports custom OpenAI-compatible endpoints via EVAL_LLM_BINDING_HOST
+- Enables bypass_n mode for endpoints that don't support 'n' parameter
+- Deprecation warnings are suppressed for cleaner output
 """
 
 import asyncio
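As a rough illustration of the wrapper-based setup these notes describe (not this repo's exact code; assumes the ragas and langchain-openai packages, and the env variables shown elsewhere in this commit):

```python
# Illustrative sketch only: wrapping evaluator models for RAGAS with the
# Langchain wrappers; env var handling mirrors the fallbacks described above.
import os

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper

api_key = os.getenv("EVAL_LLM_BINDING_API_KEY") or os.getenv("OPENAI_API_KEY")
base_url = os.getenv("EVAL_LLM_BINDING_HOST")  # None -> official OpenAI API

eval_llm = LangchainLLMWrapper(
    ChatOpenAI(
        model=os.getenv("EVAL_LLM_MODEL", "gpt-4o-mini"),
        api_key=api_key,
        base_url=base_url,
    )
)
eval_embeddings = LangchainEmbeddingsWrapper(
    OpenAIEmbeddings(
        model=os.getenv("EVAL_EMBEDDING_MODEL", "text-embedding-3-large"),
        api_key=api_key,
        base_url=base_url,
    )
)
```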
@@ -31,6 +31,7 @@ import math
 import os
 import sys
 import time
+import warnings
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List
@@ -39,6 +40,14 @@ import httpx
 from dotenv import load_dotenv
 from lightrag.utils import logger
 
+# Suppress LangchainLLMWrapper deprecation warning
+# We use LangchainLLMWrapper for stability and compatibility with all RAGAS versions
+warnings.filterwarnings(
+    "ignore",
+    message=".*LangchainLLMWrapper is deprecated.*",
+    category=DeprecationWarning,
+)
+
 # Add parent directory to path
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
@@ -119,7 +128,7 @@ class RAGEvaluator:
                 "or ensure OPENAI_API_KEY is set."
             )
 
-        eval_model = os.getenv("EVAL_LLM_MODEL", "gpt-4.1")
+        eval_model = os.getenv("EVAL_LLM_MODEL", "gpt-4o-mini")
         eval_embedding_model = os.getenv(
             "EVAL_EMBEDDING_MODEL", "text-embedding-3-large"
         )
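For context, a minimal sketch of how the configured models could feed a RAGAS evaluation (assumes ragas >= 0.2; import paths and sample fields may differ between versions, and the sample values are invented):

```python
# Illustrative sketch only: scoring one sample with the configured evaluator models.
import os

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas import evaluate
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import AnswerRelevancy, Faithfulness

# Relies on OPENAI_API_KEY being set in the environment.
eval_llm = LangchainLLMWrapper(ChatOpenAI(model=os.getenv("EVAL_LLM_MODEL", "gpt-4o-mini")))
eval_embeddings = LangchainEmbeddingsWrapper(
    OpenAIEmbeddings(model=os.getenv("EVAL_EMBEDDING_MODEL", "text-embedding-3-large"))
)

# Sample values below are invented purely for illustration.
sample = SingleTurnSample(
    user_input="What does LightRAG use for graph storage?",
    response="LightRAG supports several graph storage backends.",
    retrieved_contexts=["LightRAG can store its knowledge graph in different backends."],
)

result = evaluate(
    dataset=EvaluationDataset(samples=[sample]),
    metrics=[Faithfulness(), AnswerRelevancy()],
    llm=eval_llm,
    embeddings=eval_embeddings,
)
print(result)
```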
@@ -185,24 +194,22 @@ class RAGEvaluator:
 
     def _display_configuration(self):
         """Display all evaluation configuration settings"""
-        logger.info("EVALUATION CONFIGURATION")
-
-        logger.info(" Evaluation Models:")
+        logger.info("Evaluation Models:")
         logger.info(" • LLM Model: %s", self.eval_model)
         logger.info(" • Embedding Model: %s", self.eval_embedding_model)
         if self.eval_base_url:
             logger.info(" • Custom Endpoint: %s", self.eval_base_url)
-            logger.info(" • Bypass N-Parameter: Enabled (for compatibility)")
+            logger.info(" • Bypass N-Parameter: Enabled (use LangchainLLMWrapper for compatibility)")
         else:
             logger.info(" • Endpoint: OpenAI Official API")
 
-        logger.info(" Concurrency & Rate Limiting:")
+        logger.info("Concurrency & Rate Limiting:")
         query_top_k = int(os.getenv("EVAL_QUERY_TOP_K", "10"))
         logger.info(" • Query Top-K: %s Entities/Relations", query_top_k)
         logger.info(" • LLM Max Retries: %s", self.eval_max_retries)
         logger.info(" • LLM Timeout: %s seconds", self.eval_timeout)
 
-        logger.info(" Test Configuration:")
+        logger.info("Test Configuration:")
         logger.info(" • Total Test Cases: %s", len(self.test_cases))
         logger.info(" • Test Dataset: %s", self.test_dataset_path.name)
         logger.info(" • LightRAG API: %s", self.rag_api_url)
@@ -460,9 +467,8 @@ class RAGEvaluator:
         # Get evaluation concurrency from environment (default to 1 for serial evaluation)
         max_async = int(os.getenv("EVAL_MAX_CONCURRENT", "3"))
 
-        logger.info("")
         logger.info("%s", "=" * 70)
-        logger.info("🚀 Starting RAGAS Evaluation of Portfolio RAG System")
+        logger.info("🚀 Starting RAGAS Evaluation of LightRAG System")
         logger.info("🔧 Concurrent evaluations: %s", max_async)
         logger.info("%s", "=" * 70)
 
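A minimal sketch of the concurrency-limiting idea behind EVAL_MAX_CONCURRENT (function names and the placeholder work are invented; the script's real evaluation logic differs):

```python
# Illustrative sketch only: bounding concurrent per-test-case evaluation with a
# semaphore sized by EVAL_MAX_CONCURRENT.
import asyncio
import os

async def evaluate_one(case: dict, semaphore: asyncio.Semaphore) -> dict:
    async with semaphore:
        # query the LightRAG API and score the answer with RAGAS here
        await asyncio.sleep(0.1)  # placeholder for the real work
        return {"question": case.get("question"), "scores": {}}

async def evaluate_all(cases: list[dict]) -> list[dict]:
    max_async = int(os.getenv("EVAL_MAX_CONCURRENT", "3"))
    semaphore = asyncio.Semaphore(max_async)
    return await asyncio.gather(*(evaluate_one(c, semaphore) for c in cases))

if __name__ == "__main__":
    print(asyncio.run(evaluate_all([{"question": "example"}])))
```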
@@ -580,7 +586,6 @@ class RAGEvaluator:
         Args:
             results: List of evaluation results
         """
-        logger.info("")
         logger.info("%s", "=" * 115)
         logger.info("📊 EVALUATION RESULTS SUMMARY")
         logger.info("%s", "=" * 115)
@@ -755,13 +760,13 @@ class RAGEvaluator:
         elapsed_time = time.time() - start_time
 
         # Add a small delay to ensure all buffered output is completely written
-        await asyncio.sleep(0.2)
+        await asyncio.sleep(0.5)
         # Flush all output buffers to ensure RAGAS progress bars are fully displayed
         sys.stdout.flush()
         sys.stderr.flush()
-        await asyncio.sleep(0.2)
+        sys.stdout.write("\n")
         sys.stderr.write("\n")
+        sys.stdout.flush()
         sys.stderr.flush()
 
         # Display results table