fix: Apply ruff formatting and rename test_dataset to sample_dataset
**Lint Fixes (ruff)**:
- Sort imports alphabetically (I001)
- Add blank line after `import traceback` (E302)
- Add trailing comma to dict literals (COM812)
- Reformat `writer.writerow` for readability (E501)

**Rename test_dataset.json → sample_dataset.json**:
- Avoids .gitignore pattern conflict (`test_*` is ignored)
- More descriptive name - it's a sample/template, not actual test data
- Updated all references in eval_rag_quality.py and README.md

Resolves lint-and-format CI check failure.
Addresses reviewer feedback about test dataset naming.
This commit is contained in:
parent
aa916f28d2
commit
5cdb4b0ef2
3 changed files with 34 additions and 30 deletions
|
|
@ -25,7 +25,7 @@ Instead of requiring human-annotated ground truth, RAGAS uses state-of-the-art e
|
||||||
```
|
```
|
||||||
lightrag/evaluation/
|
lightrag/evaluation/
|
||||||
├── eval_rag_quality.py # Main evaluation script
|
├── eval_rag_quality.py # Main evaluation script
|
||||||
├── test_dataset.json # Test cases with ground truth
|
├── sample_dataset.json # Test cases with ground truth
|
||||||
├── __init__.py # Package init
|
├── __init__.py # Package init
|
||||||
├── results/ # Output directory
|
├── results/ # Output directory
|
||||||
│ ├── results_YYYYMMDD_HHMMSS.json # Raw metrics
|
│ ├── results_YYYYMMDD_HHMMSS.json # Raw metrics
|
||||||
|
|
@ -82,7 +82,7 @@ results/
|
||||||
|
|
||||||
## 📝 Test Dataset
|
## 📝 Test Dataset
|
||||||
|
|
||||||
Edit `test_dataset.json` to add your own test cases:
|
Edit `sample_dataset.json` to add your own test cases:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
|
|
@ -268,7 +268,7 @@ for i in range(3):
|
||||||
pip install ragas datasets
|
pip install ragas datasets
|
||||||
```
|
```
|
||||||
|
|
||||||
### "No test_dataset.json found"
|
### "No sample_dataset.json found"
|
||||||
|
|
||||||
Make sure you're running from the project root:
|
Make sure you're running from the project root:
|
||||||
|
|
||||||
|
|
@ -297,7 +297,7 @@ Current implementation uses ground truth as mock responses. Results will show pe
|
||||||
|
|
||||||
## 📝 Next Steps
|
## 📝 Next Steps
|
||||||
|
|
||||||
1. ✅ Review test dataset in `test_dataset.json`
|
1. ✅ Review test dataset in `sample_dataset.json`
|
||||||
2. ✅ Run `python lightrag/evaluation/eval_rag_quality.py`
|
2. ✅ Run `python lightrag/evaluation/eval_rag_quality.py`
|
||||||
3. ✅ Open the HTML report in browser
|
3. ✅ Open the HTML report in browser
|
||||||
4. 🔄 Integrate with actual LightRAG system
|
4. 🔄 Integrate with actual LightRAG system
|
||||||
|
|
|
||||||
|
|
@ -18,16 +18,17 @@ Results are saved to: lightrag/evaluation/results/
|
||||||
- results_YYYYMMDD_HHMMSS.json (Full results with details)
|
- results_YYYYMMDD_HHMMSS.json (Full results with details)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import time
|
|
||||||
import csv
|
import csv
|
||||||
from pathlib import Path
|
import json
|
||||||
from datetime import datetime
|
|
||||||
from typing import Any, Dict, List
|
|
||||||
import sys
|
|
||||||
import httpx
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
import httpx
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
# Add parent directory to path
|
# Add parent directory to path
|
||||||
|
|
@ -46,14 +47,14 @@ if "OPENAI_API_KEY" not in os.environ:
|
||||||
os.environ["OPENAI_API_KEY"] = input("Enter your OpenAI API key: ")
|
os.environ["OPENAI_API_KEY"] = input("Enter your OpenAI API key: ")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
from datasets import Dataset
|
||||||
from ragas import evaluate
|
from ragas import evaluate
|
||||||
from ragas.metrics import (
|
from ragas.metrics import (
|
||||||
faithfulness,
|
|
||||||
answer_relevancy,
|
answer_relevancy,
|
||||||
context_recall,
|
|
||||||
context_precision,
|
context_precision,
|
||||||
|
context_recall,
|
||||||
|
faithfulness,
|
||||||
)
|
)
|
||||||
from datasets import Dataset
|
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
print(f"❌ RAGAS import error: {e}")
|
print(f"❌ RAGAS import error: {e}")
|
||||||
print(" Install with: pip install ragas datasets")
|
print(" Install with: pip install ragas datasets")
|
||||||
|
|
@ -73,7 +74,7 @@ class RAGEvaluator:
|
||||||
If None, will try to read from environment or use default
|
If None, will try to read from environment or use default
|
||||||
"""
|
"""
|
||||||
if test_dataset_path is None:
|
if test_dataset_path is None:
|
||||||
test_dataset_path = Path(__file__).parent / "test_dataset.json"
|
test_dataset_path = Path(__file__).parent / "sample_dataset.json"
|
||||||
|
|
||||||
if rag_api_url is None:
|
if rag_api_url is None:
|
||||||
rag_api_url = os.getenv("LIGHTRAG_API_URL", "http://localhost:8000")
|
rag_api_url = os.getenv("LIGHTRAG_API_URL", "http://localhost:8000")
|
||||||
|
|
@ -247,6 +248,7 @@ class RAGEvaluator:
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
print(f" ❌ Error evaluating: {str(e)}")
|
print(f" ❌ Error evaluating: {str(e)}")
|
||||||
print(f" 🔍 Full traceback:\n{traceback.format_exc()}\n")
|
print(f" 🔍 Full traceback:\n{traceback.format_exc()}\n")
|
||||||
result = {
|
result = {
|
||||||
|
|
@ -254,7 +256,7 @@ class RAGEvaluator:
|
||||||
"error": str(e),
|
"error": str(e),
|
||||||
"metrics": {},
|
"metrics": {},
|
||||||
"ragas_score": 0,
|
"ragas_score": 0,
|
||||||
"timestamp": datetime.now().isoformat()
|
"timestamp": datetime.now().isoformat(),
|
||||||
}
|
}
|
||||||
results.append(result)
|
results.append(result)
|
||||||
|
|
||||||
|
|
@ -301,18 +303,20 @@ class RAGEvaluator:
|
||||||
|
|
||||||
for idx, result in enumerate(results, 1):
|
for idx, result in enumerate(results, 1):
|
||||||
metrics = result.get("metrics", {})
|
metrics = result.get("metrics", {})
|
||||||
writer.writerow({
|
writer.writerow(
|
||||||
"test_number": idx,
|
{
|
||||||
"question": result.get("question", ""),
|
"test_number": idx,
|
||||||
"project": result.get("project", "unknown"),
|
"question": result.get("question", ""),
|
||||||
"faithfulness": f"{metrics.get('faithfulness', 0):.4f}",
|
"project": result.get("project", "unknown"),
|
||||||
"answer_relevance": f"{metrics.get('answer_relevance', 0):.4f}",
|
"faithfulness": f"{metrics.get('faithfulness', 0):.4f}",
|
||||||
"context_recall": f"{metrics.get('context_recall', 0):.4f}",
|
"answer_relevance": f"{metrics.get('answer_relevance', 0):.4f}",
|
||||||
"context_precision": f"{metrics.get('context_precision', 0):.4f}",
|
"context_recall": f"{metrics.get('context_recall', 0):.4f}",
|
||||||
"ragas_score": f"{result.get('ragas_score', 0):.4f}",
|
"context_precision": f"{metrics.get('context_precision', 0):.4f}",
|
||||||
"status": "success" if metrics else "error",
|
"ragas_score": f"{result.get('ragas_score', 0):.4f}",
|
||||||
"timestamp": result.get("timestamp", ""),
|
"status": "success" if metrics else "error",
|
||||||
})
|
"timestamp": result.get("timestamp", ""),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return csv_path
|
return csv_path
|
||||||
|
|
||||||
|
|
@ -331,7 +335,7 @@ class RAGEvaluator:
|
||||||
"timestamp": datetime.now().isoformat(),
|
"timestamp": datetime.now().isoformat(),
|
||||||
"total_tests": len(results),
|
"total_tests": len(results),
|
||||||
"elapsed_time_seconds": round(elapsed_time, 2),
|
"elapsed_time_seconds": round(elapsed_time, 2),
|
||||||
"results": results
|
"results": results,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Save JSON results
|
# Save JSON results
|
||||||
|
|
@ -380,7 +384,7 @@ async def main():
|
||||||
if rag_api_url:
|
if rag_api_url:
|
||||||
print(f"📡 RAG API URL: {rag_api_url}")
|
print(f"📡 RAG API URL: {rag_api_url}")
|
||||||
else:
|
else:
|
||||||
print(f"📡 RAG API URL: http://localhost:8000 (default)")
|
print("📡 RAG API URL: http://localhost:8000 (default)")
|
||||||
print("="*70 + "\n")
|
print("="*70 + "\n")
|
||||||
|
|
||||||
evaluator = RAGEvaluator(rag_api_url=rag_api_url)
|
evaluator = RAGEvaluator(rag_api_url=rag_api_url)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue