diff --git a/lightrag/evaluation/README.md b/lightrag/evaluation/README.md
index 3c4942cf..7bcb3ba7 100644
--- a/lightrag/evaluation/README.md
+++ b/lightrag/evaluation/README.md
@@ -1,6 +1,6 @@
-# 📊 Portfolio RAG Evaluation Framework
+# 📊 LightRAG Evaluation Framework
 
-RAGAS-based offline evaluation of your LightRAG portfolio system.
+RAGAS-based offline evaluation of your LightRAG system.
 
 ## What is RAGAS?
 
@@ -25,14 +25,16 @@ Instead of requiring human-annotated ground truth, RAGAS uses state-of-the-art e
 ```
 lightrag/evaluation/
 ├── eval_rag_quality.py          # Main evaluation script
-├── test_dataset.json            # Test cases with ground truth
+├── sample_dataset.json          # Generic LightRAG test cases (not personal data)
 ├── __init__.py                  # Package init
 ├── results/                     # Output directory
-│   ├── results_YYYYMMDD_HHMMSS.json   # Raw metrics
-│   └── report_YYYYMMDD_HHMMSS.html    # Beautiful HTML report
+│   ├── results_YYYYMMDD_HHMMSS.json   # Raw metrics in JSON
+│   └── results_YYYYMMDD_HHMMSS.csv    # Metrics in CSV format
 └── README.md                    # This file
 ```
 
+**Note:** `sample_dataset.json` contains **generic test questions** about LightRAG features (RAG systems, vector databases, deployment, etc.). This is **not personal portfolio data** - you can use these questions directly to test your own LightRAG installation.
+
 ---
 
 ## 🚀 Quick Start
@@ -68,41 +70,47 @@ Results are saved automatically in `lightrag/evaluation/results/`:
 
 ```
 results/
-├── results_20241023_143022.json   ← Raw metrics (for analysis)
-└── report_20241023_143022.html    ← Beautiful HTML report 🌟
+├── results_20241023_143022.json   ← Raw metrics in JSON format
+└── results_20241023_143022.csv    ← Metrics in CSV format (for spreadsheets)
 ```
 
-**Open the HTML report in your browser to see:**
+**Results include:**
 
 - ✅ Overall RAGAS score
-- 📊 Per-metric averages
+- 📊 Per-metric averages (Faithfulness, Answer Relevance, Context Recall, Context Precision)
 - 📋 Individual test case results
-- 📈 Performance breakdown
+- 📈 Performance breakdown by question
 
 ---
 
 ## 📝 Test Dataset
 
-Edit `test_dataset.json` to add your own test cases:
+The included `sample_dataset.json` contains **generic example questions** about LightRAG (RAG systems, vector databases, deployment, etc.). **This is NOT personal data** - it's meant as a template.
+
+**Important:** You should **replace these with test questions based on YOUR data** that you've injected into your RAG system.
+
+### Creating Your Own Test Cases
+
+Edit `sample_dataset.json` with questions relevant to your indexed documents:
 
 ```json
 {
   "test_cases": [
     {
-      "question": "Your test question here",
-      "ground_truth": "Expected answer with key information",
-      "project_context": "project_name"
+      "question": "Question based on your documents",
+      "ground_truth": "Expected answer from your data",
+      "context": "topic_category"
    }
  ]
}
```
 
-**Example:**
+**Example (for a technical portfolio):**
 
 ```json
 {
   "question": "Which projects use PyTorch?",
   "ground_truth": "The Neural ODE Project uses PyTorch with TorchODE library for continuous-time neural networks.",
-  "project_context": "neural_ode_project"
+  "context": "ml_projects"
 }
 ```
@@ -229,18 +237,21 @@ for i in range(3):
 
 ---
 
-## 🎯 For Portfolio/Interview
+## 🎯 Using Evaluation Results
 
-**What to Highlight:**
+**What the Metrics Tell You:**
 
-1. ✅ **Quality Metrics**: "RAG system achieves 85% RAGAS score"
-2. ✅ **Evaluation Framework**: "Automated quality assessment with RAGAS"
-3. ✅ **Best Practices**: "Offline evaluation pipeline for continuous improvement"
-4. ✅ **Production-Ready**: "Metrics-driven system optimization"
+1. ✅ **Quality Metrics**: Overall RAGAS score indicates system health
+2. ✅ **Evaluation Framework**: Automated quality assessment with RAGAS
+3. ✅ **Best Practices**: Offline evaluation pipeline for continuous improvement
+4. ✅ **Production-Ready**: Metrics-driven system optimization
 
-**Example Statement:**
+**Example Use Cases:**
 
-> "I built an evaluation framework using RAGAS that measures RAG quality across faithfulness, relevance, and context coverage. The system achieves 85% average RAGAS score, with automated HTML reports for quality tracking."
+- Track RAG quality over time as you update your documents
+- Compare different retrieval modes (local, global, hybrid, mix)
+- Measure impact of chunking strategy changes
+- Validate system performance before deployment
 
 ---
 
@@ -268,7 +279,7 @@ for i in range(3):
 pip install ragas datasets
 ```
 
-### "No test_dataset.json found"
+### "No sample_dataset.json found"
 
 Make sure you're running from the project root:
 
@@ -297,7 +308,7 @@ Current implementation uses ground truth as mock responses. Results will show pe
 
 ## 📝 Next Steps
 
-1. ✅ Review test dataset in `test_dataset.json`
+1. ✅ Review test dataset in `sample_dataset.json`
 2. ✅ Run `python lightrag/evaluation/eval_rag_quality.py`
-3. ✅ Open the HTML report in browser
+3. ✅ Review the JSON/CSV results in `lightrag/evaluation/results/`
 4. 🔄 Integrate with actual LightRAG system
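
The test-case layout documented in the README hunk above can be read with the standard library alone; a minimal sketch (the project-root relative path and the preview slicing are illustrative):

```python
# Sketch: load sample_dataset.json and list the test cases the evaluator consumes.
# Assumes the {"test_cases": [{"question", "ground_truth", "context"}, ...]} layout
# documented in the README above, and that the script runs from the project root.
import json
from pathlib import Path

dataset_path = Path("lightrag/evaluation/sample_dataset.json")
test_cases = json.loads(dataset_path.read_text(encoding="utf-8"))["test_cases"]

for case in test_cases:
    # Print a short preview of each question and its expected answer.
    print(f"[{case['context']}] {case['question']} -> {case['ground_truth'][:60]}...")
```
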
""" __all__ = ["RAGEvaluator"] + + +def __getattr__(name): + """Lazy import to avoid dependency errors when ragas is not installed.""" + if name == "RAGEvaluator": + from .eval_rag_quality import RAGEvaluator + + return RAGEvaluator + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/lightrag/evaluation/eval_rag_quality.py b/lightrag/evaluation/eval_rag_quality.py index 621b14e8..df5485b1 100644 --- a/lightrag/evaluation/eval_rag_quality.py +++ b/lightrag/evaluation/eval_rag_quality.py @@ -40,14 +40,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent.parent)) project_root = Path(__file__).parent.parent.parent load_dotenv(project_root / ".env") -# Setup OpenAI API key (required for RAGAS evaluation) -# Use LLM_BINDING_API_KEY if OPENAI_API_KEY is not set -if "OPENAI_API_KEY" not in os.environ: - if "LLM_BINDING_API_KEY" in os.environ: - os.environ["OPENAI_API_KEY"] = os.environ["LLM_BINDING_API_KEY"] - else: - os.environ["OPENAI_API_KEY"] = input("Enter your OpenAI API key: ") - +# Conditional imports - will raise ImportError if dependencies not installed try: from datasets import Dataset from ragas import evaluate @@ -57,10 +50,12 @@ try: context_recall, faithfulness, ) -except ImportError as e: - logger.error("❌ RAGAS import error: %s", e) - logger.error(" Install with: pip install ragas datasets") - sys.exit(1) + + RAGAS_AVAILABLE = True +except ImportError: + RAGAS_AVAILABLE = False + Dataset = None + evaluate = None CONNECT_TIMEOUT_SECONDS = 180.0 @@ -84,7 +79,39 @@ class RAGEvaluator: test_dataset_path: Path to test dataset JSON file rag_api_url: Base URL of LightRAG API (e.g., http://localhost:9621) If None, will try to read from environment or use default + + Raises: + ImportError: If ragas or datasets packages are not installed + ValueError: If LLM_BINDING is not set to 'openai' + EnvironmentError: If LLM_BINDING_API_KEY is not set """ + # Validate RAGAS dependencies are installed + if not RAGAS_AVAILABLE: + raise ImportError( + "RAGAS dependencies not installed. " + "Install with: pip install ragas datasets" + ) + + # Validate LLM_BINDING is set to openai (required for RAGAS) + llm_binding = os.getenv("LLM_BINDING", "").lower() + if llm_binding != "openai": + raise ValueError( + f"LLM_BINDING must be set to 'openai' for RAGAS evaluation. " + f"Current value: '{llm_binding or '(not set)'}'" + ) + + # Validate LLM_BINDING_API_KEY exists + llm_binding_key = os.getenv("LLM_BINDING_API_KEY") + if not llm_binding_key: + raise EnvironmentError( + "LLM_BINDING_API_KEY environment variable is not set. " + "This is required for RAGAS evaluation." 
+            )
+
+        # Set OPENAI_API_KEY from LLM_BINDING_API_KEY for RAGAS
+        os.environ["OPENAI_API_KEY"] = llm_binding_key
+        logger.info("✅ LLM_BINDING: openai")
+
         if test_dataset_path is None:
             test_dataset_path = Path(__file__).parent / "sample_dataset.json"
 
@@ -155,14 +182,26 @@ class RAGEvaluator:
             first_ref = references[0]
             logger.debug("🔍 First Reference Keys: %s", list(first_ref.keys()))
             if "content" in first_ref:
-                logger.debug(
-                    "🔍 Content Preview: %s...", first_ref["content"][:100]
-                )
+                content_preview = first_ref["content"]
+                if isinstance(content_preview, list) and content_preview:
+                    logger.debug(
+                        "🔍 Content Preview (first chunk): %s...",
+                        content_preview[0][:100],
+                    )
+                elif isinstance(content_preview, str):
+                    logger.debug("🔍 Content Preview: %s...", content_preview[:100])
 
         # Extract chunk content from enriched references
-        contexts = [
-            ref.get("content", "") for ref in references if ref.get("content")
-        ]
+        # Note: content is now a list of chunks per reference (one file may have multiple chunks)
+        contexts = []
+        for ref in references:
+            content = ref.get("content", [])
+            if isinstance(content, list):
+                # Flatten the list: each chunk becomes a separate context
+                contexts.extend(content)
+            elif isinstance(content, str):
+                # Backward compatibility: if content is still a string (shouldn't happen)
+                contexts.append(content)
 
         return {
             "answer": answer,
@@ -547,14 +586,14 @@ class RAGEvaluator:
         # Print benchmark metrics
         logger.info("")
         logger.info("%s", "=" * 70)
-        logger.info("📈 BENCHMARK RESULTS (Moyennes)")
+        logger.info("📈 BENCHMARK RESULTS (Average)")
         logger.info("%s", "=" * 70)
         avg = benchmark_stats["average_metrics"]
-        logger.info("Moyenne Faithfulness: %.4f", avg["faithfulness"])
-        logger.info("Moyenne Answer Relevance: %.4f", avg["answer_relevance"])
-        logger.info("Moyenne Context Recall: %.4f", avg["context_recall"])
-        logger.info("Moyenne Context Precision: %.4f", avg["context_precision"])
-        logger.info("Moyenne RAGAS Score: %.4f", avg["ragas_score"])
+        logger.info("Average Faithfulness: %.4f", avg["faithfulness"])
+        logger.info("Average Answer Relevance: %.4f", avg["answer_relevance"])
+        logger.info("Average Context Recall: %.4f", avg["context_recall"])
+        logger.info("Average Context Precision: %.4f", avg["context_precision"])
+        logger.info("Average RAGAS Score: %.4f", avg["ragas_score"])
         logger.info("")
         logger.info(
             "Min RAGAS Score: %.4f",
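
For reference, the constructor's new validation path and the lazy re-export can be exercised end to end; a minimal sketch based on the module docstring and the checks added above (the placeholder key and the `asyncio` wrapper are illustrative; `ragas`/`datasets` and a reachable LightRAG API are assumed):

```python
# Sketch: drive RAGEvaluator through the new validation flow.
import asyncio
import os

# Both variables are read by RAGEvaluator; set them before importing the package.
os.environ.setdefault("LLM_BINDING", "openai")
os.environ.setdefault("LLM_BINDING_API_KEY", "sk-...")  # placeholder, not a real key

from lightrag.evaluation import RAGEvaluator  # resolved lazily via __getattr__


async def main() -> None:
    try:
        # Uses sample_dataset.json next to the module when no path is given.
        evaluator = RAGEvaluator()
    except (ImportError, ValueError, EnvironmentError) as exc:
        # Raised when ragas/datasets are missing or the LLM binding is misconfigured.
        print(f"Evaluation prerequisites not met: {exc}")
        return
    results = await evaluator.run()
    print(results)


if __name__ == "__main__":
    asyncio.run(main())
```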