diff --git a/lightrag/evaluation/README.md b/lightrag/evaluation/README.md
index 3c4942cf..7bcb3ba7 100644
--- a/lightrag/evaluation/README.md
+++ b/lightrag/evaluation/README.md
@@ -1,6 +1,6 @@
-# 📊 Portfolio RAG Evaluation Framework
+# 📊 LightRAG Evaluation Framework
 
-RAGAS-based offline evaluation of your LightRAG portfolio system.
+RAGAS-based offline evaluation of your LightRAG system.
 
 ## What is RAGAS?
 
@@ -25,14 +25,16 @@ Instead of requiring human-annotated ground truth, RAGAS uses state-of-the-art e
 ```
 lightrag/evaluation/
 ├── eval_rag_quality.py          # Main evaluation script
-├── test_dataset.json            # Test cases with ground truth
+├── sample_dataset.json          # Generic LightRAG test cases (not personal data)
 ├── __init__.py                  # Package init
 ├── results/                     # Output directory
-│   ├── results_YYYYMMDD_HHMMSS.json   # Raw metrics
-│   └── report_YYYYMMDD_HHMMSS.html    # Beautiful HTML report
+│   ├── results_YYYYMMDD_HHMMSS.json   # Raw metrics in JSON
+│   └── results_YYYYMMDD_HHMMSS.csv    # Metrics in CSV format
 └── README.md                    # This file
 ```
 
+**Note:** `sample_dataset.json` contains **generic test questions** about LightRAG features (RAG systems, vector databases, deployment, etc.). This is **not personal portfolio data** - you can use these questions directly to test your own LightRAG installation.
+
 ---
 
 ## 🚀 Quick Start
@@ -68,41 +70,47 @@ Results are saved automatically in `lightrag/evaluation/results/`:
 
 ```
 results/
-├── results_20241023_143022.json   ← Raw metrics (for analysis)
-└── report_20241023_143022.html    ← Beautiful HTML report 🌟
+├── results_20241023_143022.json   ← Raw metrics in JSON format
+└── results_20241023_143022.csv    ← Metrics in CSV format (for spreadsheets)
 ```
 
-**Open the HTML report in your browser to see:**
+**Results include:**
 
 - ✅ Overall RAGAS score
-- 📊 Per-metric averages
+- 📊 Per-metric averages (Faithfulness, Answer Relevance, Context Recall, Context Precision)
 - 📋 Individual test case results
-- 📈 Performance breakdown
+- 📈 Performance breakdown by question
 
 ---
 
 ## 📝 Test Dataset
 
-Edit `test_dataset.json` to add your own test cases:
+The included `sample_dataset.json` contains **generic example questions** about LightRAG (RAG systems, vector databases, deployment, etc.). **This is NOT personal data** - it's meant as a template.
+
+**Important:** You should **replace these with test questions based on YOUR data** that you've injected into your RAG system.
+
+### Creating Your Own Test Cases
+
+Edit `sample_dataset.json` with questions relevant to your indexed documents:
 
 ```json
 {
   "test_cases": [
     {
-      "question": "Your test question here",
-      "ground_truth": "Expected answer with key information",
-      "project_context": "project_name"
+      "question": "Question based on your documents",
+      "ground_truth": "Expected answer from your data",
+      "context": "topic_category"
    }
  ]
}
```
 
-**Example:**
+**Example (for a technical portfolio):**
 
 ```json
 {
   "question": "Which projects use PyTorch?",
   "ground_truth": "The Neural ODE Project uses PyTorch with TorchODE library for continuous-time neural networks.",
-  "project_context": "neural_ode_project"
+  "context": "ml_projects"
 }
 ```
@@ -229,18 +237,21 @@ for i in range(3):
 
 ---
 
-## 🎯 For Portfolio/Interview
+## 🎯 Using Evaluation Results
 
-**What to Highlight:**
+**What the Metrics Tell You:**
 
-1. ✅ **Quality Metrics**: "RAG system achieves 85% RAGAS score"
-2. ✅ **Evaluation Framework**: "Automated quality assessment with RAGAS"
-3. ✅ **Best Practices**: "Offline evaluation pipeline for continuous improvement"
-4. ✅ **Production-Ready**: "Metrics-driven system optimization"
+1. ✅ **Quality Metrics**: Overall RAGAS score indicates system health
+2. ✅ **Evaluation Framework**: Automated quality assessment with RAGAS
+3. ✅ **Best Practices**: Offline evaluation pipeline for continuous improvement
+4. ✅ **Production-Ready**: Metrics-driven system optimization
 
-**Example Statement:**
+**Example Use Cases:**
 
-> "I built an evaluation framework using RAGAS that measures RAG quality across faithfulness, relevance, and context coverage. The system achieves 85% average RAGAS score, with automated HTML reports for quality tracking."
+- Track RAG quality over time as you update your documents
+- Compare different retrieval modes (local, global, hybrid, mix)
+- Measure impact of chunking strategy changes
+- Validate system performance before deployment
 
 ---
 
@@ -268,7 +279,7 @@ for i in range(3):
 pip install ragas datasets
 ```
 
-### "No test_dataset.json found"
+### "No sample_dataset.json found"
 
 Make sure you're running from the project root:
 
@@ -297,7 +308,7 @@ Current implementation uses ground truth as mock responses. Results will show pe
 
 ## 📝 Next Steps
 
-1. ✅ Review test dataset in `test_dataset.json`
+1. ✅ Review test dataset in `sample_dataset.json`
 2. ✅ Run `python lightrag/evaluation/eval_rag_quality.py`
-3. ✅ Open the HTML report in browser
+3. ✅ Review the JSON/CSV results in `lightrag/evaluation/results/`
 4. 🔄 Integrate with actual LightRAG system
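
The test-case layout documented in the README hunk above can be read with the standard library alone; a minimal sketch (the project-root relative path and the preview slicing are illustrative):

```python
# Sketch: load sample_dataset.json and list the test cases the evaluator consumes.
# Assumes the {"test_cases": [{"question", "ground_truth", "context"}, ...]} layout
# documented in the README above, and that the script runs from the project root.
import json
from pathlib import Path

dataset_path = Path("lightrag/evaluation/sample_dataset.json")
test_cases = json.loads(dataset_path.read_text(encoding="utf-8"))["test_cases"]

for case in test_cases:
    # Print a short preview of each question and its expected answer.
    print(f"[{case['context']}] {case['question']} -> {case['ground_truth'][:60]}...")
```
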
""" __all__ = ["RAGEvaluator"] + + +def __getattr__(name): + """Lazy import to avoid dependency errors when ragas is not installed.""" + if name == "RAGEvaluator": + from .eval_rag_quality import RAGEvaluator + + return RAGEvaluator + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/lightrag/evaluation/eval_rag_quality.py b/lightrag/evaluation/eval_rag_quality.py index 621b14e8..df5485b1 100644 --- a/lightrag/evaluation/eval_rag_quality.py +++ b/lightrag/evaluation/eval_rag_quality.py @@ -40,14 +40,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent.parent)) project_root = Path(__file__).parent.parent.parent load_dotenv(project_root / ".env") -# Setup OpenAI API key (required for RAGAS evaluation) -# Use LLM_BINDING_API_KEY if OPENAI_API_KEY is not set -if "OPENAI_API_KEY" not in os.environ: - if "LLM_BINDING_API_KEY" in os.environ: - os.environ["OPENAI_API_KEY"] = os.environ["LLM_BINDING_API_KEY"] - else: - os.environ["OPENAI_API_KEY"] = input("Enter your OpenAI API key: ") - +# Conditional imports - will raise ImportError if dependencies not installed try: from datasets import Dataset from ragas import evaluate @@ -57,10 +50,12 @@ try: context_recall, faithfulness, ) -except ImportError as e: - logger.error("❌ RAGAS import error: %s", e) - logger.error(" Install with: pip install ragas datasets") - sys.exit(1) + + RAGAS_AVAILABLE = True +except ImportError: + RAGAS_AVAILABLE = False + Dataset = None + evaluate = None CONNECT_TIMEOUT_SECONDS = 180.0 @@ -84,7 +79,39 @@ class RAGEvaluator: test_dataset_path: Path to test dataset JSON file rag_api_url: Base URL of LightRAG API (e.g., http://localhost:9621) If None, will try to read from environment or use default + + Raises: + ImportError: If ragas or datasets packages are not installed + ValueError: If LLM_BINDING is not set to 'openai' + EnvironmentError: If LLM_BINDING_API_KEY is not set """ + # Validate RAGAS dependencies are installed + if not RAGAS_AVAILABLE: + raise ImportError( + "RAGAS dependencies not installed. " + "Install with: pip install ragas datasets" + ) + + # Validate LLM_BINDING is set to openai (required for RAGAS) + llm_binding = os.getenv("LLM_BINDING", "").lower() + if llm_binding != "openai": + raise ValueError( + f"LLM_BINDING must be set to 'openai' for RAGAS evaluation. " + f"Current value: '{llm_binding or '(not set)'}'" + ) + + # Validate LLM_BINDING_API_KEY exists + llm_binding_key = os.getenv("LLM_BINDING_API_KEY") + if not llm_binding_key: + raise EnvironmentError( + "LLM_BINDING_API_KEY environment variable is not set. " + "This is required for RAGAS evaluation." 
+            )
+
+        # Set OPENAI_API_KEY from LLM_BINDING_API_KEY for RAGAS
+        os.environ["OPENAI_API_KEY"] = llm_binding_key
+        logger.info("✅ LLM_BINDING: openai")
+
         if test_dataset_path is None:
             test_dataset_path = Path(__file__).parent / "sample_dataset.json"
 
@@ -155,14 +182,26 @@ class RAGEvaluator:
             first_ref = references[0]
             logger.debug("🔍 First Reference Keys: %s", list(first_ref.keys()))
             if "content" in first_ref:
-                logger.debug(
-                    "🔍 Content Preview: %s...", first_ref["content"][:100]
-                )
+                content_preview = first_ref["content"]
+                if isinstance(content_preview, list) and content_preview:
+                    logger.debug(
+                        "🔍 Content Preview (first chunk): %s...",
+                        content_preview[0][:100],
+                    )
+                elif isinstance(content_preview, str):
+                    logger.debug("🔍 Content Preview: %s...", content_preview[:100])
 
         # Extract chunk content from enriched references
-        contexts = [
-            ref.get("content", "") for ref in references if ref.get("content")
-        ]
+        # Note: content is now a list of chunks per reference (one file may have multiple chunks)
+        contexts = []
+        for ref in references:
+            content = ref.get("content", [])
+            if isinstance(content, list):
+                # Flatten the list: each chunk becomes a separate context
+                contexts.extend(content)
+            elif isinstance(content, str):
+                # Backward compatibility: if content is still a string (shouldn't happen)
+                contexts.append(content)
 
         return {
             "answer": answer,
@@ -547,14 +586,14 @@ class RAGEvaluator:
         # Print benchmark metrics
         logger.info("")
         logger.info("%s", "=" * 70)
-        logger.info("📈 BENCHMARK RESULTS (Moyennes)")
+        logger.info("📈 BENCHMARK RESULTS (Average)")
         logger.info("%s", "=" * 70)
         avg = benchmark_stats["average_metrics"]
-        logger.info("Moyenne Faithfulness: %.4f", avg["faithfulness"])
-        logger.info("Moyenne Answer Relevance: %.4f", avg["answer_relevance"])
-        logger.info("Moyenne Context Recall: %.4f", avg["context_recall"])
-        logger.info("Moyenne Context Precision: %.4f", avg["context_precision"])
-        logger.info("Moyenne RAGAS Score: %.4f", avg["ragas_score"])
+        logger.info("Average Faithfulness: %.4f", avg["faithfulness"])
+        logger.info("Average Answer Relevance: %.4f", avg["answer_relevance"])
+        logger.info("Average Context Recall: %.4f", avg["context_recall"])
+        logger.info("Average Context Precision: %.4f", avg["context_precision"])
+        logger.info("Average RAGAS Score: %.4f", avg["ragas_score"])
         logger.info("")
         logger.info(
             "Min RAGAS Score: %.4f",
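
For reference, the constructor's new validation path and the lazy re-export can be exercised end to end; a minimal sketch based on the module docstring and the checks added above (the placeholder key and the `asyncio` wrapper are illustrative; `ragas`/`datasets` and a reachable LightRAG API are assumed):

```python
# Sketch: drive RAGEvaluator through the new validation flow.
import asyncio
import os

# Both variables are read by RAGEvaluator; set them before importing the package.
os.environ.setdefault("LLM_BINDING", "openai")
os.environ.setdefault("LLM_BINDING_API_KEY", "sk-...")  # placeholder, not a real key

from lightrag.evaluation import RAGEvaluator  # resolved lazily via __getattr__


async def main() -> None:
    try:
        # Uses sample_dataset.json next to the module when no path is given.
        evaluator = RAGEvaluator()
    except (ImportError, ValueError, EnvironmentError) as exc:
        # Raised when ragas/datasets are missing or the LLM binding is misconfigured.
        print(f"Evaluation prerequisites not met: {exc}")
        return
    results = await evaluator.run()
    print(results)


if __name__ == "__main__":
    asyncio.run(main())
```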