From 026bca00d94989cb1187c44ec24dcce74ec86ff5 Mon Sep 17 00:00:00 2001
From: anouarbm
Date: Sun, 2 Nov 2025 16:16:00 +0100
Subject: [PATCH] fix: Use actual retrieved contexts for RAGAS evaluation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Critical Fix: Contexts vs Ground Truth**
- RAGAS metrics now evaluate actual retrieval performance
- Previously: Used ground_truth as contexts (always perfect scores)
- Now: Uses retrieved documents from LightRAG API (real evaluation)

**Changes to generate_rag_response (lines 100-156)**:
- Remove unused 'context' parameter
- Change return type: Dict[str, str] → Dict[str, Any]
- Extract contexts as list of strings from references[].text
- Return 'contexts' key instead of 'context' (JSON dump)
- Add response.raise_for_status() for better error handling
- Add httpx.HTTPStatusError exception handler

**Changes to evaluate_responses (lines 180-191)**:
- Line 183: Extract retrieved_contexts from rag_response
- Line 190: Use [retrieved_contexts] instead of [[ground_truth]]
- Now correctly evaluates: retrieval quality, not ground_truth quality

**Impact on RAGAS Metrics**:
- Context Precision: Now ranks actual retrieved docs by relevance
- Context Recall: Compares ground_truth against actual retrieval
- Faithfulness: Verifies answer based on actual retrieved contexts
- Answer Relevance: Unchanged (question-answer relevance)

Fixes incorrect evaluation methodology. Based on RAGAS documentation:
- contexts = retrieved documents from RAG system
- ground_truth = reference answer for context_recall metric

References:
- https://docs.ragas.io/en/stable/concepts/components/eval_dataset/
- https://docs.ragas.io/en/stable/concepts/metrics/
---
 lightrag/evaluation/eval_rag_quality.py | 49 ++++++++++++-------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/lightrag/evaluation/eval_rag_quality.py b/lightrag/evaluation/eval_rag_quality.py
index c8f8be7b..1a26a103 100644
--- a/lightrag/evaluation/eval_rag_quality.py
+++ b/lightrag/evaluation/eval_rag_quality.py
@@ -100,52 +100,46 @@ class RAGEvaluator:
     async def generate_rag_response(
         self,
         question: str,
-        context: str = None,  # Not used - actual context comes from LightRAG
-    ) -> Dict[str, str]:
+    ) -> Dict[str, Any]:
         """
-        Generate RAG response by calling LightRAG API
-
-        Calls the actual LightRAG /query endpoint instead of using mock data.
+        Generate RAG response by calling LightRAG API.

         Args:
-            question: The user query
-            context: Ignored (for compatibility), actual context from LightRAG
+            question: The user query.

         Returns:
-            Dict with 'answer' and 'context' keys
+            Dictionary with 'answer' and 'contexts' keys.
+            'contexts' is a list of strings (one per retrieved document).

         Raises:
-            Exception: If LightRAG API is unavailable
+            Exception: If LightRAG API is unavailable.
         """
         try:
             async with httpx.AsyncClient(timeout=60.0) as client:
-                # Prepare request to LightRAG API
                 payload = {
                     "query": question,
-                    "mode": "mix",  # Recommended: combines local & global
+                    "mode": "mix",
                     "include_references": True,
                     "response_type": "Multiple Paragraphs",
                     "top_k": 10,
                 }

-                # Call LightRAG /query endpoint
                 response = await client.post(
                     f"{self.rag_api_url}/query",
                     json=payload,
                 )
-
-                if response.status_code != 200:
-                    raise Exception(
-                        f"LightRAG API error {response.status_code}: {response.text}"
-                    )
-
+                response.raise_for_status()  # Better error handling
                 result = response.json()

+                # Extract text content from each reference document
+                references = result.get("references", [])
+                contexts = [
+                    ref.get("text", "") for ref in references if ref.get("text")
+                ]
+
                 return {
                     "answer": result.get("response", "No response generated"),
-                    "context": json.dumps(result.get("references", []))
-                    if result.get("references")
-                    else "",
+                    "contexts": contexts,  # List of strings, not JSON dump
                 }

         except httpx.ConnectError:
@@ -154,6 +148,10 @@ class RAGEvaluator:
                 f"  Make sure LightRAG server is running:\n"
                 f"  python -m lightrag.api.lightrag_server"
             )
+        except httpx.HTTPStatusError as e:
+            raise Exception(
+                f"LightRAG API error {e.response.status_code}: {e.response.text}"
+            )
         except Exception as e:
             raise Exception(f"Error calling LightRAG API: {str(e)}")

@@ -179,14 +177,15 @@ class RAGEvaluator:
         # Generate RAG response by calling actual LightRAG API
         rag_response = await self.generate_rag_response(question=question)

-        # Prepare dataset for RAGAS evaluation
+        # *** CRITICAL FIX: Use actual retrieved contexts, NOT ground_truth ***
+        retrieved_contexts = rag_response["contexts"]
+
+        # Prepare dataset for RAGAS evaluation with CORRECT contexts
         eval_dataset = Dataset.from_dict(
             {
                 "question": [question],
                 "answer": [rag_response["answer"]],
-                "contexts": [
-                    [ground_truth]
-                ],  # RAGAS expects list of context strings
+                "contexts": [retrieved_contexts],
                 "ground_truth": [ground_truth],
             }
         )