From 026bca00d94989cb1187c44ec24dcce74ec86ff5 Mon Sep 17 00:00:00 2001
From: anouarbm
Date: Sun, 2 Nov 2025 16:16:00 +0100
Subject: [PATCH] fix: Use actual retrieved contexts for RAGAS evaluation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Critical Fix: Contexts vs Ground Truth**
- RAGAS metrics now evaluate actual retrieval performance
- Previously: Used ground_truth as contexts (always perfect scores)
- Now: Uses retrieved documents from LightRAG API (real evaluation)

**Changes to generate_rag_response (lines 100-156)**:
- Remove unused 'context' parameter
- Change return type: Dict[str, str] → Dict[str, Any]
- Extract contexts as list of strings from references[].text
- Return 'contexts' key instead of 'context' (JSON dump)
- Add response.raise_for_status() for better error handling
- Add httpx.HTTPStatusError exception handler

**Changes to evaluate_responses (lines 180-191)**:
- Line 183: Extract retrieved_contexts from rag_response
- Line 190: Use [retrieved_contexts] instead of [[ground_truth]]
- Now correctly evaluates: retrieval quality, not ground_truth quality

**Impact on RAGAS Metrics**:
- Context Precision: Now ranks actual retrieved docs by relevance
- Context Recall: Compares ground_truth against actual retrieval
- Faithfulness: Verifies answer based on actual retrieved contexts
- Answer Relevance: Unchanged (question-answer relevance)

Fixes incorrect evaluation methodology. Based on RAGAS documentation:
- contexts = retrieved documents from RAG system
- ground_truth = reference answer for context_recall metric

References:
- https://docs.ragas.io/en/stable/concepts/components/eval_dataset/
- https://docs.ragas.io/en/stable/concepts/metrics/
---
 lightrag/evaluation/eval_rag_quality.py | 49 ++++++++++++-------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/lightrag/evaluation/eval_rag_quality.py b/lightrag/evaluation/eval_rag_quality.py
index c8f8be7b..1a26a103 100644
--- a/lightrag/evaluation/eval_rag_quality.py
+++ b/lightrag/evaluation/eval_rag_quality.py
@@ -100,52 +100,46 @@ class RAGEvaluator:
     async def generate_rag_response(
         self,
         question: str,
-        context: str = None,  # Not used - actual context comes from LightRAG
-    ) -> Dict[str, str]:
+    ) -> Dict[str, Any]:
         """
-        Generate RAG response by calling LightRAG API
-
-        Calls the actual LightRAG /query endpoint instead of using mock data.
+        Generate RAG response by calling LightRAG API.

         Args:
-            question: The user query
-            context: Ignored (for compatibility), actual context from LightRAG
+            question: The user query.

         Returns:
-            Dict with 'answer' and 'context' keys
+            Dictionary with 'answer' and 'contexts' keys.
+            'contexts' is a list of strings (one per retrieved document).

         Raises:
-            Exception: If LightRAG API is unavailable
+            Exception: If LightRAG API is unavailable.
         """
         try:
             async with httpx.AsyncClient(timeout=60.0) as client:
-                # Prepare request to LightRAG API
                 payload = {
                     "query": question,
-                    "mode": "mix",  # Recommended: combines local & global
+                    "mode": "mix",
                     "include_references": True,
                     "response_type": "Multiple Paragraphs",
                     "top_k": 10,
                 }

-                # Call LightRAG /query endpoint
                 response = await client.post(
                     f"{self.rag_api_url}/query",
                     json=payload,
                 )
-
-                if response.status_code != 200:
-                    raise Exception(
-                        f"LightRAG API error {response.status_code}: {response.text}"
-                    )
-
+                response.raise_for_status()  # Better error handling
                 result = response.json()

+                # Extract text content from each reference document
+                references = result.get("references", [])
+                contexts = [
+                    ref.get("text", "") for ref in references if ref.get("text")
+                ]
+
                 return {
                     "answer": result.get("response", "No response generated"),
-                    "context": json.dumps(result.get("references", []))
-                    if result.get("references")
-                    else "",
+                    "contexts": contexts,  # List of strings, not JSON dump
                 }

         except httpx.ConnectError:
@@ -154,6 +148,10 @@ class RAGEvaluator:
                 f"  Make sure LightRAG server is running:\n"
                 f"  python -m lightrag.api.lightrag_server"
             )
+        except httpx.HTTPStatusError as e:
+            raise Exception(
+                f"LightRAG API error {e.response.status_code}: {e.response.text}"
+            )
         except Exception as e:
             raise Exception(f"Error calling LightRAG API: {str(e)}")

@@ -179,14 +177,15 @@ class RAGEvaluator:
         # Generate RAG response by calling actual LightRAG API
         rag_response = await self.generate_rag_response(question=question)

-        # Prepare dataset for RAGAS evaluation
+        # *** CRITICAL FIX: Use actual retrieved contexts, NOT ground_truth ***
+        retrieved_contexts = rag_response["contexts"]
+
+        # Prepare dataset for RAGAS evaluation with CORRECT contexts
         eval_dataset = Dataset.from_dict(
             {
                 "question": [question],
                 "answer": [rag_response["answer"]],
-                "contexts": [
-                    [ground_truth]
-                ],  # RAGAS expects list of context strings
+                "contexts": [retrieved_contexts],
                 "ground_truth": [ground_truth],
             }
         )