Implement two-stage pipeline for RAG evaluation with separate semaphores

• Split RAG gen and eval stages
• Add rag_semaphore for stage 1
• Add eval_semaphore for stage 2
• Improve concurrency control
• Update connection pool limits
This commit is contained in:
yangdx 2025-11-05 00:36:09 +08:00
parent d36be1f499
commit 83715a3ac1

View file

@@ -347,28 +347,30 @@ class RAGEvaluator:
self, self,
idx: int, idx: int,
test_case: Dict[str, str], test_case: Dict[str, str],
semaphore: asyncio.Semaphore, rag_semaphore: asyncio.Semaphore,
eval_semaphore: asyncio.Semaphore,
client: httpx.AsyncClient, client: httpx.AsyncClient,
progress_counter: Dict[str, int], progress_counter: Dict[str, int],
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
Evaluate a single test case with concurrency control Evaluate a single test case with two-stage pipeline concurrency control
Args: Args:
idx: Test case index (1-based) idx: Test case index (1-based)
test_case: Test case dictionary with question and ground_truth test_case: Test case dictionary with question and ground_truth
semaphore: Semaphore to control concurrency rag_semaphore: Semaphore to control RAG generation concurrency (Stage 1)
eval_semaphore: Semaphore to control RAGAS evaluation concurrency (Stage 2)
client: Shared httpx AsyncClient for connection pooling client: Shared httpx AsyncClient for connection pooling
progress_counter: Shared dictionary for progress tracking progress_counter: Shared dictionary for progress tracking
Returns: Returns:
Evaluation result dictionary Evaluation result dictionary
""" """
async with semaphore: question = test_case["question"]
question = test_case["question"] ground_truth = test_case["ground_truth"]
ground_truth = test_case["ground_truth"]
# Generate RAG response by calling actual LightRAG API # Stage 1: Generate RAG response (controlled by rag_semaphore)
async with rag_semaphore:
try: try:
rag_response = await self.generate_rag_response( rag_response = await self.generate_rag_response(
question=question, client=client question=question, client=client
@@ -385,32 +387,31 @@ class RAGEvaluator:
"timestamp": datetime.now().isoformat(), "timestamp": datetime.now().isoformat(),
} }
# *** CRITICAL FIX: Use actual retrieved contexts, NOT ground_truth *** # *** CRITICAL FIX: Use actual retrieved contexts, NOT ground_truth ***
retrieved_contexts = rag_response["contexts"] retrieved_contexts = rag_response["contexts"]
# DEBUG: Print what was actually retrieved (only in debug mode) # DEBUG: Print what was actually retrieved (only in debug mode)
logger.debug( logger.debug("📝 Test %s: Retrieved %s contexts", idx, len(retrieved_contexts))
"📝 Test %s: Retrieved %s contexts", idx, len(retrieved_contexts)
)
# Prepare dataset for RAGAS evaluation with CORRECT contexts # Prepare dataset for RAGAS evaluation with CORRECT contexts
eval_dataset = Dataset.from_dict( eval_dataset = Dataset.from_dict(
{ {
"question": [question], "question": [question],
"answer": [rag_response["answer"]], "answer": [rag_response["answer"]],
"contexts": [retrieved_contexts], "contexts": [retrieved_contexts],
"ground_truth": [ground_truth], "ground_truth": [ground_truth],
} }
) )
# Run RAGAS evaluation # Stage 2: Run RAGAS evaluation (controlled by eval_semaphore)
# IMPORTANT: Create fresh metric instances for each evaluation to avoid # IMPORTANT: Create fresh metric instances for each evaluation to avoid
# concurrent state conflicts when multiple tasks run in parallel # concurrent state conflicts when multiple tasks run in parallel
async with eval_semaphore:
pbar = None pbar = None
try: try:
# Create standard tqdm progress bar for RAGAS evaluation # Create standard tqdm progress bar for RAGAS evaluation
pbar = tqdm(total=4, desc=f"Eval-{idx}", leave=True) pbar = tqdm(total=4, desc=f"Eval-{idx}", leave=True)
eval_results = evaluate( eval_results = evaluate(
dataset=eval_dataset, dataset=eval_dataset,
metrics=[ metrics=[
@@ -485,21 +486,25 @@ class RAGEvaluator:
async def evaluate_responses(self) -> List[Dict[str, Any]]: async def evaluate_responses(self) -> List[Dict[str, Any]]:
""" """
Evaluate all test cases in parallel and return metrics Evaluate all test cases in parallel with two-stage pipeline and return metrics
Returns: Returns:
List of evaluation results with metrics List of evaluation results with metrics
""" """
# Get evaluation concurrency from environment (default to 1 for serial evaluation) # Get evaluation concurrency from environment (default to 2 for parallel evaluation)
max_async = int(os.getenv("EVAL_MAX_CONCURRENT", "2")) max_async = int(os.getenv("EVAL_MAX_CONCURRENT", "2"))
logger.info("%s", "=" * 70) logger.info("%s", "=" * 70)
logger.info("🚀 Starting RAGAS Evaluation of LightRAG System") logger.info("🚀 Starting RAGAS Evaluation of LightRAG System")
logger.info("🔧 Concurrent evaluations: %s", max_async) logger.info("🔧 Two-Stage Pipeline Configuration:")
logger.info(" • RAGAS Evaluation (Stage 2): %s concurrent", max_async)
logger.info("%s", "=" * 70) logger.info("%s", "=" * 70)
# Create semaphore to limit concurrent evaluations # Create two-stage pipeline semaphores
semaphore = asyncio.Semaphore(max_async) # Stage 1: RAG generation - allow +1 concurrency to keep evaluation fed
rag_semaphore = asyncio.Semaphore(max_async + 1)
# Stage 2: RAGAS evaluation - primary bottleneck
eval_semaphore = asyncio.Semaphore(max_async)
# Create progress counter (shared across all tasks) # Create progress counter (shared across all tasks)
progress_counter = {"completed": 0} progress_counter = {"completed": 0}
@@ -512,20 +517,25 @@ class RAGEvaluator:
read=READ_TIMEOUT_SECONDS, read=READ_TIMEOUT_SECONDS,
) )
limits = httpx.Limits( limits = httpx.Limits(
max_connections=max_async * 2, # Allow some buffer max_connections=(max_async + 1) * 2, # Allow buffer for RAG stage
max_keepalive_connections=max_async, max_keepalive_connections=max_async + 1,
) )
async with httpx.AsyncClient(timeout=timeout, limits=limits) as client: async with httpx.AsyncClient(timeout=timeout, limits=limits) as client:
# Create tasks for all test cases # Create tasks for all test cases
tasks = [ tasks = [
self.evaluate_single_case( self.evaluate_single_case(
idx, test_case, semaphore, client, progress_counter idx,
test_case,
rag_semaphore,
eval_semaphore,
client,
progress_counter,
) )
for idx, test_case in enumerate(self.test_cases, 1) for idx, test_case in enumerate(self.test_cases, 1)
] ]
# Run all evaluations in parallel (limited by semaphore) # Run all evaluations in parallel (limited by two-stage semaphores)
results = await asyncio.gather(*tasks) results = await asyncio.gather(*tasks)
return list(results) return list(results)