feat: unify comparative evals (#916)
## Description

- Comparative Framework: independent benchmarking system for evaluating different RAG/QA systems
- HotpotQA Dataset: a 50-instance corpus and corresponding QA pairs for standardized evaluation
- Base Class: abstract `QABenchmarkRAG` with an async pipeline for document ingestion and question answering
- Three Benchmarks: standalone implementations for Mem0, LightRAG, and Graphiti, each with its own dependencies

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Parent: 9d5835042a
Commit: cfe9c949a7
10 changed files with 825 additions and 0 deletions
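For orientation before the file-by-file diff, a minimal sketch of what a new benchmark adapter looks like against the abstract base class added in `evals/comparative_eval/qa_benchmark_base.py`. `QABenchmarkMySystem` is a hypothetical placeholder; the hook names come from the base class itself.

```python
from typing import Any

from qa_benchmark_base import QABenchmarkRAG


class QABenchmarkMySystem(QABenchmarkRAG):
    """Hypothetical adapter showing the hooks every benchmark implements."""

    async def initialize_rag(self) -> Any:
        # Create and return the system's client; the base class stores it on self.rag_client.
        ...

    async def cleanup_rag(self) -> None:
        # Release any connections or storages opened in initialize_rag.
        ...

    async def insert_document(self, document: str, document_id: int) -> None:
        # Ingest one corpus document into the system.
        ...

    async def query_rag(self, question: str) -> str:
        # Return the system's answer for a single question.
        ...

    @property
    def system_name(self) -> str:
        return "MySystem"
```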
evals/comparative_eval/README.md (new file)
@@ -0,0 +1,40 @@

# Comparative QA Benchmarks

Independent benchmarks for different QA/RAG systems using the HotpotQA dataset.

## Dataset Files

- `hotpot_50_corpus.json` - 50 instances from HotpotQA
- `hotpot_50_qa_pairs.json` - Corresponding question-answer pairs

## Benchmarks

Each benchmark can be run independently with the appropriate dependencies installed:

### Mem0

```bash
pip install mem0ai openai
python qa_benchmark_mem0.py
```

### LightRAG

```bash
pip install "lightrag-hku[api]"
python qa_benchmark_lightrag.py
```

### Graphiti

```bash
pip install graphiti-core
python qa_benchmark_graphiti.py
```

## Environment

Create a `.env` file with the required API keys:

- `OPENAI_API_KEY` (all benchmarks)
- `NEO4J_URI`, `NEO4J_USER`, `NEO4J_PASSWORD` (Graphiti only)

## Usage

Each benchmark inherits from the `QABenchmarkRAG` base class and can be configured independently; a configuration example is sketched below.
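A minimal sketch, following the `__main__` blocks of the benchmark scripts (the Mem0 benchmark is shown; class and file names are taken from this PR):

```python
from qa_benchmark_mem0 import QABenchmarkMem0, Mem0Config

# Small smoke-test run: limit the corpus and question set.
config = Mem0Config(
    corpus_limit=5,
    qa_limit=3,
    print_results=True,
)

benchmark = QABenchmarkMem0.from_jsons(
    corpus_file="hotpot_50_corpus.json",
    qa_pairs_file="hotpot_50_qa_pairs.json",
    config=config,
)

# Runs ingestion and question answering, then writes hotpot_qa_mem0_results.json.
results = benchmark.run()
```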
## Results

Updated results will be posted soon.
@@ -0,0 +1,38 @@

#!/usr/bin/env python3
"""Simple script to calculate aggregate metrics for multiple JSON files."""

import os

from cognee.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from cognee.shared.logging_utils import get_logger

logger = get_logger()


def calculate_aggregates_for_files(json_paths: list[str]) -> None:
    """Calculate aggregate metrics for a list of JSON files."""
    for json_path in json_paths:
        if not os.path.exists(json_path):
            logger.error(f"File not found: {json_path}")
            continue

        # Generate output path for aggregate metrics in the same folder as the input
        input_dir = os.path.dirname(json_path)
        base_name = os.path.splitext(os.path.basename(json_path))[0]
        output_path = os.path.join(input_dir, f"aggregate_metrics_{base_name}.json")

        try:
            logger.info(f"Calculating aggregate metrics for {json_path}")
            calculate_metrics_statistics(json_path, output_path)
            logger.info(f"Saved aggregate metrics to {output_path}")
        except Exception as e:
            logger.error(f"Failed to calculate metrics for {json_path}: {e}")


if __name__ == "__main__":
    dir_path = ""
    json_file_paths = [
        os.path.join(dir_path, f) for f in os.listdir(dir_path) if f.endswith(".json")
    ]

    calculate_aggregates_for_files(json_file_paths)
    print("Done calculating aggregate metrics!")
evals/comparative_eval/helpers/convert_metrics.py (new file)
@@ -0,0 +1,107 @@

import json
import os
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd


def convert_metrics_file(json_path: str, metrics: List[str] = None) -> Dict[str, Any]:
    """Convert a single metrics JSON file to the desired format."""
    if metrics is None:
        metrics = ["correctness", "f1", "EM"]

    with open(json_path, "r") as f:
        data = json.load(f)

    # Extract filename without extension for the system name
    filename = Path(json_path).stem

    # Convert to desired format
    result = {
        "system": filename,
        "Human-LLM Correctness": None,
        "Human-LLM Correctness Error": None,
    }

    # Add metrics dynamically based on the metrics list
    for metric in metrics:
        if metric in data:
            result[f"DeepEval {metric.title()}"] = data[metric]["mean"]
            result[f"DeepEval {metric.title()} Error"] = [
                data[metric]["ci_lower"],
                data[metric]["ci_upper"],
            ]
        else:
            print(f"Warning: Metric '{metric}' not found in {json_path}")

    return result


def convert_to_dataframe(results: List[Dict[str, Any]]) -> pd.DataFrame:
    """Convert results list to DataFrame with expanded error columns."""
    df_data = []

    for result in results:
        row = {}
        for key, value in result.items():
            if key.endswith("Error") and isinstance(value, list) and len(value) == 2:
                # Split error columns into lower and upper
                row[f"{key} Lower"] = value[0]
                row[f"{key} Upper"] = value[1]
            else:
                row[key] = value
        df_data.append(row)

    return pd.DataFrame(df_data)


def process_multiple_files(
    json_paths: List[str], output_path: str, metrics: List[str] = None
) -> None:
    """Process multiple JSON files and save concatenated results."""
    if metrics is None:
        metrics = ["correctness", "f1", "EM"]

    results = []

    for json_path in json_paths:
        try:
            converted = convert_metrics_file(json_path, metrics)
            results.append(converted)
            print(f"Processed: {json_path}")
        except Exception as e:
            print(f"Error processing {json_path}: {e}")

    # Save JSON results
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)

    print(f"Saved {len(results)} results to {output_path}")

    # Convert to DataFrame and save CSV
    df = convert_to_dataframe(results)
    csv_path = output_path.replace(".json", ".csv")
    df.to_csv(csv_path, index=False)
    print(f"Saved DataFrame to {csv_path}")


if __name__ == "__main__":
    # Default metrics (can be customized here)
    # default_metrics = ['correctness', 'f1', 'EM']
    default_metrics = ["correctness"]

    # List JSON files in the current directory
    current_dir = ""
    json_files = [f for f in os.listdir(current_dir) if f.endswith(".json")]

    if json_files:
        print(f"Found {len(json_files)} JSON files:")
        for f in json_files:
            print(f"  - {f}")

        # Create full paths for JSON files and output file in the current working directory
        json_full_paths = [os.path.join(current_dir, f) for f in json_files]
        output_file = os.path.join(current_dir, "converted_metrics.json")
        process_multiple_files(json_full_paths, output_file, default_metrics)
    else:
        print("No JSON files found in current directory")
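The converter assumes each input JSON is an aggregate-metrics file keyed by metric name, with `mean`, `ci_lower`, and `ci_upper` per metric (see the `data[metric][...]` accesses above). A self-contained sketch of that shape, assuming it is run next to `convert_metrics.py`; the file name is hypothetical and the zeros are placeholders, not real results:

```python
import json

from convert_metrics import convert_metrics_file

# Placeholder aggregate-metrics structure; the numbers are not real results.
example = {
    "correctness": {"mean": 0.0, "ci_lower": 0.0, "ci_upper": 0.0},
    "f1": {"mean": 0.0, "ci_lower": 0.0, "ci_upper": 0.0},
    "EM": {"mean": 0.0, "ci_lower": 0.0, "ci_upper": 0.0},
}

with open("aggregate_metrics_example.json", "w") as f:
    json.dump(example, f)

# Produces keys like "system", "DeepEval Correctness", "DeepEval Correctness Error", ...
row = convert_metrics_file("aggregate_metrics_example.json")
print(row["system"], row["DeepEval Correctness"])
```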
evals/comparative_eval/helpers/modal_evaluate_answers.py (new file)
@@ -0,0 +1,161 @@

import modal
import os
import asyncio
import datetime
import hashlib
import json
from cognee.shared.logging_utils import get_logger
from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import create_dashboard

logger = get_logger()
vol = modal.Volume.from_name("comparison-eval-answers", create_if_missing=True)

app = modal.App("comparison-eval-answers")

image = (
    modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
    .copy_local_file("pyproject.toml", "pyproject.toml")
    .copy_local_file("poetry.lock", "poetry.lock")
    .env(
        {
            "ENV": os.getenv("ENV"),
            "LLM_API_KEY": os.getenv("LLM_API_KEY"),
            "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        }
    )
    .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)


@app.function(image=image, concurrency_limit=10, timeout=86400, volumes={"/data": vol})
async def modal_evaluate_answers(
    answers_json_content: dict, answers_filename: str, eval_config: dict = None
):
    """Evaluates answers from JSON content and returns metrics results."""
    if eval_config is None:
        eval_config = EvalConfig().to_dict()

    timestamp = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")

    # Create temporary file path for the JSON content
    base_name = os.path.splitext(answers_filename)[0]
    temp_answers_path = f"/data/temp_answers_{base_name}_{timestamp}.json"

    # Write JSON content to temporary file
    with open(temp_answers_path, "w") as f:
        json.dump(answers_json_content, f, ensure_ascii=False, indent=4)

    # Set up output paths with simplified naming: prefix_original_file_name
    eval_params = eval_config.copy()
    eval_params["answers_path"] = temp_answers_path
    eval_params["metrics_path"] = f"/data/metrics_{answers_filename}"
    eval_params["aggregate_metrics_path"] = f"/data/aggregate_metrics_{answers_filename}"
    eval_params["dashboard_path"] = f"/data/dashboard_{os.path.splitext(answers_filename)[0]}.html"

    # eval_params["evaluation_engine"] = "DirectLLM"
    # eval_params["evaluation_metrics"] = ["correctness"]

    logger.info(f"Evaluating answers from: {answers_filename}")
    logger.info(f"Using eval params: {eval_params}")

    try:
        # Only run evaluation (skip corpus building and question answering)
        evaluated_answers = await run_evaluation(eval_params)

        # Save evaluated answers
        evaluated_answers_path = f"/data/evaluated_{answers_filename}"
        with open(evaluated_answers_path, "w") as f:
            json.dump(evaluated_answers, f, ensure_ascii=False, indent=4)
        vol.commit()

        # Generate dashboard if requested
        if eval_params.get("dashboard"):
            logger.info("Generating dashboard...")
            html_output = create_dashboard(
                metrics_path=eval_params["metrics_path"],
                aggregate_metrics_path=eval_params["aggregate_metrics_path"],
                output_file=eval_params["dashboard_path"],
                benchmark=eval_params.get("benchmark", "Unknown"),
            )

            with open(eval_params["dashboard_path"], "w") as f:
                f.write(html_output)
            vol.commit()

        logger.info(f"Evaluation completed for {answers_filename}")

        # Return metrics results
        result = {
            "answers_file": answers_filename,
            "metrics_path": eval_params["metrics_path"],
            "aggregate_metrics_path": eval_params["aggregate_metrics_path"],
            "dashboard_path": eval_params["dashboard_path"]
            if eval_params.get("dashboard")
            else None,
            "evaluated_answers_path": evaluated_answers_path,
        }

        return result

    except Exception as e:
        logger.error(f"Error evaluating {answers_filename}: {e}")
        raise


@app.local_entrypoint()
async def main():
    """Main entry point that evaluates multiple JSON answer files in parallel."""

    json_files_dir = ""
    json_files = [f for f in os.listdir(json_files_dir) if f.endswith(".json")]
    json_file_paths = [os.path.join(json_files_dir, f) for f in json_files]

    # Manually specify your evaluation configuration here
    eval_config = EvalConfig(
        # Only evaluation-related settings
        evaluating_answers=True,
        evaluating_contexts=False,
        evaluation_engine="DeepEval",
        evaluation_metrics=["correctness", "EM", "f1"],
        calculate_metrics=True,
        dashboard=True,
        deepeval_model="gpt-4o-mini",
    ).to_dict()

    logger.info(f"Starting evaluation of {len(json_file_paths)} JSON files")

    # Read JSON files locally and prepare tasks
    modal_tasks = []
    for json_path in json_file_paths:
        try:
            # Read JSON content locally
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = json.load(f)

            filename = os.path.basename(json_path)

            # Create remote evaluation task with JSON content
            task = modal_evaluate_answers.remote.aio(json_content, filename, eval_config)
            modal_tasks.append(task)

        except (FileNotFoundError, json.JSONDecodeError) as e:
            logger.error(f"Error reading {json_path}: {e}")
            continue

    if not modal_tasks:
        logger.error("No valid JSON files found to process")
        return []

    # Run evaluations in parallel
    results = await asyncio.gather(*modal_tasks, return_exceptions=True)

    # Log results
    for i, result in enumerate(results):
        if isinstance(result, Exception):
            logger.error(f"Failed to evaluate {json_file_paths[i]}: {result}")
        else:
            logger.info(f"Successfully evaluated {result['answers_file']}")

    return results
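This Modal app is typically launched through Modal's CLI (for example `modal run` on this file, which invokes the `@app.local_entrypoint()` above); it assumes `Dockerfile_modal`, `pyproject.toml`, and `poetry.lock` are present in the working directory and that `ENV`, `LLM_API_KEY`, and `OPENAI_API_KEY` are set locally.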
evals/comparative_eval/hotpot_50_corpus.json (new file, 1 line)
File diff suppressed because one or more lines are too long.

evals/comparative_eval/hotpot_50_qa_pairs.json (new file, 1 line)
File diff suppressed because one or more lines are too long.
evals/comparative_eval/qa_benchmark_base.py (new file)
@@ -0,0 +1,159 @@

import asyncio
import json
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from dotenv import load_dotenv
from tqdm import tqdm

load_dotenv()


@dataclass
class QABenchmarkConfig:
    """Base configuration for QA benchmark pipelines."""

    corpus_limit: Optional[int] = None
    qa_limit: Optional[int] = None
    results_file: str = "hotpot_qa_results.json"
    print_results: bool = True


class QABenchmarkRAG(ABC):
    """Abstract base class for QA benchmarking with different RAG systems."""

    def __init__(
        self, corpus: List[str], qa_pairs: List[Dict[str, Any]], config: QABenchmarkConfig
    ):
        """Initialize the benchmark with corpus and QA data."""
        self.corpus = corpus
        self.qa_pairs = qa_pairs
        self.config = config
        self.rag_client = None

        # Apply limits if specified
        if config.corpus_limit is not None:
            self.corpus = self.corpus[: config.corpus_limit]
            print(f"Limited to first {config.corpus_limit} documents")

        if config.qa_limit is not None:
            self.qa_pairs = self.qa_pairs[: config.qa_limit]
            print(f"Limited to first {config.qa_limit} questions")

    @classmethod
    def from_jsons(
        cls, corpus_file: str, qa_pairs_file: str, config: QABenchmarkConfig
    ) -> "QABenchmarkRAG":
        """Create benchmark instance by loading data from JSON files."""
        print(f"Loading corpus from {corpus_file}...")
        with open(corpus_file) as file:
            corpus = json.load(file)

        print(f"Loading QA pairs from {qa_pairs_file}...")
        with open(qa_pairs_file) as file:
            qa_pairs = json.load(file)

        return cls(corpus, qa_pairs, config)

    @abstractmethod
    async def initialize_rag(self) -> Any:
        """Initialize the RAG system. Returns the RAG client."""
        pass

    @abstractmethod
    async def cleanup_rag(self) -> None:
        """Clean up RAG system resources."""
        pass

    @abstractmethod
    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert a single document into the RAG system."""
        pass

    @abstractmethod
    async def query_rag(self, question: str) -> str:
        """Query the RAG system and return the answer."""
        pass

    @property
    @abstractmethod
    def system_name(self) -> str:
        """Return the name of the RAG system for logging."""
        pass

    async def load_corpus_to_rag(self) -> None:
        """Load corpus data into the RAG system."""
        print(f"Adding {len(self.corpus)} documents to {self.system_name}...")
        for i, document in enumerate(tqdm(self.corpus, desc="Adding documents")):
            await self.insert_document(document, i + 1)
        print(f"All documents added to {self.system_name}")

    async def answer_questions(self) -> List[Dict[str, Any]]:
        """Answer questions using the RAG system."""
        print(f"Processing {len(self.qa_pairs)} questions...")
        results = []

        for i, qa_pair in enumerate(self.qa_pairs):
            question = qa_pair.get("question")
            expected_answer = qa_pair.get("answer")

            print(f"Processing question {i + 1}/{len(self.qa_pairs)}: {question}")

            # Get answer from RAG system
            try:
                answer = await self.query_rag(question)
            except Exception as e:
                print(f"Error processing question {i + 1}: {e}")
                answer = f"Error: {str(e)}"

            result = {"question": question, "answer": answer, "golden_answer": expected_answer}

            if self.config.print_results:
                print(
                    f"Question {i + 1}: {question}\nResponse: {answer}\nExpected: {expected_answer}\n{'-' * 50}"
                )

            results.append(result)

        return results

    def save_results(self, results: List[Dict[str, Any]]) -> None:
        """Save results to JSON file."""
        if self.config.results_file:
            print(f"Saving results to {self.config.results_file}...")
            with open(self.config.results_file, "w", encoding="utf-8") as file:
                json.dump(results, file, indent=2)

    async def run_benchmark(self) -> List[Dict[str, Any]]:
        """Run the complete benchmark pipeline."""
        print(f"Starting QA benchmark for {self.system_name}...")

        try:
            # Initialize RAG system
            self.rag_client = await self.initialize_rag()

            # Load corpus
            await self.load_corpus_to_rag()

            # Answer questions
            results = await self.answer_questions()

            # Save results
            self.save_results(results)

            print(f"Results saved to {self.config.results_file}")
            print("Pipeline completed successfully")
            return results

        except Exception as e:
            print(f"An error occurred during benchmark: {e}")
            raise
        finally:
            # Cleanup
            if self.rag_client:
                await self.cleanup_rag()

    def run(self) -> List[Dict[str, Any]]:
        """Synchronous wrapper for the benchmark."""
        return asyncio.run(self.run_benchmark())
evals/comparative_eval/qa_benchmark_graphiti.py (new file)
@@ -0,0 +1,114 @@

import asyncio
import os
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

from graphiti_core import Graphiti
from graphiti_core.nodes import EpisodeType
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig

load_dotenv()


@dataclass
class GraphitiConfig(QABenchmarkConfig):
    """Configuration for Graphiti QA benchmark."""

    # Database parameters
    db_url: str = os.getenv("NEO4J_URI")
    db_user: str = os.getenv("NEO4J_USER")
    db_password: str = os.getenv("NEO4J_PASSWORD")

    # Model parameters
    model_name: str = "gpt-4o-mini"

    # Default results file
    results_file: str = "hotpot_qa_graphiti_results.json"


class QABenchmarkGraphiti(QABenchmarkRAG):
    """Graphiti implementation of QA benchmark."""

    def __init__(self, corpus, qa_pairs, config: GraphitiConfig):
        super().__init__(corpus, qa_pairs, config)
        self.config: GraphitiConfig = config
        self.llm = None

    async def initialize_rag(self) -> Any:
        """Initialize Graphiti and LLM."""
        graphiti = Graphiti(self.config.db_url, self.config.db_user, self.config.db_password)
        await graphiti.build_indices_and_constraints(delete_existing=True)

        # Initialize LLM
        self.llm = ChatOpenAI(model=self.config.model_name, temperature=0)

        return graphiti

    async def cleanup_rag(self) -> None:
        """Clean up Graphiti connection."""
        if self.rag_client:
            await self.rag_client.close()

    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert document into Graphiti as an episode."""
        await self.rag_client.add_episode(
            name=f"Document {document_id}",
            episode_body=document,
            source=EpisodeType.text,
            source_description="corpus",
            reference_time=datetime.now(timezone.utc),
        )

    async def query_rag(self, question: str) -> str:
        """Query Graphiti and generate answer using LLM."""
        # Search Graphiti for relevant facts
        results = await self.rag_client.search(query=question, num_results=10)
        context = "\n".join(f"- {entry.fact}" for entry in results)

        # Generate answer using LLM
        messages = [
            {
                "role": "system",
                "content": "Answer minimally using provided facts. Respond with one word or phrase.",
            },
            {"role": "user", "content": f"Facts:\n{context}\n\nQuestion: {question}"},
        ]

        response = await self.llm.ainvoke(messages)
        answer = response.content

        # Store the QA interaction in Graphiti
        qa_memory = f"Question: {question}\nAnswer: {answer}"
        await self.rag_client.add_episode(
            name="QA Interaction",
            episode_body=qa_memory,
            source=EpisodeType.text,
            source_description="qa_interaction",
            reference_time=datetime.now(timezone.utc),
        )

        return answer

    @property
    def system_name(self) -> str:
        """Return system name."""
        return "Graphiti"


if __name__ == "__main__":
    # Example usage
    config = GraphitiConfig(
        corpus_limit=5,  # Small test
        qa_limit=3,
        print_results=True,
    )

    benchmark = QABenchmarkGraphiti.from_jsons(
        corpus_file="hotpot_50_corpus.json", qa_pairs_file="hotpot_50_qa_pairs.json", config=config
    )

    results = benchmark.run()
evals/comparative_eval/qa_benchmark_lightrag.py (new file)
@@ -0,0 +1,91 @@

import asyncio
import os
from dataclasses import dataclass
from typing import Any

from dotenv import load_dotenv

from lightrag import LightRAG, QueryParam
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import setup_logger
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig

load_dotenv()
setup_logger("lightrag", level="INFO")


@dataclass
class LightRAGConfig(QABenchmarkConfig):
    """Configuration for LightRAG QA benchmark."""

    # Storage parameters
    working_dir: str = "./lightrag_storage"

    # Query parameters
    query_mode: str = "hybrid"  # "naive", "local", "global", "hybrid"

    # Default results file
    results_file: str = "hotpot_qa_lightrag_results.json"


class QABenchmarkLightRAG(QABenchmarkRAG):
    """LightRAG implementation of QA benchmark."""

    def __init__(self, corpus, qa_pairs, config: LightRAGConfig):
        super().__init__(corpus, qa_pairs, config)
        self.config: LightRAGConfig = config

        # Ensure working directory exists
        if not os.path.exists(self.config.working_dir):
            os.makedirs(self.config.working_dir)

    async def initialize_rag(self) -> Any:
        """Initialize LightRAG with storage and pipeline setup."""
        lightrag = LightRAG(
            working_dir=self.config.working_dir,
            embedding_func=openai_embed,
            llm_model_func=gpt_4o_mini_complete,
        )

        await lightrag.initialize_storages()
        await initialize_pipeline_status()

        return lightrag

    async def cleanup_rag(self) -> None:
        """Clean up LightRAG storages."""
        if self.rag_client:
            await self.rag_client.finalize_storages()

    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert document into LightRAG."""
        await self.rag_client.ainsert([document])

    async def query_rag(self, question: str) -> str:
        """Query LightRAG and return the answer."""
        result = await self.rag_client.aquery(
            question, param=QueryParam(mode=self.config.query_mode)
        )
        return result

    @property
    def system_name(self) -> str:
        """Return system name."""
        return "LightRAG"


if __name__ == "__main__":
    # Example usage
    config = LightRAGConfig(
        corpus_limit=5,  # Small test
        qa_limit=3,
        query_mode="hybrid",
        print_results=True,
    )

    benchmark = QABenchmarkLightRAG.from_jsons(
        corpus_file="hotpot_50_corpus.json", qa_pairs_file="hotpot_50_qa_pairs.json", config=config
    )

    results = benchmark.run()
evals/comparative_eval/qa_benchmark_mem0.py (new file)
@@ -0,0 +1,113 @@

import asyncio
from dataclasses import dataclass
from typing import Any

from dotenv import load_dotenv
from openai import OpenAI
from mem0 import Memory

from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig

load_dotenv()


@dataclass
class Mem0Config(QABenchmarkConfig):
    """Configuration for Mem0 QA benchmark."""

    # Memory parameters
    user_id: str = "hotpot_qa_user"

    # Model parameters
    model_name: str = "gpt-4o-mini"

    # Default results file
    results_file: str = "hotpot_qa_mem0_results.json"


class QABenchmarkMem0(QABenchmarkRAG):
    """Mem0 implementation of QA benchmark."""

    def __init__(self, corpus, qa_pairs, config: Mem0Config):
        super().__init__(corpus, qa_pairs, config)
        self.config: Mem0Config = config
        self.openai_client = None

    async def initialize_rag(self) -> Any:
        """Initialize Mem0 Memory and OpenAI client."""
        memory = Memory()
        self.openai_client = OpenAI()
        return memory

    async def cleanup_rag(self) -> None:
        """Clean up resources (no cleanup needed for Mem0)."""
        pass

    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert document into Mem0 as conversation messages."""
        # Create conversation messages format
        messages = [
            {"role": "system", "content": "This is a document to remember."},
            {"role": "user", "content": "Please remember this document."},
            {"role": "assistant", "content": document},
        ]

        # Add to memory (wrap sync call in async)
        await asyncio.get_event_loop().run_in_executor(
            None, lambda: self.rag_client.add(messages, user_id=self.config.user_id)
        )

    async def query_rag(self, question: str) -> str:
        """Query Mem0 and generate answer using OpenAI."""
        # Search Mem0 for relevant memories
        relevant_memories = await asyncio.get_event_loop().run_in_executor(
            None,
            lambda: self.rag_client.search(query=question, user_id=self.config.user_id, limit=5),
        )

        # Format memories for context
        memories_str = "\n".join(f"- {entry['memory']}" for entry in relevant_memories["results"])

        # Generate response with OpenAI
        system_prompt = f"You are a helpful AI assistant. Answer the question based on the provided context.\n\nContext:\n{memories_str}"
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
        ]

        # Call OpenAI API (wrap sync call in async)
        response = await asyncio.get_event_loop().run_in_executor(
            None,
            lambda: self.openai_client.chat.completions.create(
                model=self.config.model_name, messages=messages
            ),
        )
        answer = response.choices[0].message.content

        # Store the QA interaction in Mem0
        qa_messages = messages + [{"role": "assistant", "content": answer}]
        await asyncio.get_event_loop().run_in_executor(
            None, lambda: self.rag_client.add(qa_messages, user_id=self.config.user_id)
        )

        return answer

    @property
    def system_name(self) -> str:
        """Return system name."""
        return "Mem0"


if __name__ == "__main__":
    # Example usage
    config = Mem0Config(
        corpus_limit=5,  # Small test
        qa_limit=3,
        print_results=True,
    )

    benchmark = QABenchmarkMem0.from_jsons(
        corpus_file="hotpot_50_corpus.json", qa_pairs_file="hotpot_50_qa_pairs.json", config=config
    )

    results = benchmark.run()