feat: unify comparative evals (#916)
<!-- .github/pull_request_template.md --> ## Description <!-- Provide a clear description of the changes in this PR --> - Comparative Framework: Independent benchmarking system for evaluating different RAG/QA systems - HotpotQA Dataset: 50 instances corpus and corresponding QA pairs for standardized evaluation - Base Class: Abstract QABenchmarkRAG with async pipeline for document ingestion and question answering - Three Benchmarks: Standalone implementations for Mem0, LightRAG, and Graphiti with specific dependencies ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --------- Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com>
This commit is contained in:
parent
9d5835042a
commit
cfe9c949a7
10 changed files with 825 additions and 0 deletions
40
evals/comparative_eval/README.md
Normal file
40
evals/comparative_eval/README.md
Normal file
|
|
@ -0,0 +1,40 @@
|
||||||
|
# Comparative QA Benchmarks
|
||||||
|
|
||||||
|
Independent benchmarks for different QA/RAG systems using HotpotQA dataset.
|
||||||
|
|
||||||
|
## Dataset Files
|
||||||
|
- `hotpot_50_corpus.json` - 50 instances from HotpotQA
|
||||||
|
- `hotpot_50_qa_pairs.json` - Corresponding question-answer pairs
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
Each benchmark can be run independently with appropriate dependencies:
|
||||||
|
|
||||||
|
### Mem0
|
||||||
|
```bash
|
||||||
|
pip install mem0ai openai
|
||||||
|
python qa_benchmark_mem0.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### LightRAG
|
||||||
|
```bash
|
||||||
|
pip install "lightrag-hku[api]"
|
||||||
|
python qa_benchmark_lightrag.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Graphiti
|
||||||
|
```bash
|
||||||
|
pip install graphiti-core
|
||||||
|
python qa_benchmark_graphiti.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Environment
|
||||||
|
Create `.env` with required API keys:
|
||||||
|
- `OPENAI_API_KEY` (all benchmarks)
|
||||||
|
- `NEO4J_URI`, `NEO4J_USER`, `NEO4J_PASSWORD` (Graphiti only)
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
Each benchmark inherits from `QABenchmarkRAG` base class and can be configured independently.
|
||||||
|
|
||||||
|
# Results
|
||||||
|
Updated results will be posted soon.
|
||||||
|
|
@ -0,0 +1,38 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Simple script to calculate aggregate metrics for multiple JSON files."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from cognee.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_aggregates_for_files(json_paths: list[str]) -> None:
    """Compute and persist aggregate metric statistics for each JSON file.

    For every existing file in *json_paths* the aggregate output is written
    next to the input as ``aggregate_metrics_<input-stem>.json``. Missing
    files and per-file failures are logged and skipped, so one bad file
    does not abort the batch.
    """
    for path in json_paths:
        if not os.path.exists(path):
            logger.error(f"File not found: {path}")
            continue

        # The aggregate output lands in the same directory as its input.
        folder, filename = os.path.split(path)
        stem = os.path.splitext(filename)[0]
        target = os.path.join(folder, f"aggregate_metrics_{stem}.json")

        try:
            logger.info(f"Calculating aggregate metrics for {path}")
            calculate_metrics_statistics(path, target)
            logger.info(f"Saved aggregate metrics to {target}")
        except Exception as error:
            logger.error(f"Failed to calculate metrics for {path}: {error}")
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Directory containing the per-question metrics JSON files
    # (intentionally left blank: fill in before running).
    dir_path = ""

    json_file_paths = []
    for entry in os.listdir(dir_path):
        if entry.endswith(".json"):
            json_file_paths.append(os.path.join(dir_path, entry))

    calculate_aggregates_for_files(json_file_paths)
    print("Done calculating aggregate metrics!")
|
||||||
107
evals/comparative_eval/helpers/convert_metrics.py
Normal file
107
evals/comparative_eval/helpers/convert_metrics.py
Normal file
|
|
@ -0,0 +1,107 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def convert_metrics_file(json_path: str, metrics: List[str] = None) -> Dict[str, Any]:
|
||||||
|
"""Convert a single metrics JSON file to the desired format."""
|
||||||
|
if metrics is None:
|
||||||
|
metrics = ["correctness", "f1", "EM"]
|
||||||
|
|
||||||
|
with open(json_path, "r") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
# Extract filename without extension for system name
|
||||||
|
filename = Path(json_path).stem
|
||||||
|
|
||||||
|
# Convert to desired format
|
||||||
|
result = {
|
||||||
|
"system": filename,
|
||||||
|
"Human-LLM Correctness": None,
|
||||||
|
"Human-LLM Correctness Error": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add metrics dynamically based on the metrics list
|
||||||
|
for metric in metrics:
|
||||||
|
if metric in data:
|
||||||
|
result[f"DeepEval {metric.title()}"] = data[metric]["mean"]
|
||||||
|
result[f"DeepEval {metric.title()} Error"] = [
|
||||||
|
data[metric]["ci_lower"],
|
||||||
|
data[metric]["ci_upper"],
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
print(f"Warning: Metric '{metric}' not found in {json_path}")
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_dataframe(results: List[Dict[str, Any]]) -> pd.DataFrame:
    """Flatten result records into a DataFrame, splitting CI pairs into columns."""
    rows = []

    for record in results:
        flat = {}
        for column, cell in record.items():
            is_ci_pair = (
                column.endswith("Error") and isinstance(cell, list) and len(cell) == 2
            )
            if is_ci_pair:
                # Two-element error lists become separate lower/upper columns.
                flat[f"{column} Lower"], flat[f"{column} Upper"] = cell
            else:
                flat[column] = cell
        rows.append(flat)

    return pd.DataFrame(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def process_multiple_files(
    json_paths: List[str], output_path: str, metrics: List[str] = None
) -> None:
    """Convert many metrics files and save the combined JSON plus a CSV copy.

    Files that fail to convert are reported and skipped; the CSV path is the
    JSON output path with its ".json" suffix swapped for ".csv".
    """
    if metrics is None:
        metrics = ["correctness", "f1", "EM"]

    records = []
    for path in json_paths:
        try:
            records.append(convert_metrics_file(path, metrics))
            print(f"Processed: {path}")
        except Exception as error:
            print(f"Error processing {path}: {error}")

    # Persist the raw converted records.
    with open(output_path, "w") as handle:
        json.dump(records, handle, indent=2)
    print(f"Saved {len(records)} results to {output_path}")

    # Mirror the records as a CSV table for spreadsheet use.
    frame = convert_to_dataframe(records)
    csv_path = output_path.replace(".json", ".csv")
    frame.to_csv(csv_path, index=False)
    print(f"Saved DataFrame to {csv_path}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Metrics to extract (customize here).
    # default_metrics = ['correctness', 'f1', 'EM']
    default_metrics = ["correctness"]

    # Directory to scan for metrics JSON files
    # (intentionally left blank: fill in before running).
    current_dir = ""
    json_files = [f for f in os.listdir(current_dir) if f.endswith(".json")]

    if not json_files:
        print("No JSON files found in current directory")
    else:
        print(f"Found {len(json_files)} JSON files:")
        for f in json_files:
            print(f" - {f}")

        # Build absolute-ish paths and write the combined output alongside them.
        json_full_paths = [os.path.join(current_dir, f) for f in json_files]
        output_file = os.path.join(current_dir, "converted_metrics.json")
        process_multiple_files(json_full_paths, output_file, default_metrics)
|
||||||
161
evals/comparative_eval/helpers/modal_evaluate_answers.py
Normal file
161
evals/comparative_eval/helpers/modal_evaluate_answers.py
Normal file
|
|
@ -0,0 +1,161 @@
|
||||||
|
import modal
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
import datetime
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
from cognee.eval_framework.eval_config import EvalConfig
|
||||||
|
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
|
||||||
|
from cognee.eval_framework.metrics_dashboard import create_dashboard
|
||||||
|
|
||||||
|
logger = get_logger()

# Persistent volume where answer files, metrics and dashboards are stored.
vol = modal.Volume.from_name("comparison-eval-answers", create_if_missing=True)

# App name now matches the volume name (fixed trailing-"t" typo:
# "comparison-eval-answerst" -> "comparison-eval-answers").
app = modal.App("comparison-eval-answers")

# Container image: project Dockerfile plus dependency manifests and the
# evaluation extras; API keys are forwarded from the local environment.
image = (
    modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
    .copy_local_file("pyproject.toml", "pyproject.toml")
    .copy_local_file("poetry.lock", "poetry.lock")
    .env(
        {
            "ENV": os.getenv("ENV"),
            "LLM_API_KEY": os.getenv("LLM_API_KEY"),
            "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        }
    )
    .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.function(image=image, concurrency_limit=10, timeout=86400, volumes={"/data": vol})
async def modal_evaluate_answers(
    answers_json_content: dict, answers_filename: str, eval_config: dict = None
):
    """Evaluate one answers JSON payload inside Modal and return result paths.

    Args:
        answers_json_content: Parsed answers JSON to evaluate.
        answers_filename: Original file name; used to derive output paths.
        eval_config: EvalConfig dict; defaults to ``EvalConfig().to_dict()``.

    Returns:
        Dict with the answers file name and the metrics / aggregate-metrics /
        dashboard / evaluated-answers paths on the shared volume.

    Raises:
        Exception: re-raises any evaluation failure after logging it.
    """
    if eval_config is None:
        eval_config = EvalConfig().to_dict()

    # Timezone-aware timestamp; datetime.utcnow() is deprecated (3.12+).
    timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    # Write the payload to a unique temp file on the volume so the eval
    # framework can read it by path.
    base_name = os.path.splitext(answers_filename)[0]
    temp_answers_path = f"/data/temp_answers_{base_name}_{timestamp}.json"
    with open(temp_answers_path, "w") as f:
        json.dump(answers_json_content, f, ensure_ascii=False, indent=4)

    # Output paths use simplified naming: prefix + original file name.
    eval_params = eval_config.copy()
    eval_params["answers_path"] = temp_answers_path
    eval_params["metrics_path"] = f"/data/metrics_{answers_filename}"
    eval_params["aggregate_metrics_path"] = f"/data/aggregate_metrics_{answers_filename}"
    # Reuse base_name instead of re-splitting the extension.
    eval_params["dashboard_path"] = f"/data/dashboard_{base_name}.html"

    # eval_params["evaluation_engine"] = "DirectLLM"
    # eval_params["evaluation_metrics"] = ["correctness"]

    logger.info(f"Evaluating answers from: {answers_filename}")
    logger.info(f"Using eval params: {eval_params}")

    try:
        # Only run evaluation (corpus building and question answering are
        # assumed to have happened elsewhere).
        evaluated_answers = await run_evaluation(eval_params)

        # Persist the evaluated answers next to the metrics on the volume.
        evaluated_answers_path = f"/data/evaluated_{answers_filename}"
        with open(evaluated_answers_path, "w") as f:
            json.dump(evaluated_answers, f, ensure_ascii=False, indent=4)
        vol.commit()

        # Optionally render the HTML dashboard from the saved metrics.
        if eval_params.get("dashboard"):
            logger.info("Generating dashboard...")
            html_output = create_dashboard(
                metrics_path=eval_params["metrics_path"],
                aggregate_metrics_path=eval_params["aggregate_metrics_path"],
                output_file=eval_params["dashboard_path"],
                benchmark=eval_params.get("benchmark", "Unknown"),
            )

            with open(eval_params["dashboard_path"], "w") as f:
                f.write(html_output)
            vol.commit()

        logger.info(f"Evaluation completed for {answers_filename}")

        # Summary of produced artifacts for the local entrypoint to log.
        result = {
            "answers_file": answers_filename,
            "metrics_path": eval_params["metrics_path"],
            "aggregate_metrics_path": eval_params["aggregate_metrics_path"],
            "dashboard_path": eval_params["dashboard_path"]
            if eval_params.get("dashboard")
            else None,
            "evaluated_answers_path": evaluated_answers_path,
        }

        return result

    except Exception as e:
        logger.error(f"Error evaluating {answers_filename}: {e}")
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@app.local_entrypoint()
async def main():
    """Main entry point that evaluates multiple JSON answer files in parallel.

    Reads every .json file in *json_files_dir* locally, fans the payloads out
    to Modal workers, and logs per-file success/failure.

    Returns:
        List of per-file result dicts (or Exception instances for failures).
    """

    # Directory holding the answer JSON files (fill in before running).
    json_files_dir = ""
    json_files = [f for f in os.listdir(json_files_dir) if f.endswith(".json")]
    json_file_paths = [os.path.join(json_files_dir, f) for f in json_files]

    # Evaluation-only configuration (no corpus building / answering here).
    eval_config = EvalConfig(
        evaluating_answers=True,
        evaluating_contexts=False,
        evaluation_engine="DeepEval",
        evaluation_metrics=["correctness", "EM", "f1"],
        calculate_metrics=True,
        dashboard=True,
        deepeval_model="gpt-4o-mini",
    ).to_dict()

    logger.info(f"Starting evaluation of {len(json_file_paths)} JSON files")

    # Read JSON files locally and prepare remote tasks. Track the paths that
    # actually produced a task: previously failures were logged against
    # json_file_paths[i], which misattributes errors as soon as one file is
    # skipped during reading (results aligns with modal_tasks, not paths).
    modal_tasks = []
    valid_paths = []
    for json_path in json_file_paths:
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            logger.error(f"Error reading {json_path}: {e}")
            continue

        filename = os.path.basename(json_path)
        modal_tasks.append(modal_evaluate_answers.remote.aio(json_content, filename, eval_config))
        valid_paths.append(json_path)

    if not modal_tasks:
        logger.error("No valid JSON files found to process")
        return []

    # Run evaluations in parallel; exceptions are returned, not raised.
    results = await asyncio.gather(*modal_tasks, return_exceptions=True)

    # Log results against the path each task was actually built from.
    for path, result in zip(valid_paths, results):
        if isinstance(result, Exception):
            logger.error(f"Failed to evaluate {path}: {result}")
        else:
            logger.info(f"Successfully evaluated {result['answers_file']}")

    return results
|
||||||
1
evals/comparative_eval/hotpot_50_corpus.json
Normal file
1
evals/comparative_eval/hotpot_50_corpus.json
Normal file
File diff suppressed because one or more lines are too long
1
evals/comparative_eval/hotpot_50_qa_pairs.json
Normal file
1
evals/comparative_eval/hotpot_50_qa_pairs.json
Normal file
File diff suppressed because one or more lines are too long
159
evals/comparative_eval/qa_benchmark_base.py
Normal file
159
evals/comparative_eval/qa_benchmark_base.py
Normal file
|
|
@ -0,0 +1,159 @@
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class QABenchmarkConfig:
    """Base configuration for QA benchmark pipelines."""

    # Optional caps on how much of the corpus / QA set is processed.
    corpus_limit: Optional[int] = None
    qa_limit: Optional[int] = None
    # Where answers are written; a falsy value skips saving.
    results_file: str = "hotpot_qa_results.json"
    # Echo each question/answer pair to stdout while answering.
    print_results: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
class QABenchmarkRAG(ABC):
    """Template-method base class for QA benchmarking over RAG systems.

    Subclasses supply system-specific behavior through the abstract hooks
    (initialize_rag, cleanup_rag, insert_document, query_rag, system_name);
    this class drives the shared ingest -> answer -> save pipeline.
    """

    def __init__(
        self, corpus: List[str], qa_pairs: List[Dict[str, Any]], config: QABenchmarkConfig
    ):
        """Initialize the benchmark with corpus and QA data."""
        self.corpus = corpus
        self.qa_pairs = qa_pairs
        self.config = config
        self.rag_client = None

        # Optionally truncate inputs for quick smoke runs.
        if config.corpus_limit is not None:
            self.corpus = self.corpus[: config.corpus_limit]
            print(f"Limited to first {config.corpus_limit} documents")

        if config.qa_limit is not None:
            self.qa_pairs = self.qa_pairs[: config.qa_limit]
            print(f"Limited to first {config.qa_limit} questions")

    @classmethod
    def from_jsons(
        cls, corpus_file: str, qa_pairs_file: str, config: QABenchmarkConfig
    ) -> "QABenchmarkRAG":
        """Alternate constructor that loads corpus and QA pairs from JSON files."""
        print(f"Loading corpus from {corpus_file}...")
        with open(corpus_file) as handle:
            corpus = json.load(handle)

        print(f"Loading QA pairs from {qa_pairs_file}...")
        with open(qa_pairs_file) as handle:
            qa_pairs = json.load(handle)

        return cls(corpus, qa_pairs, config)

    @abstractmethod
    async def initialize_rag(self) -> Any:
        """Initialize the RAG system. Returns the RAG client."""

    @abstractmethod
    async def cleanup_rag(self) -> None:
        """Clean up RAG system resources."""

    @abstractmethod
    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert a single document into the RAG system."""

    @abstractmethod
    async def query_rag(self, question: str) -> str:
        """Query the RAG system and return the answer."""

    @property
    @abstractmethod
    def system_name(self) -> str:
        """Return the name of the RAG system for logging."""

    async def load_corpus_to_rag(self) -> None:
        """Ingest every corpus document into the RAG system (1-based ids)."""
        print(f"Adding {len(self.corpus)} documents to {self.system_name}...")
        for doc_id, document in enumerate(tqdm(self.corpus, desc="Adding documents"), start=1):
            await self.insert_document(document, doc_id)
        print(f"All documents added to {self.system_name}")

    async def answer_questions(self) -> List[Dict[str, Any]]:
        """Answer each QA pair via query_rag; failures become 'Error: ...' answers."""
        print(f"Processing {len(self.qa_pairs)} questions...")
        results = []

        for i, qa_pair in enumerate(self.qa_pairs):
            question = qa_pair.get("question")
            expected_answer = qa_pair.get("answer")

            print(f"Processing question {i + 1}/{len(self.qa_pairs)}: {question}")

            # A failing query must not abort the run; record the error text.
            try:
                answer = await self.query_rag(question)
            except Exception as e:
                print(f"Error processing question {i + 1}: {e}")
                answer = f"Error: {str(e)}"

            entry = {"question": question, "answer": answer, "golden_answer": expected_answer}

            if self.config.print_results:
                print(
                    f"Question {i + 1}: {question}\nResponse: {answer}\nExpected: {expected_answer}\n{'-' * 50}"
                )

            results.append(entry)

        return results

    def save_results(self, results: List[Dict[str, Any]]) -> None:
        """Write results to the configured JSON file (skipped when falsy)."""
        if self.config.results_file:
            print(f"Saving results to {self.config.results_file}...")
            with open(self.config.results_file, "w", encoding="utf-8") as file:
                json.dump(results, file, indent=2)

    async def run_benchmark(self) -> List[Dict[str, Any]]:
        """Run initialize -> ingest -> answer -> save, cleaning up on exit."""
        print(f"Starting QA benchmark for {self.system_name}...")

        try:
            self.rag_client = await self.initialize_rag()
            await self.load_corpus_to_rag()
            results = await self.answer_questions()
            self.save_results(results)

            print(f"Results saved to {self.config.results_file}")
            print("Pipeline completed successfully")
            return results
        except Exception as e:
            print(f"An error occurred during benchmark: {e}")
            raise
        finally:
            # Release system resources even when the pipeline fails.
            if self.rag_client:
                await self.cleanup_rag()

    def run(self) -> List[Dict[str, Any]]:
        """Synchronous wrapper for the benchmark."""
        return asyncio.run(self.run_benchmark())
|
||||||
114
evals/comparative_eval/qa_benchmark_graphiti.py
Normal file
114
evals/comparative_eval/qa_benchmark_graphiti.py
Normal file
|
|
@ -0,0 +1,114 @@
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from langchain_openai import ChatOpenAI
|
||||||
|
|
||||||
|
from graphiti_core import Graphiti
|
||||||
|
from graphiti_core.nodes import EpisodeType
|
||||||
|
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class GraphitiConfig(QABenchmarkConfig):
    """Configuration for Graphiti QA benchmark."""

    # Neo4j connection settings, read from the environment at import time.
    # NOTE(review): os.getenv may return None when a variable is unset —
    # Graphiti would then fail at connect time; verify .env is populated.
    db_url: str = os.getenv("NEO4J_URI")
    db_user: str = os.getenv("NEO4J_USER")
    db_password: str = os.getenv("NEO4J_PASSWORD")

    # LLM used to synthesize answers from retrieved facts.
    model_name: str = "gpt-4o-mini"

    # Default output location for this benchmark's answers.
    results_file: str = "hotpot_qa_graphiti_results.json"
|
||||||
|
|
||||||
|
|
||||||
|
class QABenchmarkGraphiti(QABenchmarkRAG):
    """Graphiti implementation of QA benchmark."""

    def __init__(self, corpus, qa_pairs, config: GraphitiConfig):
        super().__init__(corpus, qa_pairs, config)
        self.config: GraphitiConfig = config
        self.llm = None  # ChatOpenAI instance, created in initialize_rag

    async def initialize_rag(self) -> Any:
        """Connect to Graphiti, rebuild indices, and create the answer LLM."""
        client = Graphiti(self.config.db_url, self.config.db_user, self.config.db_password)
        # Start each benchmark run from a clean graph.
        await client.build_indices_and_constraints(delete_existing=True)

        self.llm = ChatOpenAI(model=self.config.model_name, temperature=0)

        return client

    async def cleanup_rag(self) -> None:
        """Close the Graphiti connection if one was opened."""
        if self.rag_client:
            await self.rag_client.close()

    async def insert_document(self, document: str, document_id: int) -> None:
        """Store a corpus document in Graphiti as a text episode."""
        await self.rag_client.add_episode(
            name=f"Document {document_id}",
            episode_body=document,
            source=EpisodeType.text,
            source_description="corpus",
            reference_time=datetime.now(timezone.utc),
        )

    async def query_rag(self, question: str) -> str:
        """Answer a question from Graphiti facts, then store the interaction."""
        # Retrieve the top facts related to the question.
        hits = await self.rag_client.search(query=question, num_results=10)
        facts_block = "\n".join(f"- {hit.fact}" for hit in hits)

        # Ask the LLM for a terse answer grounded in the retrieved facts.
        prompt = [
            {
                "role": "system",
                "content": "Answer minimally using provided facts. Respond with one word or phrase.",
            },
            {"role": "user", "content": f"Facts:\n{facts_block}\n\nQuestion: {question}"},
        ]

        reply = await self.llm.ainvoke(prompt)
        answer = reply.content

        # Feed the Q/A exchange back into the graph as its own episode.
        await self.rag_client.add_episode(
            name="QA Interaction",
            episode_body=f"Question: {question}\nAnswer: {answer}",
            source=EpisodeType.text,
            source_description="qa_interaction",
            reference_time=datetime.now(timezone.utc),
        )

        return answer

    @property
    def system_name(self) -> str:
        """Return system name."""
        return "Graphiti"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Small smoke-test configuration.
    config = GraphitiConfig(
        corpus_limit=5,
        qa_limit=3,
        print_results=True,
    )

    benchmark = QABenchmarkGraphiti.from_jsons(
        corpus_file="hotpot_50_corpus.json",
        qa_pairs_file="hotpot_50_qa_pairs.json",
        config=config,
    )

    results = benchmark.run()
|
||||||
91
evals/comparative_eval/qa_benchmark_lightrag.py
Normal file
91
evals/comparative_eval/qa_benchmark_lightrag.py
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
from lightrag import LightRAG, QueryParam
|
||||||
|
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
|
||||||
|
from lightrag.kg.shared_storage import initialize_pipeline_status
|
||||||
|
from lightrag.utils import setup_logger
|
||||||
|
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
setup_logger("lightrag", level="INFO")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class LightRAGConfig(QABenchmarkConfig):
    """Configuration for LightRAG QA benchmark."""

    # Directory where LightRAG keeps its on-disk indexes.
    working_dir: str = "./lightrag_storage"

    # Retrieval strategy: "naive", "local", "global", or "hybrid".
    query_mode: str = "hybrid"

    # Default output location for this benchmark's answers.
    results_file: str = "hotpot_qa_lightrag_results.json"
|
||||||
|
|
||||||
|
|
||||||
|
class QABenchmarkLightRAG(QABenchmarkRAG):
    """LightRAG implementation of QA benchmark."""

    def __init__(self, corpus, qa_pairs, config: LightRAGConfig):
        super().__init__(corpus, qa_pairs, config)
        self.config: LightRAGConfig = config

        # Make sure the storage directory exists before LightRAG touches it.
        if not os.path.exists(self.config.working_dir):
            os.makedirs(self.config.working_dir)

    async def initialize_rag(self) -> Any:
        """Build the LightRAG instance and initialize its storages."""
        client = LightRAG(
            working_dir=self.config.working_dir,
            embedding_func=openai_embed,
            llm_model_func=gpt_4o_mini_complete,
        )

        await client.initialize_storages()
        await initialize_pipeline_status()

        return client

    async def cleanup_rag(self) -> None:
        """Finalize LightRAG storages if a client was created."""
        if self.rag_client:
            await self.rag_client.finalize_storages()

    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert one document into LightRAG (document_id is unused here)."""
        await self.rag_client.ainsert([document])

    async def query_rag(self, question: str) -> str:
        """Query LightRAG in the configured mode and return its answer."""
        return await self.rag_client.aquery(
            question, param=QueryParam(mode=self.config.query_mode)
        )

    @property
    def system_name(self) -> str:
        """Return system name."""
        return "LightRAG"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Small smoke-test configuration.
    config = LightRAGConfig(
        corpus_limit=5,
        qa_limit=3,
        query_mode="hybrid",
        print_results=True,
    )

    benchmark = QABenchmarkLightRAG.from_jsons(
        corpus_file="hotpot_50_corpus.json",
        qa_pairs_file="hotpot_50_qa_pairs.json",
        config=config,
    )

    results = benchmark.run()
|
||||||
113
evals/comparative_eval/qa_benchmark_mem0.py
Normal file
113
evals/comparative_eval/qa_benchmark_mem0.py
Normal file
|
|
@ -0,0 +1,113 @@
|
||||||
|
import asyncio
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from openai import OpenAI
|
||||||
|
from mem0 import Memory
|
||||||
|
|
||||||
|
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Mem0Config(QABenchmarkConfig):
    """Configuration for Mem0 QA benchmark."""

    # All memories are stored/retrieved under this single user id.
    user_id: str = "hotpot_qa_user"

    # OpenAI model used to answer from retrieved memories.
    model_name: str = "gpt-4o-mini"

    # Default output location for this benchmark's answers.
    results_file: str = "hotpot_qa_mem0_results.json"
|
||||||
|
|
||||||
|
|
||||||
|
class QABenchmarkMem0(QABenchmarkRAG):
    """Mem0 implementation of QA benchmark.

    Mem0's client is synchronous, so every call is off-loaded to the default
    executor. Uses asyncio.get_running_loop() — get_event_loop() inside a
    coroutine is deprecated since Python 3.10.
    """

    def __init__(self, corpus, qa_pairs, config: Mem0Config):
        super().__init__(corpus, qa_pairs, config)
        self.config: Mem0Config = config
        self.openai_client = None  # created in initialize_rag

    async def initialize_rag(self) -> Any:
        """Initialize Mem0 Memory and OpenAI client."""
        memory = Memory()
        self.openai_client = OpenAI()
        return memory

    async def cleanup_rag(self) -> None:
        """Clean up resources (no cleanup needed for Mem0)."""
        pass

    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert document into Mem0 as conversation messages."""
        # Mem0 ingests conversations, so wrap the document in a message triple.
        messages = [
            {"role": "system", "content": "This is a document to remember."},
            {"role": "user", "content": "Please remember this document."},
            {"role": "assistant", "content": document},
        ]

        # Run the synchronous Mem0 call without blocking the event loop.
        await asyncio.get_running_loop().run_in_executor(
            None, lambda: self.rag_client.add(messages, user_id=self.config.user_id)
        )

    async def query_rag(self, question: str) -> str:
        """Query Mem0 for context, answer with OpenAI, and store the exchange."""
        loop = asyncio.get_running_loop()

        # Search Mem0 for the memories most relevant to the question.
        relevant_memories = await loop.run_in_executor(
            None,
            lambda: self.rag_client.search(query=question, user_id=self.config.user_id, limit=5),
        )

        # Format memories for the system prompt.
        memories_str = "\n".join(f"- {entry['memory']}" for entry in relevant_memories["results"])

        system_prompt = f"You are a helpful AI assistant. Answer the question based on the provided context.\n\nContext:\n{memories_str}"
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
        ]

        # Call the (synchronous) OpenAI client off the event loop as well.
        response = await loop.run_in_executor(
            None,
            lambda: self.openai_client.chat.completions.create(
                model=self.config.model_name, messages=messages
            ),
        )
        answer = response.choices[0].message.content

        # Store the full QA interaction (prompt + answer) back into Mem0.
        qa_messages = messages + [{"role": "assistant", "content": answer}]
        await loop.run_in_executor(
            None, lambda: self.rag_client.add(qa_messages, user_id=self.config.user_id)
        )

        return answer

    @property
    def system_name(self) -> str:
        """Return system name."""
        return "Mem0"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Small smoke-test configuration.
    config = Mem0Config(
        corpus_limit=5,
        qa_limit=3,
        print_results=True,
    )

    benchmark = QABenchmarkMem0.from_jsons(
        corpus_file="hotpot_50_corpus.json",
        qa_pairs_file="hotpot_50_qa_pairs.json",
        config=config,
    )

    results = benchmark.run()
|
||||||
Loading…
Add table
Reference in a new issue