feat: unify comparative evals (#916)
<!-- .github/pull_request_template.md --> ## Description <!-- Provide a clear description of the changes in this PR --> - Comparative Framework: Independent benchmarking system for evaluating different RAG/QA systems - HotpotQA Dataset: 50 instances corpus and corresponding QA pairs for standardized evaluation - Base Class: Abstract QABenchmarkRAG with async pipeline for document ingestion and question answering - Three Benchmarks: Standalone implementations for Mem0, LightRAG, and Graphiti with specific dependencies ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --------- Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com>
This commit is contained in:
parent
9d5835042a
commit
cfe9c949a7
10 changed files with 825 additions and 0 deletions
40
evals/comparative_eval/README.md
Normal file
40
evals/comparative_eval/README.md
Normal file
|
|
@ -0,0 +1,40 @@
|
||||||
|
# Comparative QA Benchmarks
|
||||||
|
|
||||||
|
Independent benchmarks for different QA/RAG systems using HotpotQA dataset.
|
||||||
|
|
||||||
|
## Dataset Files
|
||||||
|
- `hotpot_50_corpus.json` - 50 instances from HotpotQA
|
||||||
|
- `hotpot_50_qa_pairs.json` - Corresponding question-answer pairs
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
Each benchmark can be run independently with appropriate dependencies:
|
||||||
|
|
||||||
|
### Mem0
|
||||||
|
```bash
|
||||||
|
pip install mem0ai openai
|
||||||
|
python qa_benchmark_mem0.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### LightRAG
|
||||||
|
```bash
|
||||||
|
pip install "lightrag-hku[api]"
|
||||||
|
python qa_benchmark_lightrag.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Graphiti
|
||||||
|
```bash
|
||||||
|
pip install graphiti-core
|
||||||
|
python qa_benchmark_graphiti.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Environment
|
||||||
|
Create `.env` with required API keys:
|
||||||
|
- `OPENAI_API_KEY` (all benchmarks)
|
||||||
|
- `NEO4J_URI`, `NEO4J_USER`, `NEO4J_PASSWORD` (Graphiti only)
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
Each benchmark inherits from `QABenchmarkRAG` base class and can be configured independently.
|
||||||
|
|
||||||
|
# Results
|
||||||
|
Updated results will be posted soon.
|
||||||
|
|
@ -0,0 +1,38 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Simple script to calculate aggregate metrics for multiple JSON files."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from cognee.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_aggregates_for_files(json_paths: list[str]) -> None:
    """Compute and persist aggregate metric statistics for each JSON file.

    For every existing file in *json_paths* the aggregate output is written
    next to the input as ``aggregate_metrics_<input-stem>.json``. Missing
    files and per-file failures are logged and skipped, so one bad file
    does not abort the batch.
    """
    for path in json_paths:
        if not os.path.exists(path):
            logger.error(f"File not found: {path}")
            continue

        # The aggregate output lands in the same directory as its input.
        folder, filename = os.path.split(path)
        stem = os.path.splitext(filename)[0]
        target = os.path.join(folder, f"aggregate_metrics_{stem}.json")

        try:
            logger.info(f"Calculating aggregate metrics for {path}")
            calculate_metrics_statistics(path, target)
            logger.info(f"Saved aggregate metrics to {target}")
        except Exception as error:
            logger.error(f"Failed to calculate metrics for {path}: {error}")
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Directory containing the per-question metrics JSON files
    # (intentionally left blank: fill in before running).
    dir_path = ""

    json_file_paths = []
    for entry in os.listdir(dir_path):
        if entry.endswith(".json"):
            json_file_paths.append(os.path.join(dir_path, entry))

    calculate_aggregates_for_files(json_file_paths)
    print("Done calculating aggregate metrics!")
|
||||||
107
evals/comparative_eval/helpers/convert_metrics.py
Normal file
107
evals/comparative_eval/helpers/convert_metrics.py
Normal file
|
|
@ -0,0 +1,107 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def convert_metrics_file(json_path: str, metrics: List[str] = None) -> Dict[str, Any]:
|
||||||
|
"""Convert a single metrics JSON file to the desired format."""
|
||||||
|
if metrics is None:
|
||||||
|
metrics = ["correctness", "f1", "EM"]
|
||||||
|
|
||||||
|
with open(json_path, "r") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
# Extract filename without extension for system name
|
||||||
|
filename = Path(json_path).stem
|
||||||
|
|
||||||
|
# Convert to desired format
|
||||||
|
result = {
|
||||||
|
"system": filename,
|
||||||
|
"Human-LLM Correctness": None,
|
||||||
|
"Human-LLM Correctness Error": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add metrics dynamically based on the metrics list
|
||||||
|
for metric in metrics:
|
||||||
|
if metric in data:
|
||||||
|
result[f"DeepEval {metric.title()}"] = data[metric]["mean"]
|
||||||
|
result[f"DeepEval {metric.title()} Error"] = [
|
||||||
|
data[metric]["ci_lower"],
|
||||||
|
data[metric]["ci_upper"],
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
print(f"Warning: Metric '{metric}' not found in {json_path}")
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_dataframe(results: List[Dict[str, Any]]) -> pd.DataFrame:
    """Flatten result records into a DataFrame, splitting CI pairs into columns."""
    rows = []

    for record in results:
        flat = {}
        for column, cell in record.items():
            is_ci_pair = (
                column.endswith("Error") and isinstance(cell, list) and len(cell) == 2
            )
            if is_ci_pair:
                # Two-element error lists become separate lower/upper columns.
                flat[f"{column} Lower"], flat[f"{column} Upper"] = cell
            else:
                flat[column] = cell
        rows.append(flat)

    return pd.DataFrame(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def process_multiple_files(
    json_paths: List[str], output_path: str, metrics: List[str] = None
) -> None:
    """Convert many metrics files and save the combined JSON plus a CSV copy.

    Files that fail to convert are reported and skipped; the CSV path is the
    JSON output path with its ".json" suffix swapped for ".csv".
    """
    if metrics is None:
        metrics = ["correctness", "f1", "EM"]

    records = []
    for path in json_paths:
        try:
            records.append(convert_metrics_file(path, metrics))
            print(f"Processed: {path}")
        except Exception as error:
            print(f"Error processing {path}: {error}")

    # Persist the raw converted records.
    with open(output_path, "w") as handle:
        json.dump(records, handle, indent=2)
    print(f"Saved {len(records)} results to {output_path}")

    # Mirror the records as a CSV table for spreadsheet use.
    frame = convert_to_dataframe(records)
    csv_path = output_path.replace(".json", ".csv")
    frame.to_csv(csv_path, index=False)
    print(f"Saved DataFrame to {csv_path}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Metrics to extract (customize here).
    # default_metrics = ['correctness', 'f1', 'EM']
    default_metrics = ["correctness"]

    # Directory to scan for metrics JSON files
    # (intentionally left blank: fill in before running).
    current_dir = ""
    json_files = [f for f in os.listdir(current_dir) if f.endswith(".json")]

    if not json_files:
        print("No JSON files found in current directory")
    else:
        print(f"Found {len(json_files)} JSON files:")
        for f in json_files:
            print(f" - {f}")

        # Build absolute-ish paths and write the combined output alongside them.
        json_full_paths = [os.path.join(current_dir, f) for f in json_files]
        output_file = os.path.join(current_dir, "converted_metrics.json")
        process_multiple_files(json_full_paths, output_file, default_metrics)
|
||||||
161
evals/comparative_eval/helpers/modal_evaluate_answers.py
Normal file
161
evals/comparative_eval/helpers/modal_evaluate_answers.py
Normal file
|
|
@ -0,0 +1,161 @@
|
||||||
|
import modal
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
import datetime
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
from cognee.shared.logging_utils import get_logger
|
||||||
|
from cognee.eval_framework.eval_config import EvalConfig
|
||||||
|
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
|
||||||
|
from cognee.eval_framework.metrics_dashboard import create_dashboard
|
||||||
|
|
||||||
|
logger = get_logger()

# Persistent volume where answer files, metrics and dashboards are stored.
vol = modal.Volume.from_name("comparison-eval-answers", create_if_missing=True)

# App name now matches the volume name (fixed trailing-"t" typo:
# "comparison-eval-answerst" -> "comparison-eval-answers").
app = modal.App("comparison-eval-answers")

# Container image: project Dockerfile plus dependency manifests and the
# evaluation extras; API keys are forwarded from the local environment.
image = (
    modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
    .copy_local_file("pyproject.toml", "pyproject.toml")
    .copy_local_file("poetry.lock", "poetry.lock")
    .env(
        {
            "ENV": os.getenv("ENV"),
            "LLM_API_KEY": os.getenv("LLM_API_KEY"),
            "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        }
    )
    .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.function(image=image, concurrency_limit=10, timeout=86400, volumes={"/data": vol})
async def modal_evaluate_answers(
    answers_json_content: dict, answers_filename: str, eval_config: dict = None
):
    """Evaluate one answers JSON payload inside Modal and return result paths.

    Args:
        answers_json_content: Parsed answers JSON to evaluate.
        answers_filename: Original file name; used to derive output paths.
        eval_config: EvalConfig dict; defaults to ``EvalConfig().to_dict()``.

    Returns:
        Dict with the answers file name and the metrics / aggregate-metrics /
        dashboard / evaluated-answers paths on the shared volume.

    Raises:
        Exception: re-raises any evaluation failure after logging it.
    """
    if eval_config is None:
        eval_config = EvalConfig().to_dict()

    # Timezone-aware timestamp; datetime.utcnow() is deprecated (3.12+).
    timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    # Write the payload to a unique temp file on the volume so the eval
    # framework can read it by path.
    base_name = os.path.splitext(answers_filename)[0]
    temp_answers_path = f"/data/temp_answers_{base_name}_{timestamp}.json"
    with open(temp_answers_path, "w") as f:
        json.dump(answers_json_content, f, ensure_ascii=False, indent=4)

    # Output paths use simplified naming: prefix + original file name.
    eval_params = eval_config.copy()
    eval_params["answers_path"] = temp_answers_path
    eval_params["metrics_path"] = f"/data/metrics_{answers_filename}"
    eval_params["aggregate_metrics_path"] = f"/data/aggregate_metrics_{answers_filename}"
    # Reuse base_name instead of re-splitting the extension.
    eval_params["dashboard_path"] = f"/data/dashboard_{base_name}.html"

    # eval_params["evaluation_engine"] = "DirectLLM"
    # eval_params["evaluation_metrics"] = ["correctness"]

    logger.info(f"Evaluating answers from: {answers_filename}")
    logger.info(f"Using eval params: {eval_params}")

    try:
        # Only run evaluation (corpus building and question answering are
        # assumed to have happened elsewhere).
        evaluated_answers = await run_evaluation(eval_params)

        # Persist the evaluated answers next to the metrics on the volume.
        evaluated_answers_path = f"/data/evaluated_{answers_filename}"
        with open(evaluated_answers_path, "w") as f:
            json.dump(evaluated_answers, f, ensure_ascii=False, indent=4)
        vol.commit()

        # Optionally render the HTML dashboard from the saved metrics.
        if eval_params.get("dashboard"):
            logger.info("Generating dashboard...")
            html_output = create_dashboard(
                metrics_path=eval_params["metrics_path"],
                aggregate_metrics_path=eval_params["aggregate_metrics_path"],
                output_file=eval_params["dashboard_path"],
                benchmark=eval_params.get("benchmark", "Unknown"),
            )

            with open(eval_params["dashboard_path"], "w") as f:
                f.write(html_output)
            vol.commit()

        logger.info(f"Evaluation completed for {answers_filename}")

        # Summary of produced artifacts for the local entrypoint to log.
        result = {
            "answers_file": answers_filename,
            "metrics_path": eval_params["metrics_path"],
            "aggregate_metrics_path": eval_params["aggregate_metrics_path"],
            "dashboard_path": eval_params["dashboard_path"]
            if eval_params.get("dashboard")
            else None,
            "evaluated_answers_path": evaluated_answers_path,
        }

        return result

    except Exception as e:
        logger.error(f"Error evaluating {answers_filename}: {e}")
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@app.local_entrypoint()
async def main():
    """Main entry point that evaluates multiple JSON answer files in parallel.

    Reads every .json file in *json_files_dir* locally, fans the payloads out
    to Modal workers, and logs per-file success/failure.

    Returns:
        List of per-file result dicts (or Exception instances for failures).
    """

    # Directory holding the answer JSON files (fill in before running).
    json_files_dir = ""
    json_files = [f for f in os.listdir(json_files_dir) if f.endswith(".json")]
    json_file_paths = [os.path.join(json_files_dir, f) for f in json_files]

    # Evaluation-only configuration (no corpus building / answering here).
    eval_config = EvalConfig(
        evaluating_answers=True,
        evaluating_contexts=False,
        evaluation_engine="DeepEval",
        evaluation_metrics=["correctness", "EM", "f1"],
        calculate_metrics=True,
        dashboard=True,
        deepeval_model="gpt-4o-mini",
    ).to_dict()

    logger.info(f"Starting evaluation of {len(json_file_paths)} JSON files")

    # Read JSON files locally and prepare remote tasks. Track the paths that
    # actually produced a task: previously failures were logged against
    # json_file_paths[i], which misattributes errors as soon as one file is
    # skipped during reading (results aligns with modal_tasks, not paths).
    modal_tasks = []
    valid_paths = []
    for json_path in json_file_paths:
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            logger.error(f"Error reading {json_path}: {e}")
            continue

        filename = os.path.basename(json_path)
        modal_tasks.append(modal_evaluate_answers.remote.aio(json_content, filename, eval_config))
        valid_paths.append(json_path)

    if not modal_tasks:
        logger.error("No valid JSON files found to process")
        return []

    # Run evaluations in parallel; exceptions are returned, not raised.
    results = await asyncio.gather(*modal_tasks, return_exceptions=True)

    # Log results against the path each task was actually built from.
    for path, result in zip(valid_paths, results):
        if isinstance(result, Exception):
            logger.error(f"Failed to evaluate {path}: {result}")
        else:
            logger.info(f"Successfully evaluated {result['answers_file']}")

    return results
|
||||||
1
evals/comparative_eval/hotpot_50_corpus.json
Normal file
1
evals/comparative_eval/hotpot_50_corpus.json
Normal file
File diff suppressed because one or more lines are too long
1
evals/comparative_eval/hotpot_50_qa_pairs.json
Normal file
1
evals/comparative_eval/hotpot_50_qa_pairs.json
Normal file
File diff suppressed because one or more lines are too long
159
evals/comparative_eval/qa_benchmark_base.py
Normal file
159
evals/comparative_eval/qa_benchmark_base.py
Normal file
|
|
@ -0,0 +1,159 @@
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class QABenchmarkConfig:
    """Base configuration for QA benchmark pipelines."""

    # Optional caps on how much of the corpus / QA set is processed.
    corpus_limit: Optional[int] = None
    qa_limit: Optional[int] = None
    # Where answers are written; a falsy value skips saving.
    results_file: str = "hotpot_qa_results.json"
    # Echo each question/answer pair to stdout while answering.
    print_results: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
class QABenchmarkRAG(ABC):
    """Template-method base class for QA benchmarking over RAG systems.

    Subclasses supply system-specific behavior through the abstract hooks
    (initialize_rag, cleanup_rag, insert_document, query_rag, system_name);
    this class drives the shared ingest -> answer -> save pipeline.
    """

    def __init__(
        self, corpus: List[str], qa_pairs: List[Dict[str, Any]], config: QABenchmarkConfig
    ):
        """Initialize the benchmark with corpus and QA data."""
        self.corpus = corpus
        self.qa_pairs = qa_pairs
        self.config = config
        self.rag_client = None

        # Optionally truncate inputs for quick smoke runs.
        if config.corpus_limit is not None:
            self.corpus = self.corpus[: config.corpus_limit]
            print(f"Limited to first {config.corpus_limit} documents")

        if config.qa_limit is not None:
            self.qa_pairs = self.qa_pairs[: config.qa_limit]
            print(f"Limited to first {config.qa_limit} questions")

    @classmethod
    def from_jsons(
        cls, corpus_file: str, qa_pairs_file: str, config: QABenchmarkConfig
    ) -> "QABenchmarkRAG":
        """Alternate constructor that loads corpus and QA pairs from JSON files."""
        print(f"Loading corpus from {corpus_file}...")
        with open(corpus_file) as handle:
            corpus = json.load(handle)

        print(f"Loading QA pairs from {qa_pairs_file}...")
        with open(qa_pairs_file) as handle:
            qa_pairs = json.load(handle)

        return cls(corpus, qa_pairs, config)

    @abstractmethod
    async def initialize_rag(self) -> Any:
        """Initialize the RAG system. Returns the RAG client."""

    @abstractmethod
    async def cleanup_rag(self) -> None:
        """Clean up RAG system resources."""

    @abstractmethod
    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert a single document into the RAG system."""

    @abstractmethod
    async def query_rag(self, question: str) -> str:
        """Query the RAG system and return the answer."""

    @property
    @abstractmethod
    def system_name(self) -> str:
        """Return the name of the RAG system for logging."""

    async def load_corpus_to_rag(self) -> None:
        """Ingest every corpus document into the RAG system (1-based ids)."""
        print(f"Adding {len(self.corpus)} documents to {self.system_name}...")
        for doc_id, document in enumerate(tqdm(self.corpus, desc="Adding documents"), start=1):
            await self.insert_document(document, doc_id)
        print(f"All documents added to {self.system_name}")

    async def answer_questions(self) -> List[Dict[str, Any]]:
        """Answer each QA pair via query_rag; failures become 'Error: ...' answers."""
        print(f"Processing {len(self.qa_pairs)} questions...")
        results = []

        for i, qa_pair in enumerate(self.qa_pairs):
            question = qa_pair.get("question")
            expected_answer = qa_pair.get("answer")

            print(f"Processing question {i + 1}/{len(self.qa_pairs)}: {question}")

            # A failing query must not abort the run; record the error text.
            try:
                answer = await self.query_rag(question)
            except Exception as e:
                print(f"Error processing question {i + 1}: {e}")
                answer = f"Error: {str(e)}"

            entry = {"question": question, "answer": answer, "golden_answer": expected_answer}

            if self.config.print_results:
                print(
                    f"Question {i + 1}: {question}\nResponse: {answer}\nExpected: {expected_answer}\n{'-' * 50}"
                )

            results.append(entry)

        return results

    def save_results(self, results: List[Dict[str, Any]]) -> None:
        """Write results to the configured JSON file (skipped when falsy)."""
        if self.config.results_file:
            print(f"Saving results to {self.config.results_file}...")
            with open(self.config.results_file, "w", encoding="utf-8") as file:
                json.dump(results, file, indent=2)

    async def run_benchmark(self) -> List[Dict[str, Any]]:
        """Run initialize -> ingest -> answer -> save, cleaning up on exit."""
        print(f"Starting QA benchmark for {self.system_name}...")

        try:
            self.rag_client = await self.initialize_rag()
            await self.load_corpus_to_rag()
            results = await self.answer_questions()
            self.save_results(results)

            print(f"Results saved to {self.config.results_file}")
            print("Pipeline completed successfully")
            return results
        except Exception as e:
            print(f"An error occurred during benchmark: {e}")
            raise
        finally:
            # Release system resources even when the pipeline fails.
            if self.rag_client:
                await self.cleanup_rag()

    def run(self) -> List[Dict[str, Any]]:
        """Synchronous wrapper for the benchmark."""
        return asyncio.run(self.run_benchmark())
|
||||||
114
evals/comparative_eval/qa_benchmark_graphiti.py
Normal file
114
evals/comparative_eval/qa_benchmark_graphiti.py
Normal file
|
|
@ -0,0 +1,114 @@
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from langchain_openai import ChatOpenAI
|
||||||
|
|
||||||
|
from graphiti_core import Graphiti
|
||||||
|
from graphiti_core.nodes import EpisodeType
|
||||||
|
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class GraphitiConfig(QABenchmarkConfig):
    """Configuration for Graphiti QA benchmark."""

    # Neo4j connection settings, read from the environment at import time.
    # NOTE(review): os.getenv may return None when a variable is unset —
    # Graphiti would then fail at connect time; verify .env is populated.
    db_url: str = os.getenv("NEO4J_URI")
    db_user: str = os.getenv("NEO4J_USER")
    db_password: str = os.getenv("NEO4J_PASSWORD")

    # LLM used to synthesize answers from retrieved facts.
    model_name: str = "gpt-4o-mini"

    # Default output location for this benchmark's answers.
    results_file: str = "hotpot_qa_graphiti_results.json"
|
||||||
|
|
||||||
|
|
||||||
|
class QABenchmarkGraphiti(QABenchmarkRAG):
    """Graphiti implementation of QA benchmark."""

    def __init__(self, corpus, qa_pairs, config: GraphitiConfig):
        super().__init__(corpus, qa_pairs, config)
        self.config: GraphitiConfig = config
        self.llm = None  # ChatOpenAI instance, created in initialize_rag

    async def initialize_rag(self) -> Any:
        """Connect to Graphiti, rebuild indices, and create the answer LLM."""
        client = Graphiti(self.config.db_url, self.config.db_user, self.config.db_password)
        # Start each benchmark run from a clean graph.
        await client.build_indices_and_constraints(delete_existing=True)

        self.llm = ChatOpenAI(model=self.config.model_name, temperature=0)

        return client

    async def cleanup_rag(self) -> None:
        """Close the Graphiti connection if one was opened."""
        if self.rag_client:
            await self.rag_client.close()

    async def insert_document(self, document: str, document_id: int) -> None:
        """Store a corpus document in Graphiti as a text episode."""
        await self.rag_client.add_episode(
            name=f"Document {document_id}",
            episode_body=document,
            source=EpisodeType.text,
            source_description="corpus",
            reference_time=datetime.now(timezone.utc),
        )

    async def query_rag(self, question: str) -> str:
        """Answer a question from Graphiti facts, then store the interaction."""
        # Retrieve the top facts related to the question.
        hits = await self.rag_client.search(query=question, num_results=10)
        facts_block = "\n".join(f"- {hit.fact}" for hit in hits)

        # Ask the LLM for a terse answer grounded in the retrieved facts.
        prompt = [
            {
                "role": "system",
                "content": "Answer minimally using provided facts. Respond with one word or phrase.",
            },
            {"role": "user", "content": f"Facts:\n{facts_block}\n\nQuestion: {question}"},
        ]

        reply = await self.llm.ainvoke(prompt)
        answer = reply.content

        # Feed the Q/A exchange back into the graph as its own episode.
        await self.rag_client.add_episode(
            name="QA Interaction",
            episode_body=f"Question: {question}\nAnswer: {answer}",
            source=EpisodeType.text,
            source_description="qa_interaction",
            reference_time=datetime.now(timezone.utc),
        )

        return answer

    @property
    def system_name(self) -> str:
        """Return system name."""
        return "Graphiti"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Small smoke-test configuration.
    config = GraphitiConfig(
        corpus_limit=5,
        qa_limit=3,
        print_results=True,
    )

    benchmark = QABenchmarkGraphiti.from_jsons(
        corpus_file="hotpot_50_corpus.json",
        qa_pairs_file="hotpot_50_qa_pairs.json",
        config=config,
    )

    results = benchmark.run()
|
||||||
91
evals/comparative_eval/qa_benchmark_lightrag.py
Normal file
91
evals/comparative_eval/qa_benchmark_lightrag.py
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
from lightrag import LightRAG, QueryParam
|
||||||
|
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
|
||||||
|
from lightrag.kg.shared_storage import initialize_pipeline_status
|
||||||
|
from lightrag.utils import setup_logger
|
||||||
|
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
setup_logger("lightrag", level="INFO")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class LightRAGConfig(QABenchmarkConfig):
    """Configuration for LightRAG QA benchmark."""

    # Directory where LightRAG keeps its on-disk indexes.
    working_dir: str = "./lightrag_storage"

    # Retrieval strategy: "naive", "local", "global", or "hybrid".
    query_mode: str = "hybrid"

    # Default output location for this benchmark's answers.
    results_file: str = "hotpot_qa_lightrag_results.json"
|
||||||
|
|
||||||
|
|
||||||
|
class QABenchmarkLightRAG(QABenchmarkRAG):
    """LightRAG implementation of QA benchmark."""

    def __init__(self, corpus, qa_pairs, config: LightRAGConfig):
        super().__init__(corpus, qa_pairs, config)
        self.config: LightRAGConfig = config

        # Make sure the storage directory exists before LightRAG touches it.
        if not os.path.exists(self.config.working_dir):
            os.makedirs(self.config.working_dir)

    async def initialize_rag(self) -> Any:
        """Build the LightRAG instance and initialize its storages."""
        client = LightRAG(
            working_dir=self.config.working_dir,
            embedding_func=openai_embed,
            llm_model_func=gpt_4o_mini_complete,
        )

        await client.initialize_storages()
        await initialize_pipeline_status()

        return client

    async def cleanup_rag(self) -> None:
        """Finalize LightRAG storages if a client was created."""
        if self.rag_client:
            await self.rag_client.finalize_storages()

    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert one document into LightRAG (document_id is unused here)."""
        await self.rag_client.ainsert([document])

    async def query_rag(self, question: str) -> str:
        """Query LightRAG in the configured mode and return its answer."""
        return await self.rag_client.aquery(
            question, param=QueryParam(mode=self.config.query_mode)
        )

    @property
    def system_name(self) -> str:
        """Return system name."""
        return "LightRAG"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Small smoke-test configuration.
    config = LightRAGConfig(
        corpus_limit=5,
        qa_limit=3,
        query_mode="hybrid",
        print_results=True,
    )

    benchmark = QABenchmarkLightRAG.from_jsons(
        corpus_file="hotpot_50_corpus.json",
        qa_pairs_file="hotpot_50_qa_pairs.json",
        config=config,
    )

    results = benchmark.run()
|
||||||
113
evals/comparative_eval/qa_benchmark_mem0.py
Normal file
113
evals/comparative_eval/qa_benchmark_mem0.py
Normal file
|
|
@ -0,0 +1,113 @@
|
||||||
|
import asyncio
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from openai import OpenAI
|
||||||
|
from mem0 import Memory
|
||||||
|
|
||||||
|
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Mem0Config(QABenchmarkConfig):
    """Configuration for Mem0 QA benchmark."""

    # All memories are stored/retrieved under this single user id.
    user_id: str = "hotpot_qa_user"

    # OpenAI model used to answer from retrieved memories.
    model_name: str = "gpt-4o-mini"

    # Default output location for this benchmark's answers.
    results_file: str = "hotpot_qa_mem0_results.json"
|
||||||
|
|
||||||
|
|
||||||
|
class QABenchmarkMem0(QABenchmarkRAG):
    """Mem0 implementation of QA benchmark.

    Mem0's client is synchronous, so every call is off-loaded to the default
    executor. Uses asyncio.get_running_loop() — get_event_loop() inside a
    coroutine is deprecated since Python 3.10.
    """

    def __init__(self, corpus, qa_pairs, config: Mem0Config):
        super().__init__(corpus, qa_pairs, config)
        self.config: Mem0Config = config
        self.openai_client = None  # created in initialize_rag

    async def initialize_rag(self) -> Any:
        """Initialize Mem0 Memory and OpenAI client."""
        memory = Memory()
        self.openai_client = OpenAI()
        return memory

    async def cleanup_rag(self) -> None:
        """Clean up resources (no cleanup needed for Mem0)."""
        pass

    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert document into Mem0 as conversation messages."""
        # Mem0 ingests conversations, so wrap the document in a message triple.
        messages = [
            {"role": "system", "content": "This is a document to remember."},
            {"role": "user", "content": "Please remember this document."},
            {"role": "assistant", "content": document},
        ]

        # Run the synchronous Mem0 call without blocking the event loop.
        await asyncio.get_running_loop().run_in_executor(
            None, lambda: self.rag_client.add(messages, user_id=self.config.user_id)
        )

    async def query_rag(self, question: str) -> str:
        """Query Mem0 for context, answer with OpenAI, and store the exchange."""
        loop = asyncio.get_running_loop()

        # Search Mem0 for the memories most relevant to the question.
        relevant_memories = await loop.run_in_executor(
            None,
            lambda: self.rag_client.search(query=question, user_id=self.config.user_id, limit=5),
        )

        # Format memories for the system prompt.
        memories_str = "\n".join(f"- {entry['memory']}" for entry in relevant_memories["results"])

        system_prompt = f"You are a helpful AI assistant. Answer the question based on the provided context.\n\nContext:\n{memories_str}"
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
        ]

        # Call the (synchronous) OpenAI client off the event loop as well.
        response = await loop.run_in_executor(
            None,
            lambda: self.openai_client.chat.completions.create(
                model=self.config.model_name, messages=messages
            ),
        )
        answer = response.choices[0].message.content

        # Store the full QA interaction (prompt + answer) back into Mem0.
        qa_messages = messages + [{"role": "assistant", "content": answer}]
        await loop.run_in_executor(
            None, lambda: self.rag_client.add(qa_messages, user_id=self.config.user_id)
        )

        return answer

    @property
    def system_name(self) -> str:
        """Return system name."""
        return "Mem0"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Small smoke-test configuration.
    config = Mem0Config(
        corpus_limit=5,
        qa_limit=3,
        print_results=True,
    )

    benchmark = QABenchmarkMem0.from_jsons(
        corpus_file="hotpot_50_corpus.json",
        qa_pairs_file="hotpot_50_qa_pairs.json",
        config=config,
    )

    results = benchmark.run()
|
||||||
Loading…
Add table
Reference in a new issue