feat: unify comparative evals (#916)
## Description

- Comparative Framework: independent benchmarking system for evaluating different RAG/QA systems
- HotpotQA Dataset: a 50-instance corpus and corresponding QA pairs for standardized evaluation
- Base Class: abstract `QABenchmarkRAG` with an async pipeline for document ingestion and question answering
- Three Benchmarks: standalone implementations for Mem0, LightRAG, and Graphiti, each with its own dependencies

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Parent: 9d5835042a
Commit: cfe9c949a7
10 changed files with 825 additions and 0 deletions
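For orientation before the file-by-file diff, a minimal sketch of what a new benchmark adapter looks like against the abstract base class added in `evals/comparative_eval/qa_benchmark_base.py`. `QABenchmarkMySystem` is a hypothetical placeholder; the hook names come from the base class itself.

```python
from typing import Any

from qa_benchmark_base import QABenchmarkRAG


class QABenchmarkMySystem(QABenchmarkRAG):
    """Hypothetical adapter showing the hooks every benchmark implements."""

    async def initialize_rag(self) -> Any:
        # Create and return the system's client; the base class stores it on self.rag_client.
        ...

    async def cleanup_rag(self) -> None:
        # Release any connections or storages opened in initialize_rag.
        ...

    async def insert_document(self, document: str, document_id: int) -> None:
        # Ingest one corpus document into the system.
        ...

    async def query_rag(self, question: str) -> str:
        # Return the system's answer for a single question.
        ...

    @property
    def system_name(self) -> str:
        return "MySystem"
```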
evals/comparative_eval/README.md (new file)
@@ -0,0 +1,40 @@

# Comparative QA Benchmarks

Independent benchmarks for different QA/RAG systems using the HotpotQA dataset.

## Dataset Files

- `hotpot_50_corpus.json` - 50 instances from HotpotQA
- `hotpot_50_qa_pairs.json` - Corresponding question-answer pairs

## Benchmarks

Each benchmark can be run independently with the appropriate dependencies installed:

### Mem0

```bash
pip install mem0ai openai
python qa_benchmark_mem0.py
```

### LightRAG

```bash
pip install "lightrag-hku[api]"
python qa_benchmark_lightrag.py
```

### Graphiti

```bash
pip install graphiti-core
python qa_benchmark_graphiti.py
```

## Environment

Create a `.env` file with the required API keys:

- `OPENAI_API_KEY` (all benchmarks)
- `NEO4J_URI`, `NEO4J_USER`, `NEO4J_PASSWORD` (Graphiti only)

## Usage

Each benchmark inherits from the `QABenchmarkRAG` base class and can be configured independently; a configuration example is sketched below.
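A minimal sketch, following the `__main__` blocks of the benchmark scripts (the Mem0 benchmark is shown; class and file names are taken from this PR):

```python
from qa_benchmark_mem0 import QABenchmarkMem0, Mem0Config

# Small smoke-test run: limit the corpus and question set.
config = Mem0Config(
    corpus_limit=5,
    qa_limit=3,
    print_results=True,
)

benchmark = QABenchmarkMem0.from_jsons(
    corpus_file="hotpot_50_corpus.json",
    qa_pairs_file="hotpot_50_qa_pairs.json",
    config=config,
)

# Runs ingestion and question answering, then writes hotpot_qa_mem0_results.json.
results = benchmark.run()
```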
## Results

Updated results will be posted soon.
@@ -0,0 +1,38 @@

#!/usr/bin/env python3
"""Simple script to calculate aggregate metrics for multiple JSON files."""

import os

from cognee.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from cognee.shared.logging_utils import get_logger

logger = get_logger()


def calculate_aggregates_for_files(json_paths: list[str]) -> None:
    """Calculate aggregate metrics for a list of JSON files."""
    for json_path in json_paths:
        if not os.path.exists(json_path):
            logger.error(f"File not found: {json_path}")
            continue

        # Generate output path for aggregate metrics in the same folder as the input
        input_dir = os.path.dirname(json_path)
        base_name = os.path.splitext(os.path.basename(json_path))[0]
        output_path = os.path.join(input_dir, f"aggregate_metrics_{base_name}.json")

        try:
            logger.info(f"Calculating aggregate metrics for {json_path}")
            calculate_metrics_statistics(json_path, output_path)
            logger.info(f"Saved aggregate metrics to {output_path}")
        except Exception as e:
            logger.error(f"Failed to calculate metrics for {json_path}: {e}")


if __name__ == "__main__":
    dir_path = ""
    json_file_paths = [
        os.path.join(dir_path, f) for f in os.listdir(dir_path) if f.endswith(".json")
    ]

    calculate_aggregates_for_files(json_file_paths)
    print("Done calculating aggregate metrics!")
evals/comparative_eval/helpers/convert_metrics.py (new file)
@@ -0,0 +1,107 @@

import json
import os
from pathlib import Path
from typing import List, Dict, Any

import pandas as pd


def convert_metrics_file(json_path: str, metrics: List[str] = None) -> Dict[str, Any]:
    """Convert a single metrics JSON file to the desired format."""
    if metrics is None:
        metrics = ["correctness", "f1", "EM"]

    with open(json_path, "r") as f:
        data = json.load(f)

    # Extract filename without extension for the system name
    filename = Path(json_path).stem

    # Convert to desired format
    result = {
        "system": filename,
        "Human-LLM Correctness": None,
        "Human-LLM Correctness Error": None,
    }

    # Add metrics dynamically based on the metrics list
    for metric in metrics:
        if metric in data:
            result[f"DeepEval {metric.title()}"] = data[metric]["mean"]
            result[f"DeepEval {metric.title()} Error"] = [
                data[metric]["ci_lower"],
                data[metric]["ci_upper"],
            ]
        else:
            print(f"Warning: Metric '{metric}' not found in {json_path}")

    return result


def convert_to_dataframe(results: List[Dict[str, Any]]) -> pd.DataFrame:
    """Convert results list to DataFrame with expanded error columns."""
    df_data = []

    for result in results:
        row = {}
        for key, value in result.items():
            if key.endswith("Error") and isinstance(value, list) and len(value) == 2:
                # Split error columns into lower and upper
                row[f"{key} Lower"] = value[0]
                row[f"{key} Upper"] = value[1]
            else:
                row[key] = value
        df_data.append(row)

    return pd.DataFrame(df_data)


def process_multiple_files(
    json_paths: List[str], output_path: str, metrics: List[str] = None
) -> None:
    """Process multiple JSON files and save concatenated results."""
    if metrics is None:
        metrics = ["correctness", "f1", "EM"]

    results = []

    for json_path in json_paths:
        try:
            converted = convert_metrics_file(json_path, metrics)
            results.append(converted)
            print(f"Processed: {json_path}")
        except Exception as e:
            print(f"Error processing {json_path}: {e}")

    # Save JSON results
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)

    print(f"Saved {len(results)} results to {output_path}")

    # Convert to DataFrame and save CSV
    df = convert_to_dataframe(results)
    csv_path = output_path.replace(".json", ".csv")
    df.to_csv(csv_path, index=False)
    print(f"Saved DataFrame to {csv_path}")


if __name__ == "__main__":
    # Default metrics (can be customized here)
    # default_metrics = ['correctness', 'f1', 'EM']
    default_metrics = ["correctness"]

    # List JSON files in the current directory
    current_dir = ""
    json_files = [f for f in os.listdir(current_dir) if f.endswith(".json")]

    if json_files:
        print(f"Found {len(json_files)} JSON files:")
        for f in json_files:
            print(f"  - {f}")

        # Create full paths for JSON files and output file in the current working directory
        json_full_paths = [os.path.join(current_dir, f) for f in json_files]
        output_file = os.path.join(current_dir, "converted_metrics.json")
        process_multiple_files(json_full_paths, output_file, default_metrics)
    else:
        print("No JSON files found in current directory")
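The converter assumes each input JSON is an aggregate-metrics file keyed by metric name, with `mean`, `ci_lower`, and `ci_upper` per metric (see the `data[metric][...]` accesses above). A self-contained sketch of that shape, assuming it is run next to `convert_metrics.py`; the file name is hypothetical and the zeros are placeholders, not real results:

```python
import json

from convert_metrics import convert_metrics_file

# Placeholder aggregate-metrics structure; the numbers are not real results.
example = {
    "correctness": {"mean": 0.0, "ci_lower": 0.0, "ci_upper": 0.0},
    "f1": {"mean": 0.0, "ci_lower": 0.0, "ci_upper": 0.0},
    "EM": {"mean": 0.0, "ci_lower": 0.0, "ci_upper": 0.0},
}

with open("aggregate_metrics_example.json", "w") as f:
    json.dump(example, f)

# Produces keys like "system", "DeepEval Correctness", "DeepEval Correctness Error", ...
row = convert_metrics_file("aggregate_metrics_example.json")
print(row["system"], row["DeepEval Correctness"])
```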
evals/comparative_eval/helpers/modal_evaluate_answers.py (new file)
@@ -0,0 +1,161 @@

import modal
import os
import asyncio
import datetime
import hashlib
import json
from cognee.shared.logging_utils import get_logger
from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import create_dashboard

logger = get_logger()
vol = modal.Volume.from_name("comparison-eval-answers", create_if_missing=True)

app = modal.App("comparison-eval-answers")

image = (
    modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
    .copy_local_file("pyproject.toml", "pyproject.toml")
    .copy_local_file("poetry.lock", "poetry.lock")
    .env(
        {
            "ENV": os.getenv("ENV"),
            "LLM_API_KEY": os.getenv("LLM_API_KEY"),
            "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        }
    )
    .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)


@app.function(image=image, concurrency_limit=10, timeout=86400, volumes={"/data": vol})
async def modal_evaluate_answers(
    answers_json_content: dict, answers_filename: str, eval_config: dict = None
):
    """Evaluates answers from JSON content and returns metrics results."""
    if eval_config is None:
        eval_config = EvalConfig().to_dict()

    timestamp = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")

    # Create temporary file path for the JSON content
    base_name = os.path.splitext(answers_filename)[0]
    temp_answers_path = f"/data/temp_answers_{base_name}_{timestamp}.json"

    # Write JSON content to temporary file
    with open(temp_answers_path, "w") as f:
        json.dump(answers_json_content, f, ensure_ascii=False, indent=4)

    # Set up output paths with simplified naming: prefix_original_file_name
    eval_params = eval_config.copy()
    eval_params["answers_path"] = temp_answers_path
    eval_params["metrics_path"] = f"/data/metrics_{answers_filename}"
    eval_params["aggregate_metrics_path"] = f"/data/aggregate_metrics_{answers_filename}"
    eval_params["dashboard_path"] = f"/data/dashboard_{os.path.splitext(answers_filename)[0]}.html"

    # eval_params["evaluation_engine"] = "DirectLLM"
    # eval_params["evaluation_metrics"] = ["correctness"]

    logger.info(f"Evaluating answers from: {answers_filename}")
    logger.info(f"Using eval params: {eval_params}")

    try:
        # Only run evaluation (skip corpus building and question answering)
        evaluated_answers = await run_evaluation(eval_params)

        # Save evaluated answers
        evaluated_answers_path = f"/data/evaluated_{answers_filename}"
        with open(evaluated_answers_path, "w") as f:
            json.dump(evaluated_answers, f, ensure_ascii=False, indent=4)
        vol.commit()

        # Generate dashboard if requested
        if eval_params.get("dashboard"):
            logger.info("Generating dashboard...")
            html_output = create_dashboard(
                metrics_path=eval_params["metrics_path"],
                aggregate_metrics_path=eval_params["aggregate_metrics_path"],
                output_file=eval_params["dashboard_path"],
                benchmark=eval_params.get("benchmark", "Unknown"),
            )

            with open(eval_params["dashboard_path"], "w") as f:
                f.write(html_output)
            vol.commit()

        logger.info(f"Evaluation completed for {answers_filename}")

        # Return metrics results
        result = {
            "answers_file": answers_filename,
            "metrics_path": eval_params["metrics_path"],
            "aggregate_metrics_path": eval_params["aggregate_metrics_path"],
            "dashboard_path": eval_params["dashboard_path"]
            if eval_params.get("dashboard")
            else None,
            "evaluated_answers_path": evaluated_answers_path,
        }

        return result

    except Exception as e:
        logger.error(f"Error evaluating {answers_filename}: {e}")
        raise


@app.local_entrypoint()
async def main():
    """Main entry point that evaluates multiple JSON answer files in parallel."""

    json_files_dir = ""
    json_files = [f for f in os.listdir(json_files_dir) if f.endswith(".json")]
    json_file_paths = [os.path.join(json_files_dir, f) for f in json_files]

    # Manually specify your evaluation configuration here
    eval_config = EvalConfig(
        # Only evaluation-related settings
        evaluating_answers=True,
        evaluating_contexts=False,
        evaluation_engine="DeepEval",
        evaluation_metrics=["correctness", "EM", "f1"],
        calculate_metrics=True,
        dashboard=True,
        deepeval_model="gpt-4o-mini",
    ).to_dict()

    logger.info(f"Starting evaluation of {len(json_file_paths)} JSON files")

    # Read JSON files locally and prepare tasks
    modal_tasks = []
    for json_path in json_file_paths:
        try:
            # Read JSON content locally
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = json.load(f)

            filename = os.path.basename(json_path)

            # Create remote evaluation task with JSON content
            task = modal_evaluate_answers.remote.aio(json_content, filename, eval_config)
            modal_tasks.append(task)

        except (FileNotFoundError, json.JSONDecodeError) as e:
            logger.error(f"Error reading {json_path}: {e}")
            continue

    if not modal_tasks:
        logger.error("No valid JSON files found to process")
        return []

    # Run evaluations in parallel
    results = await asyncio.gather(*modal_tasks, return_exceptions=True)

    # Log results
    for i, result in enumerate(results):
        if isinstance(result, Exception):
            logger.error(f"Failed to evaluate {json_file_paths[i]}: {result}")
        else:
            logger.info(f"Successfully evaluated {result['answers_file']}")

    return results
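This Modal app is typically launched through Modal's CLI (for example `modal run` on this file, which invokes the `@app.local_entrypoint()` above); it assumes `Dockerfile_modal`, `pyproject.toml`, and `poetry.lock` are present in the working directory and that `ENV`, `LLM_API_KEY`, and `OPENAI_API_KEY` are set locally.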
evals/comparative_eval/hotpot_50_corpus.json (new file, 1 line)
File diff suppressed because one or more lines are too long.

evals/comparative_eval/hotpot_50_qa_pairs.json (new file, 1 line)
File diff suppressed because one or more lines are too long.
evals/comparative_eval/qa_benchmark_base.py (new file)
@@ -0,0 +1,159 @@

import asyncio
import json
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from dotenv import load_dotenv
from tqdm import tqdm

load_dotenv()


@dataclass
class QABenchmarkConfig:
    """Base configuration for QA benchmark pipelines."""

    corpus_limit: Optional[int] = None
    qa_limit: Optional[int] = None
    results_file: str = "hotpot_qa_results.json"
    print_results: bool = True


class QABenchmarkRAG(ABC):
    """Abstract base class for QA benchmarking with different RAG systems."""

    def __init__(
        self, corpus: List[str], qa_pairs: List[Dict[str, Any]], config: QABenchmarkConfig
    ):
        """Initialize the benchmark with corpus and QA data."""
        self.corpus = corpus
        self.qa_pairs = qa_pairs
        self.config = config
        self.rag_client = None

        # Apply limits if specified
        if config.corpus_limit is not None:
            self.corpus = self.corpus[: config.corpus_limit]
            print(f"Limited to first {config.corpus_limit} documents")

        if config.qa_limit is not None:
            self.qa_pairs = self.qa_pairs[: config.qa_limit]
            print(f"Limited to first {config.qa_limit} questions")

    @classmethod
    def from_jsons(
        cls, corpus_file: str, qa_pairs_file: str, config: QABenchmarkConfig
    ) -> "QABenchmarkRAG":
        """Create benchmark instance by loading data from JSON files."""
        print(f"Loading corpus from {corpus_file}...")
        with open(corpus_file) as file:
            corpus = json.load(file)

        print(f"Loading QA pairs from {qa_pairs_file}...")
        with open(qa_pairs_file) as file:
            qa_pairs = json.load(file)

        return cls(corpus, qa_pairs, config)

    @abstractmethod
    async def initialize_rag(self) -> Any:
        """Initialize the RAG system. Returns the RAG client."""
        pass

    @abstractmethod
    async def cleanup_rag(self) -> None:
        """Clean up RAG system resources."""
        pass

    @abstractmethod
    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert a single document into the RAG system."""
        pass

    @abstractmethod
    async def query_rag(self, question: str) -> str:
        """Query the RAG system and return the answer."""
        pass

    @property
    @abstractmethod
    def system_name(self) -> str:
        """Return the name of the RAG system for logging."""
        pass

    async def load_corpus_to_rag(self) -> None:
        """Load corpus data into the RAG system."""
        print(f"Adding {len(self.corpus)} documents to {self.system_name}...")
        for i, document in enumerate(tqdm(self.corpus, desc="Adding documents")):
            await self.insert_document(document, i + 1)
        print(f"All documents added to {self.system_name}")

    async def answer_questions(self) -> List[Dict[str, Any]]:
        """Answer questions using the RAG system."""
        print(f"Processing {len(self.qa_pairs)} questions...")
        results = []

        for i, qa_pair in enumerate(self.qa_pairs):
            question = qa_pair.get("question")
            expected_answer = qa_pair.get("answer")

            print(f"Processing question {i + 1}/{len(self.qa_pairs)}: {question}")

            # Get answer from RAG system
            try:
                answer = await self.query_rag(question)
            except Exception as e:
                print(f"Error processing question {i + 1}: {e}")
                answer = f"Error: {str(e)}"

            result = {"question": question, "answer": answer, "golden_answer": expected_answer}

            if self.config.print_results:
                print(
                    f"Question {i + 1}: {question}\nResponse: {answer}\nExpected: {expected_answer}\n{'-' * 50}"
                )

            results.append(result)

        return results

    def save_results(self, results: List[Dict[str, Any]]) -> None:
        """Save results to JSON file."""
        if self.config.results_file:
            print(f"Saving results to {self.config.results_file}...")
            with open(self.config.results_file, "w", encoding="utf-8") as file:
                json.dump(results, file, indent=2)

    async def run_benchmark(self) -> List[Dict[str, Any]]:
        """Run the complete benchmark pipeline."""
        print(f"Starting QA benchmark for {self.system_name}...")

        try:
            # Initialize RAG system
            self.rag_client = await self.initialize_rag()

            # Load corpus
            await self.load_corpus_to_rag()

            # Answer questions
            results = await self.answer_questions()

            # Save results
            self.save_results(results)

            print(f"Results saved to {self.config.results_file}")
            print("Pipeline completed successfully")
            return results

        except Exception as e:
            print(f"An error occurred during benchmark: {e}")
            raise
        finally:
            # Cleanup
            if self.rag_client:
                await self.cleanup_rag()

    def run(self) -> List[Dict[str, Any]]:
        """Synchronous wrapper for the benchmark."""
        return asyncio.run(self.run_benchmark())
evals/comparative_eval/qa_benchmark_graphiti.py (new file)
@@ -0,0 +1,114 @@

import asyncio
import os
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

from graphiti_core import Graphiti
from graphiti_core.nodes import EpisodeType
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig

load_dotenv()


@dataclass
class GraphitiConfig(QABenchmarkConfig):
    """Configuration for Graphiti QA benchmark."""

    # Database parameters
    db_url: str = os.getenv("NEO4J_URI")
    db_user: str = os.getenv("NEO4J_USER")
    db_password: str = os.getenv("NEO4J_PASSWORD")

    # Model parameters
    model_name: str = "gpt-4o-mini"

    # Default results file
    results_file: str = "hotpot_qa_graphiti_results.json"


class QABenchmarkGraphiti(QABenchmarkRAG):
    """Graphiti implementation of QA benchmark."""

    def __init__(self, corpus, qa_pairs, config: GraphitiConfig):
        super().__init__(corpus, qa_pairs, config)
        self.config: GraphitiConfig = config
        self.llm = None

    async def initialize_rag(self) -> Any:
        """Initialize Graphiti and LLM."""
        graphiti = Graphiti(self.config.db_url, self.config.db_user, self.config.db_password)
        await graphiti.build_indices_and_constraints(delete_existing=True)

        # Initialize LLM
        self.llm = ChatOpenAI(model=self.config.model_name, temperature=0)

        return graphiti

    async def cleanup_rag(self) -> None:
        """Clean up Graphiti connection."""
        if self.rag_client:
            await self.rag_client.close()

    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert document into Graphiti as an episode."""
        await self.rag_client.add_episode(
            name=f"Document {document_id}",
            episode_body=document,
            source=EpisodeType.text,
            source_description="corpus",
            reference_time=datetime.now(timezone.utc),
        )

    async def query_rag(self, question: str) -> str:
        """Query Graphiti and generate answer using LLM."""
        # Search Graphiti for relevant facts
        results = await self.rag_client.search(query=question, num_results=10)
        context = "\n".join(f"- {entry.fact}" for entry in results)

        # Generate answer using LLM
        messages = [
            {
                "role": "system",
                "content": "Answer minimally using provided facts. Respond with one word or phrase.",
            },
            {"role": "user", "content": f"Facts:\n{context}\n\nQuestion: {question}"},
        ]

        response = await self.llm.ainvoke(messages)
        answer = response.content

        # Store the QA interaction in Graphiti
        qa_memory = f"Question: {question}\nAnswer: {answer}"
        await self.rag_client.add_episode(
            name="QA Interaction",
            episode_body=qa_memory,
            source=EpisodeType.text,
            source_description="qa_interaction",
            reference_time=datetime.now(timezone.utc),
        )

        return answer

    @property
    def system_name(self) -> str:
        """Return system name."""
        return "Graphiti"


if __name__ == "__main__":
    # Example usage
    config = GraphitiConfig(
        corpus_limit=5,  # Small test
        qa_limit=3,
        print_results=True,
    )

    benchmark = QABenchmarkGraphiti.from_jsons(
        corpus_file="hotpot_50_corpus.json", qa_pairs_file="hotpot_50_qa_pairs.json", config=config
    )

    results = benchmark.run()
evals/comparative_eval/qa_benchmark_lightrag.py (new file)
@@ -0,0 +1,91 @@

import asyncio
import os
from dataclasses import dataclass
from typing import Any

from dotenv import load_dotenv

from lightrag import LightRAG, QueryParam
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import setup_logger
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig

load_dotenv()
setup_logger("lightrag", level="INFO")


@dataclass
class LightRAGConfig(QABenchmarkConfig):
    """Configuration for LightRAG QA benchmark."""

    # Storage parameters
    working_dir: str = "./lightrag_storage"

    # Query parameters
    query_mode: str = "hybrid"  # "naive", "local", "global", "hybrid"

    # Default results file
    results_file: str = "hotpot_qa_lightrag_results.json"


class QABenchmarkLightRAG(QABenchmarkRAG):
    """LightRAG implementation of QA benchmark."""

    def __init__(self, corpus, qa_pairs, config: LightRAGConfig):
        super().__init__(corpus, qa_pairs, config)
        self.config: LightRAGConfig = config

        # Ensure working directory exists
        if not os.path.exists(self.config.working_dir):
            os.makedirs(self.config.working_dir)

    async def initialize_rag(self) -> Any:
        """Initialize LightRAG with storage and pipeline setup."""
        lightrag = LightRAG(
            working_dir=self.config.working_dir,
            embedding_func=openai_embed,
            llm_model_func=gpt_4o_mini_complete,
        )

        await lightrag.initialize_storages()
        await initialize_pipeline_status()

        return lightrag

    async def cleanup_rag(self) -> None:
        """Clean up LightRAG storages."""
        if self.rag_client:
            await self.rag_client.finalize_storages()

    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert document into LightRAG."""
        await self.rag_client.ainsert([document])

    async def query_rag(self, question: str) -> str:
        """Query LightRAG and return the answer."""
        result = await self.rag_client.aquery(
            question, param=QueryParam(mode=self.config.query_mode)
        )
        return result

    @property
    def system_name(self) -> str:
        """Return system name."""
        return "LightRAG"


if __name__ == "__main__":
    # Example usage
    config = LightRAGConfig(
        corpus_limit=5,  # Small test
        qa_limit=3,
        query_mode="hybrid",
        print_results=True,
    )

    benchmark = QABenchmarkLightRAG.from_jsons(
        corpus_file="hotpot_50_corpus.json", qa_pairs_file="hotpot_50_qa_pairs.json", config=config
    )

    results = benchmark.run()
evals/comparative_eval/qa_benchmark_mem0.py (new file)
@@ -0,0 +1,113 @@

import asyncio
from dataclasses import dataclass
from typing import Any

from dotenv import load_dotenv
from openai import OpenAI
from mem0 import Memory

from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig

load_dotenv()


@dataclass
class Mem0Config(QABenchmarkConfig):
    """Configuration for Mem0 QA benchmark."""

    # Memory parameters
    user_id: str = "hotpot_qa_user"

    # Model parameters
    model_name: str = "gpt-4o-mini"

    # Default results file
    results_file: str = "hotpot_qa_mem0_results.json"


class QABenchmarkMem0(QABenchmarkRAG):
    """Mem0 implementation of QA benchmark."""

    def __init__(self, corpus, qa_pairs, config: Mem0Config):
        super().__init__(corpus, qa_pairs, config)
        self.config: Mem0Config = config
        self.openai_client = None

    async def initialize_rag(self) -> Any:
        """Initialize Mem0 Memory and OpenAI client."""
        memory = Memory()
        self.openai_client = OpenAI()
        return memory

    async def cleanup_rag(self) -> None:
        """Clean up resources (no cleanup needed for Mem0)."""
        pass

    async def insert_document(self, document: str, document_id: int) -> None:
        """Insert document into Mem0 as conversation messages."""
        # Create conversation messages format
        messages = [
            {"role": "system", "content": "This is a document to remember."},
            {"role": "user", "content": "Please remember this document."},
            {"role": "assistant", "content": document},
        ]

        # Add to memory (wrap sync call in async)
        await asyncio.get_event_loop().run_in_executor(
            None, lambda: self.rag_client.add(messages, user_id=self.config.user_id)
        )

    async def query_rag(self, question: str) -> str:
        """Query Mem0 and generate answer using OpenAI."""
        # Search Mem0 for relevant memories
        relevant_memories = await asyncio.get_event_loop().run_in_executor(
            None,
            lambda: self.rag_client.search(query=question, user_id=self.config.user_id, limit=5),
        )

        # Format memories for context
        memories_str = "\n".join(f"- {entry['memory']}" for entry in relevant_memories["results"])

        # Generate response with OpenAI
        system_prompt = f"You are a helpful AI assistant. Answer the question based on the provided context.\n\nContext:\n{memories_str}"
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
        ]

        # Call OpenAI API (wrap sync call in async)
        response = await asyncio.get_event_loop().run_in_executor(
            None,
            lambda: self.openai_client.chat.completions.create(
                model=self.config.model_name, messages=messages
            ),
        )
        answer = response.choices[0].message.content

        # Store the QA interaction in Mem0
        qa_messages = messages + [{"role": "assistant", "content": answer}]
        await asyncio.get_event_loop().run_in_executor(
            None, lambda: self.rag_client.add(qa_messages, user_id=self.config.user_id)
        )

        return answer

    @property
    def system_name(self) -> str:
        """Return system name."""
        return "Mem0"


if __name__ == "__main__":
    # Example usage
    config = Mem0Config(
        corpus_limit=5,  # Small test
        qa_limit=3,
        print_results=True,
    )

    benchmark = QABenchmarkMem0.from_jsons(
        corpus_file="hotpot_50_corpus.json", qa_pairs_file="hotpot_50_qa_pairs.json", config=config
    )

    results = benchmark.run()