feat: unify comparative evals (#916)

## Description
- Comparative Framework: an independent benchmarking system for evaluating
different RAG/QA systems
- HotpotQA Dataset: a 50-instance corpus and the corresponding QA pairs for
standardized evaluation
- Base Class: an abstract `QABenchmarkRAG` with an async pipeline for document
ingestion and question answering (see the sketch below)
- Three Benchmarks: standalone implementations for Mem0, LightRAG, and
Graphiti, each with its own dependencies
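
As a rough sketch of the contract the base class defines (illustrative only;
`MyRagClient` and its methods are hypothetical placeholders, not part of this
PR), a new system is wired in roughly like this:

```python
from dataclasses import dataclass
from typing import Any

from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig


class MyRagClient:
    """Hypothetical stand-in for the system under test."""

    async def add(self, document: str) -> None:
        pass  # ingest a document

    async def ask(self, question: str) -> str:
        return "stub answer"  # answer a question


@dataclass
class MyRagConfig(QABenchmarkConfig):
    results_file: str = "hotpot_qa_myrag_results.json"


class QABenchmarkMyRag(QABenchmarkRAG):
    async def initialize_rag(self) -> Any:
        return MyRagClient()

    async def cleanup_rag(self) -> None:
        pass  # close connections here if the client requires it

    async def insert_document(self, document: str, document_id: int) -> None:
        await self.rag_client.add(document)

    async def query_rag(self, question: str) -> str:
        return await self.rag_client.ask(question)

    @property
    def system_name(self) -> str:
        return "MyRAG"
```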

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.

---------

Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com>
lxobr 2025-06-11 10:06:09 +02:00 committed by GitHub
parent 9d5835042a
commit cfe9c949a7
10 changed files with 825 additions and 0 deletions

@ -0,0 +1,40 @@
# Comparative QA Benchmarks
Independent benchmarks for different QA/RAG systems using the HotpotQA dataset.
## Dataset Files
- `hotpot_50_corpus.json` - 50 instances from HotpotQA
- `hotpot_50_qa_pairs.json` - Corresponding question-answer pairs (file shapes sketched below)
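
Judging from how `QABenchmarkRAG` consumes these files, the corpus is a JSON
list of document strings and the QA file a list of objects with `question` and
`answer` fields, roughly (placeholder entries, not actual dataset content):

```python
# hotpot_50_corpus.json is a list of raw document strings:
example_corpus = ["<document text 1>", "<document text 2>"]

# hotpot_50_qa_pairs.json is a list of question/answer objects:
example_qa_pairs = [
    {"question": "<question text>", "answer": "<short golden answer>"},
]
```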
## Benchmarks
Each benchmark can be run independently with the appropriate dependencies (all of them also need `python-dotenv` and `tqdm`, which the shared base class imports):
### Mem0
```bash
pip install mem0ai openai
python qa_benchmark_mem0.py
```
### LightRAG
```bash
pip install "lightrag-hku[api]"
python qa_benchmark_lightrag.py
```
### Graphiti
```bash
pip install graphiti-core langchain-openai
python qa_benchmark_graphiti.py
```
## Environment
Create a `.env` file with the required API keys (example below):
- `OPENAI_API_KEY` (all benchmarks)
- `NEO4J_URI`, `NEO4J_USER`, `NEO4J_PASSWORD` (Graphiti only)
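
A minimal `.env` might look like this (placeholder values; adjust to your setup):

```bash
OPENAI_API_KEY=sk-...
NEO4J_URI=bolt://localhost:7687
NEO4J_USER=neo4j
NEO4J_PASSWORD=your-password
```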
## Usage
Each benchmark inherits from the `QABenchmarkRAG` base class and can be configured independently, as in the sketch below.
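
A quick run on a subset of the data (mirroring the `__main__` blocks in the
benchmark scripts; the limits here are arbitrary):

```python
from qa_benchmark_mem0 import Mem0Config, QABenchmarkMem0

config = Mem0Config(corpus_limit=10, qa_limit=10, print_results=False)
benchmark = QABenchmarkMem0.from_jsons(
    corpus_file="hotpot_50_corpus.json",
    qa_pairs_file="hotpot_50_qa_pairs.json",
    config=config,
)
results = benchmark.run()  # ingest corpus, answer questions, write the results file
```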
## Results
Updated results will be posted soon.

@ -0,0 +1,38 @@
#!/usr/bin/env python3
"""Simple script to calculate aggregate metrics for multiple JSON files."""
import os
from cognee.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
from cognee.shared.logging_utils import get_logger
logger = get_logger()
def calculate_aggregates_for_files(json_paths: list[str]) -> None:
"""Calculate aggregate metrics for a list of JSON files."""
for json_path in json_paths:
if not os.path.exists(json_path):
logger.error(f"File not found: {json_path}")
continue
# Generate output path for aggregate metrics in the same folder as input
input_dir = os.path.dirname(json_path)
base_name = os.path.splitext(os.path.basename(json_path))[0]
output_path = os.path.join(input_dir, f"aggregate_metrics_{base_name}.json")
try:
logger.info(f"Calculating aggregate metrics for {json_path}")
calculate_metrics_statistics(json_path, output_path)
logger.info(f"Saved aggregate metrics to {output_path}")
except Exception as e:
logger.error(f"Failed to calculate metrics for {json_path}: {e}")
if __name__ == "__main__":
dir_path = "."  # directory containing the metrics JSON files; adjust as needed
json_file_paths = [
os.path.join(dir_path, f) for f in os.listdir(dir_path) if f.endswith(".json")
]
calculate_aggregates_for_files(json_file_paths)
print("Done calculating aggregate metrics!")

@ -0,0 +1,107 @@
import json
import os
from pathlib import Path
from typing import List, Dict, Any
import pandas as pd
def convert_metrics_file(json_path: str, metrics: List[str] = None) -> Dict[str, Any]:
"""Convert a single metrics JSON file to the desired format."""
if metrics is None:
metrics = ["correctness", "f1", "EM"]
with open(json_path, "r") as f:
data = json.load(f)
# Extract filename without extension for system name
filename = Path(json_path).stem
# Convert to desired format
result = {
"system": filename,
"Human-LLM Correctness": None,
"Human-LLM Correctness Error": None,
}
# Add metrics dynamically based on the metrics list
for metric in metrics:
if metric in data:
result[f"DeepEval {metric.title()}"] = data[metric]["mean"]
result[f"DeepEval {metric.title()} Error"] = [
data[metric]["ci_lower"],
data[metric]["ci_upper"],
]
else:
print(f"Warning: Metric '{metric}' not found in {json_path}")
return result
def convert_to_dataframe(results: List[Dict[str, Any]]) -> pd.DataFrame:
"""Convert results list to DataFrame with expanded error columns."""
df_data = []
for result in results:
row = {}
for key, value in result.items():
if key.endswith("Error") and isinstance(value, list) and len(value) == 2:
# Split error columns into lower and upper
row[f"{key} Lower"] = value[0]
row[f"{key} Upper"] = value[1]
else:
row[key] = value
df_data.append(row)
return pd.DataFrame(df_data)
def process_multiple_files(
json_paths: List[str], output_path: str, metrics: List[str] = None
) -> None:
"""Process multiple JSON files and save concatenated results."""
if metrics is None:
metrics = ["correctness", "f1", "EM"]
results = []
for json_path in json_paths:
try:
converted = convert_metrics_file(json_path, metrics)
results.append(converted)
print(f"Processed: {json_path}")
except Exception as e:
print(f"Error processing {json_path}: {e}")
# Save JSON results
with open(output_path, "w") as f:
json.dump(results, f, indent=2)
print(f"Saved {len(results)} results to {output_path}")
# Convert to DataFrame and save CSV
df = convert_to_dataframe(results)
csv_path = output_path.replace(".json", ".csv")
df.to_csv(csv_path, index=False)
print(f"Saved DataFrame to {csv_path}")
if __name__ == "__main__":
# Default metrics (can be customized here)
# default_metrics = ['correctness', 'f1', 'EM']
default_metrics = ["correctness"]
# List JSON files in the current directory
current_dir = "."
json_files = [f for f in os.listdir(current_dir) if f.endswith(".json")]
if json_files:
print(f"Found {len(json_files)} JSON files:")
for f in json_files:
print(f" - {f}")
# Create full paths for JSON files and output file in current working directory
json_full_paths = [os.path.join(current_dir, f) for f in json_files]
output_file = os.path.join(current_dir, "converted_metrics.json")
process_multiple_files(json_full_paths, output_file, default_metrics)
else:
print("No JSON files found in current directory")

@ -0,0 +1,161 @@
import modal
import os
import asyncio
import datetime
import hashlib
import json
from cognee.shared.logging_utils import get_logger
from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import create_dashboard
logger = get_logger()
vol = modal.Volume.from_name("comparison-eval-answers", create_if_missing=True)
app = modal.App("comparison-eval-answers")
image = (
modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
.copy_local_file("pyproject.toml", "pyproject.toml")
.copy_local_file("poetry.lock", "poetry.lock")
.env(
{
"ENV": os.getenv("ENV"),
"LLM_API_KEY": os.getenv("LLM_API_KEY"),
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
}
)
.pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)
@app.function(image=image, concurrency_limit=10, timeout=86400, volumes={"/data": vol})
async def modal_evaluate_answers(
answers_json_content: dict, answers_filename: str, eval_config: dict = None
):
"""Evaluates answers from JSON content and returns metrics results."""
if eval_config is None:
eval_config = EvalConfig().to_dict()
timestamp = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
# Create temporary file path for the JSON content
base_name = os.path.splitext(answers_filename)[0]
temp_answers_path = f"/data/temp_answers_{base_name}_{timestamp}.json"
# Write JSON content to temporary file
with open(temp_answers_path, "w") as f:
json.dump(answers_json_content, f, ensure_ascii=False, indent=4)
# Set up output paths with simplified naming: prefix_original_file_name
eval_params = eval_config.copy()
eval_params["answers_path"] = temp_answers_path
eval_params["metrics_path"] = f"/data/metrics_{answers_filename}"
eval_params["aggregate_metrics_path"] = f"/data/aggregate_metrics_{answers_filename}"
eval_params["dashboard_path"] = f"/data/dashboard_{os.path.splitext(answers_filename)[0]}.html"
# eval_params["evaluation_engine"] = "DirectLLM"
# eval_params["evaluation_metrics"] = ["correctness"]
logger.info(f"Evaluating answers from: {answers_filename}")
logger.info(f"Using eval params: {eval_params}")
try:
# Only run evaluation (skip corpus building and question answering)
evaluated_answers = await run_evaluation(eval_params)
# Save evaluated answers
evaluated_answers_path = f"/data/evaluated_{answers_filename}"
with open(evaluated_answers_path, "w") as f:
json.dump(evaluated_answers, f, ensure_ascii=False, indent=4)
vol.commit()
# Generate dashboard if requested
if eval_params.get("dashboard"):
logger.info("Generating dashboard...")
html_output = create_dashboard(
metrics_path=eval_params["metrics_path"],
aggregate_metrics_path=eval_params["aggregate_metrics_path"],
output_file=eval_params["dashboard_path"],
benchmark=eval_params.get("benchmark", "Unknown"),
)
with open(eval_params["dashboard_path"], "w") as f:
f.write(html_output)
vol.commit()
logger.info(f"Evaluation completed for {answers_filename}")
# Return metrics results
result = {
"answers_file": answers_filename,
"metrics_path": eval_params["metrics_path"],
"aggregate_metrics_path": eval_params["aggregate_metrics_path"],
"dashboard_path": eval_params["dashboard_path"]
if eval_params.get("dashboard")
else None,
"evaluated_answers_path": evaluated_answers_path,
}
return result
except Exception as e:
logger.error(f"Error evaluating {answers_filename}: {e}")
raise
@app.local_entrypoint()
async def main():
"""Main entry point that evaluates multiple JSON answer files in parallel."""
json_files_dir = "."  # directory containing answer JSON files; adjust as needed
json_files = [f for f in os.listdir(json_files_dir) if f.endswith(".json")]
json_file_paths = [os.path.join(json_files_dir, f) for f in json_files]
# Manually specify your evaluation configuration here
eval_config = EvalConfig(
# Only evaluation-related settings
evaluating_answers=True,
evaluating_contexts=False,
evaluation_engine="DeepEval",
evaluation_metrics=["correctness", "EM", "f1"],
calculate_metrics=True,
dashboard=True,
deepeval_model="gpt-4o-mini",
).to_dict()
logger.info(f"Starting evaluation of {len(json_file_paths)} JSON files")
# Read JSON files locally and prepare tasks
modal_tasks = []
for json_path in json_file_paths:
try:
# Read JSON content locally
with open(json_path, "r", encoding="utf-8") as f:
json_content = json.load(f)
filename = os.path.basename(json_path)
# Create remote evaluation task with JSON content
task = modal_evaluate_answers.remote.aio(json_content, filename, eval_config)
modal_tasks.append(task)
except (FileNotFoundError, json.JSONDecodeError) as e:
logger.error(f"Error reading {json_path}: {e}")
continue
if not modal_tasks:
logger.error("No valid JSON files found to process")
return []
# Run evaluations in parallel
results = await asyncio.gather(*modal_tasks, return_exceptions=True)
# Log results
for i, result in enumerate(results):
if isinstance(result, Exception):
logger.error(f"Failed to evaluate {json_file_paths[i]}: {result}")
else:
logger.info(f"Successfully evaluated {result['answers_file']}")
return results
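
Assuming the script is saved as, for example, `modal_eval_answers.py` (the
file name is not fixed by this PR) and the Modal CLI is configured, the local
entrypoint is launched with:

```bash
modal run modal_eval_answers.py
```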

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,159 @@
import asyncio
import json
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from dotenv import load_dotenv
from tqdm import tqdm
load_dotenv()
@dataclass
class QABenchmarkConfig:
"""Base configuration for QA benchmark pipelines."""
corpus_limit: Optional[int] = None
qa_limit: Optional[int] = None
results_file: str = "hotpot_qa_results.json"
print_results: bool = True
class QABenchmarkRAG(ABC):
"""Abstract base class for QA benchmarking with different RAG systems."""
def __init__(
self, corpus: List[str], qa_pairs: List[Dict[str, Any]], config: QABenchmarkConfig
):
"""Initialize the benchmark with corpus and QA data."""
self.corpus = corpus
self.qa_pairs = qa_pairs
self.config = config
self.rag_client = None
# Apply limits if specified
if config.corpus_limit is not None:
self.corpus = self.corpus[: config.corpus_limit]
print(f"Limited to first {config.corpus_limit} documents")
if config.qa_limit is not None:
self.qa_pairs = self.qa_pairs[: config.qa_limit]
print(f"Limited to first {config.qa_limit} questions")
@classmethod
def from_jsons(
cls, corpus_file: str, qa_pairs_file: str, config: QABenchmarkConfig
) -> "QABenchmarkRAG":
"""Create benchmark instance by loading data from JSON files."""
print(f"Loading corpus from {corpus_file}...")
with open(corpus_file) as file:
corpus = json.load(file)
print(f"Loading QA pairs from {qa_pairs_file}...")
with open(qa_pairs_file) as file:
qa_pairs = json.load(file)
return cls(corpus, qa_pairs, config)
@abstractmethod
async def initialize_rag(self) -> Any:
"""Initialize the RAG system. Returns the RAG client."""
pass
@abstractmethod
async def cleanup_rag(self) -> None:
"""Clean up RAG system resources."""
pass
@abstractmethod
async def insert_document(self, document: str, document_id: int) -> None:
"""Insert a single document into the RAG system."""
pass
@abstractmethod
async def query_rag(self, question: str) -> str:
"""Query the RAG system and return the answer."""
pass
@property
@abstractmethod
def system_name(self) -> str:
"""Return the name of the RAG system for logging."""
pass
async def load_corpus_to_rag(self) -> None:
"""Load corpus data into the RAG system."""
print(f"Adding {len(self.corpus)} documents to {self.system_name}...")
for i, document in enumerate(tqdm(self.corpus, desc="Adding documents")):
await self.insert_document(document, i + 1)
print(f"All documents added to {self.system_name}")
async def answer_questions(self) -> List[Dict[str, Any]]:
"""Answer questions using the RAG system."""
print(f"Processing {len(self.qa_pairs)} questions...")
results = []
for i, qa_pair in enumerate(self.qa_pairs):
question = qa_pair.get("question")
expected_answer = qa_pair.get("answer")
print(f"Processing question {i + 1}/{len(self.qa_pairs)}: {question}")
# Get answer from RAG system
try:
answer = await self.query_rag(question)
except Exception as e:
print(f"Error processing question {i + 1}: {e}")
answer = f"Error: {str(e)}"
result = {"question": question, "answer": answer, "golden_answer": expected_answer}
if self.config.print_results:
print(
f"Question {i + 1}: {question}\nResponse: {answer}\nExpected: {expected_answer}\n{'-' * 50}"
)
results.append(result)
return results
def save_results(self, results: List[Dict[str, Any]]) -> None:
"""Save results to JSON file."""
if self.config.results_file:
print(f"Saving results to {self.config.results_file}...")
with open(self.config.results_file, "w", encoding="utf-8") as file:
json.dump(results, file, indent=2)
async def run_benchmark(self) -> List[Dict[str, Any]]:
"""Run the complete benchmark pipeline."""
print(f"Starting QA benchmark for {self.system_name}...")
try:
# Initialize RAG system
self.rag_client = await self.initialize_rag()
# Load corpus
await self.load_corpus_to_rag()
# Answer questions
results = await self.answer_questions()
# Save results
self.save_results(results)
print(f"Results saved to {self.config.results_file}")
print("Pipeline completed successfully")
return results
except Exception as e:
print(f"An error occurred during benchmark: {e}")
raise
finally:
# Cleanup
if self.rag_client:
await self.cleanup_rag()
def run(self) -> List[Dict[str, Any]]:
"""Synchronous wrapper for the benchmark."""
return asyncio.run(self.run_benchmark())

@ -0,0 +1,114 @@
import asyncio
import os
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from graphiti_core import Graphiti
from graphiti_core.nodes import EpisodeType
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig
load_dotenv()
@dataclass
class GraphitiConfig(QABenchmarkConfig):
"""Configuration for Graphiti QA benchmark."""
# Database parameters
db_url: str = os.getenv("NEO4J_URI")
db_user: str = os.getenv("NEO4J_USER")
db_password: str = os.getenv("NEO4J_PASSWORD")
# Model parameters
model_name: str = "gpt-4o-mini"
# Default results file
results_file: str = "hotpot_qa_graphiti_results.json"
class QABenchmarkGraphiti(QABenchmarkRAG):
"""Graphiti implementation of QA benchmark."""
def __init__(self, corpus, qa_pairs, config: GraphitiConfig):
super().__init__(corpus, qa_pairs, config)
self.config: GraphitiConfig = config
self.llm = None
async def initialize_rag(self) -> Any:
"""Initialize Graphiti and LLM."""
graphiti = Graphiti(self.config.db_url, self.config.db_user, self.config.db_password)
await graphiti.build_indices_and_constraints(delete_existing=True)
# Initialize LLM
self.llm = ChatOpenAI(model=self.config.model_name, temperature=0)
return graphiti
async def cleanup_rag(self) -> None:
"""Clean up Graphiti connection."""
if self.rag_client:
await self.rag_client.close()
async def insert_document(self, document: str, document_id: int) -> None:
"""Insert document into Graphiti as an episode."""
await self.rag_client.add_episode(
name=f"Document {document_id}",
episode_body=document,
source=EpisodeType.text,
source_description="corpus",
reference_time=datetime.now(timezone.utc),
)
async def query_rag(self, question: str) -> str:
"""Query Graphiti and generate answer using LLM."""
# Search Graphiti for relevant facts
results = await self.rag_client.search(query=question, num_results=10)
context = "\n".join(f"- {entry.fact}" for entry in results)
# Generate answer using LLM
messages = [
{
"role": "system",
"content": "Answer minimally using provided facts. Respond with one word or phrase.",
},
{"role": "user", "content": f"Facts:\n{context}\n\nQuestion: {question}"},
]
response = await self.llm.ainvoke(messages)
answer = response.content
# Store the QA interaction in Graphiti
qa_memory = f"Question: {question}\nAnswer: {answer}"
await self.rag_client.add_episode(
name="QA Interaction",
episode_body=qa_memory,
source=EpisodeType.text,
source_description="qa_interaction",
reference_time=datetime.now(timezone.utc),
)
return answer
@property
def system_name(self) -> str:
"""Return system name."""
return "Graphiti"
if __name__ == "__main__":
# Example usage
config = GraphitiConfig(
corpus_limit=5, # Small test
qa_limit=3,
print_results=True,
)
benchmark = QABenchmarkGraphiti.from_jsons(
corpus_file="hotpot_50_corpus.json", qa_pairs_file="hotpot_50_qa_pairs.json", config=config
)
results = benchmark.run()

@ -0,0 +1,91 @@
import asyncio
import os
from dataclasses import dataclass
from typing import Any
from dotenv import load_dotenv
from lightrag import LightRAG, QueryParam
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.utils import setup_logger
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig
load_dotenv()
setup_logger("lightrag", level="INFO")
@dataclass
class LightRAGConfig(QABenchmarkConfig):
"""Configuration for LightRAG QA benchmark."""
# Storage parameters
working_dir: str = "./lightrag_storage"
# Query parameters
query_mode: str = "hybrid" # "naive", "local", "global", "hybrid"
# Default results file
results_file: str = "hotpot_qa_lightrag_results.json"
class QABenchmarkLightRAG(QABenchmarkRAG):
"""LightRAG implementation of QA benchmark."""
def __init__(self, corpus, qa_pairs, config: LightRAGConfig):
super().__init__(corpus, qa_pairs, config)
self.config: LightRAGConfig = config
# Ensure working directory exists
if not os.path.exists(self.config.working_dir):
os.makedirs(self.config.working_dir)
async def initialize_rag(self) -> Any:
"""Initialize LightRAG with storage and pipeline setup."""
lightrag = LightRAG(
working_dir=self.config.working_dir,
embedding_func=openai_embed,
llm_model_func=gpt_4o_mini_complete,
)
await lightrag.initialize_storages()
await initialize_pipeline_status()
return lightrag
async def cleanup_rag(self) -> None:
"""Clean up LightRAG storages."""
if self.rag_client:
await self.rag_client.finalize_storages()
async def insert_document(self, document: str, document_id: int) -> None:
"""Insert document into LightRAG."""
await self.rag_client.ainsert([document])
async def query_rag(self, question: str) -> str:
"""Query LightRAG and return the answer."""
result = await self.rag_client.aquery(
question, param=QueryParam(mode=self.config.query_mode)
)
return result
@property
def system_name(self) -> str:
"""Return system name."""
return "LightRAG"
if __name__ == "__main__":
# Example usage
config = LightRAGConfig(
corpus_limit=5, # Small test
qa_limit=3,
query_mode="hybrid",
print_results=True,
)
benchmark = QABenchmarkLightRAG.from_jsons(
corpus_file="hotpot_50_corpus.json", qa_pairs_file="hotpot_50_qa_pairs.json", config=config
)
results = benchmark.run()

@ -0,0 +1,113 @@
import asyncio
from dataclasses import dataclass
from typing import Any
from dotenv import load_dotenv
from openai import OpenAI
from mem0 import Memory
from qa_benchmark_base import QABenchmarkRAG, QABenchmarkConfig
load_dotenv()
@dataclass
class Mem0Config(QABenchmarkConfig):
"""Configuration for Mem0 QA benchmark."""
# Memory parameters
user_id: str = "hotpot_qa_user"
# Model parameters
model_name: str = "gpt-4o-mini"
# Default results file
results_file: str = "hotpot_qa_mem0_results.json"
class QABenchmarkMem0(QABenchmarkRAG):
"""Mem0 implementation of QA benchmark."""
def __init__(self, corpus, qa_pairs, config: Mem0Config):
super().__init__(corpus, qa_pairs, config)
self.config: Mem0Config = config
self.openai_client = None
async def initialize_rag(self) -> Any:
"""Initialize Mem0 Memory and OpenAI client."""
memory = Memory()
self.openai_client = OpenAI()
return memory
async def cleanup_rag(self) -> None:
"""Clean up resources (no cleanup needed for Mem0)."""
pass
async def insert_document(self, document: str, document_id: int) -> None:
"""Insert document into Mem0 as conversation messages."""
# Create conversation messages format
messages = [
{"role": "system", "content": "This is a document to remember."},
{"role": "user", "content": "Please remember this document."},
{"role": "assistant", "content": document},
]
# Add to memory (wrap sync call in async)
await asyncio.get_event_loop().run_in_executor(
None, lambda: self.rag_client.add(messages, user_id=self.config.user_id)
)
async def query_rag(self, question: str) -> str:
"""Query Mem0 and generate answer using OpenAI."""
# Search Mem0 for relevant memories
relevant_memories = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self.rag_client.search(query=question, user_id=self.config.user_id, limit=5),
)
# Format memories for context
memories_str = "\n".join(f"- {entry['memory']}" for entry in relevant_memories["results"])
# Generate response with OpenAI
system_prompt = f"You are a helpful AI assistant. Answer the question based on the provided context.\n\nContext:\n{memories_str}"
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": question},
]
# Call OpenAI API (wrap sync call in async)
response = await asyncio.get_event_loop().run_in_executor(
None,
lambda: self.openai_client.chat.completions.create(
model=self.config.model_name, messages=messages
),
)
answer = response.choices[0].message.content
# Store the QA interaction in Mem0
qa_messages = messages + [{"role": "assistant", "content": answer}]
await asyncio.get_event_loop().run_in_executor(
None, lambda: self.rag_client.add(qa_messages, user_id=self.config.user_id)
)
return answer
@property
def system_name(self) -> str:
"""Return system name."""
return "Mem0"
if __name__ == "__main__":
# Example usage
config = Mem0Config(
corpus_limit=5, # Small test
qa_limit=3,
print_results=True,
)
benchmark = QABenchmarkMem0.from_jsons(
corpus_file="hotpot_50_corpus.json", qa_pairs_file="hotpot_50_qa_pairs.json", config=config
)
results = benchmark.run()