cognee/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py
hajdul88 6a0c0e3ef8
feat: Cognee evaluation framework development (#498)
<!-- .github/pull_request_template.md -->

This PR contains the evaluation framework development for cognee

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Expanded evaluation framework now integrates asynchronous corpus
building, question answering, and performance evaluation with adaptive
benchmarks for improved metrics (correctness, exact match, and F1
score).

- **Infrastructure**
- Added database integration for persistent storage of questions,
answers, and metrics.
- Launched an interactive metrics dashboard featuring advanced
visualizations.
- Introduced an automated testing workflow for continuous quality
assurance.

- **Documentation**
  - Updated guidelines for generating concise, clear answers.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-02-11 16:31:54 +01:00

51 lines
1.9 KiB
Python

import requests
import os
import json
import random
from typing import Optional, Union, Any, LiteralString
from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
class HotpotQAAdapter(BaseBenchmarkAdapter):
    """Benchmark adapter for the HotpotQA question-answering dataset.

    Downloads (and caches to disk) the HotpotQA dev-distractor JSON, then
    flattens it into a corpus of context paragraphs plus a list of
    question/answer/level records.
    """

    # Local cache filename and download URL for the benchmark JSON.
    dataset_info = {
        "filename": "hotpot_benchmark.json",
        "url": "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json",
        # train: "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json" delete file after changing the url
        # distractor test: "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json" delete file after changing the url
    }

    def load_corpus(
        self, limit: Optional[int] = None, seed: int = 42
    ) -> tuple[list[str], list[dict[str, Any]]]:
        """Load the HotpotQA corpus and its question/answer pairs.

        Args:
            limit: If given and smaller than the dataset, randomly sample
                this many items (deterministic for a fixed ``seed``).
            seed: Seed used for the random sample.

        Returns:
            A tuple of (context paragraphs joined into strings, list of
            dicts with ``question``, lower-cased ``answer``, and ``level``).

        Raises:
            requests.HTTPError: If the dataset download fails.
        """
        filename = self.dataset_info["filename"]
        if os.path.exists(filename):
            with open(filename, "r", encoding="utf-8") as f:
                corpus_json = json.load(f)
        else:
            # Download once and cache locally; a timeout prevents the call
            # from hanging indefinitely if the server is unresponsive.
            response = requests.get(self.dataset_info["url"], timeout=60)
            response.raise_for_status()
            corpus_json = response.json()
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(corpus_json, f, ensure_ascii=False, indent=4)

        if limit is not None and 0 < limit < len(corpus_json):
            # Use a dedicated Random instance so we don't reseed the
            # process-global RNG; yields the same sample as random.seed+sample.
            corpus_json = random.Random(seed).sample(corpus_json, limit)

        corpus_list = []
        question_answer_pairs = []
        for item in corpus_json:
            # Each context entry is a [title, [sentence, ...]] pair; only the
            # sentences are kept, joined into one paragraph string.
            for title, sentences in item["context"]:
                corpus_list.append(" ".join(sentences))
            question_answer_pairs.append(
                {
                    "question": item["question"],
                    "answer": item["answer"].lower(),
                    "level": item["level"],
                }
            )

        return corpus_list, question_answer_pairs