Test: Parse context pieces separately in MusiqueQAAdapter and adjust tests [cog-1234] (#561)

<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **Tests**
  - Updated evaluation checks by removing the assertion tying the length of `corpus_list` to that of `qa_pairs`; the tests now enforce only the `qa_pairs` limit.

- **Refactor**
  - Improved content processing to append each paragraph individually to `corpus_list`, making the corpus structure clearer (see the sketch below).
  - Simplified type annotations in the `load_corpus` method across multiple adapters, ensuring consistent return types.

- **Chores**
  - Updated dependency installation commands in the GitHub Actions workflows for Python 3.10, 3.11, and 3.12 to include the additional evaluation-related dependencies (`-E evals`).
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
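
To make the corpus refactor concrete, here is a minimal sketch of the new behavior. The item shape (`paragraphs`, `paragraph_text`, `question`, `answer`) mirrors the MusiqueQAAdapter diff below; the sample strings and counts are invented for illustration:

```python
# One MuSiQue-style item, shaped like the records handled in the diff below.
data = [
    {
        "paragraphs": [
            {"paragraph_text": "Paris is the capital of France."},
            {"paragraph_text": "France is a country in Europe."},
        ],
        "question": "On which continent is the capital of France?",
        "answer": "Europe",
    }
]

corpus_list: list[str] = []
qa_pairs: list[dict[str, str]] = []

for item in data:
    # Old behavior: " ".join(...) produced one corpus entry per item.
    # New behavior: each paragraph becomes its own corpus entry.
    for paragraph in item.get("paragraphs", []):
        corpus_list.append(paragraph["paragraph_text"])
    qa_pairs.append({"question": item.get("question", ""), "answer": item.get("answer", "")})

# A single QA pair can now map to several corpus entries, so
# len(corpus_list) == len(qa_pairs) no longer holds in general.
assert len(corpus_list) == 2
assert len(qa_pairs) == 1
```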

---------

Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Commit 17231de5d0 (parent e25c7c93fe), authored by alekszievr on 2025-02-20 14:23:53 +01:00 and committed by GitHub.
8 changed files with 13 additions and 22 deletions


@@ -47,7 +47,7 @@ jobs:
           installer-parallel: true
       - name: Install dependencies
-        run: poetry install --no-interaction -E docs
+        run: poetry install --no-interaction -E docs -E evals
       - name: Download NLTK tokenizer data
         run: |
           poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng


@@ -48,7 +48,7 @@ jobs:
           installer-parallel: true
       - name: Install dependencies
-        run: poetry install --no-interaction -E docs
+        run: poetry install --no-interaction -E docs -E evals
       - name: Download NLTK tokenizer data
         run: |


@@ -48,7 +48,7 @@ jobs:
           installer-parallel: true
       - name: Install dependencies
-        run: poetry install --no-interaction -E docs
+        run: poetry install --no-interaction -E docs -E evals
      - name: Download NLTK tokenizer data
         run: |
           poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
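
Note: Poetry's `-E`/`--extras` flag installs optional dependency groups declared in `pyproject.toml`, so these workflow changes assume an `evals` extra is defined there; it pulls in the evaluation dependencies that the adapter tests below exercise.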


@@ -1,5 +1,4 @@
 import pytest
-import random
 from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
 from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
 from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
@@ -46,12 +45,6 @@ def test_adapter_returns_some_content(AdapterClass):
     assert len(corpus_list) > 0, f"{AdapterClass.__name__} returned an empty corpus_list."
     assert len(qa_pairs) > 0, f"{AdapterClass.__name__} returned an empty question_answer_pairs."
-    # Check the shape
-    assert len(corpus_list) == len(qa_pairs), (
-        f"{AdapterClass.__name__} corpus_list and question_answer_pairs "
-        "should typically be the same length. Adjust if your adapter differs."
-    )
     for item in qa_pairs:
         assert "question" in item, f"{AdapterClass.__name__} missing 'question' key in QA pair."
         assert "answer" in item, f"{AdapterClass.__name__} missing 'answer' key in QA pair."
@@ -70,9 +63,7 @@ def test_adapter_limit(AdapterClass):
     # Confirm that we didn't receive more than 'limit'
     # (Some adapters might be allowed to return fewer if the dataset is small)
-    assert len(corpus_list) <= limit, (
-        f"{AdapterClass.__name__} returned more items than requested limit={limit}."
-    )
     assert len(qa_pairs) <= limit, (
         f"{AdapterClass.__name__} returned more QA items than requested limit={limit}."
     )
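
The dropped `corpus_list` bound follows from the refactor: `limit` caps the number of QA items, but each MuSiQue item now contributes one corpus entry per paragraph, so `corpus_list` can legitimately exceed `limit`. A hypothetical illustration with invented values:

```python
# With limit=1, a single MuSiQue item still yields several corpus entries.
limit = 1
qa_pairs = [{"question": "q1", "answer": "a1"}]                    # capped by limit
corpus_list = ["paragraph 1 of item 1", "paragraph 2 of item 1"]   # one entry per paragraph

assert len(qa_pairs) <= limit       # still enforced by the test above
# assert len(corpus_list) <= limit  # would now fail: 2 > 1
```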


@@ -1,4 +1,4 @@
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional
 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
@@ -6,7 +6,7 @@ from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseB
 class DummyAdapter(BaseBenchmarkAdapter):
     def load_corpus(
         self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[Union[LiteralString, str]], list[dict[str, str]]]:
+    ) -> tuple[list[str], list[dict[str, str]]]:
         corpus_list = [
             "The cognee is an AI memory engine that supports different vector and graph databases",
             "Neo4j is a graph database supported by cognee",


@@ -2,7 +2,7 @@ import requests
 import os
 import json
 import random
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional, Any
 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
@@ -16,7 +16,7 @@ class HotpotQAAdapter(BaseBenchmarkAdapter):
     def load_corpus(
         self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[Union[LiteralString, str]], list[dict[str, Any]]]:
+    ) -> tuple[list[str], list[dict[str, Any]]]:
         filename = self.dataset_info["filename"]
         if os.path.exists(filename):


@@ -1,7 +1,7 @@
 import os
 import json
 import random
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional, Any
 import zipfile
 import gdown
@@ -64,8 +64,8 @@ class MusiqueQAAdapter(BaseBenchmarkAdapter):
         for item in data:
             # Each 'paragraphs' is a list of dicts; we can concatenate their 'paragraph_text'
             paragraphs = item.get("paragraphs", [])
-            combined_paragraphs = " ".join(paragraph["paragraph_text"] for paragraph in paragraphs)
-            corpus_list.append(combined_paragraphs)
+            for paragraph in paragraphs:
+                corpus_list.append(paragraph["paragraph_text"])
             question = item.get("question", "")
             answer = item.get("answer", "")
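
Presumably the motivation is retrieval granularity: each supporting paragraph becomes its own corpus document instead of one concatenated blob per question, which is also why the one-to-one `corpus_list`/`qa_pairs` assertions were removed from the tests above.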


@@ -2,7 +2,7 @@ import requests
 import os
 import json
 import random
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional, Any
 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
@@ -14,7 +14,7 @@ class TwoWikiMultihopAdapter(BaseBenchmarkAdapter):
     def load_corpus(
         self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[Union[LiteralString, str]], list[dict[str, Any]]]:
+    ) -> tuple[list[str], list[dict[str, Any]]]:
         filename = self.dataset_info["filename"]
         if os.path.exists(filename):