Test: Parse context pieces separately in MusiqueQAAdapter and adjust tests [cog-1234] (#561)

<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **Tests**
- Updated evaluation checks by removing assertions related to the
relationship between `corpus_list` and `qa_pairs`, now focusing solely
on `qa_pairs` limits.

- **Refactor**
- Improved content processing to append each paragraph individually to
`corpus_list`, enhancing clarity in data structure.
- Simplified type annotations in the `load_corpus` method across
multiple adapters, ensuring consistency in return types.

- **Chores**
- Updated dependency installation commands in GitHub Actions workflows
for Python 3.10, 3.11, and 3.12 to include additional evaluation-related
dependencies.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
This commit is contained in:
alekszievr 2025-02-20 14:23:53 +01:00 committed by GitHub
parent e25c7c93fe
commit 17231de5d0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 13 additions and 22 deletions

View file

@@ -47,7 +47,7 @@ jobs:
installer-parallel: true installer-parallel: true
- name: Install dependencies - name: Install dependencies
run: poetry install --no-interaction -E docs run: poetry install --no-interaction -E docs -E evals
- name: Download NLTK tokenizer data - name: Download NLTK tokenizer data
run: | run: |
poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng

View file

@@ -48,7 +48,7 @@ jobs:
installer-parallel: true installer-parallel: true
- name: Install dependencies - name: Install dependencies
run: poetry install --no-interaction -E docs run: poetry install --no-interaction -E docs -E evals
- name: Download NLTK tokenizer data - name: Download NLTK tokenizer data
run: | run: |

View file

@@ -48,7 +48,7 @@ jobs:
installer-parallel: true installer-parallel: true
- name: Install dependencies - name: Install dependencies
run: poetry install --no-interaction -E docs run: poetry install --no-interaction -E docs -E evals
- name: Download NLTK tokenizer data - name: Download NLTK tokenizer data
run: | run: |
poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng

View file

@@ -1,5 +1,4 @@
import pytest import pytest
import random
from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
@@ -46,12 +45,6 @@ def test_adapter_returns_some_content(AdapterClass):
assert len(corpus_list) > 0, f"{AdapterClass.__name__} returned an empty corpus_list." assert len(corpus_list) > 0, f"{AdapterClass.__name__} returned an empty corpus_list."
assert len(qa_pairs) > 0, f"{AdapterClass.__name__} returned an empty question_answer_pairs." assert len(qa_pairs) > 0, f"{AdapterClass.__name__} returned an empty question_answer_pairs."
# Check the shape
assert len(corpus_list) == len(qa_pairs), (
f"{AdapterClass.__name__} corpus_list and question_answer_pairs "
"should typically be the same length. Adjust if your adapter differs."
)
for item in qa_pairs: for item in qa_pairs:
assert "question" in item, f"{AdapterClass.__name__} missing 'question' key in QA pair." assert "question" in item, f"{AdapterClass.__name__} missing 'question' key in QA pair."
assert "answer" in item, f"{AdapterClass.__name__} missing 'answer' key in QA pair." assert "answer" in item, f"{AdapterClass.__name__} missing 'answer' key in QA pair."
@@ -70,9 +63,7 @@ def test_adapter_limit(AdapterClass):
# Confirm that we didn't receive more than 'limit' # Confirm that we didn't receive more than 'limit'
# (Some adapters might be allowed to return fewer if the dataset is small) # (Some adapters might be allowed to return fewer if the dataset is small)
assert len(corpus_list) <= limit, (
f"{AdapterClass.__name__} returned more items than requested limit={limit}."
)
assert len(qa_pairs) <= limit, ( assert len(qa_pairs) <= limit, (
f"{AdapterClass.__name__} returned more QA items than requested limit={limit}." f"{AdapterClass.__name__} returned more QA items than requested limit={limit}."
) )

View file

@@ -1,4 +1,4 @@
from typing import Optional, Union, Any, LiteralString from typing import Optional
from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
@@ -6,7 +6,7 @@ from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseB
class DummyAdapter(BaseBenchmarkAdapter): class DummyAdapter(BaseBenchmarkAdapter):
def load_corpus( def load_corpus(
self, limit: Optional[int] = None, seed: int = 42 self, limit: Optional[int] = None, seed: int = 42
) -> tuple[list[Union[LiteralString, str]], list[dict[str, str]]]: ) -> tuple[list[str], list[dict[str, str]]]:
corpus_list = [ corpus_list = [
"The cognee is an AI memory engine that supports different vector and graph databases", "The cognee is an AI memory engine that supports different vector and graph databases",
"Neo4j is a graph database supported by cognee", "Neo4j is a graph database supported by cognee",

View file

@@ -2,7 +2,7 @@ import requests
import os import os
import json import json
import random import random
from typing import Optional, Union, Any, LiteralString from typing import Optional, Any
from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
@@ -16,7 +16,7 @@ class HotpotQAAdapter(BaseBenchmarkAdapter):
def load_corpus( def load_corpus(
self, limit: Optional[int] = None, seed: int = 42 self, limit: Optional[int] = None, seed: int = 42
) -> tuple[list[Union[LiteralString, str]], list[dict[str, Any]]]: ) -> tuple[list[str], list[dict[str, Any]]]:
filename = self.dataset_info["filename"] filename = self.dataset_info["filename"]
if os.path.exists(filename): if os.path.exists(filename):

View file

@@ -1,7 +1,7 @@
import os import os
import json import json
import random import random
from typing import Optional, Union, Any, LiteralString from typing import Optional, Any
import zipfile import zipfile
import gdown import gdown
@@ -64,8 +64,8 @@ class MusiqueQAAdapter(BaseBenchmarkAdapter):
for item in data: for item in data:
# Each 'paragraphs' is a list of dicts; we can concatenate their 'paragraph_text' # Each 'paragraphs' is a list of dicts; we can concatenate their 'paragraph_text'
paragraphs = item.get("paragraphs", []) paragraphs = item.get("paragraphs", [])
combined_paragraphs = " ".join(paragraph["paragraph_text"] for paragraph in paragraphs) for paragraph in paragraphs:
corpus_list.append(combined_paragraphs) corpus_list.append(paragraph["paragraph_text"])
question = item.get("question", "") question = item.get("question", "")
answer = item.get("answer", "") answer = item.get("answer", "")

View file

@@ -2,7 +2,7 @@ import requests
import os import os
import json import json
import random import random
from typing import Optional, Union, Any, LiteralString from typing import Optional, Any
from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
@@ -14,7 +14,7 @@ class TwoWikiMultihopAdapter(BaseBenchmarkAdapter):
def load_corpus( def load_corpus(
self, limit: Optional[int] = None, seed: int = 42 self, limit: Optional[int] = None, seed: int = 42
) -> tuple[list[Union[LiteralString, str]], list[dict[str, Any]]]: ) -> tuple[list[str], list[dict[str, Any]]]:
filename = self.dataset_info["filename"] filename = self.dataset_info["filename"]
if os.path.exists(filename): if os.path.exists(filename):