Test: Parse context pieces separately in MusiqueQAAdapter and adjust tests [cog-1234] (#561)
## Description

Parse each context paragraph separately in `MusiqueQAAdapter` instead of concatenating them into a single corpus entry, and adjust the adapter tests accordingly.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **Tests**
  - Removed the assertions tying the length of `corpus_list` to the length of `qa_pairs`; the checks now focus solely on `qa_pairs` limits.
- **Refactor**
  - Content processing now appends each paragraph individually to `corpus_list`, clarifying the data structure.
  - Simplified the return type annotations of the `load_corpus` method across multiple adapters for consistency.
- **Chores**
  - Updated the dependency installation commands in the GitHub Actions workflows for Python 3.10, 3.11, and 3.12 to include the evaluation extras.

Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
commit 17231de5d0 (parent e25c7c93fe)

8 changed files with 13 additions and 22 deletions
```diff
--- a/.github/workflows/test_python_3_10.yml
+++ b/.github/workflows/test_python_3_10.yml
@@ -47,7 +47,7 @@ jobs:
           installer-parallel: true
 
       - name: Install dependencies
-        run: poetry install --no-interaction -E docs
+        run: poetry install --no-interaction -E docs -E evals
       - name: Download NLTK tokenizer data
         run: |
           poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
```
```diff
--- a/.github/workflows/test_python_3_11.yml
+++ b/.github/workflows/test_python_3_11.yml
@@ -48,7 +48,7 @@ jobs:
           installer-parallel: true
 
       - name: Install dependencies
-        run: poetry install --no-interaction -E docs
+        run: poetry install --no-interaction -E docs -E evals
 
       - name: Download NLTK tokenizer data
         run: |
```
```diff
--- a/.github/workflows/test_python_3_12.yml
+++ b/.github/workflows/test_python_3_12.yml
@@ -48,7 +48,7 @@ jobs:
           installer-parallel: true
 
      - name: Install dependencies
-        run: poetry install --no-interaction -E docs
+        run: poetry install --no-interaction -E docs -E evals
      - name: Download NLTK tokenizer data
        run: |
          poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
```
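All three workflow files get the same change: the Poetry install now pulls the `evals` extra, whose dependencies need the NLTK tokenizer and tagger data that the following step pre-downloads. Roughly the same download can be done from Python directly; this is a hedged sketch assuming a recent NLTK release where these resource names exist:

```python
# Rough Python equivalent of the workflow's NLTK download step.
# Assumes a recent NLTK where these resource names are available.
import nltk

for resource in ("punkt_tab", "averaged_perceptron_tagger_eng"):
    nltk.download(resource)
```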
```diff
@@ -1,5 +1,4 @@
 import pytest
 import random
 from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
 from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
 from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
@@ -46,12 +45,6 @@ def test_adapter_returns_some_content(AdapterClass):
     assert len(corpus_list) > 0, f"{AdapterClass.__name__} returned an empty corpus_list."
     assert len(qa_pairs) > 0, f"{AdapterClass.__name__} returned an empty question_answer_pairs."
 
-    # Check the shape
-    assert len(corpus_list) == len(qa_pairs), (
-        f"{AdapterClass.__name__} corpus_list and question_answer_pairs "
-        "should typically be the same length. Adjust if your adapter differs."
-    )
-
     for item in qa_pairs:
         assert "question" in item, f"{AdapterClass.__name__} missing 'question' key in QA pair."
         assert "answer" in item, f"{AdapterClass.__name__} missing 'answer' key in QA pair."
@@ -70,9 +63,7 @@ def test_adapter_limit(AdapterClass):
     # Confirm that we didn't receive more than 'limit'
     # (Some adapters might be allowed to return fewer if the dataset is small)
-    assert len(corpus_list) <= limit, (
-        f"{AdapterClass.__name__} returned more items than requested limit={limit}."
-    )
 
     assert len(qa_pairs) <= limit, (
         f"{AdapterClass.__name__} returned more QA items than requested limit={limit}."
     )
```
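After the MusiqueQA change, one benchmark item can contribute several corpus entries, so equal lengths of `corpus_list` and `qa_pairs` are no longer an invariant and only the `qa_pairs` limit stays checked. A minimal sketch of the surviving shape of the limit test, assuming the adapters take no constructor arguments (the test function name here is illustrative, not the file's actual test name):

```python
import pytest

from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter


@pytest.mark.parametrize("AdapterClass", [DummyAdapter])
def test_limit_sketch(AdapterClass):
    limit = 2
    corpus_list, qa_pairs = AdapterClass().load_corpus(limit=limit)
    # corpus_list may now hold several paragraphs per question, so only
    # qa_pairs is bounded against the requested limit.
    assert len(qa_pairs) <= limit
```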
```diff
--- a/evals/eval_framework/benchmark_adapters/dummy_adapter.py
+++ b/evals/eval_framework/benchmark_adapters/dummy_adapter.py
@@ -1,4 +1,4 @@
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional
 
 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
 
@@ -6,7 +6,7 @@ from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
 class DummyAdapter(BaseBenchmarkAdapter):
     def load_corpus(
         self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[Union[LiteralString, str]], list[dict[str, str]]]:
+    ) -> tuple[list[str], list[dict[str, str]]]:
         corpus_list = [
             "The cognee is an AI memory engine that supports different vector and graph databases",
             "Neo4j is a graph database supported by cognee",
```
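The annotation cleanup is sound on two counts: `LiteralString` (PEP 675) is a subtype of `str`, so `Union[LiteralString, str]` describes exactly the same values as plain `str`; and `typing.LiteralString` only exists from Python 3.11 on, so importing it would break the Python 3.10 job these workflows cover. A standalone sketch of the simplified signature, reusing the corpus strings from the diff (the QA pair is illustrative):

```python
from typing import Optional


def load_corpus_sketch(
    limit: Optional[int] = None, seed: int = 42
) -> tuple[list[str], list[dict[str, str]]]:
    # Plain str covers every value the old Union[LiteralString, str] did;
    # seed is kept for signature parity but unused in this sketch.
    corpus_list = [
        "The cognee is an AI memory engine that supports different vector and graph databases",
        "Neo4j is a graph database supported by cognee",
    ]
    qa_pairs = [{"question": "Which graph database does cognee support?", "answer": "Neo4j"}]
    return corpus_list[:limit], qa_pairs[:limit]
```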
```diff
--- a/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py
+++ b/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py
@@ -2,7 +2,7 @@ import requests
 import os
 import json
 import random
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional, Any
 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
 
 
@@ -16,7 +16,7 @@ class HotpotQAAdapter(BaseBenchmarkAdapter):
 
     def load_corpus(
         self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[Union[LiteralString, str]], list[dict[str, Any]]]:
+    ) -> tuple[list[str], list[dict[str, Any]]]:
         filename = self.dataset_info["filename"]
 
         if os.path.exists(filename):
```
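The unchanged context lines around the HotpotQA signature show the adapter's load pattern: resolve a dataset filename from `dataset_info` and reuse the file when it already exists on disk. A hedged sketch of that cache-or-download shape; the function name, the `url` parameter, and the JSON handling are assumptions for illustration, not the adapter's actual code:

```python
import json
import os

import requests


def load_raw_dataset(filename: str, url: str) -> list[dict]:
    # Reuse a previously downloaded copy when one exists on disk.
    if os.path.exists(filename):
        with open(filename, "r", encoding="utf-8") as f:
            return json.load(f)
    # Otherwise fetch the dataset once and cache it for later runs.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    data = response.json()
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f)
    return data
```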
```diff
--- a/evals/eval_framework/benchmark_adapters/musique_adapter.py
+++ b/evals/eval_framework/benchmark_adapters/musique_adapter.py
@@ -1,7 +1,7 @@
 import os
 import json
 import random
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional, Any
 import zipfile
 
 import gdown
@@ -64,8 +64,8 @@ class MusiqueQAAdapter(BaseBenchmarkAdapter):
         for item in data:
             # Each 'paragraphs' is a list of dicts; we can concatenate their 'paragraph_text'
             paragraphs = item.get("paragraphs", [])
-            combined_paragraphs = " ".join(paragraph["paragraph_text"] for paragraph in paragraphs)
-            corpus_list.append(combined_paragraphs)
+            for paragraph in paragraphs:
+                corpus_list.append(paragraph["paragraph_text"])
 
             question = item.get("question", "")
             answer = item.get("answer", "")
```
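This is the substantive change of the PR: instead of joining every `paragraph_text` of an item into one corpus string, each paragraph becomes its own `corpus_list` entry, so retrieval granularity is per paragraph and `corpus_list` can legitimately outgrow `qa_pairs`. A self-contained sketch of the new parsing using the field names from the diff (the sample record is illustrative, not MuSiQue data):

```python
def parse_musique_items(data: list[dict]) -> tuple[list[str], list[dict[str, str]]]:
    corpus_list: list[str] = []
    qa_pairs: list[dict[str, str]] = []
    for item in data:
        # One corpus entry per paragraph, not one joined string per item.
        for paragraph in item.get("paragraphs", []):
            corpus_list.append(paragraph["paragraph_text"])
        qa_pairs.append({"question": item.get("question", ""), "answer": item.get("answer", "")})
    return corpus_list, qa_pairs


sample = [
    {
        "paragraphs": [
            {"paragraph_text": "First supporting context."},
            {"paragraph_text": "Second supporting context."},
        ],
        "question": "Which contexts support the answer?",
        "answer": "Both.",
    }
]
corpus, qa = parse_musique_items(sample)
assert len(corpus) == 2 and len(qa) == 1  # corpus now outgrows qa_pairs
```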
```diff
@@ -2,7 +2,7 @@ import requests
 import os
 import json
 import random
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional, Any
 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
 
 
@@ -14,7 +14,7 @@ class TwoWikiMultihopAdapter(BaseBenchmarkAdapter):
 
     def load_corpus(
         self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[Union[LiteralString, str]], list[dict[str, Any]]]:
+    ) -> tuple[list[str], list[dict[str, Any]]]:
         filename = self.dataset_info["filename"]
 
         if os.path.exists(filename):
```