From 17231de5d05a710c17e21b1f62f4b52fd2d6f39a Mon Sep 17 00:00:00 2001
From: alekszievr <44192193+alekszievr@users.noreply.github.com>
Date: Thu, 20 Feb 2025 14:23:53 +0100
Subject: [PATCH] Test: Parse context pieces separately in MusiqueQAAdapter and
 adjust tests [cog-1234] (#561)

## Description

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin

## Summary by CodeRabbit

- **Tests**
  - Updated evaluation checks by removing assertions related to the relationship between `corpus_list` and `qa_pairs`, now focusing solely on `qa_pairs` limits.
- **Refactor**
  - Improved content processing to append each paragraph individually to `corpus_list`, enhancing clarity in data structure.
  - Simplified type annotations in the `load_corpus` method across multiple adapters, ensuring consistency in return types.
- **Chores**
  - Updated dependency installation commands in GitHub Actions workflows for Python 3.10, 3.11, and 3.12 to include additional evaluation-related dependencies.

---------

Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
---
 .github/workflows/test_python_3_10.yml              |  2 +-
 .github/workflows/test_python_3_11.yml              |  2 +-
 .github/workflows/test_python_3_12.yml              |  2 +-
 .../unit/eval_framework}/benchmark_adapters_test.py | 11 +----------
 .../benchmark_adapters/dummy_adapter.py             |  4 ++--
 .../benchmark_adapters/hotpot_qa_adapter.py         |  4 ++--
 .../benchmark_adapters/musique_adapter.py           |  6 +++---
 .../benchmark_adapters/twowikimultihop_adapter.py   |  4 ++--
 8 files changed, 13 insertions(+), 22 deletions(-)
 rename {evals/eval_framework/tests/unit => cognee/tests/unit/eval_framework}/benchmark_adapters_test.py (87%)

diff --git a/.github/workflows/test_python_3_10.yml b/.github/workflows/test_python_3_10.yml
index 45a70ca6f..be102010d 100644
--- a/.github/workflows/test_python_3_10.yml
+++ b/.github/workflows/test_python_3_10.yml
@@ -47,7 +47,7 @@ jobs:
           installer-parallel: true

       - name: Install dependencies
-        run: poetry install --no-interaction -E docs
+        run: poetry install --no-interaction -E docs -E evals

       - name: Download NLTK tokenizer data
         run: |
           poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng

diff --git a/.github/workflows/test_python_3_11.yml b/.github/workflows/test_python_3_11.yml
index ecaeddc36..551d2680f 100644
--- a/.github/workflows/test_python_3_11.yml
+++ b/.github/workflows/test_python_3_11.yml
@@ -48,7 +48,7 @@ jobs:
           installer-parallel: true

       - name: Install dependencies
-        run: poetry install --no-interaction -E docs
+        run: poetry install --no-interaction -E docs -E evals

       - name: Download NLTK tokenizer data
         run: |

diff --git a/.github/workflows/test_python_3_12.yml b/.github/workflows/test_python_3_12.yml
index 27d301789..31ac2c87c 100644
--- a/.github/workflows/test_python_3_12.yml
+++ b/.github/workflows/test_python_3_12.yml
@@ -48,7 +48,7 @@ jobs:
           installer-parallel: true

       - name: Install dependencies
-        run: poetry install --no-interaction -E docs
+        run: poetry install --no-interaction -E docs -E evals

       - name: Download NLTK tokenizer data
         run: |
           poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng

diff --git a/evals/eval_framework/tests/unit/benchmark_adapters_test.py b/cognee/tests/unit/eval_framework/benchmark_adapters_test.py
similarity index 87%
rename from evals/eval_framework/tests/unit/benchmark_adapters_test.py
rename to cognee/tests/unit/eval_framework/benchmark_adapters_test.py
index a3e295910..34bf0d22a 100644
--- a/evals/eval_framework/tests/unit/benchmark_adapters_test.py
+++ b/cognee/tests/unit/eval_framework/benchmark_adapters_test.py
@@ -1,5 +1,4 @@
 import pytest
-import random
 from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
 from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
 from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
@@ -46,12 +45,6 @@ def test_adapter_returns_some_content(AdapterClass):
     assert len(corpus_list) > 0, f"{AdapterClass.__name__} returned an empty corpus_list."
     assert len(qa_pairs) > 0, f"{AdapterClass.__name__} returned an empty question_answer_pairs."

-    # Check the shape
-    assert len(corpus_list) == len(qa_pairs), (
-        f"{AdapterClass.__name__} corpus_list and question_answer_pairs "
-        "should typically be the same length. Adjust if your adapter differs."
-    )
-
     for item in qa_pairs:
         assert "question" in item, f"{AdapterClass.__name__} missing 'question' key in QA pair."
         assert "answer" in item, f"{AdapterClass.__name__} missing 'answer' key in QA pair."
@@ -70,9 +63,7 @@ def test_adapter_limit(AdapterClass):

     # Confirm that we didn't receive more than 'limit'
     # (Some adapters might be allowed to return fewer if the dataset is small)
-    assert len(corpus_list) <= limit, (
-        f"{AdapterClass.__name__} returned more items than requested limit={limit}."
-    )
+    assert len(qa_pairs) <= limit, (f"{AdapterClass.__name__} returned more QA items than requested limit={limit}.")

diff --git a/evals/eval_framework/benchmark_adapters/dummy_adapter.py b/evals/eval_framework/benchmark_adapters/dummy_adapter.py
index 3f35a8be6..c67440940 100644
--- a/evals/eval_framework/benchmark_adapters/dummy_adapter.py
+++ b/evals/eval_framework/benchmark_adapters/dummy_adapter.py
@@ -1,4 +1,4 @@
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional

 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter

@@ -6,7 +6,7 @@ from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseB
 class DummyAdapter(BaseBenchmarkAdapter):
     def load_corpus(
         self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[Union[LiteralString, str]], list[dict[str, str]]]:
+    ) -> tuple[list[str], list[dict[str, str]]]:
         corpus_list = [
             "The cognee is an AI memory engine that supports different vector and graph databases",
             "Neo4j is a graph database supported by cognee",

diff --git a/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py b/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py
index 3cf4753e1..2732a9d50 100644
--- a/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py
+++ b/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py
@@ -2,7 +2,7 @@ import requests
 import os
 import json
 import random
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional, Any

 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter

@@ -16,7 +16,7 @@ class HotpotQAAdapter(BaseBenchmarkAdapter):

     def load_corpus(
         self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[Union[LiteralString, str]], list[dict[str, Any]]]:
+    ) -> tuple[list[str], list[dict[str, Any]]]:
         filename = self.dataset_info["filename"]

         if os.path.exists(filename):

diff --git a/evals/eval_framework/benchmark_adapters/musique_adapter.py b/evals/eval_framework/benchmark_adapters/musique_adapter.py
index 53858cbb0..73118d73f 100644
--- a/evals/eval_framework/benchmark_adapters/musique_adapter.py
+++ b/evals/eval_framework/benchmark_adapters/musique_adapter.py
@@ -1,7 +1,7 @@
 import os
 import json
 import random
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional, Any
 import zipfile

 import gdown
@@ -64,8 +64,8 @@ class MusiqueQAAdapter(BaseBenchmarkAdapter):
         for item in data:
             # Each 'paragraphs' is a list of dicts; we can concatenate their 'paragraph_text'
             paragraphs = item.get("paragraphs", [])
-            combined_paragraphs = " ".join(paragraph["paragraph_text"] for paragraph in paragraphs)
-            corpus_list.append(combined_paragraphs)
+            for paragraph in paragraphs:
+                corpus_list.append(paragraph["paragraph_text"])

             question = item.get("question", "")
             answer = item.get("answer", "")

diff --git a/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py b/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py
index 0fbe3f156..b65a64364 100644
--- a/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py
+++ b/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py
@@ -2,7 +2,7 @@ import requests
 import os
 import json
 import random
-from typing import Optional, Union, Any, LiteralString
+from typing import Optional, Any

 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter

@@ -14,7 +14,7 @@ class TwoWikiMultihopAdapter(BaseBenchmarkAdapter):

     def load_corpus(
         self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[Union[LiteralString, str]], list[dict[str, Any]]]:
+    ) -> tuple[list[str], list[dict[str, Any]]]:
         filename = self.dataset_info["filename"]

         if os.path.exists(filename):
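For context, a minimal standalone sketch (not part of the patch) of what the `MusiqueQAAdapter.load_corpus` change does: each paragraph's `paragraph_text` is now appended to `corpus_list` individually instead of being joined into one string per item, so `corpus_list` can grow faster than the QA pairs. The sample `data` below and the names outside the inner loop are illustrative placeholders; only the per-paragraph loop mirrors the patched code.

```python
# Toy MuSiQue-style record; the real adapter loads records from the downloaded dataset.
data = [
    {
        "paragraphs": [
            {"paragraph_text": "Paris is the capital of France."},
            {"paragraph_text": "The Louvre is located in Paris."},
        ],
        "question": "In which city is the Louvre located?",
        "answer": "Paris",
    }
]

corpus_list: list[str] = []
question_answer_pairs: list[dict[str, str]] = []

for item in data:
    paragraphs = item.get("paragraphs", [])

    # Before this patch: all paragraphs of an item were joined into a single corpus entry.
    # combined = " ".join(p["paragraph_text"] for p in paragraphs)
    # corpus_list.append(combined)

    # After this patch: each context piece becomes its own corpus entry.
    for paragraph in paragraphs:
        corpus_list.append(paragraph["paragraph_text"])

    question_answer_pairs.append(
        {"question": item.get("question", ""), "answer": item.get("answer", "")}
    )

# corpus_list (2 entries) is now longer than question_answer_pairs (1 entry), which is
# why the test suite dropped the len(corpus_list) == len(qa_pairs) assertion and applies
# the limit check to qa_pairs instead.
print(len(corpus_list), len(question_answer_pairs))  # -> 2 1
```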