diff --git a/.github/workflows/test_python_3_10.yml b/.github/workflows/test_python_3_10.yml index 45a70ca6f..be102010d 100644 --- a/.github/workflows/test_python_3_10.yml +++ b/.github/workflows/test_python_3_10.yml @@ -47,7 +47,7 @@ jobs: installer-parallel: true - name: Install dependencies - run: poetry install --no-interaction -E docs + run: poetry install --no-interaction -E docs -E evals - name: Download NLTK tokenizer data run: | poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng diff --git a/.github/workflows/test_python_3_11.yml b/.github/workflows/test_python_3_11.yml index ecaeddc36..551d2680f 100644 --- a/.github/workflows/test_python_3_11.yml +++ b/.github/workflows/test_python_3_11.yml @@ -48,7 +48,7 @@ jobs: installer-parallel: true - name: Install dependencies - run: poetry install --no-interaction -E docs + run: poetry install --no-interaction -E docs -E evals - name: Download NLTK tokenizer data run: | diff --git a/.github/workflows/test_python_3_12.yml b/.github/workflows/test_python_3_12.yml index 27d301789..31ac2c87c 100644 --- a/.github/workflows/test_python_3_12.yml +++ b/.github/workflows/test_python_3_12.yml @@ -48,7 +48,7 @@ jobs: installer-parallel: true - name: Install dependencies - run: poetry install --no-interaction -E docs + run: poetry install --no-interaction -E docs -E evals - name: Download NLTK tokenizer data run: | poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng diff --git a/evals/eval_framework/tests/unit/benchmark_adapters_test.py b/cognee/tests/unit/eval_framework/benchmark_adapters_test.py similarity index 87% rename from evals/eval_framework/tests/unit/benchmark_adapters_test.py rename to cognee/tests/unit/eval_framework/benchmark_adapters_test.py index a3e295910..34bf0d22a 100644 --- a/evals/eval_framework/tests/unit/benchmark_adapters_test.py +++ b/cognee/tests/unit/eval_framework/benchmark_adapters_test.py @@ -1,5 +1,4 @@ import pytest -import random from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter @@ -46,12 +45,6 @@ def test_adapter_returns_some_content(AdapterClass): assert len(corpus_list) > 0, f"{AdapterClass.__name__} returned an empty corpus_list." assert len(qa_pairs) > 0, f"{AdapterClass.__name__} returned an empty question_answer_pairs." - # Check the shape - assert len(corpus_list) == len(qa_pairs), ( - f"{AdapterClass.__name__} corpus_list and question_answer_pairs " - "should typically be the same length. Adjust if your adapter differs." - ) - for item in qa_pairs: assert "question" in item, f"{AdapterClass.__name__} missing 'question' key in QA pair." assert "answer" in item, f"{AdapterClass.__name__} missing 'answer' key in QA pair." @@ -70,9 +63,7 @@ def test_adapter_limit(AdapterClass): # Confirm that we didn't receive more than 'limit' # (Some adapters might be allowed to return fewer if the dataset is small) - assert len(corpus_list) <= limit, ( - f"{AdapterClass.__name__} returned more items than requested limit={limit}." - ) + assert len(qa_pairs) <= limit, ( f"{AdapterClass.__name__} returned more QA items than requested limit={limit}." ) diff --git a/evals/eval_framework/benchmark_adapters/dummy_adapter.py b/evals/eval_framework/benchmark_adapters/dummy_adapter.py index 3f35a8be6..c67440940 100644 --- a/evals/eval_framework/benchmark_adapters/dummy_adapter.py +++ b/evals/eval_framework/benchmark_adapters/dummy_adapter.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, Any, LiteralString +from typing import Optional from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter @@ -6,7 +6,7 @@ from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseB class DummyAdapter(BaseBenchmarkAdapter): def load_corpus( self, limit: Optional[int] = None, seed: int = 42 - ) -> tuple[list[Union[LiteralString, str]], list[dict[str, str]]]: + ) -> tuple[list[str], list[dict[str, str]]]: corpus_list = [ "The cognee is an AI memory engine that supports different vector and graph databases", "Neo4j is a graph database supported by cognee", diff --git a/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py b/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py index 3cf4753e1..2732a9d50 100644 --- a/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py +++ b/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py @@ -2,7 +2,7 @@ import requests import os import json import random -from typing import Optional, Union, Any, LiteralString +from typing import Optional, Any from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter @@ -16,7 +16,7 @@ class HotpotQAAdapter(BaseBenchmarkAdapter): def load_corpus( self, limit: Optional[int] = None, seed: int = 42 - ) -> tuple[list[Union[LiteralString, str]], list[dict[str, Any]]]: + ) -> tuple[list[str], list[dict[str, Any]]]: filename = self.dataset_info["filename"] if os.path.exists(filename): diff --git a/evals/eval_framework/benchmark_adapters/musique_adapter.py b/evals/eval_framework/benchmark_adapters/musique_adapter.py index 53858cbb0..73118d73f 100644 --- a/evals/eval_framework/benchmark_adapters/musique_adapter.py +++ b/evals/eval_framework/benchmark_adapters/musique_adapter.py @@ -1,7 +1,7 @@ import os import json import random -from typing import Optional, Union, Any, LiteralString +from typing import Optional, Any import zipfile import gdown @@ -64,8 +64,8 @@ class MusiqueQAAdapter(BaseBenchmarkAdapter): for item in data: # Each 'paragraphs' is a list of dicts; we can concatenate their 'paragraph_text' paragraphs = item.get("paragraphs", []) - combined_paragraphs = " ".join(paragraph["paragraph_text"] for paragraph in paragraphs) - corpus_list.append(combined_paragraphs) + for paragraph in paragraphs: + corpus_list.append(paragraph["paragraph_text"]) question = item.get("question", "") answer = item.get("answer", "") diff --git a/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py b/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py index 0fbe3f156..b65a64364 100644 --- a/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py +++ b/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py @@ -2,7 +2,7 @@ import requests import os import json import random -from typing import Optional, Union, Any, LiteralString +from typing import Optional, Any from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter @@ -14,7 +14,7 @@ class TwoWikiMultihopAdapter(BaseBenchmarkAdapter): def load_corpus( self, limit: Optional[int] = None, seed: int = 42 - ) -> tuple[list[Union[LiteralString, str]], list[dict[str, Any]]]: + ) -> tuple[list[str], list[dict[str, Any]]]: filename = self.dataset_info["filename"] if os.path.exists(filename):