feat/add correctness score calculation with LLM as a judge (#30)

alekszievr 2024-12-03 17:47:18 +01:00 committed by GitHub
commit 706101113a
9 changed files with 1404 additions and 1043 deletions

.gitignore vendored (1 change)

@@ -12,7 +12,6 @@ __pycache__/
 *$py.class
 full_run.ipynb
-evals/
 # C extensions
 *.so

cognee/__init__.py

@@ -1,9 +1,14 @@
-from .api.v1.config.config import config
 from .api.v1.add import add
 from .api.v1.cognify import cognify
+from .api.v1.config.config import config
 from .api.v1.datasets.datasets import datasets
-from .api.v1.search import search, SearchType, get_search_history
 from .api.v1.prune import prune
+from .api.v1.search import SearchType, get_search_history, search

 # Pipelines
 from .modules import pipelines
+try:
+    import dotenv
+    dotenv.load_dotenv()
+except ImportError:
+    pass
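The added try/except block makes the package pick up variables from a local .env file when python-dotenv is installed, and silently skips this otherwise. A hypothetical .env for running the evaluation below might look like this (the variable name is illustrative, not taken from the commit; check cognee's configuration docs for the exact keys):

# .env (illustrative only)
LLM_API_KEY="sk-..."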

answer_hotpot_question.txt Normal file

@@ -0,0 +1,2 @@
Answer the question using the provided context. Be as brief as possible.
Each entry in the context is a paragraph, represented as a two-element list [title, sentences], where sentences is a list of strings.
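For illustration, one context entry in the format this prompt describes might look like the following (contents invented, not taken from the dataset):

# One HotPotQA-style context entry: [title, sentences] (invented example)
context_entry = [
    "Marie Curie",
    [
        "Marie Curie was a physicist and chemist.",
        "She conducted pioneering research on radioactivity.",
    ],
]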

answer_hotpot_using_cognee_search.txt Normal file

@@ -0,0 +1,2 @@
Answer the question using the provided context. Be as brief as possible.
Each entry in the context is a tuple of length 3, representing an edge of a knowledge graph with its two nodes.
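A made-up entry consistent with that description (the exact shape of cognee's INSIGHTS search results is an assumption here) could be:

# (node, relationship, node) edge triple; invented example
edge = ({"name": "penicillin"}, "discovered_by", {"name": "Alexander Fleming"})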

context_for_question.txt Normal file

@@ -0,0 +1,2 @@
The question is: `{{ question }}`
And here is the context: `{{ context }}`
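This template is filled by render_prompt, imported from cognee.infrastructure.llm.prompts in the script below. Assuming Jinja2-style substitution, which the {{ question }} / {{ context }} placeholders suggest, rendering it amounts to something like this sketch:

# Minimal sketch of the substitution the template implies (assumes Jinja2-style rendering)
from jinja2 import Template

template = Template(
    "The question is: `{{ question }}`\nAnd here is the context: `{{ context }}`"
)
user_prompt = template.render(
    question="In which city is the Eiffel Tower located?",
    context=[["Eiffel Tower", ["The Eiffel Tower is a landmark in Paris."]]],
)
print(user_prompt)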

evals/deepeval_metrics.py Normal file (14 additions)

@@ -0,0 +1,14 @@
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

correctness_metric = GEval(
    name="Correctness",
    model="gpt-4o-mini",
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT
    ],
    evaluation_steps=[
        "Determine whether the actual output is factually correct based on the expected output."
    ]
)
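As a quick sanity check of the metric on a single example, standard deepeval usage looks roughly like the following (a sketch, not part of the commit; it calls the judge model, so LLM credentials must be configured):

from deepeval.test_case import LLMTestCase

from evals.deepeval_metrics import correctness_metric

test_case = LLMTestCase(
    input="Who wrote The Old Man and the Sea?",
    actual_output="Ernest Hemingway wrote it.",
    expected_output="Ernest Hemingway",
)
correctness_metric.measure(test_case)  # judged by gpt-4o-mini per the config above
print(correctness_metric.score, correctness_metric.reason)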

evals/llm_as_a_judge.py Normal file (130 additions)

@@ -0,0 +1,130 @@
import argparse
import asyncio
import json
import statistics
from pathlib import Path

import deepeval.metrics
import wget
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase
from tqdm import tqdm

import cognee
import evals.deepeval_metrics
from cognee.api.v1.search import SearchType
from cognee.base_config import get_base_config
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt


async def answer_without_cognee(instance):
    args = {
        "question": instance["question"],
        "context": instance["context"],
    }
    user_prompt = render_prompt("context_for_question.txt", args)
    system_prompt = read_query_prompt("answer_hotpot_question.txt")

    llm_client = get_llm_client()
    answer_prediction = await llm_client.acreate_structured_output(
        text_input=user_prompt,
        system_prompt=system_prompt,
        response_model=str,
    )
    return answer_prediction


async def answer_with_cognee(instance):
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    for (title, sentences) in instance["context"]:
        await cognee.add("\n".join(sentences), dataset_name="HotPotQA")

    await cognee.cognify("HotPotQA")

    search_results = await cognee.search(
        SearchType.INSIGHTS, query_text=instance["question"]
    )

    args = {
        "question": instance["question"],
        "context": search_results,
    }
    user_prompt = render_prompt("context_for_question.txt", args)
    system_prompt = read_query_prompt("answer_hotpot_using_cognee_search.txt")

    llm_client = get_llm_client()
    answer_prediction = await llm_client.acreate_structured_output(
        text_input=user_prompt,
        system_prompt=system_prompt,
        response_model=str,
    )
    return answer_prediction


async def eval_answers(instances, answers, eval_metric):
    test_cases = []

    for instance, answer in zip(instances, answers):
        test_case = LLMTestCase(
            input=instance["question"],
            actual_output=answer,
            expected_output=instance["answer"]
        )
        test_cases.append(test_case)

    eval_set = EvaluationDataset(test_cases)
    eval_results = eval_set.evaluate([eval_metric])

    return eval_results


async def eval_on_hotpotQA(answer_provider, num_samples, eval_metric):
    base_config = get_base_config()
    data_root_dir = base_config.data_root_directory

    if not Path(data_root_dir).exists():
        Path(data_root_dir).mkdir()

    filepath = data_root_dir / Path("hotpot_dev_fullwiki_v1.json")
    if not filepath.exists():
        url = 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json'
        wget.download(url, out=data_root_dir)

    with open(filepath, "r") as file:
        dataset = json.load(file)

    instances = dataset if not num_samples else dataset[:num_samples]

    answers = []
    for instance in tqdm(instances, desc="Getting answers"):
        answer = await answer_provider(instance)
        answers.append(answer)

    eval_results = await eval_answers(instances, answers, eval_metric)
    avg_score = statistics.mean(
        [result.metrics_data[0].score for result in eval_results.test_results]
    )

    return avg_score


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--with_cognee", action="store_true")
    parser.add_argument("--num_samples", type=int, default=500)
    parser.add_argument("--metric", type=str, default="correctness_metric")
    args = parser.parse_args()

    try:
        metric_cls = getattr(deepeval.metrics, args.metric)
        metric = metric_cls()
    except AttributeError:
        metric = getattr(evals.deepeval_metrics, args.metric)

    if args.with_cognee:
        answer_provider = answer_with_cognee
    else:
        answer_provider = answer_without_cognee

    avg_score = asyncio.run(eval_on_hotpotQA(answer_provider, args.num_samples, metric))

    print(f"Average {args.metric}: {avg_score}")

poetry.lock generated (2282 changed lines; file diff suppressed because it is too large)

pyproject.toml

@@ -70,6 +70,7 @@ asyncpg = "0.30.0"
 pgvector = "^0.3.5"
 psycopg2 = {version = "^2.9.10", optional = true}
 llama-index-core = {version = "^0.11.22", optional = true}
+deepeval = {version = "^2.0.1", optional = true}

 [tool.poetry.extras]
 filesystem = ["s3fs", "botocore"]
@@ -80,6 +81,8 @@ neo4j = ["neo4j"]
 postgres = ["psycopg2", "pgvector", "asyncpg"]
 notebook = ["ipykernel", "overrides", "ipywidgets", "jupyterlab", "jupyterlab_widgets", "jupyterlab-server", "jupyterlab-git"]
 llama-index = ["llama-index-core"]
+deepeval = ["deepeval"]

 [tool.poetry.group.dev.dependencies]
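Since deepeval is added as an optional dependency and exposed as an extra, installing it presumably follows the usual Poetry extras pattern, e.g. poetry install --extras deepeval (or, for a pip install of the published package, pip install "cognee[deepeval]").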