feat: Adds modal parallel evaluation for retriever development (#844)
## Description

Adds modal parallel evaluation for retriever development.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
Parent: f8f78773dd
Commit: 5c36a5dd8a

4 changed files with 63 additions and 37 deletions
Docker image definition:

```
@@ -18,15 +18,12 @@ RUN apt-get update && apt-get install -y \
WORKDIR /app
ENV PYTHONPATH=/app
WORKDIR /app
COPY pyproject.toml poetry.lock /app/
COPY pyproject.toml poetry.lock README.md /app/
RUN pip install poetry
RUN poetry install --all-extras --no-root --without dev
RUN poetry config virtualenvs.create false
RUN poetry install --extras neo4j --extras qdrant --no-root
COPY cognee/ /app/cognee
COPY README.md /app/README.md
```
Evaluation configuration (`EvalConfig`):

```
@@ -1,6 +1,6 @@
from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing import List
from typing import List, Optional


class EvalConfig(BaseSettings):
@@ -43,6 +43,7 @@ class EvalConfig(BaseSettings):
    dashboard_path: str = "dashboard.html"
    direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
    direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
    instance_filter: Optional[List[str]] = None

    model_config = SettingsConfigDict(env_file=".env", extra="allow")
@@ -68,6 +69,7 @@ class EvalConfig(BaseSettings):
            "task_getter_type": self.task_getter_type,
            "direct_llm_system_prompt": self.direct_llm_system_prompt,
            "direct_llm_eval_prompt": self.direct_llm_eval_prompt,
            "instance_filter": self.instance_filter,
        }
```
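The new `instance_filter` field lets a run restrict corpus building to specific benchmark instances; it is forwarded to `run_corpus_builder` in the Modal script further below. A minimal usage sketch, not part of this PR: the instance IDs are placeholders, and the env-var spelling assumes pydantic-settings' default field-name mapping.

```python
# Hypothetical usage sketch (not part of this PR); the IDs below are placeholders.
from cognee.eval_framework.eval_config import EvalConfig

# Set programmatically:
config = EvalConfig(instance_filter=["instance_id_1", "instance_id_2"])
print(config.to_dict()["instance_filter"])  # ['instance_id_1', 'instance_id_2']

# Or via the environment / .env file, where pydantic-settings parses
# list-valued fields as JSON (INSTANCE_FILTER is the assumed default name):
#   INSTANCE_FILTER='["instance_id_1", "instance_id_2"]'
```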
Metrics dashboard (`create_dashboard`):

```
@@ -167,4 +167,4 @@ def create_dashboard(
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(dashboard_html)

    return output_file
    return dashboard_html
```
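With this change `create_dashboard` still writes `output_file` but now returns the rendered HTML string rather than the path, which is what lets the Modal script below copy the dashboard into its shared volume. A small caller sketch under that assumption; paths and benchmark name are placeholders.

```python
# Hypothetical caller (not part of this PR); paths and benchmark are placeholders.
from cognee.eval_framework.metrics_dashboard import create_dashboard

html = create_dashboard(
    metrics_path="metrics.json",
    aggregate_metrics_path="aggregate_metrics.json",
    output_file="dashboard.html",
    benchmark="HotPotQA",
)

# The returned string can be persisted anywhere else as well.
with open("dashboard_copy.html", "w", encoding="utf-8") as f:
    f.write(html)
```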
Modal evaluation script:

```
@@ -1,8 +1,9 @@
import modal
import os
import json
import asyncio
import datetime
import hashlib
import json
from cognee.shared.logging_utils import get_logger
from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
@@ -10,8 +11,10 @@ from cognee.eval_framework.answer_generation.run_question_answering_module import (
    run_question_answering,
)
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import create_dashboard

logger = get_logger()
vol = modal.Volume.from_name("evaluation_dashboard_results", create_if_missing=True)


def read_and_combine_metrics(eval_params: dict) -> dict:
```
```
@@ -46,32 +49,54 @@ image = (
            "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        }
    )
    .poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
    .pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)


@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1)
@app.function(image=image, concurrency_limit=10, timeout=86400, volumes={"/data": vol})
async def modal_run_eval(eval_params=None):
    """Runs evaluation pipeline and returns combined metrics results."""
    if eval_params is None:
        eval_params = EvalConfig().to_dict()

    version_name = "baseline"
    benchmark_name = os.environ.get("BENCHMARK", eval_params.get("benchmark", "benchmark"))
    timestamp = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")

    answers_filename = (
        f"{version_name}_{benchmark_name}_{timestamp}_{eval_params.get('answers_path')}"
    )
    html_filename = (
        f"{version_name}_{benchmark_name}_{timestamp}_{eval_params.get('dashboard_path')}"
    )

    logger.info(f"Running evaluation with params: {eval_params}")

    # Run the evaluation pipeline
    await run_corpus_builder(eval_params)
    await run_corpus_builder(eval_params, instance_filter=eval_params.get("instance_filter"))
    await run_question_answering(eval_params)
    await run_evaluation(eval_params)
    answers = await run_evaluation(eval_params)

    # Early return if metrics calculation wasn't requested
    if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
        logger.info(
            "Skipping metrics collection as either evaluating_answers or calculate_metrics is False"
    with open("/data/" + answers_filename, "w") as f:
        json.dump(answers, f, ensure_ascii=False, indent=4)
    vol.commit()

    if eval_params.get("dashboard"):
        logger.info("Generating dashboard...")
        html_output = create_dashboard(
            metrics_path=eval_params["metrics_path"],
            aggregate_metrics_path=eval_params["aggregate_metrics_path"],
            output_file=eval_params["dashboard_path"],
            benchmark=eval_params["benchmark"],
        )
        return None

    return read_and_combine_metrics(eval_params)
        with open("/data/" + html_filename, "w") as f:
            f.write(html_output)
        vol.commit()

    logger.info("Evaluation set finished...")

    return True


@app.local_entrypoint()
```
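Each run now persists its answers JSON and, when enabled, the dashboard HTML under `/data`, backed by the named volume `evaluation_dashboard_results`, instead of returning metrics to the caller. Below is a hypothetical helper for listing what the parallel runs produced; the app name is an assumption, and it relies only on constructs already used above (`modal.Volume.from_name`, `volumes={"/data": vol}`). The files can presumably also be pulled locally with Modal's `modal volume get` CLI.

```python
# Hypothetical inspection helper (not part of this PR).
import os

import modal

app = modal.App("evaluation-results-inspector")  # assumed app name
vol = modal.Volume.from_name("evaluation_dashboard_results", create_if_missing=True)


@app.function(volumes={"/data": vol})
def list_results() -> list[str]:
    # Each run writes "<version>_<benchmark>_<timestamp>_<answers_path>" and,
    # when dashboards are enabled, a matching "*_<dashboard_path>" HTML file.
    return sorted(os.listdir("/data"))


@app.local_entrypoint()
def main():
    for name in list_results.remote():
        print(name)
```

The final hunk below updates `main()` to fan the benchmark runs out in parallel.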
```
@@ -80,37 +105,39 @@ async def main():
    configs = [
        EvalConfig(
            task_getter_type="Default",
            number_of_samples_in_corpus=2,
            number_of_samples_in_corpus=10,
            benchmark="HotPotQA",
            qa_engine="cognee_graph_completion",
            building_corpus_from_scratch=True,
            answering_questions=True,
            evaluating_answers=True,
            calculate_metrics=True,
            dashboard=False,
            dashboard=True,
        ),
        EvalConfig(
            task_getter_type="Default",
            number_of_samples_in_corpus=10,
            benchmark="TwoWikiMultiHop",
            qa_engine="cognee_graph_completion",
            building_corpus_from_scratch=True,
            answering_questions=True,
            evaluating_answers=True,
            calculate_metrics=True,
            dashboard=False,
            dashboard=True,
        ),
        EvalConfig(
            task_getter_type="Default",
            number_of_samples_in_corpus=10,
            benchmark="Musique",
            qa_engine="cognee_graph_completion",
            building_corpus_from_scratch=True,
            answering_questions=True,
            evaluating_answers=True,
            calculate_metrics=True,
            dashboard=True,
        ),
    ]

    # Run evaluations in parallel with different configurations
    modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs]
    results = await asyncio.gather(*modal_tasks)

    # Filter out None results and save combined results
    results = [r for r in results if r is not None]
    if results:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"combined_results_{timestamp}.json"

        with open(output_file, "w") as f:
            json.dump(results, f, indent=2)

        logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}")
    else:
        logger.info("No metrics were collected from any of the evaluation runs")
    await asyncio.gather(*modal_tasks)
```
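The updated `main()` schedules one `modal_run_eval` call per benchmark config with `.remote.aio(...)` and awaits them together, so the three evaluations run in separate containers in parallel (bounded by `concurrency_limit=10`). A minimal sketch of that fan-out pattern with a toy function; everything in it is illustrative and not part of the PR.

```python
# Toy illustration of the .remote.aio() + asyncio.gather fan-out used above.
import asyncio

import modal

app = modal.App("fanout-sketch")  # illustrative app name


@app.function()
def square(x: int) -> int:
    return x * x


@app.local_entrypoint()
async def main():
    # Each .remote.aio(...) call runs in its own container; gather awaits all of
    # them, mirroring how the three EvalConfig runs are executed in parallel.
    results = await asyncio.gather(*[square.remote.aio(i) for i in range(3)])
    print(results)  # [0, 1, 4]
```

The entrypoint itself would typically be launched with `modal run <path-to-the-eval-script>`; the script's path is not shown in the diff.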