cognee/evals/eval_framework/evaluation/metrics/f1.py
hajdul88 6a0c0e3ef8
feat: Cognee evaluation framework development (#498)
<!-- .github/pull_request_template.md -->

This PR adds the evaluation framework for cognee.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Expanded evaluation framework now integrates asynchronous corpus
building, question answering, and performance evaluation with adaptive
benchmarks for improved metrics (correctness, exact match, and F1
score).

- **Infrastructure**
- Added database integration for persistent storage of questions,
answers, and metrics.
- Launched an interactive metrics dashboard featuring advanced
visualizations.
- Introduced an automated testing workflow for continuous quality
assurance.

- **Documentation**
  - Updated guidelines for generating concise, clear answers.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-02-11 16:31:54 +01:00

46 lines
1.6 KiB
Python

from collections import Counter
from deepeval.test_case import LLMTestCase
import re
from typing import Optional, Any
class F1ScoreMetric:
    """Token-level F1 score between an actual and an expected answer.

    Both texts are lower-cased, split on whitespace and stripped of
    non-word characters, then compared as bags (multisets) of tokens.
    After :meth:`measure` runs, the numeric result is stored on ``score``
    and a human-readable summary on ``reason``.
    """

    # Compiled once and shared by every call: matches runs of characters
    # that are not letters, digits or underscore.
    _NON_WORD = re.compile(r"\W+")

    def __init__(self) -> None:
        # Both attributes stay None until measure() has been called.
        self.score: Optional[float] = None
        self.reason: Optional[str] = None

    @classmethod
    def _tokenize(cls, text: Optional[str]) -> list:
        """Return the normalized, non-empty tokens of *text*.

        ``None`` (or any other falsy value) is treated as an empty string.
        Tokens that consist solely of non-word characters are dropped.
        """
        pieces = (text or "").lower().split()
        stripped = (cls._NON_WORD.sub("", piece) for piece in pieces)
        return [token for token in stripped if token]

    def measure(self, test_case: "LLMTestCase") -> float:
        """Compute the F1 score for *test_case* and record it on the metric.

        Args:
            test_case: Object exposing ``actual_output`` and
                ``expected_output`` attributes (strings or ``None``).

        Returns:
            The F1 score in ``[0.0, 1.0]``. Two empty answers count as a
            perfect match (``1.0``); if only one side is empty the score
            is ``0.0``.
        """
        actual_tokens = self._tokenize(test_case.actual_output)
        expected_tokens = self._tokenize(test_case.expected_output)

        if not actual_tokens and not expected_tokens:
            self.score = 1.0
            self.reason = "Both actual and expected are empty"
            return self.score

        # Counter intersection keeps, per token, the minimum of the two
        # counts, so its total is the number of true positives.
        overlap = Counter(actual_tokens) & Counter(expected_tokens)
        tp = sum(overlap.values())

        # tp + fp is the number of predicted tokens and tp + fn the number
        # of reference tokens, so the list lengths serve as denominators.
        precision = tp / len(actual_tokens) if actual_tokens else 0.0
        recall = tp / len(expected_tokens) if expected_tokens else 0.0

        if precision + recall > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0

        self.score = f1
        self.reason = f"F1: {f1:.2f} (Precision: {precision:.2f}, Recall: {recall:.2f})"
        return self.score