LightRAG/lightrag/metrics.py
clssck 59e89772de refactor: consolidate to PostgreSQL-only backend and modernize stack
Remove legacy storage implementations and deprecated examples:
- Delete FAISS, JSON, Memgraph, Milvus, MongoDB, Nano Vector DB, Neo4j, NetworkX, Qdrant, Redis storage backends
- Remove Kubernetes deployment manifests and installation scripts
- Delete unofficial examples for deprecated backends and offline deployment docs
Streamline core infrastructure:
- Consolidate storage layer to PostgreSQL-only implementation
- Add full-text search caching with FTS cache module
- Implement metrics collection and monitoring pipeline
- Add explain and metrics API routes
Modernize frontend and tooling:
- Switch web UI to Bun with bun.lock, remove npm and pnpm lockfiles
- Update Dockerfile for PostgreSQL-only deployment
- Add Makefile for common development tasks
- Update environment and configuration examples
Enhance evaluation and testing capabilities:
- Add prompt optimization with DSPy and auto-tuning
- Implement ground truth regeneration and variant testing
- Add prompt debugging and response comparison utilities
- Expand test coverage with new integration scenarios
Simplify dependencies and configuration:
- Remove offline-specific requirement files
- Update pyproject.toml with streamlined dependencies
- Add Python version pinning with .python-version
- Create project guidelines in CLAUDE.md and AGENTS.md
2025-12-12 16:28:49 +01:00

282 lines
8.4 KiB
Python

"""
Metrics collection and aggregation for LightRAG.
This module provides lightweight metrics collection without heavy computation
on each request. Metrics are collected incrementally and aggregated on demand.
Key features:
- Circular buffer for query history (bounded memory)
- Percentile calculations (p50, p95, p99)
- Cache hit rate tracking
- Thread-safe async operations
Environment Variables:
ENABLE_METRICS: Enable/disable metrics collection (default: true)
METRICS_HISTORY_SIZE: Number of queries to keep in buffer (default: 1000)
"""
from __future__ import annotations
import asyncio
import os
import time
from collections import deque
from dataclasses import dataclass, field
from typing import Any
from lightrag.constants import (
DEFAULT_METRICS_ENABLED,
DEFAULT_METRICS_HISTORY_SIZE,
DEFAULT_METRICS_WINDOW_SECONDS,
)
# Configuration — read once at import time; changing the environment
# afterwards has no effect on an already-imported module.
METRICS_ENABLED = os.getenv('ENABLE_METRICS', str(DEFAULT_METRICS_ENABLED)).lower() == 'true'
METRICS_HISTORY_SIZE = int(os.getenv('METRICS_HISTORY_SIZE', str(DEFAULT_METRICS_HISTORY_SIZE)))
# Global metrics instance (singleton pattern).
# NOTE(review): the lock is created at import time, outside any running event
# loop — fine on modern Python (3.10+) where asyncio.Lock no longer binds a
# loop at construction; confirm the project's minimum supported version.
_metrics_instance: MetricsCollector | None = None
_metrics_lock = asyncio.Lock()
@dataclass
class QueryMetric:
    """Single query metric entry.

    One immutable-by-convention record per completed query, stored in the
    MetricsCollector circular buffer and aggregated on demand.
    """

    timestamp: float  # wall-clock time the query finished (time.time())
    duration_ms: float  # end-to-end query latency in milliseconds
    mode: str  # query mode, e.g. local/global/hybrid/mix/naive
    cache_hit: bool  # whether the LLM cache served this query
    entities_count: int  # number of graph entities retrieved
    relations_count: int  # number of graph relations retrieved
    chunks_count: int  # number of text chunks retrieved
    tokens_used: int  # total tokens used in the assembled context
@dataclass
class MetricsCollector:
    """Collects and aggregates LightRAG metrics.

    Uses a circular buffer (``deque`` with ``maxlen``) to store recent query
    metrics, preventing unbounded memory growth. Aggregations are computed
    on demand in :meth:`compute_stats`.

    All ``record_*`` methods honor the module-level ``METRICS_ENABLED``
    toggle; when disabled they are no-ops.
    """

    # Circular buffer for recent queries, bounded by METRICS_HISTORY_SIZE.
    query_history: deque[QueryMetric] = field(
        default_factory=lambda: deque(maxlen=METRICS_HISTORY_SIZE)
    )
    # Counters (monotonically increasing, never reset).
    total_queries: int = 0
    total_llm_calls: int = 0
    total_llm_cache_hits: int = 0
    total_embed_calls: int = 0
    # Server start timestamp (seconds since the epoch).
    started_at: float = field(default_factory=time.time)

    def record_query(self, metric: QueryMetric) -> None:
        """Record a query metric.

        Also updates the LLM call/cache-hit counters from ``metric.cache_hit``,
        so callers should not additionally call :meth:`record_llm_call` for the
        same query's primary LLM call.

        Args:
            metric: QueryMetric instance with query details
        """
        if not METRICS_ENABLED:
            return
        self.query_history.append(metric)
        self.total_queries += 1
        if metric.cache_hit:
            self.total_llm_cache_hits += 1
        else:
            self.total_llm_calls += 1

    def record_llm_call(self, cache_hit: bool = False) -> None:
        """Record an LLM API call.

        Args:
            cache_hit: Whether this was a cache hit
        """
        # Fix: honor the global toggle, consistent with record_query();
        # previously these counters grew even with metrics disabled.
        if not METRICS_ENABLED:
            return
        if cache_hit:
            self.total_llm_cache_hits += 1
        else:
            self.total_llm_calls += 1

    def record_embed_call(self) -> None:
        """Record an embedding API call."""
        # Fix: honor the global toggle, consistent with record_query().
        if not METRICS_ENABLED:
            return
        self.total_embed_calls += 1

    def _get_percentile(self, values: list[float], percentile: float) -> float:
        """Calculate percentile from a list of values.

        Uses linear interpolation between nearest ranks.

        Args:
            values: List of numeric values (need not be sorted)
            percentile: Percentile to calculate (0-100)

        Returns:
            Calculated percentile value, or 0.0 if ``values`` is empty
        """
        if not values:
            return 0.0
        sorted_values = sorted(values)
        n = len(sorted_values)
        if n == 1:
            return sorted_values[0]
        # Fractional rank within the sorted sample.
        k = (n - 1) * percentile / 100
        f = int(k)
        # Clamp the ceiling rank so percentile=100 stays in bounds.
        c = min(f + 1, n - 1)
        # Linear interpolation between the two nearest ranks.
        return sorted_values[f] + (sorted_values[c] - sorted_values[f]) * (k - f)

    def compute_stats(
        self,
        window_seconds: float = DEFAULT_METRICS_WINDOW_SECONDS,
    ) -> dict[str, Any]:
        """Compute aggregated statistics.

        Args:
            window_seconds: Time window for percentile/mode calculations
                (default 1 hour)

        Returns:
            Dictionary with uptime, query counts, latency percentiles,
            cache statistics, embedding stats, and per-mode distribution.
            ``latency_percentiles`` is None when no queries fall in the window.
        """
        now = time.time()
        cutoff = now - window_seconds
        # Only queries inside the window contribute to percentiles/modes.
        recent = [q for q in self.query_history if q.timestamp >= cutoff]
        latencies = [q.duration_ms for q in recent]

        latency_percentiles = None
        if latencies:
            latency_percentiles = {
                'p50': round(self._get_percentile(latencies, 50), 2),
                'p95': round(self._get_percentile(latencies, 95), 2),
                'p99': round(self._get_percentile(latencies, 99), 2),
                'avg': round(sum(latencies) / len(latencies), 2),
                'min': round(min(latencies), 2),
                'max': round(max(latencies), 2),
            }

        # Cache hit rate over the lifetime counters (not just the window).
        total_llm_ops = self.total_llm_calls + self.total_llm_cache_hits
        cache_hit_rate = self.total_llm_cache_hits / total_llm_ops if total_llm_ops > 0 else 0.0

        # Query-mode distribution within the window.
        mode_counts: dict[str, int] = {}
        for q in recent:
            mode_counts[q.mode] = mode_counts.get(q.mode, 0) + 1

        return {
            'uptime_seconds': round(now - self.started_at, 2),
            'total_queries': self.total_queries,
            'queries_in_window': len(recent),
            'window_seconds': window_seconds,
            'latency_percentiles': latency_percentiles,
            'cache_stats': {
                'total_llm_calls': self.total_llm_calls,
                'total_cache_hits': self.total_llm_cache_hits,
                'hit_rate': round(cache_hit_rate, 4),
            },
            'embed_stats': {
                'total_calls': self.total_embed_calls,
            },
            'mode_distribution': mode_counts,
        }

    def get_recent_queries(self, limit: int = 10) -> list[dict[str, Any]]:
        """Get recent query metrics.

        Args:
            limit: Maximum number of recent queries to return

        Returns:
            List of recent query metrics as dictionaries, most recent first
        """
        recent = list(self.query_history)[-limit:]
        return [
            {
                'timestamp': q.timestamp,
                'duration_ms': q.duration_ms,
                'mode': q.mode,
                'cache_hit': q.cache_hit,
                'entities_count': q.entities_count,
                'relations_count': q.relations_count,
                'chunks_count': q.chunks_count,
                'tokens_used': q.tokens_used,
            }
            for q in reversed(recent)  # Most recent first
        ]
async def get_metrics_collector() -> MetricsCollector:
    """Return the process-wide MetricsCollector, creating it on first use.

    Access is serialized with an asyncio lock so that concurrent coroutines
    observe exactly one shared instance.

    Returns:
        The global MetricsCollector instance
    """
    global _metrics_instance
    async with _metrics_lock:
        collector = _metrics_instance
        if collector is None:
            collector = MetricsCollector()
            _metrics_instance = collector
        return collector
def get_metrics_collector_sync() -> MetricsCollector:
    """Return the process-wide MetricsCollector without awaiting (sync version).

    Note: This should only be used in contexts where async is not available.
    Prefer get_metrics_collector() for async code.

    Returns:
        The global MetricsCollector instance
    """
    global _metrics_instance
    collector = _metrics_instance
    if collector is None:
        collector = MetricsCollector()
        _metrics_instance = collector
    return collector
async def record_query_metric(
    duration_ms: float,
    mode: str,
    cache_hit: bool = False,
    entities_count: int = 0,
    relations_count: int = 0,
    chunks_count: int = 0,
    tokens_used: int = 0,
) -> None:
    """Convenience function to record a query metric.

    Builds a QueryMetric stamped with the current time and hands it to the
    global collector.

    Args:
        duration_ms: Query duration in milliseconds
        mode: Query mode (local, global, hybrid, mix, naive)
        cache_hit: Whether LLM cache was hit
        entities_count: Number of entities retrieved
        relations_count: Number of relations retrieved
        chunks_count: Number of chunks retrieved
        tokens_used: Total tokens used in context
    """
    metric = QueryMetric(
        timestamp=time.time(),
        duration_ms=duration_ms,
        mode=mode,
        cache_hit=cache_hit,
        entities_count=entities_count,
        relations_count=relations_count,
        chunks_count=chunks_count,
        tokens_used=tokens_used,
    )
    collector = await get_metrics_collector()
    collector.record_query(metric)