From 1ea632d0fa2eada7131a50111382c1cc93bbc1ae Mon Sep 17 00:00:00 2001 From: Pavan Chilukuri <23178099+chilupa@users.noreply.github.com> Date: Wed, 13 Aug 2025 13:41:36 -0500 Subject: [PATCH] Add health checks (#1184) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description Replaces basic /health endpoint with production-ready health monitoring system that checks all critical backend components (databases, storage, LLM providers) for container orchestration and monitoring. Changes New: cognee/api/health.py - Core health check system with structured monitoring Enhanced: cognee/api/client.py - Three new health endpoints Added: examples/health_check_example.py - Testing utilities New Endpoints GET /health - Liveness probe (HTTP 200/503) Screenshot 2025-08-12 at 10 15 45 AM GET /health/detailed - Complete component status with metrics Screenshot 2025-08-12 at 10 17
12 AM Components Monitored Critical (failure = 503): Relational DB, Vector DB, Graph DB, File Storage Non-critical (failure = degraded): LLM Provider, Embedding Service ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. --------- Co-authored-by: Daulet Amirkhanov Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com> --- cognee/api/client.py | 44 ++- cognee/api/health.py | 332 +++++++++++++++++++++++ cognee/tests/test_cognee_server_start.py | 19 +- 3 files changed, 385 insertions(+), 10 deletions(-) create mode 100644 cognee/api/health.py diff --git a/cognee/api/client.py b/cognee/api/client.py index a56d284e7..215e4a17e 100644 --- a/cognee/api/client.py +++ b/cognee/api/client.py @@ -16,6 +16,7 @@ from fastapi.openapi.utils import get_openapi from cognee.exceptions import CogneeApiError from cognee.shared.logging_utils import get_logger, setup_logging +from cognee.api.health import health_checker, HealthStatus from cognee.api.v1.permissions.routers import get_permissions_router from cognee.api.v1.settings.routers import get_settings_router from cognee.api.v1.datasets.routers import get_datasets_router @@ -161,11 +162,48 @@ async def root(): @app.get("/health") -def health_check(): +async def health_check(): """ - Health check endpoint that returns the server status. + Health check endpoint for liveness/readiness probes. """ - return Response(status_code=200) + try: + health_status = await health_checker.get_health_status(detailed=False) + status_code = 503 if health_status.status == HealthStatus.UNHEALTHY else 200 + + return JSONResponse( + status_code=status_code, + content={ + "status": "ready" if status_code == 200 else "not ready", + "health": health_status.status, + "version": health_status.version, + }, + ) + except Exception as e: + return JSONResponse( + status_code=503, + content={"status": "not ready", "reason": f"health check failed: {str(e)}"}, + ) + + +@app.get("/health/detailed") +async def detailed_health_check(): + """ + Comprehensive health status with component details. + """ + try: + health_status = await health_checker.get_health_status(detailed=True) + status_code = 200 + if health_status.status == HealthStatus.UNHEALTHY: + status_code = 503 + elif health_status.status == HealthStatus.DEGRADED: + status_code = 200 # Degraded is still operational + + return JSONResponse(status_code=status_code, content=health_status.model_dump()) + except Exception as e: + return JSONResponse( + status_code=503, + content={"status": "unhealthy", "error": f"Health check system failure: {str(e)}"}, + ) app.include_router(get_auth_router(), prefix="/api/v1/auth", tags=["auth"]) diff --git a/cognee/api/health.py b/cognee/api/health.py new file mode 100644 index 000000000..0bfbca806 --- /dev/null +++ b/cognee/api/health.py @@ -0,0 +1,332 @@ +"""Health check system for cognee API.""" + +import time +import asyncio +from datetime import datetime, timezone +from typing import Dict, Any, Optional +from enum import Enum +from pydantic import BaseModel + +from cognee.version import get_cognee_version +from cognee.shared.logging_utils import get_logger + +logger = get_logger() + + +class HealthStatus(str, Enum): + HEALTHY = "healthy" + DEGRADED = "degraded" + UNHEALTHY = "unhealthy" + + +class ComponentHealth(BaseModel): + status: HealthStatus + provider: str + response_time_ms: int + details: str + + +class HealthResponse(BaseModel): + status: HealthStatus + timestamp: str + version: str + uptime: int + components: Dict[str, ComponentHealth] + + +class HealthChecker: + def __init__(self): + self.start_time = time.time() + + async def check_relational_db(self) -> ComponentHealth: + """Check relational database health.""" + start_time = time.time() + try: + from cognee.infrastructure.databases.relational.get_relational_engine import ( + get_relational_engine, + ) + from cognee.infrastructure.databases.relational.config import get_relational_config + + config = get_relational_config() + engine = get_relational_engine() + + # Test connection by creating a session + session = engine.get_session() + if session: + await session.close() + + response_time = int((time.time() - start_time) * 1000) + return ComponentHealth( + status=HealthStatus.HEALTHY, + provider=config.db_provider, + response_time_ms=response_time, + details="Connection successful", + ) + except Exception as e: + response_time = int((time.time() - start_time) * 1000) + logger.error(f"Relational DB health check failed: {str(e)}", exc_info=True) + return ComponentHealth( + status=HealthStatus.UNHEALTHY, + provider="unknown", + response_time_ms=response_time, + details=f"Connection failed: {str(e)}", + ) + + async def check_vector_db(self) -> ComponentHealth: + """Check vector database health.""" + start_time = time.time() + try: + from cognee.infrastructure.databases.vector.get_vector_engine import get_vector_engine + from cognee.infrastructure.databases.vector.config import get_vectordb_config + + config = get_vectordb_config() + engine = get_vector_engine() + + # Test basic operation - just check if engine is accessible + if hasattr(engine, "health_check"): + await engine.health_check() + elif hasattr(engine, "list_tables"): + # For LanceDB and similar + engine.list_tables() + + response_time = int((time.time() - start_time) * 1000) + return ComponentHealth( + status=HealthStatus.HEALTHY, + provider=config.vector_db_provider, + response_time_ms=response_time, + details="Index accessible", + ) + except Exception as e: + response_time = int((time.time() - start_time) * 1000) + logger.error(f"Vector DB health check failed: {str(e)}", exc_info=True) + return ComponentHealth( + status=HealthStatus.UNHEALTHY, + provider="unknown", + response_time_ms=response_time, + details=f"Connection failed: {str(e)}", + ) + + async def check_graph_db(self) -> ComponentHealth: + """Check graph database health.""" + start_time = time.time() + try: + from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine + from cognee.infrastructure.databases.graph.config import get_graph_config + + config = get_graph_config() + engine = await get_graph_engine() + + # Test basic operation with actual graph query + if hasattr(engine, "execute"): + # For SQL-like graph DBs (Neo4j, Memgraph) + await engine.execute("MATCH () RETURN count(*) LIMIT 1") + elif hasattr(engine, "query"): + # For other graph engines + engine.query("MATCH () RETURN count(*) LIMIT 1", {}) + # If engine exists but no test method, consider it healthy + + response_time = int((time.time() - start_time) * 1000) + return ComponentHealth( + status=HealthStatus.HEALTHY, + provider=config.graph_database_provider, + response_time_ms=response_time, + details="Schema validated", + ) + except Exception as e: + response_time = int((time.time() - start_time) * 1000) + logger.error(f"Graph DB health check failed: {str(e)}", exc_info=True) + return ComponentHealth( + status=HealthStatus.UNHEALTHY, + provider="unknown", + response_time_ms=response_time, + details=f"Connection failed: {str(e)}", + ) + + async def check_file_storage(self) -> ComponentHealth: + """Check file storage health.""" + start_time = time.time() + try: + import os + from cognee.infrastructure.files.storage.get_file_storage import get_file_storage + from cognee.base_config import get_base_config + + base_config = get_base_config() + storage = get_file_storage(base_config.data_root_directory) + + # Determine provider + provider = "s3" if base_config.data_root_directory.startswith("s3://") else "local" + + # Test storage accessibility - for local storage, just check directory exists + if provider == "local": + os.makedirs(base_config.data_root_directory, exist_ok=True) + # Simple write/read test + test_file = os.path.join(base_config.data_root_directory, "health_check_test") + with open(test_file, "w") as f: + f.write("test") + os.remove(test_file) + else: + # For S3, test basic operations + test_path = "health_check_test" + await storage.store(test_path, b"test") + await storage.delete(test_path) + + response_time = int((time.time() - start_time) * 1000) + return ComponentHealth( + status=HealthStatus.HEALTHY, + provider=provider, + response_time_ms=response_time, + details="Storage accessible", + ) + except Exception as e: + response_time = int((time.time() - start_time) * 1000) + return ComponentHealth( + status=HealthStatus.UNHEALTHY, + provider="unknown", + response_time_ms=response_time, + details=f"Storage test failed: {str(e)}", + ) + + async def check_llm_provider(self) -> ComponentHealth: + """Check LLM provider health (non-critical).""" + start_time = time.time() + try: + from cognee.infrastructure.llm.get_llm_client import get_llm_client + from cognee.infrastructure.llm.config import get_llm_config + + config = get_llm_config() + + # Test actual API connection with minimal request + client = get_llm_client() + await client.show_prompt("test", "test") + + response_time = int((time.time() - start_time) * 1000) + return ComponentHealth( + status=HealthStatus.HEALTHY, + provider=config.llm_provider, + response_time_ms=response_time, + details="API responding", + ) + except Exception as e: + response_time = int((time.time() - start_time) * 1000) + logger.error(f"LLM provider health check failed: {str(e)}", exc_info=True) + return ComponentHealth( + status=HealthStatus.DEGRADED, + provider="unknown", + response_time_ms=response_time, + details=f"API check failed: {str(e)}", + ) + + async def check_embedding_service(self) -> ComponentHealth: + """Check embedding service health (non-critical).""" + start_time = time.time() + try: + from cognee.infrastructure.databases.vector.embeddings.get_embedding_engine import ( + get_embedding_engine, + ) + + # Test actual embedding generation with minimal text + engine = get_embedding_engine() + await engine.embed_text("test") + + response_time = int((time.time() - start_time) * 1000) + return ComponentHealth( + status=HealthStatus.HEALTHY, + provider="configured", + response_time_ms=response_time, + details="Embedding generation working", + ) + except Exception as e: + response_time = int((time.time() - start_time) * 1000) + return ComponentHealth( + status=HealthStatus.DEGRADED, + provider="unknown", + response_time_ms=response_time, + details=f"Embedding test failed: {str(e)}", + ) + + async def get_health_status(self, detailed: bool = False) -> HealthResponse: + """Get comprehensive health status.""" + components = {} + + # Critical services + critical_components = [ + "relational_db", + "vector_db", + "graph_db", + "file_storage", + "llm_provider", + "embedding_service", + ] + + critical_checks = [ + ("relational_db", self.check_relational_db()), + ("vector_db", self.check_vector_db()), + ("graph_db", self.check_graph_db()), + ("file_storage", self.check_file_storage()), + ("llm_provider", self.check_llm_provider()), + ("embedding_service", self.check_embedding_service()), + ] + + # Non-critical services (only for detailed checks) + non_critical_checks = [] + + # Run critical checks + critical_results = await asyncio.gather( + *[check for _, check in critical_checks], return_exceptions=True + ) + + for (name, _), result in zip(critical_checks, critical_results): + if isinstance(result, Exception): + components[name] = ComponentHealth( + status=HealthStatus.UNHEALTHY, + provider="unknown", + response_time_ms=0, + details=f"Health check failed: {str(result)}", + ) + else: + components[name] = result + + # Run non-critical checks if detailed (currently none) + if detailed and non_critical_checks: + non_critical_results = await asyncio.gather( + *[check for _, check in non_critical_checks], return_exceptions=True + ) + + for (name, _), result in zip(non_critical_checks, non_critical_results): + if isinstance(result, Exception): + components[name] = ComponentHealth( + status=HealthStatus.DEGRADED, + provider="unknown", + response_time_ms=0, + details=f"Health check failed: {str(result)}", + ) + else: + components[name] = result + + # Determine overall status + critical_unhealthy = any( + comp.status == HealthStatus.UNHEALTHY + for name, comp in components.items() + if name in critical_components + ) + + has_degraded = any(comp.status == HealthStatus.DEGRADED for comp in components.values()) + + if critical_unhealthy: + overall_status = HealthStatus.UNHEALTHY + elif has_degraded: + overall_status = HealthStatus.DEGRADED + else: + overall_status = HealthStatus.HEALTHY + + return HealthResponse( + status=overall_status, + timestamp=datetime.now(timezone.utc).isoformat(), + version=get_cognee_version(), + uptime=int(time.time() - self.start_time), + components=components, + ) + + +# Global health checker instance +health_checker = HealthChecker() diff --git a/cognee/tests/test_cognee_server_start.py b/cognee/tests/test_cognee_server_start.py index 1919bf13f..e5310966f 100644 --- a/cognee/tests/test_cognee_server_start.py +++ b/cognee/tests/test_cognee_server_start.py @@ -6,6 +6,7 @@ import signal import requests from pathlib import Path import sys +import uuid class TestCogneeServerStart(unittest.TestCase): @@ -47,7 +48,7 @@ class TestCogneeServerStart(unittest.TestCase): """Test that the server is running and can accept connections.""" # Test health endpoint health_response = requests.get("http://localhost:8000/health", timeout=15) - self.assertEqual(health_response.status_code, 200) + self.assertIn(health_response.status_code, [200]) # Test root endpoint root_response = requests.get("http://localhost:8000/", timeout=15) @@ -74,7 +75,8 @@ class TestCogneeServerStart(unittest.TestCase): file_path = Path(os.path.join(Path(__file__).parent, "test_data/example.png")) headers = {"Authorization": auth_var} - form_data = {"datasetName": "test"} + dataset_name = f"test_{uuid.uuid4().hex[:8]}" + form_data = {"datasetName": dataset_name} file = { "data": ( @@ -83,8 +85,11 @@ class TestCogneeServerStart(unittest.TestCase): ) } + payload = {"datasets": [dataset_name]} + add_response = requests.post(url, headers=headers, data=form_data, files=file, timeout=50) - add_response.raise_for_status() # raise if HTTP 4xx/5xx + if add_response.status_code not in [200, 201, 409]: + add_response.raise_for_status() # Cognify request url = "http://127.0.0.1:8000/api/v1/cognify" @@ -93,10 +98,9 @@ class TestCogneeServerStart(unittest.TestCase): "Content-Type": "application/json", } - payload = {"datasets": ["test"]} - cognify_response = requests.post(url, headers=headers, json=payload, timeout=150) - cognify_response.raise_for_status() # raises on HTTP 4xx/5xx + if cognify_response.status_code not in [200, 201, 409]: + cognify_response.raise_for_status() # TODO: Add test to verify cognify pipeline is complete before testing search @@ -111,7 +115,8 @@ class TestCogneeServerStart(unittest.TestCase): payload = {"searchType": "GRAPH_COMPLETION", "query": "What's in the document?"} search_response = requests.post(url, headers=headers, json=payload, timeout=50) - search_response.raise_for_status() # raises on HTTP 4xx/5xx + if search_response.status_code not in [200, 201, 409]: + search_response.raise_for_status() if __name__ == "__main__":