Remove non-critical services

This commit is contained in:
Pavan Chilukuri 2025-08-09 00:24:16 -05:00
parent f304d476d9
commit b2d83c47cd
2 changed files with 42 additions and 46 deletions

View file

@ -164,30 +164,20 @@ async def root():
@app.get("/health")
async def health_check():
"""
Basic health check endpoint for liveness probe.
Health check endpoint for liveness/readiness probes.
"""
try:
health_status = await health_checker.get_health_status(detailed=False)
if health_status.status == HealthStatus.UNHEALTHY:
return Response(status_code=503)
return Response(status_code=200)
except Exception:
return Response(status_code=503)
status_code = 503 if health_status.status == HealthStatus.UNHEALTHY else 200
@app.get("/health/ready")
async def readiness_check():
"""
Readiness probe for Kubernetes deployments.
"""
try:
health_status = await health_checker.get_health_status(detailed=False)
if health_status.status == HealthStatus.UNHEALTHY:
return JSONResponse(
status_code=503,
content={"status": "not ready", "reason": "critical services unhealthy"},
)
return JSONResponse(status_code=200, content={"status": "ready"})
return JSONResponse(
status_code=status_code,
content={
"status": "ready" if status_code == 200 else "not ready",
"health": health_status.status,
"version": health_status.version,
},
)
except Exception as e:
return JSONResponse(
status_code=503,

View file

@ -114,12 +114,14 @@ class HealthChecker:
config = get_graph_config()
engine = await get_graph_engine()
# Test basic operation - just check if engine is accessible
if hasattr(engine, "health_check"):
await engine.health_check()
# Test basic operation with actual graph query
if hasattr(engine, "execute"):
# For Cypher-based graph DBs (Neo4j, Memgraph)
await engine.execute("MATCH () RETURN count(*) LIMIT 1")
elif hasattr(engine, "get_nodes"):
# Basic connectivity test
pass
# For other graph engines - try to get nodes
list(engine.get_nodes(limit=1))
# If engine exists but no test method, consider it healthy
response_time = int((time.time() - start_time) * 1000)
return ComponentHealth(
@ -190,20 +192,16 @@ class HealthChecker:
config = get_llm_config()
# Simple configuration check - don't actually call the API
if config.llm_api_key or config.llm_provider == "ollama":
status = HealthStatus.HEALTHY
details = "Configuration valid"
else:
status = HealthStatus.DEGRADED
details = "No API key configured"
# Test actual API connection with minimal request
client = get_llm_client()
await client.acomplete("test", max_tokens=1)
response_time = int((time.time() - start_time) * 1000)
return ComponentHealth(
status=status,
status=HealthStatus.HEALTHY,
provider=config.llm_provider,
response_time_ms=response_time,
details=details,
details="API responding",
)
except Exception as e:
response_time = int((time.time() - start_time) * 1000)
@ -211,7 +209,7 @@ class HealthChecker:
status=HealthStatus.DEGRADED,
provider="unknown",
response_time_ms=response_time,
details=f"Config check failed: {str(e)}",
details=f"API check failed: {str(e)}",
)
async def check_embedding_service(self) -> ComponentHealth:
@ -222,15 +220,16 @@ class HealthChecker:
get_embedding_engine,
)
# Just check if we can get the engine without calling it
get_embedding_engine()
# Test actual embedding generation with minimal text
engine = get_embedding_engine()
await engine.embed_text("test")
response_time = int((time.time() - start_time) * 1000)
return ComponentHealth(
status=HealthStatus.HEALTHY,
provider="configured",
response_time_ms=response_time,
details="Embedding engine accessible",
details="Embedding generation working",
)
except Exception as e:
response_time = int((time.time() - start_time) * 1000)
@ -238,7 +237,7 @@ class HealthChecker:
status=HealthStatus.DEGRADED,
provider="unknown",
response_time_ms=response_time,
details=f"Embedding engine failed: {str(e)}",
details=f"Embedding test failed: {str(e)}",
)
async def get_health_status(self, detailed: bool = False) -> HealthResponse:
@ -251,14 +250,13 @@ class HealthChecker:
("vector_db", self.check_vector_db()),
("graph_db", self.check_graph_db()),
("file_storage", self.check_file_storage()),
]
# Non-critical services (only for detailed checks)
non_critical_checks = [
("llm_provider", self.check_llm_provider()),
("embedding_service", self.check_embedding_service()),
]
# Non-critical services (only for detailed checks)
non_critical_checks = []
# Run critical checks
critical_results = await asyncio.gather(
*[check for _, check in critical_checks], return_exceptions=True
@ -275,8 +273,8 @@ class HealthChecker:
else:
components[name] = result
# Run non-critical checks if detailed
if detailed:
# Run non-critical checks if detailed (currently none)
if detailed and non_critical_checks:
non_critical_results = await asyncio.gather(
*[check for _, check in non_critical_checks], return_exceptions=True
)
@ -296,7 +294,15 @@ class HealthChecker:
critical_unhealthy = any(
comp.status == HealthStatus.UNHEALTHY
for name, comp in components.items()
if name in ["relational_db", "vector_db", "graph_db", "file_storage"]
if name
in [
"relational_db",
"vector_db",
"graph_db",
"file_storage",
"llm_provider",
"embedding_service",
]
)
has_degraded = any(comp.status == HealthStatus.DEGRADED for comp in components.values())