Remove non-critical services

This commit is contained in:
Pavan Chilukuri 2025-08-09 00:24:16 -05:00
parent f304d476d9
commit b2d83c47cd
2 changed files with 42 additions and 46 deletions

View file

@ -164,30 +164,20 @@ async def root():
@app.get("/health") @app.get("/health")
async def health_check(): async def health_check():
""" """
Basic health check endpoint for liveness probe. Health check endpoint for liveness/readiness probes.
""" """
try: try:
health_status = await health_checker.get_health_status(detailed=False) health_status = await health_checker.get_health_status(detailed=False)
if health_status.status == HealthStatus.UNHEALTHY: status_code = 503 if health_status.status == HealthStatus.UNHEALTHY else 200
return Response(status_code=503)
return Response(status_code=200)
except Exception:
return Response(status_code=503)
return JSONResponse(
@app.get("/health/ready") status_code=status_code,
async def readiness_check(): content={
""" "status": "ready" if status_code == 200 else "not ready",
Readiness probe for Kubernetes deployments. "health": health_status.status,
""" "version": health_status.version,
try: },
health_status = await health_checker.get_health_status(detailed=False) )
if health_status.status == HealthStatus.UNHEALTHY:
return JSONResponse(
status_code=503,
content={"status": "not ready", "reason": "critical services unhealthy"},
)
return JSONResponse(status_code=200, content={"status": "ready"})
except Exception as e: except Exception as e:
return JSONResponse( return JSONResponse(
status_code=503, status_code=503,

View file

@ -114,12 +114,14 @@ class HealthChecker:
config = get_graph_config() config = get_graph_config()
engine = await get_graph_engine() engine = await get_graph_engine()
# Test basic operation - just check if engine is accessible # Test basic operation with actual graph query
if hasattr(engine, "health_check"): if hasattr(engine, "execute"):
await engine.health_check() # For SQL-like graph DBs (Neo4j, Memgraph)
await engine.execute("MATCH () RETURN count(*) LIMIT 1")
elif hasattr(engine, "get_nodes"): elif hasattr(engine, "get_nodes"):
# Basic connectivity test # For other graph engines - try to get nodes
pass list(engine.get_nodes(limit=1))
# If engine exists but no test method, consider it healthy
response_time = int((time.time() - start_time) * 1000) response_time = int((time.time() - start_time) * 1000)
return ComponentHealth( return ComponentHealth(
@ -190,20 +192,16 @@ class HealthChecker:
config = get_llm_config() config = get_llm_config()
# Simple configuration check - don't actually call the API # Test actual API connection with minimal request
if config.llm_api_key or config.llm_provider == "ollama": client = get_llm_client()
status = HealthStatus.HEALTHY await client.acomplete("test", max_tokens=1)
details = "Configuration valid"
else:
status = HealthStatus.DEGRADED
details = "No API key configured"
response_time = int((time.time() - start_time) * 1000) response_time = int((time.time() - start_time) * 1000)
return ComponentHealth( return ComponentHealth(
status=status, status=HealthStatus.HEALTHY,
provider=config.llm_provider, provider=config.llm_provider,
response_time_ms=response_time, response_time_ms=response_time,
details=details, details="API responding",
) )
except Exception as e: except Exception as e:
response_time = int((time.time() - start_time) * 1000) response_time = int((time.time() - start_time) * 1000)
@ -211,7 +209,7 @@ class HealthChecker:
status=HealthStatus.DEGRADED, status=HealthStatus.DEGRADED,
provider="unknown", provider="unknown",
response_time_ms=response_time, response_time_ms=response_time,
details=f"Config check failed: {str(e)}", details=f"API check failed: {str(e)}",
) )
async def check_embedding_service(self) -> ComponentHealth: async def check_embedding_service(self) -> ComponentHealth:
@ -222,15 +220,16 @@ class HealthChecker:
get_embedding_engine, get_embedding_engine,
) )
# Just check if we can get the engine without calling it # Test actual embedding generation with minimal text
get_embedding_engine() engine = get_embedding_engine()
await engine.embed_text("test")
response_time = int((time.time() - start_time) * 1000) response_time = int((time.time() - start_time) * 1000)
return ComponentHealth( return ComponentHealth(
status=HealthStatus.HEALTHY, status=HealthStatus.HEALTHY,
provider="configured", provider="configured",
response_time_ms=response_time, response_time_ms=response_time,
details="Embedding engine accessible", details="Embedding generation working",
) )
except Exception as e: except Exception as e:
response_time = int((time.time() - start_time) * 1000) response_time = int((time.time() - start_time) * 1000)
@ -238,7 +237,7 @@ class HealthChecker:
status=HealthStatus.DEGRADED, status=HealthStatus.DEGRADED,
provider="unknown", provider="unknown",
response_time_ms=response_time, response_time_ms=response_time,
details=f"Embedding engine failed: {str(e)}", details=f"Embedding test failed: {str(e)}",
) )
async def get_health_status(self, detailed: bool = False) -> HealthResponse: async def get_health_status(self, detailed: bool = False) -> HealthResponse:
@ -251,14 +250,13 @@ class HealthChecker:
("vector_db", self.check_vector_db()), ("vector_db", self.check_vector_db()),
("graph_db", self.check_graph_db()), ("graph_db", self.check_graph_db()),
("file_storage", self.check_file_storage()), ("file_storage", self.check_file_storage()),
]
# Non-critical services (only for detailed checks)
non_critical_checks = [
("llm_provider", self.check_llm_provider()), ("llm_provider", self.check_llm_provider()),
("embedding_service", self.check_embedding_service()), ("embedding_service", self.check_embedding_service()),
] ]
# Non-critical services (only for detailed checks)
non_critical_checks = []
# Run critical checks # Run critical checks
critical_results = await asyncio.gather( critical_results = await asyncio.gather(
*[check for _, check in critical_checks], return_exceptions=True *[check for _, check in critical_checks], return_exceptions=True
@ -275,8 +273,8 @@ class HealthChecker:
else: else:
components[name] = result components[name] = result
# Run non-critical checks if detailed # Run non-critical checks if detailed (currently none)
if detailed: if detailed and non_critical_checks:
non_critical_results = await asyncio.gather( non_critical_results = await asyncio.gather(
*[check for _, check in non_critical_checks], return_exceptions=True *[check for _, check in non_critical_checks], return_exceptions=True
) )
@ -296,7 +294,15 @@ class HealthChecker:
critical_unhealthy = any( critical_unhealthy = any(
comp.status == HealthStatus.UNHEALTHY comp.status == HealthStatus.UNHEALTHY
for name, comp in components.items() for name, comp in components.items()
if name in ["relational_db", "vector_db", "graph_db", "file_storage"] if name
in [
"relational_db",
"vector_db",
"graph_db",
"file_storage",
"llm_provider",
"embedding_service",
]
) )
has_degraded = any(comp.status == HealthStatus.DEGRADED for comp in components.values()) has_degraded = any(comp.status == HealthStatus.DEGRADED for comp in components.values())