Coverage for excalidraw_mcp/monitoring/health_checker.py: 90%
115 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-16 08:08 -0700
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-16 08:08 -0700
1"""Advanced health checking with multi-level status and degradation detection."""
3import logging
4import time
5from dataclasses import dataclass
6from enum import Enum
7from typing import Any
9import psutil
11from ..config import config
12from ..http_client import http_client
14logger = logging.getLogger(__name__)
class HealthStatus(Enum):
    """Closed set of status levels a health check can report.

    Every member's value is the lowercase form of its name, so a member can
    be looked up from its string value (``HealthStatus("healthy")``).
    """

    # Declaration order is also the iteration order of the enum.
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"
    RECOVERING = "recovering"
    UNKNOWN = "unknown"
@dataclass
class HealthCheckResult:
    """Outcome of one health check.

    Constructor arguments, in order: ``status``, ``response_time_ms``,
    ``timestamp``, ``details`` and the optional ``error`` (``None`` unless
    supplied).
    """

    # Status level reported by the check.
    status: HealthStatus
    # Duration of the check, in milliseconds.
    response_time_ms: float
    # Timestamp recorded for the result.
    timestamp: float
    # Free-form, check-specific key/value data.
    details: dict[str, Any]
    # Error text when the check failed; None otherwise.
    error: str | None = None
class HealthChecker:
    """Advanced health checker with multi-endpoint validation and degradation detection.

    Besides running the checks, the instance tracks:

    * the number of consecutive results whose status was not ``HEALTHY``,
    * the timestamp of the most recent ``HEALTHY`` result, and
    * a rolling window of the most recent response times.
    """

    def __init__(self) -> None:
        """Initialise failure tracking, the response-time window and the endpoint table."""
        self._consecutive_failures = 0
        # Starts at construction time, i.e. before any check has actually run.
        self._last_healthy_time = time.time()
        self._response_times: list[float] = []
        self._max_response_history = 10

        # Health check endpoints to validate.
        # NOTE: the check methods below address these paths directly and do
        # not read this table; it is kept for backward compatibility.
        self._endpoints = [
            {
                "path": "/health",
                "timeout": config.monitoring.health_check_timeout_seconds,
            },
            {
                "path": "/api/elements",
                "timeout": config.monitoring.health_check_timeout_seconds * 1.5,
                "method": "GET",
            },
        ]

    async def check_health(self, force: bool = False) -> HealthCheckResult:
        """Perform comprehensive health check with multiple endpoints.

        The primary probe runs first. The API probe only runs when the
        primary probe reports ``HEALTHY`` or ``DEGRADED``; in that case the
        two results are merged, otherwise the primary result is used as is.

        Args:
            force: Accepted for interface compatibility. It is not consulted
                by this method; the primary probe always runs with
                ``force=True``.

        Returns:
            The resulting :class:`HealthCheckResult`. When resource
            monitoring is enabled in the configuration, its ``details``
            additionally carry a ``"resources"`` entry. Any ``Exception``
            raised while checking is converted into an ``UNHEALTHY`` result
            instead of propagating.
        """
        start_time = time.time()

        try:
            # Check basic health endpoint
            primary_result = await self._check_primary_health()

            # Check API functionality if primary is healthy
            if primary_result.status in (HealthStatus.HEALTHY, HealthStatus.DEGRADED):
                api_result = await self._check_api_health()

                # Combine results
                combined_result = self._combine_health_results(
                    primary_result, api_result
                )
            else:
                combined_result = primary_result

            # Update internal state
            self._update_health_state(combined_result)

            # Add resource monitoring if enabled
            if config.monitoring.resource_monitoring_enabled:
                combined_result.details[
                    "resources"
                ] = await self._check_resource_usage()

            return combined_result

        except Exception as e:
            error_result = HealthCheckResult(
                status=HealthStatus.UNHEALTHY,
                response_time_ms=(time.time() - start_time) * 1000,
                timestamp=time.time(),
                details={"error": str(e)},
                error=str(e),
            )
            self._update_health_state(error_result)
            return error_result

    async def _check_primary_health(self) -> HealthCheckResult:
        """Check primary health endpoint.

        Returns:
            ``HEALTHY`` when ``http_client.check_health`` returns a truthy
            value, ``UNHEALTHY`` when it returns a falsy value or raises.
        """
        start_time = time.time()

        try:
            # Use existing http_client health check but with detailed timing
            is_healthy = await http_client.check_health(force=True)
            response_time_ms = (time.time() - start_time) * 1000

            status = HealthStatus.HEALTHY if is_healthy else HealthStatus.UNHEALTHY

            return HealthCheckResult(
                status=status,
                response_time_ms=response_time_ms,
                timestamp=time.time(),
                details={
                    "endpoint": "/health",
                    # Synthesised from the boolean outcome; the real HTTP
                    # status code is not available at this level.
                    "http_status": 200 if is_healthy else 500,
                    "response_time_ms": response_time_ms,
                },
            )

        except Exception as e:
            return HealthCheckResult(
                status=HealthStatus.UNHEALTHY,
                response_time_ms=(time.time() - start_time) * 1000,
                timestamp=time.time(),
                details={"endpoint": "/health", "error": str(e)},
                error=str(e),
            )

    async def _check_api_health(self) -> HealthCheckResult:
        """Check API functionality with elements endpoint.

        Returns:
            ``UNHEALTHY`` when the request raises or yields ``None``;
            ``DEGRADED`` when a response arrives but takes longer than 80% of
            the configured health-check timeout; ``HEALTHY`` otherwise.
        """
        start_time = time.time()

        try:
            result = await http_client.get_json("/api/elements")
            response_time_ms = (time.time() - start_time) * 1000

            # Determine status based on response and timing
            if result is None:
                status = HealthStatus.UNHEALTHY
            elif (
                response_time_ms
                > config.monitoring.health_check_timeout_seconds * 800
            ):  # seconds * 1000 ms * 0.8 == 80% of timeout
                status = HealthStatus.DEGRADED
            else:
                status = HealthStatus.HEALTHY

            # Count elements defensively: a payload such as
            # {"elements": null} must not turn a successful response into a
            # failure through a TypeError from len().
            elements = result.get("elements", []) if isinstance(result, dict) else []
            element_count = len(elements) if hasattr(elements, "__len__") else 0

            return HealthCheckResult(
                status=status,
                response_time_ms=response_time_ms,
                timestamp=time.time(),
                details={
                    "endpoint": "/api/elements",
                    "response_time_ms": response_time_ms,
                    "has_response": result is not None,
                    "element_count": element_count,
                },
            )

        except Exception as e:
            return HealthCheckResult(
                status=HealthStatus.UNHEALTHY,
                response_time_ms=(time.time() - start_time) * 1000,
                timestamp=time.time(),
                details={"endpoint": "/api/elements", "error": str(e)},
                error=str(e),
            )

    def _combine_health_results(
        self, primary: HealthCheckResult, api: HealthCheckResult
    ) -> HealthCheckResult:
        """Combine multiple health check results into a single result.

        Args:
            primary: Result of the primary health probe.
            api: Result of the API probe.

        Returns:
            A new result whose status is the worse of the two inputs, whose
            response time is their average, whose timestamp is the later of
            the two, and whose ``error`` is the first error present
            (primary first, then API).
        """
        # Determine overall status (worst case wins; lower rank is worse).
        # Every HealthStatus member is ranked so that no status can raise a
        # KeyError here. UNKNOWN is ranked just above UNHEALTHY because an
        # undetermined state must not mask a known-bad one, but is not
        # evidence of health either.
        status_priority = {
            HealthStatus.UNHEALTHY: 0,
            HealthStatus.UNKNOWN: 1,
            HealthStatus.RECOVERING: 2,
            HealthStatus.DEGRADED: 3,
            HealthStatus.HEALTHY: 4,
        }

        overall_status = min(
            primary.status, api.status, key=lambda s: status_priority[s]
        )

        # Combine response times (average)
        avg_response_time = (primary.response_time_ms + api.response_time_ms) / 2

        # Combine details
        combined_details = {
            "primary_health": primary.details,
            "api_health": api.details,
            "overall_response_time_ms": avg_response_time,
        }

        return HealthCheckResult(
            status=overall_status,
            response_time_ms=avg_response_time,
            timestamp=max(primary.timestamp, api.timestamp),
            details=combined_details,
            # Keep the failure reason visible on the merged result instead of
            # burying it only inside the nested details.
            error=primary.error or api.error,
        )

    async def _check_resource_usage(self) -> dict[str, Any]:
        """Check resource usage of canvas server process.

        Returns:
            A dict of CPU, memory, status and thread figures plus two
            threshold flags, or a dict with a single ``"error"`` key when the
            figures cannot be obtained.
        """
        # Function-scope import: asyncio is only needed here.
        import asyncio

        try:
            from ..process_manager import process_manager

            pid = process_manager.process_pid
            if not pid:
                return {"error": "No process PID available"}

            # cpu_percent(interval=0.1) blocks for the whole interval, so the
            # sampling runs in a worker thread to keep the event loop free.
            return await asyncio.to_thread(self._sample_process_resources, pid)

        except Exception as e:
            logger.warning("Failed to check resource usage: %s", e)
            return {"error": str(e)}

    @staticmethod
    def _sample_process_resources(pid: int) -> dict[str, Any]:
        """Synchronously sample psutil figures for the process with *pid*."""
        try:
            process = psutil.Process(pid)

            # Get CPU and memory usage
            cpu_percent = process.cpu_percent(interval=0.1)
            memory_info = process.memory_info()
            memory_percent = process.memory_percent()

            # Get process status
            status = process.status()
            num_threads = process.num_threads()

            return {
                "cpu_percent": cpu_percent,
                "memory_mb": memory_info.rss / (1024 * 1024),
                "memory_percent": memory_percent,
                "status": status,
                "num_threads": num_threads,
                "cpu_threshold_exceeded": cpu_percent
                > config.monitoring.cpu_threshold_percent,
                "memory_threshold_exceeded": memory_percent
                > config.monitoring.memory_threshold_percent,
            }

        except psutil.NoSuchProcess:
            return {"error": "Canvas server process not found"}
        except psutil.AccessDenied:
            return {"error": "Access denied to process information"}

    def _update_health_state(self, result: HealthCheckResult) -> None:
        """Update internal health state based on check result.

        Args:
            result: The check result to record.
        """
        # Track response times for performance monitoring, keeping only the
        # most recent ``_max_response_history`` samples.
        self._response_times.append(result.response_time_ms)
        if len(self._response_times) > self._max_response_history:
            self._response_times.pop(0)

        # Update failure tracking. Only HEALTHY resets the streak; every
        # other status (including DEGRADED) counts as a failure.
        if result.status == HealthStatus.HEALTHY:
            self._consecutive_failures = 0
            self._last_healthy_time = result.timestamp
        else:
            self._consecutive_failures += 1

    def is_failing(self) -> bool:
        """Check if health checks are consistently failing.

        Returns:
            ``True`` once the consecutive failure count has reached the
            configured ``consecutive_failure_threshold``.
        """
        return (
            self._consecutive_failures
            >= config.monitoring.consecutive_failure_threshold
        )

    def get_failure_count(self) -> int:
        """Get current consecutive failure count."""
        return self._consecutive_failures

    def get_average_response_time(self) -> float:
        """Get average response time from recent checks.

        Returns:
            The mean of the recorded response times in milliseconds, or
            ``0.0`` when nothing has been recorded yet.
        """
        if not self._response_times:
            return 0.0
        return sum(self._response_times) / len(self._response_times)

    def get_last_healthy_time(self) -> float:
        """Get timestamp of last healthy check."""
        return self._last_healthy_time

    def reset_failure_count(self) -> None:
        """Reset failure count (useful after recovery)."""
        self._consecutive_failures = 0

    def get_health_summary(self) -> dict[str, Any]:
        """Get summary of health checker state.

        Returns:
            A dict with the failure count, last healthy timestamp, average
            response time, failing flag, and a copy of the response-time
            history (so callers cannot mutate the internal list).
        """
        return {
            "consecutive_failures": self._consecutive_failures,
            "last_healthy_time": self._last_healthy_time,
            "average_response_time_ms": self.get_average_response_time(),
            "is_failing": self.is_failing(),
            "response_time_history": self._response_times.copy(),
        }