Coverage for excalidraw_mcp/monitoring/health_checker.py: 90%

115 statements  

Generated by coverage.py v7.10.6 at 2025-09-16 08:08 -0700

1"""Advanced health checking with multi-level status and degradation detection.""" 

2 

3import logging 

4import time 

5from dataclasses import dataclass 

6from enum import Enum 

7from typing import Any 

8 

9import psutil 

10 

11from ..config import config 

12from ..http_client import http_client 

13 

14logger = logging.getLogger(__name__) 

15 

16 

class HealthStatus(Enum):
    """Health status levels reported by the health checker.

    Values are plain strings so they serialize cleanly into health-summary
    payloads.
    """

    HEALTHY = "healthy"  # all checks passing
    DEGRADED = "degraded"  # responding, but slowly (response time near timeout)
    UNHEALTHY = "unhealthy"  # a check failed or raised
    RECOVERING = "recovering"  # transitional state; not assigned within this module
    UNKNOWN = "unknown"  # status not yet determined; not assigned within this module

25 

26 

@dataclass
class HealthCheckResult:
    """Immutable record of one health check operation."""

    status: HealthStatus  # overall status level for this check
    response_time_ms: float  # wall-clock duration of the check, in milliseconds
    timestamp: float  # time.time() when the result was produced
    details: dict[str, Any]  # endpoint/resource specifics; shape varies by check
    error: str | None = None  # error message when the check failed, else None

36 

37 

38class HealthChecker: 

39 """Advanced health checker with multi-endpoint validation and degradation detection.""" 

40 

41 def __init__(self) -> None: 

42 self._consecutive_failures = 0 

43 self._last_healthy_time = time.time() 

44 self._response_times: list[float] = [] 

45 self._max_response_history = 10 

46 

47 # Health check endpoints to validate 

48 self._endpoints = [ 

49 { 

50 "path": "/health", 

51 "timeout": config.monitoring.health_check_timeout_seconds, 

52 }, 

53 { 

54 "path": "/api/elements", 

55 "timeout": config.monitoring.health_check_timeout_seconds * 1.5, 

56 "method": "GET", 

57 }, 

58 ] 

59 

60 async def check_health(self, force: bool = False) -> HealthCheckResult: 

61 """Perform comprehensive health check with multiple endpoints.""" 

62 start_time = time.time() 

63 

64 try: 

65 # Check basic health endpoint 

66 primary_result = await self._check_primary_health() 

67 

68 # Check API functionality if primary is healthy 

69 if primary_result.status in (HealthStatus.HEALTHY, HealthStatus.DEGRADED): 

70 api_result = await self._check_api_health() 

71 

72 # Combine results 

73 combined_result = self._combine_health_results( 

74 primary_result, api_result 

75 ) 

76 else: 

77 combined_result = primary_result 

78 

79 # Update internal state 

80 self._update_health_state(combined_result) 

81 

82 # Add resource monitoring if enabled 

83 if config.monitoring.resource_monitoring_enabled: 

84 combined_result.details[ 

85 "resources" 

86 ] = await self._check_resource_usage() 

87 

88 return combined_result 

89 

90 except Exception as e: 

91 error_result = HealthCheckResult( 

92 status=HealthStatus.UNHEALTHY, 

93 response_time_ms=(time.time() - start_time) * 1000, 

94 timestamp=time.time(), 

95 details={"error": str(e)}, 

96 error=str(e), 

97 ) 

98 self._update_health_state(error_result) 

99 return error_result 

100 

101 async def _check_primary_health(self) -> HealthCheckResult: 

102 """Check primary health endpoint.""" 

103 start_time = time.time() 

104 

105 try: 

106 # Use existing http_client health check but with detailed timing 

107 is_healthy = await http_client.check_health(force=True) 

108 response_time_ms = (time.time() - start_time) * 1000 

109 

110 status = HealthStatus.HEALTHY if is_healthy else HealthStatus.UNHEALTHY 

111 

112 return HealthCheckResult( 

113 status=status, 

114 response_time_ms=response_time_ms, 

115 timestamp=time.time(), 

116 details={ 

117 "endpoint": "/health", 

118 "http_status": 200 if is_healthy else 500, 

119 "response_time_ms": response_time_ms, 

120 }, 

121 ) 

122 

123 except Exception as e: 

124 return HealthCheckResult( 

125 status=HealthStatus.UNHEALTHY, 

126 response_time_ms=(time.time() - start_time) * 1000, 

127 timestamp=time.time(), 

128 details={"endpoint": "/health", "error": str(e)}, 

129 error=str(e), 

130 ) 

131 

132 async def _check_api_health(self) -> HealthCheckResult: 

133 """Check API functionality with elements endpoint.""" 

134 start_time = time.time() 

135 

136 try: 

137 result = await http_client.get_json("/api/elements") 

138 response_time_ms = (time.time() - start_time) * 1000 

139 

140 # Determine status based on response and timing 

141 if result is not None: 

142 # Check if response time indicates degradation 

143 if ( 

144 response_time_ms 

145 > config.monitoring.health_check_timeout_seconds * 800 

146 ): # 80% of timeout 

147 status = HealthStatus.DEGRADED 

148 else: 

149 status = HealthStatus.HEALTHY 

150 else: 

151 status = HealthStatus.UNHEALTHY 

152 

153 return HealthCheckResult( 

154 status=status, 

155 response_time_ms=response_time_ms, 

156 timestamp=time.time(), 

157 details={ 

158 "endpoint": "/api/elements", 

159 "response_time_ms": response_time_ms, 

160 "has_response": result is not None, 

161 "element_count": len(result.get("elements", [])) 

162 if isinstance(result, dict) and result is not None 

163 else 0, 

164 }, 

165 ) 

166 

167 except Exception as e: 

168 return HealthCheckResult( 

169 status=HealthStatus.UNHEALTHY, 

170 response_time_ms=(time.time() - start_time) * 1000, 

171 timestamp=time.time(), 

172 details={"endpoint": "/api/elements", "error": str(e)}, 

173 error=str(e), 

174 ) 

175 

176 def _combine_health_results( 

177 self, primary: HealthCheckResult, api: HealthCheckResult 

178 ) -> HealthCheckResult: 

179 """Combine multiple health check results into a single result.""" 

180 # Determine overall status (worst case wins) 

181 status_priority = { 

182 HealthStatus.UNHEALTHY: 0, 

183 HealthStatus.RECOVERING: 1, 

184 HealthStatus.DEGRADED: 2, 

185 HealthStatus.HEALTHY: 3, 

186 } 

187 

188 overall_status = min( 

189 primary.status, api.status, key=lambda s: status_priority[s] 

190 ) 

191 

192 # Combine response times (average) 

193 avg_response_time = (primary.response_time_ms + api.response_time_ms) / 2 

194 

195 # Combine details 

196 combined_details = { 

197 "primary_health": primary.details, 

198 "api_health": api.details, 

199 "overall_response_time_ms": avg_response_time, 

200 } 

201 

202 return HealthCheckResult( 

203 status=overall_status, 

204 response_time_ms=avg_response_time, 

205 timestamp=max(primary.timestamp, api.timestamp), 

206 details=combined_details, 

207 ) 

208 

209 async def _check_resource_usage(self) -> dict[str, Any]: 

210 """Check resource usage of canvas server process.""" 

211 try: 

212 from ..process_manager import process_manager 

213 

214 if not process_manager.process_pid: 

215 return {"error": "No process PID available"} 

216 

217 # Get process info 

218 try: 

219 process = psutil.Process(process_manager.process_pid) 

220 

221 # Get CPU and memory usage 

222 cpu_percent = process.cpu_percent(interval=0.1) 

223 memory_info = process.memory_info() 

224 memory_percent = process.memory_percent() 

225 

226 # Get process status 

227 status = process.status() 

228 num_threads = process.num_threads() 

229 

230 return { 

231 "cpu_percent": cpu_percent, 

232 "memory_mb": memory_info.rss / (1024 * 1024), 

233 "memory_percent": memory_percent, 

234 "status": status, 

235 "num_threads": num_threads, 

236 "cpu_threshold_exceeded": cpu_percent 

237 > config.monitoring.cpu_threshold_percent, 

238 "memory_threshold_exceeded": memory_percent 

239 > config.monitoring.memory_threshold_percent, 

240 } 

241 

242 except psutil.NoSuchProcess: 

243 return {"error": "Canvas server process not found"} 

244 except psutil.AccessDenied: 

245 return {"error": "Access denied to process information"} 

246 

247 except Exception as e: 

248 logger.warning(f"Failed to check resource usage: {e}") 

249 return {"error": str(e)} 

250 

251 def _update_health_state(self, result: HealthCheckResult) -> None: 

252 """Update internal health state based on check result.""" 

253 # Track response times for performance monitoring 

254 self._response_times.append(result.response_time_ms) 

255 if len(self._response_times) > self._max_response_history: 

256 self._response_times.pop(0) 

257 

258 # Update failure tracking 

259 if result.status == HealthStatus.HEALTHY: 

260 self._consecutive_failures = 0 

261 self._last_healthy_time = result.timestamp 

262 else: 

263 self._consecutive_failures += 1 

264 

265 def is_failing(self) -> bool: 

266 """Check if health checks are consistently failing.""" 

267 return ( 

268 self._consecutive_failures 

269 >= config.monitoring.consecutive_failure_threshold 

270 ) 

271 

272 def get_failure_count(self) -> int: 

273 """Get current consecutive failure count.""" 

274 return self._consecutive_failures 

275 

276 def get_average_response_time(self) -> float: 

277 """Get average response time from recent checks.""" 

278 if not self._response_times: 

279 return 0.0 

280 return sum(self._response_times) / len(self._response_times) 

281 

282 def get_last_healthy_time(self) -> float: 

283 """Get timestamp of last healthy check.""" 

284 return self._last_healthy_time 

285 

286 def reset_failure_count(self) -> None: 

287 """Reset failure count (useful after recovery).""" 

288 self._consecutive_failures = 0 

289 

290 def get_health_summary(self) -> dict[str, Any]: 

291 """Get summary of health checker state.""" 

292 return { 

293 "consecutive_failures": self._consecutive_failures, 

294 "last_healthy_time": self._last_healthy_time, 

295 "average_response_time_ms": self.get_average_response_time(), 

296 "is_failing": self.is_failing(), 

297 "response_time_history": self._response_times.copy(), 

298 }