Coverage for excalidraw_mcp/monitoring/supervisor.py: 90%

181 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-16 08:08 -0700

1"""Main monitoring supervisor that orchestrates all monitoring components.""" 

2 

3import asyncio 

4import logging 

5import time 

6from collections.abc import Awaitable, Callable 

7from typing import Any 

8 

9from ..config import config 

10from ..retry_utils import RetryConfig, retry_async 

11from .alerts import AlertManager 

12from .circuit_breaker import CircuitBreaker 

13from .health_checker import HealthChecker, HealthCheckResult, HealthStatus 

14from .metrics import MetricsCollector 

15 

16logger = logging.getLogger(__name__) 

17 

18 

class MonitoringSupervisor:
    """Orchestrates all monitoring components for canvas server oversight.

    Wires together the health checker, circuit breaker, metrics collector,
    and alert manager; runs a periodic monitoring loop; and attempts
    automatic canvas-server restarts after consecutive health-check failures.
    """

    def __init__(self) -> None:
        # Core monitoring components
        self.health_checker = HealthChecker()
        self.circuit_breaker = CircuitBreaker()
        self.metrics_collector = MetricsCollector()
        self.alert_manager = AlertManager()

        # Supervisor state
        self._running = False
        self._monitoring_task: asyncio.Task[Any] | None = None
        self._restart_count = 0
        self._start_time = time.time()

        # Strong references to fire-and-forget tasks (e.g. circuit-breaker
        # resets) so the event loop cannot garbage-collect them mid-flight.
        self._background_tasks: set[asyncio.Task[Any]] = set()

        # Event hooks for external integration
        self._on_health_change_callbacks: list[Callable[..., Awaitable[None]]] = []
        self._on_restart_callbacks: list[Callable[..., Awaitable[None]]] = []

    async def start(self) -> None:
        """Start monitoring supervision.

        No-op when already running or when monitoring is disabled in config.
        """
        if self._running:
            logger.warning("Monitoring supervisor is already running")
            return

        if not config.monitoring.enabled:
            logger.info("Monitoring is disabled in configuration")
            return

        logger.info("Starting monitoring supervisor...")
        self._running = True
        self._start_time = time.time()

        # Start metrics collection
        await self.metrics_collector.start_collection()

        # Start main monitoring loop
        self._monitoring_task = asyncio.create_task(self._monitoring_loop())

        logger.info("Monitoring supervisor started successfully")

    async def stop(self) -> None:
        """Stop monitoring supervision and release the monitoring task."""
        if not self._running:
            return

        logger.info("Stopping monitoring supervisor...")
        self._running = False

        # Cancel and await the monitoring loop, then drop the stale reference
        # so a stopped supervisor can be started again cleanly.
        if self._monitoring_task:
            self._monitoring_task.cancel()
            try:
                await self._monitoring_task
            except asyncio.CancelledError:
                pass
            self._monitoring_task = None

        # Stop metrics collection
        await self.metrics_collector.stop_collection()

        logger.info("Monitoring supervisor stopped")

    async def _monitoring_loop(self) -> None:
        """Main monitoring loop that coordinates health checks, metrics, and alerts."""
        # Configure retry for monitoring loop
        retry_config = RetryConfig(
            max_attempts=5,  # Limit retries to prevent infinite loops
            max_delay=30.0,
            exponential_base=config.server.sync_retry_exponential_base,
            jitter=config.server.sync_retry_jitter,
        )

        async def _monitoring_cycle() -> None:
            # One full pass: health check -> status handling -> metrics -> alerts.
            loop_start_time = time.time()

            # Perform health check
            health_result = await self._perform_monitored_health_check()

            # Handle health status changes
            await self._handle_health_status(health_result)

            # Collect and analyze metrics
            metrics = await self._collect_monitoring_metrics(health_result)

            # Check alert conditions
            await self.alert_manager.check_conditions(metrics)

            # Log monitoring cycle completion
            cycle_duration = time.time() - loop_start_time
            logger.debug(f"Monitoring cycle completed in {cycle_duration:.2f}s")

        async def _on_retry(attempt: int, exception: Exception) -> None:
            logger.warning(
                f"Monitoring cycle failed (attempt {attempt}), retrying... Error: {exception}"
            )

        while self._running:
            try:
                await retry_async(
                    _monitoring_cycle,
                    retry_config=retry_config,
                    retry_on_exceptions=(Exception,),
                    on_retry=_on_retry,
                )

                # Wait for next cycle
                await asyncio.sleep(config.monitoring.health_check_interval_seconds)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in monitoring loop: {e}")
                await asyncio.sleep(5)  # Brief pause before continuing

    async def _perform_monitored_health_check(self) -> HealthCheckResult:
        """Perform health check with metrics tracking.

        Routes the check through the circuit breaker. On any failure
        (including an open circuit) failure metrics are recorded and a
        synthesized UNHEALTHY result is returned instead of raising.
        """
        start_time = time.time()

        try:
            # Use circuit breaker for health check
            health_result = await self.circuit_breaker.call(
                self.health_checker.check_health
            )

            # Record metrics
            duration = time.time() - start_time
            self.metrics_collector.increment_counter("health_checks_total")
            self.metrics_collector.observe_histogram(
                "health_check_duration_seconds", duration
            )

            if health_result.status != HealthStatus.HEALTHY:
                self.metrics_collector.increment_counter("health_check_failures_total")

            return health_result  # type: ignore

        except Exception as e:
            # Handle circuit breaker or health check errors
            duration = time.time() - start_time
            self.metrics_collector.increment_counter("health_checks_total")
            self.metrics_collector.increment_counter("health_check_failures_total")
            self.metrics_collector.observe_histogram(
                "health_check_duration_seconds", duration
            )

            logger.error(f"Health check failed: {e}")

            # Return unhealthy result (HealthCheckResult is already imported
            # at module level; no local re-import needed).
            return HealthCheckResult(
                status=HealthStatus.UNHEALTHY,
                response_time_ms=duration * 1000,
                timestamp=time.time(),
                details={"error": str(e)},
                error=str(e),
            )

    async def _handle_health_status(self, health_result: HealthCheckResult) -> None:
        """Handle health status changes and trigger recovery actions."""
        current_status = health_result.status

        # Update metrics
        self.metrics_collector.set_gauge(
            "health_check_consecutive_failures", self.health_checker.get_failure_count()
        )

        # Handle consecutive failures
        if self.health_checker.is_failing():
            await self._handle_health_failure()

        # Trigger callbacks for status changes; a bad callback must not break
        # the monitoring loop.
        for callback in self._on_health_change_callbacks:
            try:
                await callback(current_status, health_result)
            except Exception as e:
                logger.error(f"Error in health change callback: {e}")

    async def _handle_health_failure(self) -> None:
        """Handle consistent health check failures with automatic recovery."""
        failure_count = self.health_checker.get_failure_count()

        logger.warning(
            f"Canvas server failing health checks ({failure_count} consecutive failures)"
        )

        # Attempt automatic restart if configured
        if (
            failure_count >= config.monitoring.consecutive_failure_threshold
            and config.server.canvas_auto_start
        ):
            await self._attempt_restart()

    async def _notify_restart_callbacks(self, success: bool) -> None:
        """Invoke all registered restart callbacks, isolating their failures."""
        for callback in self._on_restart_callbacks:
            try:
                await callback(success, self._restart_count)
            except Exception as e:
                logger.error(f"Error in restart callback: {e}")

    async def _attempt_restart(self) -> None:
        """Attempt to restart the canvas server with retries; never raises."""
        try:
            logger.info(
                "Attempting to restart canvas server due to health check failures..."
            )

            # Imported lazily to avoid a circular import at module load time.
            from ..process_manager import process_manager

            # Record restart attempt
            self._restart_count += 1
            self.metrics_collector.increment_counter("canvas_restarts_total")

            # Configure retry for restart attempts
            retry_config = RetryConfig(
                base_delay=2.0,
                max_delay=10.0,
                exponential_base=config.server.sync_retry_exponential_base,
                jitter=config.server.sync_retry_jitter,
            )

            async def _perform_restart() -> bool:
                result = await process_manager.restart()
                if not result:
                    raise RuntimeError("Restart failed")
                return result

            # Attempt restart with retries. Exception already subsumes
            # RuntimeError, so a single tuple entry suffices.
            try:
                restart_success = await retry_async(
                    _perform_restart,
                    retry_config=retry_config,
                    retry_on_exceptions=(Exception,),
                )
            except Exception:
                restart_success = False

            if restart_success:
                logger.info("Canvas server restart successful")

                # Reset health checker failure count
                self.health_checker.reset_failure_count()
            else:
                logger.error("Canvas server restart failed")

            # Notify observers of the outcome, success or failure.
            await self._notify_restart_callbacks(bool(restart_success))

        except Exception as e:
            logger.error(f"Error during restart attempt: {e}")

    async def _collect_monitoring_metrics(
        self, health_result: HealthCheckResult
    ) -> dict[str, Any]:
        """Collect comprehensive metrics for alerting and analysis.

        Best-effort: any collection error is logged and whatever metrics were
        gathered so far are returned (possibly an empty dict).
        """
        metrics: dict[str, Any] = {}

        try:
            # Health metrics
            metrics.update(
                {
                    "consecutive_health_failures": self.health_checker.get_failure_count(),
                    "health_status": health_result.status.value,
                    "health_response_time": health_result.response_time_ms,
                    "avg_health_response_time": self.health_checker.get_average_response_time(),
                }
            )

            # Circuit breaker metrics
            circuit_stats = self.circuit_breaker.get_stats()
            metrics.update(
                {
                    "circuit_state": circuit_stats["state"],
                    "circuit_failure_rate": circuit_stats["failure_rate_percent"],
                    "circuit_failures": circuit_stats["failed_calls"],
                    "circuit_total_calls": circuit_stats["total_calls"],
                }
            )

            # Resource metrics (if available in health result)
            if "resources" in health_result.details:
                resources = health_result.details["resources"]
                if isinstance(resources, dict) and "error" not in resources:
                    metrics.update(
                        {
                            "cpu_percent": resources.get("cpu_percent", 0),
                            "memory_percent": resources.get("memory_percent", 0),
                            "memory_mb": resources.get("memory_mb", 0),
                            "num_threads": resources.get("num_threads", 0),
                        }
                    )

            # Process status (lazy import avoids a circular dependency)
            from ..process_manager import process_manager

            if process_manager.process_pid:
                metrics["process_status"] = "running"
                # NOTE(review): this is supervisor uptime, not the child
                # process's own uptime — confirm that is the intended metric.
                metrics["uptime_seconds"] = time.time() - self._start_time
            else:
                metrics["process_status"] = "dead"
                metrics["uptime_seconds"] = 0

            # Monitoring supervisor metrics
            metrics.update(
                {
                    "restart_count": self._restart_count,
                    "supervisor_uptime": time.time() - self._start_time,
                }
            )

        except Exception as e:
            logger.error(f"Error collecting monitoring metrics: {e}")

        return metrics

    def add_health_change_callback(
        self, callback: Callable[..., Awaitable[None]]
    ) -> None:
        """Add callback for health status changes.

        The callback is awaited as ``callback(status, health_result)``.
        """
        self._on_health_change_callbacks.append(callback)

    def add_restart_callback(self, callback: Callable[..., Awaitable[None]]) -> None:
        """Add callback for restart events.

        The callback is awaited as ``callback(success, restart_count)``.
        """
        self._on_restart_callbacks.append(callback)

    def get_monitoring_status(self) -> dict[str, Any]:
        """Get comprehensive monitoring status."""
        return {
            "enabled": config.monitoring.enabled,
            "running": self._running,
            "uptime_seconds": time.time() - self._start_time,
            "restart_count": self._restart_count,
            # Component status
            "health_checker": {
                "consecutive_failures": self.health_checker.get_failure_count(),
                "is_failing": self.health_checker.is_failing(),
                "last_healthy_time": self.health_checker.get_last_healthy_time(),
                "avg_response_time": self.health_checker.get_average_response_time(),
            },
            "circuit_breaker": self.circuit_breaker.get_stats(),
            "metrics_collector": {
                "collection_running": self.metrics_collector._running,
                "total_metrics": len(self.metrics_collector._counters)
                + len(self.metrics_collector._gauges)
                + len(self.metrics_collector._histograms),
            },
            "alert_manager": self.alert_manager.get_alert_statistics(),
        }

    async def force_health_check(self) -> dict[str, Any]:
        """Force an immediate health check and return results."""
        health_result = await self.health_checker.check_health(force=True)

        return {
            "status": health_result.status.value,
            "response_time_ms": health_result.response_time_ms,
            "timestamp": health_result.timestamp,
            "details": health_result.details,
            "error": health_result.error,
        }

    async def trigger_restart(self) -> bool:
        """Manually trigger a canvas server restart.

        Returns:
            True if the canvas server process is running after the attempt.
        """
        logger.info("Manual restart triggered via monitoring supervisor")
        await self._attempt_restart()

        # Return success status
        from ..process_manager import process_manager

        return process_manager._is_process_running()

    def reset_circuit_breaker(self) -> None:
        """Reset circuit breaker to closed state.

        Must be called from within a running event loop. The reset task is
        retained in ``_background_tasks`` so it cannot be garbage-collected
        before it completes (see asyncio.create_task docs).
        """
        task = asyncio.create_task(self.circuit_breaker.reset())
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)
        logger.info("Circuit breaker reset via monitoring supervisor")

    def get_metrics_summary(self) -> dict[str, Any]:
        """Get summary of all collected metrics."""
        return self.metrics_collector.get_all_metrics()

    def get_recent_alerts(self, limit: int = 10) -> list[Any]:
        """Get recent alert history."""
        return self.alert_manager.get_alert_history(limit=limit)

    @property
    def is_running(self) -> bool:
        """Check if monitoring supervisor is running."""
        return self._running