Coverage for excalidraw_mcp/monitoring/supervisor.py: 90%
181 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-16 08:08 -0700
1"""Main monitoring supervisor that orchestrates all monitoring components."""
3import asyncio
4import logging
5import time
6from collections.abc import Awaitable, Callable
7from typing import Any
9from ..config import config
10from ..retry_utils import RetryConfig, retry_async
11from .alerts import AlertManager
12from .circuit_breaker import CircuitBreaker
13from .health_checker import HealthChecker, HealthCheckResult, HealthStatus
14from .metrics import MetricsCollector
16logger = logging.getLogger(__name__)
19class MonitoringSupervisor:
20 """Orchestrates all monitoring components for canvas server oversight."""
    def __init__(self) -> None:
        """Create the supervisor with fresh monitoring components and no active loop."""
        # Core monitoring components that do the actual work: health probing,
        # failure isolation, metric aggregation, and alert evaluation.
        self.health_checker = HealthChecker()
        self.circuit_breaker = CircuitBreaker()
        self.metrics_collector = MetricsCollector()
        self.alert_manager = AlertManager()

        # Supervisor state
        self._running = False
        # Handle to the background monitoring loop; None until start() is called.
        self._monitoring_task: asyncio.Task[Any] | None = None
        # Total restarts attempted over this supervisor's lifetime.
        self._restart_count = 0
        self._start_time = time.time()

        # Event hooks for external integration (all callbacks are awaited).
        self._on_health_change_callbacks: list[Callable[..., Awaitable[None]]] = []
        self._on_restart_callbacks: list[Callable[..., Awaitable[None]]] = []
39 async def start(self) -> None:
40 """Start monitoring supervision."""
41 if self._running:
42 logger.warning("Monitoring supervisor is already running")
43 return
45 if not config.monitoring.enabled:
46 logger.info("Monitoring is disabled in configuration")
47 return
49 logger.info("Starting monitoring supervisor...")
50 self._running = True
51 self._start_time = time.time()
53 # Start metrics collection
54 await self.metrics_collector.start_collection()
56 # Start main monitoring loop
57 self._monitoring_task = asyncio.create_task(self._monitoring_loop())
59 logger.info("Monitoring supervisor started successfully")
61 async def stop(self) -> None:
62 """Stop monitoring supervision."""
63 if not self._running:
64 return
66 logger.info("Stopping monitoring supervisor...")
67 self._running = False
69 # Stop monitoring loop
70 if self._monitoring_task:
71 self._monitoring_task.cancel()
72 try:
73 await self._monitoring_task
74 except asyncio.CancelledError:
75 pass
77 # Stop metrics collection
78 await self.metrics_collector.stop_collection()
80 logger.info("Monitoring supervisor stopped")
    async def _monitoring_loop(self) -> None:
        """Main monitoring loop that coordinates health checks, metrics, and alerts.

        Runs until ``stop()`` clears ``self._running`` or the task is
        cancelled. Each cycle: health check -> status handling -> metric
        collection -> alert evaluation, then sleep for the configured
        interval. Failing cycles are retried with backoff via retry_async.
        """
        # Configure retry for monitoring loop
        retry_config = RetryConfig(
            max_attempts=5,  # Limit retries to prevent infinite loops
            max_delay=30.0,
            exponential_base=config.server.sync_retry_exponential_base,
            jitter=config.server.sync_retry_jitter,
        )

        async def _monitoring_cycle() -> None:
            # One full supervision pass; any exception here triggers a retry.
            loop_start_time = time.time()

            # Perform health check
            health_result = await self._perform_monitored_health_check()

            # Handle health status changes
            await self._handle_health_status(health_result)

            # Collect and analyze metrics
            metrics = await self._collect_monitoring_metrics(health_result)

            # Check alert conditions
            await self.alert_manager.check_conditions(metrics)

            # Log monitoring cycle completion
            cycle_duration = time.time() - loop_start_time
            logger.debug(f"Monitoring cycle completed in {cycle_duration:.2f}s")

        async def _on_retry(attempt: int, exception: Exception) -> None:
            logger.warning(
                f"Monitoring cycle failed (attempt {attempt}), retrying... Error: {exception}"
            )

        while self._running:
            try:
                await retry_async(
                    _monitoring_cycle,
                    retry_config=retry_config,
                    retry_on_exceptions=(Exception,),
                    on_retry=_on_retry,
                )

                # Wait for next cycle
                await asyncio.sleep(config.monitoring.health_check_interval_seconds)

            except asyncio.CancelledError:
                # stop() cancelled the task; exit quietly.
                break
            except Exception as e:
                # Retries exhausted: log and keep supervising after a pause.
                logger.error(f"Error in monitoring loop: {e}")
                await asyncio.sleep(5)  # Brief pause before continuing
134 async def _perform_monitored_health_check(self) -> HealthCheckResult:
135 """Perform health check with metrics tracking."""
136 start_time = time.time()
138 try:
139 # Use circuit breaker for health check
140 health_result = await self.circuit_breaker.call(
141 self.health_checker.check_health
142 )
144 # Record metrics
145 duration = time.time() - start_time
146 self.metrics_collector.increment_counter("health_checks_total")
147 self.metrics_collector.observe_histogram(
148 "health_check_duration_seconds", duration
149 )
151 if health_result.status != HealthStatus.HEALTHY:
152 self.metrics_collector.increment_counter("health_check_failures_total")
154 return health_result # type: ignore
156 except Exception as e:
157 # Handle circuit breaker or health check errors
158 duration = time.time() - start_time
159 self.metrics_collector.increment_counter("health_checks_total")
160 self.metrics_collector.increment_counter("health_check_failures_total")
161 self.metrics_collector.observe_histogram(
162 "health_check_duration_seconds", duration
163 )
165 logger.error(f"Health check failed: {e}")
167 # Return unhealthy result
168 from .health_checker import HealthCheckResult
170 result = HealthCheckResult(
171 status=HealthStatus.UNHEALTHY,
172 response_time_ms=duration * 1000,
173 timestamp=time.time(),
174 details={"error": str(e)},
175 error=str(e),
176 )
177 return result
179 async def _handle_health_status(self, health_result: HealthCheckResult) -> None:
180 """Handle health status changes and trigger recovery actions."""
181 current_status = health_result.status
183 # Update metrics
184 self.metrics_collector.set_gauge(
185 "health_check_consecutive_failures", self.health_checker.get_failure_count()
186 )
188 # Handle consecutive failures
189 if self.health_checker.is_failing():
190 await self._handle_health_failure()
192 # Trigger callbacks for status changes
193 for callback in self._on_health_change_callbacks:
194 try:
195 await callback(current_status, health_result)
196 except Exception as e:
197 logger.error(f"Error in health change callback: {e}")
199 async def _handle_health_failure(self) -> None:
200 """Handle consistent health check failures with automatic recovery."""
201 failure_count = self.health_checker.get_failure_count()
203 logger.warning(
204 f"Canvas server failing health checks ({failure_count} consecutive failures)"
205 )
207 # Attempt automatic restart if configured
208 if (
209 failure_count >= config.monitoring.consecutive_failure_threshold
210 and config.server.canvas_auto_start
211 ):
212 await self._attempt_restart()
214 async def _attempt_restart(self) -> None:
215 """Attempt to restart the canvas server."""
216 try:
217 logger.info(
218 "Attempting to restart canvas server due to health check failures..."
219 )
221 from ..process_manager import process_manager
223 # Record restart attempt
224 self._restart_count += 1
225 self.metrics_collector.increment_counter("canvas_restarts_total")
227 # Configure retry for restart attempts
228 retry_config = RetryConfig(
229 base_delay=2.0,
230 max_delay=10.0,
231 exponential_base=config.server.sync_retry_exponential_base,
232 jitter=config.server.sync_retry_jitter,
233 )
235 async def _perform_restart() -> bool:
236 result = await process_manager.restart()
237 if not result:
238 raise RuntimeError("Restart failed")
239 return result
241 # Attempt restart with retries
242 try:
243 restart_success = await retry_async(
244 _perform_restart,
245 retry_config=retry_config,
246 retry_on_exceptions=(RuntimeError, Exception),
247 )
248 except Exception:
249 restart_success = False
251 if restart_success:
252 logger.info("Canvas server restart successful")
254 # Reset health checker failure count
255 self.health_checker.reset_failure_count()
257 # Trigger restart callbacks
258 for callback in self._on_restart_callbacks:
259 try:
260 await callback(True, self._restart_count)
261 except Exception as e:
262 logger.error(f"Error in restart callback: {e}")
263 else:
264 logger.error("Canvas server restart failed")
266 # Trigger failure callbacks
267 for callback in self._on_restart_callbacks:
268 try:
269 await callback(False, self._restart_count)
270 except Exception as e:
271 logger.error(f"Error in restart callback: {e}")
273 except Exception as e:
274 logger.error(f"Error during restart attempt: {e}")
    async def _collect_monitoring_metrics(
        self, health_result: HealthCheckResult
    ) -> dict[str, Any]:
        """Collect comprehensive metrics for alerting and analysis.

        Builds a flat dict from the health result, circuit breaker stats,
        optional resource data embedded in the health details, process
        status, and supervisor counters. Collection errors are logged and a
        partial (possibly empty) dict is returned rather than raising.
        """
        metrics: dict[str, Any] = {}

        try:
            # Health metrics
            metrics.update(
                {
                    "consecutive_health_failures": self.health_checker.get_failure_count(),
                    "health_status": health_result.status.value,
                    "health_response_time": health_result.response_time_ms,
                    "avg_health_response_time": self.health_checker.get_average_response_time(),
                }
            )

            # Circuit breaker metrics
            circuit_stats = self.circuit_breaker.get_stats()
            metrics.update(
                {
                    "circuit_state": circuit_stats["state"],
                    "circuit_failure_rate": circuit_stats["failure_rate_percent"],
                    "circuit_failures": circuit_stats["failed_calls"],
                    "circuit_total_calls": circuit_stats["total_calls"],
                }
            )

            # Resource metrics (if available in health result)
            if "resources" in health_result.details:
                resources = health_result.details["resources"]
                # Skip when the checker stored an error marker instead of data.
                if isinstance(resources, dict) and "error" not in resources:
                    metrics.update(
                        {
                            "cpu_percent": resources.get("cpu_percent", 0),
                            "memory_percent": resources.get("memory_percent", 0),
                            "memory_mb": resources.get("memory_mb", 0),
                            "num_threads": resources.get("num_threads", 0),
                        }
                    )

            # Process status
            from ..process_manager import process_manager

            if process_manager.process_pid:
                metrics["process_status"] = "running"
                # NOTE(review): uptime here is the supervisor's uptime, not the
                # canvas process's -- confirm that is the intended semantics.
                metrics["uptime_seconds"] = time.time() - self._start_time
            else:
                metrics["process_status"] = "dead"
                metrics["uptime_seconds"] = 0

            # Monitoring supervisor metrics
            metrics.update(
                {
                    "restart_count": self._restart_count,
                    "supervisor_uptime": time.time() - self._start_time,
                }
            )

        except Exception as e:
            logger.error(f"Error collecting monitoring metrics: {e}")

        return metrics
340 def add_health_change_callback(
341 self, callback: Callable[..., Awaitable[None]]
342 ) -> None:
343 """Add callback for health status changes."""
344 self._on_health_change_callbacks.append(callback)
346 def add_restart_callback(self, callback: Callable[..., Awaitable[None]]) -> None:
347 """Add callback for restart events."""
348 self._on_restart_callbacks.append(callback)
    def get_monitoring_status(self) -> dict[str, Any]:
        """Get comprehensive monitoring status.

        Returns:
            A snapshot dict covering the supervisor itself plus a sub-dict
            per component (health checker, circuit breaker, metrics
            collector, alert manager).
        """
        return {
            "enabled": config.monitoring.enabled,
            "running": self._running,
            "uptime_seconds": time.time() - self._start_time,
            "restart_count": self._restart_count,
            # Component status
            "health_checker": {
                "consecutive_failures": self.health_checker.get_failure_count(),
                "is_failing": self.health_checker.is_failing(),
                "last_healthy_time": self.health_checker.get_last_healthy_time(),
                "avg_response_time": self.health_checker.get_average_response_time(),
            },
            "circuit_breaker": self.circuit_breaker.get_stats(),
            # NOTE(review): reaches into MetricsCollector private state
            # (_running, _counters, _gauges, _histograms) -- consider a
            # public accessor on MetricsCollector instead.
            "metrics_collector": {
                "collection_running": self.metrics_collector._running,
                "total_metrics": len(self.metrics_collector._counters)
                + len(self.metrics_collector._gauges)
                + len(self.metrics_collector._histograms),
            },
            "alert_manager": self.alert_manager.get_alert_statistics(),
        }
374 async def force_health_check(self) -> dict[str, Any]:
375 """Force an immediate health check and return results."""
376 health_result = await self.health_checker.check_health(force=True)
378 return {
379 "status": health_result.status.value,
380 "response_time_ms": health_result.response_time_ms,
381 "timestamp": health_result.timestamp,
382 "details": health_result.details,
383 "error": health_result.error,
384 }
    async def trigger_restart(self) -> bool:
        """Manually trigger a canvas server restart.

        Returns:
            True when the canvas process is running after the restart attempt.
        """
        logger.info("Manual restart triggered via monitoring supervisor")
        await self._attempt_restart()

        # Return success status
        from ..process_manager import process_manager

        # NOTE(review): calls a private helper of process_manager; consider
        # exposing a public "is running" accessor there instead.
        return process_manager._is_process_running()
396 def reset_circuit_breaker(self) -> None:
397 """Reset circuit breaker to closed state."""
398 asyncio.create_task(self.circuit_breaker.reset())
399 logger.info("Circuit breaker reset via monitoring supervisor")
401 def get_metrics_summary(self) -> dict[str, Any]:
402 """Get summary of all collected metrics."""
403 return self.metrics_collector.get_all_metrics()
405 def get_recent_alerts(self, limit: int = 10) -> list[Any]:
406 """Get recent alert history."""
407 return self.alert_manager.get_alert_history(limit=limit)
409 @property
410 def is_running(self) -> bool:
411 """Check if monitoring supervisor is running."""
412 return self._running