# Coverage for excalidraw_mcp/monitoring/alerts.py: 53% — 218 statements
# (coverage.py v7.10.6 report header, created at 2025-09-16 08:08 -0700)
1"""Alert management with rule-based triggering and multiple delivery channels."""
3import ast
4import asyncio
5import json
6import logging
7import operator
8import time
9from dataclasses import dataclass, field
10from enum import Enum
11from typing import Any
13from ..config import config
15logger = logging.getLogger(__name__)
class AlertLevel(Enum):
    """Alert severity levels (least to most severe)."""

    INFO = "info"  # informational; delivered via logger.info
    WARNING = "warning"  # degraded condition; delivered via logger.warning
    ERROR = "error"  # failure condition; delivered via logger.error
    CRITICAL = "critical"  # severe failure; delivered via logger.critical
class AlertChannel(Enum):
    """Alert delivery channels."""

    LOG = "log"  # write to the module logger (implemented)
    WEBHOOK = "webhook"  # webhook delivery (currently stubbed: payload is only logged)
    EMAIL = "email"  # declared but not yet dispatched by AlertManager._send_alert
    SLACK = "slack"  # declared but not yet dispatched by AlertManager._send_alert
@dataclass
class Alert:
    """A single alert instance produced when a rule's condition fires."""

    id: str  # unique id, e.g. "<rule_name>_<unix ts>" or "manual_<unix ts>"
    rule_name: str  # name of the rule that produced this alert (or a manual title)
    level: AlertLevel  # severity of this alert
    message: str  # formatted human-readable alert text
    timestamp: float  # Unix time at which the alert was triggered
    source: str  # origin identifier, e.g. "excalidraw-mcp" or "manual"
    labels: dict[str, Any] = field(default_factory=dict)  # metrics at trigger time
    resolved: bool = False  # True once the triggering condition has cleared
    resolved_at: float | None = None  # Unix time of resolution; None while active
@dataclass
class AlertRule:
    """Configuration for an alert rule."""

    name: str  # unique rule identifier
    condition: str  # boolean expression over metric names, e.g. "cpu_percent >= 80.0"
    level: AlertLevel  # severity assigned to alerts fired by this rule
    message_template: str  # str.format template filled from metric values
    channels: list[AlertChannel] = field(default_factory=list)  # delivery channels
    throttle_seconds: int = 300  # minimum seconds between repeated sends of this rule
    enabled: bool = True  # disabled rules are skipped during condition checks
class AlertManager:
    """Evaluates alert rules against metrics and manages alert delivery.

    Responsibilities:
      * safely evaluate rule conditions via a restricted AST walker
        (comparisons, ``and``/``or``/``not``, constants, and metric names only)
      * throttle repeated alerts per rule
      * track active alerts, alert history, and per-rule fire counts
      * deliver alerts through the channels configured on each rule

    Mutations of alert state are serialized with an ``asyncio.Lock``, so a
    single instance is safe for concurrent use within one event loop.
    """

    def __init__(self) -> None:
        self._active_alerts: dict[str, Alert] = {}  # firing alerts, keyed by rule name
        self._alert_history: list[Alert] = []  # every alert ever triggered
        self._alert_counts: dict[str, int] = {}  # fire count per rule name
        self._last_sent: dict[str, float] = {}  # last send timestamp per rule name
        self._lock = asyncio.Lock()  # guards the mutable alert state above

        # Initialize alert rules
        self._alert_rules = self._initialize_alert_rules()

    def _initialize_alert_rules(self) -> list[AlertRule]:
        """Initialize standard alert rules."""
        rules: list[AlertRule] = []

        # Health check failure alerts: warn at 3 consecutive failures,
        # escalate to critical at 5 (with a shorter throttle window).
        rules.extend(
            (
                AlertRule(
                    name="health_check_failing",
                    condition="consecutive_health_failures >= 3",
                    level=AlertLevel.WARNING,
                    message_template="Canvas server health checks failing: {consecutive_failures} consecutive failures",
                    channels=[AlertChannel.LOG],
                    throttle_seconds=300,
                ),
                AlertRule(
                    name="health_check_critical",
                    condition="consecutive_health_failures >= 5",
                    level=AlertLevel.CRITICAL,
                    message_template="Canvas server health checks critical: {consecutive_failures} consecutive failures",
                    channels=[AlertChannel.LOG],
                    throttle_seconds=180,
                ),
            )
        )

        # Circuit breaker alerts
        rules.append(
            AlertRule(
                name="circuit_breaker_opened",
                condition="circuit_state == 'open'",
                level=AlertLevel.ERROR,
                message_template="Circuit breaker opened: {failure_rate}% failure rate",
                channels=[AlertChannel.LOG],
                throttle_seconds=600,
            )
        )

        # CPU/Memory alerts
        rules.extend(
            (
                AlertRule(
                    name="high_cpu_usage",
                    condition="cpu_percent >= 80.0",
                    level=AlertLevel.WARNING,
                    message_template="High CPU usage detected: {cpu_percent:.1f}%",
                    channels=[AlertChannel.LOG],
                    throttle_seconds=600,
                ),
                AlertRule(
                    name="high_memory_usage",
                    condition="memory_percent >= 85.0",
                    level=AlertLevel.WARNING,
                    message_template="High memory usage detected: {memory_percent:.1f}%",
                    channels=[AlertChannel.LOG],
                    throttle_seconds=600,
                ),
            )
        )

        # Process failure alerts (short throttle so a dead process keeps alerting)
        rules.append(
            AlertRule(
                name="canvas_process_died",
                condition="process_status == 'dead'",
                level=AlertLevel.CRITICAL,
                message_template="Canvas server process has died",
                channels=[AlertChannel.LOG],
                throttle_seconds=60,
            )
        )

        return rules

    async def check_conditions(self, metrics: dict[str, Any]) -> None:
        """Check every enabled alert rule against the current metrics.

        Triggers alerts whose condition holds and resolves active alerts
        whose condition no longer holds. A rule whose condition fails to
        evaluate is logged and skipped; it never raises to the caller.
        """
        if not config.monitoring.alerting_enabled:
            return

        current_time = time.time()

        for rule in self._alert_rules:
            if not rule.enabled:
                continue

            try:
                # Evaluate condition
                if self._safe_eval_condition(rule.condition, metrics):
                    await self._trigger_alert(rule, metrics, current_time)
                else:
                    # Check if we should resolve existing alert
                    await self._resolve_alert(rule.name, current_time)

            except Exception as e:
                logger.error(f"Error evaluating alert rule '{rule.name}': {e}")

    def _eval_expression(
        self,
        node: ast.Expression,
        operators: dict,
        context: dict[str, Any] | None = None,
    ) -> Any:
        """Evaluate the body of a parsed ``eval``-mode expression."""
        return self._eval_node(node.body, operators, context)

    def _eval_compare(
        self, node: ast.Compare, operators: dict, context: dict[str, Any] | None = None
    ) -> bool:
        """Evaluate a (possibly chained) comparison such as ``a < b <= c``."""
        left = self._eval_node(node.left, operators, context)
        comparisons = []
        for op, comparator in zip(node.ops, node.comparators):
            right = self._eval_node(comparator, operators, context)
            if type(op) in operators:
                comparisons.append(operators[type(op)](left, right))
            else:
                raise ValueError(f"Unsupported operator: {op}")
            # Chained comparisons compare each adjacent pair of operands.
            left = right
        return all(comparisons)

    def _eval_bool_op(
        self, node: ast.BoolOp, operators: dict, context: dict[str, Any] | None = None
    ) -> Any:
        """Evaluate ``and`` / ``or``.

        Operands are evaluated eagerly (no short-circuiting), which is
        acceptable because metric conditions are side-effect free.
        """
        values = [self._eval_node(value, operators, context) for value in node.values]
        if type(node.op) in operators:
            result = values[0]
            for value in values[1:]:
                result = operators[type(node.op)](result, value)
            return result
        else:
            raise ValueError(f"Unsupported boolean operator: {node.op}")

    def _eval_unary_op(
        self, node: ast.UnaryOp, operators: dict, context: dict[str, Any] | None = None
    ) -> Any:
        """Evaluate a unary operation (only ``not`` is supported)."""
        if isinstance(node.op, ast.Not) and type(node.op) in operators:
            return operators[type(node.op)](
                self._eval_node(node.operand, operators, context)
            )
        else:
            raise ValueError(f"Unsupported unary operator: {node.op}")

    def _eval_constant(self, node: ast.Constant) -> Any:
        """Evaluate a literal constant (number, string, bool, None)."""
        return node.value

    def _eval_name(self, node: ast.Name, context: dict[str, Any]) -> Any:
        """Resolve a bare name against the metrics context."""
        if node.id in context:
            return context[node.id]
        else:
            raise ValueError(f"Undefined variable: {node.id}")

    def _eval_node(
        self, node: ast.AST, operators: dict, context: dict[str, Any] | None = None
    ) -> Any:
        """Recursively evaluate an AST node.

        BUG FIX: ``context`` is now threaded through every recursive call.
        Previously the helper methods dropped it, so any condition that
        referenced a metric name raised "Context required for name
        evaluation"; _safe_eval_condition swallowed the error and every
        rule silently evaluated to False (no alerts ever fired).

        Raises:
            ValueError: for any node or operator outside the allowed subset.
        """
        if isinstance(node, ast.Expression):
            return self._eval_expression(node, operators, context)
        elif isinstance(node, ast.Compare):
            return self._eval_compare(node, operators, context)
        elif isinstance(node, ast.BoolOp):
            return self._eval_bool_op(node, operators, context)
        elif isinstance(node, ast.UnaryOp):
            return self._eval_unary_op(node, operators, context)
        elif isinstance(node, ast.Constant):
            # ast.Constant covers numeric and string literals on 3.8+; the
            # deprecated ast.Num / ast.Str branches were removed (this file
            # already requires Python 3.10+ for the `X | Y` union syntax).
            return self._eval_constant(node)
        elif isinstance(node, ast.Name):
            if context is None:
                raise ValueError("Context required for name evaluation")
            return self._eval_name(node, context)
        else:
            raise ValueError(f"Unsupported node type: {type(node)}")

    def _safe_eval_condition(self, condition: str, context: dict[str, Any]) -> bool:
        """Safely evaluate an alert condition using AST parsing.

        Supports basic comparisons and logical operators only. Returns
        False (and logs) if the condition cannot be parsed or evaluated,
        so a malformed rule can never raise into the caller.
        """
        try:
            # Parse the condition into an AST
            tree = ast.parse(condition, mode="eval")

            # Define allowed operations
            operators = {
                ast.Eq: operator.eq,
                ast.NotEq: operator.ne,
                ast.Lt: operator.lt,
                ast.LtE: operator.le,
                ast.Gt: operator.gt,
                ast.GtE: operator.ge,
                ast.And: operator.and_,
                ast.Or: operator.or_,
                ast.Not: operator.not_,
            }

            result = self._eval_node(tree, operators, context)
            return bool(result)

        except Exception as e:
            logger.error(f"Error evaluating condition '{condition}': {e}")
            return False

    async def _trigger_alert(
        self, rule: AlertRule, metrics: dict[str, Any], timestamp: float
    ) -> None:
        """Create, record, and send an alert for *rule* unless throttled."""
        async with self._lock:
            # Check throttling
            if self._should_throttle_alert(rule.name, timestamp):
                return

            # Generate alert ID
            alert_id = f"{rule.name}_{int(timestamp)}"

            # Format message
            message = self._format_alert_message(rule.message_template, metrics)

            alert = Alert(
                id=alert_id,
                rule_name=rule.name,
                level=rule.level,
                message=message,
                timestamp=timestamp,
                source="excalidraw-mcp",
                # Snapshot the metrics so later in-place updates to the
                # caller's dict cannot mutate stored alert history.
                labels=dict(metrics),
            )

            # Store alert
            self._active_alerts[rule.name] = alert
            self._alert_history.append(alert)
            self._alert_counts[rule.name] = self._alert_counts.get(rule.name, 0) + 1
            self._last_sent[rule.name] = timestamp

            # Send alert through configured channels
            await self._send_alert(alert, rule.channels)

            logger.info(f"Alert triggered: {alert.rule_name} - {alert.message}")

    async def _resolve_alert(self, rule_name: str, timestamp: float) -> None:
        """Mark the active alert for *rule_name* resolved, if one exists."""
        async with self._lock:
            if rule_name in self._active_alerts:
                alert = self._active_alerts[rule_name]
                alert.resolved = True
                alert.resolved_at = timestamp

                # Remove from active alerts (it stays in _alert_history)
                del self._active_alerts[rule_name]

                logger.info(f"Alert resolved: {alert.rule_name}")

    def _should_throttle_alert(self, rule_name: str, timestamp: float) -> bool:
        """Return True if *rule_name* fired within its throttle window."""
        if rule_name not in self._last_sent:
            return False

        rule = next((r for r in self._alert_rules if r.name == rule_name), None)
        if not rule:
            return False

        time_since_last = timestamp - self._last_sent[rule_name]
        return time_since_last < rule.throttle_seconds

    def _format_alert_message(self, template: str, metrics: dict[str, Any]) -> str:
        """Format an alert message template with metric values.

        Falls back to the raw template (and logs) if formatting fails,
        e.g. when the template references an unknown placeholder.
        """
        try:
            # Create formatting context
            context = {
                "consecutive_failures": metrics.get("consecutive_health_failures", 0),
                "cpu_percent": metrics.get("cpu_percent", 0),
                "memory_percent": metrics.get("memory_percent", 0),
                "cpu_threshold": config.monitoring.cpu_threshold_percent,
                "memory_threshold": config.monitoring.memory_threshold_percent,
                "failure_rate": metrics.get("circuit_failure_rate", 0),
                "uptime": metrics.get("uptime_seconds", 0),
            }

            return template.format(**context)

        except Exception as e:
            logger.error(f"Error formatting alert message: {e}")
            return template

    async def _send_alert(self, alert: Alert, channels: list[AlertChannel]) -> None:
        """Send an alert through each specified channel, best-effort.

        A delivery failure on one channel is logged and does not prevent
        delivery on the remaining channels. EMAIL and SLACK are not yet
        implemented and are silently skipped.
        """
        for channel in channels:
            try:
                if channel == AlertChannel.LOG:
                    await self._send_log_alert(alert)
                elif channel == AlertChannel.WEBHOOK:
                    await self._send_webhook_alert(alert)
                # Add more channels as needed

            except Exception as e:
                logger.error(f"Failed to send alert via {channel.value}: {e}")

    async def _send_log_alert(self, alert: Alert) -> None:
        """Send alert to the module logger at a level matching its severity."""
        log_level = {
            AlertLevel.INFO: logger.info,
            AlertLevel.WARNING: logger.warning,
            AlertLevel.ERROR: logger.error,
            AlertLevel.CRITICAL: logger.critical,
        }.get(alert.level, logger.info)

        log_level(
            f"ALERT [{alert.level.value.upper()}] {alert.rule_name}: {alert.message}"
        )

    async def _send_webhook_alert(self, alert: Alert) -> None:
        """Send alert via webhook (currently a stub that only logs the payload).

        NOTE(review): the URL is taken from config.security.allowed_origins,
        which looks like a placeholder rather than a real webhook endpoint —
        confirm before wiring up actual HTTP delivery.
        """
        webhook_url = (
            config.security.allowed_origins[0]
            if config.security.allowed_origins
            else None
        )

        if not webhook_url:
            logger.warning("Webhook alert configured but no webhook URL available")
            return

        payload = {
            "alert_id": alert.id,
            "title": alert.rule_name,
            "message": alert.message,
            "level": alert.level.value,
            "timestamp": alert.timestamp,
            "source": alert.source,
            "labels": alert.labels,
        }

        # Would use httpx to send webhook
        logger.info(f"Would send webhook alert to {webhook_url}: {json.dumps(payload)}")

    async def force_alert(
        self,
        title: str,
        message: str,
        level: AlertLevel = AlertLevel.INFO,
        channels: list[AlertChannel] | None = None,
    ) -> None:
        """Manually trigger an alert (bypasses rules, throttling, and
        active-alert tracking; the alert is still recorded in history)."""
        alert = Alert(
            id=f"manual_{int(time.time())}",
            rule_name=title,
            level=level,
            message=message,
            timestamp=time.time(),
            source="manual",
        )

        channels = channels or [AlertChannel.LOG]
        await self._send_alert(alert, channels)

        async with self._lock:
            self._alert_history.append(alert)

    def get_active_alerts(self) -> dict[str, Alert]:
        """Return a shallow copy of currently active alerts, keyed by rule name."""
        return self._active_alerts.copy()

    def get_alert_history(self, limit: int | None = None) -> list[Alert]:
        """Return a copy of alert history, optionally only the last *limit* entries."""
        history = self._alert_history.copy()
        if limit:
            history = history[-limit:]
        return history

    def get_alert_statistics(self) -> dict[str, Any]:
        """Return summary counters about alerts and rules."""
        return {
            "active_alerts": len(self._active_alerts),
            "total_alerts_sent": len(self._alert_history),
            "alert_counts_by_type": self._alert_counts.copy(),
            "rules_enabled": sum(1 for rule in self._alert_rules if rule.enabled),
            "rules_total": len(self._alert_rules),
        }

    def enable_rule(self, rule_name: str) -> bool:
        """Enable an alert rule; return True if the rule was found."""
        for rule in self._alert_rules:
            if rule.name == rule_name:
                rule.enabled = True
                logger.info(f"Alert rule '{rule_name}' enabled")
                return True
        return False

    def disable_rule(self, rule_name: str) -> bool:
        """Disable an alert rule; return True if the rule was found."""
        for rule in self._alert_rules:
            if rule.name == rule_name:
                rule.enabled = False
                logger.info(f"Alert rule '{rule_name}' disabled")
                return True
        return False

    def clear_alert_history(self) -> None:
        """Clear alert history and per-rule counters (active alerts are kept)."""
        self._alert_history.clear()
        self._alert_counts.clear()
        logger.info("Alert history cleared")

    def get_alert_rules(self) -> list[dict[str, Any]]:
        """Return all alert rules as plain JSON-serializable dicts."""
        return [
            {
                "name": rule.name,
                "condition": rule.condition,
                "level": rule.level.value,
                "message_template": rule.message_template,
                "channels": [c.value for c in rule.channels],
                "throttle_seconds": rule.throttle_seconds,
                "enabled": rule.enabled,
            }
            for rule in self._alert_rules
        ]