Coverage for excalidraw_mcp/monitoring/alerts.py: 53%

218 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-16 08:08 -0700

1"""Alert management with rule-based triggering and multiple delivery channels.""" 

2 

3import ast 

4import asyncio 

5import json 

6import logging 

7import operator 

8import time 

9from dataclasses import dataclass, field 

10from enum import Enum 

11from typing import Any 

12 

13from ..config import config 

14 

15logger = logging.getLogger(__name__) 

16 

17 

class AlertLevel(Enum):
    """Alert severity levels.

    Values are the lowercase wire/log representation. When an alert is
    delivered via the LOG channel, AlertManager maps each level to the
    matching ``logging`` method (info/warning/error/critical).
    """

    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"

25 

26 

class AlertChannel(Enum):
    """Alert delivery channels.

    Only LOG and WEBHOOK are currently handled by
    ``AlertManager._send_alert``; EMAIL and SLACK are declared for
    future delivery integrations.
    """

    LOG = "log"
    WEBHOOK = "webhook"
    EMAIL = "email"  # declared but not yet wired to a sender
    SLACK = "slack"  # declared but not yet wired to a sender

34 

35 

@dataclass
class Alert:
    """A single alert instance produced when a rule's condition fires."""

    id: str  # unique id, e.g. "<rule_name>_<unix_ts>" or "manual_<unix_ts>"
    rule_name: str  # name of the AlertRule that fired (or a manual title)
    level: AlertLevel  # severity of this alert
    message: str  # formatted, human-readable message
    timestamp: float  # unix time at which the alert was triggered
    source: str  # originating system, e.g. "excalidraw-mcp" or "manual"
    labels: dict = field(default_factory=dict)  # metrics snapshot at trigger time
    resolved: bool = False  # set True once the triggering condition clears
    resolved_at: float | None = None  # unix time of resolution, if resolved

49 

50 

@dataclass
class AlertRule:
    """Configuration for an alert rule."""

    name: str  # unique rule identifier, also used as the throttling key
    condition: str  # boolean expression over metric names, e.g. "cpu_percent >= 80.0"
    level: AlertLevel  # severity assigned to alerts triggered by this rule
    message_template: str  # str.format template filled with metric values
    channels: list[AlertChannel] = field(default_factory=list)  # delivery channels
    throttle_seconds: int = 300  # minimum seconds between repeated triggers
    enabled: bool = True  # disabled rules are skipped during evaluation

62 

63 

class AlertManager:
    """Manages alert notifications and delivery.

    Rules are evaluated against a metrics snapshot using a restricted AST
    interpreter (comparisons, ``and``/``or``/``not``, constants, and metric
    names only) -- ``eval`` is never used on the condition strings.
    Triggered alerts are throttled per rule and delivered through the
    rule's configured channels.
    """

    def __init__(self) -> None:
        self._active_alerts: dict[str, Alert] = {}  # firing alerts, keyed by rule name
        self._alert_history: list[Alert] = []  # every alert since last clear
        self._alert_counts: dict[str, int] = {}  # trigger count per rule name
        self._last_sent: dict[str, float] = {}  # last trigger time per rule (throttling)
        self._lock = asyncio.Lock()  # guards the mutable state above

        # Initialize alert rules
        self._alert_rules = self._initialize_alert_rules()

    def _initialize_alert_rules(self) -> list[AlertRule]:
        """Build the default rule set.

        Covers health-check failures, circuit-breaker state, CPU/memory
        pressure, and canvas process liveness. All default rules deliver
        to the LOG channel only.
        """
        rules: list[AlertRule] = []

        # Health check failure alerts (warning at 3 consecutive, critical at 5)
        rules.extend(
            (
                AlertRule(
                    name="health_check_failing",
                    condition="consecutive_health_failures >= 3",
                    level=AlertLevel.WARNING,
                    message_template="Canvas server health checks failing: {consecutive_failures} consecutive failures",
                    channels=[AlertChannel.LOG],
                    throttle_seconds=300,
                ),
                AlertRule(
                    name="health_check_critical",
                    condition="consecutive_health_failures >= 5",
                    level=AlertLevel.CRITICAL,
                    message_template="Canvas server health checks critical: {consecutive_failures} consecutive failures",
                    channels=[AlertChannel.LOG],
                    throttle_seconds=180,
                ),
            )
        )

        # Circuit breaker alerts
        rules.append(
            AlertRule(
                name="circuit_breaker_opened",
                condition="circuit_state == 'open'",
                level=AlertLevel.ERROR,
                message_template="Circuit breaker opened: {failure_rate}% failure rate",
                channels=[AlertChannel.LOG],
                throttle_seconds=600,
            )
        )

        # CPU/Memory alerts
        rules.extend(
            (
                AlertRule(
                    name="high_cpu_usage",
                    condition="cpu_percent >= 80.0",
                    level=AlertLevel.WARNING,
                    message_template="High CPU usage detected: {cpu_percent:.1f}%",
                    channels=[AlertChannel.LOG],
                    throttle_seconds=600,
                ),
                AlertRule(
                    name="high_memory_usage",
                    condition="memory_percent >= 85.0",
                    level=AlertLevel.WARNING,
                    message_template="High memory usage detected: {memory_percent:.1f}%",
                    channels=[AlertChannel.LOG],
                    throttle_seconds=600,
                ),
            )
        )

        # Process failure alerts
        rules.append(
            AlertRule(
                name="canvas_process_died",
                condition="process_status == 'dead'",
                level=AlertLevel.CRITICAL,
                message_template="Canvas server process has died",
                channels=[AlertChannel.LOG],
                throttle_seconds=60,
            )
        )

        return rules

    async def check_conditions(self, metrics: dict[str, Any]) -> None:
        """Evaluate every enabled rule against the current *metrics*.

        Rules whose condition holds are triggered (subject to per-rule
        throttling); rules whose condition no longer holds have any
        active alert resolved. Evaluation errors are logged and never
        propagate to the caller.
        """
        if not config.monitoring.alerting_enabled:
            return

        current_time = time.time()

        for rule in self._alert_rules:
            if not rule.enabled:
                continue

            try:
                if self._safe_eval_condition(rule.condition, metrics):
                    await self._trigger_alert(rule, metrics, current_time)
                else:
                    # Condition cleared: resolve any active alert for this rule.
                    await self._resolve_alert(rule.name, current_time)

            except Exception as e:
                logger.error(f"Error evaluating alert rule '{rule.name}': {e}")

    def _eval_expression(
        self,
        node: ast.Expression,
        operators: dict,
        context: dict[str, Any] | None = None,
    ) -> Any:
        """Evaluate the body of a parsed ``eval``-mode expression.

        BUGFIX: *context* is now forwarded to ``_eval_node``. Previously it
        was dropped here (and in the other helpers), so any condition that
        referenced a metric name raised "Context required for name
        evaluation" and silently evaluated to False -- no rule could fire.
        """
        return self._eval_node(node.body, operators, context)

    def _eval_compare(
        self,
        node: ast.Compare,
        operators: dict,
        context: dict[str, Any] | None = None,
    ) -> bool:
        """Evaluate a (possibly chained) comparison such as ``a < b <= c``."""
        left = self._eval_node(node.left, operators, context)
        comparisons = []
        for op, comparator in zip(node.ops, node.comparators):
            right = self._eval_node(comparator, operators, context)
            if type(op) in operators:
                comparisons.append(operators[type(op)](left, right))
            else:
                raise ValueError(f"Unsupported operator: {op}")
            # Chained comparisons compare each adjacent pair.
            left = right
        return all(comparisons)

    def _eval_bool_op(
        self,
        node: ast.BoolOp,
        operators: dict,
        context: dict[str, Any] | None = None,
    ) -> Any:
        """Evaluate an ``and`` / ``or`` chain.

        Note: unlike Python's native operators this is NOT short-circuit --
        every operand is evaluated before the results are folded.
        """
        values = [self._eval_node(value, operators, context) for value in node.values]
        if type(node.op) in operators:
            result = values[0]
            for value in values[1:]:
                result = operators[type(node.op)](result, value)
            return result
        else:
            raise ValueError(f"Unsupported boolean operator: {node.op}")

    def _eval_unary_op(
        self,
        node: ast.UnaryOp,
        operators: dict,
        context: dict[str, Any] | None = None,
    ) -> Any:
        """Evaluate a unary operation; only ``not`` is supported."""
        if isinstance(node.op, ast.Not) and type(node.op) in operators:
            return operators[type(node.op)](
                self._eval_node(node.operand, operators, context)
            )
        else:
            raise ValueError(f"Unsupported unary operator: {node.op}")

    def _eval_constant(self, node: ast.Constant) -> Any:
        """Return the literal value of a constant node."""
        return node.value

    def _eval_name(self, node: ast.Name, context: dict[str, Any]) -> Any:
        """Resolve a bare name against the metrics context."""
        if node.id in context:
            return context[node.id]
        else:
            raise ValueError(f"Undefined variable: {node.id}")

    def _eval_node(
        self, node: ast.AST, operators: dict, context: dict[str, Any] | None = None
    ) -> Any:
        """Recursively evaluate an AST node.

        *context* (metric name -> value) is threaded through every
        recursive dispatch so names can be resolved at any depth of the
        expression tree.
        """
        if isinstance(node, ast.Expression):
            return self._eval_expression(node, operators, context)
        elif isinstance(node, ast.Compare):
            return self._eval_compare(node, operators, context)
        elif isinstance(node, ast.BoolOp):
            return self._eval_bool_op(node, operators, context)
        elif isinstance(node, ast.UnaryOp):
            return self._eval_unary_op(node, operators, context)
        elif isinstance(node, ast.Constant):
            # ast.parse on Python 3.8+ emits Constant for all literals, so
            # the deprecated ast.Num / ast.Str branches were unreachable and
            # have been removed.
            return self._eval_constant(node)
        elif isinstance(node, ast.Name):
            if context is None:
                raise ValueError("Context required for name evaluation")
            return self._eval_name(node, context)
        else:
            raise ValueError(f"Unsupported node type: {type(node)}")

    def _safe_eval_condition(self, condition: str, context: dict[str, Any]) -> bool:
        """Safely evaluate an alert condition using AST parsing.

        Supports basic comparisons and logical operators only. Returns
        False (and logs the error) on any parse or evaluation failure, so
        a malformed rule can never crash the monitoring loop.
        """
        try:
            # Parse the condition into an AST
            tree = ast.parse(condition, mode="eval")

            # Whitelist of allowed operations; anything else raises.
            operators = {
                ast.Eq: operator.eq,
                ast.NotEq: operator.ne,
                ast.Lt: operator.lt,
                ast.LtE: operator.le,
                ast.Gt: operator.gt,
                ast.GtE: operator.ge,
                ast.And: operator.and_,
                ast.Or: operator.or_,
                ast.Not: operator.not_,
            }

            result = self._eval_node(tree, operators, context)
            return bool(result)

        except Exception as e:
            logger.error(f"Error evaluating condition '{condition}': {e}")
            return False

    async def _trigger_alert(
        self, rule: AlertRule, metrics: dict[str, Any], timestamp: float
    ) -> None:
        """Create, record, and deliver an alert for *rule* unless throttled."""
        async with self._lock:
            # Check throttling
            if self._should_throttle_alert(rule.name, timestamp):
                return

            # Generate alert ID
            alert_id = f"{rule.name}_{int(timestamp)}"

            # Format message
            message = self._format_alert_message(rule.message_template, metrics)

            alert = Alert(
                id=alert_id,
                rule_name=rule.name,
                level=rule.level,
                message=message,
                timestamp=timestamp,
                source="excalidraw-mcp",
                # Copy so later mutation of the caller's metrics dict does
                # not rewrite the recorded snapshot.
                labels=dict(metrics),
            )

            # Store alert and update throttling bookkeeping
            self._active_alerts[rule.name] = alert
            self._alert_history.append(alert)
            self._alert_counts[rule.name] = self._alert_counts.get(rule.name, 0) + 1
            self._last_sent[rule.name] = timestamp

            # Send alert through configured channels
            await self._send_alert(alert, rule.channels)

            logger.info(f"Alert triggered: {alert.rule_name} - {alert.message}")

    async def _resolve_alert(self, rule_name: str, timestamp: float) -> None:
        """Mark the active alert for *rule_name* resolved, if one exists."""
        async with self._lock:
            if rule_name in self._active_alerts:
                alert = self._active_alerts[rule_name]
                alert.resolved = True
                alert.resolved_at = timestamp

                # Remove from active alerts; it stays in the history.
                del self._active_alerts[rule_name]

                logger.info(f"Alert resolved: {alert.rule_name}")

    def _should_throttle_alert(self, rule_name: str, timestamp: float) -> bool:
        """Return True if the rule fired within its throttle window."""
        if rule_name not in self._last_sent:
            return False

        rule = next((r for r in self._alert_rules if r.name == rule_name), None)
        if not rule:
            # Unknown rule (e.g. removed since last trigger): don't throttle.
            return False

        time_since_last = timestamp - self._last_sent[rule_name]
        return time_since_last < rule.throttle_seconds

    def _format_alert_message(self, template: str, metrics: dict[str, Any]) -> str:
        """Fill *template* with metric values; fall back to the raw template.

        The formatting context exposes a fixed set of keys, so a template
        referencing anything else raises KeyError and the unformatted
        template is returned (logged, never raised).
        """
        try:
            context = {
                "consecutive_failures": metrics.get("consecutive_health_failures", 0),
                "cpu_percent": metrics.get("cpu_percent", 0),
                "memory_percent": metrics.get("memory_percent", 0),
                "cpu_threshold": config.monitoring.cpu_threshold_percent,
                "memory_threshold": config.monitoring.memory_threshold_percent,
                "failure_rate": metrics.get("circuit_failure_rate", 0),
                "uptime": metrics.get("uptime_seconds", 0),
            }

            return template.format(**context)

        except Exception as e:
            logger.error(f"Error formatting alert message: {e}")
            return template

    async def _send_alert(self, alert: Alert, channels: list[AlertChannel]) -> None:
        """Deliver *alert* through each channel; failures are logged per channel."""
        for channel in channels:
            try:
                if channel == AlertChannel.LOG:
                    await self._send_log_alert(alert)
                elif channel == AlertChannel.WEBHOOK:
                    await self._send_webhook_alert(alert)
                # EMAIL / SLACK are not implemented yet; add handlers here.

            except Exception as e:
                logger.error(f"Failed to send alert via {channel.value}: {e}")

    async def _send_log_alert(self, alert: Alert) -> None:
        """Emit the alert at the logging level matching its severity."""
        log_level = {
            AlertLevel.INFO: logger.info,
            AlertLevel.WARNING: logger.warning,
            AlertLevel.ERROR: logger.error,
            AlertLevel.CRITICAL: logger.critical,
        }.get(alert.level, logger.info)

        log_level(
            f"ALERT [{alert.level.value.upper()}] {alert.rule_name}: {alert.message}"
        )

    async def _send_webhook_alert(self, alert: Alert) -> None:
        """Send alert via webhook (currently a stub that only logs).

        NOTE(review): the URL is taken from ``config.security.allowed_origins``,
        which looks like a CORS setting rather than a webhook endpoint --
        confirm and replace with a dedicated webhook config value.
        """
        webhook_url = (
            config.security.allowed_origins[0]
            if config.security.allowed_origins
            else None
        )

        if not webhook_url:
            logger.warning("Webhook alert configured but no webhook URL available")
            return

        payload = {
            "alert_id": alert.id,
            "title": alert.rule_name,
            "message": alert.message,
            "level": alert.level.value,
            "timestamp": alert.timestamp,
            "source": alert.source,
            "labels": alert.labels,
        }

        # Would use httpx to send the webhook; for now just log the payload.
        logger.info(f"Would send webhook alert to {webhook_url}: {json.dumps(payload)}")

    async def force_alert(
        self,
        title: str,
        message: str,
        level: AlertLevel = AlertLevel.INFO,
        channels: list[AlertChannel] | None = None,
    ) -> None:
        """Manually trigger a one-off alert (bypasses rules and throttling)."""
        alert = Alert(
            id=f"manual_{int(time.time())}",
            rule_name=title,
            level=level,
            message=message,
            timestamp=time.time(),
            source="manual",
        )

        channels = channels or [AlertChannel.LOG]
        await self._send_alert(alert, channels)

        async with self._lock:
            self._alert_history.append(alert)

    def get_active_alerts(self) -> dict[str, Alert]:
        """Return a shallow copy of all currently active alerts."""
        return self._active_alerts.copy()

    def get_alert_history(self, limit: int | None = None) -> list[Alert]:
        """Return (up to the most recent *limit*) alerts from the history.

        A falsy *limit* (None or 0) returns the full history.
        """
        history = self._alert_history.copy()
        if limit:
            history = history[-limit:]
        return history

    def get_alert_statistics(self) -> dict[str, Any]:
        """Return summary counters for monitoring dashboards."""
        return {
            "active_alerts": len(self._active_alerts),
            "total_alerts_sent": len(self._alert_history),
            "alert_counts_by_type": self._alert_counts.copy(),
            "rules_enabled": sum(1 for rule in self._alert_rules if rule.enabled),
            "rules_total": len(self._alert_rules),
        }

    def enable_rule(self, rule_name: str) -> bool:
        """Enable an alert rule; returns False if no such rule exists."""
        for rule in self._alert_rules:
            if rule.name == rule_name:
                rule.enabled = True
                logger.info(f"Alert rule '{rule_name}' enabled")
                return True
        return False

    def disable_rule(self, rule_name: str) -> bool:
        """Disable an alert rule; returns False if no such rule exists."""
        for rule in self._alert_rules:
            if rule.name == rule_name:
                rule.enabled = False
                logger.info(f"Alert rule '{rule_name}' disabled")
                return True
        return False

    def clear_alert_history(self) -> None:
        """Clear the alert history and per-rule counters.

        Does not touch active alerts or throttling timestamps.
        """
        self._alert_history.clear()
        self._alert_counts.clear()
        logger.info("Alert history cleared")

    def get_alert_rules(self) -> list[dict[str, Any]]:
        """Return all alert rules as JSON-serializable dicts."""
        return [
            {
                "name": rule.name,
                "condition": rule.condition,
                "level": rule.level.value,
                "message_template": rule.message_template,
                "channels": [c.value for c in rule.channels],
                "throttle_seconds": rule.throttle_seconds,
                "enabled": rule.enabled,
            }
            for rule in self._alert_rules
        ]