Coverage for excalidraw_mcp/monitoring/metrics.py: 77%

230 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-16 08:08 -0700

1"""Metrics collection system for monitoring canvas server performance.""" 

2 

3import asyncio 

4import logging 

5import time 

6from collections import defaultdict, deque 

7from dataclasses import dataclass, field 

8from typing import Any 

9 

10import psutil 

11 

12from ..config import config 

13 

14logger = logging.getLogger(__name__) 

15 

16 

@dataclass
class MetricPoint:
    """One timestamped sample of a metric.

    A point couples a numeric reading with the moment it was recorded and
    an optional set of string labels describing the series it belongs to.
    """

    # Moment the sample was recorded; the collection loop in this module
    # passes ``time.time()`` here.
    timestamp: float
    # Numeric reading captured at ``timestamp``.
    value: float
    # Label name -> label value; every instance gets its own empty dict
    # when the argument is omitted.
    labels: dict[str, str] = field(default_factory=dict)

24 

25 

@dataclass
class Counter:
    """Running-total metric intended to grow monotonically.

    The class itself does not validate increments: ``inc`` simply adds the
    given amount, so passing a negative amount lowers the value.
    """

    # Metric name used as the registry key and in exported output.
    name: str
    # Human-readable description of what is being counted.
    help_text: str
    # Current total.
    value: float = 0.0
    # Label name -> label value attached to this counter.
    labels: dict[str, str] = field(default_factory=dict)

    def inc(self, amount: float = 1.0) -> None:
        """Add ``amount`` (1.0 when omitted) to the running total."""
        self.value = self.value + amount

    def reset(self) -> None:
        """Put the running total back to ``0.0``."""
        self.value = 0.0

42 

43 

@dataclass
class Gauge:
    """Point-in-time metric whose value may move in either direction."""

    # Metric name used as the registry key and in exported output.
    name: str
    # Human-readable description of what is being measured.
    help_text: str
    # Most recently recorded value.
    value: float = 0.0
    # Label name -> label value attached to this gauge.
    labels: dict[str, str] = field(default_factory=dict)

    def set(self, value: float) -> None:
        """Overwrite the current reading with ``value``."""
        self.value = value

    def inc(self, amount: float = 1.0) -> None:
        """Raise the current reading by ``amount`` (1.0 when omitted)."""
        self.value = self.value + amount

    def dec(self, amount: float = 1.0) -> None:
        """Lower the current reading by ``amount`` (1.0 when omitted)."""
        self.value = self.value - amount

64 

65 

@dataclass
class Histogram:
    """Distribution metric built on cumulative upper-bound buckets.

    Each observation increments every bucket whose upper bound is greater
    than or equal to the observed value, plus an always-present
    ``float("inf")`` bucket, so each bucket count is "observations <= bound".
    """

    # Metric name used as the registry key and in exported output.
    name: str
    # Human-readable description of what is being observed.
    help_text: str
    # Upper bounds of the finite buckets.
    buckets: list[float] = field(
        default_factory=lambda: [0.1, 0.5, 1.0, 2.5, 5.0, 10.0]
    )
    # Upper bound -> cumulative observation count; zeroed in __post_init__.
    counts: dict[float, int] = field(default_factory=dict)
    # Sum of all observed values.
    sum_value: float = 0.0
    # Number of observations recorded.
    count: int = 0
    # Label name -> label value attached to this histogram.
    labels: dict[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # Zero an entry for every configured bound, then the catch-all bucket.
        self.counts.update(dict.fromkeys(self.buckets, 0))
        self.counts[float("inf")] = 0

    def observe(self, value: float) -> None:
        """Record one observation of ``value``."""
        self.count += 1
        self.sum_value += value

        # Cumulative semantics: bump every bucket that can contain the value.
        for upper_bound in (bound for bound in self.buckets if value <= bound):
            self.counts[upper_bound] += 1
        # The infinite bucket counts every observation.
        self.counts[float("inf")] += 1

    def reset(self) -> None:
        """Zero all bucket counts, the running sum and the observation count."""
        fresh_counts = dict.fromkeys(self.buckets, 0)
        fresh_counts[float("inf")] = 0
        self.counts = fresh_counts
        self.sum_value = 0.0
        self.count = 0

    @property
    def average(self) -> float:
        """Mean of the observed values, or ``0.0``-based when nothing was observed.

        The divisor is clamped to at least 1 so an empty histogram does not
        divide by zero.
        """
        divisor = max(self.count, 1)
        return self.sum_value / divisor

108 

109 

class MetricsCollector:
    """Collects and manages metrics for canvas server monitoring.

    The collector keeps three registries keyed by metric name (counters,
    gauges and histograms), a bounded history of gauge samples (at most 100
    ``MetricPoint`` entries per gauge), and an optional background task that
    periodically refreshes process and canvas gauges.

    Each metric name maps to exactly one metric object with a single label
    dict; label combinations are not tracked as separate series.
    """

    def __init__(self) -> None:
        self._counters: dict[str, Counter] = {}
        self._gauges: dict[str, Gauge] = {}
        self._histograms: dict[str, Histogram] = {}

        def _create_deque() -> deque[MetricPoint]:
            # Bounded so the per-gauge history cannot grow without limit.
            return deque(maxlen=100)

        self._history: dict[str, deque[MetricPoint]] = defaultdict(_create_deque)
        self._collection_task: asyncio.Task[Any] | None = None
        self._running = False
        self._lock = asyncio.Lock()
        # psutil handle reused between collection passes; cpu_percent() only
        # yields a meaningful number from the second call on the same handle.
        self._process: psutil.Process | None = None

        # Initialize standard metrics
        self._initialize_standard_metrics()

    def _initialize_standard_metrics(self) -> None:
        """Register the built-in metrics for canvas server monitoring."""

        # HTTP request metrics
        self.register_counter(
            "http_requests_total",
            "Total number of HTTP requests",
            {"method": "GET", "endpoint": "/health", "status": "200"},
        )
        self.register_counter(
            "http_request_errors_total",
            "Total number of HTTP request errors",
            {"method": "GET", "endpoint": "/health"},
        )
        self.register_histogram(
            "http_request_duration_seconds",
            "HTTP request duration in seconds",
            [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0],
        )

        # Health check metrics
        self.register_counter(
            "health_checks_total", "Total number of health checks performed"
        )
        self.register_counter(
            "health_check_failures_total", "Total number of failed health checks"
        )
        self.register_gauge(
            "health_check_consecutive_failures",
            "Number of consecutive health check failures",
        )
        # No explicit buckets: the Histogram default bucket list applies.
        self.register_histogram(
            "health_check_duration_seconds", "Health check duration in seconds"
        )

        # Process metrics
        self.register_gauge("process_cpu_percent", "Process CPU usage percentage")
        self.register_gauge("process_memory_bytes", "Process memory usage in bytes")
        self.register_gauge("process_memory_percent", "Process memory usage percentage")
        self.register_gauge("process_threads_count", "Number of process threads")

        # Canvas server metrics
        self.register_counter(
            "canvas_restarts_total", "Total number of canvas server restarts"
        )
        self.register_gauge("canvas_uptime_seconds", "Canvas server uptime in seconds")
        self.register_gauge(
            "canvas_elements_count", "Current number of elements on canvas"
        )

        # Circuit breaker metrics
        self.register_counter(
            "circuit_breaker_state_changes_total", "Circuit breaker state changes"
        )
        self.register_counter(
            "circuit_breaker_calls_total", "Circuit breaker total calls"
        )
        self.register_counter(
            "circuit_breaker_failures_total", "Circuit breaker failures"
        )
        self.register_counter(
            "circuit_breaker_rejections_total", "Circuit breaker rejections"
        )

    def register_counter(
        self, name: str, help_text: str, labels: dict[str, str] | None = None
    ) -> Counter:
        """Register a new counter metric, replacing any counter with that name.

        Args:
            name: Registry key and exported metric name.
            help_text: Human-readable description.
            labels: Optional initial labels; copied so later label updates on
                the counter do not mutate the caller's dict.

        Returns:
            The newly created counter.
        """
        counter = Counter(name, help_text, labels=dict(labels) if labels else {})
        self._counters[name] = counter
        return counter

    def register_gauge(
        self, name: str, help_text: str, labels: dict[str, str] | None = None
    ) -> Gauge:
        """Register a new gauge metric, replacing any gauge with that name.

        Args:
            name: Registry key and exported metric name.
            help_text: Human-readable description.
            labels: Optional initial labels; copied so later label updates on
                the gauge do not mutate the caller's dict.

        Returns:
            The newly created gauge.
        """
        gauge = Gauge(name, help_text, labels=dict(labels) if labels else {})
        self._gauges[name] = gauge
        return gauge

    def register_histogram(
        self, name: str, help_text: str, buckets: list[float] | None = None
    ) -> Histogram:
        """Register a new histogram metric, replacing any with that name.

        Args:
            name: Registry key and exported metric name.
            help_text: Human-readable description.
            buckets: Upper bounds for the finite buckets. When ``None`` the
                ``Histogram`` default bucket list is used; an explicit list
                (including an empty one) is used as given.

        Returns:
            The newly created histogram.
        """
        if buckets is None:
            # Let the dataclass default_factory supply the standard buckets
            # instead of overriding it with an empty list.
            histogram = Histogram(name, help_text)
        else:
            histogram = Histogram(name, help_text, buckets=list(buckets))
        self._histograms[name] = histogram
        return histogram

    def get_counter(self, name: str) -> Counter | None:
        """Return the counter registered under ``name``, or ``None``."""
        return self._counters.get(name)

    def get_gauge(self, name: str) -> Gauge | None:
        """Return the gauge registered under ``name``, or ``None``."""
        return self._gauges.get(name)

    def get_histogram(self, name: str) -> Histogram | None:
        """Return the histogram registered under ``name``, or ``None``."""
        return self._histograms.get(name)

    def increment_counter(
        self, name: str, amount: float = 1.0, labels: dict[str, str] | None = None
    ) -> None:
        """Increment a registered counter; unknown names are ignored.

        Args:
            name: Name of the counter to increment.
            amount: Amount to add.
            labels: Optional labels merged into the counter's label dict.
        """
        counter = self._counters.get(name)
        if counter:
            if labels:
                counter.labels.update(labels)
            counter.inc(amount)

    def set_gauge(
        self, name: str, value: float, labels: dict[str, str] | None = None
    ) -> None:
        """Set a registered gauge to ``value``; unknown names are ignored.

        Args:
            name: Name of the gauge to set.
            value: New gauge value.
            labels: Optional labels merged into the gauge's label dict.
        """
        gauge = self._gauges.get(name)
        if gauge:
            if labels:
                gauge.labels.update(labels)
            gauge.set(value)

    def observe_histogram(
        self, name: str, value: float, labels: dict[str, str] | None = None
    ) -> None:
        """Record an observation on a registered histogram.

        Unknown names are ignored.

        Args:
            name: Name of the histogram to update.
            value: Observed value.
            labels: Optional labels merged into the histogram's label dict.
        """
        histogram = self._histograms.get(name)
        if histogram:
            if labels:
                histogram.labels.update(labels)
            histogram.observe(value)

    async def collect_system_metrics(self) -> None:
        """Refresh the process gauges for the managed canvas process.

        Does nothing when resource monitoring is disabled in the config or
        when the process manager reports no PID. Failures are logged rather
        than raised.
        """
        if not config.monitoring.resource_monitoring_enabled:
            return

        try:
            from ..process_manager import process_manager

            pid = process_manager.process_pid
            if not pid:
                # No managed process: drop any stale handle.
                self._process = None
                return

            try:
                # Reuse the handle across passes. A brand-new psutil.Process
                # always reports 0.0 from its first cpu_percent() call, so
                # recreating it every pass would pin the gauge at zero.
                process = self._process
                if process is None or process.pid != pid:
                    process = psutil.Process(pid)
                    self._process = process

                self.set_gauge("process_cpu_percent", process.cpu_percent())

                memory_info = process.memory_info()
                self.set_gauge("process_memory_bytes", memory_info.rss)
                self.set_gauge("process_memory_percent", process.memory_percent())
                self.set_gauge("process_threads_count", process.num_threads())

                # Uptime is wall-clock time since the process was created.
                uptime = time.time() - process.create_time()
                self.set_gauge("canvas_uptime_seconds", uptime)

            except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
                # The handle is no longer usable; rebuild it next pass.
                self._process = None
                logger.warning(f"Failed to collect process metrics: {e}")

        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

    async def collect_canvas_metrics(self) -> None:
        """Refresh ``canvas_elements_count`` from the canvas HTTP API.

        Any failure is logged at debug level and otherwise ignored, so an
        unreachable canvas server does not break the collection loop.
        """
        try:
            from ..http_client import http_client

            # Get element count
            elements = await http_client.get_json("/api/elements")
            if elements is not None:
                # The response is expected to be a mapping with an 'elements'
                # key holding the list; anything without .get() counts as empty.
                element_list: list[Any] = (
                    elements.get("elements", []) if hasattr(elements, "get") else []
                )
                self.set_gauge("canvas_elements_count", len(element_list))

        except Exception as e:
            logger.debug(f"Could not collect canvas metrics: {e}")

    async def start_collection(self) -> None:
        """Start the background collection task if it is not already running."""
        if self._running:
            return

        self._running = True
        self._collection_task = asyncio.create_task(self._collection_loop())
        logger.info("Metrics collection started")

    async def stop_collection(self) -> None:
        """Cancel the background collection task and wait for it to finish."""
        if not self._running:
            return

        self._running = False
        task = self._collection_task
        if task:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
            # Do not keep a reference to the finished task.
            self._collection_task = None
        logger.info("Metrics collection stopped")

    async def _collection_loop(self) -> None:
        """Collect metrics repeatedly until stopped or cancelled."""
        while self._running:
            try:
                async with self._lock:
                    if config.monitoring.metrics_enabled:
                        await self.collect_system_metrics()
                        await self.collect_canvas_metrics()

                        # Snapshot every gauge into its bounded history.
                        timestamp = time.time()
                        for name, gauge in self._gauges.items():
                            self._history[name].append(
                                MetricPoint(timestamp, gauge.value, gauge.labels.copy())
                            )

                # Sleep outside the lock so it is not held between passes.
                await asyncio.sleep(
                    config.monitoring.metrics_collection_interval_seconds
                )

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in metrics collection loop: {e}")
                await asyncio.sleep(5)  # Brief pause before retrying

    def get_all_metrics(self) -> dict[str, Any]:
        """Return a snapshot of all metrics as nested plain dicts.

        Returns:
            A dict with ``"counters"``, ``"gauges"`` and ``"histograms"``
            keys, each mapping metric name to its values, help text and
            labels. Label and bucket dicts are copies, so mutating the
            result does not affect the live metrics.
        """
        return {
            "counters": {
                name: {
                    "value": counter.value,
                    "help": counter.help_text,
                    "labels": counter.labels.copy(),
                }
                for name, counter in self._counters.items()
            },
            "gauges": {
                name: {
                    "value": gauge.value,
                    "help": gauge.help_text,
                    "labels": gauge.labels.copy(),
                }
                for name, gauge in self._gauges.items()
            },
            "histograms": {
                name: {
                    "count": hist.count,
                    "sum": hist.sum_value,
                    "average": hist.average,
                    "buckets": hist.counts.copy(),
                    "help": hist.help_text,
                    "labels": hist.labels.copy(),
                }
                for name, hist in self._histograms.items()
            },
        }

    @staticmethod
    def _format_labels(labels: dict[str, str]) -> str:
        """Render labels as comma-separated ``key="value"`` pairs.

        Backslashes, double quotes and newlines in values are escaped as the
        Prometheus text exposition format requires.
        """
        rendered: list[str] = []
        for key, value in labels.items():
            escaped = (
                str(value)
                .replace("\\", "\\\\")
                .replace('"', '\\"')
                .replace("\n", "\\n")
            )
            rendered.append(f'{key}="{escaped}"')
        return ",".join(rendered)

    @staticmethod
    def _format_sample(name: str, label_str: str, value: float) -> str:
        """Render one sample line, omitting the braces when there are no labels."""
        if label_str:
            return f"{name}{{{label_str}}} {value}"
        return f"{name} {value}"

    def get_prometheus_format(self) -> str:
        """Export all metrics in the Prometheus text exposition format.

        Returns:
            Newline-terminated text with ``# HELP`` / ``# TYPE`` headers and
            sample lines for every counter, gauge and histogram. Histograms
            emit cumulative ``_bucket`` series (with an ``le`` label),
            ``_count``, ``_sum`` and an extra ``_average`` sample.
        """
        lines: list[str] = []

        # Counters
        for name, counter in self._counters.items():
            lines.extend(
                (f"# HELP {name} {counter.help_text}", f"# TYPE {name} counter")
            )
            lines.append(
                self._format_sample(
                    name, self._format_labels(counter.labels), counter.value
                )
            )

        # Gauges
        for name, gauge in self._gauges.items():
            lines.extend((f"# HELP {name} {gauge.help_text}", f"# TYPE {name} gauge"))
            lines.append(
                self._format_sample(name, self._format_labels(gauge.labels), gauge.value)
            )

        # Histograms
        for name, hist in self._histograms.items():
            lines.extend(
                (f"# HELP {name} {hist.help_text}", f"# TYPE {name} histogram")
            )
            label_str = self._format_labels(hist.labels)

            # Buckets in ascending order; the infinite bucket sorts last and
            # is spelled "+Inf" in the exposition format.
            for upper_bound, bucket_count in sorted(hist.counts.items()):
                le = "+Inf" if upper_bound == float("inf") else str(upper_bound)
                le_label = f'le="{le}"'
                bucket_labels = f"{label_str},{le_label}" if label_str else le_label
                lines.append(
                    self._format_sample(f"{name}_bucket", bucket_labels, bucket_count)
                )

            # The suffix belongs to the metric name, so it goes before the
            # label set: name_count{...}, not name{...}_count.
            lines.extend(
                (
                    self._format_sample(f"{name}_count", label_str, hist.count),
                    self._format_sample(f"{name}_sum", label_str, hist.sum_value),
                    # Not a standard histogram series; kept because this
                    # exporter has always emitted it.
                    self._format_sample(f"{name}_average", label_str, hist.average),
                )
            )

        return "\n".join(lines) + "\n"

    def reset_all_metrics(self) -> None:
        """Zero every counter, gauge and histogram and clear gauge history."""
        for counter in self._counters.values():
            counter.reset()
        for gauge in self._gauges.values():
            gauge.set(0.0)
        for histogram in self._histograms.values():
            histogram.reset()
        self._history.clear()
        logger.info("All metrics reset")

    def get_metric_history(
        self, name: str, limit: int | None = None
    ) -> list[MetricPoint]:
        """Return recorded history for a metric, oldest first.

        Args:
            name: Metric name; unknown names yield an empty list.
            limit: When truthy, only the last ``limit`` points are returned.
                ``None`` and ``0`` both return the full history.

        Returns:
            A new list of ``MetricPoint`` objects.
        """
        # .get() avoids creating an empty deque in the defaultdict.
        history = list(self._history.get(name, []))
        if limit:
            history = history[-limit:]
        return history

450 return history