Coverage for excalidraw_mcp/monitoring/metrics.py: 77%
230 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-16 08:08 -0700
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-16 08:08 -0700
1"""Metrics collection system for monitoring canvas server performance."""
3import asyncio
4import logging
5import time
6from collections import defaultdict, deque
7from dataclasses import dataclass, field
8from typing import Any
10import psutil
12from ..config import config
14logger = logging.getLogger(__name__)
@dataclass
class MetricPoint:
    """One timestamped sample of a metric.

    Attributes are positional in the order ``timestamp``, ``value``,
    ``labels``; ``labels`` defaults to a fresh empty dict per instance.
    """

    # Moment the sample was taken (the collector passes ``time.time()``).
    timestamp: float
    # Sampled metric value.
    value: float
    # Label key/value pairs attached to the sample.
    labels: dict[str, str] = field(default_factory=dict)
@dataclass
class Counter:
    """Running-total metric with a name, help text and optional labels.

    Intended to only ever grow; note that ``inc`` does not validate the
    sign of the amount it is given.
    """

    name: str
    help_text: str
    value: float = 0.0
    labels: dict[str, str] = field(default_factory=dict)

    def inc(self, amount: float = 1.0) -> None:
        """Add ``amount`` (1.0 when omitted) to the running total."""
        self.value = self.value + amount

    def reset(self) -> None:
        """Put the running total back to zero."""
        self.value = 0.0
@dataclass
class Gauge:
    """Point-in-time metric whose value may move in either direction."""

    name: str
    help_text: str
    value: float = 0.0
    labels: dict[str, str] = field(default_factory=dict)

    def set(self, value: float) -> None:
        """Overwrite the current reading with ``value``."""
        self.value = value

    def inc(self, amount: float = 1.0) -> None:
        """Raise the current reading by ``amount`` (1.0 when omitted)."""
        self.value = self.value + amount

    def dec(self, amount: float = 1.0) -> None:
        """Lower the current reading by ``amount`` (1.0 when omitted)."""
        self.value = self.value - amount
@dataclass
class Histogram:
    """Distribution metric backed by cumulative bucket counters.

    ``counts`` maps each upper bound in ``buckets`` (plus ``float("inf")``)
    to the number of observations that were less than or equal to it, so
    an observation is counted in every bucket whose bound it fits under.
    """

    name: str
    help_text: str
    buckets: list[float] = field(
        default_factory=lambda: [0.1, 0.5, 1.0, 2.5, 5.0, 10.0]
    )
    counts: dict[float, int] = field(default_factory=dict)
    sum_value: float = 0.0
    count: int = 0
    labels: dict[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Zero the counter of every configured bucket and of ``inf``."""
        # Any pre-supplied count for a configured bound is overwritten with 0;
        # keys that are not configured bounds are left untouched.
        self.counts.update(dict.fromkeys(self.buckets, 0))
        self.counts[float("inf")] = 0

    def observe(self, value: float) -> None:
        """Fold one observation into the sum, the count and the buckets."""
        self.sum_value = self.sum_value + value
        self.count = self.count + 1

        # Cumulative semantics: bump every bucket whose bound covers the value.
        for upper_bound in (b for b in self.buckets if value <= b):
            self.counts[upper_bound] += 1
        # The catch-all bucket is bumped unconditionally.
        self.counts[float("inf")] += 1

    def reset(self) -> None:
        """Discard all observations, leaving a fresh zeroed bucket table."""
        self.counts = dict.fromkeys([*self.buckets, float("inf")], 0)
        self.sum_value = 0.0
        self.count = 0

    @property
    def average(self) -> float:
        """Mean of the observed values; 0-observation histograms divide by 1."""
        divisor = max(self.count, 1)
        return self.sum_value / divisor
class MetricsCollector:
    """Collects and manages metrics for canvas server monitoring.

    The collector owns three name-keyed registries (counters, gauges and
    histograms), a bounded per-gauge history, and an optional background
    task that periodically samples process and canvas state.

    Each registered name maps to exactly one metric object. Labels passed
    to ``increment_counter`` / ``set_gauge`` / ``observe_histogram`` are
    merged into that object's single label dict; they do not create
    separate labelled series.
    """

    def __init__(self) -> None:
        """Create empty registries and register the standard metric set."""
        self._counters: dict[str, Counter] = {}
        self._gauges: dict[str, Gauge] = {}
        self._histograms: dict[str, Histogram] = {}

        # Per-gauge ring buffer: each deque keeps only the 100 newest points.
        self._history: dict[str, deque[MetricPoint]] = defaultdict(
            lambda: deque(maxlen=100)
        )
        self._collection_task: asyncio.Task[Any] | None = None
        self._running = False
        self._lock = asyncio.Lock()
        # psutil handle reused across collection cycles; see
        # collect_system_metrics for why it must not be recreated each time.
        self._process_handle: Any = None

        # Initialize standard metrics
        self._initialize_standard_metrics()

    def _initialize_standard_metrics(self) -> None:
        """Initialize standard metrics for canvas server monitoring."""
        # HTTP request metrics
        self.register_counter(
            "http_requests_total",
            "Total number of HTTP requests",
            {"method": "GET", "endpoint": "/health", "status": "200"},
        )
        self.register_counter(
            "http_request_errors_total",
            "Total number of HTTP request errors",
            {"method": "GET", "endpoint": "/health"},
        )
        self.register_histogram(
            "http_request_duration_seconds",
            "HTTP request duration in seconds",
            [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0],
        )

        # Health check metrics
        self.register_counter(
            "health_checks_total", "Total number of health checks performed"
        )
        self.register_counter(
            "health_check_failures_total", "Total number of failed health checks"
        )
        self.register_gauge(
            "health_check_consecutive_failures",
            "Number of consecutive health check failures",
        )
        self.register_histogram(
            "health_check_duration_seconds", "Health check duration in seconds"
        )

        # Process metrics
        self.register_gauge("process_cpu_percent", "Process CPU usage percentage")
        self.register_gauge("process_memory_bytes", "Process memory usage in bytes")
        self.register_gauge("process_memory_percent", "Process memory usage percentage")
        self.register_gauge("process_threads_count", "Number of process threads")

        # Canvas server metrics
        self.register_counter(
            "canvas_restarts_total", "Total number of canvas server restarts"
        )
        self.register_gauge("canvas_uptime_seconds", "Canvas server uptime in seconds")
        self.register_gauge(
            "canvas_elements_count", "Current number of elements on canvas"
        )

        # Circuit breaker metrics
        self.register_counter(
            "circuit_breaker_state_changes_total", "Circuit breaker state changes"
        )
        self.register_counter(
            "circuit_breaker_calls_total", "Circuit breaker total calls"
        )
        self.register_counter(
            "circuit_breaker_failures_total", "Circuit breaker failures"
        )
        self.register_counter(
            "circuit_breaker_rejections_total", "Circuit breaker rejections"
        )

    def register_counter(
        self, name: str, help_text: str, labels: dict[str, str] | None = None
    ) -> Counter:
        """Register a new counter metric, replacing any counter named ``name``.

        Returns:
            The newly created counter.
        """
        counter = Counter(name, help_text, labels=labels or {})
        self._counters[name] = counter
        return counter

    def register_gauge(
        self, name: str, help_text: str, labels: dict[str, str] | None = None
    ) -> Gauge:
        """Register a new gauge metric, replacing any gauge named ``name``.

        Returns:
            The newly created gauge.
        """
        gauge = Gauge(name, help_text, labels=labels or {})
        self._gauges[name] = gauge
        return gauge

    def register_histogram(
        self, name: str, help_text: str, buckets: list[float] | None = None
    ) -> Histogram:
        """Register a new histogram metric, replacing any named ``name``.

        Args:
            name: Registry key and exported metric name.
            help_text: Human-readable description.
            buckets: Upper bounds for the buckets. When omitted, the
                histogram's own default bounds are used.

        Returns:
            The newly created histogram.
        """
        if buckets is None:
            # Let the dataclass default_factory supply its bounds; passing an
            # empty list here would silently leave only the +Inf bucket.
            histogram = Histogram(name, help_text)
        else:
            # Copy so later mutation of the caller's list cannot desync
            # ``buckets`` from the ``counts`` table built at construction.
            histogram = Histogram(name, help_text, buckets=list(buckets))
        self._histograms[name] = histogram
        return histogram

    def get_counter(self, name: str) -> Counter | None:
        """Get counter by name, or ``None`` when it is not registered."""
        return self._counters.get(name)

    def get_gauge(self, name: str) -> Gauge | None:
        """Get gauge by name, or ``None`` when it is not registered."""
        return self._gauges.get(name)

    def get_histogram(self, name: str) -> Histogram | None:
        """Get histogram by name, or ``None`` when it is not registered."""
        return self._histograms.get(name)

    def increment_counter(
        self, name: str, amount: float = 1.0, labels: dict[str, str] | None = None
    ) -> None:
        """Increment a counter metric; unknown names are silently ignored.

        Any ``labels`` given are merged into the counter's label dict.
        """
        counter = self._counters.get(name)
        if counter is None:
            return
        if labels:
            counter.labels.update(labels)
        counter.inc(amount)

    def set_gauge(
        self, name: str, value: float, labels: dict[str, str] | None = None
    ) -> None:
        """Set a gauge metric value; unknown names are silently ignored.

        Any ``labels`` given are merged into the gauge's label dict.
        """
        gauge = self._gauges.get(name)
        if gauge is None:
            return
        if labels:
            gauge.labels.update(labels)
        gauge.set(value)

    def observe_histogram(
        self, name: str, value: float, labels: dict[str, str] | None = None
    ) -> None:
        """Record a histogram observation; unknown names are silently ignored.

        Any ``labels`` given are merged into the histogram's label dict.
        """
        histogram = self._histograms.get(name)
        if histogram is None:
            return
        if labels:
            histogram.labels.update(labels)
        histogram.observe(value)

    async def collect_system_metrics(self) -> None:
        """Collect process metrics for the managed canvas server process.

        Does nothing when resource monitoring is disabled in the config or
        when the process manager reports no PID. All failures are logged
        and swallowed so a bad sample never stops the collection loop.
        """
        if not config.monitoring.resource_monitoring_enabled:
            return

        try:
            from ..process_manager import process_manager

            pid = process_manager.process_pid
            if not pid:
                self._process_handle = None
                return

            try:
                # cpu_percent() without an interval measures usage since the
                # previous call on the *same* Process object and returns 0.0
                # on the first call, so the handle is kept between cycles and
                # only rebuilt when the PID changes.
                process = self._process_handle
                if process is None or process.pid != pid:
                    process = psutil.Process(pid)
                    self._process_handle = process

                self.set_gauge("process_cpu_percent", process.cpu_percent())

                memory_info = process.memory_info()
                self.set_gauge("process_memory_bytes", memory_info.rss)
                self.set_gauge("process_memory_percent", process.memory_percent())
                self.set_gauge("process_threads_count", process.num_threads())

                # Calculate uptime
                uptime = time.time() - process.create_time()
                self.set_gauge("canvas_uptime_seconds", uptime)

            except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
                # Drop the stale handle so the next cycle starts fresh.
                self._process_handle = None
                logger.warning(f"Failed to collect process metrics: {e}")

        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

    async def collect_canvas_metrics(self) -> None:
        """Collect canvas-specific metrics (currently the element count).

        Failures are logged at debug level and otherwise ignored.
        """
        try:
            from ..http_client import http_client

            # Get element count
            elements = await http_client.get_json("/api/elements")
            if elements is not None:
                # Elements is a dict with an 'elements' key containing the list
                element_list: list[Any] = (
                    elements.get("elements", []) if hasattr(elements, "get") else []
                )
                self.set_gauge("canvas_elements_count", len(element_list))

        except Exception as e:
            logger.debug(f"Could not collect canvas metrics: {e}")

    async def start_collection(self) -> None:
        """Start automatic metrics collection; no-op when already running."""
        if self._running:
            return

        self._running = True
        self._collection_task = asyncio.create_task(self._collection_loop())
        logger.info("Metrics collection started")

    async def stop_collection(self) -> None:
        """Stop automatic metrics collection; no-op when not running."""
        if not self._running:
            return

        self._running = False
        if self._collection_task:
            self._collection_task.cancel()
            try:
                await self._collection_task
            except asyncio.CancelledError:
                # Expected: the task was cancelled on the line above.
                pass
        logger.info("Metrics collection stopped")

    async def _collection_loop(self) -> None:
        """Main metrics collection loop.

        Each cycle samples system and canvas metrics, appends a snapshot of
        every gauge to its history, then sleeps for the configured interval.
        Unexpected errors are logged and followed by a 5 second pause.
        """
        while self._running:
            try:
                async with self._lock:
                    if config.monitoring.metrics_enabled:
                        await self.collect_system_metrics()
                        await self.collect_canvas_metrics()

                        # Store historical data
                        timestamp = time.time()
                        for name, gauge in self._gauges.items():
                            self._history[name].append(
                                MetricPoint(timestamp, gauge.value, gauge.labels.copy())
                            )

                await asyncio.sleep(
                    config.monitoring.metrics_collection_interval_seconds
                )

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in metrics collection loop: {e}")
                await asyncio.sleep(5)  # Brief pause before retrying

    def get_all_metrics(self) -> dict[str, Any]:
        """Get all metrics in a structured format.

        Returns:
            A dict with ``counters``, ``gauges`` and ``histograms`` keys, each
            mapping metric names to their current values. Label and bucket
            dicts are copies, so mutating the result cannot alter the
            collector's state.
        """
        return {
            "counters": {
                name: {
                    "value": counter.value,
                    "help": counter.help_text,
                    "labels": dict(counter.labels),
                }
                for name, counter in self._counters.items()
            },
            "gauges": {
                name: {
                    "value": gauge.value,
                    "help": gauge.help_text,
                    "labels": dict(gauge.labels),
                }
                for name, gauge in self._gauges.items()
            },
            "histograms": {
                name: {
                    "count": hist.count,
                    "sum": hist.sum_value,
                    "average": hist.average,
                    "buckets": hist.counts.copy(),
                    "help": hist.help_text,
                    "labels": dict(hist.labels),
                }
                for name, hist in self._histograms.items()
            },
        }

    @staticmethod
    def _render_labels(labels: dict[str, str], **extra: str) -> str:
        """Render ``labels`` (plus ``extra``) as ``{k="v",...}`` or ``""``."""
        merged = {**labels, **extra}
        if not merged:
            return ""

        def escape(raw: object) -> str:
            # Backslash, newline and double quote must be escaped inside a
            # label value in the text exposition format.
            return (
                str(raw).replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
            )

        pairs = ",".join(f'{key}="{escape(val)}"' for key, val in merged.items())
        return "{" + pairs + "}"

    @staticmethod
    def _scalar_lines(
        name: str, kind: str, help_text: str, label_str: str, value: float
    ) -> list[str]:
        """Build the HELP, TYPE and sample lines for a counter or gauge."""
        return [
            f"# HELP {name} {help_text}",
            f"# TYPE {name} {kind}",
            f"{name}{label_str} {value}",
        ]

    def get_prometheus_format(self) -> str:
        """Export metrics in Prometheus text exposition format.

        Counters and gauges produce one sample each. Histograms produce one
        ``_bucket`` sample per bucket (``le`` label, ``+Inf`` for the
        catch-all) followed by ``_count``, ``_sum`` and the non-standard
        ``_average`` sample. Suffixes are placed before the label braces.

        Returns:
            The newline-joined exposition text, always ending in a newline.
        """
        lines: list[str] = []

        # Counters
        for name, counter in self._counters.items():
            lines.extend(
                self._scalar_lines(
                    name,
                    "counter",
                    counter.help_text,
                    self._render_labels(counter.labels),
                    counter.value,
                )
            )

        # Gauges
        for name, gauge in self._gauges.items():
            lines.extend(
                self._scalar_lines(
                    name,
                    "gauge",
                    gauge.help_text,
                    self._render_labels(gauge.labels),
                    gauge.value,
                )
            )

        # Histograms
        for name, hist in self._histograms.items():
            lines.append(f"# HELP {name} {hist.help_text}")
            lines.append(f"# TYPE {name} histogram")

            # Bucket counts are already cumulative; sort so +Inf comes last.
            for bound, hits in sorted(hist.counts.items()):
                le = "+Inf" if bound == float("inf") else str(bound)
                bucket_labels = self._render_labels(hist.labels, le=le)
                lines.append(f"{name}_bucket{bucket_labels} {hits}")

            label_str = self._render_labels(hist.labels)
            lines.extend(
                (
                    f"{name}_count{label_str} {hist.count}",
                    f"{name}_sum{label_str} {hist.sum_value}",
                    f"{name}_average{label_str} {hist.average}",
                )
            )

        return "\n".join(lines) + "\n"

    def reset_all_metrics(self) -> None:
        """Reset all metrics to initial state and clear the gauge history."""
        for counter in self._counters.values():
            counter.reset()
        for gauge in self._gauges.values():
            gauge.set(0.0)
        for histogram in self._histograms.values():
            histogram.reset()
        self._history.clear()
        logger.info("All metrics reset")

    def get_metric_history(
        self, name: str, limit: int | None = None
    ) -> list[MetricPoint]:
        """Get historical data for a metric, oldest first.

        Args:
            name: Metric name to look up.
            limit: Maximum number of most-recent points to return. ``None``
                returns everything recorded; zero or a negative number
                returns an empty list.

        Returns:
            A new list of points; empty when nothing is recorded for ``name``.
        """
        # .get() avoids creating an empty deque in the defaultdict on a miss.
        history = list(self._history.get(name, ()))
        if limit is None:
            return history
        if limit <= 0:
            return []
        return history[-limit:]