vmm/logger/metrics.rs
1// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Defines the metrics system.
5//!
6//! # Metrics format
//! The metrics are flushed in JSON format every 60 seconds. The first field will always be the
8//! timestamp followed by the JSON representation of the structures representing each component on
9//! which we are capturing specific metrics.
10//!
11//! ## JSON example with metrics:
12//! ```json
13//! {
14//! "utc_timestamp_ms": 1541591155180,
15//! "api_server": {
16//! "process_startup_time_us": 0,
17//! "process_startup_time_cpu_us": 0
18//! },
19//! "block": {
20//! "activate_fails": 0,
21//! "cfg_fails": 0,
22//! "event_fails": 0,
23//! "flush_count": 0,
24//! "queue_event_count": 0,
25//! "read_count": 0,
26//! "write_count": 0
27//! }
28//! }
29//! ```
30//! The example above means that inside the structure representing all the metrics there is a field
31//! named `block` which is in turn a serializable child structure collecting metrics for
32//! the block device such as `activate_fails`, `cfg_fails`, etc.
33//!
34//! # Limitations
35//! Metrics are only written to buffers.
36//!
37//! # Design
38//! The main design goals of this system are:
39//! * Use lockless operations, preferably ones that don't require anything other than simple
40//! reads/writes being atomic.
41//! * Exploit interior mutability and atomics being Sync to allow all methods (including the ones
42//! which are effectively mutable) to be callable on a global non-mut static.
43//! * Rely on `serde` to provide the actual serialization for writing the metrics.
44//! * Since all metrics start at 0, we implement the `Default` trait via derive for all of them, to
45//! avoid having to initialize everything by hand.
46//!
47//! The system implements 2 types of metrics:
//! * Shared Incremental Metrics (SharedIncMetrics) - dedicated for the metrics which need a counter
//!   (e.g. the number of times an API request failed). These metrics are reset upon flush.
//! * Shared Store Metrics (SharedStoreMetrics) - targeted at keeping a persistent value; they are
//!   not intended to act as a counter (e.g. for measuring the process start-up time).
52//!
53//! The current approach for the `SharedIncMetrics` type is to store two values (current and
54//! previous) and compute the delta between them each time we do a flush (i.e by serialization).
55//! There are a number of advantages to this approach, including:
//! * We don't have to introduce an additional write (to reset the value) from the thread which does
//!   the actual writing, so less synchronization effort is required.
//! * We don't have to worry all that much about losing some data if writing fails for a while
//!   (this could be a concern, I guess).
60//!
//! If it turns out this approach is not really what we want, it's pretty easy to resort to
//! something else, while working behind the same interface.
63
64use std::fmt::Debug;
65use std::io::Write;
66use std::ops::Deref;
67use std::sync::atomic::{AtomicU64, Ordering};
68use std::sync::{Mutex, OnceLock};
69
70use serde::{Serialize, Serializer};
71use utils::time::{ClockType, get_time_ns, get_time_us};
72
73use super::FcLineWriter;
74use crate::devices::legacy;
75use crate::devices::virtio::balloon::metrics as balloon_metrics;
76use crate::devices::virtio::block::virtio::metrics as block_metrics;
77use crate::devices::virtio::mem::metrics as virtio_mem_metrics;
78use crate::devices::virtio::net::metrics as net_metrics;
79use crate::devices::virtio::pmem::metrics as pmem_metrics;
80use crate::devices::virtio::rng::metrics as entropy_metrics;
81use crate::devices::virtio::vhost_user_metrics;
82use crate::devices::virtio::vsock::metrics as vsock_metrics;
83
84/// Static instance used for handling metrics.
85pub static METRICS: Metrics<FirecrackerMetrics, FcLineWriter> =
86 Metrics::<FirecrackerMetrics, FcLineWriter>::new(FirecrackerMetrics::new());
87
/// Metrics system.
///
/// Pairs the application-defined metrics tree (`T`) with the writer (`M`) that the serialized
/// metrics are flushed to.
// All member fields have types which are Sync, and exhibit interior mutability, so
// we can call operations on metrics using a non-mut static global variable.
#[derive(Debug)]
pub struct Metrics<T: Serialize, M: Write + Send> {
    // Metrics will get flushed here. Stays empty until `init` installs the destination; the
    // `OnceLock` only ever accepts a single value.
    metrics_buf: OnceLock<Mutex<M>>,
    /// The metrics tree that gets serialized on every flush.
    pub app_metrics: T,
}
97
98impl<T: Serialize + Debug, M: Write + Send + Debug> Metrics<T, M> {
99 /// Creates a new instance of the current metrics.
100 pub const fn new(app_metrics: T) -> Metrics<T, M> {
101 Metrics {
102 metrics_buf: OnceLock::new(),
103 app_metrics,
104 }
105 }
106
107 /// Initialize metrics system (once and only once).
108 /// Every call made after the first will have no effect besides returning `Ok` or `Err`.
109 ///
110 /// This function is supposed to be called only from a single thread, once.
111 /// It is not thread-safe and is not meant to be used in a multithreaded
112 /// scenario. The reason `is_initialized` is an `AtomicBool` instead of
113 /// just a `bool` is that `lazy_static` enforces thread-safety on all its
114 /// members.
115 ///
116 /// # Arguments
117 ///
118 /// * `metrics_dest` - Buffer for JSON formatted metrics. Needs to implement `Write` and `Send`.
119 pub fn init(&self, metrics_dest: M) -> Result<(), MetricsError> {
120 self.metrics_buf
121 .set(Mutex::new(metrics_dest))
122 .map_err(|_| MetricsError::AlreadyInitialized)
123 }
124
125 /// Writes metrics to the destination provided as argument upon initialization of the metrics.
126 /// Upon failure, an error is returned if metrics system is initialized and metrics could not be
127 /// written.
128 /// Upon success, the function will return `True` (if metrics system was initialized and metrics
129 /// were successfully written to disk) or `False` (if metrics system was not yet initialized).
130 ///
131 /// This function is usually supposed to be called only from a single thread and
132 /// is not meant to be used in a multithreaded scenario. The reason
133 /// `metrics_buf` is enclosed in a `Mutex` is that `lazy_static` enforces
134 /// thread-safety on all its members.
135 /// The only exception is for signal handlers that result in process exit, which may be run on
136 /// any thread. To prevent the race condition present in the serialisation step of
137 /// SharedIncMetrics, deadly signals use SharedStoreMetrics instead (which have a thread-safe
138 /// serialise implementation).
139 /// The only known caveat is that other metrics may not be properly written before exiting from
140 /// a signal handler. We make this compromise since the process will be killed anyway and the
141 /// important metric in this case is the signal one.
142 /// The alternative is to hold a Mutex over the entire function call, but this increases the
143 /// known deadlock potential.
144 pub fn write(&self) -> Result<bool, MetricsError> {
145 if let Some(lock) = self.metrics_buf.get() {
146 let mut writer = lock.lock().expect("poisoned lock");
147 serde_json::to_writer(writer.by_ref(), &self.app_metrics)
148 .map_err(|err| MetricsError::Serde(err.to_string()))?;
149 writer.write_all(b"\n").map_err(MetricsError::Write)?;
150 Ok(true)
151 } else {
152 // If the metrics are not initialized, no error is thrown but we do let the user know
153 // that metrics were not written.
154 Ok(false)
155 }
156 }
157}
158
159impl<T: Serialize + Debug, M: Write + Send + Debug> Deref for Metrics<T, M> {
160 type Target = T;
161
162 fn deref(&self) -> &Self::Target {
163 &self.app_metrics
164 }
165}
166
/// Describes the errors which may occur while handling metrics scenarios.
// NOTE: `displaydoc::Display` derives each variant's `Display` output from its doc comment, so
// the `///` lines on the variants below are user-visible messages; editing them changes what
// gets printed.
#[derive(Debug, thiserror::Error, displaydoc::Display)]
pub enum MetricsError {
    /// {0}
    NeverInitialized(String),
    /// Reinitialization of metrics not allowed.
    AlreadyInitialized,
    /// {0}
    Serde(String),
    /// Failed to write metrics: {0}
    Write(std::io::Error),
}
179
/// Contract for counter-like metrics, i.e. metrics that are continuously updated by adding to
/// their value.
pub trait IncMetric {
    /// Adds `value` to the current counter.
    fn add(&self, value: u64);

    /// Bumps the counter by exactly one unit; shorthand for `add(1)`.
    fn inc(&self) {
        IncMetric::add(self, 1);
    }

    /// Returns the current value of the counter.
    fn count(&self) -> u64;

    /// Returns the difference between the current and the old value of the counter.
    /// Mostly used in the process of aggregating per device metrics.
    fn fetch_diff(&self) -> u64;
}
196
/// Used for defining new types of metrics that do not need a counter and act as a persistent
/// indicator.
pub trait StoreMetric {
    /// Returns the value currently held by the metric.
    fn fetch(&self) -> u64;
    /// Stores `value` as the metric's new value.
    fn store(&self, value: u64);
}
205
/// Representation of a metric that is expected to be incremented from more than one thread, so more
/// synchronization is necessary.
// It's currently used for vCPU metrics. An alternative here would be to have one instance of
// every metric for each thread, and to aggregate them when writing. However this is probably
// overkill unless we have a lot of vCPUs incrementing metrics very often. Still, it's there if
// we ever need it :-s
// Two values are kept for each metric so that counters can be reset on every flush:
// 1st member - current value being updated
// 2nd member - old value that gets the current value whenever metrics is flushed to disk
#[derive(Debug, Default)]
pub struct SharedIncMetric(AtomicU64, AtomicU64);

impl SharedIncMetric {
    /// Const construction: both the current and the old value start at zero.
    pub const fn new() -> Self {
        let current = AtomicU64::new(0);
        let flushed = AtomicU64::new(0);
        Self(current, flushed)
    }
}
224
/// Representation of a metric that is expected to hold a value that can be accessed
/// from more than one thread, so more synchronization is necessary.
#[derive(Debug, Default)]
pub struct SharedStoreMetric(AtomicU64);

impl SharedStoreMetric {
    /// Const construction: the stored value starts at zero.
    pub const fn new() -> Self {
        let initial = AtomicU64::new(0);
        Self(initial)
    }
}
235
236impl IncMetric for SharedIncMetric {
237 // While the order specified for this operation is still Relaxed, the actual instruction will
238 // be an asm "LOCK; something" and thus atomic across multiple threads, simply because of the
239 // fetch_and_add (as opposed to "store(load() + 1)") implementation for atomics.
240 // TODO: would a stronger ordering make a difference here?
241 fn add(&self, value: u64) {
242 self.0.fetch_add(value, Ordering::Relaxed);
243 }
244
245 fn count(&self) -> u64 {
246 self.0.load(Ordering::Relaxed)
247 }
248 fn fetch_diff(&self) -> u64 {
249 self.0.load(Ordering::Relaxed) - self.1.load(Ordering::Relaxed)
250 }
251}
252
253impl StoreMetric for SharedStoreMetric {
254 fn fetch(&self) -> u64 {
255 self.0.load(Ordering::Relaxed)
256 }
257
258 fn store(&self, value: u64) {
259 self.0.store(value, Ordering::Relaxed);
260 }
261}
262
263impl Serialize for SharedIncMetric {
264 /// Reset counters of each metrics. Here we suppose that Serialize's goal is to help with the
265 /// flushing of metrics.
266 /// !!! Any print of the metrics will also reset them. Use with caution !!!
267 fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
268 let snapshot = self.0.load(Ordering::Relaxed);
269 let res = serializer.serialize_u64(snapshot - self.1.load(Ordering::Relaxed));
270
271 if res.is_ok() {
272 self.1.store(snapshot, Ordering::Relaxed);
273 }
274 res
275 }
276}
277
278impl Serialize for SharedStoreMetric {
279 fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
280 serializer.serialize_u64(self.0.load(Ordering::Relaxed))
281 }
282}
283
284/// Reporter object which computes the process wall time and
285/// process CPU time and populates the metric with the results.
286#[derive(Debug)]
287pub struct ProcessTimeReporter {
288 // Process start time in us.
289 start_time_us: Option<u64>,
290 // Process CPU start time in us.
291 start_time_cpu_us: Option<u64>,
292 // Firecracker's parent process CPU time.
293 parent_cpu_time_us: Option<u64>,
294}
295
296impl ProcessTimeReporter {
297 /// Constructor for the process time-related reporter.
298 pub fn new(
299 start_time_us: Option<u64>,
300 start_time_cpu_us: Option<u64>,
301 parent_cpu_time_us: Option<u64>,
302 ) -> ProcessTimeReporter {
303 ProcessTimeReporter {
304 start_time_us,
305 start_time_cpu_us,
306 parent_cpu_time_us,
307 }
308 }
309
310 /// Obtain process start time in microseconds.
311 pub fn report_start_time(&self) {
312 if let Some(start_time) = self.start_time_us {
313 let delta_us = get_time_us(ClockType::Monotonic) - start_time;
314 METRICS.api_server.process_startup_time_us.store(delta_us);
315 }
316 }
317
318 /// Obtain process CPU start time in microseconds.
319 pub fn report_cpu_start_time(&self) {
320 if let Some(cpu_start_time) = self.start_time_cpu_us {
321 let delta_us = get_time_us(ClockType::ProcessCpu) - cpu_start_time
322 + self.parent_cpu_time_us.unwrap_or_default();
323 METRICS
324 .api_server
325 .process_startup_time_cpu_us
326 .store(delta_us);
327 }
328 }
329}
330
331// The following structs are used to define a certain organization for the set of metrics we
332// are interested in. Whenever the name of a field differs from its ideal textual representation
333// in the serialized form, we can use the #[serde(rename = "name")] attribute to, well, rename it.
334
/// Metrics related to the internal API server.
///
/// Both values are store metrics: they keep the last value written to them.
#[derive(Debug, Default, Serialize)]
pub struct ApiServerMetrics {
    /// Measures the process's startup time in microseconds.
    pub process_startup_time_us: SharedStoreMetric,
    /// Measures the process's startup CPU time in microseconds.
    pub process_startup_time_cpu_us: SharedStoreMetric,
}
impl ApiServerMetrics {
    /// Const default construction: every metric in this group starts at zero.
    pub const fn new() -> Self {
        Self {
            process_startup_time_us: SharedStoreMetric::new(),
            process_startup_time_cpu_us: SharedStoreMetric::new(),
        }
    }
}
352
/// Metrics specific to GET API Requests for counting user triggered actions and/or failures.
#[derive(Debug, Default, Serialize)]
pub struct GetRequestsMetrics {
    /// Number of GETs for getting information on the instance.
    pub instance_info_count: SharedIncMetric,
    /// Number of GETs for getting status on attaching machine configuration.
    pub machine_cfg_count: SharedIncMetric,
    /// Number of GETs for getting mmds.
    pub mmds_count: SharedIncMetric,
    /// Number of GETs for getting the VMM version.
    pub vmm_version_count: SharedIncMetric,
    /// Number of GETs for getting hotpluggable memory status.
    pub hotplug_memory_count: SharedIncMetric,
}
impl GetRequestsMetrics {
    /// Const default construction: every counter in this group starts at zero.
    pub const fn new() -> Self {
        Self {
            instance_info_count: SharedIncMetric::new(),
            machine_cfg_count: SharedIncMetric::new(),
            mmds_count: SharedIncMetric::new(),
            vmm_version_count: SharedIncMetric::new(),
            hotplug_memory_count: SharedIncMetric::new(),
        }
    }
}
379
/// Metrics specific to PUT API Requests for counting user triggered actions and/or failures.
///
/// Fields come in `*_count` / `*_fails` pairs, one pair per kind of request.
#[derive(Debug, Default, Serialize)]
pub struct PutRequestsMetrics {
    /// Number of PUTs triggering an action on the VM.
    pub actions_count: SharedIncMetric,
    /// Number of failures in triggering an action on the VM.
    pub actions_fails: SharedIncMetric,
    /// Number of PUTs for attaching source of boot.
    pub boot_source_count: SharedIncMetric,
    /// Number of failures during attaching source of boot.
    pub boot_source_fails: SharedIncMetric,
    /// Number of PUTs triggering a block attach.
    pub drive_count: SharedIncMetric,
    /// Number of failures in attaching a block device.
    pub drive_fails: SharedIncMetric,
    /// Number of PUTs for initializing the logging system.
    pub logger_count: SharedIncMetric,
    /// Number of failures in initializing the logging system.
    pub logger_fails: SharedIncMetric,
    /// Number of PUTs for configuring the machine.
    pub machine_cfg_count: SharedIncMetric,
    /// Number of failures in configuring the machine.
    pub machine_cfg_fails: SharedIncMetric,
    /// Number of PUTs for configuring a guest's vCPUs.
    pub cpu_cfg_count: SharedIncMetric,
    /// Number of failures in configuring a guest's vCPUs.
    pub cpu_cfg_fails: SharedIncMetric,
    /// Number of PUTs for initializing the metrics system.
    pub metrics_count: SharedIncMetric,
    /// Number of failures in initializing the metrics system.
    pub metrics_fails: SharedIncMetric,
    /// Number of PUTs for creating a new network interface.
    pub network_count: SharedIncMetric,
    /// Number of failures in creating a new network interface.
    pub network_fails: SharedIncMetric,
    /// Number of PUTs for creating mmds.
    pub mmds_count: SharedIncMetric,
    /// Number of failures in creating a new mmds.
    pub mmds_fails: SharedIncMetric,
    /// Number of PUTs for creating a vsock device.
    pub vsock_count: SharedIncMetric,
    /// Number of failures in creating a vsock device.
    pub vsock_fails: SharedIncMetric,
    /// Number of PUTs triggering a pmem attach.
    pub pmem_count: SharedIncMetric,
    /// Number of failures in attaching a pmem device.
    pub pmem_fails: SharedIncMetric,
    /// Number of PUTs to /serial
    pub serial_count: SharedIncMetric,
    /// Number of failed PUTs to /serial
    pub serial_fails: SharedIncMetric,
    /// Number of PUTs to /hotplug/memory
    pub hotplug_memory_count: SharedIncMetric,
    /// Number of failed PUTs to /hotplug/memory
    pub hotplug_memory_fails: SharedIncMetric,
}
impl PutRequestsMetrics {
    /// Const default construction: every counter in this group starts at zero.
    pub const fn new() -> Self {
        Self {
            actions_count: SharedIncMetric::new(),
            actions_fails: SharedIncMetric::new(),
            boot_source_count: SharedIncMetric::new(),
            boot_source_fails: SharedIncMetric::new(),
            drive_count: SharedIncMetric::new(),
            drive_fails: SharedIncMetric::new(),
            logger_count: SharedIncMetric::new(),
            logger_fails: SharedIncMetric::new(),
            machine_cfg_count: SharedIncMetric::new(),
            machine_cfg_fails: SharedIncMetric::new(),
            cpu_cfg_count: SharedIncMetric::new(),
            cpu_cfg_fails: SharedIncMetric::new(),
            metrics_count: SharedIncMetric::new(),
            metrics_fails: SharedIncMetric::new(),
            network_count: SharedIncMetric::new(),
            network_fails: SharedIncMetric::new(),
            mmds_count: SharedIncMetric::new(),
            mmds_fails: SharedIncMetric::new(),
            vsock_count: SharedIncMetric::new(),
            vsock_fails: SharedIncMetric::new(),
            pmem_count: SharedIncMetric::new(),
            pmem_fails: SharedIncMetric::new(),
            serial_count: SharedIncMetric::new(),
            serial_fails: SharedIncMetric::new(),
            hotplug_memory_count: SharedIncMetric::new(),
            hotplug_memory_fails: SharedIncMetric::new(),
        }
    }
}
469
/// Metrics specific to PATCH API Requests for counting user triggered actions and/or failures.
///
/// Fields come in `*_count` / `*_fails` pairs, one pair per kind of request.
#[derive(Debug, Default, Serialize)]
pub struct PatchRequestsMetrics {
    /// Number of tries to PATCH a block device.
    pub drive_count: SharedIncMetric,
    /// Number of failures in PATCHing a block device.
    pub drive_fails: SharedIncMetric,
    /// Number of tries to PATCH a net device.
    pub network_count: SharedIncMetric,
    /// Number of failures in PATCHing a net device.
    pub network_fails: SharedIncMetric,
    /// Number of PATCHes for configuring the machine.
    pub machine_cfg_count: SharedIncMetric,
    /// Number of failures in configuring the machine.
    pub machine_cfg_fails: SharedIncMetric,
    /// Number of tries to PATCH an mmds.
    pub mmds_count: SharedIncMetric,
    /// Number of failures in PATCHing an mmds.
    pub mmds_fails: SharedIncMetric,
    /// Number of PATCHes to /hotplug/memory
    pub hotplug_memory_count: SharedIncMetric,
    /// Number of failed PATCHes to /hotplug/memory
    pub hotplug_memory_fails: SharedIncMetric,
}
impl PatchRequestsMetrics {
    /// Const default construction: every counter in this group starts at zero.
    pub const fn new() -> Self {
        Self {
            drive_count: SharedIncMetric::new(),
            drive_fails: SharedIncMetric::new(),
            network_count: SharedIncMetric::new(),
            network_fails: SharedIncMetric::new(),
            machine_cfg_count: SharedIncMetric::new(),
            machine_cfg_fails: SharedIncMetric::new(),
            mmds_count: SharedIncMetric::new(),
            mmds_fails: SharedIncMetric::new(),
            hotplug_memory_count: SharedIncMetric::new(),
            hotplug_memory_fails: SharedIncMetric::new(),
        }
    }
}
511
/// Metrics related to deprecated user-facing API calls.
#[derive(Debug, Default, Serialize)]
pub struct DeprecatedApiMetrics {
    /// Total number of calls to deprecated HTTP endpoints.
    pub deprecated_http_api_calls: SharedIncMetric,
}
impl DeprecatedApiMetrics {
    /// Const default construction: the counter starts at zero.
    pub const fn new() -> Self {
        Self {
            deprecated_http_api_calls: SharedIncMetric::new(),
        }
    }
}
526
/// Metrics for the logging subsystem.
#[derive(Debug, Default, Serialize)]
pub struct LoggerSystemMetrics {
    /// Number of misses on flushing metrics.
    pub missed_metrics_count: SharedIncMetric,
    /// Number of errors during metrics handling.
    pub metrics_fails: SharedIncMetric,
    /// Number of misses on logging human readable content.
    pub missed_log_count: SharedIncMetric,
}
impl LoggerSystemMetrics {
    /// Const default construction: every counter in this group starts at zero.
    pub const fn new() -> Self {
        Self {
            missed_metrics_count: SharedIncMetric::new(),
            metrics_fails: SharedIncMetric::new(),
            missed_log_count: SharedIncMetric::new(),
        }
    }
}
547
/// Metrics for the MMDS functionality.
///
/// `rx_*` fields cover the receive path, `tx_*` fields the send path and `connections_*` the
/// TCP connections, as described on each field.
#[derive(Debug, Default, Serialize)]
pub struct MmdsMetrics {
    /// Number of frames rerouted to MMDS.
    pub rx_accepted: SharedIncMetric,
    /// Number of errors while handling a frame through MMDS.
    pub rx_accepted_err: SharedIncMetric,
    /// Number of uncommon events encountered while processing packets through MMDS.
    pub rx_accepted_unusual: SharedIncMetric,
    /// The number of buffers which couldn't be parsed as valid Ethernet frames by the MMDS.
    pub rx_bad_eth: SharedIncMetric,
    /// The number of GET requests with invalid tokens.
    pub rx_invalid_token: SharedIncMetric,
    /// The number of GET requests with no tokens.
    pub rx_no_token: SharedIncMetric,
    /// The total number of successful receive operations by the MMDS.
    pub rx_count: SharedIncMetric,
    /// The total number of bytes sent by the MMDS.
    pub tx_bytes: SharedIncMetric,
    /// The total number of successful send operations by the MMDS.
    pub tx_count: SharedIncMetric,
    /// The number of errors raised by the MMDS while attempting to send frames/packets/segments.
    pub tx_errors: SharedIncMetric,
    /// The number of frames sent by the MMDS.
    pub tx_frames: SharedIncMetric,
    /// The number of connections successfully accepted by the MMDS TCP handler.
    pub connections_created: SharedIncMetric,
    /// The number of connections cleaned up by the MMDS TCP handler.
    pub connections_destroyed: SharedIncMetric,
}
impl MmdsMetrics {
    /// Const default construction: every counter in this group starts at zero.
    pub const fn new() -> Self {
        Self {
            rx_accepted: SharedIncMetric::new(),
            rx_accepted_err: SharedIncMetric::new(),
            rx_accepted_unusual: SharedIncMetric::new(),
            rx_bad_eth: SharedIncMetric::new(),
            rx_invalid_token: SharedIncMetric::new(),
            rx_no_token: SharedIncMetric::new(),
            rx_count: SharedIncMetric::new(),
            tx_bytes: SharedIncMetric::new(),
            tx_count: SharedIncMetric::new(),
            tx_errors: SharedIncMetric::new(),
            tx_frames: SharedIncMetric::new(),
            connections_created: SharedIncMetric::new(),
            connections_destroyed: SharedIncMetric::new(),
        }
    }
}
598
/// Performance metrics related for the moment only to snapshots.
// These store the duration of creating/loading a snapshot and of
// pausing/resuming the microVM.
// If there is more than one `/snapshot/create` request in a minute
// (until the metrics are flushed), only the duration of the last
// snapshot creation is stored in the metric. If the user is interested
// in all the durations, a `FlushMetrics` request should be sent after
// each `create` request.
#[derive(Debug, Default, Serialize)]
pub struct PerformanceMetrics {
    /// Measures the snapshot full create time, at the API (user) level, in microseconds.
    pub full_create_snapshot: SharedStoreMetric,
    /// Measures the snapshot diff create time, at the API (user) level, in microseconds.
    pub diff_create_snapshot: SharedStoreMetric,
    /// Measures the snapshot load time, at the API (user) level, in microseconds.
    pub load_snapshot: SharedStoreMetric,
    /// Measures the microVM pausing duration, at the API (user) level, in microseconds.
    pub pause_vm: SharedStoreMetric,
    /// Measures the microVM resuming duration, at the API (user) level, in microseconds.
    pub resume_vm: SharedStoreMetric,
    /// Measures the snapshot full create time, at the VMM level, in microseconds.
    pub vmm_full_create_snapshot: SharedStoreMetric,
    /// Measures the snapshot diff create time, at the VMM level, in microseconds.
    pub vmm_diff_create_snapshot: SharedStoreMetric,
    /// Measures the snapshot load time, at the VMM level, in microseconds.
    pub vmm_load_snapshot: SharedStoreMetric,
    /// Measures the microVM pausing duration, at the VMM level, in microseconds.
    pub vmm_pause_vm: SharedStoreMetric,
    /// Measures the microVM resuming duration, at the VMM level, in microseconds.
    pub vmm_resume_vm: SharedStoreMetric,
}
impl PerformanceMetrics {
    /// Const default construction: every duration in this group starts at zero.
    pub const fn new() -> Self {
        Self {
            full_create_snapshot: SharedStoreMetric::new(),
            diff_create_snapshot: SharedStoreMetric::new(),
            load_snapshot: SharedStoreMetric::new(),
            pause_vm: SharedStoreMetric::new(),
            resume_vm: SharedStoreMetric::new(),
            vmm_full_create_snapshot: SharedStoreMetric::new(),
            vmm_diff_create_snapshot: SharedStoreMetric::new(),
            vmm_load_snapshot: SharedStoreMetric::new(),
            vmm_pause_vm: SharedStoreMetric::new(),
            vmm_resume_vm: SharedStoreMetric::new(),
        }
    }
}
647
/// Metrics for the seccomp filtering.
#[derive(Debug, Default, Serialize)]
pub struct SeccompMetrics {
    /// Number of errors inside the seccomp filtering.
    // Kept as a store metric, so it holds the last value written rather than a per-flush delta.
    pub num_faults: SharedStoreMetric,
}
impl SeccompMetrics {
    /// Const default construction: the metric starts at zero.
    pub const fn new() -> Self {
        Self {
            num_faults: SharedStoreMetric::new(),
        }
    }
}
662
/// Metrics related to signals.
/// Deadly signals must be of `SharedStoreMetric` type, since they can only ever be 0 or 1.
/// This avoids a tricky race condition caused by the non-atomic serialize method of
/// `SharedIncMetric`, between two threads calling `METRICS.write()`.
#[derive(Debug, Default, Serialize)]
pub struct SignalMetrics {
    /// Number of times that SIGBUS was handled.
    pub sigbus: SharedStoreMetric,
    /// Number of times that SIGSEGV was handled.
    pub sigsegv: SharedStoreMetric,
    /// Number of times that SIGXFSZ was handled.
    pub sigxfsz: SharedStoreMetric,
    /// Number of times that SIGXCPU was handled.
    pub sigxcpu: SharedStoreMetric,
    /// Number of times that SIGPIPE was handled.
    // The only signal here tracked as an incrementing counter.
    // NOTE(review): presumably because SIGPIPE is not treated as deadly - confirm.
    pub sigpipe: SharedIncMetric,
    /// Number of times that SIGHUP was handled.
    pub sighup: SharedStoreMetric,
    /// Number of times that SIGILL was handled.
    pub sigill: SharedStoreMetric,
}
impl SignalMetrics {
    /// Const default construction: every metric in this group starts at zero.
    pub const fn new() -> Self {
        Self {
            sigbus: SharedStoreMetric::new(),
            sigsegv: SharedStoreMetric::new(),
            sigxfsz: SharedStoreMetric::new(),
            sigxcpu: SharedStoreMetric::new(),
            sigpipe: SharedIncMetric::new(),
            sighup: SharedStoreMetric::new(),
            sigill: SharedStoreMetric::new(),
        }
    }
}
698
699/// Provides efficient way to record LatencyAggregateMetrics
700#[derive(Debug)]
701pub struct LatencyMetricsRecorder<'a> {
702 start_time: u64,
703 metric: &'a LatencyAggregateMetrics,
704}
705
706impl<'a> LatencyMetricsRecorder<'a> {
707 /// Const default construction.
708 fn new(metric: &'a LatencyAggregateMetrics) -> Self {
709 Self {
710 start_time: get_time_us(ClockType::Monotonic),
711 metric,
712 }
713 }
714}
715impl Drop for LatencyMetricsRecorder<'_> {
716 /// records aggregate (min/max/sum) for the given metric
717 /// This captures delta between self.start_time and current time
718 /// and updates min/max/sum metrics.
719 /// self.start_time is recorded in new() and metrics are updated in drop
720 fn drop(&mut self) {
721 let delta_us = get_time_us(ClockType::Monotonic) - self.start_time;
722 self.metric.sum_us.add(delta_us);
723 let min_us = self.metric.min_us.fetch();
724 let max_us = self.metric.max_us.fetch();
725 if (0 == min_us) || (min_us > delta_us) {
726 self.metric.min_us.store(delta_us);
727 }
728 if (0 == max_us) || (max_us < delta_us) {
729 self.metric.max_us.store(delta_us);
730 }
731 }
732}
733
/// Used to record Aggregate (min/max/sum) of latency metrics
#[derive(Debug, Default, Serialize)]
pub struct LatencyAggregateMetrics {
    /// represents minimum value of the metrics in microseconds
    pub min_us: SharedStoreMetric,
    /// represents maximum value of the metrics in microseconds
    pub max_us: SharedStoreMetric,
    /// represents sum of the metrics in microseconds
    pub sum_us: SharedIncMetric,
}
impl LatencyAggregateMetrics {
    /// Const default construction: min, max and sum all start at zero.
    pub const fn new() -> Self {
        Self {
            min_us: SharedStoreMetric::new(),
            max_us: SharedStoreMetric::new(),
            sum_us: SharedIncMetric::new(),
        }
    }

    /// Returns a latency recorder which captures the start time when it is created
    /// and updates the actual metrics at the end of the recorder's lifetime.
    /// In short, instead of the 2 steps below:
    /// 1st: `start_time_us = get_time_us()`
    /// 2nd: `delta_time_us = get_time_us() - start_time_us` and `metrics.store(delta_time_us)`
    /// we have just `_m = metrics.record_latency_metrics()`.
    // Bind the result to a named variable (such as `_m`): a value that is not bound is dropped
    // immediately, which would end the measurement right away.
    pub fn record_latency_metrics(&self) -> LatencyMetricsRecorder<'_> {
        LatencyMetricsRecorder::new(self)
    }
}
764
/// Structure provides Metrics specific to VCPUs' mode of functioning.
/// Sample_count or number of kvm exits for IO and MMIO VM exits are covered by:
/// `exit_io_in`, `exit_io_out`, `exit_mmio_read` and `exit_mmio_write`.
/// Count of other vm exits for events like shutdown/hlt/errors are
/// covered by existing "failures" metric.
/// The only vm exit for which sample_count is not covered is system
/// event reset/shutdown but that should be fine since they are not
/// failures and the vm is terminated anyways.
/// LatencyAggregateMetrics only covers minimum, maximum and sum
/// because average can be deduced from available metrics. e.g.
/// dividing `exit_io_in_agg.sum_us` by `exit_io_in` gives average of KVM exits handling input IO.
#[derive(Debug, Default, Serialize)]
pub struct VcpuMetrics {
    /// Number of KVM exits for handling input IO.
    pub exit_io_in: SharedIncMetric,
    /// Number of KVM exits for handling output IO.
    pub exit_io_out: SharedIncMetric,
    /// Number of KVM exits for handling MMIO reads.
    pub exit_mmio_read: SharedIncMetric,
    /// Number of KVM exits for handling MMIO writes.
    pub exit_mmio_write: SharedIncMetric,
    /// Number of errors during this VCPU's run.
    pub failures: SharedIncMetric,
    /// Number of times that the `KVM_KVMCLOCK_CTRL` ioctl failed.
    pub kvmclock_ctrl_fails: SharedIncMetric,
    /// Provides Min/max/sum for KVM exits handling input IO.
    pub exit_io_in_agg: LatencyAggregateMetrics,
    /// Provides Min/max/sum for KVM exits handling output IO.
    pub exit_io_out_agg: LatencyAggregateMetrics,
    /// Provides Min/max/sum for KVM exits handling MMIO reads.
    pub exit_mmio_read_agg: LatencyAggregateMetrics,
    /// Provides Min/max/sum for KVM exits handling MMIO writes.
    pub exit_mmio_write_agg: LatencyAggregateMetrics,
}
impl VcpuMetrics {
    /// Const default construction: every counter and aggregate in this group starts at zero.
    pub const fn new() -> Self {
        Self {
            exit_io_in: SharedIncMetric::new(),
            exit_io_out: SharedIncMetric::new(),
            exit_mmio_read: SharedIncMetric::new(),
            exit_mmio_write: SharedIncMetric::new(),
            failures: SharedIncMetric::new(),
            kvmclock_ctrl_fails: SharedIncMetric::new(),
            exit_io_in_agg: LatencyAggregateMetrics::new(),
            exit_io_out_agg: LatencyAggregateMetrics::new(),
            exit_mmio_read_agg: LatencyAggregateMetrics::new(),
            exit_mmio_write_agg: LatencyAggregateMetrics::new(),
        }
    }
}
816
/// Interrupt-related metrics of the microVM.
#[derive(Debug, Default, Serialize)]
pub struct InterruptMetrics {
    /// Number of interrupt triggers.
    pub triggers: SharedIncMetric,
    /// Number of configuration updates.
    pub config_updates: SharedIncMetric,
}
825
826impl InterruptMetrics {
827 /// Const default construction.
828 pub const fn new() -> Self {
829 Self {
830 triggers: SharedIncMetric::new(),
831 config_updates: SharedIncMetric::new(),
832 }
833 }
834}
835
/// Metrics that concern the machine manager as a whole.
#[derive(Debug, Default, Serialize)]
pub struct VmmMetrics {
    /// Metric signaling that a panic has occurred.
    pub panic_count: SharedStoreMetric,
}
842impl VmmMetrics {
843 /// Const default construction.
844 pub const fn new() -> Self {
845 Self {
846 panic_count: SharedStoreMetric::new(),
847 }
848 }
849}
850
// Field-less marker type: it stores nothing, and its sole purpose is to produce a UTC timestamp
// (in milliseconds, as the name says) at the moment an instance is serialized.
#[derive(Debug, Default)]
struct SerializeToUtcTimestampMs;
854impl SerializeToUtcTimestampMs {
855 /// Const default construction.
856 pub const fn new() -> Self {
857 SerializeToUtcTimestampMs
858 }
859}
860
861impl Serialize for SerializeToUtcTimestampMs {
862 fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
863 serializer.serialize_i64(i64::try_from(get_time_ns(ClockType::Real) / 1_000_000).unwrap())
864 }
865}
866
// Generates a unit struct whose `Serialize` impl forwards to `flush_metrics` of the given module.
//
// By using such a structure in FirecrackerMetrics it is easy to serialise the Firecracker
// app_metrics as a single json object, which otherwise would have required extra string
// manipulation to pack the module's metrics as part of the same json object as
// FirecrackerMetrics.
macro_rules! create_serialize_proxy {
    ($proxy:ident, $module:ident) => {
        #[derive(Default, Debug)]
        pub struct $proxy;

        impl Serialize for $proxy {
            fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
                $module::flush_metrics(serializer)
            }
        }
    };
}
886
// One serialization proxy per metrics module: each line defines the named unit struct, whose
// `Serialize` impl calls `flush_metrics` of the module given as the second argument.
create_serialize_proxy!(BlockMetricsSerializeProxy, block_metrics);
create_serialize_proxy!(NetMetricsSerializeProxy, net_metrics);
create_serialize_proxy!(VhostUserMetricsSerializeProxy, vhost_user_metrics);
create_serialize_proxy!(BalloonMetricsSerializeProxy, balloon_metrics);
create_serialize_proxy!(EntropyMetricsSerializeProxy, entropy_metrics);
create_serialize_proxy!(VsockMetricsSerializeProxy, vsock_metrics);
create_serialize_proxy!(PmemMetricsSerializeProxy, pmem_metrics);
create_serialize_proxy!(LegacyDevMetricsSerializeProxy, legacy);
create_serialize_proxy!(MemoryHotplugSerializeProxy, virtio_mem_metrics);
896
897/// Structure storing all metrics while enforcing serialization support on them.
898#[derive(Debug, Default, Serialize)]
899pub struct FirecrackerMetrics {
900 utc_timestamp_ms: SerializeToUtcTimestampMs,
901 /// API Server related metrics.
902 pub api_server: ApiServerMetrics,
903 #[serde(flatten)]
904 /// A balloon device's related metrics.
905 pub balloon_ser: BalloonMetricsSerializeProxy,
906 #[serde(flatten)]
907 /// A block device's related metrics.
908 pub block_ser: BlockMetricsSerializeProxy,
909 /// Metrics related to deprecated API calls.
910 pub deprecated_api: DeprecatedApiMetrics,
911 /// Metrics related to API GET requests.
912 pub get_api_requests: GetRequestsMetrics,
913 #[serde(flatten)]
914 /// Metrics related to the legacy device.
915 pub legacy_dev_ser: LegacyDevMetricsSerializeProxy,
916 /// Metrics related to performance measurements.
917 pub latencies_us: PerformanceMetrics,
918 /// Logging related metrics.
919 pub logger: LoggerSystemMetrics,
920 /// Metrics specific to MMDS functionality.
921 pub mmds: MmdsMetrics,
922 #[serde(flatten)]
923 /// A network device's related metrics.
924 pub net_ser: NetMetricsSerializeProxy,
925 /// Metrics related to API PATCH requests.
926 pub patch_api_requests: PatchRequestsMetrics,
927 /// Metrics related to API PUT requests.
928 pub put_api_requests: PutRequestsMetrics,
929 /// Metrics related to seccomp filtering.
930 pub seccomp: SeccompMetrics,
931 /// Metrics related to a vcpu's functioning.
932 pub vcpu: VcpuMetrics,
933 /// Metrics related to the virtual machine manager.
934 pub vmm: VmmMetrics,
935 /// Metrics related to signals.
936 pub signals: SignalMetrics,
937 #[serde(flatten)]
938 /// Metrics related to virtio-vsockets.
939 pub vsock_ser: VsockMetricsSerializeProxy,
940 #[serde(flatten)]
941 /// Metrics related to virtio-rng entropy device.
942 pub entropy_ser: EntropyMetricsSerializeProxy,
943 #[serde(flatten)]
944 /// Metrics related to virtio-pmem entropy device.
945 pub pmem_ser: PmemMetricsSerializeProxy,
946 #[serde(flatten)]
947 /// Vhost-user device related metrics.
948 pub vhost_user_ser: VhostUserMetricsSerializeProxy,
949 /// Interrupt related metrics
950 pub interrupts: InterruptMetrics,
951 #[serde(flatten)]
952 /// Virtio-mem device related metrics (memory hotplugging)
953 pub memory_hotplug_ser: MemoryHotplugSerializeProxy,
954}
955impl FirecrackerMetrics {
956 /// Const default construction.
957 pub const fn new() -> Self {
958 Self {
959 utc_timestamp_ms: SerializeToUtcTimestampMs::new(),
960 api_server: ApiServerMetrics::new(),
961 balloon_ser: BalloonMetricsSerializeProxy {},
962 block_ser: BlockMetricsSerializeProxy {},
963 deprecated_api: DeprecatedApiMetrics::new(),
964 get_api_requests: GetRequestsMetrics::new(),
965 legacy_dev_ser: LegacyDevMetricsSerializeProxy {},
966 latencies_us: PerformanceMetrics::new(),
967 logger: LoggerSystemMetrics::new(),
968 mmds: MmdsMetrics::new(),
969 net_ser: NetMetricsSerializeProxy {},
970 patch_api_requests: PatchRequestsMetrics::new(),
971 put_api_requests: PutRequestsMetrics::new(),
972 seccomp: SeccompMetrics::new(),
973 vcpu: VcpuMetrics::new(),
974 vmm: VmmMetrics::new(),
975 signals: SignalMetrics::new(),
976 vsock_ser: VsockMetricsSerializeProxy {},
977 entropy_ser: EntropyMetricsSerializeProxy {},
978 pmem_ser: PmemMetricsSerializeProxy {},
979 vhost_user_ser: VhostUserMetricsSerializeProxy {},
980 interrupts: InterruptMetrics::new(),
981 memory_hotplug_ser: MemoryHotplugSerializeProxy {},
982 }
983 }
984}
985
986#[cfg(test)]
987mod tests {
988 use std::io::{ErrorKind, LineWriter};
989 use std::sync::Arc;
990 use std::sync::atomic::fence;
991 use std::thread;
992
993 use vmm_sys_util::tempfile::TempFile;
994
995 use super::*;
996
997 #[test]
998 fn test_init() {
999 // This test has a conflict with the vmm_config test
1000 // `test_init_metrics` which also uses "METRICS" and
1001 // tests fail with an already initialized error.
1002 // This test is to validate the init() which doesn't require
1003 // using METRICS specifically. So, to avoid the conflict we
1004 // use a local Metrics to test init() instead of the global
1005 // "METRICS"
1006 let m = &Metrics::<_, FcLineWriter>::new(FirecrackerMetrics::new());
1007
1008 // Trying to write metrics, when metrics system is not initialized, should not throw error.
1009 let res = m.write();
1010 assert!(res.is_ok() && !res.unwrap());
1011
1012 let f = TempFile::new().expect("Failed to create temporary metrics file");
1013 m.init(LineWriter::new(f.into_file())).unwrap();
1014
1015 m.write().unwrap();
1016
1017 let f = TempFile::new().expect("Failed to create temporary metrics file");
1018
1019 m.init(LineWriter::new(f.into_file())).unwrap_err();
1020 }
1021
1022 #[test]
1023 fn test_shared_inc_metric() {
1024 let metric = Arc::new(SharedIncMetric::default());
1025
1026 // We're going to create a number of threads that will attempt to increase this metric
1027 // in parallel. If everything goes fine we still can't be sure the synchronization works,
1028 // but if something fails, then we definitely have a problem :-s
1029
1030 const NUM_THREADS_TO_SPAWN: usize = 4;
1031 const NUM_INCREMENTS_PER_THREAD: u64 = 10_0000;
1032 const M2_INITIAL_COUNT: u64 = 123;
1033
1034 metric.add(M2_INITIAL_COUNT);
1035
1036 let mut v = Vec::with_capacity(NUM_THREADS_TO_SPAWN);
1037
1038 for _ in 0..NUM_THREADS_TO_SPAWN {
1039 let r = metric.clone();
1040 v.push(thread::spawn(move || {
1041 for _ in 0..NUM_INCREMENTS_PER_THREAD {
1042 r.inc();
1043 }
1044 }));
1045 }
1046
1047 for handle in v {
1048 handle.join().unwrap();
1049 }
1050
1051 assert_eq!(
1052 metric.count(),
1053 M2_INITIAL_COUNT + NUM_THREADS_TO_SPAWN as u64 * NUM_INCREMENTS_PER_THREAD
1054 );
1055 }
1056
1057 #[test]
1058 fn test_shared_store_metric() {
1059 let m1 = Arc::new(SharedStoreMetric::default());
1060 m1.store(1);
1061 fence(Ordering::SeqCst);
1062 assert_eq!(1, m1.fetch());
1063 }
1064
1065 #[test]
1066 fn test_serialize() {
1067 let s = serde_json::to_string(&FirecrackerMetrics::default());
1068 s.unwrap();
1069 }
1070
1071 #[test]
1072 fn test_error_messages() {
1073 assert_eq!(
1074 format!(
1075 "{}",
1076 MetricsError::NeverInitialized(String::from("Bad Metrics Path Provided"))
1077 ),
1078 "Bad Metrics Path Provided"
1079 );
1080 assert_eq!(
1081 format!("{}", MetricsError::AlreadyInitialized),
1082 "Reinitialization of metrics not allowed."
1083 );
1084 assert_eq!(
1085 format!(
1086 "{}",
1087 MetricsError::Write(std::io::Error::new(ErrorKind::Interrupted, "write"))
1088 ),
1089 "Failed to write metrics: write"
1090 );
1091 assert_eq!(
1092 format!(
1093 "{}",
1094 MetricsError::Serde("Failed to serialize the given data structure.".to_string())
1095 ),
1096 "Failed to serialize the given data structure."
1097 );
1098 }
1099}