vmm/logger/
metrics.rs

1// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Defines the metrics system.
5//!
6//! # Metrics format
//! The metrics are flushed in JSON format every 60 seconds. The first field will always be the
8//! timestamp followed by the JSON representation of the structures representing each component on
9//! which we are capturing specific metrics.
10//!
11//! ## JSON example with metrics:
12//! ```json
13//! {
14//!  "utc_timestamp_ms": 1541591155180,
15//!  "api_server": {
16//!    "process_startup_time_us": 0,
17//!    "process_startup_time_cpu_us": 0
18//!  },
19//!  "block": {
20//!    "activate_fails": 0,
21//!    "cfg_fails": 0,
22//!    "event_fails": 0,
23//!    "flush_count": 0,
24//!    "queue_event_count": 0,
25//!    "read_count": 0,
26//!    "write_count": 0
27//!  }
28//! }
29//! ```
30//! The example above means that inside the structure representing all the metrics there is a field
31//! named `block` which is in turn a serializable child structure collecting metrics for
32//! the block device such as `activate_fails`, `cfg_fails`, etc.
33//!
34//! # Limitations
35//! Metrics are only written to buffers.
36//!
37//! # Design
38//! The main design goals of this system are:
39//! * Use lockless operations, preferably ones that don't require anything other than simple
40//!   reads/writes being atomic.
41//! * Exploit interior mutability and atomics being Sync to allow all methods (including the ones
42//!   which are effectively mutable) to be callable on a global non-mut static.
43//! * Rely on `serde` to provide the actual serialization for writing the metrics.
44//! * Since all metrics start at 0, we implement the `Default` trait via derive for all of them, to
45//!   avoid having to initialize everything by hand.
46//!
47//! The system implements 2 types of metrics:
//! * Shared Incremental Metrics (`SharedIncMetric`) - dedicated to metrics which need a counter
//!   (e.g. the number of times an API request failed). These metrics are reset upon flush.
//! * Shared Store Metrics (`SharedStoreMetric`) - targeted at keeping a persistent value; they
//!   are not intended to act as a counter (e.g. for measuring the process start-up time).
52//!
//! The current approach for the `SharedIncMetric` type is to store two values (current and
//! previous) and compute the delta between them each time we do a flush (i.e. by serialization).
//! There are a number of advantages to this approach, including:
//! * We don't have to introduce an additional write (to reset the value) from the thread which
//!   does the actual writing, so less synchronization effort is required.
//! * We don't have to worry all that much about losing some data if writing fails for a while
//!   (this could be a concern, I guess).
60//!
//! If it turns out this approach is not really what we want, it's pretty easy to resort to
//! something else, while working behind the same interface.
63
64use std::fmt::Debug;
65use std::io::Write;
66use std::ops::Deref;
67use std::sync::atomic::{AtomicU64, Ordering};
68use std::sync::{Mutex, OnceLock};
69
70use serde::{Serialize, Serializer};
71use utils::time::{ClockType, get_time_ns, get_time_us};
72
73use super::FcLineWriter;
74use crate::devices::legacy;
75use crate::devices::virtio::balloon::metrics as balloon_metrics;
76use crate::devices::virtio::block::virtio::metrics as block_metrics;
77use crate::devices::virtio::mem::metrics as virtio_mem_metrics;
78use crate::devices::virtio::net::metrics as net_metrics;
79use crate::devices::virtio::pmem::metrics as pmem_metrics;
80use crate::devices::virtio::rng::metrics as entropy_metrics;
81use crate::devices::virtio::vhost_user_metrics;
82use crate::devices::virtio::vsock::metrics as vsock_metrics;
83
84/// Static instance used for handling metrics.
85pub static METRICS: Metrics<FirecrackerMetrics, FcLineWriter> =
86    Metrics::<FirecrackerMetrics, FcLineWriter>::new(FirecrackerMetrics::new());
87
88/// Metrics system.
89// All member fields have types which are Sync, and exhibit interior mutability, so
90// we can call operations on metrics using a non-mut static global variable.
91#[derive(Debug)]
92pub struct Metrics<T: Serialize, M: Write + Send> {
93    // Metrics will get flushed here.
94    metrics_buf: OnceLock<Mutex<M>>,
95    pub app_metrics: T,
96}
97
98impl<T: Serialize + Debug, M: Write + Send + Debug> Metrics<T, M> {
99    /// Creates a new instance of the current metrics.
100    pub const fn new(app_metrics: T) -> Metrics<T, M> {
101        Metrics {
102            metrics_buf: OnceLock::new(),
103            app_metrics,
104        }
105    }
106
107    /// Initialize metrics system (once and only once).
108    /// Every call made after the first will have no effect besides returning `Ok` or `Err`.
109    ///
110    /// This function is supposed to be called only from a single thread, once.
111    /// It is not thread-safe and is not meant to be used in a multithreaded
112    /// scenario. The reason `is_initialized` is an `AtomicBool` instead of
113    /// just a `bool` is that `lazy_static` enforces thread-safety on all its
114    /// members.
115    ///
116    /// # Arguments
117    ///
118    /// * `metrics_dest` - Buffer for JSON formatted metrics. Needs to implement `Write` and `Send`.
119    pub fn init(&self, metrics_dest: M) -> Result<(), MetricsError> {
120        self.metrics_buf
121            .set(Mutex::new(metrics_dest))
122            .map_err(|_| MetricsError::AlreadyInitialized)
123    }
124
125    /// Writes metrics to the destination provided as argument upon initialization of the metrics.
126    /// Upon failure, an error is returned if metrics system is initialized and metrics could not be
127    /// written.
128    /// Upon success, the function will return `True` (if metrics system was initialized and metrics
129    /// were successfully written to disk) or `False` (if metrics system was not yet initialized).
130    ///
131    /// This function is usually supposed to be called only from a single thread and
132    /// is not meant to be used in a multithreaded scenario. The reason
133    /// `metrics_buf` is enclosed in a `Mutex` is that `lazy_static` enforces
134    /// thread-safety on all its members.
135    /// The only exception is for signal handlers that result in process exit, which may be run on
136    /// any thread. To prevent the race condition present in the serialisation step of
137    /// SharedIncMetrics, deadly signals use SharedStoreMetrics instead (which have a thread-safe
138    /// serialise implementation).
139    /// The only known caveat is that other metrics may not be properly written before exiting from
140    /// a signal handler. We make this compromise since the process will be killed anyway and the
141    /// important metric in this case is the signal one.
142    /// The alternative is to hold a Mutex over the entire function call, but this increases the
143    /// known deadlock potential.
144    pub fn write(&self) -> Result<bool, MetricsError> {
145        if let Some(lock) = self.metrics_buf.get() {
146            let mut writer = lock.lock().expect("poisoned lock");
147            serde_json::to_writer(writer.by_ref(), &self.app_metrics)
148                .map_err(|err| MetricsError::Serde(err.to_string()))?;
149            writer.write_all(b"\n").map_err(MetricsError::Write)?;
150            Ok(true)
151        } else {
152            // If the metrics are not initialized, no error is thrown but we do let the user know
153            // that metrics were not written.
154            Ok(false)
155        }
156    }
157}
158
159impl<T: Serialize + Debug, M: Write + Send + Debug> Deref for Metrics<T, M> {
160    type Target = T;
161
162    fn deref(&self) -> &Self::Target {
163        &self.app_metrics
164    }
165}
166
/// Describes the errors which may occur while handling metrics scenarios.
// NOTE: `displaydoc::Display` builds each variant's `Display` output from the variant's doc
// comment, so the `///` lines on the variants below are user-visible runtime strings. Keep
// explanatory remarks in plain `//` comments like this one.
#[derive(Debug, thiserror::Error, displaydoc::Display)]
pub enum MetricsError {
    // Displays the wrapped message verbatim.
    /// {0}
    NeverInitialized(String),
    // Returned by `Metrics::init` when a destination has already been set.
    /// Reinitialization of metrics not allowed.
    AlreadyInitialized,
    // Carries the stringified serialization error produced in `Metrics::write`.
    /// {0}
    Serde(String),
    // Wraps the I/O error hit in `Metrics::write` while writing the trailing newline.
    /// Failed to write metrics: {0}
    Write(std::io::Error),
}
179
/// Interface for metrics that behave as counters, i.e. metrics which are continuously updated
/// by adding to their value.
pub trait IncMetric {
    /// Adds `value` to the current counter.
    fn add(&self, value: u64);

    /// Returns the current value of the counter.
    fn count(&self) -> u64;

    /// Returns the difference between the current and the old value of the counter.
    /// Mostly used in the process of aggregating per-device metrics.
    fn fetch_diff(&self) -> u64;

    /// Increments the current counter by one unit.
    fn inc(&self) {
        self.add(1);
    }
}
196
/// Interface for metrics that do not count anything but act as a persistent indicator:
/// the last stored value is what gets reported.
pub trait StoreMetric {
    /// Replaces the held value with `value`.
    fn store(&self, value: u64);
    /// Returns the value currently held by the metric.
    fn fetch(&self) -> u64;
}
205
/// A counter metric that may be incremented from more than one thread, hence backed by atomics.
// Currently used for vCPU metrics. The alternative would be one instance of every metric per
// thread, aggregated at write time; that is probably overkill unless many vCPUs increment
// metrics very often, but it remains an option should we ever need it.
//
// Two values are kept per metric so that the counter can be "reset" on every flush:
// * field 0 - the current value, which is the one being updated;
// * field 1 - the old value, which takes over the current value whenever the metrics are
//   flushed to disk.
#[derive(Debug, Default)]
pub struct SharedIncMetric(AtomicU64, AtomicU64);
impl SharedIncMetric {
    /// `const` equivalent of `Default::default()`: both values start at zero.
    pub const fn new() -> Self {
        SharedIncMetric(AtomicU64::new(0), AtomicU64::new(0))
    }
}
224
/// A metric holding a single value that may be read and written from more than one thread,
/// hence backed by an atomic.
#[derive(Debug, Default)]
pub struct SharedStoreMetric(AtomicU64);
impl SharedStoreMetric {
    /// `const` equivalent of `Default::default()`: the value starts at zero.
    pub const fn new() -> Self {
        SharedStoreMetric(AtomicU64::new(0))
    }
}
235
236impl IncMetric for SharedIncMetric {
237    // While the order specified for this operation is still Relaxed, the actual instruction will
238    // be an asm "LOCK; something" and thus atomic across multiple threads, simply because of the
239    // fetch_and_add (as opposed to "store(load() + 1)") implementation for atomics.
240    // TODO: would a stronger ordering make a difference here?
241    fn add(&self, value: u64) {
242        self.0.fetch_add(value, Ordering::Relaxed);
243    }
244
245    fn count(&self) -> u64 {
246        self.0.load(Ordering::Relaxed)
247    }
248    fn fetch_diff(&self) -> u64 {
249        self.0.load(Ordering::Relaxed) - self.1.load(Ordering::Relaxed)
250    }
251}
252
253impl StoreMetric for SharedStoreMetric {
254    fn fetch(&self) -> u64 {
255        self.0.load(Ordering::Relaxed)
256    }
257
258    fn store(&self, value: u64) {
259        self.0.store(value, Ordering::Relaxed);
260    }
261}
262
263impl Serialize for SharedIncMetric {
264    /// Reset counters of each metrics. Here we suppose that Serialize's goal is to help with the
265    /// flushing of metrics.
266    /// !!! Any print of the metrics will also reset them. Use with caution !!!
267    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
268        let snapshot = self.0.load(Ordering::Relaxed);
269        let res = serializer.serialize_u64(snapshot - self.1.load(Ordering::Relaxed));
270
271        if res.is_ok() {
272            self.1.store(snapshot, Ordering::Relaxed);
273        }
274        res
275    }
276}
277
278impl Serialize for SharedStoreMetric {
279    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
280        serializer.serialize_u64(self.0.load(Ordering::Relaxed))
281    }
282}
283
284/// Reporter object which computes the process wall time and
285/// process CPU time and populates the metric with the results.
286#[derive(Debug)]
287pub struct ProcessTimeReporter {
288    // Process start time in us.
289    start_time_us: Option<u64>,
290    // Process CPU start time in us.
291    start_time_cpu_us: Option<u64>,
292    // Firecracker's parent process CPU time.
293    parent_cpu_time_us: Option<u64>,
294}
295
296impl ProcessTimeReporter {
297    /// Constructor for the process time-related reporter.
298    pub fn new(
299        start_time_us: Option<u64>,
300        start_time_cpu_us: Option<u64>,
301        parent_cpu_time_us: Option<u64>,
302    ) -> ProcessTimeReporter {
303        ProcessTimeReporter {
304            start_time_us,
305            start_time_cpu_us,
306            parent_cpu_time_us,
307        }
308    }
309
310    /// Obtain process start time in microseconds.
311    pub fn report_start_time(&self) {
312        if let Some(start_time) = self.start_time_us {
313            let delta_us = get_time_us(ClockType::Monotonic) - start_time;
314            METRICS.api_server.process_startup_time_us.store(delta_us);
315        }
316    }
317
318    /// Obtain process CPU start time in microseconds.
319    pub fn report_cpu_start_time(&self) {
320        if let Some(cpu_start_time) = self.start_time_cpu_us {
321            let delta_us = get_time_us(ClockType::ProcessCpu) - cpu_start_time
322                + self.parent_cpu_time_us.unwrap_or_default();
323            METRICS
324                .api_server
325                .process_startup_time_cpu_us
326                .store(delta_us);
327        }
328    }
329}
330
331// The following structs are used to define a certain organization for the set of metrics we
332// are interested in. Whenever the name of a field differs from its ideal textual representation
333// in the serialized form, we can use the #[serde(rename = "name")] attribute to, well, rename it.
334
335/// Metrics related to the internal API server.
336#[derive(Debug, Default, Serialize)]
337pub struct ApiServerMetrics {
338    /// Measures the process's startup time in microseconds.
339    pub process_startup_time_us: SharedStoreMetric,
340    /// Measures the cpu's startup time in microseconds.
341    pub process_startup_time_cpu_us: SharedStoreMetric,
342}
343impl ApiServerMetrics {
344    /// Const default construction.
345    pub const fn new() -> Self {
346        Self {
347            process_startup_time_us: SharedStoreMetric::new(),
348            process_startup_time_cpu_us: SharedStoreMetric::new(),
349        }
350    }
351}
352
353/// Metrics specific to GET API Requests for counting user triggered actions and/or failures.
354#[derive(Debug, Default, Serialize)]
355pub struct GetRequestsMetrics {
356    /// Number of GETs for getting information on the instance.
357    pub instance_info_count: SharedIncMetric,
358    /// Number of GETs for getting status on attaching machine configuration.
359    pub machine_cfg_count: SharedIncMetric,
360    /// Number of GETs for getting mmds.
361    pub mmds_count: SharedIncMetric,
362    /// Number of GETs for getting the VMM version.
363    pub vmm_version_count: SharedIncMetric,
364    /// Number of GETs for getting hotpluggable memory status.
365    pub hotplug_memory_count: SharedIncMetric,
366}
367impl GetRequestsMetrics {
368    /// Const default construction.
369    pub const fn new() -> Self {
370        Self {
371            instance_info_count: SharedIncMetric::new(),
372            machine_cfg_count: SharedIncMetric::new(),
373            mmds_count: SharedIncMetric::new(),
374            vmm_version_count: SharedIncMetric::new(),
375            hotplug_memory_count: SharedIncMetric::new(),
376        }
377    }
378}
379
380/// Metrics specific to PUT API Requests for counting user triggered actions and/or failures.
381#[derive(Debug, Default, Serialize)]
382pub struct PutRequestsMetrics {
383    /// Number of PUTs triggering an action on the VM.
384    pub actions_count: SharedIncMetric,
385    /// Number of failures in triggering an action on the VM.
386    pub actions_fails: SharedIncMetric,
387    /// Number of PUTs for attaching source of boot.
388    pub boot_source_count: SharedIncMetric,
389    /// Number of failures during attaching source of boot.
390    pub boot_source_fails: SharedIncMetric,
391    /// Number of PUTs triggering a block attach.
392    pub drive_count: SharedIncMetric,
393    /// Number of failures in attaching a block device.
394    pub drive_fails: SharedIncMetric,
395    /// Number of PUTs for initializing the logging system.
396    pub logger_count: SharedIncMetric,
397    /// Number of failures in initializing the logging system.
398    pub logger_fails: SharedIncMetric,
399    /// Number of PUTs for configuring the machine.
400    pub machine_cfg_count: SharedIncMetric,
401    /// Number of failures in configuring the machine.
402    pub machine_cfg_fails: SharedIncMetric,
403    /// Number of PUTs for configuring a guest's vCPUs.
404    pub cpu_cfg_count: SharedIncMetric,
405    /// Number of failures in configuring a guest's vCPUs.
406    pub cpu_cfg_fails: SharedIncMetric,
407    /// Number of PUTs for initializing the metrics system.
408    pub metrics_count: SharedIncMetric,
409    /// Number of failures in initializing the metrics system.
410    pub metrics_fails: SharedIncMetric,
411    /// Number of PUTs for creating a new network interface.
412    pub network_count: SharedIncMetric,
413    /// Number of failures in creating a new network interface.
414    pub network_fails: SharedIncMetric,
415    /// Number of PUTs for creating mmds.
416    pub mmds_count: SharedIncMetric,
417    /// Number of failures in creating a new mmds.
418    pub mmds_fails: SharedIncMetric,
419    /// Number of PUTs for creating a vsock device.
420    pub vsock_count: SharedIncMetric,
421    /// Number of failures in creating a vsock device.
422    pub vsock_fails: SharedIncMetric,
423    /// Number of PUTs triggering a pmem attach.
424    pub pmem_count: SharedIncMetric,
425    /// Number of failures in attaching a pmem device.
426    pub pmem_fails: SharedIncMetric,
427    /// Number of PUTs to /serial
428    pub serial_count: SharedIncMetric,
429    /// Number of failed PUTs to /serial
430    pub serial_fails: SharedIncMetric,
431    /// Number of PUTs to /hotplug/memory
432    pub hotplug_memory_count: SharedIncMetric,
433    /// Number of failed PUTs to /hotplug/memory
434    pub hotplug_memory_fails: SharedIncMetric,
435}
436impl PutRequestsMetrics {
437    /// Const default construction.
438    pub const fn new() -> Self {
439        Self {
440            actions_count: SharedIncMetric::new(),
441            actions_fails: SharedIncMetric::new(),
442            boot_source_count: SharedIncMetric::new(),
443            boot_source_fails: SharedIncMetric::new(),
444            drive_count: SharedIncMetric::new(),
445            drive_fails: SharedIncMetric::new(),
446            logger_count: SharedIncMetric::new(),
447            logger_fails: SharedIncMetric::new(),
448            machine_cfg_count: SharedIncMetric::new(),
449            machine_cfg_fails: SharedIncMetric::new(),
450            cpu_cfg_count: SharedIncMetric::new(),
451            cpu_cfg_fails: SharedIncMetric::new(),
452            metrics_count: SharedIncMetric::new(),
453            metrics_fails: SharedIncMetric::new(),
454            network_count: SharedIncMetric::new(),
455            network_fails: SharedIncMetric::new(),
456            mmds_count: SharedIncMetric::new(),
457            mmds_fails: SharedIncMetric::new(),
458            vsock_count: SharedIncMetric::new(),
459            vsock_fails: SharedIncMetric::new(),
460            pmem_count: SharedIncMetric::new(),
461            pmem_fails: SharedIncMetric::new(),
462            serial_count: SharedIncMetric::new(),
463            serial_fails: SharedIncMetric::new(),
464            hotplug_memory_count: SharedIncMetric::new(),
465            hotplug_memory_fails: SharedIncMetric::new(),
466        }
467    }
468}
469
470/// Metrics specific to PATCH API Requests for counting user triggered actions and/or failures.
471#[derive(Debug, Default, Serialize)]
472pub struct PatchRequestsMetrics {
473    /// Number of tries to PATCH a block device.
474    pub drive_count: SharedIncMetric,
475    /// Number of failures in PATCHing a block device.
476    pub drive_fails: SharedIncMetric,
477    /// Number of tries to PATCH a net device.
478    pub network_count: SharedIncMetric,
479    /// Number of failures in PATCHing a net device.
480    pub network_fails: SharedIncMetric,
481    /// Number of PATCHs for configuring the machine.
482    pub machine_cfg_count: SharedIncMetric,
483    /// Number of failures in configuring the machine.
484    pub machine_cfg_fails: SharedIncMetric,
485    /// Number of tries to PATCH an mmds.
486    pub mmds_count: SharedIncMetric,
487    /// Number of failures in PATCHing an mmds.
488    pub mmds_fails: SharedIncMetric,
489    /// Number of PATCHes to /hotplug/memory
490    pub hotplug_memory_count: SharedIncMetric,
491    /// Number of failed PATCHes to /hotplug/memory
492    pub hotplug_memory_fails: SharedIncMetric,
493}
494impl PatchRequestsMetrics {
495    /// Const default construction.
496    pub const fn new() -> Self {
497        Self {
498            drive_count: SharedIncMetric::new(),
499            drive_fails: SharedIncMetric::new(),
500            network_count: SharedIncMetric::new(),
501            network_fails: SharedIncMetric::new(),
502            machine_cfg_count: SharedIncMetric::new(),
503            machine_cfg_fails: SharedIncMetric::new(),
504            mmds_count: SharedIncMetric::new(),
505            mmds_fails: SharedIncMetric::new(),
506            hotplug_memory_count: SharedIncMetric::new(),
507            hotplug_memory_fails: SharedIncMetric::new(),
508        }
509    }
510}
511
512/// Metrics related to deprecated user-facing API calls.
513#[derive(Debug, Default, Serialize)]
514pub struct DeprecatedApiMetrics {
515    /// Total number of calls to deprecated HTTP endpoints.
516    pub deprecated_http_api_calls: SharedIncMetric,
517}
518impl DeprecatedApiMetrics {
519    /// Const default construction.
520    pub const fn new() -> Self {
521        Self {
522            deprecated_http_api_calls: SharedIncMetric::new(),
523        }
524    }
525}
526
527/// Metrics for the logging subsystem.
528#[derive(Debug, Default, Serialize)]
529pub struct LoggerSystemMetrics {
530    /// Number of misses on flushing metrics.
531    pub missed_metrics_count: SharedIncMetric,
532    /// Number of errors during metrics handling.
533    pub metrics_fails: SharedIncMetric,
534    /// Number of misses on logging human readable content.
535    pub missed_log_count: SharedIncMetric,
536}
537impl LoggerSystemMetrics {
538    /// Const default construction.
539    pub const fn new() -> Self {
540        Self {
541            missed_metrics_count: SharedIncMetric::new(),
542            metrics_fails: SharedIncMetric::new(),
543            missed_log_count: SharedIncMetric::new(),
544        }
545    }
546}
547
548/// Metrics for the MMDS functionality.
549#[derive(Debug, Default, Serialize)]
550pub struct MmdsMetrics {
551    /// Number of frames rerouted to MMDS.
552    pub rx_accepted: SharedIncMetric,
553    /// Number of errors while handling a frame through MMDS.
554    pub rx_accepted_err: SharedIncMetric,
555    /// Number of uncommon events encountered while processing packets through MMDS.
556    pub rx_accepted_unusual: SharedIncMetric,
557    /// The number of buffers which couldn't be parsed as valid Ethernet frames by the MMDS.
558    pub rx_bad_eth: SharedIncMetric,
559    /// The number of GET requests with invalid tokens.
560    pub rx_invalid_token: SharedIncMetric,
561    /// The number of GET requests with no tokens.
562    pub rx_no_token: SharedIncMetric,
563    /// The total number of successful receive operations by the MMDS.
564    pub rx_count: SharedIncMetric,
565    /// The total number of bytes sent by the MMDS.
566    pub tx_bytes: SharedIncMetric,
567    /// The total number of successful send operations by the MMDS.
568    pub tx_count: SharedIncMetric,
569    /// The number of errors raised by the MMDS while attempting to send frames/packets/segments.
570    pub tx_errors: SharedIncMetric,
571    /// The number of frames sent by the MMDS.
572    pub tx_frames: SharedIncMetric,
573    /// The number of connections successfully accepted by the MMDS TCP handler.
574    pub connections_created: SharedIncMetric,
575    /// The number of connections cleaned up by the MMDS TCP handler.
576    pub connections_destroyed: SharedIncMetric,
577}
578impl MmdsMetrics {
579    /// Const default construction.
580    pub const fn new() -> Self {
581        Self {
582            rx_accepted: SharedIncMetric::new(),
583            rx_accepted_err: SharedIncMetric::new(),
584            rx_accepted_unusual: SharedIncMetric::new(),
585            rx_bad_eth: SharedIncMetric::new(),
586            rx_invalid_token: SharedIncMetric::new(),
587            rx_no_token: SharedIncMetric::new(),
588            rx_count: SharedIncMetric::new(),
589            tx_bytes: SharedIncMetric::new(),
590            tx_count: SharedIncMetric::new(),
591            tx_errors: SharedIncMetric::new(),
592            tx_frames: SharedIncMetric::new(),
593            connections_created: SharedIncMetric::new(),
594            connections_destroyed: SharedIncMetric::new(),
595        }
596    }
597}
598
599/// Performance metrics related for the moment only to snapshots.
600// These store the duration of creating/loading a snapshot and of
601// pausing/resuming the microVM.
602// If there are more than one `/snapshot/create` request in a minute
603// (until the metrics are flushed), only the duration of the last
604// snapshot creation is stored in the metric. If the user is interested
605// in all the durations, a `FlushMetrics` request should be sent after
606// each `create` request.
607#[derive(Debug, Default, Serialize)]
608pub struct PerformanceMetrics {
609    /// Measures the snapshot full create time, at the API (user) level, in microseconds.
610    pub full_create_snapshot: SharedStoreMetric,
611    /// Measures the snapshot diff create time, at the API (user) level, in microseconds.
612    pub diff_create_snapshot: SharedStoreMetric,
613    /// Measures the snapshot load time, at the API (user) level, in microseconds.
614    pub load_snapshot: SharedStoreMetric,
615    /// Measures the microVM pausing duration, at the API (user) level, in microseconds.
616    pub pause_vm: SharedStoreMetric,
617    /// Measures the microVM resuming duration, at the API (user) level, in microseconds.
618    pub resume_vm: SharedStoreMetric,
619    /// Measures the snapshot full create time, at the VMM level, in microseconds.
620    pub vmm_full_create_snapshot: SharedStoreMetric,
621    /// Measures the snapshot diff create time, at the VMM level, in microseconds.
622    pub vmm_diff_create_snapshot: SharedStoreMetric,
623    /// Measures the snapshot load time, at the VMM level, in microseconds.
624    pub vmm_load_snapshot: SharedStoreMetric,
625    /// Measures the microVM pausing duration, at the VMM level, in microseconds.
626    pub vmm_pause_vm: SharedStoreMetric,
627    /// Measures the microVM resuming duration, at the VMM level, in microseconds.
628    pub vmm_resume_vm: SharedStoreMetric,
629}
630impl PerformanceMetrics {
631    /// Const default construction.
632    pub const fn new() -> Self {
633        Self {
634            full_create_snapshot: SharedStoreMetric::new(),
635            diff_create_snapshot: SharedStoreMetric::new(),
636            load_snapshot: SharedStoreMetric::new(),
637            pause_vm: SharedStoreMetric::new(),
638            resume_vm: SharedStoreMetric::new(),
639            vmm_full_create_snapshot: SharedStoreMetric::new(),
640            vmm_diff_create_snapshot: SharedStoreMetric::new(),
641            vmm_load_snapshot: SharedStoreMetric::new(),
642            vmm_pause_vm: SharedStoreMetric::new(),
643            vmm_resume_vm: SharedStoreMetric::new(),
644        }
645    }
646}
647
648/// Metrics for the seccomp filtering.
649#[derive(Debug, Default, Serialize)]
650pub struct SeccompMetrics {
651    /// Number of errors inside the seccomp filtering.
652    pub num_faults: SharedStoreMetric,
653}
654impl SeccompMetrics {
655    /// Const default construction.
656    pub const fn new() -> Self {
657        Self {
658            num_faults: SharedStoreMetric::new(),
659        }
660    }
661}
662
663/// Metrics related to signals.
664/// Deadly signals must be of `SharedStoreMetric` type, since they can ever be either 0 or 1.
665/// This avoids a tricky race condition caused by the unatomic serialize method of
666/// `SharedIncMetric`, between two threads calling `METRICS.write()`.
667#[derive(Debug, Default, Serialize)]
668pub struct SignalMetrics {
669    /// Number of times that SIGBUS was handled.
670    pub sigbus: SharedStoreMetric,
671    /// Number of times that SIGSEGV was handled.
672    pub sigsegv: SharedStoreMetric,
673    /// Number of times that SIGXFSZ was handled.
674    pub sigxfsz: SharedStoreMetric,
675    /// Number of times that SIGXCPU was handled.
676    pub sigxcpu: SharedStoreMetric,
677    /// Number of times that SIGPIPE was handled.
678    pub sigpipe: SharedIncMetric,
679    /// Number of times that SIGHUP was handled.
680    pub sighup: SharedStoreMetric,
681    /// Number of times that SIGILL was handled.
682    pub sigill: SharedStoreMetric,
683}
684impl SignalMetrics {
685    /// Const default construction.
686    pub const fn new() -> Self {
687        Self {
688            sigbus: SharedStoreMetric::new(),
689            sigsegv: SharedStoreMetric::new(),
690            sigxfsz: SharedStoreMetric::new(),
691            sigxcpu: SharedStoreMetric::new(),
692            sigpipe: SharedIncMetric::new(),
693            sighup: SharedStoreMetric::new(),
694            sigill: SharedStoreMetric::new(),
695        }
696    }
697}
698
699/// Provides efficient way to record LatencyAggregateMetrics
700#[derive(Debug)]
701pub struct LatencyMetricsRecorder<'a> {
702    start_time: u64,
703    metric: &'a LatencyAggregateMetrics,
704}
705
706impl<'a> LatencyMetricsRecorder<'a> {
707    /// Const default construction.
708    fn new(metric: &'a LatencyAggregateMetrics) -> Self {
709        Self {
710            start_time: get_time_us(ClockType::Monotonic),
711            metric,
712        }
713    }
714}
715impl Drop for LatencyMetricsRecorder<'_> {
716    /// records aggregate (min/max/sum) for the given metric
717    /// This captures delta between self.start_time and current time
718    /// and updates min/max/sum metrics.
719    ///  self.start_time is recorded in new() and metrics are updated in drop
720    fn drop(&mut self) {
721        let delta_us = get_time_us(ClockType::Monotonic) - self.start_time;
722        self.metric.sum_us.add(delta_us);
723        let min_us = self.metric.min_us.fetch();
724        let max_us = self.metric.max_us.fetch();
725        if (0 == min_us) || (min_us > delta_us) {
726            self.metric.min_us.store(delta_us);
727        }
728        if (0 == max_us) || (max_us < delta_us) {
729            self.metric.max_us.store(delta_us);
730        }
731    }
732}
733
734/// Used to record Aggregate (min/max/sum) of latency metrics
735#[derive(Debug, Default, Serialize)]
736pub struct LatencyAggregateMetrics {
737    /// represents minimum value of the metrics in microseconds
738    pub min_us: SharedStoreMetric,
739    /// represents maximum value of the metrics in microseconds
740    pub max_us: SharedStoreMetric,
741    /// represents sum of the metrics in microseconds
742    pub sum_us: SharedIncMetric,
743}
744impl LatencyAggregateMetrics {
745    /// Const default construction.
746    pub const fn new() -> Self {
747        Self {
748            min_us: SharedStoreMetric::new(),
749            max_us: SharedStoreMetric::new(),
750            sum_us: SharedIncMetric::new(),
751        }
752    }
753
754    /// returns a latency recorder which captures stores start_time
755    /// and updates the actual metrics at the end of recorders lifetime.
756    /// in short instead of below 2 lines :
757    /// 1st for start_time_us = get_time_us()
758    /// 2nd for delta_time_us = get_time_us() - start_time; and metrics.store(delta_time_us)
759    /// we have just `_m = metrics.record_latency_metrics()`
760    pub fn record_latency_metrics(&self) -> LatencyMetricsRecorder<'_> {
761        LatencyMetricsRecorder::new(self)
762    }
763}
764
765/// Structure provides Metrics specific to VCPUs' mode of functioning.
766/// Sample_count or number of kvm exits for IO and MMIO VM exits are covered by:
767/// `exit_io_in`, `exit_io_out`, `exit_mmio_read` and , `exit_mmio_write`.
768/// Count of other vm exits for events like shutdown/hlt/errors are
769/// covered by existing "failures" metric.
770/// The only vm exit for which sample_count is not covered is system
771/// event reset/shutdown but that should be fine since they are not
772/// failures and the vm is terminated anyways.
773/// LatencyAggregateMetrics only covers minimum, maximum and sum
774/// because average can be deduced from available metrics. e.g.
775/// dividing `exit_io_in_agg.sum_us` by exit_io_in` gives average of KVM exits handling input IO.
776#[derive(Debug, Default, Serialize)]
777pub struct VcpuMetrics {
778    /// Number of KVM exits for handling input IO.
779    pub exit_io_in: SharedIncMetric,
780    /// Number of KVM exits for handling output IO.
781    pub exit_io_out: SharedIncMetric,
782    /// Number of KVM exits for handling MMIO reads.
783    pub exit_mmio_read: SharedIncMetric,
784    /// Number of KVM exits for handling MMIO writes.
785    pub exit_mmio_write: SharedIncMetric,
786    /// Number of errors during this VCPU's run.
787    pub failures: SharedIncMetric,
788    /// Number of times that the `KVM_KVMCLOCK_CTRL` ioctl failed.
789    pub kvmclock_ctrl_fails: SharedIncMetric,
790    /// Provides Min/max/sum for KVM exits handling input IO.
791    pub exit_io_in_agg: LatencyAggregateMetrics,
792    /// Provides Min/max/sum for KVM exits handling output IO.
793    pub exit_io_out_agg: LatencyAggregateMetrics,
794    /// Provides Min/max/sum for KVM exits handling MMIO reads.
795    pub exit_mmio_read_agg: LatencyAggregateMetrics,
796    /// Provides Min/max/sum for KVM exits handling MMIO writes.
797    pub exit_mmio_write_agg: LatencyAggregateMetrics,
798}
799impl VcpuMetrics {
800    /// Const default construction.
801    pub const fn new() -> Self {
802        Self {
803            exit_io_in: SharedIncMetric::new(),
804            exit_io_out: SharedIncMetric::new(),
805            exit_mmio_read: SharedIncMetric::new(),
806            exit_mmio_write: SharedIncMetric::new(),
807            failures: SharedIncMetric::new(),
808            kvmclock_ctrl_fails: SharedIncMetric::new(),
809            exit_io_in_agg: LatencyAggregateMetrics::new(),
810            exit_io_out_agg: LatencyAggregateMetrics::new(),
811            exit_mmio_read_agg: LatencyAggregateMetrics::new(),
812            exit_mmio_write_agg: LatencyAggregateMetrics::new(),
813        }
814    }
815}
816
817/// MicroVM interrupt-related metrics
818#[derive(Debug, Default, Serialize)]
819pub struct InterruptMetrics {
820    /// Number of interrupt triggers
821    pub triggers: SharedIncMetric,
822    /// Configuration updates
823    pub config_updates: SharedIncMetric,
824}
825
826impl InterruptMetrics {
827    /// Const default construction.
828    pub const fn new() -> Self {
829        Self {
830            triggers: SharedIncMetric::new(),
831            config_updates: SharedIncMetric::new(),
832        }
833    }
834}
835
836/// Metrics specific to the machine manager as a whole.
837#[derive(Debug, Default, Serialize)]
838pub struct VmmMetrics {
839    /// Metric for signaling a panic has occurred.
840    pub panic_count: SharedStoreMetric,
841}
842impl VmmMetrics {
843    /// Const default construction.
844    pub const fn new() -> Self {
845        Self {
846            panic_count: SharedStoreMetric::new(),
847        }
848    }
849}
850
851// The sole purpose of this struct is to produce an UTC timestamp when an instance is serialized.
852#[derive(Debug, Default)]
853struct SerializeToUtcTimestampMs;
854impl SerializeToUtcTimestampMs {
855    /// Const default construction.
856    pub const fn new() -> Self {
857        SerializeToUtcTimestampMs
858    }
859}
860
861impl Serialize for SerializeToUtcTimestampMs {
862    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
863        serializer.serialize_i64(i64::try_from(get_time_ns(ClockType::Real) / 1_000_000).unwrap())
864    }
865}
866
867macro_rules! create_serialize_proxy {
868    // By using the below structure in FirecrackerMetrics it is easy
869    // to serialise Firecracker app_metrics as a single json object which
870    // otherwise would have required extra string manipulation to pack
871    // $metric_mod as part of the same json object as FirecrackerMetrics.
872    ($proxy_struct:ident, $metric_mod:ident) => {
873        #[derive(Default, Debug)]
874        pub struct $proxy_struct;
875
876        impl Serialize for $proxy_struct {
877            fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
878            where
879                S: Serializer,
880            {
881                $metric_mod::flush_metrics(serializer)
882            }
883        }
884    };
885}
886
887create_serialize_proxy!(BlockMetricsSerializeProxy, block_metrics);
888create_serialize_proxy!(NetMetricsSerializeProxy, net_metrics);
889create_serialize_proxy!(VhostUserMetricsSerializeProxy, vhost_user_metrics);
890create_serialize_proxy!(BalloonMetricsSerializeProxy, balloon_metrics);
891create_serialize_proxy!(EntropyMetricsSerializeProxy, entropy_metrics);
892create_serialize_proxy!(VsockMetricsSerializeProxy, vsock_metrics);
893create_serialize_proxy!(PmemMetricsSerializeProxy, pmem_metrics);
894create_serialize_proxy!(LegacyDevMetricsSerializeProxy, legacy);
895create_serialize_proxy!(MemoryHotplugSerializeProxy, virtio_mem_metrics);
896
897/// Structure storing all metrics while enforcing serialization support on them.
898#[derive(Debug, Default, Serialize)]
899pub struct FirecrackerMetrics {
900    utc_timestamp_ms: SerializeToUtcTimestampMs,
901    /// API Server related metrics.
902    pub api_server: ApiServerMetrics,
903    #[serde(flatten)]
904    /// A balloon device's related metrics.
905    pub balloon_ser: BalloonMetricsSerializeProxy,
906    #[serde(flatten)]
907    /// A block device's related metrics.
908    pub block_ser: BlockMetricsSerializeProxy,
909    /// Metrics related to deprecated API calls.
910    pub deprecated_api: DeprecatedApiMetrics,
911    /// Metrics related to API GET requests.
912    pub get_api_requests: GetRequestsMetrics,
913    #[serde(flatten)]
914    /// Metrics related to the legacy device.
915    pub legacy_dev_ser: LegacyDevMetricsSerializeProxy,
916    /// Metrics related to performance measurements.
917    pub latencies_us: PerformanceMetrics,
918    /// Logging related metrics.
919    pub logger: LoggerSystemMetrics,
920    /// Metrics specific to MMDS functionality.
921    pub mmds: MmdsMetrics,
922    #[serde(flatten)]
923    /// A network device's related metrics.
924    pub net_ser: NetMetricsSerializeProxy,
925    /// Metrics related to API PATCH requests.
926    pub patch_api_requests: PatchRequestsMetrics,
927    /// Metrics related to API PUT requests.
928    pub put_api_requests: PutRequestsMetrics,
929    /// Metrics related to seccomp filtering.
930    pub seccomp: SeccompMetrics,
931    /// Metrics related to a vcpu's functioning.
932    pub vcpu: VcpuMetrics,
933    /// Metrics related to the virtual machine manager.
934    pub vmm: VmmMetrics,
935    /// Metrics related to signals.
936    pub signals: SignalMetrics,
937    #[serde(flatten)]
938    /// Metrics related to virtio-vsockets.
939    pub vsock_ser: VsockMetricsSerializeProxy,
940    #[serde(flatten)]
941    /// Metrics related to virtio-rng entropy device.
942    pub entropy_ser: EntropyMetricsSerializeProxy,
943    #[serde(flatten)]
944    /// Metrics related to virtio-pmem entropy device.
945    pub pmem_ser: PmemMetricsSerializeProxy,
946    #[serde(flatten)]
947    /// Vhost-user device related metrics.
948    pub vhost_user_ser: VhostUserMetricsSerializeProxy,
949    /// Interrupt related metrics
950    pub interrupts: InterruptMetrics,
951    #[serde(flatten)]
952    /// Virtio-mem device related metrics (memory hotplugging)
953    pub memory_hotplug_ser: MemoryHotplugSerializeProxy,
954}
955impl FirecrackerMetrics {
956    /// Const default construction.
957    pub const fn new() -> Self {
958        Self {
959            utc_timestamp_ms: SerializeToUtcTimestampMs::new(),
960            api_server: ApiServerMetrics::new(),
961            balloon_ser: BalloonMetricsSerializeProxy {},
962            block_ser: BlockMetricsSerializeProxy {},
963            deprecated_api: DeprecatedApiMetrics::new(),
964            get_api_requests: GetRequestsMetrics::new(),
965            legacy_dev_ser: LegacyDevMetricsSerializeProxy {},
966            latencies_us: PerformanceMetrics::new(),
967            logger: LoggerSystemMetrics::new(),
968            mmds: MmdsMetrics::new(),
969            net_ser: NetMetricsSerializeProxy {},
970            patch_api_requests: PatchRequestsMetrics::new(),
971            put_api_requests: PutRequestsMetrics::new(),
972            seccomp: SeccompMetrics::new(),
973            vcpu: VcpuMetrics::new(),
974            vmm: VmmMetrics::new(),
975            signals: SignalMetrics::new(),
976            vsock_ser: VsockMetricsSerializeProxy {},
977            entropy_ser: EntropyMetricsSerializeProxy {},
978            pmem_ser: PmemMetricsSerializeProxy {},
979            vhost_user_ser: VhostUserMetricsSerializeProxy {},
980            interrupts: InterruptMetrics::new(),
981            memory_hotplug_ser: MemoryHotplugSerializeProxy {},
982        }
983    }
984}
985
#[cfg(test)]
mod tests {
    use std::io::{ErrorKind, LineWriter};
    use std::sync::Arc;
    use std::sync::atomic::fence;
    use std::thread;

    use vmm_sys_util::tempfile::TempFile;

    use super::*;

    /// `write()` before `init()` yields `Ok(false)`, the first `init()` succeeds and a second
    /// `init()` is rejected.
    #[test]
    fn test_init() {
        // The vmm_config test `test_init_metrics` also uses the global "METRICS", and the two
        // tests would clash with an already initialized error. Validating init() does not
        // need the global, so a local Metrics instance is used here to avoid the conflict.
        let metrics = Metrics::<_, FcLineWriter>::new(FirecrackerMetrics::new());

        // Writing while the metrics system is not initialized should not throw an error.
        assert!(matches!(metrics.write(), Ok(false)));

        let first_file = TempFile::new().expect("Failed to create temporary metrics file");
        metrics
            .init(LineWriter::new(first_file.into_file()))
            .unwrap();

        metrics.write().unwrap();

        // A second initialization attempt must fail.
        let second_file = TempFile::new().expect("Failed to create temporary metrics file");
        metrics
            .init(LineWriter::new(second_file.into_file()))
            .unwrap_err();
    }

    /// Several threads incrementing one shared metric must not lose any increment.
    #[test]
    fn test_shared_inc_metric() {
        const NUM_THREADS_TO_SPAWN: usize = 4;
        const NUM_INCREMENTS_PER_THREAD: u64 = 100_000;
        const M2_INITIAL_COUNT: u64 = 123;

        let metric = Arc::new(SharedIncMetric::default());
        metric.add(M2_INITIAL_COUNT);

        // A number of threads attempt to increase the metric in parallel. A passing run does
        // not prove the synchronization works, but a failing one definitely shows a problem.
        let workers: Vec<_> = (0..NUM_THREADS_TO_SPAWN)
            .map(|_| {
                let shared = Arc::clone(&metric);
                thread::spawn(move || {
                    for _ in 0..NUM_INCREMENTS_PER_THREAD {
                        shared.inc();
                    }
                })
            })
            .collect();

        for worker in workers {
            worker.join().unwrap();
        }

        let expected_total =
            M2_INITIAL_COUNT + NUM_THREADS_TO_SPAWN as u64 * NUM_INCREMENTS_PER_THREAD;
        assert_eq!(metric.count(), expected_total);
    }

    /// A stored value can be fetched back.
    #[test]
    fn test_shared_store_metric() {
        let metric = Arc::new(SharedStoreMetric::default());
        metric.store(1);
        fence(Ordering::SeqCst);
        assert_eq!(1, metric.fetch());
    }

    /// The default metrics structure serializes to JSON without error.
    #[test]
    fn test_serialize() {
        serde_json::to_string(&FirecrackerMetrics::default()).unwrap();
    }

    /// Pins the `Display` output of every `MetricsError` variant.
    #[test]
    fn test_error_messages() {
        let never_initialized =
            MetricsError::NeverInitialized(String::from("Bad Metrics Path Provided"));
        assert_eq!(never_initialized.to_string(), "Bad Metrics Path Provided");

        assert_eq!(
            MetricsError::AlreadyInitialized.to_string(),
            "Reinitialization of metrics not allowed."
        );

        let write_failure =
            MetricsError::Write(std::io::Error::new(ErrorKind::Interrupted, "write"));
        assert_eq!(write_failure.to_string(), "Failed to write metrics: write");

        let serde_failure =
            MetricsError::Serde("Failed to serialize the given data structure.".to_string());
        assert_eq!(
            serde_failure.to_string(),
            "Failed to serialize the given data structure."
        );
    }
}