vmm/devices/virtio/
vhost_user_metrics.rs

1// Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Defines the metrics system for vhost-user devices.
5//!
6//! # Metrics format
7//! The metrics are flushed in JSON when requested by vmm::logger::metrics::METRICS.write().
8//!
9//! ## JSON example with metrics:
10//! ```json
11//! {
12//!  "vhost_user_{mod}_id0": {
13//!     "activate_fails": "SharedIncMetric",
14//!     "cfg_fails": "SharedIncMetric",
15//!     "init_time_us": SharedStoreMetric,
16//!     "activate_time_us": SharedStoreMetric,
17//!     "config_change_time_us": SharedStoreMetric,
18//!  }
19//!  "vhost_user_{mod}_id1": {
20//!     "activate_fails": "SharedIncMetric",
21//!     "cfg_fails": "SharedIncMetric",
22//!     "init_time_us": SharedStoreMetric,
23//!     "activate_time_us": SharedStoreMetric,
24//!     "config_change_time_us": SharedStoreMetric,
25//!  }
26//!  ...
27//!  "vhost_user_{mod}_idN": {
28//!     "activate_fails": "SharedIncMetric",
29//!     "cfg_fails": "SharedIncMetric",
30//!     "init_time_us": SharedStoreMetric,
31//!     "activate_time_us": SharedStoreMetric,
32//!     "config_change_time_us": SharedStoreMetric,
33//!  }
34//! }
35//! ```
36//! Each `vhost_user` field in the example above is a serializable `VhostUserDeviceMetrics`
37//! structure collecting metrics such as `activate_fails`, `cfg_fails`, `init_time_us`,
38//! `activate_time_us` and `config_change_time_us` for the vhost_user device.
39//! For vhost-user block device having endpoint "/drives/drv0" the emitted metrics would be
40//! `vhost_user_block_drv0`.
41//! For vhost-user block device having endpoint "/drives/drvN" the emitted metrics would be
42//! `vhost_user_block_drvN`.
//! Aggregate metrics for `vhost_user` are `not` emitted as they can be easily obtained in
//! typical observability tools.
45//!
46//! # Design
47//! The main design goals of this system are:
48//! * To improve vhost_user device metrics by logging them at per device granularity.
49//! * `vhost_user` is a new device with no metrics emitted before so, backward compatibility doesn't
50//!   come into picture like it was in the case of block/net devices. And since, metrics can be
51//!   easily aggregated using typical observability tools, we chose not to provide aggregate
52//!   vhost_user metrics.
53//! * Rely on `serde` to provide the actual serialization for writing the metrics.
54//! * Since all metrics start at 0, we implement the `Default` trait via derive for all of them, to
55//!   avoid having to initialize everything by hand.
56//!
57//! * Follow the design of Block and Net device metrics and use a map of vhost_user device name and
58//!   corresponding metrics.
//! * Metrics are flushed with key `vhost_user_{module_specific_name}` and each module sets an
//!   appropriate `module_specific_name` in the format `{mod}_{id}`. e.g. the vhost-user block
//!   device sets this as `format!("{}_{}", "block", config.drive_id)`. This way
//!   vhost_user_metrics stay generic while the specific vhost_user devices can have their unique
//!   metrics.
64//!
//! The system implements 2 types of metrics:
//! * Shared Incremental Metrics (SharedIncMetrics) - dedicated for the metrics which need a counter
//!   (e.g. the number of times activating a device failed). These metrics are reset upon flush.
//! * Shared Store Metrics (SharedStoreMetrics) - are targeted at keeping a persistent value, it is
//!   `not` intended to act as a counter (e.g. for measuring the process start up time).
70//!
71//! We add VhostUserDeviceMetrics entries from vhost_user_metrics::METRICS into vhost_user device
72//! instead of vhost_user device having individual separate VhostUserDeviceMetrics entries because
73//! vhost_user device is not accessible from signal handlers to flush metrics and
74//! vhost_user_metrics::METRICS is.
75
76use std::collections::BTreeMap;
77use std::sync::{Arc, RwLock};
78
79use serde::ser::SerializeMap;
80use serde::{Serialize, Serializer};
81
82use crate::logger::{SharedIncMetric, SharedStoreMetric};
83
84/// map of vhost_user drive id and metrics
85/// this should be protected by a lock before accessing.
86#[allow(missing_debug_implementations)]
87pub struct VhostUserMetricsPerDevice {
88    /// used to access per vhost_user device metrics
89    pub metrics: BTreeMap<String, Arc<VhostUserDeviceMetrics>>,
90}
91
92impl VhostUserMetricsPerDevice {
93    /// Allocate `VhostUserDeviceMetrics` for vhost_user device having
94    /// id `drive_id`. Also, allocate only if it doesn't
95    /// exist to avoid overwriting previously allocated data.
96    /// lock is always initialized so it is safe the unwrap
97    /// the lock without a check.
98    pub fn alloc(drive_id: String) -> Arc<VhostUserDeviceMetrics> {
99        Arc::clone(
100            METRICS
101                .write()
102                .unwrap()
103                .metrics
104                .entry(drive_id)
105                .or_insert_with(|| Arc::new(VhostUserDeviceMetrics::default())),
106        )
107    }
108}
109
/// Pool of vhost_user-related metrics per device, kept behind a lock to
/// keep things thread safe.
///
/// The lock is constructed right here in the static initializer, so it
/// always exists by the time `alloc` or `flush_metrics` acquire it; both
/// call `unwrap()` on the acquisition result, which panics only if the
/// lock has been poisoned.
static METRICS: RwLock<VhostUserMetricsPerDevice> = RwLock::new(VhostUserMetricsPerDevice {
    metrics: BTreeMap::new(),
});
116
117/// This function facilitates serialization of vhost_user device metrics.
118pub fn flush_metrics<S: Serializer>(serializer: S) -> Result<S::Ok, S::Error> {
119    let vhost_user_metrics = METRICS.read().unwrap();
120    let metrics_len = vhost_user_metrics.metrics.len();
121    let mut seq = serializer.serialize_map(Some(metrics_len))?;
122
123    for (name, metrics) in vhost_user_metrics.metrics.iter() {
124        let devn = format!("vhost_user_{}", name);
125        seq.serialize_entry(&devn, metrics)?;
126    }
127    seq.end()
128}
129
130/// vhost_user Device associated metrics.
131#[derive(Debug, Default, Serialize)]
132pub struct VhostUserDeviceMetrics {
133    /// Number of times when activate failed on a vhost_user device.
134    pub activate_fails: SharedIncMetric,
135    /// Number of times when interacting with the space config of a vhost-user device failed.
136    pub cfg_fails: SharedIncMetric,
137    // Vhost-user init time in microseconds.
138    pub init_time_us: SharedStoreMetric,
139    // Vhost-user activate time in microseconds.
140    pub activate_time_us: SharedStoreMetric,
141    // Vhost-user config change time in microseconds.
142    pub config_change_time_us: SharedStoreMetric,
143}
144
#[cfg(test)]
pub mod tests {
    use utils::time::{ClockType, get_time_us};

    use super::*;
    use crate::logger::{IncMetric, StoreMetric};

    // VhostUserDeviceMetrics holds both SharedIncMetric and SharedStoreMetric
    // fields, so this test exercises one field of each kind on a dummy
    // vhost_user_block entry named `vhost_user_block_drvN`.
    // The value stored in `init_time_us` is the measured allocation time, but
    // any number would do; there is no specific reason for that choice.
    // The test then confirms that `vhost_user_metrics::METRICS` really holds
    // an entry for `vhost_user_block_drvN` by comparing its serde_json output
    // with that of an identically filled local struct, which checks the whole
    // struct layout and serialization in one go.
    #[test]
    fn test_vhost_user_basic_metrics() {
        let dev_name = String::from("vhost_user_block_drvN");

        let started_at = get_time_us(ClockType::Monotonic);
        let registered: Arc<VhostUserDeviceMetrics> =
            VhostUserMetricsPerDevice::alloc(dev_name.clone());
        let elapsed_us = get_time_us(ClockType::Monotonic) - started_at;

        registered.activate_fails.inc();
        assert_eq!(registered.activate_fails.count(), 1);

        registered.init_time_us.store(elapsed_us);
        assert_eq!(registered.init_time_us.fetch(), elapsed_us);

        // Mirror the same updates into a standalone struct to compare against
        // the METRICS entry.
        let expected = VhostUserDeviceMetrics::default();
        expected.activate_fails.inc();
        expected.init_time_us.store(elapsed_us);

        // Serializing METRICS also flushes the SharedIncMetric data, which is
        // why the comparison goes through the standalone copy.
        let from_registry: String = {
            let registry = METRICS.read().unwrap();
            serde_json::to_string(&registry.metrics.get(&dev_name)).unwrap()
        };
        let from_local: String = serde_json::to_string(&expected).unwrap();
        assert_eq!(from_local, from_registry);
    }
}
189}