vmm/arch/x86_64/
vcpu.rs

1// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0
3//
4// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
5// Use of this source code is governed by a BSD-style license that can be
6// found in the THIRD-PARTY file.
7
8use std::collections::BTreeMap;
9use std::fmt::Debug;
10use std::sync::Arc;
11
12use kvm_bindings::{
13    CpuId, KVM_MAX_CPUID_ENTRIES, KVM_MAX_MSR_ENTRIES, Msrs, Xsave, kvm_debugregs, kvm_lapic_state,
14    kvm_mp_state, kvm_regs, kvm_sregs, kvm_vcpu_events, kvm_xcrs, kvm_xsave, kvm_xsave2,
15};
16use kvm_ioctls::{VcpuExit, VcpuFd};
17use log::{error, warn};
18use serde::{Deserialize, Serialize};
19use vmm_sys_util::fam::{self, FamStruct};
20
21use crate::arch::EntryPoint;
22use crate::arch::x86_64::generated::msr_index::{MSR_IA32_TSC, MSR_IA32_TSC_DEADLINE};
23use crate::arch::x86_64::interrupts;
24use crate::arch::x86_64::msr::{MsrError, create_boot_msr_entries};
25use crate::arch::x86_64::regs::{SetupFpuError, SetupRegistersError, SetupSpecialRegistersError};
26use crate::cpu_config::x86_64::{CpuConfiguration, cpuid};
27use crate::logger::{IncMetric, METRICS};
28use crate::vstate::bus::Bus;
29use crate::vstate::memory::GuestMemoryMmap;
30use crate::vstate::vcpu::{VcpuConfig, VcpuEmulation, VcpuError};
31use crate::vstate::vm::Vm;
32
// Tolerance for the expected variation of the TSC frequency, expressed as the
// fraction `TSC_KHZ_TOL_NUMERATOR / TSC_KHZ_TOL_DENOMINATOR`.
// The value of 250 parts per million is based on
// the QEMU approach, more details here:
// https://bugzilla.redhat.com/show_bug.cgi?id=1839095
const TSC_KHZ_TOL_NUMERATOR: i64 = 250;
const TSC_KHZ_TOL_DENOMINATOR: i64 = 1_000_000;
39
/// A set of MSRs that should be restored separately, after all other MSRs have already been
/// restored.
///
/// `KvmVcpu::extract_deferred_msrs` checks MSR indices against this list to decide which
/// entries to move out of the regular MSR chunks.
const DEFERRED_MSRS: [u32; 1] = [
    // MSR_IA32_TSC_DEADLINE must be restored after MSR_IA32_TSC, otherwise we risk "losing" timer
    // interrupts across the snapshot restore boundary (due to KVM querying MSR_IA32_TSC upon
    // writes to the TSC_DEADLINE MSR to determine whether it needs to prime a timer - if
    // MSR_IA32_TSC is not initialized correctly, it can wrongly assume no timer needs to be
    // primed, or the timer can be initialized with a wrong expiry).
    MSR_IA32_TSC_DEADLINE,
];
49
/// Errors associated with the wrappers over KVM ioctls.
// NOTE: `displaydoc::Display` is derived below, so the doc comment on each variant doubles as
// that variant's `Display` message (`{0}` is replaced by the variant's payload). Editing a
// variant's doc comment therefore changes the error text.
#[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)]
pub enum KvmVcpuError {
    /// Failed to convert `kvm_bindings::CpuId` to `Cpuid`: {0}
    ConvertCpuidType(#[from] cpuid::CpuidTryFromKvmCpuid),
    /// Failed FamStructWrapper operation: {0}
    Fam(#[from] vmm_sys_util::fam::Error),
    /// Failed to get dumpable MSR index list: {0}
    GetMsrsToDump(#[from] crate::arch::x86_64::msr::MsrError),
    /// Cannot open the VCPU file descriptor: {0}
    VcpuFd(kvm_ioctls::Error),
    /// Failed to get KVM vcpu debug regs: {0}
    VcpuGetDebugRegs(kvm_ioctls::Error),
    /// Failed to get KVM vcpu lapic: {0}
    VcpuGetLapic(kvm_ioctls::Error),
    /// Failed to get KVM vcpu mp state: {0}
    VcpuGetMpState(kvm_ioctls::Error),
    /// Failed to get KVM vcpu msr: {0:#x}
    VcpuGetMsr(u32),
    /// Failed to get KVM vcpu msrs: {0}
    VcpuGetMsrs(kvm_ioctls::Error),
    /// Failed to get KVM vcpu regs: {0}
    VcpuGetRegs(kvm_ioctls::Error),
    /// Failed to get KVM vcpu sregs: {0}
    VcpuGetSregs(kvm_ioctls::Error),
    /// Failed to get KVM vcpu event: {0}
    VcpuGetVcpuEvents(kvm_ioctls::Error),
    /// Failed to get KVM vcpu xcrs: {0}
    VcpuGetXcrs(kvm_ioctls::Error),
    /// Failed to get KVM vcpu xsave via KVM_GET_XSAVE: {0}
    VcpuGetXsave(kvm_ioctls::Error),
    /// Failed to get KVM vcpu xsave via KVM_GET_XSAVE2: {0}
    VcpuGetXsave2(kvm_ioctls::Error),
    /// Failed to get KVM vcpu cpuid: {0}
    VcpuGetCpuid(kvm_ioctls::Error),
    /// Failed to get KVM TSC frequency: {0}
    VcpuGetTsc(kvm_ioctls::Error),
    /// Failed to set KVM vcpu cpuid: {0}
    VcpuSetCpuid(kvm_ioctls::Error),
    /// Failed to set KVM vcpu debug regs: {0}
    VcpuSetDebugRegs(kvm_ioctls::Error),
    /// Failed to set KVM vcpu lapic: {0}
    VcpuSetLapic(kvm_ioctls::Error),
    /// Failed to set KVM vcpu mp state: {0}
    VcpuSetMpState(kvm_ioctls::Error),
    /// Failed to set KVM vcpu msrs: {0}
    VcpuSetMsrs(kvm_ioctls::Error),
    /// Failed to set all KVM MSRs for this vCPU. Only a partial write was done.
    VcpuSetMsrsIncomplete,
    /// Failed to set KVM vcpu regs: {0}
    VcpuSetRegs(kvm_ioctls::Error),
    /// Failed to set KVM vcpu sregs: {0}
    VcpuSetSregs(kvm_ioctls::Error),
    /// Failed to set KVM vcpu event: {0}
    VcpuSetVcpuEvents(kvm_ioctls::Error),
    /// Failed to set KVM vcpu xcrs: {0}
    VcpuSetXcrs(kvm_ioctls::Error),
    /// Failed to set KVM vcpu xsave: {0}
    VcpuSetXsave(kvm_ioctls::Error),
}
110
/// Error type for [`KvmVcpu::get_tsc_khz`] and [`KvmVcpu::is_tsc_scaling_required`].
///
/// Newtype around the errno-based error; its `Display` output is that of the wrapped error.
#[derive(Debug, thiserror::Error, derive_more::From, Eq, PartialEq)]
#[error("{0}")]
pub struct GetTscError(vmm_sys_util::errno::Error);
115
/// Error type for [`KvmVcpu::set_tsc_khz`].
///
/// Newtype around the KVM ioctl error; its `Display` output is that of the wrapped error.
#[derive(Debug, thiserror::Error, Eq, PartialEq)]
#[error("{0}")]
pub struct SetTscError(#[from] kvm_ioctls::Error);
120
/// Error type for [`KvmVcpu::configure`].
// NOTE: `displaydoc::Display` is derived below, so the doc comment on each variant doubles as
// that variant's `Display` message (`{0}` is replaced by the variant's payload). Editing a
// variant's doc comment therefore changes the error text.
#[derive(Debug, thiserror::Error, displaydoc::Display, Eq, PartialEq)]
pub enum KvmVcpuConfigureError {
    /// Failed to convert `Cpuid` to `kvm_bindings::CpuId`: {0}
    ConvertCpuidType(#[from] vmm_sys_util::fam::Error),
    /// Failed to apply modifications to CPUID: {0}
    NormalizeCpuidError(#[from] cpuid::NormalizeCpuidError),
    /// Failed to set CPUID: {0}
    SetCpuid(#[from] vmm_sys_util::errno::Error),
    /// Failed to set MSRs: {0}
    SetMsrs(#[from] MsrError),
    /// Failed to setup registers: {0}
    SetupRegisters(#[from] SetupRegistersError),
    /// Failed to setup FPU: {0}
    SetupFpu(#[from] SetupFpuError),
    /// Failed to setup special registers: {0}
    SetupSpecialRegisters(#[from] SetupSpecialRegistersError),
    /// Failed to configure LAPICs: {0}
    SetLint(#[from] interrupts::InterruptError),
}
141
/// A wrapper around creating and using a kvm x86_64 vcpu.
#[derive(Debug)]
pub struct KvmVcpu {
    /// Index of vcpu.
    pub index: u8,
    /// KVM vcpu fd.
    pub fd: VcpuFd,
    /// Vcpu peripherals, such as buses.
    pub peripherals: Peripherals,
    /// The list of MSRs to include in a VM snapshot, in the same order as KVM returned them
    /// from KVM_GET_MSR_INDEX_LIST.
    ///
    /// [`KvmVcpu::configure`] appends further entries: the MSRs carried by the vCPU
    /// configuration and the ones inferred from the guest CPUID.
    msrs_to_save: Vec<u32>,
    /// Size in bytes required to hold the dynamically-sized `kvm_xsave` struct.
    ///
    /// `None` if `KVM_CAP_XSAVE2` is not supported.
    xsave2_size: Option<usize>,
}
159
/// Vcpu peripherals.
///
/// Both buses are `None` in the `Default` value; they are attached later (see
/// [`KvmVcpu::set_pio_bus`] for the PIO bus).
#[derive(Default, Debug)]
pub struct Peripherals {
    /// Pio bus.
    pub pio_bus: Option<Arc<Bus>>,
    /// Mmio bus.
    pub mmio_bus: Option<Arc<Bus>>,
}
168
169impl KvmVcpu {
170    /// Constructs a new kvm vcpu with arch specific functionality.
171    ///
172    /// # Arguments
173    ///
174    /// * `index` - Represents the 0-based CPU index between [0, max vcpus).
175    /// * `vm` - The vm to which this vcpu will get attached.
176    pub fn new(index: u8, vm: &Vm) -> Result<Self, KvmVcpuError> {
177        let kvm_vcpu = vm
178            .fd()
179            .create_vcpu(index.into())
180            .map_err(KvmVcpuError::VcpuFd)?;
181
182        Ok(KvmVcpu {
183            index,
184            fd: kvm_vcpu,
185            peripherals: Default::default(),
186            msrs_to_save: vm.msrs_to_save().to_vec(),
187            xsave2_size: vm.xsave2_size(),
188        })
189    }
190
191    /// Configures a x86_64 specific vcpu for booting Linux and should be called once per vcpu.
192    ///
193    /// # Arguments
194    ///
195    /// * `guest_mem` - The guest memory used by this microvm.
196    /// * `kernel_entry_point` - Specifies the boot protocol and offset from `guest_mem` at which
197    ///   the kernel starts.
198    /// * `vcpu_config` - The vCPU configuration.
199    /// * `cpuid` - The capabilities exposed by this vCPU.
200    pub fn configure(
201        &mut self,
202        guest_mem: &GuestMemoryMmap,
203        kernel_entry_point: EntryPoint,
204        vcpu_config: &VcpuConfig,
205    ) -> Result<(), KvmVcpuConfigureError> {
206        let mut cpuid = vcpu_config.cpu_config.cpuid.clone();
207
208        // Apply machine specific changes to CPUID.
209        cpuid.normalize(
210            // The index of the current logical CPU in the range [0..cpu_count].
211            self.index,
212            // The total number of logical CPUs.
213            vcpu_config.vcpu_count,
214            // The number of bits needed to enumerate logical CPUs per core.
215            u8::from(vcpu_config.vcpu_count > 1 && vcpu_config.smt),
216        )?;
217
218        // Set CPUID.
219        let kvm_cpuid = kvm_bindings::CpuId::try_from(cpuid)?;
220
221        // Set CPUID in the KVM
222        self.fd
223            .set_cpuid2(&kvm_cpuid)
224            .map_err(KvmVcpuConfigureError::SetCpuid)?;
225
226        // Clone MSR entries that are modified by CPU template from `VcpuConfig`.
227        let mut msrs = vcpu_config.cpu_config.msrs.clone();
228        self.msrs_to_save.extend(msrs.keys());
229
230        // Apply MSR modification to comply the linux boot protocol.
231        create_boot_msr_entries().into_iter().for_each(|entry| {
232            msrs.insert(entry.index, entry.data);
233        });
234
235        // TODO - Add/amend MSRs for vCPUs based on cpu_config
236        // By this point the Guest CPUID is established. Some CPU features require MSRs
237        // to configure and interact with those features. If a MSR is writable from
238        // inside the Guest, or is changed by KVM or Firecracker on behalf of the Guest,
239        // then we will need to save it every time we take a snapshot, and restore its
240        // value when we restore the microVM since the Guest may need that value.
241        // Since CPUID tells us what features are enabled for the Guest, we can infer
242        // the extra MSRs that we need to save based on a dependency map.
243        let extra_msrs = cpuid::common::msrs_to_save_by_cpuid(&kvm_cpuid);
244        self.msrs_to_save.extend(extra_msrs);
245
246        // TODO: Some MSRs depend on values of other MSRs. This dependency will need to
247        // be implemented.
248
249        // By this point we know that at snapshot, the list of MSRs we need to
250        // save is `architectural MSRs` + `MSRs inferred through CPUID` + `other
251        // MSRs defined by the template`
252
253        let kvm_msrs = msrs
254            .into_iter()
255            .map(|entry| kvm_bindings::kvm_msr_entry {
256                index: entry.0,
257                data: entry.1,
258                ..Default::default()
259            })
260            .collect::<Vec<_>>();
261
262        crate::arch::x86_64::msr::set_msrs(&self.fd, &kvm_msrs)?;
263        crate::arch::x86_64::regs::setup_regs(&self.fd, kernel_entry_point)?;
264        crate::arch::x86_64::regs::setup_fpu(&self.fd)?;
265        crate::arch::x86_64::regs::setup_sregs(guest_mem, &self.fd, kernel_entry_point.protocol)?;
266        crate::arch::x86_64::interrupts::set_lint(&self.fd)?;
267        Ok(())
268    }
269
270    /// Sets a Port Mapped IO bus for this vcpu.
271    pub fn set_pio_bus(&mut self, pio_bus: Arc<Bus>) {
272        self.peripherals.pio_bus = Some(pio_bus);
273    }
274
275    /// Calls KVM_KVMCLOCK_CTRL to avoid guest soft lockup watchdog panics on resume.
276    /// See https://docs.kernel.org/virt/kvm/api.html .
277    pub fn kvmclock_ctrl(&self) {
278        // We do not want to fail if the call is not successful, because that may be acceptable
279        // depending on the workload. For example, EINVAL is returned if kvm-clock is not
280        // activated (e.g., no-kvmclock is specified in the guest kernel parameter).
281        // https://elixir.bootlin.com/linux/v6.17.5/source/arch/x86/kvm/x86.c#L5736-L5737
282        if let Err(err) = self.fd.kvmclock_ctrl() {
283            METRICS.vcpu.kvmclock_ctrl_fails.inc();
284            warn!("KVM_KVMCLOCK_CTRL call failed {}", err);
285        }
286    }
287
288    /// Get the current XSAVE state for this vCPU.
289    ///
290    /// The C `kvm_xsave` struct was extended by adding a flexible array member (FAM) in the end
291    /// to support variable-sized XSTATE buffer.
292    ///
293    /// https://elixir.bootlin.com/linux/v6.13.6/source/arch/x86/include/uapi/asm/kvm.h#L381
294    /// ```c
295    /// struct kvm_xsave {
296    ///         __u32 region[1024];
297    ///         __u32 extra[];
298    /// };
299    /// ```
300    ///
301    /// As shown above, the C `kvm_xsave` struct does not have any field for the size of itself or
302    /// the length of its FAM. The required size (in bytes) of `kvm_xsave` struct can be retrieved
303    /// via `KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)`.
304    ///
305    /// kvm-bindings defines `kvm_xsave2` struct that wraps the `kvm_xsave` struct to have `len`
306    /// field that indicates the number of FAM entries (i.e. `extra`), it also defines `Xsave` as
307    /// a `FamStructWrapper` of `kvm_xsave2`.
308    ///
309    /// https://github.com/rust-vmm/kvm/blob/68fff5491703bf32bd35656f7ba994a4cae9ea7d/kvm-bindings/src/x86_64/fam_wrappers.rs#L106
310    /// ```rs
311    /// pub struct kvm_xsave2 {
312    ///     pub len: usize,
313    ///     pub xsave: kvm_xsave,
314    /// }
315    /// ```
316    fn get_xsave(&self) -> Result<Xsave, KvmVcpuError> {
317        match self.xsave2_size {
318            // if `KVM_CAP_XSAVE2` supported
319            Some(xsave2_size) => {
320                // Convert the `kvm_xsave` size in bytes to the length of FAM (i.e. `extra`).
321                let fam_len =
322                    // Calculate the size of FAM (`extra`) area in bytes. Note that the subtraction
323                    // never underflows because `KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)` always returns
324                    // at least 4096 bytes that is the size of `kvm_xsave` without FAM area.
325                    (xsave2_size - std::mem::size_of::<kvm_xsave>())
326                    // Divide by the size of FAM (`extra`) entry (i.e. `__u32`).
327                    .div_ceil(std::mem::size_of::<<kvm_xsave2 as FamStruct>::Entry>());
328                let mut xsave = Xsave::new(fam_len).map_err(KvmVcpuError::Fam)?;
329                // SAFETY: Safe because `xsave` is allocated with enough size to save XSTATE.
330                unsafe { self.fd.get_xsave2(&mut xsave) }.map_err(KvmVcpuError::VcpuGetXsave2)?;
331                Ok(xsave)
332            }
333            // if `KVM_CAP_XSAVE2` not supported
334            None => Ok(
335                // SAFETY: The content is correctly laid out.
336                unsafe {
337                    Xsave::from_raw(vec![kvm_xsave2 {
338                        // Note that `len` is the number of FAM (`extra`) entries that didn't exist
339                        // on older kernels not supporting `KVM_CAP_XSAVE2`. Thus, it's always zero.
340                        len: 0,
341                        xsave: self.fd.get_xsave().map_err(KvmVcpuError::VcpuGetXsave)?,
342                    }])
343                },
344            ),
345        }
346    }
347
348    /// Get the current TSC frequency for this vCPU.
349    ///
350    /// # Errors
351    ///
352    /// When [`kvm_ioctls::VcpuFd::get_tsc_khz`] errors.
353    pub fn get_tsc_khz(&self) -> Result<u32, GetTscError> {
354        let res = self.fd.get_tsc_khz()?;
355        Ok(res)
356    }
357
358    /// Get CPUID for this vCPU.
359    ///
360    /// Opposed to KVM_GET_SUPPORTED_CPUID, KVM_GET_CPUID2 does not update "nent" with valid number
361    /// of entries on success. Thus, when it passes "num_entries" greater than required, zeroed
362    /// entries follow after valid entries. This function removes such zeroed empty entries.
363    ///
364    /// # Errors
365    ///
366    /// * When [`kvm_ioctls::VcpuFd::get_cpuid2`] returns errors.
367    fn get_cpuid(&self) -> Result<kvm_bindings::CpuId, KvmVcpuError> {
368        let mut cpuid = self
369            .fd
370            .get_cpuid2(KVM_MAX_CPUID_ENTRIES)
371            .map_err(KvmVcpuError::VcpuGetCpuid)?;
372
373        // As CPUID.0h:EAX should have the largest CPUID standard function, we don't need to check
374        // EBX, ECX and EDX to confirm whether it is a valid entry.
375        cpuid.retain(|entry| {
376            !(entry.function == 0 && entry.index == 0 && entry.flags == 0 && entry.eax == 0)
377        });
378
379        Ok(cpuid)
380    }
381
382    /// If the IA32_TSC_DEADLINE MSR value is zero, update it
383    /// with the IA32_TSC value to guarantee that
384    /// the vCPU will continue receiving interrupts after restoring from a snapshot.
385    ///
386    /// Rationale: we observed that sometimes when taking a snapshot,
387    /// the IA32_TSC_DEADLINE MSR is cleared, but the interrupt is not
388    /// delivered to the guest, leading to a situation where one
389    /// of the vCPUs never receives TSC interrupts after restoring,
390    /// until the MSR is updated externally, eg by setting the system time.
391    fn fix_zero_tsc_deadline_msr(msr_chunks: &mut [Msrs]) {
392        // We do not expect more than 1 TSC MSR entry, but if there are multiple, pick the maximum.
393        let max_tsc_value = msr_chunks
394            .iter()
395            .flat_map(|msrs| msrs.as_slice())
396            .filter(|msr| msr.index == MSR_IA32_TSC)
397            .map(|msr| msr.data)
398            .max();
399
400        if let Some(tsc_value) = max_tsc_value {
401            msr_chunks
402                .iter_mut()
403                .flat_map(|msrs| msrs.as_mut_slice())
404                .filter(|msr| msr.index == MSR_IA32_TSC_DEADLINE && msr.data == 0)
405                .for_each(|msr| {
406                    warn!(
407                        "MSR_IA32_TSC_DEADLINE is 0, replacing with {:#x}.",
408                        tsc_value
409                    );
410                    msr.data = tsc_value;
411                });
412        }
413    }
414
415    /// Looks for MSRs from the [`DEFERRED_MSRS`] array and removes them from `msr_chunks`.
416    /// Returns a new [`Msrs`] object containing all the removed MSRs.
417    ///
418    /// We use this to capture some causal dependencies between MSRs where the relative order
419    /// of restoration matters (e.g. MSR_IA32_TSC must be restored before MSR_IA32_TSC_DEADLINE).
420    fn extract_deferred_msrs(msr_chunks: &mut [Msrs]) -> Result<Msrs, fam::Error> {
421        // Use 0 here as FamStructWrapper doesn't really give an equivalent of `Vec::with_capacity`,
422        // and if we specify something N != 0 here, then it will create a FamStructWrapper with N
423        // elements pre-allocated and zero'd out. Unless we then actually "fill" all those N values,
424        // KVM will later yell at us about invalid MSRs.
425        let mut deferred_msrs = Msrs::new(0)?;
426
427        for msrs in msr_chunks {
428            msrs.retain(|msr| {
429                if DEFERRED_MSRS.contains(&msr.index) {
430                    deferred_msrs
431                        .push(*msr)
432                        .inspect_err(|err| {
433                            error!(
434                                "Failed to move MSR {} into later chunk: {:?}",
435                                msr.index, err
436                            )
437                        })
438                        .is_err()
439                } else {
440                    true
441                }
442            });
443        }
444
445        Ok(deferred_msrs)
446    }
447
448    /// Get MSR chunks for the given MSR index list.
449    ///
450    /// KVM only supports getting `KVM_MAX_MSR_ENTRIES` at a time, so we divide
451    /// the list of MSR indices into chunks, call `KVM_GET_MSRS` for each
452    /// chunk, and collect into a [`Vec<Msrs>`].
453    ///
454    /// # Arguments
455    ///
456    /// * `msr_index_iter`: Iterator over MSR indices.
457    ///
458    /// # Errors
459    ///
460    /// * When [`kvm_bindings::Msrs::new`] returns errors.
461    /// * When [`kvm_ioctls::VcpuFd::get_msrs`] returns errors.
462    /// * When the return value of [`kvm_ioctls::VcpuFd::get_msrs`] (the number of entries that
463    ///   could be gotten) is less than expected.
464    fn get_msr_chunks(
465        &self,
466        mut msr_index_iter: impl ExactSizeIterator<Item = u32>,
467    ) -> Result<Vec<Msrs>, KvmVcpuError> {
468        let num_chunks = msr_index_iter.len().div_ceil(KVM_MAX_MSR_ENTRIES);
469
470        // + 1 for the chunk of deferred MSRs
471        let mut msr_chunks: Vec<Msrs> = Vec::with_capacity(num_chunks + 1);
472
473        for _ in 0..num_chunks {
474            let chunk_len = msr_index_iter.len().min(KVM_MAX_MSR_ENTRIES);
475            let chunk = self.get_msr_chunk(&mut msr_index_iter, chunk_len)?;
476            msr_chunks.push(chunk);
477        }
478
479        Self::fix_zero_tsc_deadline_msr(&mut msr_chunks);
480
481        let deferred = Self::extract_deferred_msrs(&mut msr_chunks)?;
482        msr_chunks.push(deferred);
483
484        Ok(msr_chunks)
485    }
486
487    /// Get single MSR chunk for the given MSR index iterator with
488    /// specified length. Iterator should have enough elements
489    /// to fill the chunk with indices, otherwise KVM will
490    /// return an error when processing half filled chunk.
491    ///
492    /// # Arguments
493    ///
494    /// * `msr_index_iter`: Iterator over MSR indices.
495    /// * `chunk_size`: Length of a chunk.
496    ///
497    /// # Errors
498    ///
499    /// * When [`kvm_bindings::Msrs::new`] returns errors.
500    /// * When [`kvm_ioctls::VcpuFd::get_msrs`] returns errors.
501    /// * When the return value of [`kvm_ioctls::VcpuFd::get_msrs`] (the number of entries that
502    ///   could be gotten) is less than expected.
503    pub fn get_msr_chunk(
504        &self,
505        msr_index_iter: impl Iterator<Item = u32>,
506        chunk_size: usize,
507    ) -> Result<Msrs, KvmVcpuError> {
508        let chunk_iter = msr_index_iter.take(chunk_size);
509
510        let mut msrs = Msrs::new(chunk_size)?;
511        let msr_entries = msrs.as_mut_slice();
512        for (pos, msr_index) in chunk_iter.enumerate() {
513            msr_entries[pos].index = msr_index;
514        }
515
516        let nmsrs = self
517            .fd
518            .get_msrs(&mut msrs)
519            .map_err(KvmVcpuError::VcpuGetMsrs)?;
520        // GET_MSRS returns a number of successfully set msrs.
521        // If number of set msrs is not equal to the length of
522        // `msrs`, then the value returned by GET_MSRS can act
523        // as an index to the problematic msr.
524        if nmsrs != chunk_size {
525            Err(KvmVcpuError::VcpuGetMsr(msrs.as_slice()[nmsrs].index))
526        } else {
527            Ok(msrs)
528        }
529    }
530
531    /// Get MSRs for the given MSR index list.
532    ///
533    /// # Arguments
534    ///
535    /// * `msr_index_list`: List of MSR indices
536    ///
537    /// # Errors
538    ///
539    /// * When `KvmVcpu::get_msr_chunks()` returns errors.
540    pub fn get_msrs(
541        &self,
542        msr_index_iter: impl ExactSizeIterator<Item = u32>,
543    ) -> Result<BTreeMap<u32, u64>, KvmVcpuError> {
544        let mut msrs = BTreeMap::new();
545        self.get_msr_chunks(msr_index_iter)?
546            .iter()
547            .for_each(|msr_chunk| {
548                msr_chunk.as_slice().iter().for_each(|msr| {
549                    msrs.insert(msr.index, msr.data);
550                });
551            });
552        Ok(msrs)
553    }
554
    /// Save the KVM internal state.
    ///
    /// Reads the vCPU state from KVM piece by piece, in the order the comments below call for,
    /// and bundles it into a [`VcpuState`].
    ///
    /// # Errors
    ///
    /// Returns the [`KvmVcpuError`] of the first getter that fails. The TSC frequency is the
    /// exception: when it cannot be read, a warning is logged and `tsc_khz` is saved as `None`.
    pub fn save_state(&self) -> Result<VcpuState, KvmVcpuError> {
        // Ordering requirements:
        //
        // KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
        // vCPU/LAPIC state. As such, it must be done before most everything
        // else, otherwise we cannot restore everything and expect it to work.
        //
        // KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
        // still running.
        //
        // KVM_GET_LAPIC may change state of LAPIC before returning it.
        //
        // GET_VCPU_EVENTS should probably be last to save. The code looks as
        // it might as well be affected by internal state modifications of the
        // GET ioctls.
        //
        // SREGS saves/restores a pending interrupt, similar to what
        // VCPU_EVENTS also does.

        let mp_state = self
            .fd
            .get_mp_state()
            .map_err(KvmVcpuError::VcpuGetMpState)?;
        let regs = self.fd.get_regs().map_err(KvmVcpuError::VcpuGetRegs)?;
        let sregs = self.fd.get_sregs().map_err(KvmVcpuError::VcpuGetSregs)?;
        let xsave = self.get_xsave()?;
        let xcrs = self.fd.get_xcrs().map_err(KvmVcpuError::VcpuGetXcrs)?;
        let debug_regs = self
            .fd
            .get_debug_regs()
            .map_err(KvmVcpuError::VcpuGetDebugRegs)?;
        let lapic = self.fd.get_lapic().map_err(KvmVcpuError::VcpuGetLapic)?;
        // A missing TSC frequency is tolerated: it is logged and stored as `None`.
        let tsc_khz = self.get_tsc_khz().ok().or_else(|| {
            // v0.25 and newer snapshots without TSC will only work on
            // the same CPU model as the host on which they were taken.
            // TODO: Add negative test for this warning failure.
            warn!("TSC freq not available. Snapshot cannot be loaded on a different CPU model.");
            None
        });
        let cpuid = self.get_cpuid()?;
        // Chunked MSR values; the deferred MSRs end up in the last chunk.
        let saved_msrs = self.get_msr_chunks(self.msrs_to_save.iter().copied())?;
        let vcpu_events = self
            .fd
            .get_vcpu_events()
            .map_err(KvmVcpuError::VcpuGetVcpuEvents)?;

        Ok(VcpuState {
            cpuid,
            saved_msrs,
            debug_regs,
            lapic,
            mp_state,
            regs,
            sregs,
            vcpu_events,
            xcrs,
            xsave,
            tsc_khz,
        })
    }
616
617    /// Dumps CPU configuration (CPUID and MSRs).
618    ///
619    /// Opposed to `save_state()`, this dumps all the supported and dumpable MSRs not limited to
620    /// serializable ones.
621    pub fn dump_cpu_config(&self) -> Result<CpuConfiguration, KvmVcpuError> {
622        let cpuid = cpuid::Cpuid::try_from(self.get_cpuid()?)?;
623        let kvm = kvm_ioctls::Kvm::new().unwrap();
624        let msr_index_list = crate::arch::x86_64::msr::get_msrs_to_dump(&kvm)?;
625        let msrs = self.get_msrs(msr_index_list.as_slice().iter().copied())?;
626        Ok(CpuConfiguration { cpuid, msrs })
627    }
628
629    /// Checks whether the TSC needs scaling when restoring a snapshot.
630    ///
631    /// # Errors
632    ///
633    /// When
634    pub fn is_tsc_scaling_required(&self, state_tsc_freq: u32) -> Result<bool, GetTscError> {
635        // Compare the current TSC freq to the one found
636        // in the state. If they are different, we need to
637        // scale the TSC to the freq found in the state.
638        // We accept values within a tolerance of 250 parts
639        // per million because it is common for TSC frequency
640        // to differ due to calibration at boot time.
641        let diff = (i64::from(self.get_tsc_khz()?) - i64::from(state_tsc_freq)).abs();
642        // Cannot overflow since u32::MAX * 250 < i64::MAX
643        Ok(diff > i64::from(state_tsc_freq) * TSC_KHZ_TOL_NUMERATOR / TSC_KHZ_TOL_DENOMINATOR)
644    }
645
646    /// Scale the TSC frequency of this vCPU to the one provided as a parameter.
647    pub fn set_tsc_khz(&self, tsc_freq: u32) -> Result<(), SetTscError> {
648        self.fd.set_tsc_khz(tsc_freq).map_err(SetTscError)
649    }
650
    /// Use provided state to populate KVM internal state.
    ///
    /// The pieces of `state` are written to KVM in the order the comments below call for.
    /// After everything has been restored, [`KvmVcpu::kvmclock_ctrl`] is called; a failure of
    /// that call is not propagated.
    ///
    /// # Errors
    ///
    /// Returns the [`KvmVcpuError`] of the first setter that fails, or
    /// [`KvmVcpuError::VcpuSetMsrsIncomplete`] when KVM reports fewer MSRs written than a saved
    /// chunk contains.
    pub fn restore_state(&self, state: &VcpuState) -> Result<(), KvmVcpuError> {
        // Ordering requirements:
        //
        // KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
        // still running.
        //
        // Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
        // if we ever change the BSP, we have to do that before restoring anything.
        // The same seems to be true for CPUID stuff.
        //
        // SREGS saves/restores a pending interrupt, similar to what
        // VCPU_EVENTS also does.
        //
        // SET_REGS clears pending exceptions unconditionally, thus, it must be
        // done before SET_VCPU_EVENTS, which restores it.
        //
        // SET_LAPIC must come after SET_SREGS, because the latter restores
        // the apic base msr.
        //
        // SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
        // only restores successfully, when the LAPIC is correctly configured.

        self.fd
            .set_cpuid2(&state.cpuid)
            .map_err(KvmVcpuError::VcpuSetCpuid)?;
        self.fd
            .set_mp_state(state.mp_state)
            .map_err(KvmVcpuError::VcpuSetMpState)?;
        self.fd
            .set_regs(&state.regs)
            .map_err(KvmVcpuError::VcpuSetRegs)?;
        self.fd
            .set_sregs(&state.sregs)
            .map_err(KvmVcpuError::VcpuSetSregs)?;
        // SAFETY: Safe unless the snapshot is corrupted.
        unsafe {
            // kvm-ioctl's `set_xsave2()` can be called even on kernel versions not supporting
            // `KVM_CAP_XSAVE2`, because it internally calls `KVM_SET_XSAVE` API that was extended
            // by Linux kernel. Thus, `KVM_SET_XSAVE2` API does not exist as a KVM interface.
            // However, kvm-ioctl added `set_xsave2()` to allow users to pass `Xsave` instead of the
            // older `kvm_xsave`.
            self.fd
                .set_xsave2(&state.xsave)
                .map_err(KvmVcpuError::VcpuSetXsave)?;
        }
        self.fd
            .set_xcrs(&state.xcrs)
            .map_err(KvmVcpuError::VcpuSetXcrs)?;
        self.fd
            .set_debug_regs(&state.debug_regs)
            .map_err(KvmVcpuError::VcpuSetDebugRegs)?;
        self.fd
            .set_lapic(&state.lapic)
            .map_err(KvmVcpuError::VcpuSetLapic)?;
        // The chunks are written in the order they were saved in.
        for msrs in &state.saved_msrs {
            let nmsrs = self.fd.set_msrs(msrs).map_err(KvmVcpuError::VcpuSetMsrs)?;
            // Fewer MSRs written than the chunk holds means a partial write.
            if nmsrs < msrs.as_fam_struct_ref().nmsrs as usize {
                return Err(KvmVcpuError::VcpuSetMsrsIncomplete);
            }
        }
        self.fd
            .set_vcpu_events(&state.vcpu_events)
            .map_err(KvmVcpuError::VcpuSetVcpuEvents)?;

        self.kvmclock_ctrl();
        Ok(())
    }
719}
720
721impl Peripherals {
722    /// Runs the vCPU in KVM context and handles the kvm exit reason.
723    ///
724    /// Returns error or enum specifying whether emulation was handled or interrupted.
725    pub fn run_arch_emulation(&self, exit: VcpuExit) -> Result<VcpuEmulation, VcpuError> {
726        match exit {
727            VcpuExit::IoIn(addr, data) => {
728                if let Some(pio_bus) = &self.pio_bus {
729                    let _metric = METRICS.vcpu.exit_io_in_agg.record_latency_metrics();
730                    if let Err(err) = pio_bus.read(u64::from(addr), data) {
731                        warn!("vcpu: IO read @ {addr:#x}:{:#x} failed: {err}", data.len());
732                    }
733                    METRICS.vcpu.exit_io_in.inc();
734                }
735                Ok(VcpuEmulation::Handled)
736            }
737            VcpuExit::IoOut(addr, data) => {
738                if let Some(pio_bus) = &self.pio_bus {
739                    let _metric = METRICS.vcpu.exit_io_out_agg.record_latency_metrics();
740                    if let Err(err) = pio_bus.write(u64::from(addr), data) {
741                        warn!("vcpu: IO write @ {addr:#x}:{:#x} failed: {err}", data.len());
742                    }
743                    METRICS.vcpu.exit_io_out.inc();
744                }
745                Ok(VcpuEmulation::Handled)
746            }
747            unexpected_exit => {
748                METRICS.vcpu.failures.inc();
749                // TODO: Are we sure we want to finish running a vcpu upon
750                // receiving a vm exit that is not necessarily an error?
751                error!("Unexpected exit reason on vcpu run: {:?}", unexpected_exit);
752                Err(VcpuError::UnhandledKvmExit(format!(
753                    "{:?}",
754                    unexpected_exit
755                )))
756            }
757        }
758    }
759}
760
/// Structure holding VCPU kvm state.
///
/// The struct derives `Serialize`/`Deserialize`.
/// NOTE(review): derived serde impls emit fields in declaration order, so reordering the fields
/// may change the serialized layout — confirm against the snapshot format before moving any.
#[derive(Serialize, Deserialize)]
pub struct VcpuState {
    /// CPUID entries of the vCPU.
    pub cpuid: CpuId,
    /// Saved MSRs, kept as a list of `Msrs` chunks. On restore each chunk is written with its
    /// own `set_msrs` call, in the order stored here, and a partially written chunk is an error.
    pub saved_msrs: Vec<Msrs>,
    /// Debug registers; written back with `set_debug_regs` on restore.
    pub debug_regs: kvm_debugregs,
    /// Local APIC state; written back with `set_lapic` on restore.
    pub lapic: kvm_lapic_state,
    /// MP state of the vCPU (`kvm_mp_state`).
    pub mp_state: kvm_mp_state,
    /// Registers held in a `kvm_regs` structure.
    pub regs: kvm_regs,
    /// Special registers; written back with `set_sregs` on restore.
    pub sregs: kvm_sregs,
    /// vCPU events; written back with `set_vcpu_events` on restore, after the MSRs.
    pub vcpu_events: kvm_vcpu_events,
    /// Extended control registers; written back with `set_xcrs` on restore.
    pub xcrs: kvm_xcrs,
    /// XSAVE area, stored as the `Xsave` wrapper and written back with `set_xsave2` on restore.
    pub xsave: Xsave,
    /// TSC frequency in kHz, if one was recorded.
    /// NOTE(review): presumably `None` when the frequency could not be queried — confirm
    /// against the code that saves the state.
    pub tsc_khz: Option<u32>,
}
787
788impl Debug for VcpuState {
789    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
790        let mut debug_kvm_regs: Vec<kvm_bindings::kvm_msrs> = Vec::new();
791        for kvm_msrs in self.saved_msrs.iter() {
792            debug_kvm_regs = kvm_msrs.clone().into_raw();
793            debug_kvm_regs.sort_by_key(|msr| (msr.nmsrs, msr.pad));
794        }
795        f.debug_struct("VcpuState")
796            .field("cpuid", &self.cpuid)
797            .field("saved_msrs", &debug_kvm_regs)
798            .field("debug_regs", &self.debug_regs)
799            .field("lapic", &self.lapic)
800            .field("mp_state", &self.mp_state)
801            .field("regs", &self.regs)
802            .field("sregs", &self.sregs)
803            .field("vcpu_events", &self.vcpu_events)
804            .field("xcrs", &self.xcrs)
805            .field("xsave", &self.xsave)
806            .field("tsc_khz", &self.tsc_khz)
807            .finish()
808    }
809}
810
811#[cfg(test)]
812mod tests {
813    #![allow(clippy::undocumented_unsafe_blocks)]
814
815    use kvm_bindings::kvm_msr_entry;
816    use kvm_ioctls::Cap;
817    use vm_memory::GuestAddress;
818
819    use super::*;
820    use crate::arch::BootProtocol;
821    use crate::arch::x86_64::cpu_model::CpuModel;
822    use crate::cpu_config::templates::{
823        CpuConfiguration, CpuTemplateType, CustomCpuTemplate, GetCpuTemplate, GuestConfigError,
824        StaticCpuTemplate,
825    };
826    use crate::cpu_config::x86_64::cpuid::{Cpuid, CpuidEntry, CpuidKey};
827    use crate::vstate::kvm::Kvm;
828    use crate::vstate::vm::Vm;
829    use crate::vstate::vm::tests::{setup_vm, setup_vm_with_memory};
830
831    impl Default for VcpuState {
832        fn default() -> Self {
833            VcpuState {
834                cpuid: CpuId::new(1).unwrap(),
835                saved_msrs: vec![Msrs::new(1).unwrap()],
836                debug_regs: Default::default(),
837                lapic: Default::default(),
838                mp_state: Default::default(),
839                regs: Default::default(),
840                sregs: Default::default(),
841                vcpu_events: Default::default(),
842                xcrs: Default::default(),
843                xsave: Xsave::new(0).unwrap(),
844                tsc_khz: Some(0),
845            }
846        }
847    }
848
849    fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, KvmVcpu) {
850        let (kvm, vm) = setup_vm_with_memory(mem_size);
851        vm.setup_irqchip().unwrap();
852        let vcpu = KvmVcpu::new(0, &vm).unwrap();
853        (kvm, vm, vcpu)
854    }
855
856    fn create_vcpu_config(
857        kvm: &Kvm,
858        vcpu: &KvmVcpu,
859        template: &CustomCpuTemplate,
860    ) -> Result<VcpuConfig, GuestConfigError> {
861        let cpuid = Cpuid::try_from(kvm.supported_cpuid.clone())
862            .map_err(GuestConfigError::CpuidFromKvmCpuid)?;
863        let msrs = vcpu
864            .get_msrs(template.msr_index_iter())
865            .map_err(GuestConfigError::VcpuIoctl)?;
866        let base_cpu_config = CpuConfiguration { cpuid, msrs };
867        let cpu_config = CpuConfiguration::apply_template(base_cpu_config, template)?;
868        Ok(VcpuConfig {
869            vcpu_count: 1,
870            smt: false,
871            cpu_config,
872        })
873    }
874
875    #[test]
876    fn test_configure_vcpu() {
877        let (kvm, vm, mut vcpu) = setup_vcpu(0x10000);
878
879        let vcpu_config = create_vcpu_config(&kvm, &vcpu, &CustomCpuTemplate::default()).unwrap();
880        assert_eq!(
881            vcpu.configure(
882                vm.guest_memory(),
883                EntryPoint {
884                    entry_addr: GuestAddress(0),
885                    protocol: BootProtocol::LinuxBoot,
886                },
887                &vcpu_config,
888            ),
889            Ok(())
890        );
891
892        let try_configure = |kvm: &Kvm, vcpu: &mut KvmVcpu, template| -> bool {
893            let cpu_template = Some(CpuTemplateType::Static(template));
894            let template = cpu_template.get_cpu_template();
895            match template {
896                Ok(template) => match create_vcpu_config(kvm, vcpu, &template) {
897                    Ok(config) => vcpu
898                        .configure(
899                            vm.guest_memory(),
900                            EntryPoint {
901                                entry_addr: GuestAddress(crate::arch::get_kernel_start()),
902                                protocol: BootProtocol::LinuxBoot,
903                            },
904                            &config,
905                        )
906                        .is_ok(),
907                    Err(_) => false,
908                },
909                Err(_) => false,
910            }
911        };
912
913        // Test configure while using the T2 template.
914        let t2_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2);
915
916        // Test configure while using the C3 template.
917        let c3_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::C3);
918
919        // Test configure while using the T2S template.
920        let t2s_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2S);
921
922        // Test configure while using the T2CL template.
923        let t2cl_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2CL);
924
925        // Test configure while using the T2S template.
926        let t2a_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2A);
927
928        let cpu_model = CpuModel::get_cpu_model();
929        match &cpuid::common::get_vendor_id_from_host().unwrap() {
930            cpuid::VENDOR_ID_INTEL => {
931                assert_eq!(
932                    t2_res,
933                    StaticCpuTemplate::T2
934                        .get_supported_cpu_models()
935                        .contains(&cpu_model)
936                );
937                assert_eq!(
938                    c3_res,
939                    StaticCpuTemplate::C3
940                        .get_supported_cpu_models()
941                        .contains(&cpu_model)
942                );
943                assert_eq!(
944                    t2s_res,
945                    StaticCpuTemplate::T2S
946                        .get_supported_cpu_models()
947                        .contains(&cpu_model)
948                );
949                assert_eq!(
950                    t2cl_res,
951                    StaticCpuTemplate::T2CL
952                        .get_supported_cpu_models()
953                        .contains(&cpu_model)
954                );
955                assert!(!t2a_res);
956            }
957            cpuid::VENDOR_ID_AMD => {
958                assert!(!t2_res);
959                assert!(!c3_res);
960                assert!(!t2s_res);
961                assert!(!t2cl_res);
962                assert_eq!(
963                    t2a_res,
964                    StaticCpuTemplate::T2A
965                        .get_supported_cpu_models()
966                        .contains(&cpu_model)
967                );
968            }
969            _ => {
970                assert!(!t2_res);
971                assert!(!c3_res);
972                assert!(!t2s_res);
973                assert!(!t2cl_res);
974                assert!(!t2a_res);
975            }
976        }
977    }
978
979    #[test]
980    fn test_vcpu_cpuid_restore() {
981        let (kvm, _, vcpu) = setup_vcpu(0x10000);
982        vcpu.fd.set_cpuid2(&kvm.supported_cpuid).unwrap();
983
984        // Mutate the CPUID.
985        // Leaf 0x3 / EAX that is an unused (reserved to be accurate) register, so it's harmless.
986        let mut state = vcpu.save_state().unwrap();
987        state.cpuid.as_mut_slice().iter_mut().for_each(|entry| {
988            if entry.function == 3 && entry.index == 0 {
989                entry.eax = 0x1234_5678;
990            }
991        });
992
993        // Restore the state into the existing vcpu.
994        let result1 = vcpu.restore_state(&state);
995        assert!(result1.is_ok(), "{}", result1.unwrap_err());
996        drop(vcpu);
997
998        // Restore the state into a new vcpu.
999        let (_, _vm, vcpu) = setup_vcpu(0x10000);
1000        let result2 = vcpu.restore_state(&state);
1001        assert!(result2.is_ok(), "{}", result2.unwrap_err());
1002
1003        // Validate the mutated cpuid is restored correctly.
1004        let state = vcpu.save_state().unwrap();
1005        let cpuid = Cpuid::try_from(state.cpuid).unwrap();
1006        let leaf3 = cpuid
1007            .inner()
1008            .get(&CpuidKey {
1009                leaf: 0x3,
1010                subleaf: 0x0,
1011            })
1012            .unwrap();
1013        assert!(leaf3.result.eax == 0x1234_5678);
1014    }
1015
1016    #[test]
1017    fn test_empty_cpuid_entries_removed() {
1018        // Test that `get_cpuid()` removes zeroed empty entries from the `KVM_GET_CPUID2` result.
1019        let (kvm, vm, mut vcpu) = setup_vcpu(0x10000);
1020        let vcpu_config = VcpuConfig {
1021            vcpu_count: 1,
1022            smt: false,
1023            cpu_config: CpuConfiguration {
1024                cpuid: Cpuid::try_from(kvm.supported_cpuid.clone()).unwrap(),
1025                msrs: BTreeMap::new(),
1026            },
1027        };
1028        vcpu.configure(
1029            vm.guest_memory(),
1030            EntryPoint {
1031                entry_addr: GuestAddress(0),
1032                protocol: BootProtocol::LinuxBoot,
1033            },
1034            &vcpu_config,
1035        )
1036        .unwrap();
1037
1038        // Invalid entries filled with 0 should not exist.
1039        let cpuid = vcpu.get_cpuid().unwrap();
1040        cpuid.as_slice().iter().for_each(|entry| {
1041            assert!(
1042                !(entry.function == 0
1043                    && entry.index == 0
1044                    && entry.flags == 0
1045                    && entry.eax == 0
1046                    && entry.ebx == 0
1047                    && entry.ecx == 0
1048                    && entry.edx == 0)
1049            );
1050        });
1051
1052        // Leaf 0 should have non-zero entry in `Cpuid`.
1053        let cpuid = Cpuid::try_from(cpuid).unwrap();
1054        assert_ne!(
1055            cpuid
1056                .inner()
1057                .get(&CpuidKey {
1058                    leaf: 0,
1059                    subleaf: 0,
1060                })
1061                .unwrap(),
1062            &CpuidEntry {
1063                ..Default::default()
1064            }
1065        );
1066    }
1067
1068    #[test]
1069    fn test_dump_cpu_config_with_non_configured_vcpu() {
1070        // Test `dump_cpu_config()` before vcpu configuration.
1071        //
1072        // `KVM_GET_CPUID2` returns the result of `KVM_SET_CPUID2`. See
1073        // https://docs.kernel.org/virt/kvm/api.html#kvm-set-cpuid
1074        // Since `KVM_SET_CPUID2` has not been called before vcpu configuration, all leaves should
1075        // be filled with zero. Therefore, `KvmVcpu::dump_cpu_config()` should fail with CPUID type
1076        // conversion error due to the lack of brand string info in leaf 0x0.
1077        let (_, _, vcpu) = setup_vcpu(0x10000);
1078        match vcpu.dump_cpu_config() {
1079            Err(KvmVcpuError::ConvertCpuidType(_)) => (),
1080            Err(err) => panic!("Unexpected error: {err}"),
1081            Ok(_) => panic!("Dumping CPU configuration should fail before vcpu configuration."),
1082        }
1083    }
1084
1085    #[test]
1086    fn test_dump_cpu_config_with_configured_vcpu() {
1087        // Test `dump_cpu_config()` after vcpu configuration.
1088        let (kvm, vm, mut vcpu) = setup_vcpu(0x10000);
1089        let vcpu_config = VcpuConfig {
1090            vcpu_count: 1,
1091            smt: false,
1092            cpu_config: CpuConfiguration {
1093                cpuid: Cpuid::try_from(kvm.supported_cpuid.clone()).unwrap(),
1094                msrs: BTreeMap::new(),
1095            },
1096        };
1097
1098        vcpu.configure(
1099            vm.guest_memory(),
1100            EntryPoint {
1101                entry_addr: GuestAddress(0),
1102                protocol: BootProtocol::LinuxBoot,
1103            },
1104            &vcpu_config,
1105        )
1106        .unwrap();
1107        vcpu.dump_cpu_config().unwrap();
1108    }
1109
1110    #[test]
1111    #[allow(clippy::redundant_clone)]
1112    fn test_is_tsc_scaling_required() {
1113        // Test `is_tsc_scaling_required` as if it were on the same
1114        // CPU model as the one in the snapshot state.
1115        let (_, _, vcpu) = setup_vcpu(0x1000);
1116
1117        {
1118            // The frequency difference is within tolerance.
1119            let mut state = vcpu.save_state().unwrap();
1120            state.tsc_khz = Some(
1121                state.tsc_khz.unwrap()
1122                    + state.tsc_khz.unwrap() * u32::try_from(TSC_KHZ_TOL_NUMERATOR).unwrap()
1123                        / u32::try_from(TSC_KHZ_TOL_DENOMINATOR).unwrap()
1124                        / 2,
1125            );
1126            assert!(
1127                !vcpu
1128                    .is_tsc_scaling_required(state.tsc_khz.unwrap())
1129                    .unwrap()
1130            );
1131        }
1132
1133        {
1134            // The frequency difference is over the tolerance.
1135            let mut state = vcpu.save_state().unwrap();
1136            state.tsc_khz = Some(
1137                state.tsc_khz.unwrap()
1138                    + state.tsc_khz.unwrap() * u32::try_from(TSC_KHZ_TOL_NUMERATOR).unwrap()
1139                        / u32::try_from(TSC_KHZ_TOL_DENOMINATOR).unwrap()
1140                        * 2,
1141            );
1142            assert!(
1143                vcpu.is_tsc_scaling_required(state.tsc_khz.unwrap())
1144                    .unwrap()
1145            );
1146        }
1147
1148        {
1149            // Try a large frequency (30GHz) in the state and check it doesn't
1150            // overflow
1151            assert!(vcpu.is_tsc_scaling_required(30_000_000).unwrap());
1152        }
1153    }
1154
1155    #[test]
1156    fn test_set_tsc() {
1157        let (kvm, _, vcpu) = setup_vcpu(0x1000);
1158        let mut state = vcpu.save_state().unwrap();
1159        state.tsc_khz = Some(
1160            state.tsc_khz.unwrap()
1161                + state.tsc_khz.unwrap() * u32::try_from(TSC_KHZ_TOL_NUMERATOR).unwrap()
1162                    / u32::try_from(TSC_KHZ_TOL_DENOMINATOR).unwrap()
1163                    * 2,
1164        );
1165
1166        if kvm.fd.check_extension(Cap::TscControl) {
1167            vcpu.set_tsc_khz(state.tsc_khz.unwrap()).unwrap();
1168            if kvm.fd.check_extension(Cap::GetTscKhz) {
1169                assert_eq!(vcpu.get_tsc_khz().ok(), state.tsc_khz);
1170            } else {
1171                vcpu.get_tsc_khz().unwrap_err();
1172            }
1173        } else {
1174            vcpu.set_tsc_khz(state.tsc_khz.unwrap()).unwrap_err();
1175        }
1176    }
1177
1178    #[test]
1179    fn test_get_msrs_with_msrs_to_save() {
1180        // Test `get_msrs()` with the MSR indices that should be serialized into snapshots.
1181        // The MSR indices should be valid and this test should succeed.
1182        let (_, _, vcpu) = setup_vcpu(0x1000);
1183        vcpu.get_msrs(vcpu.msrs_to_save.iter().copied()).unwrap();
1184    }
1185
1186    #[test]
1187    fn test_get_msrs_with_msrs_to_dump() {
1188        // Test `get_msrs()` with the MSR indices that should be dumped.
1189        // All the MSR indices should be valid and the call should succeed.
1190        let (_, _, vcpu) = setup_vcpu(0x1000);
1191
1192        let kvm = kvm_ioctls::Kvm::new().unwrap();
1193        let msrs_to_dump = crate::arch::x86_64::msr::get_msrs_to_dump(&kvm).unwrap();
1194        vcpu.get_msrs(msrs_to_dump.as_slice().iter().copied())
1195            .unwrap();
1196    }
1197
1198    #[test]
1199    fn test_get_msrs_with_invalid_msr_index() {
1200        // Test `get_msrs()` with unsupported MSR indices. This should return `VcpuGetMsr` error
1201        // that happens when `KVM_GET_MSRS` fails to populate MSR values in the middle and exits.
1202        // Currently, MSR indices 2..=4 are not listed as supported MSRs.
1203        let (_, _, vcpu) = setup_vcpu(0x1000);
1204        let msr_index_list: Vec<u32> = vec![2, 3, 4];
1205        match vcpu.get_msrs(msr_index_list.iter().copied()) {
1206            Err(KvmVcpuError::VcpuGetMsr(_)) => (),
1207            Err(err) => panic!("Unexpected error: {err}"),
1208            Ok(_) => {
1209                panic!("KvmVcpu::get_msrs() for unsupported MSRs should fail with VcpuGetMsr.")
1210            }
1211        }
1212    }
1213
1214    fn msrs_from_entries(msr_entries: &[(u32, u64)]) -> Msrs {
1215        Msrs::from_entries(
1216            &msr_entries
1217                .iter()
1218                .map(|&(index, data)| kvm_msr_entry {
1219                    index,
1220                    data,
1221                    ..Default::default()
1222                })
1223                .collect::<Vec<_>>(),
1224        )
1225        .unwrap()
1226    }
1227
1228    fn assert_msrs(msr_chunks: &[Msrs], expected_msr_entries: &[(u32, u64)]) {
1229        let flattened_msrs = msr_chunks.iter().flat_map(|msrs| msrs.as_slice());
1230        for (a, b) in flattened_msrs.zip(expected_msr_entries.iter()) {
1231            assert_eq!(a.index, b.0);
1232            assert_eq!(a.data, b.1);
1233        }
1234    }
1235
1236    #[test]
1237    fn test_defer_msrs() {
1238        let to_defer = DEFERRED_MSRS[0];
1239
1240        let mut msr_chunks = [msrs_from_entries(&[(to_defer, 0), (MSR_IA32_TSC, 1)])];
1241
1242        let deferred = KvmVcpu::extract_deferred_msrs(&mut msr_chunks).unwrap();
1243
1244        assert_eq!(deferred.as_slice().len(), 1, "did not correctly defer MSR");
1245        assert_eq!(
1246            msr_chunks[0].as_slice().len(),
1247            1,
1248            "deferred MSR not removed from chunk"
1249        );
1250
1251        assert_eq!(deferred.as_slice()[0].index, to_defer);
1252        assert_eq!(msr_chunks[0].as_slice()[0].index, MSR_IA32_TSC);
1253    }
1254
1255    #[test]
1256    fn test_fix_zero_tsc_deadline_msr_zero_same_chunk() {
1257        // Place both TSC and TSC_DEADLINE MSRs in the same chunk.
1258        let mut msr_chunks = [msrs_from_entries(&[
1259            (MSR_IA32_TSC_DEADLINE, 0),
1260            (MSR_IA32_TSC, 42),
1261        ])];
1262
1263        KvmVcpu::fix_zero_tsc_deadline_msr(&mut msr_chunks);
1264
1265        // We expect for the MSR_IA32_TSC_DEADLINE to get updated with the MSR_IA32_TSC value.
1266        assert_msrs(
1267            &msr_chunks,
1268            &[(MSR_IA32_TSC_DEADLINE, 42), (MSR_IA32_TSC, 42)],
1269        );
1270    }
1271
1272    #[test]
1273    fn test_fix_zero_tsc_deadline_msr_zero_separate_chunks() {
1274        // Place both TSC and TSC_DEADLINE MSRs in separate chunks.
1275        let mut msr_chunks = [
1276            msrs_from_entries(&[(MSR_IA32_TSC_DEADLINE, 0)]),
1277            msrs_from_entries(&[(MSR_IA32_TSC, 42)]),
1278        ];
1279
1280        KvmVcpu::fix_zero_tsc_deadline_msr(&mut msr_chunks);
1281
1282        // We expect for the MSR_IA32_TSC_DEADLINE to get updated with the MSR_IA32_TSC value.
1283        assert_msrs(
1284            &msr_chunks,
1285            &[(MSR_IA32_TSC_DEADLINE, 42), (MSR_IA32_TSC, 42)],
1286        );
1287    }
1288
1289    #[test]
1290    fn test_fix_zero_tsc_deadline_msr_non_zero() {
1291        let mut msr_chunks = [msrs_from_entries(&[
1292            (MSR_IA32_TSC_DEADLINE, 1),
1293            (MSR_IA32_TSC, 2),
1294        ])];
1295
1296        KvmVcpu::fix_zero_tsc_deadline_msr(&mut msr_chunks);
1297
1298        // We expect that MSR_IA32_TSC_DEADLINE should remain unchanged, because it is non-zero
1299        // already.
1300        assert_msrs(
1301            &msr_chunks,
1302            &[(MSR_IA32_TSC_DEADLINE, 1), (MSR_IA32_TSC, 2)],
1303        );
1304    }
1305
1306    #[test]
1307    fn test_get_msr_chunks_preserved_order() {
1308        // Regression test for #4666
1309        let (_, vm) = setup_vm();
1310        let vcpu = KvmVcpu::new(0, &vm).unwrap();
1311
1312        // The list of supported MSR indices, in the order they were returned by KVM
1313        let msrs_to_save = vm.msrs_to_save();
1314        // The MSRs after processing. The order should be identical to the one returned by KVM, with
1315        // the exception of deferred MSRs, which should be moved to the end (but show up in the same
1316        // order as they are listed in [`DEFERRED_MSRS`].
1317        let msr_chunks = vcpu
1318            .get_msr_chunks(vcpu.msrs_to_save.iter().copied())
1319            .unwrap();
1320
1321        msr_chunks
1322            .iter()
1323            .flat_map(|chunk| chunk.as_slice().iter())
1324            .zip(
1325                msrs_to_save
1326                    .iter()
1327                    .filter(|&idx| !DEFERRED_MSRS.contains(idx))
1328                    .chain(DEFERRED_MSRS.iter()),
1329            )
1330            .for_each(|(left, &right)| assert_eq!(left.index, right));
1331    }
1332}