vmm/arch/x86_64/vcpu.rs
1// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0
3//
4// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
5// Use of this source code is governed by a BSD-style license that can be
6// found in the THIRD-PARTY file.
7
8use std::collections::BTreeMap;
9use std::fmt::Debug;
10use std::sync::Arc;
11
12use kvm_bindings::{
13 CpuId, KVM_MAX_CPUID_ENTRIES, KVM_MAX_MSR_ENTRIES, Msrs, Xsave, kvm_debugregs, kvm_lapic_state,
14 kvm_mp_state, kvm_regs, kvm_sregs, kvm_vcpu_events, kvm_xcrs, kvm_xsave, kvm_xsave2,
15};
16use kvm_ioctls::{VcpuExit, VcpuFd};
17use log::{error, warn};
18use serde::{Deserialize, Serialize};
19use vmm_sys_util::fam::{self, FamStruct};
20
21use crate::arch::EntryPoint;
22use crate::arch::x86_64::generated::msr_index::{MSR_IA32_TSC, MSR_IA32_TSC_DEADLINE};
23use crate::arch::x86_64::interrupts;
24use crate::arch::x86_64::msr::{MsrError, create_boot_msr_entries};
25use crate::arch::x86_64::regs::{SetupFpuError, SetupRegistersError, SetupSpecialRegistersError};
26use crate::cpu_config::x86_64::{CpuConfiguration, cpuid};
27use crate::logger::{IncMetric, METRICS};
28use crate::vstate::bus::Bus;
29use crate::vstate::memory::GuestMemoryMmap;
30use crate::vstate::vcpu::{VcpuConfig, VcpuEmulation, VcpuError};
31use crate::vstate::vm::Vm;
32
// Tolerance for the expected variation of the TSC frequency, expressed as the
// fraction `TSC_KHZ_TOL_NUMERATOR / TSC_KHZ_TOL_DENOMINATOR` (250 parts per million).
// The value of 250 parts per million is based on the QEMU approach, more details here:
// https://bugzilla.redhat.com/show_bug.cgi?id=1839095
//
// Both constants are `i64` because `KvmVcpu::is_tsc_scaling_required` multiplies them
// with TSC frequencies that were widened from `u32` to `i64`.
const TSC_KHZ_TOL_NUMERATOR: i64 = 250;
const TSC_KHZ_TOL_DENOMINATOR: i64 = 1_000_000;
39
/// A set of MSRs that should be restored separately after all other MSRs have already been
/// restored.
///
/// `KvmVcpu::extract_deferred_msrs` removes every MSR listed here from the regular chunks and
/// `KvmVcpu::get_msr_chunks` appends them as the last chunk.
const DEFERRED_MSRS: [u32; 1] = [
    // MSR_IA32_TSC_DEADLINE must be restored after MSR_IA32_TSC, otherwise we risk "losing" timer
    // interrupts across the snapshot restore boundary (due to KVM querying MSR_IA32_TSC upon
    // writes to the TSC_DEADLINE MSR to determine whether it needs to prime a timer - if
    // MSR_IA32_TSC is not initialized correctly, it can wrongly assume no timer needs to be
    // primed, or the timer can be initialized with a wrong expiry).
    MSR_IA32_TSC_DEADLINE,
];
49
/// Errors associated with the wrappers over KVM ioctls.
// NOTE: `displaydoc::Display` is derived below, so the `///` comment on each variant is the
// text of its `Display` output (`{0}` is replaced with the wrapped value). Treat those doc
// comments as user-visible messages when editing them.
#[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)]
pub enum KvmVcpuError {
    /// Failed to convert `kvm_bindings::CpuId` to `Cpuid`: {0}
    ConvertCpuidType(#[from] cpuid::CpuidTryFromKvmCpuid),
    /// Failed FamStructWrapper operation: {0}
    Fam(#[from] vmm_sys_util::fam::Error),
    /// Failed to get dumpable MSR index list: {0}
    GetMsrsToDump(#[from] crate::arch::x86_64::msr::MsrError),
    /// Cannot open the VCPU file descriptor: {0}
    VcpuFd(kvm_ioctls::Error),
    /// Failed to get KVM vcpu debug regs: {0}
    VcpuGetDebugRegs(kvm_ioctls::Error),
    /// Failed to get KVM vcpu lapic: {0}
    VcpuGetLapic(kvm_ioctls::Error),
    /// Failed to get KVM vcpu mp state: {0}
    VcpuGetMpState(kvm_ioctls::Error),
    /// Failed to get KVM vcpu msr: {0:#x}
    // Carries the index of the MSR that could not be read (see `KvmVcpu::get_msr_chunk`).
    VcpuGetMsr(u32),
    /// Failed to get KVM vcpu msrs: {0}
    VcpuGetMsrs(kvm_ioctls::Error),
    /// Failed to get KVM vcpu regs: {0}
    VcpuGetRegs(kvm_ioctls::Error),
    /// Failed to get KVM vcpu sregs: {0}
    VcpuGetSregs(kvm_ioctls::Error),
    /// Failed to get KVM vcpu event: {0}
    VcpuGetVcpuEvents(kvm_ioctls::Error),
    /// Failed to get KVM vcpu xcrs: {0}
    VcpuGetXcrs(kvm_ioctls::Error),
    /// Failed to get KVM vcpu xsave via KVM_GET_XSAVE: {0}
    VcpuGetXsave(kvm_ioctls::Error),
    /// Failed to get KVM vcpu xsave via KVM_GET_XSAVE2: {0}
    VcpuGetXsave2(kvm_ioctls::Error),
    /// Failed to get KVM vcpu cpuid: {0}
    VcpuGetCpuid(kvm_ioctls::Error),
    /// Failed to get KVM TSC frequency: {0}
    VcpuGetTsc(kvm_ioctls::Error),
    /// Failed to set KVM vcpu cpuid: {0}
    VcpuSetCpuid(kvm_ioctls::Error),
    /// Failed to set KVM vcpu debug regs: {0}
    VcpuSetDebugRegs(kvm_ioctls::Error),
    /// Failed to set KVM vcpu lapic: {0}
    VcpuSetLapic(kvm_ioctls::Error),
    /// Failed to set KVM vcpu mp state: {0}
    VcpuSetMpState(kvm_ioctls::Error),
    /// Failed to set KVM vcpu msrs: {0}
    VcpuSetMsrs(kvm_ioctls::Error),
    /// Failed to set all KVM MSRs for this vCPU. Only a partial write was done.
    // Returned by `KvmVcpu::restore_state` when KVM reports fewer MSRs written than were
    // present in a chunk.
    VcpuSetMsrsIncomplete,
    /// Failed to set KVM vcpu regs: {0}
    VcpuSetRegs(kvm_ioctls::Error),
    /// Failed to set KVM vcpu sregs: {0}
    VcpuSetSregs(kvm_ioctls::Error),
    /// Failed to set KVM vcpu event: {0}
    VcpuSetVcpuEvents(kvm_ioctls::Error),
    /// Failed to set KVM vcpu xcrs: {0}
    VcpuSetXcrs(kvm_ioctls::Error),
    /// Failed to set KVM vcpu xsave: {0}
    VcpuSetXsave(kvm_ioctls::Error),
}
110
/// Error type for [`KvmVcpu::get_tsc_khz`] and [`KvmVcpu::is_tsc_scaling_required`].
///
/// Newtype around the errno-based error of the underlying ioctl; its `Display` output is the
/// wrapped error's (`#[error("{0}")]`).
// `derive_more::From` provides the `From<vmm_sys_util::errno::Error>` impl that lets callers
// use `?` on the raw ioctl result.
#[derive(Debug, thiserror::Error, derive_more::From, Eq, PartialEq)]
#[error("{0}")]
pub struct GetTscError(vmm_sys_util::errno::Error);
115
/// Error type for [`KvmVcpu::set_tsc_khz`].
///
/// Newtype around the error of the underlying ioctl; its `Display` output is the wrapped
/// error's (`#[error("{0}")]`).
#[derive(Debug, thiserror::Error, Eq, PartialEq)]
#[error("{0}")]
pub struct SetTscError(#[from] kvm_ioctls::Error);
120
/// Error type for [`KvmVcpu::configure`].
// NOTE: `displaydoc::Display` is derived below, so the `///` comment on each variant is the
// text of its `Display` output. Every variant has a `#[from]` conversion, which is what lets
// `configure` propagate the individual setup errors with `?`.
#[derive(Debug, thiserror::Error, displaydoc::Display, Eq, PartialEq)]
pub enum KvmVcpuConfigureError {
    /// Failed to convert `Cpuid` to `kvm_bindings::CpuId`: {0}
    ConvertCpuidType(#[from] vmm_sys_util::fam::Error),
    /// Failed to apply modifications to CPUID: {0}
    NormalizeCpuidError(#[from] cpuid::NormalizeCpuidError),
    /// Failed to set CPUID: {0}
    SetCpuid(#[from] vmm_sys_util::errno::Error),
    /// Failed to set MSRs: {0}
    SetMsrs(#[from] MsrError),
    /// Failed to setup registers: {0}
    SetupRegisters(#[from] SetupRegistersError),
    /// Failed to setup FPU: {0}
    SetupFpu(#[from] SetupFpuError),
    /// Failed to setup special registers: {0}
    SetupSpecialRegisters(#[from] SetupSpecialRegistersError),
    /// Failed to configure LAPICs: {0}
    SetLint(#[from] interrupts::InterruptError),
}
141
/// A wrapper around creating and using a kvm x86_64 vcpu.
#[derive(Debug)]
pub struct KvmVcpu {
    /// Index of vcpu.
    pub index: u8,
    /// KVM vcpu fd.
    pub fd: VcpuFd,
    /// Vcpu peripherals, such as buses
    pub peripherals: Peripherals,
    /// The list of MSRs to include in a VM snapshot, in the same order as KVM returned them
    /// from KVM_GET_MSR_INDEX_LIST.
    ///
    /// Initialized from the VM in [`KvmVcpu::new`] and extended by [`KvmVcpu::configure`] with
    /// the MSRs touched by the CPU template and those inferred from CPUID.
    msrs_to_save: Vec<u32>,
    /// Size in bytes required to hold the dynamically-sized `kvm_xsave` struct.
    ///
    /// `None` if `KVM_CAP_XSAVE2` is not supported.
    xsave2_size: Option<usize>,
}
159
/// Vcpu peripherals.
///
/// Both buses default to `None`; the port IO bus is attached through
/// [`KvmVcpu::set_pio_bus`].
#[derive(Default, Debug)]
pub struct Peripherals {
    /// Port IO bus, consulted when handling `VcpuExit::IoIn` / `VcpuExit::IoOut` exits.
    pub pio_bus: Option<Arc<Bus>>,
    /// Mmio bus.
    pub mmio_bus: Option<Arc<Bus>>,
}
168
169impl KvmVcpu {
170 /// Constructs a new kvm vcpu with arch specific functionality.
171 ///
172 /// # Arguments
173 ///
174 /// * `index` - Represents the 0-based CPU index between [0, max vcpus).
175 /// * `vm` - The vm to which this vcpu will get attached.
176 pub fn new(index: u8, vm: &Vm) -> Result<Self, KvmVcpuError> {
177 let kvm_vcpu = vm
178 .fd()
179 .create_vcpu(index.into())
180 .map_err(KvmVcpuError::VcpuFd)?;
181
182 Ok(KvmVcpu {
183 index,
184 fd: kvm_vcpu,
185 peripherals: Default::default(),
186 msrs_to_save: vm.msrs_to_save().to_vec(),
187 xsave2_size: vm.xsave2_size(),
188 })
189 }
190
191 /// Configures a x86_64 specific vcpu for booting Linux and should be called once per vcpu.
192 ///
193 /// # Arguments
194 ///
195 /// * `guest_mem` - The guest memory used by this microvm.
196 /// * `kernel_entry_point` - Specifies the boot protocol and offset from `guest_mem` at which
197 /// the kernel starts.
198 /// * `vcpu_config` - The vCPU configuration.
199 /// * `cpuid` - The capabilities exposed by this vCPU.
200 pub fn configure(
201 &mut self,
202 guest_mem: &GuestMemoryMmap,
203 kernel_entry_point: EntryPoint,
204 vcpu_config: &VcpuConfig,
205 ) -> Result<(), KvmVcpuConfigureError> {
206 let mut cpuid = vcpu_config.cpu_config.cpuid.clone();
207
208 // Apply machine specific changes to CPUID.
209 cpuid.normalize(
210 // The index of the current logical CPU in the range [0..cpu_count].
211 self.index,
212 // The total number of logical CPUs.
213 vcpu_config.vcpu_count,
214 // The number of bits needed to enumerate logical CPUs per core.
215 u8::from(vcpu_config.vcpu_count > 1 && vcpu_config.smt),
216 )?;
217
218 // Set CPUID.
219 let kvm_cpuid = kvm_bindings::CpuId::try_from(cpuid)?;
220
221 // Set CPUID in the KVM
222 self.fd
223 .set_cpuid2(&kvm_cpuid)
224 .map_err(KvmVcpuConfigureError::SetCpuid)?;
225
226 // Clone MSR entries that are modified by CPU template from `VcpuConfig`.
227 let mut msrs = vcpu_config.cpu_config.msrs.clone();
228 self.msrs_to_save.extend(msrs.keys());
229
230 // Apply MSR modification to comply the linux boot protocol.
231 create_boot_msr_entries().into_iter().for_each(|entry| {
232 msrs.insert(entry.index, entry.data);
233 });
234
235 // TODO - Add/amend MSRs for vCPUs based on cpu_config
236 // By this point the Guest CPUID is established. Some CPU features require MSRs
237 // to configure and interact with those features. If a MSR is writable from
238 // inside the Guest, or is changed by KVM or Firecracker on behalf of the Guest,
239 // then we will need to save it every time we take a snapshot, and restore its
240 // value when we restore the microVM since the Guest may need that value.
241 // Since CPUID tells us what features are enabled for the Guest, we can infer
242 // the extra MSRs that we need to save based on a dependency map.
243 let extra_msrs = cpuid::common::msrs_to_save_by_cpuid(&kvm_cpuid);
244 self.msrs_to_save.extend(extra_msrs);
245
246 // TODO: Some MSRs depend on values of other MSRs. This dependency will need to
247 // be implemented.
248
249 // By this point we know that at snapshot, the list of MSRs we need to
250 // save is `architectural MSRs` + `MSRs inferred through CPUID` + `other
251 // MSRs defined by the template`
252
253 let kvm_msrs = msrs
254 .into_iter()
255 .map(|entry| kvm_bindings::kvm_msr_entry {
256 index: entry.0,
257 data: entry.1,
258 ..Default::default()
259 })
260 .collect::<Vec<_>>();
261
262 crate::arch::x86_64::msr::set_msrs(&self.fd, &kvm_msrs)?;
263 crate::arch::x86_64::regs::setup_regs(&self.fd, kernel_entry_point)?;
264 crate::arch::x86_64::regs::setup_fpu(&self.fd)?;
265 crate::arch::x86_64::regs::setup_sregs(guest_mem, &self.fd, kernel_entry_point.protocol)?;
266 crate::arch::x86_64::interrupts::set_lint(&self.fd)?;
267 Ok(())
268 }
269
270 /// Sets a Port Mapped IO bus for this vcpu.
271 pub fn set_pio_bus(&mut self, pio_bus: Arc<Bus>) {
272 self.peripherals.pio_bus = Some(pio_bus);
273 }
274
275 /// Calls KVM_KVMCLOCK_CTRL to avoid guest soft lockup watchdog panics on resume.
276 /// See https://docs.kernel.org/virt/kvm/api.html .
277 pub fn kvmclock_ctrl(&self) {
278 // We do not want to fail if the call is not successful, because that may be acceptable
279 // depending on the workload. For example, EINVAL is returned if kvm-clock is not
280 // activated (e.g., no-kvmclock is specified in the guest kernel parameter).
281 // https://elixir.bootlin.com/linux/v6.17.5/source/arch/x86/kvm/x86.c#L5736-L5737
282 if let Err(err) = self.fd.kvmclock_ctrl() {
283 METRICS.vcpu.kvmclock_ctrl_fails.inc();
284 warn!("KVM_KVMCLOCK_CTRL call failed {}", err);
285 }
286 }
287
288 /// Get the current XSAVE state for this vCPU.
289 ///
290 /// The C `kvm_xsave` struct was extended by adding a flexible array member (FAM) in the end
291 /// to support variable-sized XSTATE buffer.
292 ///
293 /// https://elixir.bootlin.com/linux/v6.13.6/source/arch/x86/include/uapi/asm/kvm.h#L381
294 /// ```c
295 /// struct kvm_xsave {
296 /// __u32 region[1024];
297 /// __u32 extra[];
298 /// };
299 /// ```
300 ///
301 /// As shown above, the C `kvm_xsave` struct does not have any field for the size of itself or
302 /// the length of its FAM. The required size (in bytes) of `kvm_xsave` struct can be retrieved
303 /// via `KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)`.
304 ///
305 /// kvm-bindings defines `kvm_xsave2` struct that wraps the `kvm_xsave` struct to have `len`
306 /// field that indicates the number of FAM entries (i.e. `extra`), it also defines `Xsave` as
307 /// a `FamStructWrapper` of `kvm_xsave2`.
308 ///
309 /// https://github.com/rust-vmm/kvm/blob/68fff5491703bf32bd35656f7ba994a4cae9ea7d/kvm-bindings/src/x86_64/fam_wrappers.rs#L106
310 /// ```rs
311 /// pub struct kvm_xsave2 {
312 /// pub len: usize,
313 /// pub xsave: kvm_xsave,
314 /// }
315 /// ```
316 fn get_xsave(&self) -> Result<Xsave, KvmVcpuError> {
317 match self.xsave2_size {
318 // if `KVM_CAP_XSAVE2` supported
319 Some(xsave2_size) => {
320 // Convert the `kvm_xsave` size in bytes to the length of FAM (i.e. `extra`).
321 let fam_len =
322 // Calculate the size of FAM (`extra`) area in bytes. Note that the subtraction
323 // never underflows because `KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)` always returns
324 // at least 4096 bytes that is the size of `kvm_xsave` without FAM area.
325 (xsave2_size - std::mem::size_of::<kvm_xsave>())
326 // Divide by the size of FAM (`extra`) entry (i.e. `__u32`).
327 .div_ceil(std::mem::size_of::<<kvm_xsave2 as FamStruct>::Entry>());
328 let mut xsave = Xsave::new(fam_len).map_err(KvmVcpuError::Fam)?;
329 // SAFETY: Safe because `xsave` is allocated with enough size to save XSTATE.
330 unsafe { self.fd.get_xsave2(&mut xsave) }.map_err(KvmVcpuError::VcpuGetXsave2)?;
331 Ok(xsave)
332 }
333 // if `KVM_CAP_XSAVE2` not supported
334 None => Ok(
335 // SAFETY: The content is correctly laid out.
336 unsafe {
337 Xsave::from_raw(vec![kvm_xsave2 {
338 // Note that `len` is the number of FAM (`extra`) entries that didn't exist
339 // on older kernels not supporting `KVM_CAP_XSAVE2`. Thus, it's always zero.
340 len: 0,
341 xsave: self.fd.get_xsave().map_err(KvmVcpuError::VcpuGetXsave)?,
342 }])
343 },
344 ),
345 }
346 }
347
348 /// Get the current TSC frequency for this vCPU.
349 ///
350 /// # Errors
351 ///
352 /// When [`kvm_ioctls::VcpuFd::get_tsc_khz`] errors.
353 pub fn get_tsc_khz(&self) -> Result<u32, GetTscError> {
354 let res = self.fd.get_tsc_khz()?;
355 Ok(res)
356 }
357
358 /// Get CPUID for this vCPU.
359 ///
360 /// Opposed to KVM_GET_SUPPORTED_CPUID, KVM_GET_CPUID2 does not update "nent" with valid number
361 /// of entries on success. Thus, when it passes "num_entries" greater than required, zeroed
362 /// entries follow after valid entries. This function removes such zeroed empty entries.
363 ///
364 /// # Errors
365 ///
366 /// * When [`kvm_ioctls::VcpuFd::get_cpuid2`] returns errors.
367 fn get_cpuid(&self) -> Result<kvm_bindings::CpuId, KvmVcpuError> {
368 let mut cpuid = self
369 .fd
370 .get_cpuid2(KVM_MAX_CPUID_ENTRIES)
371 .map_err(KvmVcpuError::VcpuGetCpuid)?;
372
373 // As CPUID.0h:EAX should have the largest CPUID standard function, we don't need to check
374 // EBX, ECX and EDX to confirm whether it is a valid entry.
375 cpuid.retain(|entry| {
376 !(entry.function == 0 && entry.index == 0 && entry.flags == 0 && entry.eax == 0)
377 });
378
379 Ok(cpuid)
380 }
381
382 /// If the IA32_TSC_DEADLINE MSR value is zero, update it
383 /// with the IA32_TSC value to guarantee that
384 /// the vCPU will continue receiving interrupts after restoring from a snapshot.
385 ///
386 /// Rationale: we observed that sometimes when taking a snapshot,
387 /// the IA32_TSC_DEADLINE MSR is cleared, but the interrupt is not
388 /// delivered to the guest, leading to a situation where one
389 /// of the vCPUs never receives TSC interrupts after restoring,
390 /// until the MSR is updated externally, eg by setting the system time.
391 fn fix_zero_tsc_deadline_msr(msr_chunks: &mut [Msrs]) {
392 // We do not expect more than 1 TSC MSR entry, but if there are multiple, pick the maximum.
393 let max_tsc_value = msr_chunks
394 .iter()
395 .flat_map(|msrs| msrs.as_slice())
396 .filter(|msr| msr.index == MSR_IA32_TSC)
397 .map(|msr| msr.data)
398 .max();
399
400 if let Some(tsc_value) = max_tsc_value {
401 msr_chunks
402 .iter_mut()
403 .flat_map(|msrs| msrs.as_mut_slice())
404 .filter(|msr| msr.index == MSR_IA32_TSC_DEADLINE && msr.data == 0)
405 .for_each(|msr| {
406 warn!(
407 "MSR_IA32_TSC_DEADLINE is 0, replacing with {:#x}.",
408 tsc_value
409 );
410 msr.data = tsc_value;
411 });
412 }
413 }
414
415 /// Looks for MSRs from the [`DEFERRED_MSRS`] array and removes them from `msr_chunks`.
416 /// Returns a new [`Msrs`] object containing all the removed MSRs.
417 ///
418 /// We use this to capture some causal dependencies between MSRs where the relative order
419 /// of restoration matters (e.g. MSR_IA32_TSC must be restored before MSR_IA32_TSC_DEADLINE).
420 fn extract_deferred_msrs(msr_chunks: &mut [Msrs]) -> Result<Msrs, fam::Error> {
421 // Use 0 here as FamStructWrapper doesn't really give an equivalent of `Vec::with_capacity`,
422 // and if we specify something N != 0 here, then it will create a FamStructWrapper with N
423 // elements pre-allocated and zero'd out. Unless we then actually "fill" all those N values,
424 // KVM will later yell at us about invalid MSRs.
425 let mut deferred_msrs = Msrs::new(0)?;
426
427 for msrs in msr_chunks {
428 msrs.retain(|msr| {
429 if DEFERRED_MSRS.contains(&msr.index) {
430 deferred_msrs
431 .push(*msr)
432 .inspect_err(|err| {
433 error!(
434 "Failed to move MSR {} into later chunk: {:?}",
435 msr.index, err
436 )
437 })
438 .is_err()
439 } else {
440 true
441 }
442 });
443 }
444
445 Ok(deferred_msrs)
446 }
447
448 /// Get MSR chunks for the given MSR index list.
449 ///
450 /// KVM only supports getting `KVM_MAX_MSR_ENTRIES` at a time, so we divide
451 /// the list of MSR indices into chunks, call `KVM_GET_MSRS` for each
452 /// chunk, and collect into a [`Vec<Msrs>`].
453 ///
454 /// # Arguments
455 ///
456 /// * `msr_index_iter`: Iterator over MSR indices.
457 ///
458 /// # Errors
459 ///
460 /// * When [`kvm_bindings::Msrs::new`] returns errors.
461 /// * When [`kvm_ioctls::VcpuFd::get_msrs`] returns errors.
462 /// * When the return value of [`kvm_ioctls::VcpuFd::get_msrs`] (the number of entries that
463 /// could be gotten) is less than expected.
464 fn get_msr_chunks(
465 &self,
466 mut msr_index_iter: impl ExactSizeIterator<Item = u32>,
467 ) -> Result<Vec<Msrs>, KvmVcpuError> {
468 let num_chunks = msr_index_iter.len().div_ceil(KVM_MAX_MSR_ENTRIES);
469
470 // + 1 for the chunk of deferred MSRs
471 let mut msr_chunks: Vec<Msrs> = Vec::with_capacity(num_chunks + 1);
472
473 for _ in 0..num_chunks {
474 let chunk_len = msr_index_iter.len().min(KVM_MAX_MSR_ENTRIES);
475 let chunk = self.get_msr_chunk(&mut msr_index_iter, chunk_len)?;
476 msr_chunks.push(chunk);
477 }
478
479 Self::fix_zero_tsc_deadline_msr(&mut msr_chunks);
480
481 let deferred = Self::extract_deferred_msrs(&mut msr_chunks)?;
482 msr_chunks.push(deferred);
483
484 Ok(msr_chunks)
485 }
486
487 /// Get single MSR chunk for the given MSR index iterator with
488 /// specified length. Iterator should have enough elements
489 /// to fill the chunk with indices, otherwise KVM will
490 /// return an error when processing half filled chunk.
491 ///
492 /// # Arguments
493 ///
494 /// * `msr_index_iter`: Iterator over MSR indices.
495 /// * `chunk_size`: Length of a chunk.
496 ///
497 /// # Errors
498 ///
499 /// * When [`kvm_bindings::Msrs::new`] returns errors.
500 /// * When [`kvm_ioctls::VcpuFd::get_msrs`] returns errors.
501 /// * When the return value of [`kvm_ioctls::VcpuFd::get_msrs`] (the number of entries that
502 /// could be gotten) is less than expected.
503 pub fn get_msr_chunk(
504 &self,
505 msr_index_iter: impl Iterator<Item = u32>,
506 chunk_size: usize,
507 ) -> Result<Msrs, KvmVcpuError> {
508 let chunk_iter = msr_index_iter.take(chunk_size);
509
510 let mut msrs = Msrs::new(chunk_size)?;
511 let msr_entries = msrs.as_mut_slice();
512 for (pos, msr_index) in chunk_iter.enumerate() {
513 msr_entries[pos].index = msr_index;
514 }
515
516 let nmsrs = self
517 .fd
518 .get_msrs(&mut msrs)
519 .map_err(KvmVcpuError::VcpuGetMsrs)?;
520 // GET_MSRS returns a number of successfully set msrs.
521 // If number of set msrs is not equal to the length of
522 // `msrs`, then the value returned by GET_MSRS can act
523 // as an index to the problematic msr.
524 if nmsrs != chunk_size {
525 Err(KvmVcpuError::VcpuGetMsr(msrs.as_slice()[nmsrs].index))
526 } else {
527 Ok(msrs)
528 }
529 }
530
531 /// Get MSRs for the given MSR index list.
532 ///
533 /// # Arguments
534 ///
535 /// * `msr_index_list`: List of MSR indices
536 ///
537 /// # Errors
538 ///
539 /// * When `KvmVcpu::get_msr_chunks()` returns errors.
540 pub fn get_msrs(
541 &self,
542 msr_index_iter: impl ExactSizeIterator<Item = u32>,
543 ) -> Result<BTreeMap<u32, u64>, KvmVcpuError> {
544 let mut msrs = BTreeMap::new();
545 self.get_msr_chunks(msr_index_iter)?
546 .iter()
547 .for_each(|msr_chunk| {
548 msr_chunk.as_slice().iter().for_each(|msr| {
549 msrs.insert(msr.index, msr.data);
550 });
551 });
552 Ok(msrs)
553 }
554
555 /// Save the KVM internal state.
556 pub fn save_state(&self) -> Result<VcpuState, KvmVcpuError> {
557 // Ordering requirements:
558 //
559 // KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
560 // vCPU/LAPIC state. As such, it must be done before most everything
561 // else, otherwise we cannot restore everything and expect it to work.
562 //
563 // KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
564 // still running.
565 //
566 // KVM_GET_LAPIC may change state of LAPIC before returning it.
567 //
568 // GET_VCPU_EVENTS should probably be last to save. The code looks as
569 // it might as well be affected by internal state modifications of the
570 // GET ioctls.
571 //
572 // SREGS saves/restores a pending interrupt, similar to what
573 // VCPU_EVENTS also does.
574
575 let mp_state = self
576 .fd
577 .get_mp_state()
578 .map_err(KvmVcpuError::VcpuGetMpState)?;
579 let regs = self.fd.get_regs().map_err(KvmVcpuError::VcpuGetRegs)?;
580 let sregs = self.fd.get_sregs().map_err(KvmVcpuError::VcpuGetSregs)?;
581 let xsave = self.get_xsave()?;
582 let xcrs = self.fd.get_xcrs().map_err(KvmVcpuError::VcpuGetXcrs)?;
583 let debug_regs = self
584 .fd
585 .get_debug_regs()
586 .map_err(KvmVcpuError::VcpuGetDebugRegs)?;
587 let lapic = self.fd.get_lapic().map_err(KvmVcpuError::VcpuGetLapic)?;
588 let tsc_khz = self.get_tsc_khz().ok().or_else(|| {
589 // v0.25 and newer snapshots without TSC will only work on
590 // the same CPU model as the host on which they were taken.
591 // TODO: Add negative test for this warning failure.
592 warn!("TSC freq not available. Snapshot cannot be loaded on a different CPU model.");
593 None
594 });
595 let cpuid = self.get_cpuid()?;
596 let saved_msrs = self.get_msr_chunks(self.msrs_to_save.iter().copied())?;
597 let vcpu_events = self
598 .fd
599 .get_vcpu_events()
600 .map_err(KvmVcpuError::VcpuGetVcpuEvents)?;
601
602 Ok(VcpuState {
603 cpuid,
604 saved_msrs,
605 debug_regs,
606 lapic,
607 mp_state,
608 regs,
609 sregs,
610 vcpu_events,
611 xcrs,
612 xsave,
613 tsc_khz,
614 })
615 }
616
617 /// Dumps CPU configuration (CPUID and MSRs).
618 ///
619 /// Opposed to `save_state()`, this dumps all the supported and dumpable MSRs not limited to
620 /// serializable ones.
621 pub fn dump_cpu_config(&self) -> Result<CpuConfiguration, KvmVcpuError> {
622 let cpuid = cpuid::Cpuid::try_from(self.get_cpuid()?)?;
623 let kvm = kvm_ioctls::Kvm::new().unwrap();
624 let msr_index_list = crate::arch::x86_64::msr::get_msrs_to_dump(&kvm)?;
625 let msrs = self.get_msrs(msr_index_list.as_slice().iter().copied())?;
626 Ok(CpuConfiguration { cpuid, msrs })
627 }
628
629 /// Checks whether the TSC needs scaling when restoring a snapshot.
630 ///
631 /// # Errors
632 ///
633 /// When
634 pub fn is_tsc_scaling_required(&self, state_tsc_freq: u32) -> Result<bool, GetTscError> {
635 // Compare the current TSC freq to the one found
636 // in the state. If they are different, we need to
637 // scale the TSC to the freq found in the state.
638 // We accept values within a tolerance of 250 parts
639 // per million because it is common for TSC frequency
640 // to differ due to calibration at boot time.
641 let diff = (i64::from(self.get_tsc_khz()?) - i64::from(state_tsc_freq)).abs();
642 // Cannot overflow since u32::MAX * 250 < i64::MAX
643 Ok(diff > i64::from(state_tsc_freq) * TSC_KHZ_TOL_NUMERATOR / TSC_KHZ_TOL_DENOMINATOR)
644 }
645
646 /// Scale the TSC frequency of this vCPU to the one provided as a parameter.
647 pub fn set_tsc_khz(&self, tsc_freq: u32) -> Result<(), SetTscError> {
648 self.fd.set_tsc_khz(tsc_freq).map_err(SetTscError)
649 }
650
651 /// Use provided state to populate KVM internal state.
652 pub fn restore_state(&self, state: &VcpuState) -> Result<(), KvmVcpuError> {
653 // Ordering requirements:
654 //
655 // KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
656 // still running.
657 //
658 // Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
659 // if we ever change the BSP, we have to do that before restoring anything.
660 // The same seems to be true for CPUID stuff.
661 //
662 // SREGS saves/restores a pending interrupt, similar to what
663 // VCPU_EVENTS also does.
664 //
665 // SET_REGS clears pending exceptions unconditionally, thus, it must be
666 // done before SET_VCPU_EVENTS, which restores it.
667 //
668 // SET_LAPIC must come after SET_SREGS, because the latter restores
669 // the apic base msr.
670 //
671 // SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
672 // only restores successfully, when the LAPIC is correctly configured.
673
674 self.fd
675 .set_cpuid2(&state.cpuid)
676 .map_err(KvmVcpuError::VcpuSetCpuid)?;
677 self.fd
678 .set_mp_state(state.mp_state)
679 .map_err(KvmVcpuError::VcpuSetMpState)?;
680 self.fd
681 .set_regs(&state.regs)
682 .map_err(KvmVcpuError::VcpuSetRegs)?;
683 self.fd
684 .set_sregs(&state.sregs)
685 .map_err(KvmVcpuError::VcpuSetSregs)?;
686 // SAFETY: Safe unless the snapshot is corrupted.
687 unsafe {
688 // kvm-ioctl's `set_xsave2()` can be called even on kernel versions not supporting
689 // `KVM_CAP_XSAVE2`, because it internally calls `KVM_SET_XSAVE` API that was extended
690 // by Linux kernel. Thus, `KVM_SET_XSAVE2` API does not exist as a KVM interface.
691 // However, kvm-ioctl added `set_xsave2()` to allow users to pass `Xsave` instead of the
692 // older `kvm_xsave`.
693 self.fd
694 .set_xsave2(&state.xsave)
695 .map_err(KvmVcpuError::VcpuSetXsave)?;
696 }
697 self.fd
698 .set_xcrs(&state.xcrs)
699 .map_err(KvmVcpuError::VcpuSetXcrs)?;
700 self.fd
701 .set_debug_regs(&state.debug_regs)
702 .map_err(KvmVcpuError::VcpuSetDebugRegs)?;
703 self.fd
704 .set_lapic(&state.lapic)
705 .map_err(KvmVcpuError::VcpuSetLapic)?;
706 for msrs in &state.saved_msrs {
707 let nmsrs = self.fd.set_msrs(msrs).map_err(KvmVcpuError::VcpuSetMsrs)?;
708 if nmsrs < msrs.as_fam_struct_ref().nmsrs as usize {
709 return Err(KvmVcpuError::VcpuSetMsrsIncomplete);
710 }
711 }
712 self.fd
713 .set_vcpu_events(&state.vcpu_events)
714 .map_err(KvmVcpuError::VcpuSetVcpuEvents)?;
715
716 self.kvmclock_ctrl();
717 Ok(())
718 }
719}
720
721impl Peripherals {
722 /// Runs the vCPU in KVM context and handles the kvm exit reason.
723 ///
724 /// Returns error or enum specifying whether emulation was handled or interrupted.
725 pub fn run_arch_emulation(&self, exit: VcpuExit) -> Result<VcpuEmulation, VcpuError> {
726 match exit {
727 VcpuExit::IoIn(addr, data) => {
728 if let Some(pio_bus) = &self.pio_bus {
729 let _metric = METRICS.vcpu.exit_io_in_agg.record_latency_metrics();
730 if let Err(err) = pio_bus.read(u64::from(addr), data) {
731 warn!("vcpu: IO read @ {addr:#x}:{:#x} failed: {err}", data.len());
732 }
733 METRICS.vcpu.exit_io_in.inc();
734 }
735 Ok(VcpuEmulation::Handled)
736 }
737 VcpuExit::IoOut(addr, data) => {
738 if let Some(pio_bus) = &self.pio_bus {
739 let _metric = METRICS.vcpu.exit_io_out_agg.record_latency_metrics();
740 if let Err(err) = pio_bus.write(u64::from(addr), data) {
741 warn!("vcpu: IO write @ {addr:#x}:{:#x} failed: {err}", data.len());
742 }
743 METRICS.vcpu.exit_io_out.inc();
744 }
745 Ok(VcpuEmulation::Handled)
746 }
747 unexpected_exit => {
748 METRICS.vcpu.failures.inc();
749 // TODO: Are we sure we want to finish running a vcpu upon
750 // receiving a vm exit that is not necessarily an error?
751 error!("Unexpected exit reason on vcpu run: {:?}", unexpected_exit);
752 Err(VcpuError::UnhandledKvmExit(format!(
753 "{:?}",
754 unexpected_exit
755 )))
756 }
757 }
758 }
759}
760
/// Structure holding VCPU kvm state.
///
/// Produced by [`KvmVcpu::save_state`] and consumed by [`KvmVcpu::restore_state`].
#[derive(Serialize, Deserialize)]
pub struct VcpuState {
    /// CpuId.
    pub cpuid: CpuId,
    /// Saved msrs, in chunks. They are written back chunk by chunk, in order, on restore.
    pub saved_msrs: Vec<Msrs>,
    /// Debug regs.
    pub debug_regs: kvm_debugregs,
    /// Lapic.
    pub lapic: kvm_lapic_state,
    /// Mp state
    pub mp_state: kvm_mp_state,
    /// Kvm regs.
    pub regs: kvm_regs,
    /// Sregs.
    pub sregs: kvm_sregs,
    /// Vcpu events
    pub vcpu_events: kvm_vcpu_events,
    /// Xcrs.
    pub xcrs: kvm_xcrs,
    /// Xsave.
    pub xsave: Xsave,
    /// Tsc khz. `None` when the TSC frequency could not be read while saving the state.
    pub tsc_khz: Option<u32>,
}
787
788impl Debug for VcpuState {
789 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
790 let mut debug_kvm_regs: Vec<kvm_bindings::kvm_msrs> = Vec::new();
791 for kvm_msrs in self.saved_msrs.iter() {
792 debug_kvm_regs = kvm_msrs.clone().into_raw();
793 debug_kvm_regs.sort_by_key(|msr| (msr.nmsrs, msr.pad));
794 }
795 f.debug_struct("VcpuState")
796 .field("cpuid", &self.cpuid)
797 .field("saved_msrs", &debug_kvm_regs)
798 .field("debug_regs", &self.debug_regs)
799 .field("lapic", &self.lapic)
800 .field("mp_state", &self.mp_state)
801 .field("regs", &self.regs)
802 .field("sregs", &self.sregs)
803 .field("vcpu_events", &self.vcpu_events)
804 .field("xcrs", &self.xcrs)
805 .field("xsave", &self.xsave)
806 .field("tsc_khz", &self.tsc_khz)
807 .finish()
808 }
809}
810
811#[cfg(test)]
812mod tests {
813 #![allow(clippy::undocumented_unsafe_blocks)]
814
815 use kvm_bindings::kvm_msr_entry;
816 use kvm_ioctls::Cap;
817 use vm_memory::GuestAddress;
818
819 use super::*;
820 use crate::arch::BootProtocol;
821 use crate::arch::x86_64::cpu_model::CpuModel;
822 use crate::cpu_config::templates::{
823 CpuConfiguration, CpuTemplateType, CustomCpuTemplate, GetCpuTemplate, GuestConfigError,
824 StaticCpuTemplate,
825 };
826 use crate::cpu_config::x86_64::cpuid::{Cpuid, CpuidEntry, CpuidKey};
827 use crate::vstate::kvm::Kvm;
828 use crate::vstate::vm::Vm;
829 use crate::vstate::vm::tests::{setup_vm, setup_vm_with_memory};
830
831 impl Default for VcpuState {
832 fn default() -> Self {
833 VcpuState {
834 cpuid: CpuId::new(1).unwrap(),
835 saved_msrs: vec![Msrs::new(1).unwrap()],
836 debug_regs: Default::default(),
837 lapic: Default::default(),
838 mp_state: Default::default(),
839 regs: Default::default(),
840 sregs: Default::default(),
841 vcpu_events: Default::default(),
842 xcrs: Default::default(),
843 xsave: Xsave::new(0).unwrap(),
844 tsc_khz: Some(0),
845 }
846 }
847 }
848
849 fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, KvmVcpu) {
850 let (kvm, vm) = setup_vm_with_memory(mem_size);
851 vm.setup_irqchip().unwrap();
852 let vcpu = KvmVcpu::new(0, &vm).unwrap();
853 (kvm, vm, vcpu)
854 }
855
856 fn create_vcpu_config(
857 kvm: &Kvm,
858 vcpu: &KvmVcpu,
859 template: &CustomCpuTemplate,
860 ) -> Result<VcpuConfig, GuestConfigError> {
861 let cpuid = Cpuid::try_from(kvm.supported_cpuid.clone())
862 .map_err(GuestConfigError::CpuidFromKvmCpuid)?;
863 let msrs = vcpu
864 .get_msrs(template.msr_index_iter())
865 .map_err(GuestConfigError::VcpuIoctl)?;
866 let base_cpu_config = CpuConfiguration { cpuid, msrs };
867 let cpu_config = CpuConfiguration::apply_template(base_cpu_config, template)?;
868 Ok(VcpuConfig {
869 vcpu_count: 1,
870 smt: false,
871 cpu_config,
872 })
873 }
874
#[test]
fn test_configure_vcpu() {
    let (kvm, vm, mut vcpu) = setup_vcpu(0x10000);

    // Baseline: configuring with an empty custom template must succeed.
    let vcpu_config = create_vcpu_config(&kvm, &vcpu, &CustomCpuTemplate::default()).unwrap();
    let baseline = vcpu.configure(
        vm.guest_memory(),
        EntryPoint {
            entry_addr: GuestAddress(0),
            protocol: BootProtocol::LinuxBoot,
        },
        &vcpu_config,
    );
    assert_eq!(baseline, Ok(()));

    // Returns `true` only if the static template can be resolved, turned into a vcpu
    // configuration and applied to the vcpu; any failure along the way yields `false`.
    let try_configure = |kvm: &Kvm, vcpu: &mut KvmVcpu, template| -> bool {
        let cpu_template = Some(CpuTemplateType::Static(template));
        cpu_template
            .get_cpu_template()
            .ok()
            .and_then(|template| create_vcpu_config(kvm, vcpu, &template).ok())
            .is_some_and(|config| {
                vcpu.configure(
                    vm.guest_memory(),
                    EntryPoint {
                        entry_addr: GuestAddress(crate::arch::get_kernel_start()),
                        protocol: BootProtocol::LinuxBoot,
                    },
                    &config,
                )
                .is_ok()
            })
    };

    // Try every static template, in the same order on every host.
    let t2_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2);
    let c3_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::C3);
    let t2s_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2S);
    let t2cl_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2CL);
    let t2a_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2A);

    // Whether the template lists the host CPU model among its supported models.
    let cpu_model = CpuModel::get_cpu_model();
    let host_supports = |template: StaticCpuTemplate| {
        template.get_supported_cpu_models().contains(&cpu_model)
    };

    match &cpuid::common::get_vendor_id_from_host().unwrap() {
        cpuid::VENDOR_ID_INTEL => {
            // On Intel, each of these templates must configure exactly when it lists the
            // host CPU model; T2A must never configure.
            assert_eq!(t2_res, host_supports(StaticCpuTemplate::T2));
            assert_eq!(c3_res, host_supports(StaticCpuTemplate::C3));
            assert_eq!(t2s_res, host_supports(StaticCpuTemplate::T2S));
            assert_eq!(t2cl_res, host_supports(StaticCpuTemplate::T2CL));
            assert!(!t2a_res);
        }
        cpuid::VENDOR_ID_AMD => {
            // On AMD, only T2A may configure, and only on a supported CPU model.
            assert!(!t2_res);
            assert!(!c3_res);
            assert!(!t2s_res);
            assert!(!t2cl_res);
            assert_eq!(t2a_res, host_supports(StaticCpuTemplate::T2A));
        }
        _ => {
            // Any other vendor: no static template may configure.
            assert!(!t2_res);
            assert!(!c3_res);
            assert!(!t2s_res);
            assert!(!t2cl_res);
            assert!(!t2a_res);
        }
    }
}
978
#[test]
fn test_vcpu_cpuid_restore() {
    let (kvm, _, vcpu) = setup_vcpu(0x10000);
    vcpu.fd.set_cpuid2(&kvm.supported_cpuid).unwrap();

    // Mutate the CPUID.
    // Leaf 0x3 / EAX that is an unused (reserved to be accurate) register, so it's harmless.
    let mut state = vcpu.save_state().unwrap();
    for entry in state.cpuid.as_mut_slice() {
        if entry.function == 3 && entry.index == 0 {
            entry.eax = 0x1234_5678;
        }
    }

    // Restore the state into the existing vcpu.
    if let Err(err) = vcpu.restore_state(&state) {
        panic!("{err}");
    }
    drop(vcpu);

    // Restore the state into a new vcpu.
    let (_, _vm, vcpu) = setup_vcpu(0x10000);
    if let Err(err) = vcpu.restore_state(&state) {
        panic!("{err}");
    }

    // Validate the mutated cpuid is restored correctly.
    let state = vcpu.save_state().unwrap();
    let cpuid = Cpuid::try_from(state.cpuid).unwrap();
    let leaf3 = cpuid
        .inner()
        .get(&CpuidKey {
            leaf: 0x3,
            subleaf: 0x0,
        })
        .unwrap();
    // `assert_eq!` (rather than `assert!(a == b)`) so a failure reports the value that was
    // actually read back.
    assert_eq!(leaf3.result.eax, 0x1234_5678);
}
1015
#[test]
fn test_empty_cpuid_entries_removed() {
    // Checks that `get_cpuid()` does not return all-zero entries after the vcpu has been
    // configured with KVM's supported CPUID.
    let (kvm, vm, mut vcpu) = setup_vcpu(0x10000);

    let cpu_config = CpuConfiguration {
        cpuid: Cpuid::try_from(kvm.supported_cpuid.clone()).unwrap(),
        msrs: BTreeMap::new(),
    };
    let vcpu_config = VcpuConfig {
        vcpu_count: 1,
        smt: false,
        cpu_config,
    };
    let entry_point = EntryPoint {
        entry_addr: GuestAddress(0),
        protocol: BootProtocol::LinuxBoot,
    };
    vcpu.configure(vm.guest_memory(), entry_point, &vcpu_config)
        .unwrap();

    // Every returned entry must have at least one non-zero field.
    let cpuid = vcpu.get_cpuid().unwrap();
    for entry in cpuid.as_slice() {
        assert!(
            entry.function != 0
                || entry.index != 0
                || entry.flags != 0
                || entry.eax != 0
                || entry.ebx != 0
                || entry.ecx != 0
                || entry.edx != 0
        );
    }

    // Leaf 0 must be present and must differ from a default-constructed entry.
    let cpuid = Cpuid::try_from(cpuid).unwrap();
    let leaf0_key = CpuidKey {
        leaf: 0,
        subleaf: 0,
    };
    let leaf0 = cpuid.inner().get(&leaf0_key).unwrap();
    assert_ne!(leaf0, &CpuidEntry::default());
}
1067
#[test]
fn test_dump_cpu_config_with_non_configured_vcpu() {
    // Exercise `dump_cpu_config()` on a vcpu that has not been configured yet.
    //
    // `KVM_GET_CPUID2` returns the result of `KVM_SET_CPUID2`. See
    // https://docs.kernel.org/virt/kvm/api.html#kvm-set-cpuid
    // Because `KVM_SET_CPUID2` has not been issued at this point, all leaves should be
    // zero-filled, so `KvmVcpu::dump_cpu_config()` is expected to fail with a CPUID type
    // conversion error.
    let (_, _, vcpu) = setup_vcpu(0x10000);
    match vcpu.dump_cpu_config() {
        Ok(_) => panic!("Dumping CPU configuration should fail before vcpu configuration."),
        Err(KvmVcpuError::ConvertCpuidType(_)) => {}
        Err(other) => panic!("Unexpected error: {other}"),
    }
}
1084
#[test]
fn test_dump_cpu_config_with_configured_vcpu() {
    // Once the vcpu has been configured with KVM's supported CPUID, `dump_cpu_config()`
    // must succeed.
    let (kvm, vm, mut vcpu) = setup_vcpu(0x10000);

    let cpu_config = CpuConfiguration {
        cpuid: Cpuid::try_from(kvm.supported_cpuid.clone()).unwrap(),
        msrs: BTreeMap::new(),
    };
    let vcpu_config = VcpuConfig {
        vcpu_count: 1,
        smt: false,
        cpu_config,
    };
    let entry_point = EntryPoint {
        entry_addr: GuestAddress(0),
        protocol: BootProtocol::LinuxBoot,
    };

    vcpu.configure(vm.guest_memory(), entry_point, &vcpu_config)
        .unwrap();
    vcpu.dump_cpu_config().unwrap();
}
1109
#[test]
fn test_is_tsc_scaling_required() {
    // Test `is_tsc_scaling_required` as if it were on the same
    // CPU model as the one in the snapshot state.
    let (_, _, vcpu) = setup_vcpu(0x1000);

    // TSC frequency reported in the saved state of this vcpu. The arithmetic below is done
    // in `i64` (the type of the tolerance constants) so that the test's own computation
    // cannot overflow `u32`, whatever the frequency is.
    let saved_khz = i64::from(vcpu.save_state().unwrap().tsc_khz.unwrap());
    let tolerance_khz = saved_khz * TSC_KHZ_TOL_NUMERATOR / TSC_KHZ_TOL_DENOMINATOR;

    {
        // The frequency difference is within tolerance (half of it).
        let within_khz = u32::try_from(saved_khz + tolerance_khz / 2).unwrap();
        assert!(!vcpu.is_tsc_scaling_required(within_khz).unwrap());
    }

    {
        // The frequency difference is over the tolerance (twice as much).
        let beyond_khz = u32::try_from(saved_khz + tolerance_khz * 2).unwrap();
        assert!(vcpu.is_tsc_scaling_required(beyond_khz).unwrap());
    }

    {
        // Try a large frequency (30GHz) in the state and check it doesn't
        // overflow
        assert!(vcpu.is_tsc_scaling_required(30_000_000).unwrap());
    }
}
1154
#[test]
fn test_set_tsc() {
    let (kvm, _, vcpu) = setup_vcpu(0x1000);

    // Target frequency: the saved one shifted by twice the accepted tolerance.
    let saved_khz = vcpu.save_state().unwrap().tsc_khz.unwrap();
    let tol_numerator = u32::try_from(TSC_KHZ_TOL_NUMERATOR).unwrap();
    let tol_denominator = u32::try_from(TSC_KHZ_TOL_DENOMINATOR).unwrap();
    let target_khz = saved_khz + saved_khz * tol_numerator / tol_denominator * 2;

    // Without TSC control, setting the frequency is expected to fail.
    if !kvm.fd.check_extension(Cap::TscControl) {
        vcpu.set_tsc_khz(target_khz).unwrap_err();
        return;
    }

    vcpu.set_tsc_khz(target_khz).unwrap();
    // Reading the frequency back is only expected to work when the host reports the
    // `GetTscKhz` capability.
    if kvm.fd.check_extension(Cap::GetTscKhz) {
        assert_eq!(vcpu.get_tsc_khz().ok(), Some(target_khz));
    } else {
        vcpu.get_tsc_khz().unwrap_err();
    }
}
1177
#[test]
fn test_get_msrs_with_msrs_to_save() {
    // `get_msrs()` is called with the vcpu's own list of MSR indices to be saved into
    // snapshots. All of them are expected to be valid, so the call must succeed.
    let (_, _, vcpu) = setup_vcpu(0x1000);
    let indices = vcpu.msrs_to_save.iter().copied();
    vcpu.get_msrs(indices).unwrap();
}
1185
#[test]
fn test_get_msrs_with_msrs_to_dump() {
    // `get_msrs()` is called with the list returned by `get_msrs_to_dump()`. All of those
    // indices are expected to be valid, so the call must succeed.
    let (_, _, vcpu) = setup_vcpu(0x1000);

    let kvm = kvm_ioctls::Kvm::new().unwrap();
    let msrs_to_dump = crate::arch::x86_64::msr::get_msrs_to_dump(&kvm).unwrap();
    let indices = msrs_to_dump.as_slice();
    vcpu.get_msrs(indices.iter().copied()).unwrap();
}
1197
#[test]
fn test_get_msrs_with_invalid_msr_index() {
    // `get_msrs()` with unsupported MSR indices is expected to return a `VcpuGetMsr` error,
    // which happens when `KVM_GET_MSRS` fails to populate MSR values in the middle and exits.
    // Currently, MSR indices 2..=4 are not listed as supported MSRs.
    let (_, _, vcpu) = setup_vcpu(0x1000);
    let unsupported_indices = [2u32, 3, 4];
    match vcpu.get_msrs(unsupported_indices.iter().copied()) {
        Ok(_) => {
            panic!("KvmVcpu::get_msrs() for unsupported MSRs should fail with VcpuGetMsr.")
        }
        Err(KvmVcpuError::VcpuGetMsr(_)) => {}
        Err(other) => panic!("Unexpected error: {other}"),
    }
}
1213
1214 fn msrs_from_entries(msr_entries: &[(u32, u64)]) -> Msrs {
1215 Msrs::from_entries(
1216 &msr_entries
1217 .iter()
1218 .map(|&(index, data)| kvm_msr_entry {
1219 index,
1220 data,
1221 ..Default::default()
1222 })
1223 .collect::<Vec<_>>(),
1224 )
1225 .unwrap()
1226 }
1227
1228 fn assert_msrs(msr_chunks: &[Msrs], expected_msr_entries: &[(u32, u64)]) {
1229 let flattened_msrs = msr_chunks.iter().flat_map(|msrs| msrs.as_slice());
1230 for (a, b) in flattened_msrs.zip(expected_msr_entries.iter()) {
1231 assert_eq!(a.index, b.0);
1232 assert_eq!(a.data, b.1);
1233 }
1234 }
1235
#[test]
fn test_defer_msrs() {
    // One chunk holding a deferred MSR followed by a regular one.
    let to_defer = DEFERRED_MSRS[0];
    let mut msr_chunks = [msrs_from_entries(&[(to_defer, 0), (MSR_IA32_TSC, 1)])];

    let deferred = KvmVcpu::extract_deferred_msrs(&mut msr_chunks).unwrap();

    let deferred_entries = deferred.as_slice();
    let remaining_entries = msr_chunks[0].as_slice();

    assert_eq!(deferred_entries.len(), 1, "did not correctly defer MSR");
    assert_eq!(
        remaining_entries.len(),
        1,
        "deferred MSR not removed from chunk"
    );

    // The deferred MSR moved out; the regular one stayed in the chunk.
    assert_eq!(deferred_entries[0].index, to_defer);
    assert_eq!(remaining_entries[0].index, MSR_IA32_TSC);
}
1254
#[test]
fn test_fix_zero_tsc_deadline_msr_zero_same_chunk() {
    // A zero TSC_DEADLINE and the TSC MSR live in the same chunk.
    let initial: [(u32, u64); 2] = [(MSR_IA32_TSC_DEADLINE, 0), (MSR_IA32_TSC, 42)];
    let mut msr_chunks = [msrs_from_entries(&initial)];

    KvmVcpu::fix_zero_tsc_deadline_msr(&mut msr_chunks);

    // MSR_IA32_TSC_DEADLINE is expected to have been updated with the MSR_IA32_TSC value.
    let expected: [(u32, u64); 2] = [(MSR_IA32_TSC_DEADLINE, 42), (MSR_IA32_TSC, 42)];
    assert_msrs(&msr_chunks, &expected);
}
1271
#[test]
fn test_fix_zero_tsc_deadline_msr_zero_separate_chunks() {
    // A zero TSC_DEADLINE and the TSC MSR live in two different chunks.
    let deadline_chunk = msrs_from_entries(&[(MSR_IA32_TSC_DEADLINE, 0)]);
    let tsc_chunk = msrs_from_entries(&[(MSR_IA32_TSC, 42)]);
    let mut msr_chunks = [deadline_chunk, tsc_chunk];

    KvmVcpu::fix_zero_tsc_deadline_msr(&mut msr_chunks);

    // MSR_IA32_TSC_DEADLINE is expected to have been updated with the MSR_IA32_TSC value.
    let expected: [(u32, u64); 2] = [(MSR_IA32_TSC_DEADLINE, 42), (MSR_IA32_TSC, 42)];
    assert_msrs(&msr_chunks, &expected);
}
1288
#[test]
fn test_fix_zero_tsc_deadline_msr_non_zero() {
    // TSC_DEADLINE already holds a non-zero value.
    let initial: [(u32, u64); 2] = [(MSR_IA32_TSC_DEADLINE, 1), (MSR_IA32_TSC, 2)];
    let mut msr_chunks = [msrs_from_entries(&initial)];

    KvmVcpu::fix_zero_tsc_deadline_msr(&mut msr_chunks);

    // MSR_IA32_TSC_DEADLINE is expected to remain unchanged, because it is non-zero
    // already: the chunk must still hold the initial values.
    assert_msrs(&msr_chunks, &initial);
}
1305
#[test]
fn test_get_msr_chunks_preserved_order() {
    // Regression test for #4666
    let (_, vm) = setup_vm();
    let vcpu = KvmVcpu::new(0, &vm).unwrap();

    // The list of supported MSR indices, in the order they were returned by KVM
    let msrs_to_save = vm.msrs_to_save();
    // The MSRs after processing.
    let msr_chunks = vcpu
        .get_msr_chunks(vcpu.msrs_to_save.iter().copied())
        .unwrap();

    let actual: Vec<u32> = msr_chunks
        .iter()
        .flat_map(|chunk| chunk.as_slice().iter())
        .map(|entry| entry.index)
        .collect();

    // The order should be identical to the one returned by KVM, with the exception of
    // deferred MSRs, which should be moved to the end (but show up in the same order as they
    // are listed in [`DEFERRED_MSRS`]). A deferred MSR is only expected if it is part of the
    // saved list in the first place.
    let expected: Vec<u32> = msrs_to_save
        .iter()
        .filter(|&idx| !DEFERRED_MSRS.contains(idx))
        .chain(
            DEFERRED_MSRS
                .iter()
                .filter(|&deferred| msrs_to_save.iter().any(|idx| idx == deferred)),
        )
        .copied()
        .collect();

    // Compare the complete sequences: pairing them with `zip` would stop at the shorter one
    // and hide dropped or extra MSRs.
    assert_eq!(actual, expected);
}
1332}