vmm/arch/x86_64/
vm.rs

1// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::fmt;
5use std::sync::{Arc, Mutex};
6
7use kvm_bindings::{
8    KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE,
9    KVM_PIT_SPEAKER_DUMMY, MsrList, kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2,
10};
11use kvm_ioctls::Cap;
12use serde::{Deserialize, Serialize};
13
14use crate::arch::x86_64::msr::MsrError;
15use crate::snapshot::Persist;
16use crate::utils::u64_to_usize;
17use crate::vstate::bus::Bus;
18use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState};
19use crate::vstate::resources::ResourceAllocator;
20use crate::vstate::vm::{VmCommon, VmError};
21
22/// Error type for [`Vm::restore_state`]
23#[allow(missing_docs)]
24#[cfg(target_arch = "x86_64")]
25#[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)]
26pub enum ArchVmError {
27    /// Failed to check KVM capability (0): {1}
28    CheckCapability(Cap, kvm_ioctls::Error),
29    /// Set PIT2 error: {0}
30    SetPit2(kvm_ioctls::Error),
31    /// Set clock error: {0}
32    SetClock(kvm_ioctls::Error),
33    /// Set IrqChipPicMaster error: {0}
34    SetIrqChipPicMaster(kvm_ioctls::Error),
35    /// Set IrqChipPicSlave error: {0}
36    SetIrqChipPicSlave(kvm_ioctls::Error),
37    /// Set IrqChipIoAPIC error: {0}
38    SetIrqChipIoAPIC(kvm_ioctls::Error),
39    /// Failed to get KVM vm pit state: {0}
40    VmGetPit2(kvm_ioctls::Error),
41    /// Failed to get KVM vm clock: {0}
42    VmGetClock(kvm_ioctls::Error),
43    /// Failed to get KVM vm irqchip: {0}
44    VmGetIrqChip(kvm_ioctls::Error),
45    /// Failed to set KVM vm irqchip: {0}
46    VmSetIrqChip(kvm_ioctls::Error),
47    /// Failed to get MSR index list to save into snapshots: {0}
48    GetMsrsToSave(MsrError),
49    /// Failed during KVM_SET_TSS_ADDRESS: {0}
50    SetTssAddress(kvm_ioctls::Error),
51}
52
/// Structure representing the current architecture's understanding of what a "virtual machine" is.
#[derive(Debug)]
pub struct ArchVm {
    /// Architecture independent parts of a vm
    pub common: VmCommon,
    /// List of MSR indices to save when creating snapshots, queried once in [`ArchVm::new`].
    msrs_to_save: MsrList,
    /// Size in bytes required to hold the dynamically-sized `kvm_xsave` struct.
    ///
    /// `None` if `KVM_CAP_XSAVE2` is not supported.
    xsave2_size: Option<usize>,
    /// Port IO bus
    pub pio_bus: Arc<Bus>,
}
66
67impl ArchVm {
68    /// Create a new `Vm` struct.
69    pub fn new(kvm: &crate::vstate::kvm::Kvm) -> Result<ArchVm, VmError> {
70        let common = Self::create_common(kvm)?;
71
72        let msrs_to_save = kvm.msrs_to_save().map_err(ArchVmError::GetMsrsToSave)?;
73
74        // `KVM_CAP_XSAVE2` was introduced to support dynamically-sized XSTATE buffer in kernel
75        // v5.17. `KVM_GET_EXTENSION(KVM_CAP_XSAVE2)` returns the required size in byte if
76        // supported; otherwise returns 0.
77        // https://github.com/torvalds/linux/commit/be50b2065dfa3d88428fdfdc340d154d96bf6848
78        //
79        // Cache the value in order not to call it at each vCPU creation.
80        let xsave2_size = match common.fd.check_extension_int(Cap::Xsave2) {
81            // Catch all negative values just in case although the possible negative return value
82            // of ioctl() is only -1.
83            ..=-1 => {
84                return Err(VmError::Arch(ArchVmError::CheckCapability(
85                    Cap::Xsave2,
86                    vmm_sys_util::errno::Error::last(),
87                )));
88            }
89            0 => None,
90            // SAFETY: Safe because negative values are handled above.
91            ret => Some(usize::try_from(ret).unwrap()),
92        };
93
94        common
95            .fd
96            .set_tss_address(u64_to_usize(crate::arch::x86_64::layout::KVM_TSS_ADDRESS))
97            .map_err(ArchVmError::SetTssAddress)?;
98
99        let pio_bus = Arc::new(Bus::new());
100
101        Ok(ArchVm {
102            common,
103            msrs_to_save,
104            xsave2_size,
105            pio_bus,
106        })
107    }
108
109    /// Pre-vCPU creation setup.
110    pub fn arch_pre_create_vcpus(&mut self, _: u8) -> Result<(), ArchVmError> {
111        // For x86_64 we need to create the interrupt controller before calling `KVM_CREATE_VCPUS`
112        self.setup_irqchip()
113    }
114
115    /// Post-vCPU creation setup.
116    pub fn arch_post_create_vcpus(&mut self, _: u8) -> Result<(), ArchVmError> {
117        Ok(())
118    }
119
120    /// Restores the KVM VM state.
121    ///
122    /// # Errors
123    ///
124    /// When:
125    /// - [`kvm_ioctls::VmFd::set_pit`] errors.
126    /// - [`kvm_ioctls::VmFd::set_clock`] errors.
127    /// - [`kvm_ioctls::VmFd::set_irqchip`] errors.
128    /// - [`kvm_ioctls::VmFd::set_irqchip`] errors.
129    /// - [`kvm_ioctls::VmFd::set_irqchip`] errors.
130    pub fn restore_state(&mut self, state: &VmState) -> Result<(), ArchVmError> {
131        self.fd()
132            .set_pit2(&state.pitstate)
133            .map_err(ArchVmError::SetPit2)?;
134        self.fd()
135            .set_clock(&state.clock)
136            .map_err(ArchVmError::SetClock)?;
137        self.fd()
138            .set_irqchip(&state.pic_master)
139            .map_err(ArchVmError::SetIrqChipPicMaster)?;
140        self.fd()
141            .set_irqchip(&state.pic_slave)
142            .map_err(ArchVmError::SetIrqChipPicSlave)?;
143        self.fd()
144            .set_irqchip(&state.ioapic)
145            .map_err(ArchVmError::SetIrqChipIoAPIC)?;
146        self.common.resource_allocator = Mutex::new(state.resource_allocator.clone());
147        Ok(())
148    }
149
150    /// Creates the irq chip and an in-kernel device model for the PIT.
151    pub fn setup_irqchip(&self) -> Result<(), ArchVmError> {
152        self.fd()
153            .create_irq_chip()
154            .map_err(ArchVmError::VmSetIrqChip)?;
155        // We need to enable the emulation of a dummy speaker port stub so that writing to port 0x61
156        // (i.e. KVM_SPEAKER_BASE_ADDRESS) does not trigger an exit to user space.
157        let pit_config = kvm_pit_config {
158            flags: KVM_PIT_SPEAKER_DUMMY,
159            ..Default::default()
160        };
161        self.fd()
162            .create_pit2(pit_config)
163            .map_err(ArchVmError::VmSetIrqChip)
164    }
165
166    /// Saves and returns the Kvm Vm state.
167    pub fn save_state(&self) -> Result<VmState, ArchVmError> {
168        let pitstate = self.fd().get_pit2().map_err(ArchVmError::VmGetPit2)?;
169
170        let mut clock = self.fd().get_clock().map_err(ArchVmError::VmGetClock)?;
171        // This bit is not accepted in SET_CLOCK, clear it.
172        clock.flags &= !KVM_CLOCK_TSC_STABLE;
173
174        let mut pic_master = kvm_irqchip {
175            chip_id: KVM_IRQCHIP_PIC_MASTER,
176            ..Default::default()
177        };
178        self.fd()
179            .get_irqchip(&mut pic_master)
180            .map_err(ArchVmError::VmGetIrqChip)?;
181
182        let mut pic_slave = kvm_irqchip {
183            chip_id: KVM_IRQCHIP_PIC_SLAVE,
184            ..Default::default()
185        };
186        self.fd()
187            .get_irqchip(&mut pic_slave)
188            .map_err(ArchVmError::VmGetIrqChip)?;
189
190        let mut ioapic = kvm_irqchip {
191            chip_id: KVM_IRQCHIP_IOAPIC,
192            ..Default::default()
193        };
194        self.fd()
195            .get_irqchip(&mut ioapic)
196            .map_err(ArchVmError::VmGetIrqChip)?;
197
198        Ok(VmState {
199            memory: self.common.guest_memory.describe(),
200            resource_allocator: self.resource_allocator().save(),
201            pitstate,
202            clock,
203            pic_master,
204            pic_slave,
205            ioapic,
206        })
207    }
208
209    /// Gets the list of MSRs to save when creating snapshots
210    pub fn msrs_to_save(&self) -> &[u32] {
211        self.msrs_to_save.as_slice()
212    }
213
214    /// Gets the size (in bytes) of the `kvm_xsave` struct.
215    pub fn xsave2_size(&self) -> Option<usize> {
216        self.xsave2_size
217    }
218}
219
#[derive(Default, Deserialize, Serialize)]
/// Structure holding VM kvm state.
pub struct VmState {
    /// guest memory state
    pub memory: GuestMemoryState,
    /// resource allocator
    pub resource_allocator: ResourceAllocator,
    /// PIT state, as read with `get_pit2` when the state was saved.
    pitstate: kvm_pit_state2,
    /// KVM clock data, stored with the `KVM_CLOCK_TSC_STABLE` flag cleared.
    clock: kvm_clock_data,
    /// State of the irqchip identified by `KVM_IRQCHIP_PIC_MASTER`.
    // TODO: rename this field to adopt inclusive language once Linux updates it, too.
    pic_master: kvm_irqchip,
    /// State of the irqchip identified by `KVM_IRQCHIP_PIC_SLAVE`.
    // TODO: rename this field to adopt inclusive language once Linux updates it, too.
    pic_slave: kvm_irqchip,
    /// State of the irqchip identified by `KVM_IRQCHIP_IOAPIC`.
    ioapic: kvm_irqchip,
}
235
236impl fmt::Debug for VmState {
237    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
238        f.debug_struct("VmState")
239            .field("pitstate", &self.pitstate)
240            .field("clock", &self.clock)
241            .field("pic_master", &"?")
242            .field("pic_slave", &"?")
243            .field("ioapic", &"?")
244            .finish()
245    }
246}
247
#[cfg(test)]
mod tests {
    use kvm_bindings::{
        KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE,
        KVM_PIT_SPEAKER_DUMMY,
    };

    use crate::snapshot::Snapshot;
    use crate::vstate::vm::VmState;
    use crate::vstate::vm::tests::{setup_vm, setup_vm_with_memory};

    /// Saving fails before the irqchip is set up; afterwards a saved state can be restored into
    /// a second, freshly prepared VM.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_vm_save_restore_state() {
        // Irqchips, clock and pitstate are not configured so trying to save state should fail.
        let (_, bare_vm) = setup_vm();
        bare_vm.save_state().unwrap_err();

        let (_, source_vm) = setup_vm_with_memory(0x1000);
        source_vm.setup_irqchip().unwrap();

        let saved = source_vm.save_state().unwrap();
        assert_eq!(
            saved.pitstate.flags | KVM_PIT_SPEAKER_DUMMY,
            KVM_PIT_SPEAKER_DUMMY
        );
        assert_eq!(saved.clock.flags & KVM_CLOCK_TSC_STABLE, 0);
        assert_eq!(saved.pic_master.chip_id, KVM_IRQCHIP_PIC_MASTER);
        assert_eq!(saved.pic_slave.chip_id, KVM_IRQCHIP_PIC_SLAVE);
        assert_eq!(saved.ioapic.chip_id, KVM_IRQCHIP_IOAPIC);

        let (_, mut target_vm) = setup_vm_with_memory(0x1000);
        target_vm.setup_irqchip().unwrap();

        target_vm.restore_state(&saved).unwrap();
    }

    /// Restoring must fail whenever one of the saved irqchips carries an invalid chip ID.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_vm_save_restore_state_bad_irqchip() {
        use kvm_bindings::KVM_NR_IRQCHIPS;

        let (_, source_vm) = setup_vm_with_memory(0x1000);
        source_vm.setup_irqchip().unwrap();
        let mut saved = source_vm.save_state().unwrap();

        let (_, mut target_vm) = setup_vm_with_memory(0x1000);
        target_vm.setup_irqchip().unwrap();

        // Try to restore an invalid PIC Master chip ID
        let valid_master_id = std::mem::replace(&mut saved.pic_master.chip_id, KVM_NR_IRQCHIPS);
        target_vm.restore_state(&saved).unwrap_err();
        saved.pic_master.chip_id = valid_master_id;

        // Try to restore an invalid PIC Slave chip ID
        let valid_slave_id = std::mem::replace(&mut saved.pic_slave.chip_id, KVM_NR_IRQCHIPS);
        target_vm.restore_state(&saved).unwrap_err();
        saved.pic_slave.chip_id = valid_slave_id;

        // Try to restore an invalid IOAPIC chip ID
        saved.ioapic.chip_id = KVM_NR_IRQCHIPS;
        target_vm.restore_state(&saved).unwrap_err();
    }

    /// A state that went through a snapshot save/load round trip can still be restored.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_vmstate_serde() {
        let mut buffer = vec![0u8; 10000];

        let (_, mut vm) = setup_vm_with_memory(0x1000);
        vm.setup_irqchip().unwrap();

        let original = vm.save_state().unwrap();
        Snapshot::new(original)
            .save(&mut buffer.as_mut_slice())
            .unwrap();

        let round_tripped: VmState = Snapshot::load_without_crc_check(buffer.as_slice())
            .unwrap()
            .data;

        vm.restore_state(&round_tripped).unwrap();
    }
}