nyx_lite/firecracker_wrappers.rs

use std::sync::{Arc, Mutex};
use std::{io, thread};

use anyhow::Result;

use event_manager::SubscriberOps;
use vmm::Vcpu;
use vmm::Vmm;
use vmm::builder::StartMicrovmError;
use vmm::cpu_config::templates::GetCpuTemplate;
use vmm::initrd::InitrdConfig;
use vmm::resources::VmResources;
use vmm::vmm_config::instance_info::InstanceInfo;
use vmm::vstate::memory;
use vmm::{EventManager, VcpuHandle};

use kvm_bindings::KVM_CAP_NESTED_STATE;
use kvm_bindings::{KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_USE_SW_BP, kvm_guest_debug};

#[cfg(target_arch = "x86_64")]
use vmm::cpu_config::templates::KvmCapability;
#[cfg(target_arch = "x86_64")]
use vmm::cpu_config::x86_64::cpuid::common::get_vendor_id_from_host;
#[cfg(target_arch = "x86_64")]
use vmm::cpu_config::x86_64::cpuid::{
    CpuidKey, CpuidTrait, KvmCpuidFlags, VENDOR_ID_AMD, VENDOR_ID_INTEL,
};
#[cfg(target_arch = "x86_64")]
use vmm::cpu_config::x86_64::custom_cpu_template::{
    CpuidLeafModifier, CpuidRegister, CpuidRegisterModifier, CustomCpuTemplate,
};
#[derive(Debug, thiserror::Error, displaydoc::Display)]
pub enum ResizeFdTableError {
    /// Failed to get RLIMIT_NOFILE
    GetRlimit,
    /// Failed to call dup2 to resize fdtable
    Dup2(io::Error),
    /// Failed to close dup2'd file descriptor
    Close(io::Error),
}

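/// Adds `KVM_CAP_NESTED_STATE` to the template's KVM capability list unless it
/// is already requested.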
#[cfg(target_arch = "x86_64")]
fn ensure_nested_kvm_caps(template: &mut CustomCpuTemplate) {
    if !template
        .kvm_capabilities
        .iter()
        .any(|cap| matches!(cap, KvmCapability::Add(value) if *value == KVM_CAP_NESTED_STATE))
    {
        template
            .kvm_capabilities
            .push(KvmCapability::Add(KVM_CAP_NESTED_STATE));
    }
}

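/// Sets a single CPUID feature bit in the template by OR-ing it into both the
/// `filter` and `value` bitmaps of the matching leaf/subleaf/register modifier,
/// creating the leaf and register modifier entries if they do not exist yet.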
#[cfg(target_arch = "x86_64")]
fn set_cpuid_bit(
    template: &mut CustomCpuTemplate,
    leaf: u32,
    subleaf: u32,
    register: CpuidRegister,
    bit: u8,
) {
    let mask = 1u32 << bit;
    if let Some(leaf_mod) = template
        .cpuid_modifiers
        .iter_mut()
        .find(|entry| entry.leaf == leaf && entry.subleaf == subleaf)
    {
        if let Some(reg_mod) = leaf_mod
            .modifiers
            .iter_mut()
            .find(|entry| entry.register == register)
        {
            reg_mod.bitmap.filter |= mask;
            reg_mod.bitmap.value |= mask;
        } else {
            leaf_mod.modifiers.push(CpuidRegisterModifier {
                register,
                bitmap: vmm::cpu_config::templates::RegisterValueFilter {
                    filter: mask,
                    value: mask,
                },
            });
        }
    } else {
        template.cpuid_modifiers.push(CpuidLeafModifier {
            leaf,
            subleaf,
            flags: KvmCpuidFlags::EMPTY,
            modifiers: vec![CpuidRegisterModifier {
                register,
                bitmap: vmm::cpu_config::templates::RegisterValueFilter {
                    filter: mask,
                    value: mask,
                },
            }],
        });
    }
}

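/// Checks that the host can expose nested virtualization (KVM must support
/// `KVM_CAP_NESTED_STATE` and the host CPUID must advertise VMX on Intel or
/// SVM on AMD), then marks the corresponding feature bit in the CPUID template
/// so the guest sees it.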
#[cfg(target_arch = "x86_64")]
fn ensure_nested_virt_supported(
    kvm: &vmm::arch::x86_64::kvm::Kvm,
    template: &mut CustomCpuTemplate,
) -> Result<(), StartMicrovmError> {
    if kvm.fd.check_extension_raw(u64::from(KVM_CAP_NESTED_STATE)) == 0 {
        return Err(StartMicrovmError::NestedVirtUnsupported(
            "KVM_CAP_NESTED_STATE not supported by host".to_string(),
        ));
    }

    let vendor = get_vendor_id_from_host().map_err(|err| {
        StartMicrovmError::NestedVirtUnsupported(format!("unable to read CPUID vendor: {err}"))
    })?;

    if &vendor == VENDOR_ID_INTEL {
        let key = CpuidKey {
            leaf: 0x1,
            subleaf: 0,
        };
        let entry = kvm.supported_cpuid.get(&key).ok_or_else(|| {
            StartMicrovmError::NestedVirtUnsupported("missing CPUID leaf 0x1".to_string())
        })?;
        // CPUID.1:ECX bit 5 advertises VMX.
        if entry.result.ecx & (1 << 5) == 0 {
            return Err(StartMicrovmError::NestedVirtUnsupported(
                "host CPUID does not advertise VMX support".to_string(),
            ));
        }
        set_cpuid_bit(template, 0x1, 0x0, CpuidRegister::Ecx, 5);
        Ok(())
    } else if &vendor == VENDOR_ID_AMD {
        let key = CpuidKey {
            leaf: 0x8000_0001,
            subleaf: 0,
        };
        let entry = kvm.supported_cpuid.get(&key).ok_or_else(|| {
            StartMicrovmError::NestedVirtUnsupported("missing CPUID leaf 0x80000001".to_string())
        })?;
        // CPUID.80000001h:ECX bit 2 advertises SVM.
        if entry.result.ecx & (1 << 2) == 0 {
            return Err(StartMicrovmError::NestedVirtUnsupported(
                "host CPUID does not advertise SVM support".to_string(),
            ));
        }
        set_cpuid_bit(template, 0x8000_0001, 0x0, CpuidRegister::Ecx, 2);
        Ok(())
    } else {
        Err(StartMicrovmError::NestedVirtUnsupported(
            "unsupported CPU vendor for nested virtualization".to_string(),
        ))
    }
}

/// Attempts to resize the process's file descriptor table to match RLIMIT_NOFILE, or to 2048 if
/// no RLIMIT_NOFILE is set (this can only happen if firecracker is run outside the jailer; 2048
/// is the default the jailer would set).
///
/// We do this resizing because the kernel default is 64 entries, with a reallocation happening
/// whenever the table fills up. This was happening for some larger microVMs, and reallocating the
/// fdtable while a lot of file descriptors are active (due to being eventfds/timerfds registered
/// to epoll) incurs a penalty of 30ms-70ms on the snapshot restore path.
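///
/// # Example
///
/// A minimal sketch of a call site (the surrounding module path is assumed):
///
/// ```ignore
/// resize_fdtable().expect("failed to pre-size the fd table");
/// ```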
pub fn resize_fdtable() -> Result<(), ResizeFdTableError> {
    let mut rlimit = libc::rlimit {
        rlim_cur: 0,
        rlim_max: 0,
    };

    // SAFETY: We pass a pointer to a valid area of memory to which we have exclusive mutable access
    if unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlimit as *mut libc::rlimit) } < 0 {
        return Err(ResizeFdTableError::GetRlimit);
    }

    // If no jailer is used, there might not be a NOFILE limit set. In this case, resize
    // the table to the default that the jailer would usually impose (2048)
    let limit: libc::c_int = if rlimit.rlim_cur == libc::RLIM_INFINITY {
        2048
    } else {
        rlimit.rlim_cur.try_into().unwrap_or(2048)
    };

    // Resize the file descriptor table to its maximal possible size, to ensure that
    // firecracker will not need to reallocate it later. If the file descriptor table
    // needs to be reallocated (which by default happens once more than 64 fds exist,
    // something that happens for reasonably complex microvms due to each device using
    // a multitude of eventfds), this can incur a significant performance impact (it
    // was responsible for a 30ms-70ms impact on snapshot restore times).
    if limit > 3 {
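        // dup2()-ing stdin to the highest fd the limit allows forces the kernel
        // to grow the fdtable to its final size in one step; the duplicate is
        // closed again immediately, so no extra fd is left behind.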
        // SAFETY: Duplicating stdin is safe
        if unsafe { libc::dup2(0, limit - 1) } < 0 {
            return Err(ResizeFdTableError::Dup2(io::Error::last_os_error()));
        }

        // SAFETY: Closing the just created duplicate is safe
        if unsafe { libc::close(limit - 1) } < 0 {
            return Err(ResizeFdTableError::Close(io::Error::last_os_error()));
        }
    }

    Ok(())
}

/// Builds and starts a microVM based on the current Firecracker VmResources configuration.
///
/// The built microVM and all the created vCPUs start off in the paused state.
/// To boot the microVM and run those vCPUs, `Vmm::resume_vm()` needs to be
/// called.
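///
/// # Example
///
/// A minimal call-site sketch; constructing `instance_info`, `vm_resources`,
/// and `event_manager` is elided here and assumed to follow the usual
/// Firecracker setup:
///
/// ```ignore
/// let (vmm, vcpu) =
///     build_microvm_for_boot(&instance_info, &vm_resources, &mut event_manager)?;
/// // The microVM starts out paused; resume it to begin running the vCPU.
/// vmm.lock().unwrap().resume_vm()?;
/// ```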
pub fn build_microvm_for_boot(
    instance_info: &InstanceInfo,
    vm_resources: &VmResources,
    event_manager: &mut EventManager,
) -> Result<(Arc<Mutex<Vmm>>, Vcpu), StartMicrovmError> {
    use self::StartMicrovmError::*;

    let boot_config = vm_resources
        .boot_source
        .builder
        .as_ref()
        .ok_or(MissingKernelConfig)?;

    let track_dirty_pages = vm_resources.machine_config.track_dirty_pages;

    let vhost_user_device_used = vm_resources
        .block
        .devices
        .iter()
        .any(|b| b.lock().expect("Poisoned lock").is_vhost_user());

    // Page faults are more expensive for shared memory mappings, including memfd.
    // For this reason, we only back guest memory with a memfd
    // if a vhost-user-blk device is configured in the VM; otherwise we fall back to
    // anonymous private memory.
    //
    // The vhost-user-blk branch is not currently covered by integration tests in Rust,
    // because that would require running a backend process. If in the future we converge
    // to a single way of backing guest memory for the vhost-user and non-vhost-user cases,
    // adding such coverage would not be worth the effort.
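    // `mem_size_mib` is in MiB; shifting left by 20 converts it to bytes.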
    let regions = vmm::arch::arch_memory_regions(vm_resources.machine_config.mem_size_mib << 20);
    let guest_regions = if vhost_user_device_used {
        memory::memfd_backed(
            &regions,
            track_dirty_pages,
            vm_resources.machine_config.huge_pages,
        )
        .map_err(StartMicrovmError::GuestMemory)?
    } else {
        memory::anonymous(
            regions.iter().copied(),
            track_dirty_pages,
            vm_resources.machine_config.huge_pages,
        )
        .map_err(StartMicrovmError::GuestMemory)?
    };
    // Clone the command-line so that a failed boot doesn't pollute the original.
    #[allow(unused_mut)]
    let mut boot_cmdline = boot_config.cmdline.clone();

    let mut cpu_template = vm_resources
        .machine_config
        .cpu_template
        .get_cpu_template()?
        .into_owned();
    if vm_resources.machine_config.enable_nested_virt {
        #[cfg(target_arch = "x86_64")]
        {
            ensure_nested_kvm_caps(&mut cpu_template);
        }
        #[cfg(not(target_arch = "x86_64"))]
        {
            return Err(StartMicrovmError::NestedVirtUnsupported(
                "nested virtualization is only supported on x86_64".to_string(),
            ));
        }
    }

    let (mut vmm, mut vcpus) = vmm::builder::create_vmm_and_vcpus(
        instance_info,
        event_manager,
        guest_regions,
        None,
        track_dirty_pages,
        vm_resources.machine_config.vcpu_count,
        cpu_template.kvm_capabilities.clone(),
    )?;

    if vm_resources.machine_config.enable_nested_virt {
        #[cfg(target_arch = "x86_64")]
        {
            ensure_nested_virt_supported(vmm.kvm(), &mut cpu_template)?;
        }
    }

    let entry_addr = vmm::arch::load_kernel(&boot_config.kernel_file, vmm.vm.guest_memory())?;
    let initrd = InitrdConfig::from_config(boot_config, vmm.vm.guest_memory())?;

    if vm_resources.pci_enabled {
        vmm.device_manager.enable_pci(&vmm.vm)?;
    } else {
        boot_cmdline.insert("pci", "off")?;
    }

    // BEGIN NYX-LITE PATCH
    assert_eq!(vcpus.len(), 1);
    let debug_struct = kvm_guest_debug {
        // Configure the vcpu so that a KVM_DEBUG_EXIT would be generated
        // when encountering a software breakpoint during execution
        control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP,
        pad: 0,
        // Reset all arch-specific debug registers
        arch: Default::default(),
    };

    vcpus[0].kvm_vcpu.fd.set_guest_debug(&debug_struct).unwrap();
    // END NYX-LITE PATCH
    // The boot timer device would need to be the first device attached in order
    // to maintain the same MMIO address referenced in the documentation
    // and tests. nyx-lite leaves it disabled:
    // if vm_resources.boot_timer {
    //     vmm::builder::attach_boot_timer_device(&mut vmm, request_ts)?;
    // }
    vmm::builder::attach_block_devices(
        &mut vmm.device_manager,
        &vmm.vm,
        &mut boot_cmdline,
        vm_resources.block.devices.iter(),
        event_manager,
    )?;
    vmm::builder::attach_net_devices(
        &mut vmm.device_manager,
        &vmm.vm,
        &mut boot_cmdline,
        vm_resources.net_builder.iter(),
        event_manager,
    )?;

    // no need for nondeterminism - we don't like that anyway
    //#[cfg(target_arch = "x86_64")]
    //vmm::builder::attach_vmgenid_device(&mut vmm)?;

    let vm_arc = vmm.vm.clone();
    let kvm_ptr = vmm.kvm() as *const _;
    // SAFETY: kvm_ptr points to vmm.kvm which outlives this call, and vm_arc
    // keeps the VM alive while we mutably borrow the device manager.
    unsafe {
        vmm::arch::configure_system_for_boot(
            &*kvm_ptr,
            vm_arc.as_ref(),
            &mut vmm.device_manager,
            vcpus.as_mut(),
            &vm_resources.machine_config,
            &cpu_template,
            entry_addr,
            &initrd,
            boot_cmdline,
        )?;
    }

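    // BEGIN NYX-LITE PATCH
    // nyx-lite drives the single vCPU from the caller's thread instead of
    // spawning Firecracker's per-vCPU threads, so the vCPU is pulled out of
    // the vector here and returned to the caller.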
    let mut vcpu = vcpus.into_iter().next().unwrap();
    let event_sender = vcpu.event_sender.take().expect("vCPU already started");
    let response_receiver = vcpu.response_receiver.take().unwrap();
    let vcpu_fd = vcpu
        .copy_kvm_vcpu_fd(vmm.vm.as_ref())
        .map_err(StartMicrovmError::VcpuFdCloneError)?;
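    // `VcpuHandle` expects a `JoinHandle` for a vCPU thread; since the vCPU
    // never runs on its own thread here, a no-op thread is spawned purely to
    // satisfy that interface.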
    let vcpu_join_handle = thread::Builder::new()
        .name("fake vcpu thread".to_string())
        .spawn(|| {})
        .unwrap();
    let handle = VcpuHandle::new(event_sender, response_receiver, vcpu_fd, vcpu_join_handle);

    // END NYX-LITE PATCH
    vmm.vcpus_handles.push(handle);
    let vmm = Arc::new(Mutex::new(vmm));
    event_manager.add_subscriber(vmm.clone());

    vcpu.set_mmio_bus(vmm.lock().unwrap().vm.common.mmio_bus.clone());
    vcpu.kvm_vcpu
        .set_pio_bus(vmm.lock().unwrap().vm.pio_bus.clone());
    Ok((vmm, vcpu))
}