nyx_lite/
nyx_vm.rs

use std::cell::RefCell;
use std::collections::{HashMap, HashSet};
use std::ffi::CStr;
use std::fs::File;
use std::os::unix::io::FromRawFd;
use std::path::PathBuf;
use std::ptr::NonNull;
use std::sync::atomic::Ordering;
use std::sync::{Arc, Mutex};
use std::thread;
use std::thread::JoinHandle;
use std::time::{self, Duration};

use anyhow::Result;

use event_manager::SubscriberOps;
use iced_x86::OpAccess;
use vm_memory::Address;
use vmm::Vcpu;
use vmm::Vmm;
use vmm::arch::GUEST_PAGE_SIZE;
use vmm::arch::x86_64::generated::msr_index::{
    MSR_IA32_DEBUGCTLMSR, MSR_IA32_DS_AREA, MSR_IA32_TSC, MSR_IA32_TSC_ADJUST,
    MSR_IA32_TSC_DEADLINE,
};
use vmm::cpu_config::templates::StaticCpuTemplate;
use vmm::device_manager::mmio::MMIODeviceManager;
use vmm::devices::virtio::block::device::Block;
use vmm::devices::virtio::block::persist::BlockState;
use vmm::devices::virtio::generated::virtio_ids::VIRTIO_ID_BLOCK;
use vmm::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX;
use vmm::devices::virtio::persist::QueueConstructorArgs;
use vmm::devices::virtio::queue::Queue;
use vmm::logger::debug;
use vmm::persist::MicrovmState;
use vmm::resources::VmResources;
use vmm::snapshot::Persist;
use vmm::utils::get_page_size;
use vmm::vmm_config::instance_info::{InstanceInfo, VmState};
use vmm::vstate::memory::{
    Bitmap, Bytes, GuestAddress, GuestMemory, GuestMemoryRegion, MemoryRegionAddress,
};
use vmm::vstate::memory::{GuestMemoryExtension, GuestRegionMmapExt};
use vmm::vstate::vcpu::{VCPU_RTSIG_OFFSET, VcpuEmulation, VcpuError};
use vmm::{EventManager, VcpuEvent};

use kvm_bindings::{
    KVM_CAP_DIRTY_LOG_RING, KVM_DIRTY_LOG_PAGE_OFFSET, KVM_GUESTDBG_BLOCKIRQ, KVM_GUESTDBG_ENABLE,
    KVM_GUESTDBG_INJECT_BP, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
    KVM_GUESTDBG_USE_SW_BP, Msrs, kvm_dirty_gfn, kvm_enable_cap, kvm_guest_debug,
    kvm_guest_debug_arch, kvm_msr_entry, kvm_regs, kvm_sregs,
};

use crate::breakpoints::{BreakpointManager, BreakpointManagerTrait};
use crate::disassembly::{disassemble_memory_accesses, is_control_flow};
use crate::error::MemoryError;
use crate::firecracker_wrappers::build_microvm_for_boot;
use crate::hw_breakpoints::HwBreakpoints;
use crate::mem::{
    self, GetMem, HostDirtyTracker, LockedVmm, NyxMemExtension, PageAllocator, PageMapping,
    ProcessMemory, SharedMemoryRegion,
};
use crate::snapshot::{BaseRegionSnapshot, MemorySnapshot, NyxSnapshot, SnapshotType};
use crate::timer_event::TimerEvent;
use crate::vm_continuation_statemachine::{RunMode, VMContinuationState, VMExitUserEvent};

#[derive(Debug, Hash, Eq, PartialEq, Clone)]
pub enum DebugState {
    Breakpoint,
    SingleStep,
    Continue,
}
// Magic values used by the nyx-lite hypercall interface: the guest puts
// NYX_LITE in RAX and one of the selectors in R8. Each value is the ASCII
// string noted below, stored as a little-endian u64.
const EXECDONE: u64 = 0x656e6f6463657865; // "execdone"
const SNAPSHOT: u64 = 0x746f687370616e73; // "snapshot"
const NYX_LITE: u64 = 0x6574696c2d78796e; // "nyx-lite" (magic value in RAX)
const SHAREMEM: u64 = 0x6d656d6572616873; // "sharemem"
const DBGPRINT: u64 = 0x746e697270676264; // "dbgprint"
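//
// Guest-side convention (illustrative sketch, not part of this crate): the
// guest triggers a nyx-lite hypercall by executing a software breakpoint
// (int3) with the magic value in RAX, the selector in R8 and arguments in
// R9..R12, e.g. for a debug print:
//
//     mov rax, 0x6574696c2d78796e   ; NYX_LITE
//     mov r8,  0x746e697270676264   ; DBGPRINT
//     lea r9,  [rel message]        ; guest pointer to a NUL-terminated string
//     int3
//
// run_inner() classifies such an exit as a hypercall and parse_hypercall()
// turns it into ExitReason::DebugPrint.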
const DBG_EXCEPTION_BREAKPOINT: u32 = 3; // #BP vector reported in KVM debug exits
const DBG_EXCEPTION_SINGLESTEP: u32 = 1; // #DB vector reported in KVM debug exits
const DR6_BS: u64 = 1 << 14; // Single-Step execution
const DR6_HWBP_0: u64 = 1 << 0;
const DR6_HWBP_1: u64 = 1 << 1;
const DR6_HWBP_2: u64 = 1 << 2;
const DR6_HWBP_3: u64 = 1 << 3;
const KVM_DIRTY_GFN_F_DIRTY: u32 = 1;
const KVM_DIRTY_GFN_F_RESET: u32 = 2;
const KVM_DIRTY_RING_MAX_ENTRIES: usize = 65536;
const CANONICAL_USER_LIMIT: u64 = 0x0000_8000_0000_0000;
const DEBUGCTL_BTF: u64 = 1 << 1;
const DEBUGCTL_BTS: u64 = 1 << 7;
const DEBUGCTL_BTINT: u64 = 1 << 8;
const DEBUGCTL_BTS_OFF_OS: u64 = 1 << 9;
const DEBUGCTL_BTS_OFF_USR: u64 = 1 << 10;

fn is_canonical_user_addr(addr: u64) -> bool {
    addr < CANONICAL_USER_LIMIT
}

#[derive(Debug, Copy, Clone)]
struct DirtyRingEntry {
    slot: u32,
    offset: u64,
}

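/// Per-vCPU view of the KVM dirty ring: a pointer into the ring area of the
/// vCPU's kvm_run mapping plus the memslot layout needed to turn
/// (slot, offset) entries back into guest physical addresses.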
#[derive(Debug)]
struct DirtyRingState {
    entries: NonNull<kvm_dirty_gfn>,
    entry_count: usize,
    head: u32,
    page_size: u64,
    slot_bases: HashMap<u32, GuestAddress>,
    slot_sizes: HashMap<u32, usize>,
}

impl DirtyRingState {
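    /// Harvests every entry KVM has published since the last drain and flags
    /// each one for reset; callers are expected to follow up with a dirty-ring
    /// reset (see `drain_dirty_ring_backlog`) so the slots can be reused.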
    fn drain(&mut self) -> Vec<DirtyRingEntry> {
        let mut entries = Vec::new();
        let count = self.entry_count as u32;
        for _ in 0..self.entry_count {
            let idx = (self.head % count) as usize;
            let entry_ptr = unsafe { self.entries.as_ptr().add(idx) };
            let entry = unsafe { std::ptr::read_volatile(entry_ptr) };
            if (entry.flags & KVM_DIRTY_GFN_F_DIRTY) == 0 {
                break;
            }
            entries.push(DirtyRingEntry {
                slot: entry.slot,
                offset: entry.offset,
            });
            let new_flags = entry.flags | KVM_DIRTY_GFN_F_RESET;
            unsafe {
                std::ptr::write_volatile(&mut (*entry_ptr).flags, new_flags);
            }
            self.head = self.head.wrapping_add(1);
        }
        entries
    }
}
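
/// Top-level handle for a single-vCPU Firecracker microVM: it owns the VMM,
/// the vCPU, dirty-page tracking, breakpoint state and the currently active
/// snapshot that nyx-lite resets the guest to.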
pub struct NyxVM {
    pub vmm: Arc<Mutex<Vmm>>,
    pub vcpu: Vcpu,
    pub event_thread_handle: JoinHandle<Result<(), anyhow::Error>>,
    event_manager: RefCell<EventManager>,
    pub vm_resources: VmResources,
    pub block_devices: Vec<Arc<Mutex<Block>>>,
    pub timeout_timer: Arc<Mutex<TimerEvent>>,
    pub continuation_state: VMContinuationState,
    pub breakpoint_manager: Box<dyn BreakpointManagerTrait>,
    pub hw_breakpoints: HwBreakpoints,
    pub active_snapshot: Option<Arc<NyxSnapshot>>,
    pub serial_pty: Option<SerialPty>,
    regs_cache: RefCell<Option<kvm_regs>>,
    sregs_cache: RefCell<Option<kvm_sregs>>,
    last_nyx_breakpoint: RefCell<Option<(u64, u64)>>,
    dirty_ring: Option<DirtyRingState>,
    dirty_ring_backlog: Vec<DirtyRingEntry>,
    host_dirty: Arc<HostDirtyTracker>,
    shared_pages: HashSet<u64>,
}

#[derive(Debug)]
pub struct SerialPty {
    pub master: File,
    pub slave_path: PathBuf,
}

fn create_serial_pty() -> std::io::Result<SerialPty> {
    let mut master: libc::c_int = 0;
    let mut slave: libc::c_int = 0;
    let mut name = [0 as libc::c_char; 128];
    let rc = unsafe {
        libc::openpty(
            &mut master,
            &mut slave,
            name.as_mut_ptr(),
            std::ptr::null_mut(),
            std::ptr::null_mut(),
        )
    };
    if rc != 0 {
        return Err(std::io::Error::last_os_error());
    }
    let slave_path = unsafe { CStr::from_ptr(name.as_ptr()) }
        .to_string_lossy()
        .into_owned();
    unsafe { libc::close(slave) };
    let master_file = unsafe { File::from_raw_fd(master) };
    Ok(SerialPty {
        master: master_file,
        slave_path: PathBuf::from(slave_path),
    })
}

fn register_kick_signal_handler() {
    extern "C" fn handle_signal(_: libc::c_int, _: *mut libc::siginfo_t, _: *mut libc::c_void) {
        std::sync::atomic::fence(Ordering::Acquire);
    }
    vmm::utils::signal::register_signal_handler(
        vmm::utils::signal::sigrtmin() + VCPU_RTSIG_OFFSET,
        handle_signal,
    )
    .expect("Failed to register vcpu signal handler");
}

#[derive(Debug)]
pub enum UnparsedExitReason {
    Shutdown,
    Hypercall,
    Timeout,
    NyxBreakpoint,
    GuestBreakpoint,
    SingleStep,
    Interrupted,
    HWBreakpoint(u8),
    BadMemoryAccess,
}

#[derive(Debug)]
pub enum ExitReason {
    Shutdown,
    Hypercall(u64, u64, u64, u64, u64),
    BadMemoryAccess(Vec<(u64, OpAccess)>),
    RequestSnapshot,
    ExecDone(u64),
    SharedMem(String, u64, usize),
    DebugPrint(String),
    Timeout,
    Breakpoint,
    HWBreakpoint(u8),
    SingleStep,
    Interrupted,
}

/// Configuration for enabling Branch Trace Store (BTS) via DEBUGCTL.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct BtsConfig {
    pub enable: bool,
    pub interrupt: bool,
    pub off_user: bool,
    pub off_kernel: bool,
}

/// Control whether shared memory pages are snapshotted or preserved across resets.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum SharedMemoryPolicy {
    Snapshot,
    Preserve,
}

impl NyxVM {
    // NOTE: because timeout timers are tied to the thread that creates the
    // NyxVM (see TimerEvent for more details), it is probably unsafe to use a
    // NyxVM from a different thread than the one that created it.
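    /// Builds a microVM ready to boot from a Firecracker-style JSON machine
    /// configuration.
    ///
    /// Minimal usage sketch (the config path and instance id are placeholders,
    /// not part of this crate):
    ///
    /// ```ignore
    /// let config = std::fs::read_to_string("vm_config.json").unwrap();
    /// let mut vm = NyxVM::new("example-vm".to_string(), &config);
    /// let exit = vm.run(std::time::Duration::from_secs(5));
    /// println!("first exit: {:?}", exit);
    /// ```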
    pub fn new(instance_id: String, config_json: &str) -> Self {
        let mmds_size_limit = 0;

        let instance_info = InstanceInfo {
            id: instance_id.clone(),
            state: VmState::NotStarted,
            vmm_version: "0.1".to_string(),
            app_name: "Firecracker-Lite".to_string(),
        };

        let mut event_manager = EventManager::new().expect("Unable to create EventManager");

        // Build the microVM.
        let mut vm_resources =
            VmResources::from_json(&config_json, &instance_info, mmds_size_limit, None)
                .expect("couldn't parse config json");
        let mut serial_pty = None;
        if vm_resources.serial_out_path.is_none() {
            if let Ok(pty) = create_serial_pty() {
                vm_resources.serial_out_path = Some(pty.slave_path.clone());
                serial_pty = Some(pty);
            }
        }

        let block_devices = vm_resources
            .block
            .devices
            .iter()
            .cloned()
            .collect::<Vec<_>>();

        vm_resources.machine_config.track_dirty_pages = true;

        vm_resources.boot_timer = false;

        debug!("event_start: build microvm for boot");

        let (vmm, mut vcpu) =
            build_microvm_for_boot(&instance_info, &vm_resources, &mut event_manager)
                .expect("couldn't prepare vm");
        debug!("event_end: build microvm for boot");

        let dirty_ring = {
            let vmm_guard = vmm.lock().unwrap();
            Self::try_enable_dirty_ring(&vmm_guard, &mut vcpu)
        };

        let timeout_timer = Arc::new(Mutex::new(TimerEvent::new()));
        event_manager.add_subscriber(timeout_timer.clone());
        // This allows the timeout timer to send the signal that makes KVM exit immediately.
        register_kick_signal_handler();
        // The event manager runs on this thread (its subscribers are not Send);
        // the spawned thread exists only to provide a JoinHandle.
        let event_thread_handle = thread::Builder::new()
            .name("event_thread".to_string())
            .spawn(|| Ok(()))
            .unwrap();
        let total_pages = {
            let vmm_guard = vmm.lock().unwrap();
            vmm_guard
                .vm
                .guest_memory()
                .iter()
                .map(|region| {
                    let len = region.len() as usize;
                    (len + mem::PAGE_SIZE as usize - 1) / mem::PAGE_SIZE as usize
                })
                .sum()
        };

        return Self {
            vcpu,
            vmm,
            event_manager: RefCell::new(event_manager),
            vm_resources,
            event_thread_handle,
            block_devices,
            timeout_timer,
            continuation_state: VMContinuationState::Main,
            breakpoint_manager: Box::new(BreakpointManager::new()),
            hw_breakpoints: HwBreakpoints::new(),
            active_snapshot: None,
            serial_pty,
            regs_cache: RefCell::new(None),
            sregs_cache: RefCell::new(None),
            last_nyx_breakpoint: RefCell::new(None),
            dirty_ring,
            dirty_ring_backlog: Vec::new(),
            host_dirty: Arc::new(HostDirtyTracker::new(total_pages)),
            shared_pages: HashSet::new(),
        };
    }

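    /// Tries to switch dirty-page harvesting to the KVM dirty-ring interface.
    /// Returns `None` (and the VM falls back to the dirty bitmap) if the
    /// capability is missing or the vCPU mmap has no room for ring entries.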
    fn try_enable_dirty_ring(vmm: &Vmm, vcpu: &mut Vcpu) -> Option<DirtyRingState> {
        if vmm
            .kvm()
            .fd
            .check_extension_raw(u64::from(KVM_CAP_DIRTY_LOG_RING))
            == 0
        {
            return None;
        }

        let page_size = get_page_size().ok()? as usize;
        let run_size = vmm.vm.fd().run_size();
        let offset_bytes = (KVM_DIRTY_LOG_PAGE_OFFSET as usize).checked_mul(page_size)?;
        if run_size <= offset_bytes {
            debug!("dirty ring unsupported: vcpu mmap too small");
            return None;
        }

        let max_entries = (run_size - offset_bytes) / std::mem::size_of::<kvm_dirty_gfn>();
        if max_entries == 0 {
            debug!("dirty ring unsupported: no space for entries");
            return None;
        }
        let entry_count = std::cmp::min(max_entries, KVM_DIRTY_RING_MAX_ENTRIES);

        let mut cap = kvm_enable_cap::default();
        cap.cap = KVM_CAP_DIRTY_LOG_RING;
        cap.args[0] = entry_count as u64;
        if let Err(err) = vmm.vm.fd().enable_cap(&cap) {
            debug!("dirty ring enable failed: {}", err);
            return None;
        }

        let run_ptr = vcpu.kvm_vcpu.fd.get_kvm_run() as *mut _ as *mut u8;
        let ring_ptr = unsafe { run_ptr.add(offset_bytes) as *mut kvm_dirty_gfn };
        let entries = NonNull::new(ring_ptr)?;

        let mut slot_bases = HashMap::new();
        let mut slot_sizes = HashMap::new();
        for region in vmm.vm.guest_memory().iter() {
            let slot_size = region.slot_size();
            for slot in region.slot_range() {
                if let Some(base) = region.slot_base(slot) {
                    slot_bases.insert(slot, base);
                    slot_sizes.insert(slot, slot_size);
                }
            }
        }

        Some(DirtyRingState {
            entries,
            entry_count,
            head: 0,
            page_size: page_size as u64,
            slot_bases,
            slot_sizes,
        })
    }

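    /// Returns a view of guest memory that translates virtual addresses through
    /// the page tables rooted at `cr3`. Writes made through the view are
    /// recorded in the host dirty tracker so they are picked up by incremental
    /// snapshots.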
    pub fn process_memory(&self, cr3: u64) -> ProcessMemory<LockedVmm> {
        let backend = LockedVmm::new(self.vmm.clone());
        ProcessMemory::new(backend, cr3).with_host_dirty(self.host_dirty.clone())
    }

    pub fn current_process_memory(&self) -> ProcessMemory<LockedVmm> {
        let cr3 = self.sregs().cr3;
        self.process_memory(cr3)
    }

    /// Registers a guest memory range as shared and optionally excludes it from snapshot resets.
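    ///
    /// Usage sketch (the guest address is a placeholder):
    ///
    /// ```ignore
    /// // Keep a two-page guest buffer alive across snapshot restores.
    /// let region = vm.register_shared_region_current(
    ///     0x7000_0000_0000,
    ///     2 * 4096,
    ///     SharedMemoryPolicy::Preserve,
    /// )?;
    /// ```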
    pub fn register_shared_region(
        &mut self,
        cr3: u64,
        vaddr: u64,
        len: usize,
        policy: SharedMemoryPolicy,
    ) -> Result<SharedMemoryRegion<LockedVmm>, MemoryError> {
        if len == 0 {
            return Ok(SharedMemoryRegion::new(self.process_memory(cr3), vaddr, 0));
        }
        if policy == SharedMemoryPolicy::Preserve {
            let start = vaddr & mem::M_PAGE_ALIGN;
            let end = vaddr.checked_add(len as u64 - 1).unwrap_or(u64::MAX) & mem::M_PAGE_ALIGN;
            let mut cur = start;
            let process = self.process_memory(cr3);
            while cur <= end {
                let phys = process.resolve_vaddr(cur)?;
                self.shared_pages.insert(phys.raw_value());
                if let Some(next) = cur.checked_add(mem::PAGE_SIZE) {
                    cur = next;
                } else {
                    break;
                }
            }
        }
        Ok(SharedMemoryRegion::new(
            self.process_memory(cr3),
            vaddr,
            len,
        ))
    }

    pub fn register_shared_region_current(
        &mut self,
        vaddr: u64,
        len: usize,
        policy: SharedMemoryPolicy,
    ) -> Result<SharedMemoryRegion<LockedVmm>, MemoryError> {
        let cr3 = self.sregs().cr3;
        self.register_shared_region(cr3, vaddr, len, policy)
    }

    pub fn inject_mapping(
        &self,
        cr3: u64,
        vaddr: u64,
        paddr: u64,
        mapping: PageMapping,
        allocator: Option<&mut dyn PageAllocator>,
    ) -> Result<(), MemoryError> {
        self.process_memory(cr3)
            .map_page(vaddr, paddr, mapping, allocator)
    }

    pub fn inject_code(
        &self,
        cr3: u64,
        vaddr: u64,
        paddr: u64,
        code: &[u8],
        mapping: PageMapping,
        allocator: Option<&mut dyn PageAllocator>,
    ) -> Result<(), MemoryError> {
        self.process_memory(cr3)
            .inject_code(vaddr, paddr, code, mapping, allocator)
    }

    fn take_memory_snapshot_with_state(
        vmm: &Vmm,
        snap_type: SnapshotType,
        dirty_ring: &mut Option<DirtyRingState>,
        dirty_ring_backlog: &mut Vec<DirtyRingEntry>,
        host_dirty: &HostDirtyTracker,
        shared_pages: &HashSet<u64>,
    ) -> MemorySnapshot {
        let memory = match snap_type {
            SnapshotType::Base => {
                let mut regions = Vec::new();
                for region in vmm.vm.guest_memory().iter() {
                    let region_len: usize = region.len().try_into().unwrap_or(0);
                    let mut memory = vec![0; region_len];
                    region
                        .read_slice(&mut memory, MemoryRegionAddress(0))
                        .unwrap();
                    regions.push(BaseRegionSnapshot {
                        start: region.start_addr().raw_value(),
                        data: Arc::from(memory),
                    });
                }
                MemorySnapshot::Base(regions)
            }
            SnapshotType::Incremental => {
                let mut map = HashMap::new();
                Self::iter_dirty_pages_with_state(
                    vmm,
                    dirty_ring,
                    dirty_ring_backlog,
                    host_dirty,
                    |region, region_offset, guest_addr| {
                        if shared_pages.contains(&guest_addr) {
                            return;
                        }
                        let mut data = vec![0; GUEST_PAGE_SIZE as usize];

                        region
                            .read_slice(&mut data, MemoryRegionAddress(region_offset as u64))
                            .unwrap();
                        map.insert(guest_addr, data);
                    },
                );
                MemorySnapshot::Incremental(map)
            }
        };

        Self::reset_dirty_tracking_with_state(vmm, dirty_ring, dirty_ring_backlog, host_dirty);
        return memory;
    }

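    /// Takes an incremental snapshot if a snapshot is already active, otherwise
    /// a full base snapshot.
    ///
    /// Typical reset loop (illustrative sketch):
    ///
    /// ```ignore
    /// let snapshot = vm.take_snapshot();
    /// loop {
    ///     let exit = vm.run(Duration::from_millis(100));
    ///     // ... inspect `exit`, collect results, etc. ...
    ///     vm.apply_snapshot(&snapshot);
    /// }
    /// ```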
    pub fn take_snapshot(&mut self) -> Arc<NyxSnapshot> {
        let snap_type = if self.active_snapshot.is_some() {
            SnapshotType::Incremental
        } else {
            SnapshotType::Base
        };
        return self.take_snapshot_with_type(snap_type);
    }

    pub fn take_base_snapshot(&mut self) -> Arc<NyxSnapshot> {
        return self.take_snapshot_with_type(SnapshotType::Base);
    }

    pub fn take_snapshot_with_type(&mut self, snap_type: SnapshotType) -> Arc<NyxSnapshot> {
        if snap_type == SnapshotType::Incremental {
            assert!(
                self.active_snapshot.is_some(),
                "can't take an incremental snapshot without a basis snapshot!"
            );
        }
        let memory = {
            let vmm = self.vmm.lock().unwrap();
            Self::take_memory_snapshot_with_state(
                &vmm,
                snap_type,
                &mut self.dirty_ring,
                &mut self.dirty_ring_backlog,
                &self.host_dirty,
                &self.shared_pages,
            )
        };

        //let block_device_snapshots = self.block_devices.iter().map(|dev| {
        //    // This flushes all changes to the backing file
        //    // - however this should not be needed, as we aren't shutting down
        //    // the process - For now, it's fine if the OS caches changes to the backing
        //    // file for us. Eventually we will store all updates in memory and
        //    // never change the backing file, so it won't be needed either
        //    // dev.prepare_save();
        //    BlockDeviceSnapshot::from(dev)
        //}).collect();

        let msrs = self
            .vcpu
            .kvm_vcpu
            .get_msrs([MSR_IA32_TSC, MSR_IA32_TSC_DEADLINE, MSR_IA32_TSC_ADJUST].into_iter())
            .unwrap();
        let tsc = msrs[&MSR_IA32_TSC];
        let parent = self.active_snapshot.take();
        let depth = parent.as_ref().map(|p| p.depth + 1).unwrap_or(0);
        let vmm = self.vmm.lock().unwrap();
        let new_snap = Arc::new(NyxSnapshot {
            parent,
            depth,
            memory,
            state: self.save_vm_state(&vmm),
            tsc,
            continuation_state: self.continuation_state.clone(),
        });
        self.active_snapshot = Some(new_snap.clone());
        return new_snap;
    }

    fn save_vm_state(&self, vmm: &Vmm) -> MicrovmState {
        let vm_state = vmm.vm.save_state().unwrap();
        let device_states = vmm.device_manager.save();
        let vcpu_state = self.vcpu.kvm_vcpu.save_state().unwrap();
        let vm_info = vmm::persist::VmInfo::from(&self.vm_resources);
        let kvm_state = vmm.kvm().save_state();
        // this is missing pio device state - notably shutdown and serial devices
        return MicrovmState {
            vm_info,
            kvm_state,
            vm_state,
            vcpu_states: vec![vcpu_state],
            device_states,
        };
    }

    fn apply_snapshot_mmio(
        mmio: &MMIODeviceManager,
        mem: &vmm::vstate::memory::GuestMemoryMmap,
        snap: &NyxSnapshot,
    ) {
        let ds = &snap.state.device_states.mmio_state;
        let blocks = &ds.block_devices;
        for block_snap in blocks.iter() {
            if let BlockState::Virtio(vio_block_snap_state) = &block_snap.device_state {
                let vstate = &vio_block_snap_state.virtio_state;
                let device_id = &block_snap.device_id;
                let bus_dev = mmio.get_virtio_device(VIRTIO_ID_BLOCK, device_id).unwrap();
                let mut mmio_transport = bus_dev.inner().lock().unwrap();
                block_snap.transport_state.apply_to(&mut mmio_transport);
                let mut locked_dev = mmio_transport.locked_device();
                let cow_file_engine = locked_dev
                    .as_cow_file_engine()
                    .expect("Trying to apply a snapshot to a non-cow block device");
                cow_file_engine.reset_to(vio_block_snap_state.cow_state.id);
                locked_dev.set_acked_features(vstate.acked_features);
                locked_dev
                    .interrupt_status()
                    .store(vstate.interrupt_status, Ordering::Relaxed);

                let queue_args = QueueConstructorArgs {
                    mem: mem.clone(),
                    is_activated: locked_dev.is_activated(),
                };
                let uses_notif_suppression =
                    (vstate.acked_features & (1u64 << VIRTIO_RING_F_EVENT_IDX)) != 0;
                for (queue, queue_snap) in
                    locked_dev.queues_mut().iter_mut().zip(vstate.queues.iter())
                {
                    let mut new_queue = Queue::restore(queue_args.clone(), queue_snap).unwrap();
                    if uses_notif_suppression {
                        new_queue.enable_notif_suppression();
                    }
                    let _ = std::mem::replace(queue, new_queue);
                }
            } else {
                panic!("trying to apply snapshot for a non-virtio block device. Not supported");
            }
        }
    }

    fn apply_tsc(&mut self, tsc: u64) {
        //let msrs = self.vcpu.kvm_vcpu.get_msrs([MSR_IA32_TSC, MSR_IA32_TSC_DEADLINE, MSR_IA32_TSC_ADJUST].into_iter()).unwrap();
        //println!("MSRS: TSC {:x} (snapshot: {:x}) TSCDEADLINE {:x} TSC_ADJUST {:x}", msrs[&MSR_IA32_TSC], snap.tsc, msrs[&MSR_IA32_TSC_DEADLINE], msrs[&MSR_IA32_TSC_ADJUST]);
        let msrs_to_set = [
            // KVM "helpfully" tries to prevent us from updating TSC in small increments and ignores small delta updates.
            // update to an insane value first
            kvm_msr_entry {
                index: MSR_IA32_TSC,
                data: tsc.wrapping_add(0xdeadc0debeef),
                ..Default::default()
            },
            // then update to what we actually want it to be.
            kvm_msr_entry {
                index: MSR_IA32_TSC,
                data: tsc,
                ..Default::default()
            },
        ];
        let msrs_wrapper = Msrs::from_entries(&msrs_to_set).unwrap();
        let num_set = self.vcpu.kvm_vcpu.fd.set_msrs(&msrs_wrapper).unwrap();
        assert_eq!(num_set, msrs_to_set.len());
        //let msrs = self.vcpu.kvm_vcpu.get_msrs([MSR_IA32_TSC, MSR_IA32_TSC_DEADLINE, MSR_IA32_TSC_ADJUST].into_iter()).unwrap();
        //println!("MSRS: TSC {:x} (snapshot: {:x}) TSCDEADLINE {:x} TSC_ADJUST {:x}", msrs[&MSR_IA32_TSC], snap.tsc, msrs[&MSR_IA32_TSC_DEADLINE], msrs[&MSR_IA32_TSC_ADJUST]);
    }

    fn reset_dirty_tracking_with_state(
        vmm: &Vmm,
        dirty_ring: &mut Option<DirtyRingState>,
        dirty_ring_backlog: &mut Vec<DirtyRingEntry>,
        host_dirty: &HostDirtyTracker,
    ) {
        vmm.vm.guest_memory().reset_dirty();
        if dirty_ring.is_some() {
            dirty_ring_backlog.clear();
            if let Err(err) = vmm.vm.reset_dirty_rings() {
                debug!("failed to reset dirty ring: {}", err);
            }
        } else {
            vmm.vm.reset_dirty_bitmap();
        }
        host_dirty.clear();
    }

    fn drain_dirty_ring_backlog(
        vmm: &Vmm,
        dirty_ring: &mut Option<DirtyRingState>,
        dirty_ring_backlog: &mut Vec<DirtyRingEntry>,
    ) {
        let Some(ring) = dirty_ring.as_mut() else {
            return;
        };
        let entries = ring.drain();
        if !entries.is_empty() {
            dirty_ring_backlog.extend(entries);
        }
        if let Err(err) = vmm.vm.reset_dirty_rings() {
            debug!("failed to reset dirty ring: {}", err);
        }
    }

    fn dirty_ring_entry_to_region_offset<'a>(
        ring: &DirtyRingState,
        mem: &'a vmm::vstate::memory::GuestMemoryMmap,
        entry: DirtyRingEntry,
    ) -> Option<(&'a GuestRegionMmapExt, usize, u64)> {
        let base = *ring.slot_bases.get(&entry.slot)?;
        let slot_size = *ring.slot_sizes.get(&entry.slot)?;
        let offset_bytes = entry.offset.checked_mul(ring.page_size)?;
        if offset_bytes >= slot_size as u64 {
            return None;
        }
        let guest_addr = base.raw_value().checked_add(offset_bytes)?;
        let region = mem.find_region(GuestAddress(guest_addr))?;
        let region_offset = guest_addr
            .checked_sub(region.start_addr().raw_value())?
            .try_into()
            .ok()?;
        Some((region, region_offset, guest_addr))
    }

    /// The callback is invoked with the guest memory region (`GuestRegionMmapExt`),
    /// the region offset of the dirty page, and its guest physical address.
    fn iter_dirty_pages_with_state<Callback>(
        vmm: &Vmm,
        dirty_ring: &mut Option<DirtyRingState>,
        dirty_ring_backlog: &mut Vec<DirtyRingEntry>,
        host_dirty: &HostDirtyTracker,
        mut callback: Callback,
    ) where
        Callback: FnMut(&GuestRegionMmapExt, usize, u64),
    {
        let mut seen = HashSet::new();
        let host_pages = host_dirty.snapshot_pages();
        for guest_addr in host_pages {
            if let Some(region) = vmm.vm.guest_memory().find_region(GuestAddress(guest_addr)) {
                let region_offset = guest_addr
                    .checked_sub(region.start_addr().raw_value())
                    .and_then(|val| usize::try_from(val).ok());
                if let Some(region_offset) = region_offset {
                    if seen.insert(guest_addr) {
                        callback(region, region_offset, guest_addr);
                    }
                }
            }
        }

        if dirty_ring.is_some() {
            Self::drain_dirty_ring_backlog(vmm, dirty_ring, dirty_ring_backlog);
            let ring = dirty_ring.as_mut().unwrap();
            let mut pending = std::mem::take(dirty_ring_backlog);
            for entry in pending.drain(..) {
                if let Some((region, region_offset, guest_addr)) =
                    Self::dirty_ring_entry_to_region_offset(ring, vmm.vm.guest_memory(), entry)
                {
                    if seen.insert(guest_addr) {
                        callback(region, region_offset, guest_addr);
                    }
                }
            }

            let page_size = ring.page_size as usize;
            for region in vmm.vm.guest_memory().iter() {
                let firecracker_bitmap = region.bitmap();
                let region_len: usize = region.len().try_into().unwrap_or(0);
                for region_offset in (0..region_len).step_by(page_size) {
                    if firecracker_bitmap.dirty_at(region_offset) {
                        let guest_addr = region.start_addr().raw_value() + region_offset as u64;
                        if seen.insert(guest_addr) {
                            callback(region, region_offset, guest_addr);
                        }
                    }
                }
            }
            return;
        }

        let kvm_dirty_bitmap = vmm.vm.get_dirty_bitmap().unwrap();
        let page_size: usize = mem::PAGE_SIZE as usize;

        for (slot, region) in vmm.vm.guest_memory().iter().enumerate() {
            let slot = u32::try_from(slot).unwrap();
            let kvm_bitmap = kvm_dirty_bitmap.get(&slot).unwrap();
            let firecracker_bitmap = region.bitmap();

            for (i, v) in kvm_bitmap.iter().enumerate() {
                for j in 0..64 {
                    let is_kvm_page_dirty = ((v >> j) & 1u64) != 0u64;
                    let index: usize = (i * 64) + j;
                    let page_addr = index * page_size;
                    let is_firecracker_page_dirty = firecracker_bitmap.dirty_at(page_addr);

                    if is_kvm_page_dirty || is_firecracker_page_dirty {
                        let guest_addr = region.start_addr().raw_value() + page_addr as u64;
                        callback(region, page_addr, guest_addr);
                    }
                }
            }
        }
    }

    fn apply_deltas_to_least_common_ancestor(
        vmm: &mut Vmm,
        mut pages_reset: &mut HashSet<u64>,
        active: Arc<NyxSnapshot>,
        snapshot: Arc<NyxSnapshot>,
    ) {
        let mut active_ancestor = active.clone();
        let mut snap_ancestor = snapshot.clone();
        let mem = vmm.get_mem();
        // for every delta in a parent of snapshot, we can apply it directly
        let reset_snap_pages = |snap_ancestor: Arc<NyxSnapshot>, reset_pages: &mut HashSet<u64>| {
            for (page, slice) in snap_ancestor.iter_delta() {
                if !reset_pages.contains(&page) {
                    reset_pages.insert(page);
                    mem.write_slice(slice, GuestAddress(page)).unwrap();
                }
            }
            snap_ancestor
                .parent
                .as_ref()
                .expect("Only snapshots with depth 0 can be root snapshots")
                .clone()
        };
        // however, for deltas in the parent of the currently active snapshot, we need to reset
        // those pages to their contents in `snapshot` instead.
        let reset_active_pages = |active_ancestor: Arc<NyxSnapshot>,
                                  reset_pages: &mut HashSet<u64>| {
            for (page, _) in active_ancestor.iter_delta() {
                if !reset_pages.contains(&page) {
                    reset_pages.insert(page);
                    snapshot.get_page(page as usize, |slice| {
                        mem.write_slice(slice, GuestAddress(page)).unwrap();
                    });
                }
            }
            active_ancestor
                .parent
                .as_ref()
                .expect("Only snapshots with depth 0 can be root snapshots")
                .clone()
        };
        // first, walk the deeper chain upwards until both ancestors are at the same depth
        while snap_ancestor.depth > active_ancestor.depth {
            snap_ancestor = reset_snap_pages(snap_ancestor, &mut pages_reset);
        }
        while active_ancestor.depth > snap_ancestor.depth {
            active_ancestor = reset_active_pages(active_ancestor, &mut pages_reset);
        }
        // once they are at the same depth, walk both chains upwards until they meet at the
        // least common ancestor, resetting all pages along the way
        while !Arc::ptr_eq(&active_ancestor, &snap_ancestor) {
            assert_eq!(active_ancestor.depth, snap_ancestor.depth);
            active_ancestor = reset_active_pages(active_ancestor, &mut pages_reset);
            snap_ancestor = reset_snap_pages(snap_ancestor, &mut pages_reset);
        }
    }

    fn ensure_snapshot_compat(&self, snapshot: &NyxSnapshot) {
        let info = &snapshot.state.vm_info;
        let machine = &self.vm_resources.machine_config;
        let cpu_template = StaticCpuTemplate::from(&machine.cpu_template);
        assert_eq!(
            info.mem_size_mib, machine.mem_size_mib as u64,
            "snapshot memory size mismatch"
        );
        assert_eq!(info.smt, machine.smt, "snapshot smt mismatch");
        assert_eq!(
            info.cpu_template, cpu_template,
            "snapshot cpu template mismatch"
        );
        assert_eq!(
            info.huge_pages, machine.huge_pages,
            "snapshot huge page config mismatch"
        );
        assert_eq!(
            info.enable_nested_virt, machine.enable_nested_virt,
            "snapshot nested virt mismatch"
        );
        assert_eq!(
            snapshot.state.vcpu_states.len(),
            machine.vcpu_count as usize,
            "snapshot vcpu count mismatch"
        );
    }

    /// Applies a snapshot. If there is no active snapshot, only a root snapshot is accepted.
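    ///
    /// Restoring a non-active snapshot first resets every page dirtied since the
    /// active snapshot was taken, then walks both snapshot chains up to their
    /// least common ancestor and re-applies the deltas along the way (see
    /// `apply_deltas_to_least_common_ancestor`).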
    pub fn apply_snapshot(&mut self, snapshot: &Arc<NyxSnapshot>) {
        let mut vmm = self.vmm.lock().unwrap();
        self.ensure_snapshot_compat(snapshot);

        if self.active_snapshot.is_none() {
            if snapshot.depth != 0 || snapshot.memory.is_incremental() {
                panic!("can only apply root snapshots to VMs without an active snapshot");
            }
            let shared_pages = self.shared_pages.clone();
            for region in vmm.vm.guest_memory().iter() {
                let region_len: usize = region.len().try_into().unwrap_or(0);
                for offset in (0..region_len).step_by(GUEST_PAGE_SIZE as usize) {
                    let guest_addr = region.start_addr().raw_value() + offset as u64;
                    if shared_pages.contains(&guest_addr) {
                        continue;
                    }
                    snapshot.get_page(guest_addr as usize, |slice| {
                        region
                            .write_slice(slice, MemoryRegionAddress(offset as u64))
                            .unwrap();
                    });
                }
            }
            Self::reset_dirty_tracking_with_state(
                &vmm,
                &mut self.dirty_ring,
                &mut self.dirty_ring_backlog,
                &self.host_dirty,
            );
            self.active_snapshot = Some(snapshot.clone());
            self.vcpu
                .kvm_vcpu
                .restore_state(&snapshot.state.vcpu_states[0])
                .unwrap();
            let guest_mem = vmm.vm.guest_memory().clone();
            Self::apply_snapshot_mmio(&vmm.device_manager.mmio_devices, &guest_mem, snapshot);
            let vm =
                Arc::get_mut(&mut vmm.vm).expect("exclusive VM access required to restore state");
            vm.restore_state(&snapshot.state.vm_state).unwrap();
            vmm.clear_shutdown_exit_code();
            drop(vmm);
            self.apply_tsc(snapshot.tsc);
            self.continuation_state = snapshot.continuation_state.clone();
            self.regs_cache.replace(None);
            self.sregs_cache.replace(None);
            return;
        }

        let mut pages_reset = HashSet::new();
        let active_snapshot = self
            .active_snapshot
            .as_ref()
            .expect("can only apply snapshots on VMs with an active snapshot");

        let fast_path = Arc::ptr_eq(snapshot, active_snapshot);
        let shared_pages = self.shared_pages.clone();

        Self::iter_dirty_pages_with_state(
            &vmm,
            &mut self.dirty_ring,
            &mut self.dirty_ring_backlog,
            &self.host_dirty,
            |region, region_offset, guest_addr| {
                if shared_pages.contains(&guest_addr) {
                    return;
                }
                let target_addr = MemoryRegionAddress(region_offset.try_into().unwrap());
                snapshot.get_page(guest_addr as usize, |slice| {
                    region.write_slice(slice, target_addr).unwrap();
                    if !fast_path {
                        pages_reset.insert(guest_addr);
                    }
                });
            },
        );

        if !fast_path {
            Self::apply_deltas_to_least_common_ancestor(
                &mut vmm,
                &mut pages_reset,
                active_snapshot.clone(),
                snapshot.clone(),
            );
        }

        self.active_snapshot = Some(snapshot.clone());
        Self::reset_dirty_tracking_with_state(
            &vmm,
            &mut self.dirty_ring,
            &mut self.dirty_ring_backlog,
            &self.host_dirty,
        );

        // The only ACPI device is the vmgenid device, which we disable - no need to restore it.
        //println!("acpi state: {:#?}", &state.acpi_dev_state);
        //println!("vmm acpi_device_manager {:#?}", vmm.acpi_device_manager);

        self.vcpu
            .kvm_vcpu
            .restore_state(&snapshot.state.vcpu_states[0])
            .unwrap();

        // we currently can't restore the net mmio device, only the block one
        let guest_mem = vmm.vm.guest_memory().clone();
        Self::apply_snapshot_mmio(&vmm.device_manager.mmio_devices, &guest_mem, snapshot);
        // we might also need to restore pio devices here - investigate
        //Self::apply_snapshot_pio(&mut vmm.pio_device_manager, snap);

        let vm = Arc::get_mut(&mut vmm.vm).expect("exclusive VM access required to restore state");
        vm.restore_state(&snapshot.state.vm_state).unwrap();
        vmm.clear_shutdown_exit_code();

        // this should be done last, because KVM keeps the TSC running even when
        // the VM isn't. Doing this early would introduce additional
        // noise/nondeterminism.
        drop(vmm);
        self.apply_tsc(snapshot.tsc);
        self.continuation_state = snapshot.continuation_state.clone();
        self.regs_cache.replace(None);
        self.sregs_cache.replace(None);
    }

    pub fn sregs(&self) -> kvm_sregs {
        if let Some(sregs) = self.sregs_cache.borrow().clone() {
            return sregs;
        }
        let sregs = self.vcpu.kvm_vcpu.fd.get_sregs().unwrap();
        self.sregs_cache.borrow_mut().replace(sregs);
        sregs
    }
    pub fn regs(&self) -> kvm_regs {
        if let Some(regs) = self.regs_cache.borrow().clone() {
            return regs;
        }
        let regs = self.vcpu.kvm_vcpu.fd.get_regs().unwrap();
        self.regs_cache.borrow_mut().replace(regs);
        regs
    }

    pub fn set_regs(&mut self, regs: &kvm_regs) {
        self.vcpu.kvm_vcpu.fd.set_regs(regs).unwrap();
        self.regs_cache.borrow_mut().replace(regs.clone());
        self.continuation_state = VMContinuationState::Main;
    }

    fn read_debugctl(&self) -> u64 {
        self.vcpu
            .kvm_vcpu
            .get_msrs([MSR_IA32_DEBUGCTLMSR].into_iter())
            .ok()
            .and_then(|msrs| msrs.get(&MSR_IA32_DEBUGCTLMSR).copied())
            .unwrap_or(0)
    }

    fn write_debugctl(&mut self, value: u64) {
        let msrs_to_set = [kvm_msr_entry {
            index: MSR_IA32_DEBUGCTLMSR,
            data: value,
            ..Default::default()
        }];
        let msrs_wrapper = Msrs::from_entries(&msrs_to_set).unwrap();
        let num_set = self.vcpu.kvm_vcpu.fd.set_msrs(&msrs_wrapper).unwrap();
        assert_eq!(num_set, 1);
    }

    /// Configures BTS tracing and DS area pointer for the current vCPU.
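    ///
    /// Illustrative sketch (the DS-area guest-physical address is a placeholder
    /// the caller must have set up beforehand):
    ///
    /// ```ignore
    /// vm.configure_bts(ds_area_paddr, BtsConfig {
    ///     enable: true,
    ///     interrupt: false,
    ///     off_user: false,
    ///     off_kernel: true, // suppress kernel-mode branches, trace user space only
    /// })?;
    /// ```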
    pub fn configure_bts(
        &mut self,
        ds_area_paddr: u64,
        config: BtsConfig,
    ) -> Result<(), MemoryError> {
        if (ds_area_paddr & mem::M_PAGE_OFFSET) != 0 {
            return Err(MemoryError::UnalignedAddress(ds_area_paddr));
        }
        let mut debugctl = self.read_debugctl();
        if config.enable {
            debugctl |= DEBUGCTL_BTS;
            if config.interrupt {
                debugctl |= DEBUGCTL_BTINT;
            } else {
                debugctl &= !DEBUGCTL_BTINT;
            }
            if config.off_kernel {
                debugctl |= DEBUGCTL_BTS_OFF_OS;
            } else {
                debugctl &= !DEBUGCTL_BTS_OFF_OS;
            }
            if config.off_user {
                debugctl |= DEBUGCTL_BTS_OFF_USR;
            } else {
                debugctl &= !DEBUGCTL_BTS_OFF_USR;
            }
        } else {
            debugctl &=
                !(DEBUGCTL_BTS | DEBUGCTL_BTINT | DEBUGCTL_BTS_OFF_OS | DEBUGCTL_BTS_OFF_USR);
        }
        let msrs_to_set = [
            kvm_msr_entry {
                index: MSR_IA32_DS_AREA,
                data: ds_area_paddr,
                ..Default::default()
            },
            kvm_msr_entry {
                index: MSR_IA32_DEBUGCTLMSR,
                data: debugctl,
                ..Default::default()
            },
        ];
        let msrs_wrapper = Msrs::from_entries(&msrs_to_set).unwrap();
        let num_set = self.vcpu.kvm_vcpu.fd.set_msrs(&msrs_wrapper).unwrap();
        assert_eq!(num_set, 2);
        Ok(())
    }

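    /// Programs KVM guest-debug state for the next run: single-step (with IRQ
    /// delivery blocked) when stepping, software-breakpoint exits vs.
    /// re-injection depending on `vmexit_on_swbp`, any active hardware
    /// breakpoints from `hw_breakpoints`, and the DEBUGCTL BTF bit for
    /// branch-granular stepping.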
    pub fn set_debug_state(&mut self, run_mode: RunMode, vmexit_on_swbp: bool) {
        let mut control = KVM_GUESTDBG_ENABLE;
        if run_mode.is_step() {
            control |= KVM_GUESTDBG_SINGLESTEP;
            control |= KVM_GUESTDBG_BLOCKIRQ;
        };
        // Set or clear BTF (branch trace flag) when requested.
        let mut debugctl = self.read_debugctl();
        if let RunMode::BranchStep = run_mode {
            debugctl |= DEBUGCTL_BTF;
        } else {
            debugctl &= !DEBUGCTL_BTF;
        }
        self.write_debugctl(debugctl);
        control |= if vmexit_on_swbp {
            KVM_GUESTDBG_USE_SW_BP
        } else {
            KVM_GUESTDBG_INJECT_BP
        };
        let mut arch = kvm_guest_debug_arch::default();
        if self.hw_breakpoints.any_active() {
            control |= KVM_GUESTDBG_USE_HW_BP;
            arch.debugreg[0] = self.hw_breakpoints.addr(0);
            arch.debugreg[1] = self.hw_breakpoints.addr(1);
            arch.debugreg[2] = self.hw_breakpoints.addr(2);
            arch.debugreg[3] = self.hw_breakpoints.addr(3);
            arch.debugreg[7] = self.hw_breakpoints.compute_dr7();
        }
        let dbg_info = kvm_guest_debug {
            control,
            pad: 0,
            arch,
        };
        self.vcpu.kvm_vcpu.fd.set_guest_debug(&dbg_info).unwrap();
    }

    pub fn is_nyx_hypercall(&self) -> bool {
        let regs = self.regs();
        return regs.rax == NYX_LITE;
    }

    pub fn parse_hypercall(&self) -> ExitReason {
        let regs = self.regs();
        if self.is_nyx_hypercall() {
            let hypercall = match regs.r8 {
                SHAREMEM => ExitReason::SharedMem(
                    String::from_utf8_lossy(&self.read_cstr_current(regs.r9)).to_string(),
                    regs.r10,
                    regs.r11.try_into().unwrap(),
                ),
                DBGPRINT => ExitReason::DebugPrint(
                    String::from_utf8_lossy(&self.read_cstr_current(regs.r9)).to_string(),
                ),
                SNAPSHOT => ExitReason::RequestSnapshot,
                EXECDONE => ExitReason::ExecDone(regs.r9),
                _ => ExitReason::Hypercall(regs.r8, regs.r9, regs.r10, regs.r11, regs.r12),
            };
            return hypercall;
        }
        panic!("Don't call parse_hypercall on a non-hypercall vmexit!");
    }

    pub fn parse_bad_memory_access(&self) -> ExitReason {
        let regs = self.regs();
        let sregs = self.sregs();
        let data = self.read_current_bytes(regs.rip, 16);
        let accesses = disassemble_memory_accesses(&data, &regs, &sregs);
        return ExitReason::BadMemoryAccess(accesses);
    }

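    /// Drives `run_emulation` in a loop, arming the timeout timer for the whole
    /// call, until something the caller has to handle happens (hypercall,
    /// breakpoint, single-step, shutdown, bad memory access, timeout, ...).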
    pub fn run_inner(&mut self, timeout: Duration) -> UnparsedExitReason {
        let start_time = time::Instant::now();
        self.timeout_timer.lock().unwrap().set_timeout(timeout);
        loop {
            self.regs_cache.replace(None);
            self.sregs_cache.replace(None);
            let _ = self.event_manager.borrow_mut().run_with_timeout(0);
            let mut exit = None;
            match self.vcpu.run_emulation() {
                // Emulation ran successfully, continue.
                Ok(VcpuEmulation::Handled) => {}
                Ok(VcpuEmulation::DirtyRingFull) => {
                    let vmm = self.vmm.lock().unwrap();
                    Self::drain_dirty_ring_backlog(
                        &vmm,
                        &mut self.dirty_ring,
                        &mut self.dirty_ring_backlog,
                    );
                }
                // Emulation was interrupted, check external events.
                Ok(VcpuEmulation::Interrupted) => {
                    if time::Instant::now().duration_since(start_time) >= timeout {
                        exit = Some(UnparsedExitReason::Timeout);
                    } else {
                        println!("[STOP] interrupt");
                        exit = Some(UnparsedExitReason::Interrupted);
                    }
                }
                Ok(VcpuEmulation::Stopped) => {
                    exit = Some(UnparsedExitReason::Shutdown);
                }
                Ok(VcpuEmulation::DebugEvent(dbg)) => {
                    let regs = self.regs();
                    let exc_reason = match dbg.exception {
                        DBG_EXCEPTION_BREAKPOINT if regs.rax == NYX_LITE => {
                            self.last_nyx_breakpoint.replace(None);
                            UnparsedExitReason::Hypercall
                        }
                        DBG_EXCEPTION_BREAKPOINT if regs.rax != NYX_LITE => {
                            let sregs = self.sregs();
                            if self
                                .breakpoint_manager
                                .forward_guest_bp(sregs.cr3, regs.rip)
                            {
                                self.last_nyx_breakpoint.replace(None);
                                UnparsedExitReason::GuestBreakpoint
                            } else {
                                self.last_nyx_breakpoint
                                    .replace(Some((sregs.cr3, regs.rip)));
                                UnparsedExitReason::NyxBreakpoint
                            }
                        }
                        DBG_EXCEPTION_SINGLESTEP if (dbg.dr6 & DR6_BS) != 0 => {
                            UnparsedExitReason::SingleStep
                        }
                        DBG_EXCEPTION_SINGLESTEP if (dbg.dr6 & DR6_HWBP_0) != 0 => {
                            UnparsedExitReason::HWBreakpoint(0)
                        }
                        DBG_EXCEPTION_SINGLESTEP if (dbg.dr6 & DR6_HWBP_1) != 0 => {
                            UnparsedExitReason::HWBreakpoint(1)
                        }
                        DBG_EXCEPTION_SINGLESTEP if (dbg.dr6 & DR6_HWBP_2) != 0 => {
                            UnparsedExitReason::HWBreakpoint(2)
                        }
                        DBG_EXCEPTION_SINGLESTEP if (dbg.dr6 & DR6_HWBP_3) != 0 => {
                            UnparsedExitReason::HWBreakpoint(3)
                        }

                        excp => {
                            panic!("Unexpected Debug Exception From KVM: {excp} {:x?} ", dbg);
                        }
                    };
                    exit = Some(exc_reason)
                }
                Err(VcpuError::FaultyKvmExit(err)) if err == "Bad address (os error 14)" => {
                    exit = Some(UnparsedExitReason::BadMemoryAccess)
                }
                Err(err) => {
                    panic!("KVM returned unexpected error {:?}", err)
                }
            }
            while let Ok(ev) = self.vcpu.event_receiver.try_recv() {
                match ev {
                    VcpuEvent::Finish => {
                        exit = Some(UnparsedExitReason::Shutdown);
                    }
                    event => {
                        println!(">== received event: {:?}", event);
                    }
                }
            }
            if let Some(exitreason) = exit {
                self.timeout_timer.lock().unwrap().disable();
                return exitreason;
            }
        }
    }

    pub fn parse_exit_reason(&self, unparsed: VMExitUserEvent) -> ExitReason {
        match unparsed {
            VMExitUserEvent::Hypercall => self.parse_hypercall(),
            VMExitUserEvent::BadMemoryAccess => self.parse_bad_memory_access(),
            VMExitUserEvent::Interrupted => ExitReason::Interrupted,
            VMExitUserEvent::Breakpoint => ExitReason::Breakpoint,
            VMExitUserEvent::SingleStep => ExitReason::SingleStep,
            VMExitUserEvent::Shutdown => ExitReason::Shutdown,
            VMExitUserEvent::Timeout => ExitReason::Timeout,
            VMExitUserEvent::HWBreakpoint(x) => ExitReason::HWBreakpoint(x),
        }
    }
    pub fn continue_vm(&mut self, run_mode: RunMode, timeout: Duration) -> ExitReason {
        let unparsed = VMContinuationState::step(self, run_mode, timeout);
        return self.parse_exit_reason(unparsed);
    }

    pub fn run(&mut self, timeout: Duration) -> ExitReason {
        return self.continue_vm(RunMode::Run, timeout);
    }

    pub fn single_step(&mut self, timeout: Duration) -> ExitReason {
        return self.continue_vm(RunMode::SingleStep, timeout);
    }

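    /// Plants a software breakpoint at `vaddr` in the address space identified
    /// by `cr3`, managed by the `BreakpointManager`. A hit is reported through
    /// the breakpoint exit path.
    ///
    /// Illustrative sketch (the target address is a placeholder):
    ///
    /// ```ignore
    /// let cr3 = vm.sregs().cr3;
    /// vm.add_breakpoint(cr3, target_rip);
    /// match vm.run(Duration::from_secs(1)) {
    ///     ExitReason::Breakpoint => { /* inspect vm.regs() here */ }
    ///     other => println!("unexpected exit: {:?}", other),
    /// }
    /// ```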
    pub fn add_breakpoint(&mut self, cr3: u64, vaddr: u64) {
        self.breakpoint_manager.add_breakpoint(cr3, vaddr);
    }

    pub fn remove_all_breakpoints(&mut self) {
        self.breakpoint_manager.remove_all_breakpoints();
    }

    pub(crate) fn disable_last_nyx_breakpoint(&mut self) {
        let mut vmm = self.vmm.lock().unwrap();
        if let Some((cr3, rip)) = *self.last_nyx_breakpoint.borrow() {
            self.breakpoint_manager
                .disable_breakpoint(&mut vmm, cr3, rip);
        } else {
            self.breakpoint_manager.disable_all_breakpoints(&mut vmm);
        }
    }

    pub fn read_cstr_current(&self, guest_vaddr: u64) -> Vec<u8> {
        let cr3 = self.sregs().cr3;
        let vmm = self.vmm.lock().unwrap();
        vmm.read_virtual_cstr(cr3, guest_vaddr)
    }
    pub fn read_current_u64(&self, vaddr: u64) -> u64 {
        self.current_process_memory().read_u64(vaddr).unwrap()
    }
    pub fn write_current_u64(&self, vaddr: u64, val: u64) {
        self.current_process_memory().write_u64(vaddr, val).unwrap();
    }

    pub fn write_current_bytes(&self, vaddr: u64, buffer: &[u8]) -> usize {
        self.current_process_memory()
            .write_bytes(vaddr, buffer)
            .unwrap()
    }

    pub fn read_current_bytes(&self, vaddr: u64, num_bytes: usize) -> Vec<u8> {
        let mut res = vec![0u8; num_bytes];
        let bytes_copied = self
            .current_process_memory()
            .read_bytes(vaddr, &mut res)
            .unwrap();
        res.truncate(bytes_copied);
        if is_canonical_user_addr(vaddr) {
            assert_eq!(bytes_copied, num_bytes);
        }
        return res;
    }

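    /// Single-steps with DEBUGCTL.BTF set, so each step advances by one branch
    /// rather than one instruction, and returns once the instruction just
    /// executed was a control-flow transfer (or another exit/timeout occurs).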
    pub fn branch_step(&mut self, timeout: Duration) -> ExitReason {
        let start = time::Instant::now();
        let mut prev_rip = self.regs().rip;
        loop {
            let elapsed = start.elapsed();
            if elapsed >= timeout {
                return ExitReason::Timeout;
            }
            let remaining = timeout.saturating_sub(elapsed);
            let exit = self.continue_vm(RunMode::BranchStep, remaining);
            match exit {
                ExitReason::SingleStep => {
                    let bytes = self.read_current_bytes(prev_rip, 16);
                    if is_control_flow(prev_rip, &bytes) {
                        return ExitReason::SingleStep;
                    }
                    prev_rip = self.regs().rip;
                }
                other => return other,
            }
        }
    }

    pub fn set_lbr(&mut self) {
        panic!("reading lbr doesn't seem to be supported by KVM");
        //let msrs_to_set = [
        //    kvm_msr_entry {
        //        index: MSR_IA32_DEBUGCTLMSR,
        //        data: 1,
        //        ..Default::default()
        //    },
        //];
        //let msrs_wrapper = Msrs::from_entries(&msrs_to_set).unwrap();
        //let num_set = self.vcpu.kvm_vcpu.fd.set_msrs(&msrs_wrapper).unwrap();
        //assert_eq!(num_set, msrs_to_set.len());
    }

    pub fn get_lbr(&mut self) {
        panic!("None of this seems supported by KVM?");
        // it appears XSAVE needs IA32_XSS[15] to actually store LBR data, and we can't set IA32_XSS via KVM.
        // let xsave = self.vcpu.kvm_vcpu.fd.get_xsave().unwrap();
        // println!("got xsave: {:x?}",xsave);
        //
        // this is all even more broken. All of this only applies to AMD cpus:
        ////let lbr_tos = self.vcpu.kvm_vcpu.get_msrs(&[MSR_LBR_TOS]).unwrap()[&0];
        // let msrs_to_set = [
        //     kvm_msr_entry {
        //         index: 0x00000680,
        //         data: 1,
        //         ..Default::default()
        //     },
        // ];
        // let mut msrs_wrapper = Msrs::from_entries(&msrs_to_set).unwrap();
        // let lbr_tos = self.vcpu.kvm_vcpu.fd.get_msrs(&mut msrs_wrapper).unwrap();
        // assert_eq!(lbr_tos, 1);
        // msrs_wrapper.as_slice().iter().for_each(|msr| {
        //     println!("got MSR_IA32_DEBUGCTLMSR: {}, {:?} {:?}", lbr_tos, msr.index, msr.data);
        // });
        ////let mut lbr_stack = Vec::with_capacity(32);
        //
        //for i in 0..32 {
        //    let msrs_to_set = [
        //        kvm_msr_entry {
        //            index: MSR_IA32_LASTBRANCHFROMIP+i,
        //            data: 1338,
        //            ..Default::default()
        //        },
        //        kvm_msr_entry {
        //            index: MSR_IA32_LASTBRANCHTOIP+i,
        //            data: 1339,
        //            ..Default::default()
        //        },
        //        kvm_msr_entry {
        //            index: MSR_LBR_TOS,
        //            data: 1337,
        //            ..Default::default()
        //        },
        //    ];
        //    let mut msrs_wrapper = Msrs::from_entries(&msrs_to_set).unwrap();
        //    let lbr_tos = self.vcpu.kvm_vcpu.fd.get_msrs(&mut msrs_wrapper).unwrap();
        //    assert_eq!(lbr_tos, 3);
        //    msrs_wrapper.as_slice().iter().for_each(|msr| {
        //        println!("got MSR_IA32_LASTBRANCHTOIP: {}, {:?} {:?}", lbr_tos, msr.index, msr.data);
        //    });
        //    //lbr_stack.push((
        //    //    self.vcpu.kvm_vcpu.get_msrs(&[MSR_IA32_LASTINTFROMIP+i]).unwrap()[&0],
        //    //    self.vcpu.kvm_vcpu.get_msrs(&[MSR_IA32_LASTBRANCHTOIP+i]).unwrap()[&0],
        //    //));
        //}
        ////println!("LBR: {:x?} top: {:?}", lbr_stack, lbr_tos);
    }
}