diff --git a/src/firecracker/src/generated/prctl.rs b/src/firecracker/src/generated/prctl.rs index 909bf00f822..659c87c92a2 100644 --- a/src/firecracker/src/generated/prctl.rs +++ b/src/firecracker/src/generated/prctl.rs @@ -141,6 +141,8 @@ pub const PR_MTE_TCF_MASK: u32 = 6; pub const PR_MTE_TAG_SHIFT: u32 = 3; pub const PR_MTE_TAG_MASK: u32 = 524280; pub const PR_MTE_TCF_SHIFT: u32 = 1; +pub const PR_PMLEN_SHIFT: u32 = 24; +pub const PR_PMLEN_MASK: u32 = 2130706432; pub const PR_SET_IO_FLUSHER: u32 = 57; pub const PR_GET_IO_FLUSHER: u32 = 58; pub const PR_SET_SYSCALL_USER_DISPATCH: u32 = 59; @@ -197,3 +199,9 @@ pub const PR_PPC_DEXCR_CTRL_CLEAR: u32 = 4; pub const PR_PPC_DEXCR_CTRL_SET_ONEXEC: u32 = 8; pub const PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC: u32 = 16; pub const PR_PPC_DEXCR_CTRL_MASK: u32 = 31; +pub const PR_GET_SHADOW_STACK_STATUS: u32 = 74; +pub const PR_SET_SHADOW_STACK_STATUS: u32 = 75; +pub const PR_SHADOW_STACK_ENABLE: u32 = 1; +pub const PR_SHADOW_STACK_WRITE: u32 = 2; +pub const PR_SHADOW_STACK_PUSH: u32 = 4; +pub const PR_LOCK_SHADOW_STACK_STATUS: u32 = 76; diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index e09c0887a3a..6435eeba637 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -275,17 +275,15 @@ fn create_chosen_node( Ok(()) } -fn create_vmgenid_node(fdt: &mut FdtWriter, vmgenid: &Option) -> Result<(), FdtError> { - if let Some(vmgenid_info) = vmgenid { - let vmgenid = fdt.begin_node("vmgenid")?; - fdt.property_string("compatible", "microsoft,vmgenid")?; - fdt.property_array_u64("reg", &[vmgenid_info.guest_address.0, VMGENID_MEM_SIZE])?; - fdt.property_array_u32( - "interrupts", - &[GIC_FDT_IRQ_TYPE_SPI, vmgenid_info.gsi, IRQ_TYPE_EDGE_RISING], - )?; - fdt.end_node(vmgenid)?; - } +fn create_vmgenid_node(fdt: &mut FdtWriter, vmgenid: &VmGenId) -> Result<(), FdtError> { + let vmgenid_node = fdt.begin_node("vmgenid")?; + fdt.property_string("compatible", "microsoft,vmgenid")?; + 
fdt.property_array_u64("reg", &[vmgenid.guest_address.0, VMGENID_MEM_SIZE])?; + fdt.property_array_u32( + "interrupts", + &[GIC_FDT_IRQ_TYPE_SPI, vmgenid.gsi, IRQ_TYPE_EDGE_RISING], + )?; + fdt.end_node(vmgenid_node)?; Ok(()) } @@ -586,29 +584,6 @@ mod tests { .unwrap(); } - #[test] - fn test_create_fdt_with_vmgenid() { - let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - let mut device_manager = default_device_manager(); - let kvm = Kvm::new(vec![]).unwrap(); - let vm = Vm::new(&kvm).unwrap(); - let gic = create_gic(vm.fd(), 1, None).unwrap(); - let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); - cmdline.insert("console", "/dev/tty0").unwrap(); - - device_manager.attach_vmgenid_device(&mem, &vm).unwrap(); - - create_fdt( - &mem, - vec![0], - CString::new("console=tty0").unwrap(), - &device_manager, - &gic, - &None, - ) - .unwrap(); - } - #[test] fn test_create_fdt() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); diff --git a/src/vmm/src/arch/aarch64/output_GICv3.dtb b/src/vmm/src/arch/aarch64/output_GICv3.dtb index 35f4e9b63a3..979cd68a285 100644 Binary files a/src/vmm/src/arch/aarch64/output_GICv3.dtb and b/src/vmm/src/arch/aarch64/output_GICv3.dtb differ diff --git a/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb b/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb index fb6147ade9c..63ab6765036 100644 Binary files a/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb and b/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb differ diff --git a/src/vmm/src/arch/x86_64/generated/msr_index.rs b/src/vmm/src/arch/x86_64/generated/msr_index.rs index ccdceeac8d4..095e9fe5960 100644 --- a/src/vmm/src/arch/x86_64/generated/msr_index.rs +++ b/src/vmm/src/arch/x86_64/generated/msr_index.rs @@ -237,7 +237,6 @@ pub const MSR_AMD64_OSVW_ID_LENGTH: u32 = 0xc0010140; pub const MSR_AMD64_OSVW_STATUS: u32 = 0xc0010141; pub const MSR_AMD_PPIN_CTL: u32 = 0xc00102f0; pub const MSR_AMD_PPIN: u32 = 0xc00102f1; -pub const MSR_AMD64_CPUID_FN_7: u32 = 0xc0011002; pub 
const MSR_AMD64_CPUID_FN_1: u32 = 0xc0011004; pub const MSR_AMD64_LS_CFG: u32 = 0xc0011020; pub const MSR_AMD64_DC_CFG: u32 = 0xc0011022; diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 0f9ef70813e..0a602b68871 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -29,7 +29,6 @@ use crate::device_manager::{ AttachDeviceError, DeviceManager, DeviceManagerCreateError, DevicePersistError, DeviceRestoreArgs, }; -use crate::devices::acpi::vmgenid::VmGenIdError; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::net::Net; @@ -76,8 +75,6 @@ pub enum StartMicrovmError { /// Error creating legacy device: {0} #[cfg(target_arch = "x86_64")] CreateLegacyDevice(device_manager::legacy::LegacyDeviceError), - /// Error creating VMGenID device: {0} - CreateVMGenID(VmGenIdError), /// Error enabling PCIe support: {0} EnablePciDevices(#[from] PciManagerError), /// Error enabling pvtime on vcpu: {0} @@ -258,7 +255,9 @@ pub fn build_microvm_for_boot( vm_resources.serial_out_path.as_ref(), )?; - device_manager.attach_vmgenid_device(vm.guest_memory(), &vm)?; + device_manager.attach_vmgenid_device(&vm)?; + #[cfg(target_arch = "x86_64")] + device_manager.attach_vmclock_device(&vm)?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { @@ -943,10 +942,12 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] pub(crate) fn insert_vmgenid_device(vmm: &mut Vmm) { - vmm.device_manager - .attach_vmgenid_device(vmm.vm.guest_memory(), &vmm.vm) - .unwrap(); - assert!(vmm.device_manager.acpi_devices.vmgenid.is_some()); + vmm.device_manager.attach_vmgenid_device(&vmm.vm).unwrap(); + } + + #[cfg(target_arch = "x86_64")] + pub(crate) fn insert_vmclock_device(vmm: &mut Vmm) { + vmm.device_manager.attach_vmclock_device(&vmm.vm).unwrap(); } pub(crate) fn insert_balloon_device( diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 
874443fcc5c..9764143b5a9 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -2,84 +2,96 @@ // SPDX-License-Identifier: Apache-2.0 use acpi_tables::{Aml, aml}; +use vm_memory::GuestMemoryError; use crate::Vm; +#[cfg(target_arch = "x86_64")] +use crate::devices::acpi::vmclock::VmClock; use crate::devices::acpi::vmgenid::VmGenId; +use crate::vstate::resources::ResourceAllocator; -#[derive(Debug, Default)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum ACPIDeviceError { + /// Could not register GSI with KVM: {0} + RegisterIrq(#[from] kvm_ioctls::Error), + /// Could not write to guest memory: {0} + WriteGuestMemory(#[from] GuestMemoryError), +} + +#[derive(Debug)] pub struct ACPIDeviceManager { /// VMGenID device - pub vmgenid: Option, + pub vmgenid: VmGenId, + /// VMclock device + #[cfg(target_arch = "x86_64")] + pub vmclock: VmClock, } impl ACPIDeviceManager { /// Create a new ACPIDeviceManager object - pub fn new() -> Self { - Default::default() + pub fn new(resource_allocator: &mut ResourceAllocator) -> Self { + ACPIDeviceManager { + vmgenid: VmGenId::new(resource_allocator), + #[cfg(target_arch = "x86_64")] + vmclock: VmClock::new(resource_allocator), + } } - /// Attach a new VMGenID device to the microVM - /// - /// This will register the device's interrupt with KVM - pub fn attach_vmgenid(&mut self, vmgenid: VmGenId, vm: &Vm) -> Result<(), kvm_ioctls::Error> { - vm.register_irq(&vmgenid.interrupt_evt, vmgenid.gsi)?; - self.vmgenid = Some(vmgenid); + pub fn attach_vmgenid(&self, vm: &Vm) -> Result<(), ACPIDeviceError> { + vm.register_irq(&self.vmgenid.interrupt_evt, self.vmgenid.gsi)?; + self.vmgenid.activate(vm.guest_memory())?; Ok(()) } - /// If it exists, notify guest VMGenID device that we have resumed from a snapshot. 
- pub fn notify_vmgenid(&mut self) -> Result<(), std::io::Error> { - if let Some(vmgenid) = &mut self.vmgenid { - vmgenid.notify_guest()?; - } + #[cfg(target_arch = "x86_64")] + pub fn attach_vmclock(&self, vm: &Vm) -> Result<(), ACPIDeviceError> { + self.vmclock.activate(vm.guest_memory())?; Ok(()) } } impl Aml for ACPIDeviceManager { fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { - // If we have a VMGenID device, create the AML for the device and GED interrupt handler - match self.vmgenid.as_ref() { - Some(vmgenid) => { - // AML for GED - aml::Device::new( - "_SB_.GED_".try_into()?, - vec![ - &aml::Name::new("_HID".try_into()?, &"ACPI0013")?, - &aml::Name::new( - "_CRS".try_into()?, - &aml::ResourceTemplate::new(vec![&aml::Interrupt::new( - true, - true, - false, - false, - vmgenid.gsi, - )]), - )?, - &aml::Method::new( - "_EVT".try_into()?, - 1, - true, - vec![&aml::If::new( - // We know that the maximum IRQ number fits in a u8. We have up to - // 32 IRQs in x86 and up to 128 in - // ARM (look into - // `vmm::crate::arch::layout::GSI_LEGACY_END`) - #[allow(clippy::cast_possible_truncation)] - &aml::Equal::new(&aml::Arg(0), &(vmgenid.gsi as u8)), - vec![&aml::Notify::new( - &aml::Path::new("\\_SB_.VGEN")?, - &0x80usize, - )], - )], - ), - ], - ) - .append_aml_bytes(v)?; - // AML for VMGenID itself. - vmgenid.append_aml_bytes(v) - } - None => Ok(()), - } + // AML for [`VmGenId`] device. + self.vmgenid.append_aml_bytes(v)?; + // AML for [`VmClock`] device. 
+ #[cfg(target_arch = "x86_64")] + self.vmclock.append_aml_bytes(v)?; + + // Create the AML for the GED interrupt handler + aml::Device::new( + "_SB_.GED_".try_into()?, + vec![ + &aml::Name::new("_HID".try_into()?, &"ACPI0013")?, + &aml::Name::new( + "_CRS".try_into()?, + &aml::ResourceTemplate::new(vec![&aml::Interrupt::new( + true, + true, + false, + false, + self.vmgenid.gsi, + )]), + )?, + &aml::Method::new( + "_EVT".try_into()?, + 1, + true, + vec![&aml::If::new( + // We know that the maximum IRQ number fits in a u8. We have up to + // 32 IRQs in x86 and up to 128 in + // ARM (look into + // `vmm::crate::arch::layout::GSI_LEGACY_END`) + #[allow(clippy::cast_possible_truncation)] + &aml::Equal::new(&aml::Arg(0), &(self.vmgenid.gsi as u8)), + vec![&aml::Notify::new( + &aml::Path::new("\\_SB_.VGEN")?, + &0x80usize, + )], + )], + ), + ], + ) + .append_aml_bytes(v) } } diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index ad30da5b5db..fc245e05539 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -18,12 +18,12 @@ use linux_loader::loader::Cmdline; use log::{error, info}; use mmio::{MMIODeviceManager, MmioError}; use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; -use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; +use persist::MMIODevManagerConstructorArgs; use serde::{Deserialize, Serialize}; use utils::time::TimestampUs; use vmm_sys_util::eventfd::EventFd; -use crate::devices::acpi::vmgenid::{VmGenId, VmGenIdError}; +use crate::device_manager::acpi::ACPIDeviceError; #[cfg(target_arch = "x86_64")] use crate::devices::legacy::I8042Device; #[cfg(target_arch = "aarch64")] @@ -70,10 +70,8 @@ pub enum AttachDeviceError { MmioTransport(#[from] MmioError), /// Error inserting device in bus: {0} Bus(#[from] BusError), - /// Error creating VMGenID device: {0} - CreateVmGenID(#[from] VmGenIdError), - /// Error while registering VMGenID with KVM: {0} - 
AttachVmGenID(#[from] kvm_ioctls::Error), + /// Error while registering ACPI with KVM: {0} + AttachAcpiDevice(#[from] ACPIDeviceError), #[cfg(target_arch = "aarch64")] /// Cmdline error Cmdline, @@ -176,7 +174,7 @@ impl DeviceManager { mmio_devices: MMIODeviceManager::new(), #[cfg(target_arch = "x86_64")] legacy_devices, - acpi_devices: ACPIDeviceManager::new(), + acpi_devices: ACPIDeviceManager::new(&mut vm.resource_allocator()), pci_devices: PciDevices::new(), }) } @@ -234,13 +232,14 @@ impl DeviceManager { Ok(()) } - pub(crate) fn attach_vmgenid_device( - &mut self, - mem: &GuestMemoryMmap, - vm: &Vm, - ) -> Result<(), AttachDeviceError> { - let vmgenid = VmGenId::new(mem, &mut vm.resource_allocator())?; - self.acpi_devices.attach_vmgenid(vmgenid, vm)?; + pub(crate) fn attach_vmgenid_device(&mut self, vm: &Vm) -> Result<(), AttachDeviceError> { + self.acpi_devices.attach_vmgenid(vm)?; + Ok(()) + } + + #[cfg(target_arch = "x86_64")] + pub(crate) fn attach_vmclock_device(&mut self, vm: &Vm) -> Result<(), AttachDeviceError> { + self.acpi_devices.attach_vmclock(vm)?; Ok(()) } @@ -394,7 +393,7 @@ pub enum DevicePersistError { /// Error restoring MMIO devices: {0} MmioRestore(#[from] persist::DevicePersistError), /// Error restoring ACPI devices: {0} - AcpiRestore(#[from] persist::ACPIDeviceManagerRestoreError), + AcpiRestore(#[from] ACPIDeviceError), /// Error restoring PCI devices: {0} PciRestore(#[from] PciManagerError), /// Error notifying VMGenID device: {0} @@ -464,12 +463,8 @@ impl<'a> Persist<'a> for DeviceManager { let mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; // Restore ACPI devices - let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { - mem: constructor_args.mem, - vm: constructor_args.vm, - }; - let mut acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; - acpi_devices.notify_vmgenid()?; + let mut acpi_devices = ACPIDeviceManager::restore(constructor_args.vm, &state.acpi_state)?; + 
acpi_devices.vmgenid.notify_guest()?; // Restore PCI devices let pci_ctor_args = PciDevicesConstructorArgs { @@ -542,10 +537,12 @@ pub(crate) mod tests { use super::*; #[cfg(target_arch = "aarch64")] use crate::builder::tests::default_vmm; + use crate::vstate::resources::ResourceAllocator; pub(crate) fn default_device_manager() -> DeviceManager { + let mut resource_allocator = ResourceAllocator::new(); let mmio_devices = MMIODeviceManager::new(); - let acpi_devices = ACPIDeviceManager::new(); + let acpi_devices = ACPIDeviceManager::new(&mut resource_allocator); let pci_devices = PciDevices::new(); #[cfg(target_arch = "x86_64")] diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 49b4115f2cc..fa83aae9e37 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -14,7 +14,10 @@ use super::acpi::ACPIDeviceManager; use super::mmio::*; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; -use crate::devices::acpi::vmgenid::{VMGenIDState, VMGenIdConstructorArgs, VmGenId, VmGenIdError}; +use crate::device_manager::acpi::ACPIDeviceError; +#[cfg(target_arch = "x86_64")] +use crate::devices::acpi::vmclock::{VmClock, VmClockState}; +use crate::devices::acpi::vmgenid::{VMGenIDState, VmGenId}; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::RTCDevice; use crate::devices::virtio::ActivateError; @@ -156,50 +159,35 @@ impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct ACPIDeviceManagerState { - vmgenid: Option, -} - -#[derive(Debug)] -pub struct ACPIDeviceManagerConstructorArgs<'a> { - pub mem: &'a GuestMemoryMmap, - pub vm: &'a Vm, -} - -#[derive(Debug, thiserror::Error, displaydoc::Display)] -pub enum ACPIDeviceManagerRestoreError { - /// Could not register device: {0} - Interrupt(#[from] kvm_ioctls::Error), - /// Could not create VMGenID device: {0} - VMGenID(#[from] VmGenIdError), + vmgenid: 
VMGenIDState, + #[cfg(target_arch = "x86_64")] + vmclock: VmClockState, } impl<'a> Persist<'a> for ACPIDeviceManager { type State = ACPIDeviceManagerState; - type ConstructorArgs = ACPIDeviceManagerConstructorArgs<'a>; - type Error = ACPIDeviceManagerRestoreError; + type ConstructorArgs = &'a Vm; + type Error = ACPIDeviceError; fn save(&self) -> Self::State { ACPIDeviceManagerState { - vmgenid: self.vmgenid.as_ref().map(|dev| dev.save()), + vmgenid: self.vmgenid.save(), + #[cfg(target_arch = "x86_64")] + vmclock: self.vmclock.save(), } } - fn restore( - constructor_args: Self::ConstructorArgs, - state: &Self::State, - ) -> Result { - let mut dev_manager = ACPIDeviceManager::new(); - if let Some(vmgenid_args) = &state.vmgenid { - let vmgenid = VmGenId::restore( - VMGenIdConstructorArgs { - mem: constructor_args.mem, - resource_allocator: &mut constructor_args.vm.resource_allocator(), - }, - vmgenid_args, - )?; - dev_manager.attach_vmgenid(vmgenid, constructor_args.vm)?; - } - Ok(dev_manager) + fn restore(vm: Self::ConstructorArgs, state: &Self::State) -> Result { + let acpi_devices = ACPIDeviceManager { + // Safe to unwrap() here, this will never return an error. + vmgenid: VmGenId::restore((), &state.vmgenid).unwrap(), + // Safe to unwrap() here, this will never return an error. + #[cfg(target_arch = "x86_64")] + vmclock: VmClock::restore(vm.guest_memory(), &state.vmclock).unwrap(), + }; + + acpi_devices.attach_vmgenid(vm)?; + Ok(acpi_devices) } } diff --git a/src/vmm/src/devices/acpi/generated/mod.rs b/src/vmm/src/devices/acpi/generated/mod.rs new file mode 100644 index 00000000000..b7b60c9f800 --- /dev/null +++ b/src/vmm/src/devices/acpi/generated/mod.rs @@ -0,0 +1,9 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#![allow(clippy::all)] +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] + +pub mod vmclock_abi; diff --git a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs new file mode 100644 index 00000000000..134c8393f0c --- /dev/null +++ b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs @@ -0,0 +1,201 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +// automatically generated by tools/bindgen.sh + +#![allow( + non_camel_case_types, + non_upper_case_globals, + dead_code, + non_snake_case, + clippy::ptr_as_ptr, + clippy::undocumented_unsafe_blocks, + missing_debug_implementations, + clippy::tests_outside_test_module, + unsafe_op_in_unsafe_fn, + clippy::redundant_static_lifetimes +)] + +use serde::{Deserialize, Serialize}; + +pub const __BITS_PER_LONG: u32 = 64; +pub const __BITS_PER_LONG_LONG: u32 = 64; +pub const __FD_SETSIZE: u32 = 1024; +pub const VMCLOCK_MAGIC: u32 = 1263289174; +pub const VMCLOCK_COUNTER_ARM_VCNT: u8 = 0; +pub const VMCLOCK_COUNTER_X86_TSC: u8 = 1; +pub const VMCLOCK_COUNTER_INVALID: u8 = 255; +pub const VMCLOCK_TIME_UTC: u8 = 0; +pub const VMCLOCK_TIME_TAI: u8 = 1; +pub const VMCLOCK_TIME_MONOTONIC: u8 = 2; +pub const VMCLOCK_TIME_INVALID_SMEARED: u8 = 3; +pub const VMCLOCK_TIME_INVALID_MAYBE_SMEARED: u8 = 4; +pub const VMCLOCK_FLAG_TAI_OFFSET_VALID: u64 = 1; +pub const VMCLOCK_FLAG_DISRUPTION_SOON: u64 = 2; +pub const VMCLOCK_FLAG_DISRUPTION_IMMINENT: u64 = 4; +pub const VMCLOCK_FLAG_PERIOD_ESTERROR_VALID: u64 = 8; +pub const VMCLOCK_FLAG_PERIOD_MAXERROR_VALID: u64 = 16; +pub const VMCLOCK_FLAG_TIME_ESTERROR_VALID: u64 = 32; +pub const VMCLOCK_FLAG_TIME_MAXERROR_VALID: u64 = 64; +pub const VMCLOCK_FLAG_TIME_MONOTONIC: u64 = 128; +pub const VMCLOCK_STATUS_UNKNOWN: u8 = 0; +pub const VMCLOCK_STATUS_INITIALIZING: u8 = 1; +pub const 
VMCLOCK_STATUS_SYNCHRONIZED: u8 = 2; +pub const VMCLOCK_STATUS_FREERUNNING: u8 = 3; +pub const VMCLOCK_STATUS_UNRELIABLE: u8 = 4; +pub const VMCLOCK_SMEARING_STRICT: u8 = 0; +pub const VMCLOCK_SMEARING_NOON_LINEAR: u8 = 1; +pub const VMCLOCK_SMEARING_UTC_SLS: u8 = 2; +pub const VMCLOCK_LEAP_NONE: u8 = 0; +pub const VMCLOCK_LEAP_PRE_POS: u8 = 1; +pub const VMCLOCK_LEAP_PRE_NEG: u8 = 2; +pub const VMCLOCK_LEAP_POS: u8 = 3; +pub const VMCLOCK_LEAP_POST_POS: u8 = 4; +pub const VMCLOCK_LEAP_POST_NEG: u8 = 5; +pub type __s8 = ::std::os::raw::c_schar; +pub type __u8 = ::std::os::raw::c_uchar; +pub type __s16 = ::std::os::raw::c_short; +pub type __u16 = ::std::os::raw::c_ushort; +pub type __s32 = ::std::os::raw::c_int; +pub type __u32 = ::std::os::raw::c_uint; +pub type __s64 = ::std::os::raw::c_longlong; +pub type __u64 = ::std::os::raw::c_ulonglong; +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct __kernel_fd_set { + pub fds_bits: [::std::os::raw::c_ulong; 16usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of __kernel_fd_set"][::std::mem::size_of::<__kernel_fd_set>() - 128usize]; + ["Alignment of __kernel_fd_set"][::std::mem::align_of::<__kernel_fd_set>() - 8usize]; + ["Offset of field: __kernel_fd_set::fds_bits"] + [::std::mem::offset_of!(__kernel_fd_set, fds_bits) - 0usize]; +}; +pub type __kernel_sighandler_t = + ::std::option::Option; +pub type __kernel_key_t = ::std::os::raw::c_int; +pub type __kernel_mqd_t = ::std::os::raw::c_int; +pub type __kernel_old_uid_t = ::std::os::raw::c_ushort; +pub type __kernel_old_gid_t = ::std::os::raw::c_ushort; +pub type __kernel_old_dev_t = ::std::os::raw::c_ulong; +pub type __kernel_long_t = ::std::os::raw::c_long; +pub type __kernel_ulong_t = ::std::os::raw::c_ulong; +pub type __kernel_ino_t = __kernel_ulong_t; +pub type __kernel_mode_t = ::std::os::raw::c_uint; +pub type __kernel_pid_t = ::std::os::raw::c_int; +pub type __kernel_ipc_pid_t = 
::std::os::raw::c_int; +pub type __kernel_uid_t = ::std::os::raw::c_uint; +pub type __kernel_gid_t = ::std::os::raw::c_uint; +pub type __kernel_suseconds_t = __kernel_long_t; +pub type __kernel_daddr_t = ::std::os::raw::c_int; +pub type __kernel_uid32_t = ::std::os::raw::c_uint; +pub type __kernel_gid32_t = ::std::os::raw::c_uint; +pub type __kernel_size_t = __kernel_ulong_t; +pub type __kernel_ssize_t = __kernel_long_t; +pub type __kernel_ptrdiff_t = __kernel_long_t; +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct __kernel_fsid_t { + pub val: [::std::os::raw::c_int; 2usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of __kernel_fsid_t"][::std::mem::size_of::<__kernel_fsid_t>() - 8usize]; + ["Alignment of __kernel_fsid_t"][::std::mem::align_of::<__kernel_fsid_t>() - 4usize]; + ["Offset of field: __kernel_fsid_t::val"] + [::std::mem::offset_of!(__kernel_fsid_t, val) - 0usize]; +}; +pub type __kernel_off_t = __kernel_long_t; +pub type __kernel_loff_t = ::std::os::raw::c_longlong; +pub type __kernel_old_time_t = __kernel_long_t; +pub type __kernel_time_t = __kernel_long_t; +pub type __kernel_time64_t = ::std::os::raw::c_longlong; +pub type __kernel_clock_t = __kernel_long_t; +pub type __kernel_timer_t = ::std::os::raw::c_int; +pub type __kernel_clockid_t = ::std::os::raw::c_int; +pub type __kernel_caddr_t = *mut ::std::os::raw::c_char; +pub type __kernel_uid16_t = ::std::os::raw::c_ushort; +pub type __kernel_gid16_t = ::std::os::raw::c_ushort; +pub type __s128 = i128; +pub type __u128 = u128; +pub type __le16 = __u16; +pub type __be16 = __u16; +pub type __le32 = __u32; +pub type __be32 = __u32; +pub type __le64 = __u64; +pub type __be64 = __u64; +pub type __sum16 = __u16; +pub type __wsum = __u32; +pub type __poll_t = ::std::os::raw::c_uint; +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq, Serialize, Deserialize)] +pub struct vmclock_abi { + pub magic: __le32, + pub size: __le32, 
+ pub version: __le16, + pub counter_id: __u8, + pub time_type: __u8, + pub seq_count: __le32, + pub disruption_marker: __le64, + pub flags: __le64, + pub pad: [__u8; 2usize], + pub clock_status: __u8, + pub leap_second_smearing_hint: __u8, + pub tai_offset_sec: __le16, + pub leap_indicator: __u8, + pub counter_period_shift: __u8, + pub counter_value: __le64, + pub counter_period_frac_sec: __le64, + pub counter_period_esterror_rate_frac_sec: __le64, + pub counter_period_maxerror_rate_frac_sec: __le64, + pub time_sec: __le64, + pub time_frac_sec: __le64, + pub time_esterror_nanosec: __le64, + pub time_maxerror_nanosec: __le64, +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of vmclock_abi"][::std::mem::size_of::() - 104usize]; + ["Alignment of vmclock_abi"][::std::mem::align_of::() - 8usize]; + ["Offset of field: vmclock_abi::magic"][::std::mem::offset_of!(vmclock_abi, magic) - 0usize]; + ["Offset of field: vmclock_abi::size"][::std::mem::offset_of!(vmclock_abi, size) - 4usize]; + ["Offset of field: vmclock_abi::version"] + [::std::mem::offset_of!(vmclock_abi, version) - 8usize]; + ["Offset of field: vmclock_abi::counter_id"] + [::std::mem::offset_of!(vmclock_abi, counter_id) - 10usize]; + ["Offset of field: vmclock_abi::time_type"] + [::std::mem::offset_of!(vmclock_abi, time_type) - 11usize]; + ["Offset of field: vmclock_abi::seq_count"] + [::std::mem::offset_of!(vmclock_abi, seq_count) - 12usize]; + ["Offset of field: vmclock_abi::disruption_marker"] + [::std::mem::offset_of!(vmclock_abi, disruption_marker) - 16usize]; + ["Offset of field: vmclock_abi::flags"][::std::mem::offset_of!(vmclock_abi, flags) - 24usize]; + ["Offset of field: vmclock_abi::pad"][::std::mem::offset_of!(vmclock_abi, pad) - 32usize]; + ["Offset of field: vmclock_abi::clock_status"] + [::std::mem::offset_of!(vmclock_abi, clock_status) - 34usize]; + ["Offset of field: vmclock_abi::leap_second_smearing_hint"] + [::std::mem::offset_of!(vmclock_abi, 
leap_second_smearing_hint) - 35usize]; + ["Offset of field: vmclock_abi::tai_offset_sec"] + [::std::mem::offset_of!(vmclock_abi, tai_offset_sec) - 36usize]; + ["Offset of field: vmclock_abi::leap_indicator"] + [::std::mem::offset_of!(vmclock_abi, leap_indicator) - 38usize]; + ["Offset of field: vmclock_abi::counter_period_shift"] + [::std::mem::offset_of!(vmclock_abi, counter_period_shift) - 39usize]; + ["Offset of field: vmclock_abi::counter_value"] + [::std::mem::offset_of!(vmclock_abi, counter_value) - 40usize]; + ["Offset of field: vmclock_abi::counter_period_frac_sec"] + [::std::mem::offset_of!(vmclock_abi, counter_period_frac_sec) - 48usize]; + ["Offset of field: vmclock_abi::counter_period_esterror_rate_frac_sec"] + [::std::mem::offset_of!(vmclock_abi, counter_period_esterror_rate_frac_sec) - 56usize]; + ["Offset of field: vmclock_abi::counter_period_maxerror_rate_frac_sec"] + [::std::mem::offset_of!(vmclock_abi, counter_period_maxerror_rate_frac_sec) - 64usize]; + ["Offset of field: vmclock_abi::time_sec"] + [::std::mem::offset_of!(vmclock_abi, time_sec) - 72usize]; + ["Offset of field: vmclock_abi::time_frac_sec"] + [::std::mem::offset_of!(vmclock_abi, time_frac_sec) - 80usize]; + ["Offset of field: vmclock_abi::time_esterror_nanosec"] + [::std::mem::offset_of!(vmclock_abi, time_esterror_nanosec) - 88usize]; + ["Offset of field: vmclock_abi::time_maxerror_nanosec"] + [::std::mem::offset_of!(vmclock_abi, time_maxerror_nanosec) - 96usize]; +}; diff --git a/src/vmm/src/devices/acpi/mod.rs b/src/vmm/src/devices/acpi/mod.rs index 5151bddd231..8eba26ac41d 100644 --- a/src/vmm/src/devices/acpi/mod.rs +++ b/src/vmm/src/devices/acpi/mod.rs @@ -1,4 +1,6 @@ // Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +mod generated; +pub mod vmclock; pub mod vmgenid; diff --git a/src/vmm/src/devices/acpi/vmclock.rs b/src/vmm/src/devices/acpi/vmclock.rs new file mode 100644 index 00000000000..9e12a81e287 --- /dev/null +++ b/src/vmm/src/devices/acpi/vmclock.rs @@ -0,0 +1,233 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::convert::Infallible; +use std::mem::offset_of; +use std::sync::atomic::{Ordering, fence}; + +use acpi_tables::{Aml, aml}; +use log::error; +use serde::{Deserialize, Serialize}; +use vm_allocator::AllocPolicy; +use vm_memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryError}; + +use crate::devices::acpi::generated::vmclock_abi::{ + VMCLOCK_COUNTER_INVALID, VMCLOCK_MAGIC, VMCLOCK_STATUS_UNKNOWN, vmclock_abi, +}; +use crate::snapshot::Persist; +use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::resources::ResourceAllocator; + +// SAFETY: `vmclock_abi` is a POD +unsafe impl ByteValued for vmclock_abi {} + +// We are reserving a physical page to expose the [`VmClock`] data +const VMCLOCK_SIZE: u32 = 0x1000; + +// Write a value in `vmclock_abi` both in the Firecracker-managed state +// and at the guest memory address of that field. The `write_obj` result is discarded. +macro_rules! write_vmclock_field { + ($vmclock:expr, $mem:expr, $field:ident, $value:expr) => { + $vmclock.inner.$field = $value; + $mem.write_obj( + $vmclock.inner.$field, + $vmclock + .guest_address + .unchecked_add(offset_of!(vmclock_abi, $field) as u64), + ); + }; +} + +/// VMclock device +/// +/// This device emulates the VMclock device which allows passing information to the guest related +/// to the relation of the host CPU to real-time clock as well as information about disruptive +/// events, such as live-migration. 
+#[derive(Debug)] +pub struct VmClock { + /// Guest address in which we will write the VMclock struct + pub guest_address: GuestAddress, + /// The [`VmClock`] state we are exposing to the guest + inner: vmclock_abi, +} + +impl VmClock { + /// Create a new [`VmClock`] device for a newly booted VM + pub fn new(resource_allocator: &mut ResourceAllocator) -> VmClock { + let addr = resource_allocator + .allocate_system_memory( + VMCLOCK_SIZE as u64, + VMCLOCK_SIZE as u64, + AllocPolicy::LastMatch, + ) + .inspect_err(|err| error!("vmclock: could not allocate guest memory for device: {err}")) + .unwrap(); + + let mut inner = vmclock_abi { + magic: VMCLOCK_MAGIC, + size: VMCLOCK_SIZE, + version: u16::to_le(1), + clock_status: VMCLOCK_STATUS_UNKNOWN, + counter_id: VMCLOCK_COUNTER_INVALID, + ..Default::default() + }; + + VmClock { + guest_address: GuestAddress(addr), + inner, + } + } + + /// Activate [`VmClock`] device + pub fn activate(&self, mem: &GuestMemoryMmap) -> Result<(), GuestMemoryError> { + mem.write_slice(self.inner.as_slice(), self.guest_address)?; + Ok(()) + } + + /// Bump `disruption_marker` in guest memory, bracketed by `seq_count` updates + pub fn post_load_update(&mut self, mem: &GuestMemoryMmap) { + write_vmclock_field!(self, mem, seq_count, self.inner.seq_count | 1); + + // This fence ensures guest sees all previous writes. It is matched to a + // read barrier in the guest. + fence(Ordering::Release); + + write_vmclock_field!( + self, + mem, + disruption_marker, + self.inner.disruption_marker.wrapping_add(1) + ); + + // This fence ensures guest sees the `disruption_marker` update. It is matched to a + // read barrier in the guest. 
+ fence(Ordering::Release); + + write_vmclock_field!(self, mem, seq_count, self.inner.seq_count.wrapping_add(1)); + } +} + +/// (De)serialize-able state of the [`VmClock`] +/// +/// We could avoid this and reuse [`VmClock`] itself if `GuestAddress` was `Serialize`/`Deserialize` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct VmClockState { + /// Guest address in which we write the [`VmClock`] info + pub guest_address: u64, + /// Data we expose to the guest + pub inner: vmclock_abi, +} + +impl<'a> Persist<'a> for VmClock { + type State = VmClockState; + type ConstructorArgs = &'a GuestMemoryMmap; + type Error = Infallible; + + fn save(&self) -> Self::State { + VmClockState { + guest_address: self.guest_address.0, + inner: self.inner, + } + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> Result { + let mut vmclock = VmClock { + guest_address: GuestAddress(state.guest_address), + inner: state.inner, + }; + vmclock.post_load_update(constructor_args); + Ok(vmclock) + } +} + +impl Aml for VmClock { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + aml::Device::new( + "_SB_.VCLK".try_into()?, + vec![ + &aml::Name::new("_HID".try_into()?, &"AMZNC10C")?, + &aml::Name::new("_CID".try_into()?, &"VMCLOCK")?, + &aml::Name::new("_DDN".try_into()?, &"VMCLOCK")?, + &aml::Method::new( + "_STA".try_into()?, + 0, + false, + vec![&aml::Return::new(&0x0fu8)], + ), + &aml::Name::new( + "_CRS".try_into()?, + &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::Cacheable, + false, + self.guest_address.0, + self.guest_address.0 + VMCLOCK_SIZE as u64 - 1, + )?]), + )?, + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(test)] +mod tests { + use vm_memory::{Bytes, GuestAddress}; + + use crate::arch; + use crate::devices::acpi::generated::vmclock_abi::vmclock_abi; + use crate::devices::acpi::vmclock::{VMCLOCK_SIZE, VmClock}; + use crate::snapshot::Persist; + use 
crate::test_utils::single_region_mem; + use crate::utils::u64_to_usize; + use crate::vstate::resources::ResourceAllocator; + + // We are allocating memory from the end of the system memory portion + const VMCLOCK_TEST_GUEST_ADDR: GuestAddress = + GuestAddress(arch::SYSTEM_MEM_START + arch::SYSTEM_MEM_SIZE - VMCLOCK_SIZE as u64); + + fn default_vmclock() -> VmClock { + let mut resource_allocator = ResourceAllocator::new(); + VmClock::new(&mut resource_allocator) + } + + #[test] + fn test_new_device() { + let vmclock = default_vmclock(); + let mem = single_region_mem( + u64_to_usize(arch::SYSTEM_MEM_START) + u64_to_usize(arch::SYSTEM_MEM_SIZE), + ); + + let guest_data: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); + assert_ne!(guest_data, vmclock.inner); + + vmclock.activate(&mem); + + let guest_data: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); + assert_eq!(guest_data, vmclock.inner); + } + + #[test] + fn test_device_save_restore() { + let vmclock = default_vmclock(); + let mem = single_region_mem( + u64_to_usize(arch::SYSTEM_MEM_START) + u64_to_usize(arch::SYSTEM_MEM_SIZE), + ); + + vmclock.activate(&mem).unwrap(); + let guest_data: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); + + let state = vmclock.save(); + let vmclock_new = VmClock::restore(&mem, &state).unwrap(); + + let guest_data_new: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); + assert_ne!(guest_data_new, vmclock.inner); + assert_eq!(guest_data_new, vmclock_new.inner); + assert_eq!( + vmclock.inner.disruption_marker + 1, + vmclock_new.inner.disruption_marker + ); + } +} diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 18881cd39c9..15b08116b97 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -1,6 +1,8 @@ // Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::convert::Infallible; + use acpi_tables::{Aml, aml}; use aws_lc_rs::error::Unspecified as RandError; use aws_lc_rs::rand; @@ -38,73 +40,55 @@ pub struct VmGenId { pub gsi: u32, } -#[derive(Debug, thiserror::Error, displaydoc::Display)] -pub enum VmGenIdError { - /// Error with VMGenID interrupt: {0} - Interrupt(#[from] std::io::Error), - /// Error accessing VMGenID memory: {0} - GuestMemory(#[from] GuestMemoryError), - /// Create generation ID error: {0} - GenerationId(#[from] RandError), - /// Failed to allocate requested resource: {0} - Allocator(#[from] vm_allocator::Error), -} - impl VmGenId { /// Create a new Vm Generation Id device using an address in the guest for writing the /// generation ID and a GSI for sending device notifications. - pub fn from_parts( - guest_address: GuestAddress, - gsi: u32, - mem: &GuestMemoryMmap, - ) -> Result { + pub fn from_parts(guest_address: GuestAddress, gsi: u32) -> Self { debug!( "vmgenid: building VMGenID device. Address: {:#010x}. 
IRQ: {}", guest_address.0, gsi ); - let interrupt_evt = EventFdTrigger::new(EventFd::new(libc::EFD_NONBLOCK)?); - let gen_id = Self::make_genid()?; - - // Write generation ID in guest memory - debug!( - "vmgenid: writing new generation ID to guest: {:#034x}", - gen_id + let interrupt_evt = EventFdTrigger::new( + EventFd::new(libc::EFD_NONBLOCK) + .inspect_err(|err| { + error!("vmgenid: Could not create EventFd for VMGenID device: {err}") + }) + .unwrap(), ); - mem.write_slice(&gen_id.to_le_bytes(), guest_address) - .inspect_err(|err| error!("vmgenid: could not write generation ID to guest: {err}"))?; + let gen_id = Self::make_genid(); - Ok(Self { + Self { gen_id, interrupt_evt, guest_address, gsi, - }) + } } /// Create a new VMGenID device /// /// Allocate memory and a GSI for sending notifications and build the device - pub fn new( - mem: &GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, - ) -> Result { - let gsi = resource_allocator.allocate_gsi_legacy(1)?; + pub fn new(resource_allocator: &mut ResourceAllocator) -> Self { + let gsi = resource_allocator + .allocate_gsi_legacy(1) + .inspect_err(|err| error!("vmgenid: Could not allocate GSI for VMGenID: {err}")) + .unwrap(); // The generation ID needs to live in an 8-byte aligned buffer - let addr = resource_allocator.allocate_system_memory( - VMGENID_MEM_SIZE, - 8, - vm_allocator::AllocPolicy::LastMatch, - )?; + let addr = resource_allocator + .allocate_system_memory(VMGENID_MEM_SIZE, 8, vm_allocator::AllocPolicy::LastMatch) + .inspect_err(|err| error!("vmgenid: Could not allocate guest RAM for VMGenID: {err}")) + .unwrap(); - Self::from_parts(GuestAddress(addr), gsi[0], mem) + Self::from_parts(GuestAddress(addr), gsi[0]) } // Create a 16-bytes random number - fn make_genid() -> Result { + fn make_genid() -> u128 { let mut gen_id_bytes = [0u8; 16]; rand::fill(&mut gen_id_bytes) - .inspect_err(|err| error!("vmgenid: could not create new generation ID: {err}"))?; - Ok(u128::from_le_bytes(gen_id_bytes)) 
+ .inspect_err(|err| error!("vmgenid: could not create new generation ID: {err}")) + .unwrap(); + u128::from_le_bytes(gen_id_bytes) } /// Send an ACPI notification to guest device. @@ -118,6 +102,18 @@ impl VmGenId { debug!("vmgenid: notifying guest about new generation ID"); Ok(()) } + + /// Attach the [`VmGenId`] device + pub fn activate(&self, mem: &GuestMemoryMmap) -> Result<(), GuestMemoryError> { + debug!( + "vmgenid: writing new generation ID to guest: {:#034x}", + self.gen_id + ); + mem.write_slice(&self.gen_id.to_le_bytes(), self.guest_address) + .inspect_err(|err| error!("vmgenid: could not write generation ID to guest: {err}"))?; + + Ok(()) + } } /// Logic to save/restore the state of a VMGenID device @@ -130,16 +126,10 @@ pub struct VMGenIDState { pub addr: u64, } -#[derive(Debug)] -pub struct VMGenIdConstructorArgs<'a> { - pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a mut ResourceAllocator, -} - impl<'a> Persist<'a> for VmGenId { type State = VMGenIDState; - type ConstructorArgs = VMGenIdConstructorArgs<'a>; - type Error = VmGenIdError; + type ConstructorArgs = (); + type Error = Infallible; fn save(&self) -> Self::State { VMGenIDState { @@ -148,11 +138,8 @@ impl<'a> Persist<'a> for VmGenId { } } - fn restore( - constructor_args: Self::ConstructorArgs, - state: &Self::State, - ) -> Result { - Self::from_parts(GuestAddress(state.addr), state.gsi, constructor_args.mem) + fn restore(_: Self::ConstructorArgs, state: &Self::State) -> Result { + Ok(Self::from_parts(GuestAddress(state.addr), state.gsi)) } } diff --git a/src/vmm/src/io_uring/generated.rs b/src/vmm/src/io_uring/generated.rs index 3948a02fb21..f3f306e8e5a 100644 --- a/src/vmm/src/io_uring/generated.rs +++ b/src/vmm/src/io_uring/generated.rs @@ -107,6 +107,7 @@ pub const IORING_SETUP_DEFER_TASKRUN: u32 = 8192; pub const IORING_SETUP_NO_MMAP: u32 = 16384; pub const IORING_SETUP_REGISTERED_FD_ONLY: u32 = 32768; pub const IORING_SETUP_NO_SQARRAY: u32 = 65536; +pub const 
IORING_SETUP_HYBRID_IOPOLL: u32 = 131072; pub const IORING_URING_CMD_FIXED: u32 = 1; pub const IORING_URING_CMD_MASK: u32 = 1; pub const IORING_FSYNC_DATASYNC: u32 = 1; @@ -142,6 +143,9 @@ pub const IORING_MSG_RING_CQE_SKIP: u32 = 1; pub const IORING_MSG_RING_FLAGS_PASS: u32 = 2; pub const IORING_FIXED_FD_NO_CLOEXEC: u32 = 1; pub const IORING_NOP_INJECT_RESULT: u32 = 1; +pub const IORING_NOP_FILE: u32 = 2; +pub const IORING_NOP_FIXED_FILE: u32 = 4; +pub const IORING_NOP_FIXED_BUFFER: u32 = 8; pub const IORING_CQE_F_BUFFER: u32 = 1; pub const IORING_CQE_F_MORE: u32 = 2; pub const IORING_CQE_F_SOCK_NONEMPTY: u32 = 4; @@ -164,6 +168,7 @@ pub const IORING_ENTER_SQ_WAIT: u32 = 4; pub const IORING_ENTER_EXT_ARG: u32 = 8; pub const IORING_ENTER_REGISTERED_RING: u32 = 16; pub const IORING_ENTER_ABS_TIMER: u32 = 32; +pub const IORING_ENTER_EXT_ARG_REG: u32 = 64; pub const IORING_FEAT_SINGLE_MMAP: u32 = 1; pub const IORING_FEAT_NODROP: u32 = 2; pub const IORING_FEAT_SUBMIT_STABLE: u32 = 4; @@ -779,7 +784,10 @@ pub mod io_uring_register_op { pub const IORING_UNREGISTER_NAPI: Type = 28; pub const IORING_REGISTER_CLOCK: Type = 29; pub const IORING_REGISTER_CLONE_BUFFERS: Type = 30; - pub const IORING_REGISTER_LAST: Type = 31; + pub const IORING_REGISTER_SEND_MSG_RING: Type = 31; + pub const IORING_REGISTER_RESIZE_RINGS: Type = 33; + pub const IORING_REGISTER_MEM_REGION: Type = 34; + pub const IORING_REGISTER_LAST: Type = 35; pub const IORING_REGISTER_USE_REGISTERED_RING: Type = 2147483648; } #[repr(C)] @@ -801,6 +809,60 @@ const _: () = { ["Offset of field: io_uring_files_update::fds"] [::std::mem::offset_of!(io_uring_files_update, fds) - 8usize]; }; +pub mod _bindgen_ty_1 { + pub type Type = ::std::os::raw::c_uint; + pub const IORING_MEM_REGION_TYPE_USER: Type = 1; +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct io_uring_region_desc { + pub user_addr: __u64, + pub size: __u64, + pub flags: __u32, + pub id: __u32, + pub mmap_offset: __u64, + pub 
__resv: [__u64; 4usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of io_uring_region_desc"][::std::mem::size_of::() - 64usize]; + ["Alignment of io_uring_region_desc"][::std::mem::align_of::() - 8usize]; + ["Offset of field: io_uring_region_desc::user_addr"] + [::std::mem::offset_of!(io_uring_region_desc, user_addr) - 0usize]; + ["Offset of field: io_uring_region_desc::size"] + [::std::mem::offset_of!(io_uring_region_desc, size) - 8usize]; + ["Offset of field: io_uring_region_desc::flags"] + [::std::mem::offset_of!(io_uring_region_desc, flags) - 16usize]; + ["Offset of field: io_uring_region_desc::id"] + [::std::mem::offset_of!(io_uring_region_desc, id) - 20usize]; + ["Offset of field: io_uring_region_desc::mmap_offset"] + [::std::mem::offset_of!(io_uring_region_desc, mmap_offset) - 24usize]; + ["Offset of field: io_uring_region_desc::__resv"] + [::std::mem::offset_of!(io_uring_region_desc, __resv) - 32usize]; +}; +pub mod _bindgen_ty_2 { + pub type Type = ::std::os::raw::c_uint; + pub const IORING_MEM_REGION_REG_WAIT_ARG: Type = 1; +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct io_uring_mem_region_reg { + pub region_uptr: __u64, + pub flags: __u64, + pub __resv: [__u64; 2usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of io_uring_mem_region_reg"][::std::mem::size_of::() - 32usize]; + ["Alignment of io_uring_mem_region_reg"] + [::std::mem::align_of::() - 8usize]; + ["Offset of field: io_uring_mem_region_reg::region_uptr"] + [::std::mem::offset_of!(io_uring_mem_region_reg, region_uptr) - 0usize]; + ["Offset of field: io_uring_mem_region_reg::flags"] + [::std::mem::offset_of!(io_uring_mem_region_reg, flags) - 8usize]; + ["Offset of field: io_uring_mem_region_reg::__resv"] + [::std::mem::offset_of!(io_uring_mem_region_reg, __resv) - 16usize]; +}; #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct 
io_uring_rsrc_register { @@ -989,16 +1051,20 @@ const _: () = { ["Offset of field: io_uring_clock_register::__resv"] [::std::mem::offset_of!(io_uring_clock_register, __resv) - 4usize]; }; -pub mod _bindgen_ty_1 { +pub mod _bindgen_ty_3 { pub type Type = ::std::os::raw::c_uint; pub const IORING_REGISTER_SRC_REGISTERED: Type = 1; + pub const IORING_REGISTER_DST_REPLACE: Type = 2; } #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct io_uring_clone_buffers { pub src_fd: __u32, pub flags: __u32, - pub pad: [__u32; 6usize], + pub src_off: __u32, + pub dst_off: __u32, + pub nr: __u32, + pub pad: [__u32; 3usize], } #[allow(clippy::unnecessary_operation, clippy::identity_op)] const _: () = { @@ -1009,8 +1075,14 @@ const _: () = { [::std::mem::offset_of!(io_uring_clone_buffers, src_fd) - 0usize]; ["Offset of field: io_uring_clone_buffers::flags"] [::std::mem::offset_of!(io_uring_clone_buffers, flags) - 4usize]; + ["Offset of field: io_uring_clone_buffers::src_off"] + [::std::mem::offset_of!(io_uring_clone_buffers, src_off) - 8usize]; + ["Offset of field: io_uring_clone_buffers::dst_off"] + [::std::mem::offset_of!(io_uring_clone_buffers, dst_off) - 12usize]; + ["Offset of field: io_uring_clone_buffers::nr"] + [::std::mem::offset_of!(io_uring_clone_buffers, nr) - 16usize]; ["Offset of field: io_uring_clone_buffers::pad"] - [::std::mem::offset_of!(io_uring_clone_buffers, pad) - 8usize]; + [::std::mem::offset_of!(io_uring_clone_buffers, pad) - 20usize]; }; #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq)] @@ -1170,24 +1242,42 @@ const _: () = { ["Offset of field: io_uring_buf_status::resv"] [::std::mem::offset_of!(io_uring_buf_status, resv) - 8usize]; }; +pub mod io_uring_napi_op { + pub type Type = ::std::os::raw::c_uint; + pub const IO_URING_NAPI_REGISTER_OP: Type = 0; + pub const IO_URING_NAPI_STATIC_ADD_ID: Type = 1; + pub const IO_URING_NAPI_STATIC_DEL_ID: Type = 2; +} +pub mod io_uring_napi_tracking_strategy { + pub type Type = 
::std::os::raw::c_uint; + pub const IO_URING_NAPI_TRACKING_DYNAMIC: Type = 0; + pub const IO_URING_NAPI_TRACKING_STATIC: Type = 1; + pub const IO_URING_NAPI_TRACKING_INACTIVE: Type = 255; +} #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct io_uring_napi { pub busy_poll_to: __u32, pub prefer_busy_poll: __u8, - pub pad: [__u8; 3usize], - pub resv: __u64, + pub opcode: __u8, + pub pad: [__u8; 2usize], + pub op_param: __u32, + pub resv: __u32, } #[allow(clippy::unnecessary_operation, clippy::identity_op)] const _: () = { ["Size of io_uring_napi"][::std::mem::size_of::() - 16usize]; - ["Alignment of io_uring_napi"][::std::mem::align_of::() - 8usize]; + ["Alignment of io_uring_napi"][::std::mem::align_of::() - 4usize]; ["Offset of field: io_uring_napi::busy_poll_to"] [::std::mem::offset_of!(io_uring_napi, busy_poll_to) - 0usize]; ["Offset of field: io_uring_napi::prefer_busy_poll"] [::std::mem::offset_of!(io_uring_napi, prefer_busy_poll) - 4usize]; - ["Offset of field: io_uring_napi::pad"][::std::mem::offset_of!(io_uring_napi, pad) - 5usize]; - ["Offset of field: io_uring_napi::resv"][::std::mem::offset_of!(io_uring_napi, resv) - 8usize]; + ["Offset of field: io_uring_napi::opcode"] + [::std::mem::offset_of!(io_uring_napi, opcode) - 5usize]; + ["Offset of field: io_uring_napi::pad"][::std::mem::offset_of!(io_uring_napi, pad) - 6usize]; + ["Offset of field: io_uring_napi::op_param"] + [::std::mem::offset_of!(io_uring_napi, op_param) - 8usize]; + ["Offset of field: io_uring_napi::resv"][::std::mem::offset_of!(io_uring_napi, resv) - 12usize]; }; pub mod io_uring_register_restriction_op { pub type Type = ::std::os::raw::c_uint; @@ -1197,6 +1287,40 @@ pub mod io_uring_register_restriction_op { pub const IORING_RESTRICTION_SQE_FLAGS_REQUIRED: Type = 3; pub const IORING_RESTRICTION_LAST: Type = 4; } +pub mod _bindgen_ty_4 { + pub type Type = ::std::os::raw::c_uint; + pub const IORING_REG_WAIT_TS: Type = 1; +} +#[repr(C)] +#[derive(Debug, Default, Copy, 
Clone, PartialEq)] +pub struct io_uring_reg_wait { + pub ts: __kernel_timespec, + pub min_wait_usec: __u32, + pub flags: __u32, + pub sigmask: __u64, + pub sigmask_sz: __u32, + pub pad: [__u32; 3usize], + pub pad2: [__u64; 2usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of io_uring_reg_wait"][::std::mem::size_of::() - 64usize]; + ["Alignment of io_uring_reg_wait"][::std::mem::align_of::() - 8usize]; + ["Offset of field: io_uring_reg_wait::ts"] + [::std::mem::offset_of!(io_uring_reg_wait, ts) - 0usize]; + ["Offset of field: io_uring_reg_wait::min_wait_usec"] + [::std::mem::offset_of!(io_uring_reg_wait, min_wait_usec) - 16usize]; + ["Offset of field: io_uring_reg_wait::flags"] + [::std::mem::offset_of!(io_uring_reg_wait, flags) - 20usize]; + ["Offset of field: io_uring_reg_wait::sigmask"] + [::std::mem::offset_of!(io_uring_reg_wait, sigmask) - 24usize]; + ["Offset of field: io_uring_reg_wait::sigmask_sz"] + [::std::mem::offset_of!(io_uring_reg_wait, sigmask_sz) - 32usize]; + ["Offset of field: io_uring_reg_wait::pad"] + [::std::mem::offset_of!(io_uring_reg_wait, pad) - 36usize]; + ["Offset of field: io_uring_reg_wait::pad2"] + [::std::mem::offset_of!(io_uring_reg_wait, pad2) - 48usize]; +}; #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct io_uring_getevents_arg { diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 7186b904b8a..6b88a317605 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -124,7 +124,6 @@ use std::sync::{Arc, Barrier, Mutex}; use std::time::Duration; use device_manager::DeviceManager; -use devices::acpi::vmgenid::VmGenIdError; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use seccomp::BpfProgram; use snapshot::Persist; @@ -247,8 +246,6 @@ pub enum VmmError { Vm(#[from] vstate::vm::VmError), /// Kvm error: {0} Kvm(#[from] vstate::kvm::KvmError), - /// VMGenID error: {0} - VMGenID(#[from] VmGenIdError), /// Failed 
perform action on device: {0} FindDeviceError(#[from] device_manager::FindDeviceError), /// Block: {0} diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index ee76bf6800b..cbc4beac95a 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -576,6 +576,8 @@ mod tests { use super::*; use crate::Vmm; #[cfg(target_arch = "x86_64")] + use crate::builder::tests::insert_vmclock_device; + #[cfg(target_arch = "x86_64")] use crate::builder::tests::insert_vmgenid_device; use crate::builder::tests::{ CustomBlockConfig, default_kernel_cmdline, default_vmm, insert_balloon_device, @@ -638,6 +640,8 @@ mod tests { #[cfg(target_arch = "x86_64")] insert_vmgenid_device(&mut vmm); + #[cfg(target_arch = "x86_64")] + insert_vmclock_device(&mut vmm); vmm } diff --git a/tests/conftest.py b/tests/conftest.py index fabff84a0d8..fce511596fd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -237,6 +237,14 @@ def bin_vsock_path(test_fc_session_root_path): yield vsock_helper_bin_path +@pytest.fixture(scope="session") +def bin_vmclock_path(test_fc_session_root_path): + """Build a simple util for test VMclock device""" + vmclock_helper_bin_path = os.path.join(test_fc_session_root_path, "vmclock") + build_tools.gcc_compile("host_tools/vmclock.c", vmclock_helper_bin_path) + yield vmclock_helper_bin_path + + @pytest.fixture(scope="session") def change_net_config_space_bin(test_fc_session_root_path): """Build a binary that changes the MMIO config space.""" diff --git a/tests/host_tools/vmclock-abi.h b/tests/host_tools/vmclock-abi.h new file mode 100644 index 00000000000..2d99b29ac44 --- /dev/null +++ b/tests/host_tools/vmclock-abi.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ + +/* + * This structure provides a vDSO-style clock to VM guests, exposing the + * relationship (or lack thereof) between the CPU clock (TSC, timebase, arch + * counter, etc.) and real time. 
It is designed to address the problem of + * live migration, which other clock enlightenments do not. + * + * When a guest is live migrated, this affects the clock in two ways. + * + * First, even between identical hosts the actual frequency of the underlying + * counter will change within the tolerances of its specification (typically + * ±50PPM, or 4 seconds a day). This frequency also varies over time on the + * same host, but can be tracked by NTP as it generally varies slowly. With + * live migration there is a step change in the frequency, with no warning. + * + * Second, there may be a step change in the value of the counter itself, as + * its accuracy is limited by the precision of the NTP synchronization on the + * source and destination hosts. + * + * So any calibration (NTP, PTP, etc.) which the guest has done on the source + * host before migration is invalid, and needs to be redone on the new host. + * + * In its most basic mode, this structure provides only an indication to the + * guest that live migration has occurred. This allows the guest to know that + * its clock is invalid and take remedial action. For applications that need + * reliable accurate timestamps (e.g. distributed databases), the structure + * can be mapped all the way to userspace. This allows the application to see + * directly for itself that the clock is disrupted and take appropriate + * action, even when using a vDSO-style method to get the time instead of a + * system call. + * + * In its more advanced mode, this structure can also be used to expose the + * precise relationship of the CPU counter to real time, as calibrated by the + * host. This means that userspace applications can have accurate time + * immediately after live migration, rather than having to pause operations + * and wait for NTP to recover. This mode does, of course, rely on the + * counter being reliable and consistent across CPUs. + * + * Note that this must be true UTC, never with smeared leap seconds.
If a + * guest wishes to construct a smeared clock, it can do so. Presenting a + * smeared clock through this interface would be problematic because it + * actually messes with the apparent counter *period*. A linear smearing + * of 1 ms per second would effectively tweak the counter period by 1000PPM + * at the start/end of the smearing period, while a sinusoidal smear would + * basically be impossible to represent. + * + * This structure is offered with the intent that it be adopted into the + * nascent virtio-rtc standard, as a virtio-rtc that does not address the live + * migration problem seems a little less than fit for purpose. For that + * reason, certain fields use precisely the same numeric definitions as in + * the virtio-rtc proposal. The structure can also be exposed through an ACPI + * device with the CID "VMCLOCK", modelled on the "VMGENID" device except for + * the fact that it uses a real _CRS to convey the address of the structure + * (which should be a full page, to allow for mapping directly to userspace). 
+ */ + +#ifndef __VMCLOCK_ABI_H__ +#define __VMCLOCK_ABI_H__ + +#include + +struct vmclock_abi { + /* CONSTANT FIELDS */ + __le32 magic; +#define VMCLOCK_MAGIC 0x4b4c4356 /* "VCLK" */ + __le32 size; /* Size of region containing this structure */ + __le16 version; /* 1 */ + __u8 counter_id; /* Matches VIRTIO_RTC_COUNTER_xxx except INVALID */ +#define VMCLOCK_COUNTER_ARM_VCNT 0 +#define VMCLOCK_COUNTER_X86_TSC 1 +#define VMCLOCK_COUNTER_INVALID 0xff + __u8 time_type; /* Matches VIRTIO_RTC_TYPE_xxx */ +#define VMCLOCK_TIME_UTC 0 /* Since 1970-01-01 00:00:00z */ +#define VMCLOCK_TIME_TAI 1 /* Since 1970-01-01 00:00:00z */ +#define VMCLOCK_TIME_MONOTONIC 2 /* Since undefined epoch */ +#define VMCLOCK_TIME_INVALID_SMEARED 3 /* Not supported */ +#define VMCLOCK_TIME_INVALID_MAYBE_SMEARED 4 /* Not supported */ + + /* NON-CONSTANT FIELDS PROTECTED BY SEQCOUNT LOCK */ + __le32 seq_count; /* Low bit means an update is in progress */ + /* + * This field changes to another non-repeating value when the CPU + * counter is disrupted, for example on live migration. This lets + * the guest know that it should discard any calibration it has + * performed of the counter against external sources (NTP/PTP/etc.). + */ + __le64 disruption_marker; + __le64 flags; + /* Indicates that the tai_offset_sec field is valid */ +#define VMCLOCK_FLAG_TAI_OFFSET_VALID (1 << 0) + /* + * Optionally used to notify guests of pending maintenance events. + * A guest which provides latency-sensitive services may wish to + * remove itself from service if an event is coming up. Two flags + * indicate the approximate imminence of the event. 
+ */ +#define VMCLOCK_FLAG_DISRUPTION_SOON (1 << 1) /* About a day */ +#define VMCLOCK_FLAG_DISRUPTION_IMMINENT (1 << 2) /* About an hour */ +#define VMCLOCK_FLAG_PERIOD_ESTERROR_VALID (1 << 3) +#define VMCLOCK_FLAG_PERIOD_MAXERROR_VALID (1 << 4) +#define VMCLOCK_FLAG_TIME_ESTERROR_VALID (1 << 5) +#define VMCLOCK_FLAG_TIME_MAXERROR_VALID (1 << 6) + /* + * If the MONOTONIC flag is set then (other than leap seconds) it is + * guaranteed that the time calculated according to this structure at + * any given moment shall never appear to be later than the time + * calculated via the structure at any *later* moment. + * + * In particular, a timestamp based on a counter reading taken + * immediately after setting the low bit of seq_count (and the + * associated memory barrier), using the previously-valid time and + * period fields, shall never be later than a timestamp based on + * a counter reading taken immediately before *clearing* the low + * bit again after the update, using the about-to-be-valid fields. + */ +#define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) + + __u8 pad[2]; + __u8 clock_status; +#define VMCLOCK_STATUS_UNKNOWN 0 +#define VMCLOCK_STATUS_INITIALIZING 1 +#define VMCLOCK_STATUS_SYNCHRONIZED 2 +#define VMCLOCK_STATUS_FREERUNNING 3 +#define VMCLOCK_STATUS_UNRELIABLE 4 + + /* + * The time exposed through this device is never smeared. This field + * corresponds to the 'subtype' field in virtio-rtc, which indicates + * the smearing method. However in this case it provides a *hint* to + * the guest operating system, such that *if* the guest OS wants to + * provide its users with an alternative clock which does not follow + * UTC, it may do so in a fashion consistent with the other systems + * in the nearby environment.
+ */ + __u8 leap_second_smearing_hint; /* Matches VIRTIO_RTC_SUBTYPE_xxx */ +#define VMCLOCK_SMEARING_STRICT 0 +#define VMCLOCK_SMEARING_NOON_LINEAR 1 +#define VMCLOCK_SMEARING_UTC_SLS 2 + __le16 tai_offset_sec; /* Actually two's complement signed */ + __u8 leap_indicator; + /* + * This field is based on the VIRTIO_RTC_LEAP_xxx values as defined + * in the current draft of virtio-rtc, but since smearing cannot be + * used with the shared memory device, some values are not used. + * + * The _POST_POS and _POST_NEG values allow the guest to perform + * its own smearing during the day or so after a leap second when + * such smearing may need to continue being applied for a leap + * second which is now theoretically "historical". + */ +#define VMCLOCK_LEAP_NONE 0x00 /* No known nearby leap second */ +#define VMCLOCK_LEAP_PRE_POS 0x01 /* Positive leap second at EOM */ +#define VMCLOCK_LEAP_PRE_NEG 0x02 /* Negative leap second at EOM */ +#define VMCLOCK_LEAP_POS 0x03 /* Set during 23:59:60 second */ +#define VMCLOCK_LEAP_POST_POS 0x04 +#define VMCLOCK_LEAP_POST_NEG 0x05 + + /* Bit shift for counter_period_frac_sec and its error rate */ + __u8 counter_period_shift; + /* + * Paired values of counter and UTC at a given point in time. + */ + __le64 counter_value; + /* + * Counter period, and error margin of same. The unit of these + * fields is 1/2^(64 + counter_period_shift) of a second. + */ + __le64 counter_period_frac_sec; + __le64 counter_period_esterror_rate_frac_sec; + __le64 counter_period_maxerror_rate_frac_sec; + + /* + * Time according to time_type field above. 
+ */ + __le64 time_sec; /* Seconds since time_type epoch */ + __le64 time_frac_sec; /* Units of 1/2^64 of a second */ + __le64 time_esterror_nanosec; + __le64 time_maxerror_nanosec; +}; + +#endif /* __VMCLOCK_ABI_H__ */ diff --git a/tests/host_tools/vmclock.c b/tests/host_tools/vmclock.c new file mode 100644 index 00000000000..d69304ac87c --- /dev/null +++ b/tests/host_tools/vmclock.c @@ -0,0 +1,78 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vmclock-abi.h" + +const char *VMCLOCK_DEV_PATH = "/dev/vmclock0"; + +int get_vmclock_handle(struct vmclock_abi **vmclock) +{ + int fd = open(VMCLOCK_DEV_PATH, 0); + if (fd == -1) + goto out_err; + + void *ptr = mmap(NULL, sizeof(struct vmclock_abi), PROT_READ, MAP_SHARED, fd, 0); + if (ptr == MAP_FAILED) + goto out_err_mmap; + + *vmclock = ptr; + return 0; + +out_err_mmap: + close(fd); +out_err: + return errno; +} + +#define READ_VMCLOCK_FIELD_FN(type, field) \ +type read##_##field (struct vmclock_abi *vmclock) { \ + type ret; \ + while (1) { \ + type seq = vmclock->seq_count & ~1ULL; \ + \ + /* This matches a write fence in the VMM */ \ + atomic_thread_fence(memory_order_acquire); \ + \ + ret = vmclock->field; \ + \ + /* This matches a write fence in the VMM */ \ + atomic_thread_fence(memory_order_acquire); \ + if (seq == vmclock->seq_count) \ + break; \ + } \ + \ + return ret; \ +} + +READ_VMCLOCK_FIELD_FN(uint64_t, disruption_marker); + +int main() +{ + struct vmclock_abi *vmclock; + + int err = get_vmclock_handle(&vmclock); + if (err) { + printf("Could not mmap vmclock struct: %s\n", strerror(err)); + exit(1); + } + + printf("VMCLOCK_MAGIC: 0x%x\n", vmclock->magic); + printf("VMCLOCK_SIZE: 0x%x\n", vmclock->size); + printf("VMCLOCK_VERSION: %u\n", vmclock->version); + printf("VMCLOCK_CLOCK_STATUS: %u\n", vmclock->clock_status); + 
printf("VMCLOCK_COUNTER_ID: %u\n", vmclock->counter_id); + printf("VMCLOCK_DISRUPTION_MARKER: %lu\n", read_disruption_marker(vmclock)); + + return 0; +} diff --git a/tests/integration_tests/functional/test_vmclock.py b/tests/integration_tests/functional/test_vmclock.py new file mode 100644 index 00000000000..b21acaaa1cc --- /dev/null +++ b/tests/integration_tests/functional/test_vmclock.py @@ -0,0 +1,66 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Test VMclock device emulation""" + +import platform + +import pytest + + +@pytest.fixture(scope="function") +def vm_with_vmclock(uvm_plain, bin_vmclock_path): + """Create a VM with VMclock support and the `vmclock` test binary under `/tmp/vmclock`""" + basevm = uvm_plain + basevm.spawn() + + basevm.basic_config() + basevm.add_net_iface() + basevm.start() + basevm.ssh.scp_put(bin_vmclock_path, "/tmp/vmclock") + + yield basevm + + +def parse_vmclock(vm): + """Parse the VMclock struct inside the guest and return a dictionary with its fields""" + _, stdout, _ = vm.ssh.check_output("/tmp/vmclock") + fields = stdout.strip().split("\n") + return dict(item.split(": ") for item in fields) + + +@pytest.mark.skipif( + platform.machine() != "x86_64", + reason="VMClock device is currently supported only on x86 systems", +) +def test_vmclock_fields(vm_with_vmclock): + """Make sure that we expose the expected values in the VMclock struct""" + vm = vm_with_vmclock + vmclock = parse_vmclock(vm) + + assert vmclock["VMCLOCK_MAGIC"] == "0x4b4c4356" + assert vmclock["VMCLOCK_SIZE"] == "0x1000" + assert vmclock["VMCLOCK_VERSION"] == "1" + assert vmclock["VMCLOCK_CLOCK_STATUS"] == "0" + assert vmclock["VMCLOCK_COUNTER_ID"] == "255" + assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + + +@pytest.mark.skipif( + platform.machine() != "x86_64", + reason="VMClock device is currently supported only on x86 systems", +) +def test_snapshot_update(vm_with_vmclock, 
microvm_factory, snapshot_type): + """Test that `disruption_marker` is updated upon snapshot resume""" + basevm = vm_with_vmclock + + vmclock = parse_vmclock(basevm) + assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + + snapshot = basevm.make_snapshot(snapshot_type) + basevm.kill() + + for i, vm in enumerate( + microvm_factory.build_n_from_snapshot(snapshot, 5, incremental=True) + ): + vmclock = parse_vmclock(vm) + assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == f"{i+1}" diff --git a/tools/bindgen-patches/0003-vmclock.patch b/tools/bindgen-patches/0003-vmclock.patch new file mode 100644 index 00000000000..bb3ca006c87 --- /dev/null +++ b/tools/bindgen-patches/0003-vmclock.patch @@ -0,0 +1,86 @@ +diff --git a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs +index e841ca111..134c8393f 100644 +--- a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs ++++ b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs +@@ -16,40 +16,42 @@ + clippy::redundant_static_lifetimes + )] + ++use serde::{Deserialize, Serialize}; ++ + pub const __BITS_PER_LONG: u32 = 64; + pub const __BITS_PER_LONG_LONG: u32 = 64; + pub const __FD_SETSIZE: u32 = 1024; + pub const VMCLOCK_MAGIC: u32 = 1263289174; +-pub const VMCLOCK_COUNTER_ARM_VCNT: u32 = 0; +-pub const VMCLOCK_COUNTER_X86_TSC: u32 = 1; +-pub const VMCLOCK_COUNTER_INVALID: u32 = 255; +-pub const VMCLOCK_TIME_UTC: u32 = 0; +-pub const VMCLOCK_TIME_TAI: u32 = 1; +-pub const VMCLOCK_TIME_MONOTONIC: u32 = 2; +-pub const VMCLOCK_TIME_INVALID_SMEARED: u32 = 3; +-pub const VMCLOCK_TIME_INVALID_MAYBE_SMEARED: u32 = 4; +-pub const VMCLOCK_FLAG_TAI_OFFSET_VALID: u32 = 1; +-pub const VMCLOCK_FLAG_DISRUPTION_SOON: u32 = 2; +-pub const VMCLOCK_FLAG_DISRUPTION_IMMINENT: u32 = 4; +-pub const VMCLOCK_FLAG_PERIOD_ESTERROR_VALID: u32 = 8; +-pub const VMCLOCK_FLAG_PERIOD_MAXERROR_VALID: u32 = 16; +-pub const VMCLOCK_FLAG_TIME_ESTERROR_VALID: u32 = 32; +-pub const VMCLOCK_FLAG_TIME_MAXERROR_VALID: u32 = 64; 
+-pub const VMCLOCK_FLAG_TIME_MONOTONIC: u32 = 128; +-pub const VMCLOCK_STATUS_UNKNOWN: u32 = 0; +-pub const VMCLOCK_STATUS_INITIALIZING: u32 = 1; +-pub const VMCLOCK_STATUS_SYNCHRONIZED: u32 = 2; +-pub const VMCLOCK_STATUS_FREERUNNING: u32 = 3; +-pub const VMCLOCK_STATUS_UNRELIABLE: u32 = 4; +-pub const VMCLOCK_SMEARING_STRICT: u32 = 0; +-pub const VMCLOCK_SMEARING_NOON_LINEAR: u32 = 1; +-pub const VMCLOCK_SMEARING_UTC_SLS: u32 = 2; +-pub const VMCLOCK_LEAP_NONE: u32 = 0; +-pub const VMCLOCK_LEAP_PRE_POS: u32 = 1; +-pub const VMCLOCK_LEAP_PRE_NEG: u32 = 2; +-pub const VMCLOCK_LEAP_POS: u32 = 3; +-pub const VMCLOCK_LEAP_POST_POS: u32 = 4; +-pub const VMCLOCK_LEAP_POST_NEG: u32 = 5; ++pub const VMCLOCK_COUNTER_ARM_VCNT: u8 = 0; ++pub const VMCLOCK_COUNTER_X86_TSC: u8 = 1; ++pub const VMCLOCK_COUNTER_INVALID: u8 = 255; ++pub const VMCLOCK_TIME_UTC: u8 = 0; ++pub const VMCLOCK_TIME_TAI: u8 = 1; ++pub const VMCLOCK_TIME_MONOTONIC: u8 = 2; ++pub const VMCLOCK_TIME_INVALID_SMEARED: u8 = 3; ++pub const VMCLOCK_TIME_INVALID_MAYBE_SMEARED: u8 = 4; ++pub const VMCLOCK_FLAG_TAI_OFFSET_VALID: u64 = 1; ++pub const VMCLOCK_FLAG_DISRUPTION_SOON: u64 = 2; ++pub const VMCLOCK_FLAG_DISRUPTION_IMMINENT: u64 = 4; ++pub const VMCLOCK_FLAG_PERIOD_ESTERROR_VALID: u64 = 8; ++pub const VMCLOCK_FLAG_PERIOD_MAXERROR_VALID: u64 = 16; ++pub const VMCLOCK_FLAG_TIME_ESTERROR_VALID: u64 = 32; ++pub const VMCLOCK_FLAG_TIME_MAXERROR_VALID: u64 = 64; ++pub const VMCLOCK_FLAG_TIME_MONOTONIC: u64 = 128; ++pub const VMCLOCK_STATUS_UNKNOWN: u8 = 0; ++pub const VMCLOCK_STATUS_INITIALIZING: u8 = 1; ++pub const VMCLOCK_STATUS_SYNCHRONIZED: u8 = 2; ++pub const VMCLOCK_STATUS_FREERUNNING: u8 = 3; ++pub const VMCLOCK_STATUS_UNRELIABLE: u8 = 4; ++pub const VMCLOCK_SMEARING_STRICT: u8 = 0; ++pub const VMCLOCK_SMEARING_NOON_LINEAR: u8 = 1; ++pub const VMCLOCK_SMEARING_UTC_SLS: u8 = 2; ++pub const VMCLOCK_LEAP_NONE: u8 = 0; ++pub const VMCLOCK_LEAP_PRE_POS: u8 = 1; ++pub const VMCLOCK_LEAP_PRE_NEG: u8 = 2; ++pub 
const VMCLOCK_LEAP_POS: u8 = 3; ++pub const VMCLOCK_LEAP_POST_POS: u8 = 4; ++pub const VMCLOCK_LEAP_POST_NEG: u8 = 5; + pub type __s8 = ::std::os::raw::c_schar; + pub type __u8 = ::std::os::raw::c_uchar; + pub type __s16 = ::std::os::raw::c_short; +@@ -127,7 +129,7 @@ pub type __sum16 = __u16; + pub type __wsum = __u32; + pub type __poll_t = ::std::os::raw::c_uint; + #[repr(C)] +-#[derive(Debug, Default, Copy, Clone, PartialEq)] ++#[derive(Debug, Default, Copy, Clone, PartialEq, Serialize, Deserialize)] + pub struct vmclock_abi { + pub magic: __le32, + pub size: __le32, diff --git a/tools/bindgen.sh b/tools/bindgen.sh index e2529375285..cdab5ef824a 100755 --- a/tools/bindgen.sh +++ b/tools/bindgen.sh @@ -43,7 +43,7 @@ EOF bindgen --no-doc-comments --disable-header-comment --constified-enum-module '.*' --with-derive-default --with-derive-partialeq $@ } -KERNEL_BRANCH="linux-6.12.y" +KERNEL_BRANCH="linux-6.13.y" KERNEL_DIR="./$KERNEL_BRANCH" HEADERS_DIR=$(realpath "./linux-headers") # https://www.kernel.org/doc/Documentation/kbuild/headers_install.txt @@ -166,6 +166,10 @@ fc-bindgen \ --allowlist-var "ARCH_.*" \ "$ARCH_X86_INCLUDE/uapi/asm/prctl.h" >src/vmm/src/arch/x86_64/generated/arch_prctl.rs +info "BINDGEN include/uapi/linux/vmclock-abi.h" +fc-bindgen \ + "$KERNEL_DIR/include/uapi/linux/vmclock-abi.h" > src/vmm/src/devices/acpi/generated/vmclock_abi.rs + # Apply any patches info "Apply patches" for PATCH in $(dirname $0)/bindgen-patches/*.patch; do