From ce8217db6d39d465c8b5ff332b6eb7e5657e6ce6 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 21 Oct 2025 11:33:39 +0200 Subject: [PATCH 1/5] bindgen: upgrade to linux 6.13 headers vmclock is added in Linux 6.13. Upgrade the version we are using to create Rust bindings from to this version so we can get vmclock ABI definitions. Signed-off-by: Babis Chalios --- src/firecracker/src/generated/prctl.rs | 8 + .../src/arch/x86_64/generated/msr_index.rs | 1 - src/vmm/src/io_uring/generated.rs | 142 ++++++++++++++++-- tools/bindgen.sh | 2 +- 4 files changed, 142 insertions(+), 11 deletions(-) diff --git a/src/firecracker/src/generated/prctl.rs b/src/firecracker/src/generated/prctl.rs index 909bf00f822..659c87c92a2 100644 --- a/src/firecracker/src/generated/prctl.rs +++ b/src/firecracker/src/generated/prctl.rs @@ -141,6 +141,8 @@ pub const PR_MTE_TCF_MASK: u32 = 6; pub const PR_MTE_TAG_SHIFT: u32 = 3; pub const PR_MTE_TAG_MASK: u32 = 524280; pub const PR_MTE_TCF_SHIFT: u32 = 1; +pub const PR_PMLEN_SHIFT: u32 = 24; +pub const PR_PMLEN_MASK: u32 = 2130706432; pub const PR_SET_IO_FLUSHER: u32 = 57; pub const PR_GET_IO_FLUSHER: u32 = 58; pub const PR_SET_SYSCALL_USER_DISPATCH: u32 = 59; @@ -197,3 +199,9 @@ pub const PR_PPC_DEXCR_CTRL_CLEAR: u32 = 4; pub const PR_PPC_DEXCR_CTRL_SET_ONEXEC: u32 = 8; pub const PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC: u32 = 16; pub const PR_PPC_DEXCR_CTRL_MASK: u32 = 31; +pub const PR_GET_SHADOW_STACK_STATUS: u32 = 74; +pub const PR_SET_SHADOW_STACK_STATUS: u32 = 75; +pub const PR_SHADOW_STACK_ENABLE: u32 = 1; +pub const PR_SHADOW_STACK_WRITE: u32 = 2; +pub const PR_SHADOW_STACK_PUSH: u32 = 4; +pub const PR_LOCK_SHADOW_STACK_STATUS: u32 = 76; diff --git a/src/vmm/src/arch/x86_64/generated/msr_index.rs b/src/vmm/src/arch/x86_64/generated/msr_index.rs index ccdceeac8d4..095e9fe5960 100644 --- a/src/vmm/src/arch/x86_64/generated/msr_index.rs +++ b/src/vmm/src/arch/x86_64/generated/msr_index.rs @@ -237,7 +237,6 @@ pub const MSR_AMD64_OSVW_ID_LENGTH: u32 
= 0xc0010140; pub const MSR_AMD64_OSVW_STATUS: u32 = 0xc0010141; pub const MSR_AMD_PPIN_CTL: u32 = 0xc00102f0; pub const MSR_AMD_PPIN: u32 = 0xc00102f1; -pub const MSR_AMD64_CPUID_FN_7: u32 = 0xc0011002; pub const MSR_AMD64_CPUID_FN_1: u32 = 0xc0011004; pub const MSR_AMD64_LS_CFG: u32 = 0xc0011020; pub const MSR_AMD64_DC_CFG: u32 = 0xc0011022; diff --git a/src/vmm/src/io_uring/generated.rs b/src/vmm/src/io_uring/generated.rs index 3948a02fb21..f3f306e8e5a 100644 --- a/src/vmm/src/io_uring/generated.rs +++ b/src/vmm/src/io_uring/generated.rs @@ -107,6 +107,7 @@ pub const IORING_SETUP_DEFER_TASKRUN: u32 = 8192; pub const IORING_SETUP_NO_MMAP: u32 = 16384; pub const IORING_SETUP_REGISTERED_FD_ONLY: u32 = 32768; pub const IORING_SETUP_NO_SQARRAY: u32 = 65536; +pub const IORING_SETUP_HYBRID_IOPOLL: u32 = 131072; pub const IORING_URING_CMD_FIXED: u32 = 1; pub const IORING_URING_CMD_MASK: u32 = 1; pub const IORING_FSYNC_DATASYNC: u32 = 1; @@ -142,6 +143,9 @@ pub const IORING_MSG_RING_CQE_SKIP: u32 = 1; pub const IORING_MSG_RING_FLAGS_PASS: u32 = 2; pub const IORING_FIXED_FD_NO_CLOEXEC: u32 = 1; pub const IORING_NOP_INJECT_RESULT: u32 = 1; +pub const IORING_NOP_FILE: u32 = 2; +pub const IORING_NOP_FIXED_FILE: u32 = 4; +pub const IORING_NOP_FIXED_BUFFER: u32 = 8; pub const IORING_CQE_F_BUFFER: u32 = 1; pub const IORING_CQE_F_MORE: u32 = 2; pub const IORING_CQE_F_SOCK_NONEMPTY: u32 = 4; @@ -164,6 +168,7 @@ pub const IORING_ENTER_SQ_WAIT: u32 = 4; pub const IORING_ENTER_EXT_ARG: u32 = 8; pub const IORING_ENTER_REGISTERED_RING: u32 = 16; pub const IORING_ENTER_ABS_TIMER: u32 = 32; +pub const IORING_ENTER_EXT_ARG_REG: u32 = 64; pub const IORING_FEAT_SINGLE_MMAP: u32 = 1; pub const IORING_FEAT_NODROP: u32 = 2; pub const IORING_FEAT_SUBMIT_STABLE: u32 = 4; @@ -779,7 +784,10 @@ pub mod io_uring_register_op { pub const IORING_UNREGISTER_NAPI: Type = 28; pub const IORING_REGISTER_CLOCK: Type = 29; pub const IORING_REGISTER_CLONE_BUFFERS: Type = 30; - pub const IORING_REGISTER_LAST: 
Type = 31; + pub const IORING_REGISTER_SEND_MSG_RING: Type = 31; + pub const IORING_REGISTER_RESIZE_RINGS: Type = 33; + pub const IORING_REGISTER_MEM_REGION: Type = 34; + pub const IORING_REGISTER_LAST: Type = 35; pub const IORING_REGISTER_USE_REGISTERED_RING: Type = 2147483648; } #[repr(C)] @@ -801,6 +809,60 @@ const _: () = { ["Offset of field: io_uring_files_update::fds"] [::std::mem::offset_of!(io_uring_files_update, fds) - 8usize]; }; +pub mod _bindgen_ty_1 { + pub type Type = ::std::os::raw::c_uint; + pub const IORING_MEM_REGION_TYPE_USER: Type = 1; +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct io_uring_region_desc { + pub user_addr: __u64, + pub size: __u64, + pub flags: __u32, + pub id: __u32, + pub mmap_offset: __u64, + pub __resv: [__u64; 4usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of io_uring_region_desc"][::std::mem::size_of::() - 64usize]; + ["Alignment of io_uring_region_desc"][::std::mem::align_of::() - 8usize]; + ["Offset of field: io_uring_region_desc::user_addr"] + [::std::mem::offset_of!(io_uring_region_desc, user_addr) - 0usize]; + ["Offset of field: io_uring_region_desc::size"] + [::std::mem::offset_of!(io_uring_region_desc, size) - 8usize]; + ["Offset of field: io_uring_region_desc::flags"] + [::std::mem::offset_of!(io_uring_region_desc, flags) - 16usize]; + ["Offset of field: io_uring_region_desc::id"] + [::std::mem::offset_of!(io_uring_region_desc, id) - 20usize]; + ["Offset of field: io_uring_region_desc::mmap_offset"] + [::std::mem::offset_of!(io_uring_region_desc, mmap_offset) - 24usize]; + ["Offset of field: io_uring_region_desc::__resv"] + [::std::mem::offset_of!(io_uring_region_desc, __resv) - 32usize]; +}; +pub mod _bindgen_ty_2 { + pub type Type = ::std::os::raw::c_uint; + pub const IORING_MEM_REGION_REG_WAIT_ARG: Type = 1; +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct io_uring_mem_region_reg { + pub region_uptr: 
__u64, + pub flags: __u64, + pub __resv: [__u64; 2usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of io_uring_mem_region_reg"][::std::mem::size_of::() - 32usize]; + ["Alignment of io_uring_mem_region_reg"] + [::std::mem::align_of::() - 8usize]; + ["Offset of field: io_uring_mem_region_reg::region_uptr"] + [::std::mem::offset_of!(io_uring_mem_region_reg, region_uptr) - 0usize]; + ["Offset of field: io_uring_mem_region_reg::flags"] + [::std::mem::offset_of!(io_uring_mem_region_reg, flags) - 8usize]; + ["Offset of field: io_uring_mem_region_reg::__resv"] + [::std::mem::offset_of!(io_uring_mem_region_reg, __resv) - 16usize]; +}; #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct io_uring_rsrc_register { @@ -989,16 +1051,20 @@ const _: () = { ["Offset of field: io_uring_clock_register::__resv"] [::std::mem::offset_of!(io_uring_clock_register, __resv) - 4usize]; }; -pub mod _bindgen_ty_1 { +pub mod _bindgen_ty_3 { pub type Type = ::std::os::raw::c_uint; pub const IORING_REGISTER_SRC_REGISTERED: Type = 1; + pub const IORING_REGISTER_DST_REPLACE: Type = 2; } #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct io_uring_clone_buffers { pub src_fd: __u32, pub flags: __u32, - pub pad: [__u32; 6usize], + pub src_off: __u32, + pub dst_off: __u32, + pub nr: __u32, + pub pad: [__u32; 3usize], } #[allow(clippy::unnecessary_operation, clippy::identity_op)] const _: () = { @@ -1009,8 +1075,14 @@ const _: () = { [::std::mem::offset_of!(io_uring_clone_buffers, src_fd) - 0usize]; ["Offset of field: io_uring_clone_buffers::flags"] [::std::mem::offset_of!(io_uring_clone_buffers, flags) - 4usize]; + ["Offset of field: io_uring_clone_buffers::src_off"] + [::std::mem::offset_of!(io_uring_clone_buffers, src_off) - 8usize]; + ["Offset of field: io_uring_clone_buffers::dst_off"] + [::std::mem::offset_of!(io_uring_clone_buffers, dst_off) - 12usize]; + ["Offset of field: io_uring_clone_buffers::nr"] + 
[::std::mem::offset_of!(io_uring_clone_buffers, nr) - 16usize]; ["Offset of field: io_uring_clone_buffers::pad"] - [::std::mem::offset_of!(io_uring_clone_buffers, pad) - 8usize]; + [::std::mem::offset_of!(io_uring_clone_buffers, pad) - 20usize]; }; #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq)] @@ -1170,24 +1242,42 @@ const _: () = { ["Offset of field: io_uring_buf_status::resv"] [::std::mem::offset_of!(io_uring_buf_status, resv) - 8usize]; }; +pub mod io_uring_napi_op { + pub type Type = ::std::os::raw::c_uint; + pub const IO_URING_NAPI_REGISTER_OP: Type = 0; + pub const IO_URING_NAPI_STATIC_ADD_ID: Type = 1; + pub const IO_URING_NAPI_STATIC_DEL_ID: Type = 2; +} +pub mod io_uring_napi_tracking_strategy { + pub type Type = ::std::os::raw::c_uint; + pub const IO_URING_NAPI_TRACKING_DYNAMIC: Type = 0; + pub const IO_URING_NAPI_TRACKING_STATIC: Type = 1; + pub const IO_URING_NAPI_TRACKING_INACTIVE: Type = 255; +} #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct io_uring_napi { pub busy_poll_to: __u32, pub prefer_busy_poll: __u8, - pub pad: [__u8; 3usize], - pub resv: __u64, + pub opcode: __u8, + pub pad: [__u8; 2usize], + pub op_param: __u32, + pub resv: __u32, } #[allow(clippy::unnecessary_operation, clippy::identity_op)] const _: () = { ["Size of io_uring_napi"][::std::mem::size_of::() - 16usize]; - ["Alignment of io_uring_napi"][::std::mem::align_of::() - 8usize]; + ["Alignment of io_uring_napi"][::std::mem::align_of::() - 4usize]; ["Offset of field: io_uring_napi::busy_poll_to"] [::std::mem::offset_of!(io_uring_napi, busy_poll_to) - 0usize]; ["Offset of field: io_uring_napi::prefer_busy_poll"] [::std::mem::offset_of!(io_uring_napi, prefer_busy_poll) - 4usize]; - ["Offset of field: io_uring_napi::pad"][::std::mem::offset_of!(io_uring_napi, pad) - 5usize]; - ["Offset of field: io_uring_napi::resv"][::std::mem::offset_of!(io_uring_napi, resv) - 8usize]; + ["Offset of field: io_uring_napi::opcode"] + 
[::std::mem::offset_of!(io_uring_napi, opcode) - 5usize]; + ["Offset of field: io_uring_napi::pad"][::std::mem::offset_of!(io_uring_napi, pad) - 6usize]; + ["Offset of field: io_uring_napi::op_param"] + [::std::mem::offset_of!(io_uring_napi, op_param) - 8usize]; + ["Offset of field: io_uring_napi::resv"][::std::mem::offset_of!(io_uring_napi, resv) - 12usize]; }; pub mod io_uring_register_restriction_op { pub type Type = ::std::os::raw::c_uint; @@ -1197,6 +1287,40 @@ pub mod io_uring_register_restriction_op { pub const IORING_RESTRICTION_SQE_FLAGS_REQUIRED: Type = 3; pub const IORING_RESTRICTION_LAST: Type = 4; } +pub mod _bindgen_ty_4 { + pub type Type = ::std::os::raw::c_uint; + pub const IORING_REG_WAIT_TS: Type = 1; +} +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct io_uring_reg_wait { + pub ts: __kernel_timespec, + pub min_wait_usec: __u32, + pub flags: __u32, + pub sigmask: __u64, + pub sigmask_sz: __u32, + pub pad: [__u32; 3usize], + pub pad2: [__u64; 2usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of io_uring_reg_wait"][::std::mem::size_of::() - 64usize]; + ["Alignment of io_uring_reg_wait"][::std::mem::align_of::() - 8usize]; + ["Offset of field: io_uring_reg_wait::ts"] + [::std::mem::offset_of!(io_uring_reg_wait, ts) - 0usize]; + ["Offset of field: io_uring_reg_wait::min_wait_usec"] + [::std::mem::offset_of!(io_uring_reg_wait, min_wait_usec) - 16usize]; + ["Offset of field: io_uring_reg_wait::flags"] + [::std::mem::offset_of!(io_uring_reg_wait, flags) - 20usize]; + ["Offset of field: io_uring_reg_wait::sigmask"] + [::std::mem::offset_of!(io_uring_reg_wait, sigmask) - 24usize]; + ["Offset of field: io_uring_reg_wait::sigmask_sz"] + [::std::mem::offset_of!(io_uring_reg_wait, sigmask_sz) - 32usize]; + ["Offset of field: io_uring_reg_wait::pad"] + [::std::mem::offset_of!(io_uring_reg_wait, pad) - 36usize]; + ["Offset of field: io_uring_reg_wait::pad2"] + 
[::std::mem::offset_of!(io_uring_reg_wait, pad2) - 48usize]; +}; #[repr(C)] #[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct io_uring_getevents_arg { diff --git a/tools/bindgen.sh b/tools/bindgen.sh index e2529375285..61e6f2dde21 100755 --- a/tools/bindgen.sh +++ b/tools/bindgen.sh @@ -43,7 +43,7 @@ EOF bindgen --no-doc-comments --disable-header-comment --constified-enum-module '.*' --with-derive-default --with-derive-partialeq $@ } -KERNEL_BRANCH="linux-6.12.y" +KERNEL_BRANCH="linux-6.13.y" KERNEL_DIR="./$KERNEL_BRANCH" HEADERS_DIR=$(realpath "./linux-headers") # https://www.kernel.org/doc/Documentation/kbuild/headers_install.txt From fb62e8764f2b46b2324d76de49ae3e8560e7773a Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 27 Oct 2025 13:15:18 +0100 Subject: [PATCH 2/5] acpi: simplify the construction of VMGenID device Decouple creation of VMGenID device with writing the generation ID in guest memory. This way we can avoid keeping an `Option` inside the ACPI device manager. We can always create it and only write the generation ID in guest memory once we are ready to activate the device (when we have a functioning guest memory object). Also, remove a few of the error types and fail in-place instead of propagating errors all the way up. On the Aarch64, we need to recreate the expected dtb files because default microVMs always create entries for the VMGenID device. 
Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/fdt.rs | 43 ++----- src/vmm/src/arch/aarch64/output_GICv3.dtb | Bin 2097152 -> 2097152 bytes .../src/arch/aarch64/output_initrd_GICv3.dtb | Bin 2097152 -> 2097152 bytes src/vmm/src/builder.rs | 10 +- src/vmm/src/device_manager/acpi.rs | 117 +++++++++--------- src/vmm/src/device_manager/mod.rs | 35 ++---- src/vmm/src/device_manager/persist.rs | 49 +++----- src/vmm/src/devices/acpi/vmgenid.rs | 97 +++++++-------- src/vmm/src/lib.rs | 3 - 9 files changed, 136 insertions(+), 218 deletions(-) diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index e09c0887a3a..6435eeba637 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -275,17 +275,15 @@ fn create_chosen_node( Ok(()) } -fn create_vmgenid_node(fdt: &mut FdtWriter, vmgenid: &Option) -> Result<(), FdtError> { - if let Some(vmgenid_info) = vmgenid { - let vmgenid = fdt.begin_node("vmgenid")?; - fdt.property_string("compatible", "microsoft,vmgenid")?; - fdt.property_array_u64("reg", &[vmgenid_info.guest_address.0, VMGENID_MEM_SIZE])?; - fdt.property_array_u32( - "interrupts", - &[GIC_FDT_IRQ_TYPE_SPI, vmgenid_info.gsi, IRQ_TYPE_EDGE_RISING], - )?; - fdt.end_node(vmgenid)?; - } +fn create_vmgenid_node(fdt: &mut FdtWriter, vmgenid: &VmGenId) -> Result<(), FdtError> { + let vmgenid_node = fdt.begin_node("vmgenid")?; + fdt.property_string("compatible", "microsoft,vmgenid")?; + fdt.property_array_u64("reg", &[vmgenid.guest_address.0, VMGENID_MEM_SIZE])?; + fdt.property_array_u32( + "interrupts", + &[GIC_FDT_IRQ_TYPE_SPI, vmgenid.gsi, IRQ_TYPE_EDGE_RISING], + )?; + fdt.end_node(vmgenid_node)?; Ok(()) } @@ -586,29 +584,6 @@ mod tests { .unwrap(); } - #[test] - fn test_create_fdt_with_vmgenid() { - let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - let mut device_manager = default_device_manager(); - let kvm = Kvm::new(vec![]).unwrap(); - let vm = Vm::new(&kvm).unwrap(); - let gic = create_gic(vm.fd(), 1, 
None).unwrap(); - let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); - cmdline.insert("console", "/dev/tty0").unwrap(); - - device_manager.attach_vmgenid_device(&mem, &vm).unwrap(); - - create_fdt( - &mem, - vec![0], - CString::new("console=tty0").unwrap(), - &device_manager, - &gic, - &None, - ) - .unwrap(); - } - #[test] fn test_create_fdt() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); diff --git a/src/vmm/src/arch/aarch64/output_GICv3.dtb b/src/vmm/src/arch/aarch64/output_GICv3.dtb index 35f4e9b63a35caa91b793f37e857fe3ae1c3f3aa..979cd68a285710b054e1b7a8f26e1599d41116f5 100644 GIT binary patch delta 261 zcmYkzOAY}+5XSLtdOT+s@3*;t14zsz96&<6Vu6H-bte|q;u^cwUQ)F46+|69c9BSDG*h8Q8k7&#m!n4-W8b1YDz U!V)X2vB4I*N6`K0({%Ro112~%jsO4v diff --git a/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb b/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb index fb6147ade9c26388062ccdf440ffa36addff517d..63ab6765036fccb4140b5c2ac84830bebeb976ef 100644 GIT binary patch delta 261 zcmYkzISK+n5QX7xr`@+%+_$-af#HCs@Bl`Rih&9$#tjC>V(KL(@)`!7z+5mF)V~qI zf{(7MSFN>{8ulgsx(ktTgb;xc>UyxN@^p;iUexcYq1?`=!iEDE9!e;q Tf+}jLqk$$`Xrprul^;Am6ahLv delta 171 zcmWm1%MF4+6hP58`WXBngNXRo5@1JSI~HIGlelngsKW&-%NQH60u$qPaTdpWZ#>*9 z`MyP>hln&HdbzvOG;L<{?o{bD=q;#04}JU}!6L>0LyR!S1PP`{;gDg59CH*{pu`d@ Stg*osJM5o95A~-#+Vc;Mz&We{ diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 0f9ef70813e..a1e7038181c 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -29,7 +29,6 @@ use crate::device_manager::{ AttachDeviceError, DeviceManager, DeviceManagerCreateError, DevicePersistError, DeviceRestoreArgs, }; -use crate::devices::acpi::vmgenid::VmGenIdError; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::net::Net; @@ -76,8 +75,6 @@ pub enum StartMicrovmError { /// Error creating legacy device: {0} #[cfg(target_arch = "x86_64")] CreateLegacyDevice(device_manager::legacy::LegacyDeviceError), - /// Error 
creating VMGenID device: {0} - CreateVMGenID(VmGenIdError), /// Error enabling PCIe support: {0} EnablePciDevices(#[from] PciManagerError), /// Error enabling pvtime on vcpu: {0} @@ -258,7 +255,7 @@ pub fn build_microvm_for_boot( vm_resources.serial_out_path.as_ref(), )?; - device_manager.attach_vmgenid_device(vm.guest_memory(), &vm)?; + device_manager.attach_vmgenid_device(&vm)?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { @@ -943,10 +940,7 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] pub(crate) fn insert_vmgenid_device(vmm: &mut Vmm) { - vmm.device_manager - .attach_vmgenid_device(vmm.vm.guest_memory(), &vmm.vm) - .unwrap(); - assert!(vmm.device_manager.acpi_devices.vmgenid.is_some()); + vmm.device_manager.attach_vmgenid_device(&vmm.vm).unwrap(); } pub(crate) fn insert_balloon_device( diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 874443fcc5c..0779cc14f4d 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -2,84 +2,79 @@ // SPDX-License-Identifier: Apache-2.0 use acpi_tables::{Aml, aml}; +use vm_memory::GuestMemoryError; use crate::Vm; use crate::devices::acpi::vmgenid::VmGenId; +use crate::vstate::resources::ResourceAllocator; -#[derive(Debug, Default)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum ACPIDeviceError { + /// Could not register GSI with KVM: {0} + RegisterIrq(#[from] kvm_ioctls::Error), + /// Could not write to guest memory: {0} + WriteGuestMemory(#[from] GuestMemoryError), +} + +#[derive(Debug)] pub struct ACPIDeviceManager { /// VMGenID device - pub vmgenid: Option, + pub vmgenid: VmGenId, } impl ACPIDeviceManager { /// Create a new ACPIDeviceManager object - pub fn new() -> Self { - Default::default() - } - - /// Attach a new VMGenID device to the microVM - /// - /// This will register the device's interrupt with KVM - pub fn attach_vmgenid(&mut self, vmgenid: VmGenId, vm: &Vm) -> Result<(), 
kvm_ioctls::Error> { - vm.register_irq(&vmgenid.interrupt_evt, vmgenid.gsi)?; - self.vmgenid = Some(vmgenid); - Ok(()) + pub fn new(resource_allocator: &mut ResourceAllocator) -> Self { + let vmgenid = VmGenId::new(resource_allocator); + ACPIDeviceManager { vmgenid } } - /// If it exists, notify guest VMGenID device that we have resumed from a snapshot. - pub fn notify_vmgenid(&mut self) -> Result<(), std::io::Error> { - if let Some(vmgenid) = &mut self.vmgenid { - vmgenid.notify_guest()?; - } + pub fn attach_vmgenid(&self, vm: &Vm) -> Result<(), ACPIDeviceError> { + vm.register_irq(&self.vmgenid.interrupt_evt, self.vmgenid.gsi)?; + self.vmgenid.activate(vm.guest_memory())?; Ok(()) } } impl Aml for ACPIDeviceManager { fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { - // If we have a VMGenID device, create the AML for the device and GED interrupt handler - match self.vmgenid.as_ref() { - Some(vmgenid) => { - // AML for GED - aml::Device::new( - "_SB_.GED_".try_into()?, - vec![ - &aml::Name::new("_HID".try_into()?, &"ACPI0013")?, - &aml::Name::new( - "_CRS".try_into()?, - &aml::ResourceTemplate::new(vec![&aml::Interrupt::new( - true, - true, - false, - false, - vmgenid.gsi, - )]), - )?, - &aml::Method::new( - "_EVT".try_into()?, - 1, - true, - vec![&aml::If::new( - // We know that the maximum IRQ number fits in a u8. We have up to - // 32 IRQs in x86 and up to 128 in - // ARM (look into - // `vmm::crate::arch::layout::GSI_LEGACY_END`) - #[allow(clippy::cast_possible_truncation)] - &aml::Equal::new(&aml::Arg(0), &(vmgenid.gsi as u8)), - vec![&aml::Notify::new( - &aml::Path::new("\\_SB_.VGEN")?, - &0x80usize, - )], - )], - ), - ], - ) - .append_aml_bytes(v)?; - // AML for VMGenID itself. - vmgenid.append_aml_bytes(v) - } - None => Ok(()), - } + // AML for [`VmGenId`] device. 
+ self.vmgenid.append_aml_bytes(v)?; + + // Create the AML for the GED interrupt handler + aml::Device::new( + "_SB_.GED_".try_into()?, + vec![ + &aml::Name::new("_HID".try_into()?, &"ACPI0013")?, + &aml::Name::new( + "_CRS".try_into()?, + &aml::ResourceTemplate::new(vec![&aml::Interrupt::new( + true, + true, + false, + false, + self.vmgenid.gsi, + )]), + )?, + &aml::Method::new( + "_EVT".try_into()?, + 1, + true, + vec![&aml::If::new( + // We know that the maximum IRQ number fits in a u8. We have up to + // 32 IRQs in x86 and up to 128 in + // ARM (look into + // `vmm::crate::arch::layout::GSI_LEGACY_END`) + #[allow(clippy::cast_possible_truncation)] + &aml::Equal::new(&aml::Arg(0), &(self.vmgenid.gsi as u8)), + vec![&aml::Notify::new( + &aml::Path::new("\\_SB_.VGEN")?, + &0x80usize, + )], + )], + ), + ], + ) + .append_aml_bytes(v) } } diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index ad30da5b5db..b9850c0a8e1 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -18,12 +18,12 @@ use linux_loader::loader::Cmdline; use log::{error, info}; use mmio::{MMIODeviceManager, MmioError}; use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; -use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; +use persist::MMIODevManagerConstructorArgs; use serde::{Deserialize, Serialize}; use utils::time::TimestampUs; use vmm_sys_util::eventfd::EventFd; -use crate::devices::acpi::vmgenid::{VmGenId, VmGenIdError}; +use crate::device_manager::acpi::ACPIDeviceError; #[cfg(target_arch = "x86_64")] use crate::devices::legacy::I8042Device; #[cfg(target_arch = "aarch64")] @@ -70,10 +70,8 @@ pub enum AttachDeviceError { MmioTransport(#[from] MmioError), /// Error inserting device in bus: {0} Bus(#[from] BusError), - /// Error creating VMGenID device: {0} - CreateVmGenID(#[from] VmGenIdError), - /// Error while registering VMGenID with KVM: {0} - AttachVmGenID(#[from] 
kvm_ioctls::Error), + /// Error while registering ACPI with KVM: {0} + AttachAcpiDevice(#[from] ACPIDeviceError), #[cfg(target_arch = "aarch64")] /// Cmdline error Cmdline, @@ -176,7 +174,7 @@ impl DeviceManager { mmio_devices: MMIODeviceManager::new(), #[cfg(target_arch = "x86_64")] legacy_devices, - acpi_devices: ACPIDeviceManager::new(), + acpi_devices: ACPIDeviceManager::new(&mut vm.resource_allocator()), pci_devices: PciDevices::new(), }) } @@ -234,13 +232,8 @@ impl DeviceManager { Ok(()) } - pub(crate) fn attach_vmgenid_device( - &mut self, - mem: &GuestMemoryMmap, - vm: &Vm, - ) -> Result<(), AttachDeviceError> { - let vmgenid = VmGenId::new(mem, &mut vm.resource_allocator())?; - self.acpi_devices.attach_vmgenid(vmgenid, vm)?; + pub(crate) fn attach_vmgenid_device(&mut self, vm: &Vm) -> Result<(), AttachDeviceError> { + self.acpi_devices.attach_vmgenid(vm)?; Ok(()) } @@ -394,7 +387,7 @@ pub enum DevicePersistError { /// Error restoring MMIO devices: {0} MmioRestore(#[from] persist::DevicePersistError), /// Error restoring ACPI devices: {0} - AcpiRestore(#[from] persist::ACPIDeviceManagerRestoreError), + AcpiRestore(#[from] ACPIDeviceError), /// Error restoring PCI devices: {0} PciRestore(#[from] PciManagerError), /// Error notifying VMGenID device: {0} @@ -464,12 +457,8 @@ impl<'a> Persist<'a> for DeviceManager { let mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; // Restore ACPI devices - let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { - mem: constructor_args.mem, - vm: constructor_args.vm, - }; - let mut acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; - acpi_devices.notify_vmgenid()?; + let mut acpi_devices = ACPIDeviceManager::restore(constructor_args.vm, &state.acpi_state)?; + acpi_devices.vmgenid.notify_guest()?; // Restore PCI devices let pci_ctor_args = PciDevicesConstructorArgs { @@ -542,10 +531,12 @@ pub(crate) mod tests { use super::*; #[cfg(target_arch = "aarch64")] use 
crate::builder::tests::default_vmm; + use crate::vstate::resources::ResourceAllocator; pub(crate) fn default_device_manager() -> DeviceManager { + let mut resource_allocator = ResourceAllocator::new(); let mmio_devices = MMIODeviceManager::new(); - let acpi_devices = ACPIDeviceManager::new(); + let acpi_devices = ACPIDeviceManager::new(&mut resource_allocator); let pci_devices = PciDevices::new(); #[cfg(target_arch = "x86_64")] diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 49b4115f2cc..b7f3627bb6a 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -14,7 +14,8 @@ use super::acpi::ACPIDeviceManager; use super::mmio::*; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; -use crate::devices::acpi::vmgenid::{VMGenIDState, VMGenIdConstructorArgs, VmGenId, VmGenIdError}; +use crate::device_manager::acpi::ACPIDeviceError; +use crate::devices::acpi::vmgenid::{VMGenIDState, VmGenId}; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::RTCDevice; use crate::devices::virtio::ActivateError; @@ -156,50 +157,28 @@ impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct ACPIDeviceManagerState { - vmgenid: Option, -} - -#[derive(Debug)] -pub struct ACPIDeviceManagerConstructorArgs<'a> { - pub mem: &'a GuestMemoryMmap, - pub vm: &'a Vm, -} - -#[derive(Debug, thiserror::Error, displaydoc::Display)] -pub enum ACPIDeviceManagerRestoreError { - /// Could not register device: {0} - Interrupt(#[from] kvm_ioctls::Error), - /// Could not create VMGenID device: {0} - VMGenID(#[from] VmGenIdError), + vmgenid: VMGenIDState, } impl<'a> Persist<'a> for ACPIDeviceManager { type State = ACPIDeviceManagerState; - type ConstructorArgs = ACPIDeviceManagerConstructorArgs<'a>; - type Error = ACPIDeviceManagerRestoreError; + type ConstructorArgs = &'a Vm; + type Error = ACPIDeviceError; fn save(&self) -> Self::State { 
ACPIDeviceManagerState { - vmgenid: self.vmgenid.as_ref().map(|dev| dev.save()), + vmgenid: self.vmgenid.save(), } } - fn restore( - constructor_args: Self::ConstructorArgs, - state: &Self::State, - ) -> Result { - let mut dev_manager = ACPIDeviceManager::new(); - if let Some(vmgenid_args) = &state.vmgenid { - let vmgenid = VmGenId::restore( - VMGenIdConstructorArgs { - mem: constructor_args.mem, - resource_allocator: &mut constructor_args.vm.resource_allocator(), - }, - vmgenid_args, - )?; - dev_manager.attach_vmgenid(vmgenid, constructor_args.vm)?; - } - Ok(dev_manager) + fn restore(vm: Self::ConstructorArgs, state: &Self::State) -> Result { + let acpi_devices = ACPIDeviceManager { + // Safe to unwrap() here, this will never return an error. + vmgenid: VmGenId::restore((), &state.vmgenid).unwrap(), + }; + + acpi_devices.attach_vmgenid(vm)?; + Ok(acpi_devices) } } diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 18881cd39c9..15b08116b97 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -1,6 +1,8 @@ // Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::convert::Infallible; + use acpi_tables::{Aml, aml}; use aws_lc_rs::error::Unspecified as RandError; use aws_lc_rs::rand; @@ -38,73 +40,55 @@ pub struct VmGenId { pub gsi: u32, } -#[derive(Debug, thiserror::Error, displaydoc::Display)] -pub enum VmGenIdError { - /// Error with VMGenID interrupt: {0} - Interrupt(#[from] std::io::Error), - /// Error accessing VMGenID memory: {0} - GuestMemory(#[from] GuestMemoryError), - /// Create generation ID error: {0} - GenerationId(#[from] RandError), - /// Failed to allocate requested resource: {0} - Allocator(#[from] vm_allocator::Error), -} - impl VmGenId { /// Create a new Vm Generation Id device using an address in the guest for writing the /// generation ID and a GSI for sending device notifications. 
- pub fn from_parts( - guest_address: GuestAddress, - gsi: u32, - mem: &GuestMemoryMmap, - ) -> Result { + pub fn from_parts(guest_address: GuestAddress, gsi: u32) -> Self { debug!( "vmgenid: building VMGenID device. Address: {:#010x}. IRQ: {}", guest_address.0, gsi ); - let interrupt_evt = EventFdTrigger::new(EventFd::new(libc::EFD_NONBLOCK)?); - let gen_id = Self::make_genid()?; - - // Write generation ID in guest memory - debug!( - "vmgenid: writing new generation ID to guest: {:#034x}", - gen_id + let interrupt_evt = EventFdTrigger::new( + EventFd::new(libc::EFD_NONBLOCK) + .inspect_err(|err| { + error!("vmgenid: Could not create EventFd for VMGenID device: {err}") + }) + .unwrap(), ); - mem.write_slice(&gen_id.to_le_bytes(), guest_address) - .inspect_err(|err| error!("vmgenid: could not write generation ID to guest: {err}"))?; + let gen_id = Self::make_genid(); - Ok(Self { + Self { gen_id, interrupt_evt, guest_address, gsi, - }) + } } /// Create a new VMGenID device /// /// Allocate memory and a GSI for sending notifications and build the device - pub fn new( - mem: &GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, - ) -> Result { - let gsi = resource_allocator.allocate_gsi_legacy(1)?; + pub fn new(resource_allocator: &mut ResourceAllocator) -> Self { + let gsi = resource_allocator + .allocate_gsi_legacy(1) + .inspect_err(|err| error!("vmgenid: Could not allocate GSI for VMGenID: {err}")) + .unwrap(); // The generation ID needs to live in an 8-byte aligned buffer - let addr = resource_allocator.allocate_system_memory( - VMGENID_MEM_SIZE, - 8, - vm_allocator::AllocPolicy::LastMatch, - )?; + let addr = resource_allocator + .allocate_system_memory(VMGENID_MEM_SIZE, 8, vm_allocator::AllocPolicy::LastMatch) + .inspect_err(|err| error!("vmgenid: Could not allocate guest RAM for VMGenID: {err}")) + .unwrap(); - Self::from_parts(GuestAddress(addr), gsi[0], mem) + Self::from_parts(GuestAddress(addr), gsi[0]) } // Create a 16-bytes random number - fn 
make_genid() -> Result { + fn make_genid() -> u128 { let mut gen_id_bytes = [0u8; 16]; rand::fill(&mut gen_id_bytes) - .inspect_err(|err| error!("vmgenid: could not create new generation ID: {err}"))?; - Ok(u128::from_le_bytes(gen_id_bytes)) + .inspect_err(|err| error!("vmgenid: could not create new generation ID: {err}")) + .unwrap(); + u128::from_le_bytes(gen_id_bytes) } /// Send an ACPI notification to guest device. @@ -118,6 +102,18 @@ impl VmGenId { debug!("vmgenid: notifying guest about new generation ID"); Ok(()) } + + /// Attach the [`VmGenId`] device + pub fn activate(&self, mem: &GuestMemoryMmap) -> Result<(), GuestMemoryError> { + debug!( + "vmgenid: writing new generation ID to guest: {:#034x}", + self.gen_id + ); + mem.write_slice(&self.gen_id.to_le_bytes(), self.guest_address) + .inspect_err(|err| error!("vmgenid: could not write generation ID to guest: {err}"))?; + + Ok(()) + } } /// Logic to save/restore the state of a VMGenID device @@ -130,16 +126,10 @@ pub struct VMGenIDState { pub addr: u64, } -#[derive(Debug)] -pub struct VMGenIdConstructorArgs<'a> { - pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a mut ResourceAllocator, -} - impl<'a> Persist<'a> for VmGenId { type State = VMGenIDState; - type ConstructorArgs = VMGenIdConstructorArgs<'a>; - type Error = VmGenIdError; + type ConstructorArgs = (); + type Error = Infallible; fn save(&self) -> Self::State { VMGenIDState { @@ -148,11 +138,8 @@ impl<'a> Persist<'a> for VmGenId { } } - fn restore( - constructor_args: Self::ConstructorArgs, - state: &Self::State, - ) -> Result { - Self::from_parts(GuestAddress(state.addr), state.gsi, constructor_args.mem) + fn restore(_: Self::ConstructorArgs, state: &Self::State) -> Result { + Ok(Self::from_parts(GuestAddress(state.addr), state.gsi)) } } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 7186b904b8a..6b88a317605 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -124,7 +124,6 @@ use std::sync::{Arc, Barrier, Mutex}; use 
std::time::Duration; use device_manager::DeviceManager; -use devices::acpi::vmgenid::VmGenIdError; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use seccomp::BpfProgram; use snapshot::Persist; @@ -247,8 +246,6 @@ pub enum VmmError { Vm(#[from] vstate::vm::VmError), /// Kvm error: {0} Kvm(#[from] vstate::kvm::KvmError), - /// VMGenID error: {0} - VMGenID(#[from] VmGenIdError), /// Failed perform action on device: {0} FindDeviceError(#[from] device_manager::FindDeviceError), /// Block: {0} From 1e0cbaafb1a215289c2c4f1611458836501a24a4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 21 Oct 2025 11:37:04 +0200 Subject: [PATCH 3/5] vmclock: add vmclock ABI Rust bindings Add bindings for vmclock ABI from Linux 6.13, also add the logic in tools/bindgen.sh to automate the process. Signed-off-by: Babis Chalios --- src/vmm/src/devices/acpi/generated/mod.rs | 9 + .../src/devices/acpi/generated/vmclock_abi.rs | 201 ++++++++++++++++++ src/vmm/src/devices/acpi/mod.rs | 1 + tools/bindgen-patches/0003-vmclock.patch | 86 ++++++++ tools/bindgen.sh | 4 + 5 files changed, 301 insertions(+) create mode 100644 src/vmm/src/devices/acpi/generated/mod.rs create mode 100644 src/vmm/src/devices/acpi/generated/vmclock_abi.rs create mode 100644 tools/bindgen-patches/0003-vmclock.patch diff --git a/src/vmm/src/devices/acpi/generated/mod.rs b/src/vmm/src/devices/acpi/generated/mod.rs new file mode 100644 index 00000000000..b7b60c9f800 --- /dev/null +++ b/src/vmm/src/devices/acpi/generated/mod.rs @@ -0,0 +1,9 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#![allow(clippy::all)] +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] + +pub mod vmclock_abi; diff --git a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs new file mode 100644 index 00000000000..134c8393f0c --- /dev/null +++ b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs @@ -0,0 +1,201 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +// automatically generated by tools/bindgen.sh + +#![allow( + non_camel_case_types, + non_upper_case_globals, + dead_code, + non_snake_case, + clippy::ptr_as_ptr, + clippy::undocumented_unsafe_blocks, + missing_debug_implementations, + clippy::tests_outside_test_module, + unsafe_op_in_unsafe_fn, + clippy::redundant_static_lifetimes +)] + +use serde::{Deserialize, Serialize}; + +pub const __BITS_PER_LONG: u32 = 64; +pub const __BITS_PER_LONG_LONG: u32 = 64; +pub const __FD_SETSIZE: u32 = 1024; +pub const VMCLOCK_MAGIC: u32 = 1263289174; +pub const VMCLOCK_COUNTER_ARM_VCNT: u8 = 0; +pub const VMCLOCK_COUNTER_X86_TSC: u8 = 1; +pub const VMCLOCK_COUNTER_INVALID: u8 = 255; +pub const VMCLOCK_TIME_UTC: u8 = 0; +pub const VMCLOCK_TIME_TAI: u8 = 1; +pub const VMCLOCK_TIME_MONOTONIC: u8 = 2; +pub const VMCLOCK_TIME_INVALID_SMEARED: u8 = 3; +pub const VMCLOCK_TIME_INVALID_MAYBE_SMEARED: u8 = 4; +pub const VMCLOCK_FLAG_TAI_OFFSET_VALID: u64 = 1; +pub const VMCLOCK_FLAG_DISRUPTION_SOON: u64 = 2; +pub const VMCLOCK_FLAG_DISRUPTION_IMMINENT: u64 = 4; +pub const VMCLOCK_FLAG_PERIOD_ESTERROR_VALID: u64 = 8; +pub const VMCLOCK_FLAG_PERIOD_MAXERROR_VALID: u64 = 16; +pub const VMCLOCK_FLAG_TIME_ESTERROR_VALID: u64 = 32; +pub const VMCLOCK_FLAG_TIME_MAXERROR_VALID: u64 = 64; +pub const VMCLOCK_FLAG_TIME_MONOTONIC: u64 = 128; +pub const VMCLOCK_STATUS_UNKNOWN: u8 = 0; +pub const VMCLOCK_STATUS_INITIALIZING: u8 = 1; +pub const 
VMCLOCK_STATUS_SYNCHRONIZED: u8 = 2; +pub const VMCLOCK_STATUS_FREERUNNING: u8 = 3; +pub const VMCLOCK_STATUS_UNRELIABLE: u8 = 4; +pub const VMCLOCK_SMEARING_STRICT: u8 = 0; +pub const VMCLOCK_SMEARING_NOON_LINEAR: u8 = 1; +pub const VMCLOCK_SMEARING_UTC_SLS: u8 = 2; +pub const VMCLOCK_LEAP_NONE: u8 = 0; +pub const VMCLOCK_LEAP_PRE_POS: u8 = 1; +pub const VMCLOCK_LEAP_PRE_NEG: u8 = 2; +pub const VMCLOCK_LEAP_POS: u8 = 3; +pub const VMCLOCK_LEAP_POST_POS: u8 = 4; +pub const VMCLOCK_LEAP_POST_NEG: u8 = 5; +pub type __s8 = ::std::os::raw::c_schar; +pub type __u8 = ::std::os::raw::c_uchar; +pub type __s16 = ::std::os::raw::c_short; +pub type __u16 = ::std::os::raw::c_ushort; +pub type __s32 = ::std::os::raw::c_int; +pub type __u32 = ::std::os::raw::c_uint; +pub type __s64 = ::std::os::raw::c_longlong; +pub type __u64 = ::std::os::raw::c_ulonglong; +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct __kernel_fd_set { + pub fds_bits: [::std::os::raw::c_ulong; 16usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of __kernel_fd_set"][::std::mem::size_of::<__kernel_fd_set>() - 128usize]; + ["Alignment of __kernel_fd_set"][::std::mem::align_of::<__kernel_fd_set>() - 8usize]; + ["Offset of field: __kernel_fd_set::fds_bits"] + [::std::mem::offset_of!(__kernel_fd_set, fds_bits) - 0usize]; +}; +pub type __kernel_sighandler_t = + ::std::option::Option<unsafe extern "C" fn(arg1: ::std::os::raw::c_int)>; +pub type __kernel_key_t = ::std::os::raw::c_int; +pub type __kernel_mqd_t = ::std::os::raw::c_int; +pub type __kernel_old_uid_t = ::std::os::raw::c_ushort; +pub type __kernel_old_gid_t = ::std::os::raw::c_ushort; +pub type __kernel_old_dev_t = ::std::os::raw::c_ulong; +pub type __kernel_long_t = ::std::os::raw::c_long; +pub type __kernel_ulong_t = ::std::os::raw::c_ulong; +pub type __kernel_ino_t = __kernel_ulong_t; +pub type __kernel_mode_t = ::std::os::raw::c_uint; +pub type __kernel_pid_t = ::std::os::raw::c_int; +pub type __kernel_ipc_pid_t = 
::std::os::raw::c_int; +pub type __kernel_uid_t = ::std::os::raw::c_uint; +pub type __kernel_gid_t = ::std::os::raw::c_uint; +pub type __kernel_suseconds_t = __kernel_long_t; +pub type __kernel_daddr_t = ::std::os::raw::c_int; +pub type __kernel_uid32_t = ::std::os::raw::c_uint; +pub type __kernel_gid32_t = ::std::os::raw::c_uint; +pub type __kernel_size_t = __kernel_ulong_t; +pub type __kernel_ssize_t = __kernel_long_t; +pub type __kernel_ptrdiff_t = __kernel_long_t; +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub struct __kernel_fsid_t { + pub val: [::std::os::raw::c_int; 2usize], +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of __kernel_fsid_t"][::std::mem::size_of::<__kernel_fsid_t>() - 8usize]; + ["Alignment of __kernel_fsid_t"][::std::mem::align_of::<__kernel_fsid_t>() - 4usize]; + ["Offset of field: __kernel_fsid_t::val"] + [::std::mem::offset_of!(__kernel_fsid_t, val) - 0usize]; +}; +pub type __kernel_off_t = __kernel_long_t; +pub type __kernel_loff_t = ::std::os::raw::c_longlong; +pub type __kernel_old_time_t = __kernel_long_t; +pub type __kernel_time_t = __kernel_long_t; +pub type __kernel_time64_t = ::std::os::raw::c_longlong; +pub type __kernel_clock_t = __kernel_long_t; +pub type __kernel_timer_t = ::std::os::raw::c_int; +pub type __kernel_clockid_t = ::std::os::raw::c_int; +pub type __kernel_caddr_t = *mut ::std::os::raw::c_char; +pub type __kernel_uid16_t = ::std::os::raw::c_ushort; +pub type __kernel_gid16_t = ::std::os::raw::c_ushort; +pub type __s128 = i128; +pub type __u128 = u128; +pub type __le16 = __u16; +pub type __be16 = __u16; +pub type __le32 = __u32; +pub type __be32 = __u32; +pub type __le64 = __u64; +pub type __be64 = __u64; +pub type __sum16 = __u16; +pub type __wsum = __u32; +pub type __poll_t = ::std::os::raw::c_uint; +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq, Serialize, Deserialize)] +pub struct vmclock_abi { + pub magic: __le32, + pub size: __le32, 
+ pub version: __le16, + pub counter_id: __u8, + pub time_type: __u8, + pub seq_count: __le32, + pub disruption_marker: __le64, + pub flags: __le64, + pub pad: [__u8; 2usize], + pub clock_status: __u8, + pub leap_second_smearing_hint: __u8, + pub tai_offset_sec: __le16, + pub leap_indicator: __u8, + pub counter_period_shift: __u8, + pub counter_value: __le64, + pub counter_period_frac_sec: __le64, + pub counter_period_esterror_rate_frac_sec: __le64, + pub counter_period_maxerror_rate_frac_sec: __le64, + pub time_sec: __le64, + pub time_frac_sec: __le64, + pub time_esterror_nanosec: __le64, + pub time_maxerror_nanosec: __le64, +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of vmclock_abi"][::std::mem::size_of::<vmclock_abi>() - 104usize]; + ["Alignment of vmclock_abi"][::std::mem::align_of::<vmclock_abi>() - 8usize]; + ["Offset of field: vmclock_abi::magic"][::std::mem::offset_of!(vmclock_abi, magic) - 0usize]; + ["Offset of field: vmclock_abi::size"][::std::mem::offset_of!(vmclock_abi, size) - 4usize]; + ["Offset of field: vmclock_abi::version"] + [::std::mem::offset_of!(vmclock_abi, version) - 8usize]; + ["Offset of field: vmclock_abi::counter_id"] + [::std::mem::offset_of!(vmclock_abi, counter_id) - 10usize]; + ["Offset of field: vmclock_abi::time_type"] + [::std::mem::offset_of!(vmclock_abi, time_type) - 11usize]; + ["Offset of field: vmclock_abi::seq_count"] + [::std::mem::offset_of!(vmclock_abi, seq_count) - 12usize]; + ["Offset of field: vmclock_abi::disruption_marker"] + [::std::mem::offset_of!(vmclock_abi, disruption_marker) - 16usize]; + ["Offset of field: vmclock_abi::flags"][::std::mem::offset_of!(vmclock_abi, flags) - 24usize]; + ["Offset of field: vmclock_abi::pad"][::std::mem::offset_of!(vmclock_abi, pad) - 32usize]; + ["Offset of field: vmclock_abi::clock_status"] + [::std::mem::offset_of!(vmclock_abi, clock_status) - 34usize]; + ["Offset of field: vmclock_abi::leap_second_smearing_hint"] + [::std::mem::offset_of!(vmclock_abi, 
leap_second_smearing_hint) - 35usize]; + ["Offset of field: vmclock_abi::tai_offset_sec"] + [::std::mem::offset_of!(vmclock_abi, tai_offset_sec) - 36usize]; + ["Offset of field: vmclock_abi::leap_indicator"] + [::std::mem::offset_of!(vmclock_abi, leap_indicator) - 38usize]; + ["Offset of field: vmclock_abi::counter_period_shift"] + [::std::mem::offset_of!(vmclock_abi, counter_period_shift) - 39usize]; + ["Offset of field: vmclock_abi::counter_value"] + [::std::mem::offset_of!(vmclock_abi, counter_value) - 40usize]; + ["Offset of field: vmclock_abi::counter_period_frac_sec"] + [::std::mem::offset_of!(vmclock_abi, counter_period_frac_sec) - 48usize]; + ["Offset of field: vmclock_abi::counter_period_esterror_rate_frac_sec"] + [::std::mem::offset_of!(vmclock_abi, counter_period_esterror_rate_frac_sec) - 56usize]; + ["Offset of field: vmclock_abi::counter_period_maxerror_rate_frac_sec"] + [::std::mem::offset_of!(vmclock_abi, counter_period_maxerror_rate_frac_sec) - 64usize]; + ["Offset of field: vmclock_abi::time_sec"] + [::std::mem::offset_of!(vmclock_abi, time_sec) - 72usize]; + ["Offset of field: vmclock_abi::time_frac_sec"] + [::std::mem::offset_of!(vmclock_abi, time_frac_sec) - 80usize]; + ["Offset of field: vmclock_abi::time_esterror_nanosec"] + [::std::mem::offset_of!(vmclock_abi, time_esterror_nanosec) - 88usize]; + ["Offset of field: vmclock_abi::time_maxerror_nanosec"] + [::std::mem::offset_of!(vmclock_abi, time_maxerror_nanosec) - 96usize]; +}; diff --git a/src/vmm/src/devices/acpi/mod.rs b/src/vmm/src/devices/acpi/mod.rs index 5151bddd231..7c70f96380d 100644 --- a/src/vmm/src/devices/acpi/mod.rs +++ b/src/vmm/src/devices/acpi/mod.rs @@ -1,4 +1,5 @@ // Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +mod generated; pub mod vmgenid; diff --git a/tools/bindgen-patches/0003-vmclock.patch b/tools/bindgen-patches/0003-vmclock.patch new file mode 100644 index 00000000000..bb3ca006c87 --- /dev/null +++ b/tools/bindgen-patches/0003-vmclock.patch @@ -0,0 +1,86 @@ +diff --git a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs +index e841ca111..134c8393f 100644 +--- a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs ++++ b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs +@@ -16,40 +16,42 @@ + clippy::redundant_static_lifetimes + )] + ++use serde::{Deserialize, Serialize}; ++ + pub const __BITS_PER_LONG: u32 = 64; + pub const __BITS_PER_LONG_LONG: u32 = 64; + pub const __FD_SETSIZE: u32 = 1024; + pub const VMCLOCK_MAGIC: u32 = 1263289174; +-pub const VMCLOCK_COUNTER_ARM_VCNT: u32 = 0; +-pub const VMCLOCK_COUNTER_X86_TSC: u32 = 1; +-pub const VMCLOCK_COUNTER_INVALID: u32 = 255; +-pub const VMCLOCK_TIME_UTC: u32 = 0; +-pub const VMCLOCK_TIME_TAI: u32 = 1; +-pub const VMCLOCK_TIME_MONOTONIC: u32 = 2; +-pub const VMCLOCK_TIME_INVALID_SMEARED: u32 = 3; +-pub const VMCLOCK_TIME_INVALID_MAYBE_SMEARED: u32 = 4; +-pub const VMCLOCK_FLAG_TAI_OFFSET_VALID: u32 = 1; +-pub const VMCLOCK_FLAG_DISRUPTION_SOON: u32 = 2; +-pub const VMCLOCK_FLAG_DISRUPTION_IMMINENT: u32 = 4; +-pub const VMCLOCK_FLAG_PERIOD_ESTERROR_VALID: u32 = 8; +-pub const VMCLOCK_FLAG_PERIOD_MAXERROR_VALID: u32 = 16; +-pub const VMCLOCK_FLAG_TIME_ESTERROR_VALID: u32 = 32; +-pub const VMCLOCK_FLAG_TIME_MAXERROR_VALID: u32 = 64; +-pub const VMCLOCK_FLAG_TIME_MONOTONIC: u32 = 128; +-pub const VMCLOCK_STATUS_UNKNOWN: u32 = 0; +-pub const VMCLOCK_STATUS_INITIALIZING: u32 = 1; +-pub const VMCLOCK_STATUS_SYNCHRONIZED: u32 = 2; +-pub const VMCLOCK_STATUS_FREERUNNING: u32 = 3; +-pub const VMCLOCK_STATUS_UNRELIABLE: u32 = 4; +-pub const VMCLOCK_SMEARING_STRICT: u32 = 0; +-pub const VMCLOCK_SMEARING_NOON_LINEAR: u32 = 1; +-pub const 
VMCLOCK_SMEARING_UTC_SLS: u32 = 2; +-pub const VMCLOCK_LEAP_NONE: u32 = 0; +-pub const VMCLOCK_LEAP_PRE_POS: u32 = 1; +-pub const VMCLOCK_LEAP_PRE_NEG: u32 = 2; +-pub const VMCLOCK_LEAP_POS: u32 = 3; +-pub const VMCLOCK_LEAP_POST_POS: u32 = 4; +-pub const VMCLOCK_LEAP_POST_NEG: u32 = 5; ++pub const VMCLOCK_COUNTER_ARM_VCNT: u8 = 0; ++pub const VMCLOCK_COUNTER_X86_TSC: u8 = 1; ++pub const VMCLOCK_COUNTER_INVALID: u8 = 255; ++pub const VMCLOCK_TIME_UTC: u8 = 0; ++pub const VMCLOCK_TIME_TAI: u8 = 1; ++pub const VMCLOCK_TIME_MONOTONIC: u8 = 2; ++pub const VMCLOCK_TIME_INVALID_SMEARED: u8 = 3; ++pub const VMCLOCK_TIME_INVALID_MAYBE_SMEARED: u8 = 4; ++pub const VMCLOCK_FLAG_TAI_OFFSET_VALID: u64 = 1; ++pub const VMCLOCK_FLAG_DISRUPTION_SOON: u64 = 2; ++pub const VMCLOCK_FLAG_DISRUPTION_IMMINENT: u64 = 4; ++pub const VMCLOCK_FLAG_PERIOD_ESTERROR_VALID: u64 = 8; ++pub const VMCLOCK_FLAG_PERIOD_MAXERROR_VALID: u64 = 16; ++pub const VMCLOCK_FLAG_TIME_ESTERROR_VALID: u64 = 32; ++pub const VMCLOCK_FLAG_TIME_MAXERROR_VALID: u64 = 64; ++pub const VMCLOCK_FLAG_TIME_MONOTONIC: u64 = 128; ++pub const VMCLOCK_STATUS_UNKNOWN: u8 = 0; ++pub const VMCLOCK_STATUS_INITIALIZING: u8 = 1; ++pub const VMCLOCK_STATUS_SYNCHRONIZED: u8 = 2; ++pub const VMCLOCK_STATUS_FREERUNNING: u8 = 3; ++pub const VMCLOCK_STATUS_UNRELIABLE: u8 = 4; ++pub const VMCLOCK_SMEARING_STRICT: u8 = 0; ++pub const VMCLOCK_SMEARING_NOON_LINEAR: u8 = 1; ++pub const VMCLOCK_SMEARING_UTC_SLS: u8 = 2; ++pub const VMCLOCK_LEAP_NONE: u8 = 0; ++pub const VMCLOCK_LEAP_PRE_POS: u8 = 1; ++pub const VMCLOCK_LEAP_PRE_NEG: u8 = 2; ++pub const VMCLOCK_LEAP_POS: u8 = 3; ++pub const VMCLOCK_LEAP_POST_POS: u8 = 4; ++pub const VMCLOCK_LEAP_POST_NEG: u8 = 5; + pub type __s8 = ::std::os::raw::c_schar; + pub type __u8 = ::std::os::raw::c_uchar; + pub type __s16 = ::std::os::raw::c_short; +@@ -127,7 +129,7 @@ pub type __sum16 = __u16; + pub type __wsum = __u32; + pub type __poll_t = ::std::os::raw::c_uint; + #[repr(C)] +-#[derive(Debug, 
Default, Copy, Clone, PartialEq)] ++#[derive(Debug, Default, Copy, Clone, PartialEq, Serialize, Deserialize)] + pub struct vmclock_abi { + pub magic: __le32, + pub size: __le32, diff --git a/tools/bindgen.sh b/tools/bindgen.sh index 61e6f2dde21..cdab5ef824a 100755 --- a/tools/bindgen.sh +++ b/tools/bindgen.sh @@ -166,6 +166,10 @@ fc-bindgen \ --allowlist-var "ARCH_.*" \ "$ARCH_X86_INCLUDE/uapi/asm/prctl.h" >src/vmm/src/arch/x86_64/generated/arch_prctl.rs +info "BINDGEN include/uapi/linux/vmclock-abi.h" +fc-bindgen \ + "$KERNEL_DIR/include/uapi/linux/vmclock-abi.h" > src/vmm/src/devices/acpi/generated/vmclock_abi.rs + # Apply any patches info "Apply patches" for PATCH in $(dirname $0)/bindgen-patches/*.patch; do From 96b13c6155ce893ad59e5b5709f5f9d846983421 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 21 Oct 2025 14:45:42 +0200 Subject: [PATCH 4/5] vmclock: add support for VMclock device for x86_64 Implement the VMClock device on x86_64 platforms. At the moment, we just allocate the memory region in the guest address space for exposing the device. We don't expose any clock from the host and since we don't support live migration, the device won't do anything at the moment, but we should still be able to see a `/dev/vmclock` inside the guest. We do support the `disruption_marker` field which notifies the guest to adjust clocks due to a time shifting event. 
Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 7 + src/vmm/src/device_manager/acpi.rs | 21 ++- src/vmm/src/device_manager/mod.rs | 6 + src/vmm/src/device_manager/persist.rs | 9 + src/vmm/src/devices/acpi/mod.rs | 1 + src/vmm/src/devices/acpi/vmclock.rs | 233 ++++++++++++++++++++++++++ src/vmm/src/persist.rs | 4 + 7 files changed, 279 insertions(+), 2 deletions(-) create mode 100644 src/vmm/src/devices/acpi/vmclock.rs diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index a1e7038181c..0a602b68871 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -256,6 +256,8 @@ pub fn build_microvm_for_boot( )?; device_manager.attach_vmgenid_device(&vm)?; + #[cfg(target_arch = "x86_64")] + device_manager.attach_vmclock_device(&vm)?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { @@ -943,6 +945,11 @@ pub(crate) mod tests { vmm.device_manager.attach_vmgenid_device(&vmm.vm).unwrap(); } + #[cfg(target_arch = "x86_64")] + pub(crate) fn insert_vmclock_device(vmm: &mut Vmm) { + vmm.device_manager.attach_vmclock_device(&vmm.vm).unwrap(); + } + pub(crate) fn insert_balloon_device( vmm: &mut Vmm, cmdline: &mut Cmdline, diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 0779cc14f4d..9764143b5a9 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -5,6 +5,8 @@ use acpi_tables::{Aml, aml}; use vm_memory::GuestMemoryError; use crate::Vm; +#[cfg(target_arch = "x86_64")] +use crate::devices::acpi::vmclock::VmClock; use crate::devices::acpi::vmgenid::VmGenId; use crate::vstate::resources::ResourceAllocator; @@ -20,13 +22,19 @@ pub enum ACPIDeviceError { pub struct ACPIDeviceManager { /// VMGenID device pub vmgenid: VmGenId, + /// VMclock device + #[cfg(target_arch = "x86_64")] + pub vmclock: VmClock, } impl ACPIDeviceManager { /// Create a new ACPIDeviceManager object pub fn new(resource_allocator: &mut ResourceAllocator) -> Self { - let vmgenid = 
VmGenId::new(resource_allocator); - ACPIDeviceManager { vmgenid } + ACPIDeviceManager { + vmgenid: VmGenId::new(resource_allocator), + #[cfg(target_arch = "x86_64")] + vmclock: VmClock::new(resource_allocator), + } } pub fn attach_vmgenid(&self, vm: &Vm) -> Result<(), ACPIDeviceError> { @@ -34,12 +42,21 @@ impl ACPIDeviceManager { self.vmgenid.activate(vm.guest_memory())?; Ok(()) } + + #[cfg(target_arch = "x86_64")] + pub fn attach_vmclock(&self, vm: &Vm) -> Result<(), ACPIDeviceError> { + self.vmclock.activate(vm.guest_memory())?; + Ok(()) + } } impl Aml for ACPIDeviceManager { fn append_aml_bytes(&self, v: &mut Vec<u8>) -> Result<(), aml::AmlError> { // AML for [`VmGenId`] device. self.vmgenid.append_aml_bytes(v)?; + // AML for [`VmClock`] device. + #[cfg(target_arch = "x86_64")] + self.vmclock.append_aml_bytes(v)?; // Create the AML for the GED interrupt handler aml::Device::new( diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index b9850c0a8e1..fc245e05539 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -237,6 +237,12 @@ impl DeviceManager { Ok(()) } + #[cfg(target_arch = "x86_64")] + pub(crate) fn attach_vmclock_device(&mut self, vm: &Vm) -> Result<(), AttachDeviceError> { + self.acpi_devices.attach_vmclock(vm)?; + Ok(()) + } + #[cfg(target_arch = "aarch64")] pub(crate) fn attach_legacy_devices_aarch64( &mut self, diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index b7f3627bb6a..fa83aae9e37 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -15,6 +15,8 @@ use super::mmio::*; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; use crate::device_manager::acpi::ACPIDeviceError; +#[cfg(target_arch = "x86_64")] +use crate::devices::acpi::vmclock::{VmClock, VmClockState}; use crate::devices::acpi::vmgenid::{VMGenIDState, VmGenId}; #[cfg(target_arch = "aarch64")] use 
crate::devices::legacy::RTCDevice; @@ -158,6 +160,8 @@ impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct ACPIDeviceManagerState { vmgenid: VMGenIDState, + #[cfg(target_arch = "x86_64")] + vmclock: VmClockState, } impl<'a> Persist<'a> for ACPIDeviceManager { @@ -168,6 +172,8 @@ impl<'a> Persist<'a> for ACPIDeviceManager { fn save(&self) -> Self::State { ACPIDeviceManagerState { vmgenid: self.vmgenid.save(), + #[cfg(target_arch = "x86_64")] + vmclock: self.vmclock.save(), } } @@ -175,6 +181,9 @@ impl<'a> Persist<'a> for ACPIDeviceManager { let acpi_devices = ACPIDeviceManager { // Safe to unwrap() here, this will never return an error. vmgenid: VmGenId::restore((), &state.vmgenid).unwrap(), + // Safe to unwrap() here, this will never return an error. + #[cfg(target_arch = "x86_64")] + vmclock: VmClock::restore(vm.guest_memory(), &state.vmclock).unwrap(), }; acpi_devices.attach_vmgenid(vm)?; diff --git a/src/vmm/src/devices/acpi/mod.rs b/src/vmm/src/devices/acpi/mod.rs index 7c70f96380d..8eba26ac41d 100644 --- a/src/vmm/src/devices/acpi/mod.rs +++ b/src/vmm/src/devices/acpi/mod.rs @@ -2,4 +2,5 @@ // SPDX-License-Identifier: Apache-2.0 mod generated; +pub mod vmclock; pub mod vmgenid; diff --git a/src/vmm/src/devices/acpi/vmclock.rs b/src/vmm/src/devices/acpi/vmclock.rs new file mode 100644 index 00000000000..9e12a81e287 --- /dev/null +++ b/src/vmm/src/devices/acpi/vmclock.rs @@ -0,0 +1,233 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use std::convert::Infallible; +use std::mem::offset_of; +use std::sync::atomic::{Ordering, fence}; + +use acpi_tables::{Aml, aml}; +use log::error; +use serde::{Deserialize, Serialize}; +use vm_allocator::AllocPolicy; +use vm_memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryError}; + +use crate::devices::acpi::generated::vmclock_abi::{ + VMCLOCK_COUNTER_INVALID, VMCLOCK_MAGIC, VMCLOCK_STATUS_UNKNOWN, vmclock_abi, +}; +use crate::snapshot::Persist; +use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::resources::ResourceAllocator; + +// SAFETY: `vmclock_abi` is a POD +unsafe impl ByteValued for vmclock_abi {} + +// We are reserving a physical page to expose the [`VmClock`] data +const VMCLOCK_SIZE: u32 = 0x1000; + +// Write a value in `vmclock_abi` both in the Firecracker-managed state +// and inside guest memory address that corresponds to it. +macro_rules! write_vmclock_field { + ($vmclock:expr, $mem:expr, $field:ident, $value:expr) => { + $vmclock.inner.$field = $value; + $mem.write_obj( + $vmclock.inner.$field, + $vmclock + .guest_address + .unchecked_add(offset_of!(vmclock_abi, $field) as u64), + ); + }; +} + +/// VMclock device +/// +/// This device emulates the VMclock device which allows passing information to the guest related +/// to the relation of the host CPU to real-time clock as well as information about disruptive +/// events, such as live-migration. 
+#[derive(Debug)] +pub struct VmClock { + /// Guest address in which we will write the VMclock struct + pub guest_address: GuestAddress, + /// The [`VmClock`] state we are exposing to the guest + inner: vmclock_abi, +} + +impl VmClock { + /// Create a new [`VmClock`] device for a newly booted VM + pub fn new(resource_allocator: &mut ResourceAllocator) -> VmClock { + let addr = resource_allocator + .allocate_system_memory( + VMCLOCK_SIZE as u64, + VMCLOCK_SIZE as u64, + AllocPolicy::LastMatch, + ) + .inspect_err(|err| error!("vmclock: could not allocate guest memory for device: {err}")) + .unwrap(); + + let mut inner = vmclock_abi { + magic: VMCLOCK_MAGIC, + size: VMCLOCK_SIZE, + version: u16::to_le(1), + clock_status: VMCLOCK_STATUS_UNKNOWN, + counter_id: VMCLOCK_COUNTER_INVALID, + ..Default::default() + }; + + VmClock { + guest_address: GuestAddress(addr), + inner, + } + } + + /// Activate [`VmClock`] device + pub fn activate(&self, mem: &GuestMemoryMmap) -> Result<(), GuestMemoryError> { + mem.write_slice(self.inner.as_slice(), self.guest_address)?; + Ok(()) + } + + /// Bump the `disruption_marker`, guarded by `seq_count`, after loading a snapshot + pub fn post_load_update(&mut self, mem: &GuestMemoryMmap) { + write_vmclock_field!(self, mem, seq_count, self.inner.seq_count | 1); + + // This fence ensures guest sees all previous writes. It is matched to a + // read barrier in the guest. + fence(Ordering::Release); + + write_vmclock_field!( + self, + mem, + disruption_marker, + self.inner.disruption_marker.wrapping_add(1) + ); + + // This fence ensures guest sees the `disruption_marker` update. It is matched to a + // read barrier in the guest. 
+ fence(Ordering::Release); + + write_vmclock_field!(self, mem, seq_count, self.inner.seq_count.wrapping_add(1)); + } +} + +/// (De)serialize-able state of the [`VmClock`] +/// +/// We could avoid this and reuse [`VmClock`] itself if `GuestAddress` was `Serialize`/`Deserialize` +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct VmClockState { + /// Guest address in which we write the [`VmClock`] info + pub guest_address: u64, + /// Data we expose to the guest + pub inner: vmclock_abi, +} + +impl<'a> Persist<'a> for VmClock { + type State = VmClockState; + type ConstructorArgs = &'a GuestMemoryMmap; + type Error = Infallible; + + fn save(&self) -> Self::State { + VmClockState { + guest_address: self.guest_address.0, + inner: self.inner, + } + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> Result<Self, Self::Error> { + let mut vmclock = VmClock { + guest_address: GuestAddress(state.guest_address), + inner: state.inner, + }; + vmclock.post_load_update(constructor_args); + Ok(vmclock) + } +} + +impl Aml for VmClock { + fn append_aml_bytes(&self, v: &mut Vec<u8>) -> Result<(), aml::AmlError> { + aml::Device::new( + "_SB_.VCLK".try_into()?, + vec![ + &aml::Name::new("_HID".try_into()?, &"AMZNC10C")?, + &aml::Name::new("_CID".try_into()?, &"VMCLOCK")?, + &aml::Name::new("_DDN".try_into()?, &"VMCLOCK")?, + &aml::Method::new( + "_STA".try_into()?, + 0, + false, + vec![&aml::Return::new(&0x0fu8)], + ), + &aml::Name::new( + "_CRS".try_into()?, + &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::Cacheable, + false, + self.guest_address.0, + self.guest_address.0 + VMCLOCK_SIZE as u64 - 1, + )?]), + )?, + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(test)] +mod tests { + use vm_memory::{Bytes, GuestAddress}; + + use crate::arch; + use crate::devices::acpi::generated::vmclock_abi::vmclock_abi; + use crate::devices::acpi::vmclock::{VMCLOCK_SIZE, VmClock}; + use crate::snapshot::Persist; + use 
crate::test_utils::single_region_mem; + use crate::utils::u64_to_usize; + use crate::vstate::resources::ResourceAllocator; + + // We are allocating memory from the end of the system memory portion + const VMCLOCK_TEST_GUEST_ADDR: GuestAddress = + GuestAddress(arch::SYSTEM_MEM_START + arch::SYSTEM_MEM_SIZE - VMCLOCK_SIZE as u64); + + fn default_vmclock() -> VmClock { + let mut resource_allocator = ResourceAllocator::new(); + VmClock::new(&mut resource_allocator) + } + + #[test] + fn test_new_device() { + let vmclock = default_vmclock(); + let mem = single_region_mem( + u64_to_usize(arch::SYSTEM_MEM_START) + u64_to_usize(arch::SYSTEM_MEM_SIZE), + ); + + let guest_data: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); + assert_ne!(guest_data, vmclock.inner); + + vmclock.activate(&mem); + + let guest_data: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); + assert_eq!(guest_data, vmclock.inner); + } + + #[test] + fn test_device_save_restore() { + let vmclock = default_vmclock(); + let mem = single_region_mem( + u64_to_usize(arch::SYSTEM_MEM_START) + u64_to_usize(arch::SYSTEM_MEM_SIZE), + ); + + vmclock.activate(&mem).unwrap(); + let guest_data: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); + + let state = vmclock.save(); + let vmclock_new = VmClock::restore(&mem, &state).unwrap(); + + let guest_data_new: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); + assert_ne!(guest_data_new, vmclock.inner); + assert_eq!(guest_data_new, vmclock_new.inner); + assert_eq!( + vmclock.inner.disruption_marker + 1, + vmclock_new.inner.disruption_marker + ); + } +} diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index ee76bf6800b..cbc4beac95a 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -576,6 +576,8 @@ mod tests { use super::*; use crate::Vmm; #[cfg(target_arch = "x86_64")] + use crate::builder::tests::insert_vmclock_device; + #[cfg(target_arch = "x86_64")] use 
crate::builder::tests::insert_vmgenid_device; use crate::builder::tests::{ CustomBlockConfig, default_kernel_cmdline, default_vmm, insert_balloon_device, @@ -638,6 +640,8 @@ mod tests { #[cfg(target_arch = "x86_64")] insert_vmgenid_device(&mut vmm); + #[cfg(target_arch = "x86_64")] + insert_vmclock_device(&mut vmm); vmm } From b04cbd8e528d400c0c5e63ad1c25b91d628533e7 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 10 Nov 2025 15:54:10 +0100 Subject: [PATCH 5/5] vmclock: add integration tests Add tests that validate that VMClock device as exposed inside the guest under /dev/vmclock0 works as expected. This includes a small C program that knows how to open and read values from /dev/vmclock0. Signed-off-by: Babis Chalios --- tests/conftest.py | 8 + tests/host_tools/vmclock-abi.h | 182 ++++++++++++++++++ tests/host_tools/vmclock.c | 78 ++++++++ .../functional/test_vmclock.py | 66 +++++++ 4 files changed, 334 insertions(+) create mode 100644 tests/host_tools/vmclock-abi.h create mode 100644 tests/host_tools/vmclock.c create mode 100644 tests/integration_tests/functional/test_vmclock.py diff --git a/tests/conftest.py b/tests/conftest.py index fabff84a0d8..fce511596fd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -237,6 +237,14 @@ def bin_vsock_path(test_fc_session_root_path): yield vsock_helper_bin_path +@pytest.fixture(scope="session") +def bin_vmclock_path(test_fc_session_root_path): + """Build a simple util for test VMclock device""" + vmclock_helper_bin_path = os.path.join(test_fc_session_root_path, "vmclock") + build_tools.gcc_compile("host_tools/vmclock.c", vmclock_helper_bin_path) + yield vmclock_helper_bin_path + + @pytest.fixture(scope="session") def change_net_config_space_bin(test_fc_session_root_path): """Build a binary that changes the MMIO config space.""" diff --git a/tests/host_tools/vmclock-abi.h b/tests/host_tools/vmclock-abi.h new file mode 100644 index 00000000000..2d99b29ac44 --- /dev/null +++ b/tests/host_tools/vmclock-abi.h @@ 
-0,0 +1,182 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ + +/* + * This structure provides a vDSO-style clock to VM guests, exposing the + * relationship (or lack thereof) between the CPU clock (TSC, timebase, arch + * counter, etc.) and real time. It is designed to address the problem of + * live migration, which other clock enlightenments do not. + * + * When a guest is live migrated, this affects the clock in two ways. + * + * First, even between identical hosts the actual frequency of the underlying + * counter will change within the tolerances of its specification (typically + * ±50PPM, or 4 seconds a day). This frequency also varies over time on the + * same host, but can be tracked by NTP as it generally varies slowly. With + * live migration there is a step change in the frequency, with no warning. + * + * Second, there may be a step change in the value of the counter itself, as + * its accuracy is limited by the precision of the NTP synchronization on the + * source and destination hosts. + * + * So any calibration (NTP, PTP, etc.) which the guest has done on the source + * host before migration is invalid, and needs to be redone on the new host. + * + * In its most basic mode, this structure provides only an indication to the + * guest that live migration has occurred. This allows the guest to know that + * its clock is invalid and take remedial action. For applications that need + * reliable accurate timestamps (e.g. distributed databases), the structure + * can be mapped all the way to userspace. This allows the application to see + * directly for itself that the clock is disrupted and take appropriate + * action, even when using a vDSO-style method to get the time instead of a + * system call. + * + * In its more advanced mode. this structure can also be used to expose the + * precise relationship of the CPU counter to real time, as calibrated by the + * host. 
This means that userspace applications can have accurate time + * immediately after live migration, rather than having to pause operations + * and wait for NTP to recover. This mode does, of course, rely on the + * counter being reliable and consistent across CPUs. + * + * Note that this must be true UTC, never with smeared leap seconds. If a + * guest wishes to construct a smeared clock, it can do so. Presenting a + * smeared clock through this interface would be problematic because it + * actually messes with the apparent counter *period*. A linear smearing + * of 1 ms per second would effectively tweak the counter period by 1000PPM + * at the start/end of the smearing period, while a sinusoidal smear would + * basically be impossible to represent. + * + * This structure is offered with the intent that it be adopted into the + * nascent virtio-rtc standard, as a virtio-rtc that does not address the live + * migration problem seems a little less than fit for purpose. For that + * reason, certain fields use precisely the same numeric definitions as in + * the virtio-rtc proposal. The structure can also be exposed through an ACPI + * device with the CID "VMCLOCK", modelled on the "VMGENID" device except for + * the fact that it uses a real _CRS to convey the address of the structure + * (which should be a full page, to allow for mapping directly to userspace). 
+ */ + +#ifndef __VMCLOCK_ABI_H__ +#define __VMCLOCK_ABI_H__ + +#include + +struct vmclock_abi { + /* CONSTANT FIELDS */ + __le32 magic; +#define VMCLOCK_MAGIC 0x4b4c4356 /* "VCLK" */ + __le32 size; /* Size of region containing this structure */ + __le16 version; /* 1 */ + __u8 counter_id; /* Matches VIRTIO_RTC_COUNTER_xxx except INVALID */ +#define VMCLOCK_COUNTER_ARM_VCNT 0 +#define VMCLOCK_COUNTER_X86_TSC 1 +#define VMCLOCK_COUNTER_INVALID 0xff + __u8 time_type; /* Matches VIRTIO_RTC_TYPE_xxx */ +#define VMCLOCK_TIME_UTC 0 /* Since 1970-01-01 00:00:00z */ +#define VMCLOCK_TIME_TAI 1 /* Since 1970-01-01 00:00:00z */ +#define VMCLOCK_TIME_MONOTONIC 2 /* Since undefined epoch */ +#define VMCLOCK_TIME_INVALID_SMEARED 3 /* Not supported */ +#define VMCLOCK_TIME_INVALID_MAYBE_SMEARED 4 /* Not supported */ + + /* NON-CONSTANT FIELDS PROTECTED BY SEQCOUNT LOCK */ + __le32 seq_count; /* Low bit means an update is in progress */ + /* + * This field changes to another non-repeating value when the CPU + * counter is disrupted, for example on live migration. This lets + * the guest know that it should discard any calibration it has + * performed of the counter against external sources (NTP/PTP/etc.). + */ + __le64 disruption_marker; + __le64 flags; + /* Indicates that the tai_offset_sec field is valid */ +#define VMCLOCK_FLAG_TAI_OFFSET_VALID (1 << 0) + /* + * Optionally used to notify guests of pending maintenance events. + * A guest which provides latency-sensitive services may wish to + * remove itself from service if an event is coming up. Two flags + * indicate the approximate imminence of the event. 
+ */ +#define VMCLOCK_FLAG_DISRUPTION_SOON (1 << 1) /* About a day */ +#define VMCLOCK_FLAG_DISRUPTION_IMMINENT (1 << 2) /* About an hour */ +#define VMCLOCK_FLAG_PERIOD_ESTERROR_VALID (1 << 3) +#define VMCLOCK_FLAG_PERIOD_MAXERROR_VALID (1 << 4) +#define VMCLOCK_FLAG_TIME_ESTERROR_VALID (1 << 5) +#define VMCLOCK_FLAG_TIME_MAXERROR_VALID (1 << 6) + /* + * If the MONOTONIC flag is set then (other than leap seconds) it is + * guaranteed that the time calculated according to this structure at + * any given moment shall never appear to be later than the time + * calculated via the structure at any *later* moment. + * + * In particular, a timestamp based on a counter reading taken + * immediately after setting the low bit of seq_count (and the + * associated memory barrier), using the previously-valid time and + * period fields, shall never be later than a timestamp based on + * a counter reading taken immediately before *clearing* the low + * bit again after the update, using the about-to-be-valid fields. + */ +#define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) + + __u8 pad[2]; + __u8 clock_status; +#define VMCLOCK_STATUS_UNKNOWN 0 +#define VMCLOCK_STATUS_INITIALIZING 1 +#define VMCLOCK_STATUS_SYNCHRONIZED 2 +#define VMCLOCK_STATUS_FREERUNNING 3 +#define VMCLOCK_STATUS_UNRELIABLE 4 + + /* + * The time exposed through this device is never smeared. This field + * corresponds to the 'subtype' field in virtio-rtc, which indicates + * the smearing method. However in this case it provides a *hint* to + * the guest operating system, such that *if* the guest OS wants to + * provide its users with an alternative clock which does not follow + * UTC, it may do so in a fashion consistent with the other systems + * in the nearby environment. 
+ */ + __u8 leap_second_smearing_hint; /* Matches VIRTIO_RTC_SUBTYPE_xxx */ +#define VMCLOCK_SMEARING_STRICT 0 +#define VMCLOCK_SMEARING_NOON_LINEAR 1 +#define VMCLOCK_SMEARING_UTC_SLS 2 + __le16 tai_offset_sec; /* Actually two's complement signed */ + __u8 leap_indicator; + /* + * This field is based on the VIRTIO_RTC_LEAP_xxx values as defined + * in the current draft of virtio-rtc, but since smearing cannot be + * used with the shared memory device, some values are not used. + * + * The _POST_POS and _POST_NEG values allow the guest to perform + * its own smearing during the day or so after a leap second when + * such smearing may need to continue being applied for a leap + * second which is now theoretically "historical". + */ +#define VMCLOCK_LEAP_NONE 0x00 /* No known nearby leap second */ +#define VMCLOCK_LEAP_PRE_POS 0x01 /* Positive leap second at EOM */ +#define VMCLOCK_LEAP_PRE_NEG 0x02 /* Negative leap second at EOM */ +#define VMCLOCK_LEAP_POS 0x03 /* Set during 23:59:60 second */ +#define VMCLOCK_LEAP_POST_POS 0x04 +#define VMCLOCK_LEAP_POST_NEG 0x05 + + /* Bit shift for counter_period_frac_sec and its error rate */ + __u8 counter_period_shift; + /* + * Paired values of counter and UTC at a given point in time. + */ + __le64 counter_value; + /* + * Counter period, and error margin of same. The unit of these + * fields is 1/2^(64 + counter_period_shift) of a second. + */ + __le64 counter_period_frac_sec; + __le64 counter_period_esterror_rate_frac_sec; + __le64 counter_period_maxerror_rate_frac_sec; + + /* + * Time according to time_type field above. 
+ */ + __le64 time_sec; /* Seconds since time_type epoch */ + __le64 time_frac_sec; /* Units of 1/2^64 of a second */ + __le64 time_esterror_nanosec; + __le64 time_maxerror_nanosec; +}; + +#endif /* __VMCLOCK_ABI_H__ */ diff --git a/tests/host_tools/vmclock.c b/tests/host_tools/vmclock.c new file mode 100644 index 00000000000..d69304ac87c --- /dev/null +++ b/tests/host_tools/vmclock.c @@ -0,0 +1,78 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vmclock-abi.h" + +const char *VMCLOCK_DEV_PATH = "/dev/vmclock0"; + +int get_vmclock_handle(struct vmclock_abi **vmclock) +{ + int fd = open(VMCLOCK_DEV_PATH, 0); + if (fd == -1) + goto out_err; + + void *ptr = mmap(NULL, sizeof(struct vmclock_abi), PROT_READ, MAP_SHARED, fd, 0); + if (ptr == MAP_FAILED) + goto out_err_mmap; + + *vmclock = ptr; + return 0; + +out_err_mmap: + close(fd); +out_err: + return errno; +} + +#define READ_VMCLOCK_FIELD_FN(type, field) \ +type read##_##field (struct vmclock_abi *vmclock) { \ + type ret; \ + while (1) { \ + type seq = vmclock->seq_count & ~1ULL; \ + \ + /* This matches a write fence in the VMM */ \ + atomic_thread_fence(memory_order_acquire); \ + \ + ret = vmclock->field; \ + \ + /* This matches a write fence in the VMM */ \ + atomic_thread_fence(memory_order_acquire); \ + if (seq == vmclock->seq_count) \ + break; \ + } \ + \ + return ret; \ +} + +READ_VMCLOCK_FIELD_FN(uint64_t, disruption_marker); + +int main() +{ + struct vmclock_abi *vmclock; + + int err = get_vmclock_handle(&vmclock); + if (err) { + printf("Could not mmap vmclock struct: %s\n", strerror(err)); + exit(1); + } + + printf("VMCLOCK_MAGIC: 0x%x\n", vmclock->magic); + printf("VMCLOCK_SIZE: 0x%x\n", vmclock->size); + printf("VMCLOCK_VERSION: %u\n", vmclock->version); + printf("VMCLOCK_CLOCK_STATUS: %u\n", vmclock->clock_status); + 
printf("VMCLOCK_COUNTER_ID: %u\n", vmclock->counter_id); + printf("VMCLOCK_DISRUPTION_MARKER: %lu\n", read_disruption_marker(vmclock)); + + return 0; +} diff --git a/tests/integration_tests/functional/test_vmclock.py b/tests/integration_tests/functional/test_vmclock.py new file mode 100644 index 00000000000..b21acaaa1cc --- /dev/null +++ b/tests/integration_tests/functional/test_vmclock.py @@ -0,0 +1,66 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Test VMclock device emulation""" + +import platform + +import pytest + + +@pytest.fixture(scope="function") +def vm_with_vmclock(uvm_plain, bin_vmclock_path): + """Create a VM with VMclock support and the `vmclock` test binary under `/tmp/vmclock`""" + basevm = uvm_plain + basevm.spawn() + + basevm.basic_config() + basevm.add_net_iface() + basevm.start() + basevm.ssh.scp_put(bin_vmclock_path, "/tmp/vmclock") + + yield basevm + + +def parse_vmclock(vm): + """Parse the VMclock struct inside the guest and return a dictionary with its fields""" + _, stdout, _ = vm.ssh.check_output("/tmp/vmclock") + fields = stdout.strip().split("\n") + return dict(item.split(": ") for item in fields) + + +@pytest.mark.skipif( + platform.machine() != "x86_64", + reason="VMClock device is currently supported only on x86 systems", +) +def test_vmclock_fields(vm_with_vmclock): + """Make sure that we expose the expected values in the VMclock struct""" + vm = vm_with_vmclock + vmclock = parse_vmclock(vm) + + assert vmclock["VMCLOCK_MAGIC"] == "0x4b4c4356" + assert vmclock["VMCLOCK_SIZE"] == "0x1000" + assert vmclock["VMCLOCK_VERSION"] == "1" + assert vmclock["VMCLOCK_CLOCK_STATUS"] == "0" + assert vmclock["VMCLOCK_COUNTER_ID"] == "255" + assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + + +@pytest.mark.skipif( + platform.machine() != "x86_64", + reason="VMClock device is currently supported only on x86 systems", +) +def test_snapshot_update(vm_with_vmclock, 
microvm_factory, snapshot_type): + """Test that `disruption_marker` is updated upon snapshot resume""" + basevm = vm_with_vmclock + + vmclock = parse_vmclock(basevm) + assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + + snapshot = basevm.make_snapshot(snapshot_type) + basevm.kill() + + for i, vm in enumerate( + microvm_factory.build_n_from_snapshot(snapshot, 5, incremental=True) + ): + vmclock = parse_vmclock(vm) + assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == f"{i+1}"