From c50dd35d2c160ad2cdaeaab327450f6c010a7b2d Mon Sep 17 00:00:00 2001 From: Bruce Mitchener Date: Wed, 26 Nov 2025 08:58:44 +0700 Subject: [PATCH] metal: prevent command buffer exhaustion deadlocks - Track outstanding Metal command buffers per queue and gate begin_encoding against the hard MAX_COMMAND_BUFFERS, returning device-lost with an actionable warning instead of letting new_command_buffer hang when encoders are leaked. - Share the counter across queue/encoders and decrement on submit or discard after clearing the raw command buffer so drop happens before the bookkeeping update. Fixes #3084, #8047. --- wgpu-hal/src/metal/adapter.rs | 4 +++- wgpu-hal/src/metal/command.rs | 30 ++++++++++++++++++++++++++++-- wgpu-hal/src/metal/device.rs | 3 +++ wgpu-hal/src/metal/mod.rs | 15 +++++++++++++++ 4 files changed, 49 insertions(+), 3 deletions(-) diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs index f4e0f43e2b..a832589aa6 100644 --- a/wgpu-hal/src/metal/adapter.rs +++ b/wgpu-hal/src/metal/adapter.rs @@ -7,6 +7,7 @@ use parking_lot::Mutex; use wgt::{AstcBlock, AstcChannel}; use alloc::sync::Arc; +use core::sync::atomic; use super::TimestampQuerySupport; @@ -27,7 +28,7 @@ use super::TimestampQuerySupport; /// . /// /// [new command buffer]: https://developer.apple.com/documentation/metal/mtlcommandqueue/makecommandbuffer()?language=objc -const MAX_COMMAND_BUFFERS: u64 = 4096; +pub(super) const MAX_COMMAND_BUFFERS: u64 = 4096; unsafe impl Send for super::Adapter {} unsafe impl Sync for super::Adapter {} @@ -87,6 +88,7 @@ impl crate::Adapter for super::Adapter { }, queue: super::Queue { raw: Arc::new(Mutex::new(queue)), + command_buffer_created_not_submitted: Arc::new(atomic::AtomicU64::new(0)), timestamp_period, }, }) diff --git a/wgpu-hal/src/metal/command.rs b/wgpu-hal/src/metal/command.rs index 86be90427d..c5223fd8e3 100644 --- a/wgpu-hal/src/metal/command.rs +++ b/wgpu-hal/src/metal/command.rs @@ -1,10 +1,10 @@ -use super::{conv, AsNative, TimestampQuerySupport}; +use super::{adapter, conv, AsNative, TimestampQuerySupport}; use crate::CommandEncoder as _; use alloc::{ borrow::{Cow, ToOwned as _}, vec::Vec, }; -use core::ops::Range; +use core::{ops::Range, sync::atomic}; use metal::{ MTLIndexType, MTLLoadAction, MTLPrimitiveType, MTLScissorRect, MTLSize, MTLStoreAction, MTLViewport, MTLVisibilityResultMode, NSRange, @@ -318,6 +318,25 @@ impl crate::CommandEncoder for super::CommandEncoder { unsafe fn begin_encoding(&mut self, label: crate::Label) -> Result<(), crate::DeviceError> { let queue = &self.raw_queue.lock(); let retain_references = self.shared.settings.retain_command_buffer_references; + + // Guard against exhausting Metal's command buffer budget. Use the hard + // limit (`MAX_COMMAND_BUFFERS`) so we fail before Metal can hang inside + // `new_command_buffer`. + let previous = self + .command_buffer_created_not_submitted + .fetch_add(1, atomic::Ordering::AcqRel); + if previous >= adapter::MAX_COMMAND_BUFFERS { + let current = previous + 1; + log::warn!( + "metal: refusing to create new command buffer; {current} outstanding command \ + buffers exceeds the limit of {}. Treating this as device lost. \ + Ensure command encoders are submitted or dropped rather than kept alive \ + to avoid exhausting Metal's command buffer budget.", + adapter::MAX_COMMAND_BUFFERS + ); + return Err(crate::DeviceError::Lost); + } + let raw = objc::rc::autoreleasepool(move || { let cmd_buf_ref = if retain_references { queue.new_command_buffer() @@ -345,7 +364,14 @@ impl crate::CommandEncoder for super::CommandEncoder { if let Some(encoder) = self.state.compute.take() { encoder.end_encoding(); } + let had_command_buffer = self.raw_cmd_buf.is_some(); + // Clear the Option first so the underlying `metal::CommandBuffer` is + // dropped before we update the counter. self.raw_cmd_buf = None; + if had_command_buffer { + self.command_buffer_created_not_submitted + .fetch_sub(1, atomic::Ordering::AcqRel); + } } unsafe fn end_encoding(&mut self) -> Result { diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs index f7bcca7251..e33cf9fc6f 100644 --- a/wgpu-hal/src/metal/device.rs +++ b/wgpu-hal/src/metal/device.rs @@ -639,6 +639,9 @@ impl crate::Device for super::Device { Ok(super::CommandEncoder { shared: Arc::clone(&self.shared), raw_queue: Arc::clone(&desc.queue.raw), + command_buffer_created_not_submitted: Arc::clone( + &desc.queue.command_buffer_created_not_submitted, + ), raw_cmd_buf: None, state: super::CommandState::default(), temp: super::Temp::default(), diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs index 7258a885f2..7335d1f008 100644 --- a/wgpu-hal/src/metal/mod.rs +++ b/wgpu-hal/src/metal/mod.rs @@ -362,6 +362,10 @@ pub struct Adapter { pub struct Queue { raw: Arc>, + // Tracks command buffers created via `CommandEncoder::begin_encoding` that + // have not yet been submitted or discarded. Used to proactively fail + // before hitting Metal's `maxCommandBufferCount`. + command_buffer_created_not_submitted: Arc, timestamp_period: f32, } @@ -372,6 +376,7 @@ impl Queue { pub unsafe fn queue_from_raw(raw: metal::CommandQueue, timestamp_period: f32) -> Self { Self { raw: Arc::new(Mutex::new(raw)), + command_buffer_created_not_submitted: Arc::new(atomic::AtomicU64::new(0)), timestamp_period, } } @@ -469,6 +474,13 @@ impl crate::Queue for Queue { for cmd_buffer in command_buffers { cmd_buffer.raw.commit(); + // One command buffer per `end_encoding` call moves from the + // "created but not yet submitted" bucket into the submitted + // set, so update the counter. + let previous = self + .command_buffer_created_not_submitted + .fetch_sub(1, atomic::Ordering::AcqRel); + debug_assert!(previous > 0); } if let Some(raw) = extra_command_buffer { @@ -1012,6 +1024,9 @@ struct CommandState { pub struct CommandEncoder { shared: Arc, raw_queue: Arc>, + // Tracks how many command buffers have been created via this queue but not + // yet submitted or discarded. Shared across all encoders for the same queue. + command_buffer_created_not_submitted: Arc, raw_cmd_buf: Option, state: CommandState, temp: Temp,