From 021b8f9bc5af99d2a36af8278c10e8bcc3000b88 Mon Sep 17 00:00:00 2001 From: Bryce Young Date: Tue, 4 Jun 2024 10:17:49 +0100 Subject: [PATCH 1/7] Fixes for timeline semaphore --- .../timeline_semaphore/CMakeLists.txt | 71 +- .../extensions/timeline_semaphore/README.adoc | 747 +++++++-------- .../timeline_semaphore/timeline_semaphore.cpp | 898 ++++++++---------- .../timeline_semaphore/timeline_semaphore.h | 157 +-- 4 files changed, 913 insertions(+), 960 deletions(-) diff --git a/samples/extensions/timeline_semaphore/CMakeLists.txt b/samples/extensions/timeline_semaphore/CMakeLists.txt index c1fd093acf..91d975a1dd 100644 --- a/samples/extensions/timeline_semaphore/CMakeLists.txt +++ b/samples/extensions/timeline_semaphore/CMakeLists.txt @@ -1,38 +1,33 @@ -# Copyright (c) 2021, Arm Limited and Contributors -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 the "License"; -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -if (NOT WIN32) - # Not enabled on Windows at this time due to bugs. - # Out-of-order submission in presentation causes kernel level issues, - # and need to be figured out before this sample can be enabled on Windows. 
- get_filename_component(FOLDER_NAME ${CMAKE_CURRENT_LIST_DIR} NAME) - get_filename_component(PARENT_DIR ${CMAKE_CURRENT_LIST_DIR} PATH) - get_filename_component(CATEGORY_NAME ${PARENT_DIR} NAME) - - add_sample_with_tags( - ID ${FOLDER_NAME} - CATEGORY ${CATEGORY_NAME} - AUTHOR "Hans-Kristian Arntzen" - NAME "Timeline semaphore" - DESCRIPTION "Demonstrates use of timeline semaphores to express complex queue dependency graphs" - SHADER_FILES_GLSL - "timeline_semaphore/game_of_life_update.comp" - "timeline_semaphore/game_of_life_mutate.comp" - "timeline_semaphore/game_of_life_init.comp" - "timeline_semaphore/render.vert" - "timeline_semaphore/render.frag") -endif() +# Copyright (c) 2021-2024, Arm Limited and Contributors +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 the "License"; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +get_filename_component(FOLDER_NAME ${CMAKE_CURRENT_LIST_DIR} NAME) +get_filename_component(PARENT_DIR ${CMAKE_CURRENT_LIST_DIR} PATH) +get_filename_component(CATEGORY_NAME ${PARENT_DIR} NAME) + +add_sample_with_tags( + ID ${FOLDER_NAME} + CATEGORY ${CATEGORY_NAME} + AUTHOR "Hans-Kristian Arntzen" + NAME "Timeline semaphore" + DESCRIPTION "Demonstrates use of timeline semaphores to express complex queue dependency graphs" + SHADER_FILES_GLSL + "timeline_semaphore/game_of_life_update.comp" + "timeline_semaphore/game_of_life_mutate.comp" + "timeline_semaphore/game_of_life_init.comp" + "timeline_semaphore/render.vert" + "timeline_semaphore/render.frag") diff --git a/samples/extensions/timeline_semaphore/README.adoc b/samples/extensions/timeline_semaphore/README.adoc index a8ab58d924..b39ad86168 100644 --- a/samples/extensions/timeline_semaphore/README.adoc +++ b/samples/extensions/timeline_semaphore/README.adoc @@ -1,373 +1,374 @@ -//// -- Copyright (c) 2021-2023, Arm Limited and Contributors -- -- SPDX-License-Identifier: Apache-2.0 -- -- Licensed under the Apache License, Version 2.0 the "License"; -- you may not use this file except in compliance with the License. -- You may obtain a copy of the License at -- -- http://www.apache.org/licenses/LICENSE-2.0 -- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -- See the License for the specific language governing permissions and -- limitations under the License. -- -//// -= Timeline semaphore - -ifdef::site-gen-antora[] -TIP: The source for this sample can be found in the https://github.com/KhronosGroup/Vulkan-Samples/tree/main/samples/extensions/timeline_semaphore[Khronos Vulkan samples github repository]. -endif::[] - - -== Overview - -In Vulkan 1.0, we were introduced to `VkSemaphore` which is able to synchronize work between Vulkan queues. 
-It has some peculiar behavior which makes it somewhat difficult to use in practice. -The timeline semaphore is designed to solve these problems and it also makes the queue synchronization model closer to what we see in D3D12. - -The existing semaphore as-is works fine in normal situations, but as applications learn to take advantage of async compute, async transfer, and other advanced synchronization use cases, there are problems which are hard to ignore. - -=== The binary semaphore problems - -The existing semaphore type is now called a `BINARY` semaphore, as signals and waits must always happen in 1:1 pairs. -Completing a wait for a semaphore on the `VkQueue` also *unsignals it*. -This is problematic for more advanced use cases where we wish to create a single producer, multiple consumers scenario. -To make binary semaphores work, we would have to signal multiple semaphores in a single `vkQueueSubmit`, and then assign one semaphore to each waiting queue. -This is rather awkward, since it might not be obvious at signal time how this scenario will play out, and juggling N semaphores just for this case is not fun. - -When juggling N semaphores, it might also happen that a semaphore was not required after all, and we are now sitting with a signalled semaphore which cannot be recycled and signalled again unless we wait for it first. -The solution here is to just destroy such "hung" semaphores, which is unfortunate. -Ideally we would be able to reset semaphores on the host as well, but no such API exists and submitting a wait to GPU just for the purpose of unsignalling a semaphore is silly. - -There is also an object bloat problem. -Usually, there are many submissions in flight on a GPU, and to be able to synchronize with each submission, we must keep track of a certain number of semaphores which are in-flight at any one time. -This is doable, but inelegant. -There is a similar problem for `VkFence` as well. 
- -The final problem is a lack of out-of-order signal and wait. -This is a somewhat of a niche problem, but in a world with free threaded task graphs, it could make sense to be able to submit work out of order and let synchronization objects take care of synchronization on the GPU. -With binary semaphores, a signal must be submitted before the wait, which guarantees forward progress, but guarantees jank in the engine. -There are certainly good reasons for this restriction, but it removes some flexibility. - -=== Viewing a `VkQueue` as a sequence - thinking in terms of counters - -In order to signal on a `VkQueue`, we wait for everything that happened before we signal anything. -This also means that future signal operations will wait for a superset of the operations in the signal that came before. -In this sense, instead of thinking of synchronizing against individual submissions, we can think about things like "Wait for submission #134 on compute queue to complete", i.e. -we just associate a single monotonically increasing number to a queue. -Submitting to a `VkQueue` can now be considered a simple increment of the monotonically increasing number. - -This is the foundation of timeline semaphores. -A `VkSemaphore` can have a 64-bit counter associated with it and there are two new operations we can do: - -* As a signal semaphore, wait for everything to complete in queue, then *monotonically* bump counter value to `$old_value + $increment`, where `$increment` is usually 1. -* As a wait semaphore, wait for the counter of the semaphore to reach *at least* the wait count value. - -From an application point of view, there is no longer a need to own synchronization objects and applications can instead agree on 64-bit counters. - -=== Out-of-order signal and wait - -Timeline semaphores also adds support for submitting waits before the corresponding signal operation. 
-This hands over the burden to the driver, where it will need to either hold back submissions on its own, or defer this work to the kernel driver. -Either way, the application no longer needs to hold back submissions. - -This can be quite useful when applications have multiple threads which perform queue submission, since ensuring ordering otherwise would require a lot of careful thread synchronization. - -=== Single producer, multiple consumers - -There is no unsignal operation with timeline semaphores, so it's perfectly fine to do something like: - -* Signal graphics queue, value 40 -* Wait async compute queue 0, value 40 -* Wait async compute queue 1, value 39 -* Wait async compute queue 2, value 36 - -Once the counter reaches 40, it will always be at least 40, and we can keep waiting for this counter as long as we wish. - -=== Integration of host signal and wait, good night sweet `VkFence` - -VkFence is somewhat redundant when we have timeline semaphores, since we can now wait for counter values on CPU as well. -There is not even a requirement to externally synchronize `VkSemaphore` objects when doing so, which is very nice! -To synchronize GPU work with CPU, we just need to know the timeline value we signalled with. - -== Using timeline semaphores - -First, we need to create a `VkSemaphore` with `TIMELINE` type. - -[,cpp] ----- -// A timeline semaphore is still a semaphore, but it is of TIMELINE type rather than BINARY. -VkSemaphoreCreateInfo create_info = vkb::initializers::semaphore_create_info(); -VkSemaphoreTypeCreateInfoKHR type_create_info{VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR}; - -type_create_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; -type_create_info.initialValue = 0; -create_info.pNext = &type_create_info; - -VK_CHECK(vkCreateSemaphore(get_device().get_handle(), &create_info, nullptr, &timeline.semaphore)); ----- - -We can signal the timeline in `vkQueueSubmit`. 
- -[,cpp] ----- -VkSubmitInfo submit = vkb::initializers::submit_info(); -submit.pSignalSemaphores = &timeline.semaphore; -submit.signalSemaphoreCount = 1; -submit.pCommandBuffers = &cmd; -submit.commandBufferCount = 1; - -// For every timeline semaphore we signal, we give an auxillary timeline value. -VkTimelineSemaphoreSubmitInfoKHR timeline_info{VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR}; -timeline_info.signalSemaphoreValueCount = 1; -timeline_info.pSignalSemaphoreValues = &timeline.timeline; - -submit.pNext = &timeline_info; - -VK_CHECK(vkQueueSubmit(signal_queue, 1, &submit, VK_NULL_HANDLE)); ----- - -Similarly, we can wait in `vkQueueSubmit`. - -[,cpp] ----- -const VkPipelineStageFlags wait_stages = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - -VkSubmitInfo submit = vkb::initializers::submit_info(); -submit.pWaitSemaphores = &timeline.semaphore; -submit.pWaitDstStageMask = &wait_stages; -submit.waitSemaphoreCount = 1; -submit.pCommandBuffers = &cmd; -submit.commandBufferCount = 1; - -VkTimelineSemaphoreSubmitInfoKHR timeline_info{VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR}; -timeline_info.waitSemaphoreValueCount = 1; -timeline_info.pWaitSemaphoreValues = &timeline.timeline; - -submit.pNext = &timeline_info; - -VK_CHECK(vkQueueSubmit(wait_queue, 1, &submit, VK_NULL_HANDLE)); ----- - -We can wait for one or more semaphores on host as well! - -[,cpp] ----- -VkSemaphoreWaitInfoKHR wait_info{VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR}; -wait_info.pSemaphores = &semaphore; -wait_info.semaphoreCount = 1; -wait_info.pValues = &value; -VK_CHECK(vkWaitSemaphoresKHR(device->get_handle(), &wait_info, UINT64_MAX)); ----- - -A somewhat esoteric feature is to signal a timeline on host, this can be used to "kick" the GPU. 
- -[,cpp] ----- -VkSemaphoreSignalInfoKHR signal_info{VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO_KHR}; -signal_info.semaphore = semaphore; -signal_info.value = value; -VK_CHECK(vkSignalSemaphoreKHR(device->get_handle(), &signal_info)); ----- - -== The sample - -image::./images/sample.png[Sample] - -This sample demonstrates an esoteric way of implementing the well-known "Game of Life". -Through this sample we end up using all the core features of timeline semaphores. - -=== The queues - -In this sample, we make use of two `VkQueues`, an async compute queue which performs simulation, and the main graphics queue which blits to swapchain and presents the results. -The two queues need to carefully synchronize with each other. -This sample could trivially be done with binary semaphores of course, so in this sample we implement it in a difficult way to demonstrate the full API capabilities. - -=== Async worker thread - out-of-order submission - -The key aspect we use to demonstrate out of order submission is a dedicated worker thread which does all work related to simulation on the async compute queue. -It never synchronizes with the main thread except at teardown, so the only way it synchronizes is through timeline semaphores. -Submission order is completely out-of-order in this case and forward progress in the async queue is generally blocked by the main thread submitting more work. - -=== Data flow - -To simulate "Game of Life", we allocate two images of 64x64 RGBA8. -First, one image is initialized with initial state, and from here there is a ping-pong where image N is updated, while reading from image 1 - N. - -After updating image N, the main thread will sample from image N. -Before async compute updates the same image index N again, it must wait for graphics queue to complete. -With the double buffer in play, the async queue can run ahead for a little while and it will be mostly stalled by graphics queue. 
- -The sequential flow of the rendering is something like, assuming two timeline semaphores A and G: - -* Async compute write image 1. -* Async compute signal A = 1. -* Graphics wait A = 1. -* Graphics read image 1. -* Graphics signal G = 1. -* Async compute wait A = 1. -(Could use pipeline barrier of course, but hey!) -* Async compute write image 0. -* Async compute signal A = 2. -* Graphics wait A = 2. -* Graphics read image 0. -* Graphics signal G = 2. -* Async compute wait G = 1. -(Resolve write-after-read hazard) -* Async compute wait A = 2. -(Could use pipeline barrier of course, but hey!) -* Async compute wait host A = 1. -(Wait for command buffer to retire so we can re-record it!) -* Async compute write image 1. -* Async compute signal A = 3. -* Graphics wait A = 3. -* Graphics read image 1. -* Graphics signal G = 3. - -And so on ... -With out of order signal, we can end up observing this order of submissions instead. - -* Async compute write image 1. -* Async compute signal A = 1. -* Async compute wait A = 1. -* Async compute write image 0. -* Async compute signal A = 2. -* Async compute wait G = 1. -(Out of order submission, queue progress is stalled, but we can keep recording) -* Async compute wait A = 2. -* Async compute wait host A = 1. -* Async compute write image 1. -* Async compute signal A = 3. -* Graphics wait A = 1. -* Graphics read image 1. -* Graphics signal G = 1. -(Unblocks queue forward progress) -* Graphics wait A = 2. -* Graphics read image 0. -* Graphics signal G = 2. -* Graphics wait A = 3. -* Graphics read image 1. -* Graphics signal G = 3. - -When submitting out of order, it is important that you don't just submit work way ahead of where the GPU actually is, since the latency becomes extremely large. -The natural place to keep submission explosion under control here is the place where we wait for the timeline on host, since we need to re-record command buffers anyways. 
- -=== Avoiding deadlocks in `vkDeviceWaitIdle` - -When submitting out-of-order we end up in a situation where a queue cannot see any forward progress until another queue submits. -Calling `vkDeviceWaitIdle` at this point triggers a deadlock of the application since `vkDeviceWaitIdle` will never finish, as there is one queue which cannot make forward progress. -While calling `vkDeviceWaitIdle`, you cannot call `vkQueueSubmit` due to external synchronization rules. - -Instead, just wait for timeline semaphores on host to "drain" the GPU, or if you must use API calls, use `vkQueueWaitIdle` and only wait on queues which you need. - -=== Avoiding deadlocks when tearing down worker thread - -Similar to `vkDeviceWaitIdle`, when tearing down the application, an out-of-order submission might be waiting on work which never comes, and that queue becomes deadlocked. -To alleviate this, we can make use of host signalling of timeline semaphores to unblock everything in one fell swoop. - -From `TimelineSemaphore::finish()`: - -[,cpp] ----- -// Draining queues which submit out-of-order can be quite tricky, since QueueWaitIdle can deadlock for threads which want to run ahead. -// If we call Submit waiting for a semaphore which is yet to be signalled, -// QueueWaitIdle will not finish until a signal in another thread happens. -// Here's an approach we can use to safely tear down the queue. - -// Drain the main thread timeline. -// The async queue might be stalled waiting on the main queue to finish rendering a future frame which it never completes, -// but we might never hit that count, since we're tearing down the application now. -wait_timeline_cpu(main_thread_timeline); - -// Now we're guaranteed that the graphics timeline is at N and the async compute queue is blocked at N + num_frames + 1, waiting for N + 1 to finish. -// Since we're not reading any more in graphics queue, we can jump bump the timeline on CPU towards infinity. 
-// On the next loop iteration, we will exit the rendering loop and QueueWaitIdle will not be blocked on async thread anymore. -// Just bump the timeline by INT32_MAX which is min-spec for maxTimelineSemaphoreValueDifference. -// This is a useful way to mark a timeline semaphore as "permanently" signalled. -main_thread_timeline.timeline += std::numeric_limits::max(); - -// Order matters here, this works kinda like a condition variable. -// If the timeline update is observed, we should see that the worker is not alive anymore. -async_compute_worker.alive = false; -signal_timeline_cpu(main_thread_timeline, main_thread_timeline_lock); - -// This will now complete in finite time. -if (async_compute_worker.thread.joinable()) -{ - async_compute_worker.thread.join(); -} ----- - -=== Out-of-order submission fallbacks for single queue implementations - -Since this sample needs to run on all implementations which support timeline semaphores, the sample also demonstrates the limitations of out-of-order queue submissions. -It's easy to land in a situation where you deadlock the GPU or driver which only happens on single queue Vulkan implementations. -There are two fixes we need to make this work. - -==== Holding back submissions - -This workaround ensures that submissions happen in-order, where forward progress can always be made. -Since we are using multiple submission threads this sample uses a condition variable to only allow a wait to be submitted if it ensures forward progress. 
-This is handled by `TimelineSemaphore::update_pending()`: - -[,cpp] ----- -std::lock_guard holder{lock.lock}; -lock.pending_timeline = timeline; -lock.cond.notify_one(); ----- - -and `TimelineSemaphore::wait_pending()`: - -[,cpp] ----- -std::unique_lock holder{lock.lock}; -lock.cond.wait(holder, [&lock, timeline]() -> bool { - return lock.pending_timeline >= timeline; -}); ----- - -Blocking like this only works when multiple threads can submit, but that's what this sample is doing, so it is a simple fix. - -The most robust workaround is probably to not lean too heavily on out-of-order submission unless you know you have all the `VkQueues` you need to resolve the dependencies properly. - -==== Locking `vkQueueSubmit` - -If two threads end up submitting to the same queue at the same time, we need to add locks due to external synchronization requirement of the `VkQueue`. -In this sample, we only add the locks if we're applying workarounds. - -== API limitations - -Currently, the Vulkan WSI swapchain does not support timeline semaphores. -In practice, this isn't too big of a deal as swapchain integration tends to be a "special case" either way in most rendering backends. -The acquire and release semaphores have no analog in other modern APIs. - -Another related issue with WSI swapchains is that when using binary semaphores, it is not possible to use wait-before-signal. -The specification states that in order to submit a wait on a binary semaphore all dependencies for that semaphore wait must have been submitted already. -This means that we need to potentially block a bit on host before we can call vkQueuePresentKHR. -The sample does this right before calling `ApiVulkanSample::submit_frame()`. - -[,cpp] ----- -// Before we call present, which uses a binary semaphore, we must ensure that all dependent submissions -// have been submitted, so that the presenting queue is unblocked at the time of calling. 
-wait_pending(async_compute_timeline_lock, main_thread_timeline.timeline); - -ApiVulkanSample::submit_frame(); ----- - -== Conclusion - -Timeline semaphores grants a lot of flexibility to applications. -With modern approaches of task graphs, many threads and free flowing synchronization, timeline semaphores simplify a lot of things and removes the need for emulating a similar concept with binary semaphores and fences. - -Be careful with out-of-order submissions. -There are various pitfalls with this approach which have been outlined in this sample. -s. - -Be careful with out-of-order submissions. -There are various pitfalls with this approach which have been outlined in this sample. +//// +- Copyright (c) 2021-2024, Arm Limited and Contributors +- +- SPDX-License-Identifier: Apache-2.0 +- +- Licensed under the Apache License, Version 2.0 the "License"; +- you may not use this file except in compliance with the License. +- You may obtain a copy of the License at +- +- http://www.apache.org/licenses/LICENSE-2.0 +- +- Unless required by applicable law or agreed to in writing, software +- distributed under the License is distributed on an "AS IS" BASIS, +- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- See the License for the specific language governing permissions and +- limitations under the License. +- +//// += Timeline semaphore + +ifdef::site-gen-antora[] +TIP: The source for this sample can be found in the https://github.com/KhronosGroup/Vulkan-Samples/tree/main/samples/extensions/timeline_semaphore[Khronos Vulkan samples github repository]. +endif::[] + + +== Overview + +In Vulkan 1.0, we were introduced to `VkSemaphore` which is able to synchronize work between Vulkan queues. +It has some peculiar behavior which makes it somewhat difficult to use in practice. +The timeline semaphore is designed to solve these problems and it also makes the queue synchronization model closer to what we see in D3D12. 
+ +The existing semaphore as-is works fine in normal situations, but as applications learn to take advantage of async compute, async transfer, and other advanced synchronization use cases, there are problems which are hard to ignore. + +=== The binary semaphore problems + +The existing semaphore type is now called a `BINARY` semaphore, as signals and waits must always happen in 1:1 pairs. +Completing a wait for a semaphore on the `VkQueue` also *unsignals it*. +This is problematic for more advanced use cases where we wish to create a single producer, multiple consumers scenario. +To make binary semaphores work, we would have to signal multiple semaphores in a single `vkQueueSubmit`, and then assign one semaphore to each waiting queue. +This is rather awkward, since it might not be obvious at signal time how this scenario will play out, and juggling N semaphores just for this case is not fun. + +When juggling N semaphores, it might also happen that a semaphore was not required after all, and we are now sitting with a signalled semaphore which cannot be recycled and signalled again unless we wait for it first. +The solution here is to just destroy such "hung" semaphores, which is unfortunate. +Ideally we would be able to reset semaphores on the host as well, but no such API exists and submitting a wait to GPU just for the purpose of unsignalling a semaphore is silly. + +There is also an object bloat problem. +Usually, there are many submissions in flight on a GPU, and to be able to synchronize with each submission, we must keep track of a certain number of semaphores which are in-flight at any one time. +This is doable, but inelegant. +There is a similar problem for `VkFence` as well. + +The final problem is a lack of out-of-order signal and wait. +This is somewhat of a niche problem, but in a world with free threaded task graphs, it could make sense to be able to submit work out of order and let synchronization objects take care of synchronization on the GPU. 
+With binary semaphores, a signal must be submitted before the wait, which guarantees forward progress, but guarantees jank in the engine. +There are certainly good reasons for this restriction, but it removes some flexibility. + +=== Viewing a `VkQueue` as a sequence - thinking in terms of counters + +In order to signal on a `VkQueue`, we wait for everything that happened before we signal anything. +This also means that future signal operations will wait for a superset of the operations in the signal that came before. +In this sense, instead of thinking of synchronizing against individual submissions, we can think about things like "Wait for submission #134 on compute queue to complete", i.e. +we just associate a single monotonically increasing number with a queue. +Submitting to a `VkQueue` can now be considered a simple increment of the monotonically increasing number. + +This is the foundation of timeline semaphores. +A `VkSemaphore` can have a 64-bit counter associated with it and there are two new operations we can do: + +* As a signal semaphore, wait for everything to complete in queue, then *monotonically* bump counter value to `$old_value + $increment`, where `$increment` is usually 1. +* As a wait semaphore, wait for the counter of the semaphore to reach *at least* the wait count value. + +From an application point of view, there is no longer a need to own synchronization objects and applications can instead agree on 64-bit counters. + +=== Out-of-order signal and wait + +Timeline semaphores also add support for submitting waits before the corresponding signal operation. +This hands over the burden to the driver, where it will need to either hold back submissions on its own, or defer this work to the kernel driver. +Either way, the application no longer needs to hold back submissions. 
+ +This can be quite useful when applications have multiple threads which perform queue submission, since ensuring ordering otherwise would require a lot of careful thread synchronization. + +=== Single producer, multiple consumers + +There is no unsignal operation with timeline semaphores, so it's perfectly fine to do something like: + +* Signal graphics queue, value 40 +* Wait async compute queue 0, value 40 +* Wait async compute queue 1, value 39 +* Wait async compute queue 2, value 36 + +Once the counter reaches 40, it will always be at least 40, and we can keep waiting for this counter as long as we wish. + +=== Integration of host signal and wait, good night sweet `VkFence` + +VkFence is somewhat redundant when we have timeline semaphores, since we can now wait for counter values on CPU as well. +There is not even a requirement to externally synchronize `VkSemaphore` objects when doing so, which is very nice! +To synchronize GPU work with CPU, we just need to know the timeline value we signalled with. + +== Using timeline semaphores + +First, we need to create a `VkSemaphore` with `TIMELINE` type. + +[,cpp] +---- +// A timeline semaphore is still a semaphore, but it is of TIMELINE type rather than BINARY. +VkSemaphoreCreateInfo create_info = vkb::initializers::semaphore_create_info(); +VkSemaphoreTypeCreateInfoKHR type_create_info{VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR}; + +type_create_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; +type_create_info.initialValue = 0; +create_info.pNext = &type_create_info; + +VK_CHECK(vkCreateSemaphore(get_device().get_handle(), &create_info, nullptr, &timeline.semaphore)); +---- + +We can signal the timeline in `vkQueueSubmit`. 
+ +[,cpp] +---- +VkSubmitInfo submit = vkb::initializers::submit_info(); +submit.pSignalSemaphores = &timeline.semaphore; +submit.signalSemaphoreCount = 1; +submit.pCommandBuffers = &cmd; +submit.commandBufferCount = 1; + +// For every timeline semaphore we signal, we give an auxiliary timeline value. +VkTimelineSemaphoreSubmitInfoKHR timeline_info{VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR}; +timeline_info.signalSemaphoreValueCount = 1; +timeline_info.pSignalSemaphoreValues = &timeline.timeline; + +submit.pNext = &timeline_info; + +VK_CHECK(vkQueueSubmit(signal_queue, 1, &submit, VK_NULL_HANDLE)); +---- + +Similarly, we can wait in `vkQueueSubmit`. + +[,cpp] +---- +const VkPipelineStageFlags wait_stages = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + +VkSubmitInfo submit = vkb::initializers::submit_info(); +submit.pWaitSemaphores = &timeline.semaphore; +submit.pWaitDstStageMask = &wait_stages; +submit.waitSemaphoreCount = 1; +submit.pCommandBuffers = &cmd; +submit.commandBufferCount = 1; + +VkTimelineSemaphoreSubmitInfoKHR timeline_info{VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR}; +timeline_info.waitSemaphoreValueCount = 1; +timeline_info.pWaitSemaphoreValues = &timeline.timeline; + +submit.pNext = &timeline_info; + +VK_CHECK(vkQueueSubmit(wait_queue, 1, &submit, VK_NULL_HANDLE)); +---- + +We can wait for one or more semaphores on host as well! + +[,cpp] +---- +VkSemaphoreWaitInfoKHR wait_info{VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR}; +wait_info.pSemaphores = &semaphore; +wait_info.semaphoreCount = 1; +wait_info.pValues = &value; +VK_CHECK(vkWaitSemaphoresKHR(device->get_handle(), &wait_info, UINT64_MAX)); +---- + +A somewhat esoteric feature is to signal a timeline on host; this can be used to "kick" the GPU. 
+ +[,cpp] +---- +VkSemaphoreSignalInfoKHR signal_info{VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO_KHR}; +signal_info.semaphore = semaphore; +signal_info.value = value; +VK_CHECK(vkSignalSemaphoreKHR(device->get_handle(), &signal_info)); +---- + +== The sample + +image::./images/sample.png[Sample] + +This sample demonstrates an esoteric way of implementing the well-known "Game of Life". +Through this sample we end up using all the core features of timeline semaphores. + +=== The queues + +In this sample, we make use of two `VkQueues`, an async compute queue which performs simulation, and the main graphics queue which blits to swapchain and presents the results. +The two queues need to carefully synchronize with each other. +This sample could trivially be done with binary semaphores of course, so in this sample we implement it in a difficult way to demonstrate the full API capabilities. + +=== Async worker thread - out-of-order submission + +The key aspects we use to demonstrate out-of-order submission are dedicated worker threads which perform all work related to simulation on the async compute queue, and drawing on the graphics queue. +They never synchronize with the main thread except at teardown, so the only way to synchronize them is through timeline semaphores. +To avoid issues when running the sample on Windows platforms (particularly when resizing the window), forward progress in the queues is throttled by the main thread (i.e. the timeline is only allowed to advance +while a render call is active). + + +=== Data flow + +To simulate "Game of Life", we allocate two images of 64x64 RGBA8. +First, one image is initialized with initial state, and from here there is a ping-pong where image N is updated, while reading from image 1 - N. +After updating image N, the main thread will sample from image N. 
+ +The sequential flow of the rendering is something like: + +* Compute thread waits for "prepare" +* Graphics thread waits for prepare +* Main thread signals "prepare" +* Main thread acquires the next swapchain image +* Compute thread prepares command buffer +* Compute thread waits on "submit" +* Graphics thread prepares command buffer +* Graphics thread waits on "submit" +* Main thread signals "submit" +* Main thread waits for "present" +* Compute thread writes image +* Compute thread signals "draw" +* Compute thread waits for next frame +* Graphics thread reads image +* Graphics thread signals "present" +* Graphics thread waits for next frame +* Main thread presents swapchain +* Main thread signals "end of frame" +* Compute thread waits for "prepare" +* Graphics thread waits for prepare + +And so on ... +With out of order signal, we can end up observing this order of submissions instead. + +* Main thread signals "prepare" +* Main thread acquires the next swapchain image +* Main signals "submit" +* Compute thread waits for "prepare" +* Compute thread prepares command buffer +* Compute thread writes image +* Compute thread signals "draw" +* Compute thread waits for next frame +* Graphics thread waits for "prepare" +* Graphics thread prepares command buffer +* Graphics thread waits on "submit" +* Graphics thread reads image +* Graphics thread signals present +* Graphics thread waits for next frame +* Main thread presents swapchain +* Main thread signals end of frame +* Main thread signals "prepare" +* Compute thread waits for "prepare" +* Graphics thread waits for prepare + +When submitting out of order, it is important that you don't just submit work way ahead of where the GPU actually is, since the latency becomes extremely large. +The natural place to keep submission explosion under control here is the place where we wait for the timeline on host, since we need to re-record command buffers anyways. 
+ +=== Avoiding deadlocks in `vkDeviceWaitIdle` + +When submitting out-of-order we end up in a situation where a queue cannot see any forward progress until another queue submits. +Calling `vkDeviceWaitIdle` at this point triggers a deadlock of the application since `vkDeviceWaitIdle` will never finish, as there is one queue which cannot make forward progress. +While calling `vkDeviceWaitIdle`, you cannot call `vkQueueSubmit` due to external synchronization rules. + +Instead, just wait for timeline semaphores on host to "drain" the GPU, or if you must use API calls, use `vkQueueWaitIdle` and only wait on queues which you need. + +=== Avoiding deadlocks when tearing down worker thread + +Similar to `vkDeviceWaitIdle`, when tearing down the application, an out-of-order submission might be waiting on work which never comes, and that queue becomes deadlocked. +To alleviate this, we can make use of host signalling of timeline semaphores to unblock everything in one fell swoop. + +From `TimelineSemaphore::finish_timeline_workers()`: + +[,cpp] +---- + graphics_worker.alive = false; + compute_worker.alive = false; + + signal_timeline(Timeline::MAX_STAGES); + + if (graphics_worker.thread.joinable()) + { + graphics_worker.thread.join(); + } + + if (compute_worker.thread.joinable()) + { + compute_worker.thread.join(); + } +---- + +From `TimelineSemaphore::finish_timeline_workers()`: + +[,cpp] +---- + graphics_worker.alive = false; + compute_worker.alive = false; + + signal_timeline(Timeline::MAX_STAGES); + + if (graphics_worker.thread.joinable()) + { + graphics_worker.thread.join(); + } + + if (compute_worker.thread.joinable()) + { + compute_worker.thread.join(); + } +---- + +=== Out-of-order submission fallbacks for single queue implementations + +Since this sample needs to run on all implementations which support timeline semaphores, the sample also demonstrates the limitations of out-of-order queue submissions. 
+It's easy to land in a situation where you deadlock the GPU or driver which only happens on single queue Vulkan implementations. +There are two fixes we need to make this work. + +==== Holding back submissions + +This workaround ensures that submissions happen in-order, where forward progress can always be made. +Since we are using multiple submission threads this sample uses a condition variable to only allow a wait to be submitted if it ensures forward progress. +This is handled by `TimelineSemaphore::update_pending()`: + +[,cpp] +---- +std::lock_guard holder{lock.lock}; +lock.pending_timeline = timeline; +lock.cond.notify_one(); +---- + +and `TimelineSemaphore::wait_pending()`: + +[,cpp] +---- +std::unique_lock holder{lock.lock}; +lock.cond.wait(holder, [&lock, timeline]() -> bool { + return lock.pending_timeline >= timeline; +}); +---- + +Blocking like this only works when multiple threads can submit, but that's what this sample is doing, so it is a simple fix. + +The most robust workaround is probably to not lean too heavily on out-of-order submission unless you know you have all the `VkQueues` you need to resolve the dependencies properly. + +==== Locking `vkQueueSubmit` + +If two threads end up submitting to the same queue at the same time, we need to add locks due to external synchronization requirement of the `VkQueue`. +In this sample, we only add the locks if we're applying workarounds. + +== API limitations + +Currently, the Vulkan WSI swapchain does not support timeline semaphores. +In practice, this isn't too big of a deal as swapchain integration tends to be a "special case" either way in most rendering backends. +The acquire and release semaphores have no analog in other modern APIs. + +Another related issue with WSI swapchains is that when using binary semaphores, it is not possible to use wait-before-signal. 
+The specification states that in order to submit a wait on a binary semaphore all dependencies for that semaphore wait must have been submitted already. +This means that we need to potentially block a bit on host before we can call vkQueuePresentKHR. +The sample does this right before calling `ApiVulkanSample::submit_frame()`. + +[,cpp] +---- +// Before we call present, which uses a binary semaphore, we must ensure that all dependent submissions +// have been submitted, so that the presenting queue is unblocked at the time of calling. +wait_pending(async_compute_timeline_lock, main_thread_timeline.timeline); + +ApiVulkanSample::submit_frame(); +---- + +== Conclusion + +Timeline semaphores grant a lot of flexibility to applications. +With modern approaches of task graphs, many threads and free flowing synchronization, timeline semaphores simplify a lot of things and remove the need for emulating a similar concept with binary semaphores and fences. + +Be careful with out-of-order submissions. +There are various pitfalls with this approach which have been outlined in this sample. +s. + +Be careful with out-of-order submissions. +There are various pitfalls with this approach which have been outlined in this sample. diff --git a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp index ef861642d9..84b9428b02 100644 --- a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp +++ b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp @@ -17,111 +17,165 @@ #include "timeline_semaphore.h" +// What we're trying to demonstrate here is: +// - Out-of-order submission using threads which synchronize GPU work with each other using timeline semaphores. +// In this sample we have dedicated worker threads for submitting work to the compute and graphics pipelines respectively, +// and the only synchronization with main thread happens via timeline semaphores.
+// - Waiting for timeline semaphore on CPU to replace redundant fence objects. +// - Multiple waits on the same timeline. We don't need to worry about allocating and managing binary semaphores in complex scenarios. +// We can wait on the same timeline values as many times as we want, and we avoid all resource management problems that binary semaphores have. + +namespace +{ static constexpr unsigned grid_width = 64; static constexpr unsigned grid_height = 64; -// A simple variant of std::lock_guard which takes a condition for when to lock. -// We need this since we will only lock vkQueueSubmit when the worker thread needs to submit to the main thread as well. -// Otherwise, we will submit lock-free since we only need to externally synchronize the same VkQueue. -class ConditionalLockGuard +VkTimelineSemaphoreSubmitInfo create_timeline_submit_info(uint32_t waitValueCount, uint64_t *waitValue, uint32_t signalValueCount, uint64_t *signalValue) { - public: - ConditionalLockGuard(std::mutex &lock_, bool cond_) : - lock(lock_), cond(cond_) - { - if (cond) - { - lock.lock(); - } - } - - ~ConditionalLockGuard() - { - if (cond) - { - lock.unlock(); - } - } + return VkTimelineSemaphoreSubmitInfo{ + VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO, + NULL, + waitValueCount, + waitValue, + signalValueCount, + signalValue}; +} - private: - std::mutex &lock; - bool cond; -}; +} // namespace TimelineSemaphore::TimelineSemaphore() { - title = "Timeline semaphore"; + title = "Timeline Semaphore"; - // Need to enable timeline semaphore extension. 
- add_instance_extension(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); add_device_extension(VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME); + add_instance_extension(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); } TimelineSemaphore::~TimelineSemaphore() { + if (prepared) + { + finish_timeline_workers(); + } + if (has_device()) { VkDevice vk_device = get_device().get_handle(); - vkDestroyPipelineLayout(vk_device, pipelines.compute_pipeline_layout, nullptr); - vkDestroyPipelineLayout(vk_device, pipelines.graphics_pipeline_layout, nullptr); - vkDestroyPipeline(vk_device, pipelines.visualize_pipeline, nullptr); - vkDestroyPipeline(vk_device, pipelines.compute_update_pipeline, nullptr); - vkDestroyPipeline(vk_device, pipelines.compute_mutate_pipeline, nullptr); - vkDestroyPipeline(vk_device, pipelines.compute_init_pipeline, nullptr); - vkDestroyDescriptorSetLayout(vk_device, descriptors.sampled_layout, nullptr); - vkDestroyDescriptorSetLayout(vk_device, descriptors.storage_layout, nullptr); - vkDestroyDescriptorPool(vk_device, descriptors.descriptor_pool, nullptr); - - vkDestroySemaphore(vk_device, main_thread_timeline.semaphore, nullptr); - vkDestroySemaphore(vk_device, async_compute_timeline.semaphore, nullptr); + vkDestroyCommandPool(vk_device, graphics.command_pool, nullptr); + vkDestroyPipelineLayout(vk_device, graphics.pipeline_layout, nullptr); + vkDestroyPipeline(vk_device, graphics.pipeline, nullptr); + + vkDestroyCommandPool(vk_device, compute.command_pool, nullptr); + vkDestroyPipelineLayout(vk_device, compute.pipeline_layout, nullptr); + vkDestroyPipeline(vk_device, compute.update_pipeline, nullptr); + vkDestroyPipeline(vk_device, compute.mutate_pipeline, nullptr); + vkDestroyPipeline(vk_device, compute.init_pipeline, nullptr); + + vkDestroyDescriptorSetLayout(vk_device, shared.storage_layout, nullptr); + vkDestroyDescriptorSetLayout(vk_device, shared.sampled_layout, nullptr); + + vkDestroySemaphore(vk_device, timeline.semaphore, nullptr); } 
} -void TimelineSemaphore::build_command_buffers() +void TimelineSemaphore::setup_shared_resources() { -} + // Descriptor pool + { + VkDescriptorPoolSize pool_sizes[2] = { + {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, NumAsyncFrames}, + {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, NumAsyncFrames}, + }; -void TimelineSemaphore::on_update_ui_overlay(vkb::Drawer &) -{ -} + VkDescriptorPoolCreateInfo pool_info = vkb::initializers::descriptor_pool_create_info(2, pool_sizes, 2 * NumAsyncFrames); + VK_CHECK(vkCreateDescriptorPool(get_device().get_handle(), &pool_info, nullptr, &shared.descriptor_pool)); + } -void TimelineSemaphore::finish() -{ - if (!has_device()) + // Sampler { - return; + auto sampler_create_info = vkb::initializers::sampler_create_info(); + sampler_create_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler_create_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler_create_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler_create_info.minFilter = VK_FILTER_NEAREST; + sampler_create_info.magFilter = VK_FILTER_NEAREST; + sampler_create_info.maxLod = VK_LOD_CLAMP_NONE; + sampler_create_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; + shared.immutable_sampler = std::make_unique(get_device(), sampler_create_info); + } + + // Images and image views + { + uint32_t queue_families[2]{}; + uint32_t num_queue_families{}; + + // Need CONCURRENT usage here since we will sample from the image + // in both graphics and compute queues. 
+ if (get_device().get_queue_family_index(VK_QUEUE_COMPUTE_BIT) != + get_device().get_queue_by_present(0).get_family_index()) + { + queue_families[0] = get_device().get_queue_by_present(0).get_family_index(); + queue_families[1] = get_device().get_queue_family_index(VK_QUEUE_COMPUTE_BIT); + num_queue_families = 2; + } + + for (int i = 0; i < NumAsyncFrames; ++i) + { + shared.images[i] = std::make_unique(get_device(), VkExtent3D{grid_width, grid_height, 1}, + VK_FORMAT_R8G8B8A8_UNORM, + VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, + VMA_MEMORY_USAGE_GPU_ONLY, + VK_SAMPLE_COUNT_1_BIT, + 1, 1, VK_IMAGE_TILING_OPTIMAL, + 0, num_queue_families, queue_families); + + shared.image_views[i] = std::make_unique(*shared.images[i], VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R8G8B8A8_UNORM); + } } - // Draining queues which submit out-of-order can be quite tricky, since QueueWaitIdle can deadlock for threads which want to run ahead. - // If we call Submit waiting for a semaphore which is yet to be signalled, - // QueueWaitIdle will not finish until a signal in another thread happens. - // Here's an approach we can use to safely tear down the queue. - - // Drain the main thread timeline. - // The async queue might be stalled waiting on the main queue to finish rendering a future frame which it never completes, - // but we might never hit that count, since we're tearing down the application now. - wait_timeline_cpu(main_thread_timeline); - - // Now we're guaranteed that the graphics timeline is at N and the async compute queue is blocked at N + num_frames + 1, waiting for N + 1 to finish. - // Since we're not reading any more in graphics queue, we can bump the timeline on CPU towards infinity. - // On the next loop iteration, we will exit the rendering loop and QueueWaitIdle will not be blocked on async thread anymore. - // Just bump the timeline by INT32_MAX which is min-spec for maxTimelineSemaphoreValueDifference. 
- // This is a useful way to mark a timeline semaphore as "permanently" signalled. - main_thread_timeline.timeline += std::numeric_limits::max(); - - // Order matters here, this works kinda like a condition variable. - // If the timeline update is observed, we should see that the worker is not alive anymore. - async_compute_worker.alive = false; - signal_timeline_cpu(main_thread_timeline, main_thread_timeline_lock); - - // This will now complete in finite time. - if (async_compute_worker.thread.joinable()) + // Descriptor layouts { - async_compute_worker.thread.join(); + VkDescriptorSetLayoutBinding storage_binding = vkb::initializers::descriptor_set_layout_binding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT, 0); + VkDescriptorSetLayoutBinding sampled_binding = vkb::initializers::descriptor_set_layout_binding(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_SHADER_STAGE_ALL, 0); + + VkSampler vk_immutable_sampler = shared.immutable_sampler->get_handle(); + sampled_binding.pImmutableSamplers = &vk_immutable_sampler; + + VkDescriptorSetLayoutCreateInfo storage_set_layout_info = vkb::initializers::descriptor_set_layout_create_info(&storage_binding, 1); + VkDescriptorSetLayoutCreateInfo sampled_set_layout_info = vkb::initializers::descriptor_set_layout_create_info(&sampled_binding, 1); + + VK_CHECK(vkCreateDescriptorSetLayout(get_device().get_handle(), &storage_set_layout_info, nullptr, &shared.storage_layout)); + VK_CHECK(vkCreateDescriptorSetLayout(get_device().get_handle(), &sampled_set_layout_info, nullptr, &shared.sampled_layout)); + } + + // Descriptor sets + { + VkDescriptorSetAllocateInfo storage_alloc_info = vkb::initializers::descriptor_set_allocate_info(shared.descriptor_pool, &shared.storage_layout, 1); + VkDescriptorSetAllocateInfo sampled_alloc_info = vkb::initializers::descriptor_set_allocate_info(shared.descriptor_pool, &shared.sampled_layout, 1); + + for (int i = 0; i < NumAsyncFrames; ++i) + { + 
VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &storage_alloc_info, &shared.storage_images[i])); + VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &sampled_alloc_info, &shared.sampled_images[i])); + + auto general_info = vkb::initializers::descriptor_image_info(VK_NULL_HANDLE, shared.image_views[i]->get_handle(), VK_IMAGE_LAYOUT_GENERAL); + auto readonly_info = vkb::initializers::descriptor_image_info(VK_NULL_HANDLE, shared.image_views[i]->get_handle(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + + const VkWriteDescriptorSet writes[2] = { + vkb::initializers::write_descriptor_set(shared.storage_images[i], VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 0, &general_info), + vkb::initializers::write_descriptor_set(shared.sampled_images[i], VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 0, &readonly_info), + }; + + vkUpdateDescriptorSets(get_device().get_handle(), 2, writes, 0, nullptr); + } } } -void TimelineSemaphore::create_timeline_semaphore(Timeline &timeline) +void TimelineSemaphore::build_command_buffers() +{ +} + +void TimelineSemaphore::create_timeline_semaphore() { // A timeline semaphore is still a semaphore, but it is of TIMELINE type rather than BINARY. 
VkSemaphoreCreateInfo create_info = vkb::initializers::semaphore_create_info(); @@ -133,431 +187,319 @@ void TimelineSemaphore::create_timeline_semaphore(Timeline &timeline) VK_CHECK(vkCreateSemaphore(get_device().get_handle(), &create_info, nullptr, &timeline.semaphore)); - timeline.timeline = 0; + timeline.frame = 0; } -void TimelineSemaphore::create_timeline_semaphores() +void TimelineSemaphore::start_timeline_workers() { - create_timeline_semaphore(main_thread_timeline); - create_timeline_semaphore(async_compute_timeline); -} + graphics_worker.alive = true; + graphics_worker.thread = std::thread([this]() { do_graphics_work(); }); -void TimelineSemaphore::create_timeline_worker(TimelineWorker &worker, std::function thread_func) -{ - worker.alive = true; - worker.thread = std::thread(std::move(thread_func)); + compute_worker.alive = true; + compute_worker.thread = std::thread([this]() { do_compute_work(); }); } -// Normally, signal and wait would be merged into a single submit info, -// but this would have made the sample a bit harder to read and reason about. -// For this reason, we split up signals, waits and executions. -void TimelineSemaphore::signal_timeline_gpu(VkQueue signal_queue, const Timeline &timeline, TimelineLock &lock) +void TimelineSemaphore::finish_timeline_workers() { - VkSubmitInfo submit = vkb::initializers::submit_info(); - submit.pSignalSemaphores = &timeline.semaphore; - submit.signalSemaphoreCount = 1; + graphics_worker.alive = false; + compute_worker.alive = false; - // When N semaphores are provided and at least one of them is a timeline semaphore, - // we must pass an auxillary pNext struct which provides which timeline values to use. 
- VkTimelineSemaphoreSubmitInfoKHR timeline_info{VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR}; - timeline_info.signalSemaphoreValueCount = 1; - timeline_info.pSignalSemaphoreValues = &timeline.timeline; + // The MAX_STAGES value is used to unblock all threads that are waiting on a timeline stage + signal_timeline(Timeline::MAX_STAGES); - submit.pNext = &timeline_info; - - // VkQueue needs to be externally synchronized in vkQueueSubmit if async_queue == queue. + if (graphics_worker.thread.joinable()) { - ConditionalLockGuard holder{submission_lock, async_queue == queue}; - VK_CHECK(vkQueueSubmit(signal_queue, 1, &submit, VK_NULL_HANDLE)); + graphics_worker.thread.join(); } - // This is a special case to handle a scenario where async_queue == queue as well. - // Out-of-order submit is not possible with a single queue since the queue will deadlock itself. - // Very few implementations only support one queue, but the sample should run on all implementations. - // We also need this to handle the fact that we currently cannot use out-of-order submissions with swapchain. - update_pending(lock, timeline.timeline); -} - -void TimelineSemaphore::wait_timeline_gpu(VkQueue wait_queue, const Timeline &timeline, TimelineLock &lock) -{ - if (timeline.timeline == 0) + if (compute_worker.thread.joinable()) { - // No-op. - return; + compute_worker.thread.join(); } +} - // This is a special case to handle a scenario where async_queue == queue as well. - // Out-of-order submit is not possible with a single queue since the queue will deadlock itself. - // Very few implementations only support one queue, but the sample should run on all implementations. - wait_pending_in_order_queue(lock, timeline.timeline); - - const VkPipelineStageFlags wait_stages = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; +// Signal the timeline from the host. 
+void TimelineSemaphore::signal_timeline(Timeline::Stages stage) +{ + VkSemaphoreSignalInfo signalInfo; + signalInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO; + signalInfo.pNext = NULL; + signalInfo.semaphore = timeline.semaphore; + signalInfo.value = get_timeline_stage_value(stage); - VkSubmitInfo submit = vkb::initializers::submit_info(); - submit.pWaitSemaphores = &timeline.semaphore; - submit.pWaitDstStageMask = &wait_stages; - submit.waitSemaphoreCount = 1; + VK_CHECK(vkSignalSemaphoreKHR(get_device().get_handle(), &signalInfo)); +} - VkTimelineSemaphoreSubmitInfoKHR timeline_info{VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR}; - timeline_info.waitSemaphoreValueCount = 1; - timeline_info.pWaitSemaphoreValues = &timeline.timeline; +// Wait on the timeline from the host. +void TimelineSemaphore::wait_on_timeline(Timeline::Stages stage) +{ + const uint64_t waitValue = get_timeline_stage_value(stage); - submit.pNext = &timeline_info; + VkSemaphoreWaitInfo waitInfo; + waitInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO; + waitInfo.pNext = NULL; + waitInfo.flags = 0; + waitInfo.semaphoreCount = 1; + waitInfo.pSemaphores = &timeline.semaphore; + waitInfo.pValues = &waitValue; - // VkQueue needs to be externally synchronized in vkQueueSubmit if async_queue == queue. - { - ConditionalLockGuard holder{submission_lock, async_queue == queue}; - VK_CHECK(vkQueueSubmit(wait_queue, 1, &submit, VK_NULL_HANDLE)); - } + VK_CHECK(vkWaitSemaphoresKHR(get_device().get_handle(), &waitInfo, UINT64_MAX)); } -void TimelineSemaphore::wait_timeline_cpu(const Timeline &timeline) +// Sends the MAX_STAGES signal for the current frame, then increments the frame counter +void TimelineSemaphore::signal_next_frame() { - // There is no distinction between fences and semaphores anymore. - // We can freely wait for a timeline semaphore on host. - // There is also no external synchronization requirement like with VkFence! 
- // This allows for a free flowing synchronization implementation which makes multithreading even nicer. - - VkSemaphoreWaitInfoKHR wait_info{VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR}; - wait_info.pSemaphores = &timeline.semaphore; - wait_info.semaphoreCount = 1; - wait_info.pValues = &timeline.timeline; - VK_CHECK(vkWaitSemaphoresKHR(get_device().get_handle(), &wait_info, UINT64_MAX)); -} + VkSemaphoreSignalInfo signalInfo; + signalInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO; + signalInfo.pNext = NULL; + signalInfo.semaphore = timeline.semaphore; + signalInfo.value = get_timeline_stage_value(Timeline::MAX_STAGES); -void TimelineSemaphore::signal_timeline_cpu(const Timeline &timeline, TimelineLock &lock) -{ - VkSemaphoreSignalInfoKHR signal_info{VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO_KHR}; - signal_info.semaphore = timeline.semaphore; - signal_info.value = timeline.timeline; - VK_CHECK(vkSignalSemaphoreKHR(get_device().get_handle(), &signal_info)); - - // This is a special case to handle a scenario where async_queue == queue as well. - // Out-of-order submit is not possible with a single queue since the queue will deadlock itself. - // Very few implementations only support one queue, but the sample should run on all implementations. - update_pending(lock, timeline.timeline); -} + timeline.frame++; -void TimelineSemaphore::update_pending(TimelineLock &lock, uint64_t timeline) -{ - // To support out-of-order signal and wait with a single queue we must do some workarounds. - // Normally, an application should not bother with multiple async queues - // if they have to be hammered onto one VkQueue in the end, - // but it can be useful to know about these problem scenarios up front. - // - // The other case where we need to ensure some kind of ordering is when waiting on a binary semaphore. - // Binary semaphores still have the requirement that all dependencies must already have been submitted, - // and we must still use binary semaphores for swapchain. 
- // - // To make the single queue scenario work, we must be able to guarantee that a wait is submitted after a signal, - // since we cannot signal on a queue once it is blocked by a wait. - // The only way to do this is to hold back submissions and ensure submissions happen in a forward-progress order. - // - // In this sample, we can achieve this with a condition variable where we wait until - // a pending signal has been submitted, but this approach does not work in all cases. - // It works here since we have a dedicated submission thread. - // It is always possible to add submission threads which may or may not be practical. - // - // This is called after signalling the timeline, which lets other submission threads know that it is safe to wait on - // any timeline value that is <= pending_timeline. - std::lock_guard holder{lock.lock}; - lock.pending_timeline = timeline; - lock.cond.notify_one(); + VK_CHECK(vkSignalSemaphoreKHR(get_device().get_handle(), &signalInfo)); } -void TimelineSemaphore::wait_pending(TimelineLock &lock, uint64_t timeline) +// Waits for the timeline to reach MAX_STAGES for the current frame +void TimelineSemaphore::wait_for_next_frame() { - // See update_pending(). This is called before submitting a wait to the single VkQueue. 
- std::unique_lock holder{lock.lock}; - lock.cond.wait(holder, [&lock, timeline]() -> bool { - return lock.pending_timeline >= timeline; - }); + // MAX_STAGES is used as it provides a boundary value between the stages of this frame and the next + const uint64_t waitValue = (timeline.frame + 1) * Timeline::MAX_STAGES; + + VkSemaphoreWaitInfo waitInfo; + waitInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO; + waitInfo.pNext = NULL; + waitInfo.flags = 0; + waitInfo.semaphoreCount = 1; + waitInfo.pSemaphores = &timeline.semaphore; + waitInfo.pValues = &waitValue; + + VK_CHECK(vkWaitSemaphoresKHR(get_device().get_handle(), &waitInfo, UINT64_MAX)); } -void TimelineSemaphore::wait_pending_in_order_queue(TimelineLock &lock, uint64_t timeline) +// Calculates the timeline value for the specified stage in the current frame +uint64_t TimelineSemaphore::get_timeline_stage_value(Timeline::Stages stage) { - if (async_queue == queue) - { - wait_pending(lock, timeline); - } + return (timeline.frame * Timeline::MAX_STAGES) + stage; } -// We want to achieve a pipeline where we're doing these in a double-buffered fashion: -// - Do async compute work, write buffer frame % 2, read buffer (frame - 1) % 2. -// - Blit results in main thread, read buffer frame % 2, write swapchain. - -// What we're trying to demonstrate here is: -// - Out-of-order submission using threads which synchronize GPU work with each other using timeline semaphores. -// In this sample we have a dedicated worker thread which submits work to async compute, -// and the only synchronization with main thread happens via timeline semaphores. -// - Waiting for timeline semaphore on CPU to replace redundant fence objects. -// - Multiple waits on the same timeline. We don't need to worry about allocating and managing binary semaphores in complex scenarios. -// We can wait on the same timeline values as many times as we want, and we avoid all resource management problems that binary semaphores have. 
- -void TimelineSemaphore::async_compute_loop() +void TimelineSemaphore::do_compute_work() { - uint64_t iteration = 0; - - vkb::Timer timer; - timer.start(); - - // We're going to be recording commands on a thread, so make sure we have our own command pool. - VkCommandPool pool = get_device().create_command_pool(get_device().get_queue_family_index(VK_QUEUE_COMPUTE_BIT), - VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT); + setup_compute_resources(); + setup_compute_pipeline(); - // Pre-allocate N command buffers. We will however re-record them every iteration. - VkCommandBufferAllocateInfo alloc_info = - vkb::initializers::command_buffer_allocate_info(pool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, NumAsyncFrames); - VkCommandBuffer cmds[NumAsyncFrames]; - VK_CHECK(vkAllocateCommandBuffers(get_device().get_handle(), &alloc_info, cmds)); - - while (async_compute_worker.alive) + compute.timer.start(); + while (compute_worker.alive) { - iteration++; - unsigned frame_index = iteration % NumAsyncFrames; - VkCommandBuffer cmd = cmds[frame_index]; + wait_on_timeline(Timeline::prepare); - if (iteration >= NumAsyncFrames) + if (timeline.frame == 0) + { + // Initialise the game of life on the first frame + build_compute_command_buffers(ComputeResources::init); + } + else { - // Wait for main thread to be done reading from the buffer, before we clobber it. - wait_timeline_gpu(async_queue, {main_thread_timeline.semaphore, iteration - NumAsyncFrames}, main_thread_timeline_lock); + auto elapsed = static_cast<float>(compute.timer.elapsed()); + auto command_type = (elapsed > 1.0f) ? ComputeResources::update : ComputeResources::mutate; - // We're going to re-record command buffers, wait on host here. This also ensures we don't endlessly submit commands to the async queues. - // The signalling of async compute timeline is gated somewhat on the main thread submitting work to the swapchain.
- wait_timeline_cpu({async_compute_timeline.semaphore, iteration - NumAsyncFrames}); + build_compute_command_buffers(command_type, elapsed); } - // Wait for last iteration to complete since we're going to read from the results. - // Could use pipeline barrier here certainly, but this is a sample - // where we can show how free-flowing queue synchronization can be. - wait_timeline_gpu(async_queue, async_compute_timeline, async_compute_timeline_lock); + uint64_t signal_value = get_timeline_stage_value(Timeline::draw); + VkTimelineSemaphoreSubmitInfo timeline_info = create_timeline_submit_info(0, nullptr, 1, &signal_value); - auto begin_info = vkb::initializers::command_buffer_begin_info(); - begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - VK_CHECK(vkResetCommandBuffer(cmd, 0)); - VK_CHECK(vkBeginCommandBuffer(cmd, &begin_info)); + VkSubmitInfo submit_info = vkb::initializers::submit_info(); + submit_info.pNext = &timeline_info; + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &compute.command_buffer; + submit_info.signalSemaphoreCount = 1; + submit_info.pSignalSemaphores = &timeline.semaphore; - vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines.compute_pipeline_layout, 0, 1, - &descriptors.storage_images[frame_index], - 0, nullptr); + // Wait for the main thread to signal that the workers can submit to their queues + wait_on_timeline(Timeline::submit); - if (iteration == 1) + // If the threads are being killed, we need to skip the queue submission to allow the program to exit gracefully + if (compute_worker.alive) { - // On the first iteration, we initialize the game of life. - vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines.compute_init_pipeline); - } - else - { - auto elapsed = static_cast(timer.elapsed()); - - // Either we iterate the game every second, or we mutate it by changing colors gradually - // to make something more aesthetically interesting. 
- if (elapsed > 1.0f) - { - vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines.compute_update_pipeline); - timer.lap(); - } - else - { - vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines.compute_mutate_pipeline); - vkCmdPushConstants(cmd, pipelines.compute_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, - 0, sizeof(elapsed), &elapsed); - } - - // Bind previous iteration's texture. - vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines.compute_pipeline_layout, 1, 1, - &descriptors.sampled_images[(frame_index + (NumAsyncFrames - 1)) % NumAsyncFrames], - 0, nullptr); + VK_CHECK(vkQueueSubmit(compute.queue, 1, &submit_info, VK_NULL_HANDLE)); } - VkImageMemoryBarrier image_barrier = vkb::initializers::image_memory_barrier(); - image_barrier.srcAccessMask = 0; - image_barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - image_barrier.image = images[frame_index]->get_handle(); - image_barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; - image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; - image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + wait_for_next_frame(); + } +} - // The semaphore takes care of srcStageMask. 
- vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - 0, 0, nullptr, 0, nullptr, 1, &image_barrier); +void TimelineSemaphore::setup_compute_pipeline() +{ + VkDescriptorSetLayout layouts[2] = {shared.storage_layout, shared.sampled_layout}; + auto layout_info = vkb::initializers::pipeline_layout_create_info(layouts, 2); - vkCmdDispatch(cmd, grid_width / 8, grid_height / 8, 1); + VkPushConstantRange range = vkb::initializers::push_constant_range(VK_SHADER_STAGE_COMPUTE_BIT, sizeof(float), 0); + layout_info.pushConstantRangeCount = 1; + layout_info.pPushConstantRanges = &range; - image_barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - image_barrier.dstAccessMask = 0; - image_barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL; - image_barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + VK_CHECK(vkCreatePipelineLayout(get_device().get_handle(), &layout_info, nullptr, &compute.pipeline_layout)); + VkComputePipelineCreateInfo info = vkb::initializers::compute_pipeline_create_info(compute.pipeline_layout); - // The semaphore takes care of dstStageMask. 
- vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, - 0, 0, nullptr, 0, nullptr, 1, &image_barrier); + info.stage = load_shader("timeline_semaphore/game_of_life_update.comp", VK_SHADER_STAGE_COMPUTE_BIT); + VK_CHECK(vkCreateComputePipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &compute.update_pipeline)); - VK_CHECK(vkEndCommandBuffer(cmd)); + info.stage = load_shader("timeline_semaphore/game_of_life_mutate.comp", VK_SHADER_STAGE_COMPUTE_BIT); + VK_CHECK(vkCreateComputePipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &compute.mutate_pipeline)); - auto submit_info = vkb::initializers::submit_info(); - submit_info.commandBufferCount = 1; - submit_info.pCommandBuffers = &cmd; + info.stage = load_shader("timeline_semaphore/game_of_life_init.comp", VK_SHADER_STAGE_COMPUTE_BIT); + VK_CHECK(vkCreateComputePipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &compute.init_pipeline)); +} - // VkQueue needs to be externally synchronized in vkQueueSubmit if async_queue == queue. - { - ConditionalLockGuard holder{submission_lock, async_queue == queue}; - VK_CHECK(vkQueueSubmit(async_queue, 1, &submit_info, VK_NULL_HANDLE)); - } +void TimelineSemaphore::setup_compute_resources() +{ + // Get compute queue + compute.queue_family_index = get_device().get_queue_family_index(VK_QUEUE_COMPUTE_BIT); + vkGetDeviceQueue(get_device().get_handle(), compute.queue_family_index, 0, &compute.queue); - // Kicks shading work in main queue. - async_compute_timeline.timeline = iteration; - signal_timeline_gpu(async_queue, async_compute_timeline, async_compute_timeline_lock); - } + compute.command_pool = get_device().create_command_pool(compute.queue_family_index, VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT); - // This QueueWaitIdle can be precarious. - // See TimelineSemaphore::finish() comments for why this is the case. 
- { - ConditionalLockGuard holder{submission_lock, async_queue == queue}; - vkQueueWaitIdle(async_queue); - } + VkCommandBufferAllocateInfo alloc_info = + vkb::initializers::command_buffer_allocate_info(compute.command_pool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, 1); - // This also frees command buffers allocated from the pool. - vkDestroyCommandPool(get_device().get_handle(), pool, nullptr); + VK_CHECK(vkAllocateCommandBuffers(get_device().get_handle(), &alloc_info, &compute.command_buffer)); } -void TimelineSemaphore::create_timeline_workers() +void TimelineSemaphore::build_compute_command_buffers(const ComputeResources::CommandType type, float elapsed) { - create_timeline_worker(async_compute_worker, [this]() { async_compute_loop(); }); -} + auto begin_info = vkb::initializers::command_buffer_begin_info(); + begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + VK_CHECK(vkResetCommandBuffer(compute.command_buffer, 0)); + VK_CHECK(vkBeginCommandBuffer(compute.command_buffer, &begin_info)); -void TimelineSemaphore::prepare_queue() -{ - // Attempt to find a queue which is async compute. - // If we cannot find that queue family, at least try to find a queue which is not the "main" queue. - // If we have different queues we can safely use out of order signal and wait which is a core part of this sample. 
+ auto frame_index = timeline.frame % NumAsyncFrames; + auto prev_index = (timeline.frame - 1) % NumAsyncFrames; - auto &device = get_device(); - uint32_t family_index = device.get_queue_family_index(VK_QUEUE_COMPUTE_BIT); - uint32_t num_queues = device.get_num_queues_for_queue_family(family_index); + vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, &shared.storage_images[frame_index], 0, nullptr); - for (uint32_t i = 0; i < num_queues; i++) + switch (type) { - auto &candidate = device.get_queue(family_index, i); - if (candidate.get_handle() != queue) + case ComputeResources::init: { - async_queue = candidate.get_handle(); - break; + // On the first iteration, we initialize the game of life. + vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.init_pipeline); } - } + break; - if (!async_queue) - { - // Fallback path. Cannot use out-of-order signal and wait here since the queue will deadlock itself. - // If this happens we need to add some locks and condition variables to make things work. - // See comments in TimelineSemaphore::update_pending(). - async_queue = queue; - } -} + case ComputeResources::update: + { + vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.update_pipeline); + compute.timer.lap(); -void TimelineSemaphore::create_resources() -{ - uint32_t queue_families[2]{}; - uint32_t num_queue_families{}; + // Bind previous iteration's texture. + vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 1, 1, &shared.sampled_images[prev_index], 0, nullptr); + } + break; - // Need CONCURRENT usage here since we will sample from the image - // in both graphics and compute queues. 
- if (get_device().get_queue_family_index(VK_QUEUE_COMPUTE_BIT) != - get_device().get_queue_by_present(0).get_family_index()) - { - queue_families[0] = get_device().get_queue_by_present(0).get_family_index(); - queue_families[1] = get_device().get_queue_family_index(VK_QUEUE_COMPUTE_BIT); - num_queue_families = 2; - } + case ComputeResources::mutate: + { + vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.mutate_pipeline); + vkCmdPushConstants(compute.command_buffer, compute.pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(elapsed), &elapsed); - for (int i = 0; i < NumAsyncFrames; i++) - { - images[i] = std::make_unique(get_device(), VkExtent3D{grid_width, grid_height, 1}, - VK_FORMAT_R8G8B8A8_UNORM, - VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, - VMA_MEMORY_USAGE_GPU_ONLY, - VK_SAMPLE_COUNT_1_BIT, - 1, 1, VK_IMAGE_TILING_OPTIMAL, - 0, num_queue_families, queue_families); - - image_views[i] = std::make_unique(*images[i], VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R8G8B8A8_UNORM); + // Bind previous iteration's texture. + vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 1, 1, &shared.sampled_images[prev_index], 0, nullptr); + } + break; } - // Boilerplate where we create a STORAGE_IMAGE descriptor set and SAMPLED_IMAGE descriptor set. 
- - auto sampler_create_info = vkb::initializers::sampler_create_info(); - sampler_create_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.minFilter = VK_FILTER_NEAREST; - sampler_create_info.magFilter = VK_FILTER_NEAREST; - sampler_create_info.maxLod = VK_LOD_CLAMP_NONE; - sampler_create_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; - immutable_sampler = std::make_unique(get_device(), sampler_create_info); - VkSampler vk_immutable_sampler = immutable_sampler->get_handle(); - - VkDescriptorSetLayoutBinding storage_binding = vkb::initializers::descriptor_set_layout_binding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT, 0); - VkDescriptorSetLayoutBinding sampled_binding = vkb::initializers::descriptor_set_layout_binding(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_SHADER_STAGE_ALL, 0); - sampled_binding.pImmutableSamplers = &vk_immutable_sampler; - VkDescriptorSetLayoutCreateInfo storage_set_layout_info = vkb::initializers::descriptor_set_layout_create_info(&storage_binding, 1); - VkDescriptorSetLayoutCreateInfo sampled_set_layout_info = vkb::initializers::descriptor_set_layout_create_info(&sampled_binding, 1); - - VK_CHECK(vkCreateDescriptorSetLayout(get_device().get_handle(), &storage_set_layout_info, nullptr, &descriptors.storage_layout)); - VK_CHECK(vkCreateDescriptorSetLayout(get_device().get_handle(), &sampled_set_layout_info, nullptr, &descriptors.sampled_layout)); - - VkDescriptorPoolSize pool_sizes[2] = { - {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, NumAsyncFrames}, - {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, NumAsyncFrames}, - }; - VkDescriptorPoolCreateInfo pool_info = vkb::initializers::descriptor_pool_create_info(2, pool_sizes, NumAsyncFrames * 2); - VK_CHECK(vkCreateDescriptorPool(get_device().get_handle(), &pool_info, nullptr, &descriptors.descriptor_pool)); - - 
VkDescriptorSetAllocateInfo storage_alloc_info = vkb::initializers::descriptor_set_allocate_info(descriptors.descriptor_pool, &descriptors.storage_layout, 1); - VkDescriptorSetAllocateInfo sampled_alloc_info = vkb::initializers::descriptor_set_allocate_info(descriptors.descriptor_pool, &descriptors.sampled_layout, 1); - for (int i = 0; i < NumAsyncFrames; i++) - { - VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &storage_alloc_info, &descriptors.storage_images[i])); - VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &sampled_alloc_info, &descriptors.sampled_images[i])); + VkImageMemoryBarrier image_barrier = vkb::initializers::image_memory_barrier(); + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + image_barrier.image = shared.images[frame_index]->get_handle(); + image_barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; - auto general_info = vkb::initializers::descriptor_image_info(VK_NULL_HANDLE, image_views[i]->get_handle(), VK_IMAGE_LAYOUT_GENERAL); - auto readonly_info = vkb::initializers::descriptor_image_info(VK_NULL_HANDLE, image_views[i]->get_handle(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + // The semaphore takes care of srcStageMask. 
+ vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); - const VkWriteDescriptorSet writes[2] = { - vkb::initializers::write_descriptor_set(descriptors.storage_images[i], VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 0, &general_info), - vkb::initializers::write_descriptor_set(descriptors.sampled_images[i], VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 0, &readonly_info), - }; - vkUpdateDescriptorSets(get_device().get_handle(), 2, writes, 0, nullptr); - } + vkCmdDispatch(compute.command_buffer, grid_width / 8, grid_height / 8, 1); + + image_barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + image_barrier.dstAccessMask = 0; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL; + image_barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + // The semaphore takes care of dstStageMask. + vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); + + VK_CHECK(vkEndCommandBuffer(compute.command_buffer)); } -void TimelineSemaphore::create_compute_pipeline() +void TimelineSemaphore::do_graphics_work() { - VkDescriptorSetLayout layouts[2] = {descriptors.storage_layout, descriptors.sampled_layout}; - auto layout_info = vkb::initializers::pipeline_layout_create_info(layouts, 2); + setup_graphics_resources(); + setup_graphics_pipeline(); - VkPushConstantRange range = vkb::initializers::push_constant_range(VK_SHADER_STAGE_COMPUTE_BIT, sizeof(float), 0); - layout_info.pushConstantRangeCount = 1; - layout_info.pPushConstantRanges = &range; + while (graphics_worker.alive) + { + wait_on_timeline(Timeline::prepare); + + build_graphics_command_buffer(); + + uint64_t wait_values[] = {0, get_timeline_stage_value(Timeline::draw)}; + VkPipelineStageFlags wait_stage_masks[] = {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT}; + VkSemaphore wait_semaphores[] = 
{semaphores.acquired_image_ready, timeline.semaphore}; + uint64_t signal_values[] = {get_timeline_stage_value(Timeline::present), 0}; + VkSemaphore signal_semaphores[] = {timeline.semaphore, semaphores.render_complete}; + VkTimelineSemaphoreSubmitInfo timeline_info = create_timeline_submit_info(2, wait_values, 2, signal_values); + + VkSubmitInfo submit_info = vkb::initializers::submit_info(); + submit_info.pNext = &timeline_info; + submit_info.waitSemaphoreCount = 2; + submit_info.pWaitSemaphores = wait_semaphores; + submit_info.pWaitDstStageMask = wait_stage_masks; + submit_info.signalSemaphoreCount = 2; + submit_info.pSignalSemaphores = signal_semaphores; + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &graphics.command_buffer; + + // Wait for the main thread to signal that the workers can submit to their queues + wait_on_timeline(Timeline::submit); + + if (compute.queue == graphics.queue) + { + // If compute.queue == queue, we need to synchronise access to the queue AND ensure that submissions are made in order + // (otherwise the queue will deadlock itself). So we wait for the "draw" stage to be signalled on the host, before + // submitting the work. 
+ wait_on_timeline(Timeline::draw); + } - VK_CHECK(vkCreatePipelineLayout(get_device().get_handle(), &layout_info, nullptr, &pipelines.compute_pipeline_layout)); - VkComputePipelineCreateInfo info = vkb::initializers::compute_pipeline_create_info(pipelines.compute_pipeline_layout); + // If the threads are being killed, we need to skip the queue submission to allow the program to exit gracefully + if (graphics_worker.alive) + { + VK_CHECK(vkQueueSubmit(graphics.queue, 1, &submit_info, VK_NULL_HANDLE)); + } - info.stage = load_shader("timeline_semaphore/game_of_life_update.comp", VK_SHADER_STAGE_COMPUTE_BIT); - VK_CHECK(vkCreateComputePipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &pipelines.compute_update_pipeline)); + wait_for_next_frame(); + } +} - info.stage = load_shader("timeline_semaphore/game_of_life_mutate.comp", VK_SHADER_STAGE_COMPUTE_BIT); - VK_CHECK(vkCreateComputePipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &pipelines.compute_mutate_pipeline)); +void TimelineSemaphore::setup_graphics_resources() +{ + graphics.queue_family_index = get_device().get_queue_family_index(VK_QUEUE_GRAPHICS_BIT); + graphics.queue = queue; - info.stage = load_shader("timeline_semaphore/game_of_life_init.comp", VK_SHADER_STAGE_COMPUTE_BIT); - VK_CHECK(vkCreateComputePipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &pipelines.compute_init_pipeline)); + graphics.command_pool = get_device().create_command_pool(graphics.queue_family_index, VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT); + + VkCommandBufferAllocateInfo alloc_info = + vkb::initializers::command_buffer_allocate_info(graphics.command_pool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, 1); + + VK_CHECK(vkAllocateCommandBuffers(get_device().get_handle(), &alloc_info, &graphics.command_buffer)); } -void TimelineSemaphore::create_graphics_pipeline() +void TimelineSemaphore::setup_graphics_pipeline() { - auto layout_info = 
vkb::initializers::pipeline_layout_create_info(&descriptors.sampled_layout, 1); - VK_CHECK(vkCreatePipelineLayout(get_device().get_handle(), &layout_info, nullptr, &pipelines.graphics_pipeline_layout)); + auto layout_info = vkb::initializers::pipeline_layout_create_info(&shared.sampled_layout, 1); + VK_CHECK(vkCreatePipelineLayout(get_device().get_handle(), &layout_info, nullptr, &graphics.pipeline_layout)); - VkGraphicsPipelineCreateInfo info = vkb::initializers::pipeline_create_info(pipelines.graphics_pipeline_layout, render_pass); + VkGraphicsPipelineCreateInfo info = vkb::initializers::pipeline_create_info(graphics.pipeline_layout, render_pass); VkPipelineVertexInputStateCreateInfo vertex_input_state = vkb::initializers::pipeline_vertex_input_state_create_info(); @@ -595,41 +537,14 @@ void TimelineSemaphore::create_graphics_pipeline() stages[0] = load_shader("timeline_semaphore/render.vert", VK_SHADER_STAGE_VERTEX_BIT); stages[1] = load_shader("timeline_semaphore/render.frag", VK_SHADER_STAGE_FRAGMENT_BIT); - VK_CHECK(vkCreateGraphicsPipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &pipelines.visualize_pipeline)); + VK_CHECK(vkCreateGraphicsPipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &graphics.pipeline)); } -void TimelineSemaphore::create_pipelines() +void TimelineSemaphore::build_graphics_command_buffer() { - create_compute_pipeline(); - create_graphics_pipeline(); -} - -bool TimelineSemaphore::prepare(const vkb::ApplicationOptions &options) -{ - if (!ApiVulkanSample::prepare(options)) - { - return false; - } - - create_resources(); - create_pipelines(); - prepare_queue(); - create_timeline_semaphores(); - create_timeline_workers(); - - prepared = true; - return true; -} - -void TimelineSemaphore::render(float delta_time) -{ - ApiVulkanSample::prepare_frame(); - - VK_CHECK(vkWaitForFences(get_device().get_handle(), 1, &wait_fences[current_buffer], VK_TRUE, UINT64_MAX)); - 
VK_CHECK(vkResetFences(get_device().get_handle(), 1, &wait_fences[current_buffer])); - - VkViewport viewport = {0.0f, 0.0f, static_cast(width), static_cast(height), 0.0f, 1.0f}; - VkRect2D scissor = {{0, 0}, {width, height}}; + auto frame_index = timeline.frame % NumAsyncFrames; + VkViewport viewport = {0.0f, 0.0f, static_cast(width), static_cast(height), 0.0f, 1.0f}; + VkRect2D scissor = {{0, 0}, {width, height}}; // Simple fix for 1:1 pixel aspect ratio. if (viewport.width > viewport.height) @@ -643,11 +558,10 @@ void TimelineSemaphore::render(float delta_time) viewport.height = viewport.width; } - recreate_current_command_buffer(); - auto cmd = draw_cmd_buffers[current_buffer]; auto begin_info = vkb::initializers::command_buffer_begin_info(); begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - vkBeginCommandBuffer(cmd, &begin_info); + VK_CHECK(vkResetCommandBuffer(graphics.command_buffer, 0)); + VK_CHECK(vkBeginCommandBuffer(graphics.command_buffer, &begin_info)); VkRenderPassBeginInfo render_pass_begin = vkb::initializers::render_pass_begin_info(); render_pass_begin.renderPass = render_pass; @@ -661,54 +575,76 @@ void TimelineSemaphore::render(float delta_time) render_pass_begin.pClearValues = clears; render_pass_begin.framebuffer = framebuffers[current_buffer]; - vkCmdBeginRenderPass(cmd, &render_pass_begin, VK_SUBPASS_CONTENTS_INLINE); + vkCmdBeginRenderPass(graphics.command_buffer, &render_pass_begin, VK_SUBPASS_CONTENTS_INLINE); - vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelines.visualize_pipeline); - vkCmdSetViewport(cmd, 0, 1, &viewport); - vkCmdSetScissor(cmd, 0, 1, &scissor); + vkCmdBindPipeline(graphics.command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, graphics.pipeline); + vkCmdSetViewport(graphics.command_buffer, 0, 1, &viewport); + vkCmdSetScissor(graphics.command_buffer, 0, 1, &scissor); - main_thread_timeline.timeline++; - uint32_t frame_index = main_thread_timeline.timeline % NumAsyncFrames; - 
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelines.graphics_pipeline_layout, - 0, 1, &descriptors.sampled_images[frame_index], 0, nullptr); - vkCmdDraw(cmd, 3, 1, 0, 0); + vkCmdBindDescriptorSets(graphics.command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, graphics.pipeline_layout, 0, 1, &shared.sampled_images[frame_index], 0, nullptr); + vkCmdDraw(graphics.command_buffer, 3, 1, 0, 0); - draw_ui(cmd); + draw_ui(graphics.command_buffer); - vkCmdEndRenderPass(cmd); + vkCmdEndRenderPass(graphics.command_buffer); - VK_CHECK(vkEndCommandBuffer(cmd)); - submit_info.commandBufferCount = 1; - submit_info.pCommandBuffers = &draw_cmd_buffers[current_buffer]; + VK_CHECK(vkEndCommandBuffer(graphics.command_buffer)); +} - // Wait for the async queue to have completed rendering. - wait_timeline_gpu(queue, {async_compute_timeline.semaphore, main_thread_timeline.timeline}, async_compute_timeline_lock); +void TimelineSemaphore::request_gpu_features(vkb::PhysicalDevice &gpu) +{ + // Need to enable the timelineSemaphore feature. + auto &features = gpu.request_extension_features( + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR); + features.timelineSemaphore = VK_TRUE; +} - // Need to hold the conditional lock during submit_frame as well since vkQueuePresentKHR uses the main queue as well. +bool TimelineSemaphore::prepare(const vkb::ApplicationOptions &options) +{ + if (!ApiVulkanSample::prepare(options)) { - ConditionalLockGuard holder{submission_lock, async_queue == queue}; - VK_CHECK(vkQueueSubmit(queue, 1, &submit_info, wait_fences[current_buffer])); + return false; + } + + setup_shared_resources(); + create_timeline_semaphore(); - // Before we call present, which uses a binary semaphore, we must ensure that all dependent submissions - // have been submitted, so that the presenting queue is unblocked at the time of calling. 
- wait_pending(async_compute_timeline_lock, main_thread_timeline.timeline); + start_timeline_workers(); - ApiVulkanSample::submit_frame(); + prepared = true; + return true; +} + +void TimelineSemaphore::render(float delta_time) +{ + if (!prepared) + { + return; } - // Let async queue know it is safe to clobber the image since main queue is done reading it. - signal_timeline_gpu(queue, main_thread_timeline, main_thread_timeline_lock); + // Signal to the worker threads that they can prepare their command buffers + signal_timeline(Timeline::prepare); + + ApiVulkanSample::prepare_frame(); + + // Signal to the worker threads that they can submit their work, then wait for the work to complete + signal_timeline(Timeline::submit); + + // Wait for the worker threads to signal that the frame is ready to present + wait_on_timeline(Timeline::present); + + ApiVulkanSample::submit_frame(); + + // Signal to the worker threads that they can proceed to the next frame's work + signal_next_frame(); } -void TimelineSemaphore::request_gpu_features(vkb::PhysicalDevice &gpu) +bool TimelineSemaphore::resize(const uint32_t width, const uint32_t height) { - // Need to enable the timelineSemaphore feature. 
- auto &features = gpu.request_extension_features( - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR); - features.timelineSemaphore = VK_TRUE; + return ApiVulkanSample::resize(width, height); } -std::unique_ptr> create_timeline_semaphore() +std::unique_ptr create_timeline_semaphore() { return std::make_unique(); } diff --git a/samples/extensions/timeline_semaphore/timeline_semaphore.h b/samples/extensions/timeline_semaphore/timeline_semaphore.h index 0dc6c9e0f5..c2012435c6 100644 --- a/samples/extensions/timeline_semaphore/timeline_semaphore.h +++ b/samples/extensions/timeline_semaphore/timeline_semaphore.h @@ -18,98 +18,119 @@ #pragma once #include "api_vulkan_sample.h" -#include -#include -#include -#include class TimelineSemaphore : public ApiVulkanSample { public: - TimelineSemaphore(); - ~TimelineSemaphore(); + static const uint32_t NumAsyncFrames = 2; - private: - virtual void request_gpu_features(vkb::PhysicalDevice &gpu) override; - virtual void render(float delta_time) override; - virtual void build_command_buffers() override; - virtual void on_update_ui_overlay(vkb::Drawer &drawer) override; - virtual bool prepare(const vkb::ApplicationOptions &options) override; - virtual void finish() override; + // Resources for the graphics worker + struct GraphicsResources + { + VkQueue queue; + VkCommandPool command_pool; + VkCommandBuffer command_buffer; - void create_resources(); - void create_pipelines(); - void create_compute_pipeline(); - void create_graphics_pipeline(); + VkPipelineLayout pipeline_layout; + VkPipeline pipeline; - struct Pipelines - { - VkPipelineLayout compute_pipeline_layout{}; - VkPipelineLayout graphics_pipeline_layout{}; - VkPipeline visualize_pipeline{}; - VkPipeline compute_update_pipeline{}; - VkPipeline compute_mutate_pipeline{}; - VkPipeline compute_init_pipeline{}; - } pipelines; - - enum - { - NumAsyncFrames = 2 - }; + uint32_t queue_family_index; + } graphics; - struct Descriptors + // Resources for the compute 
worker + struct ComputeResources + { + VkQueue queue; + VkCommandPool command_pool; + VkCommandBuffer command_buffer; + + VkPipelineLayout pipeline_layout; + VkPipeline init_pipeline; + VkPipeline update_pipeline; + VkPipeline mutate_pipeline; + + vkb::Timer timer; + uint32_t queue_family_index; + + enum CommandType + { + init, + update, + mutate + }; + } compute; + + // Resources used by both workers for storing/sampling images + struct SharedResources { VkDescriptorSetLayout storage_layout; VkDescriptorSetLayout sampled_layout; VkDescriptorSet storage_images[NumAsyncFrames]; VkDescriptorSet sampled_images[NumAsyncFrames]; VkDescriptorPool descriptor_pool; - } descriptors{}; - std::unique_ptr immutable_sampler; - std::unique_ptr images[NumAsyncFrames]; - std::unique_ptr image_views[NumAsyncFrames]; - - VkQueue async_queue{VK_NULL_HANDLE}; - void prepare_queue(); - std::mutex submission_lock; + std::unique_ptr immutable_sampler; + std::unique_ptr images[NumAsyncFrames]; + std::unique_ptr image_views[NumAsyncFrames]; + } shared; struct Timeline { - VkSemaphore semaphore; - uint64_t timeline; - }; - Timeline main_thread_timeline{}, async_compute_timeline{}; + // The stages of the timeline are enumerated, to make it easier to read which stage we are signalling/waiting on, and to allow + // the stages to be reused without needing to recreate the semaphore. 
+ enum Stages + { + prepare = 1, // Worker threads can begin preparing command buffers for submission + submit, // Worker threads can submit their command buffers, + draw, // The graphics worker can draw the current frame + present, // The main thread can present the frame to the display + MAX_STAGES + }; - struct TimelineLock - { - std::condition_variable cond; - std::mutex lock; - uint64_t pending_timeline; - }; - TimelineLock main_thread_timeline_lock{}, async_compute_timeline_lock{}; + VkSemaphore semaphore; + uint64_t frame; // Number of iterations through the timeline stages + } timeline; struct TimelineWorker { std::thread thread; std::atomic_bool alive; }; - TimelineWorker async_compute_worker; - void create_timeline_semaphores(); - void create_timeline_semaphore(Timeline &timeline); - - void create_timeline_workers(); - static void create_timeline_worker(TimelineWorker &worker, std::function thread_func); - - void async_compute_loop(); - - void signal_timeline_gpu(VkQueue queue, const Timeline &timeline, TimelineLock &lock); - void wait_timeline_gpu(VkQueue queue, const Timeline &timeline, TimelineLock &lock); - void wait_timeline_cpu(const Timeline &timeline); - void signal_timeline_cpu(const Timeline &timeline, TimelineLock &lock); - void update_pending(TimelineLock &lock, uint64_t timeline); - void wait_pending_in_order_queue(TimelineLock &lock, uint64_t timeline); - void wait_pending(TimelineLock &lock, uint64_t timeline); + + TimelineWorker graphics_worker, compute_worker; + + TimelineSemaphore(); + ~TimelineSemaphore(); + + void setup_shared_resources(); + void build_command_buffers() override; + + // Timeline operations + void create_timeline_semaphore(); + void start_timeline_workers(); + void finish_timeline_workers(); + void signal_next_frame(); + void wait_for_next_frame(); + void signal_timeline(Timeline::Stages stage); + void wait_on_timeline(Timeline::Stages stage); + uint64_t get_timeline_stage_value(Timeline::Stages stage); + + // Compute 
Work + void do_compute_work(); + void setup_compute_pipeline(); + void setup_compute_resources(); + void build_compute_command_buffers(const ComputeResources::CommandType type, const float elapsed = 0.0f); + + // Graphics Work + void do_graphics_work(); + void setup_graphics_resources(); + void setup_graphics_pipeline(); + void build_graphics_command_buffer(); + + virtual void request_gpu_features(vkb::PhysicalDevice &gpu) override; + virtual bool prepare(const vkb::ApplicationOptions &options) override; + virtual void render(float delta_time) override; + virtual bool resize(const uint32_t width, const uint32_t height) override; }; -std::unique_ptr> create_timeline_semaphore(); +std::unique_ptr create_timeline_semaphore(); From b2df90e86b5e515a17f3c37e96f6814b7ebfb7d5 Mon Sep 17 00:00:00 2001 From: Bryce Young Date: Wed, 12 Jun 2024 12:42:20 +0100 Subject: [PATCH 2/7] Fixed verification error --- .../timeline_semaphore/CMakeLists.txt | 66 +- .../extensions/timeline_semaphore/README.adoc | 748 +++++++++--------- .../timeline_semaphore/timeline_semaphore.cpp | 148 ++-- .../timeline_semaphore/timeline_semaphore.h | 23 +- 4 files changed, 498 insertions(+), 487 deletions(-) diff --git a/samples/extensions/timeline_semaphore/CMakeLists.txt b/samples/extensions/timeline_semaphore/CMakeLists.txt index 91d975a1dd..4458805124 100644 --- a/samples/extensions/timeline_semaphore/CMakeLists.txt +++ b/samples/extensions/timeline_semaphore/CMakeLists.txt @@ -1,33 +1,33 @@ -# Copyright (c) 2021-2024, Arm Limited and Contributors -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 the "License"; -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -get_filename_component(FOLDER_NAME ${CMAKE_CURRENT_LIST_DIR} NAME) -get_filename_component(PARENT_DIR ${CMAKE_CURRENT_LIST_DIR} PATH) -get_filename_component(CATEGORY_NAME ${PARENT_DIR} NAME) - -add_sample_with_tags( - ID ${FOLDER_NAME} - CATEGORY ${CATEGORY_NAME} - AUTHOR "Hans-Kristian Arntzen" - NAME "Timeline semaphore" - DESCRIPTION "Demonstrates use of timeline semaphores to express complex queue dependency graphs" - SHADER_FILES_GLSL - "timeline_semaphore/game_of_life_update.comp" - "timeline_semaphore/game_of_life_mutate.comp" - "timeline_semaphore/game_of_life_init.comp" - "timeline_semaphore/render.vert" - "timeline_semaphore/render.frag") +# Copyright (c) 2021-2024, Arm Limited and Contributors +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 the "License"; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +get_filename_component(FOLDER_NAME ${CMAKE_CURRENT_LIST_DIR} NAME) +get_filename_component(PARENT_DIR ${CMAKE_CURRENT_LIST_DIR} PATH) +get_filename_component(CATEGORY_NAME ${PARENT_DIR} NAME) + +add_sample_with_tags( + ID ${FOLDER_NAME} + CATEGORY ${CATEGORY_NAME} + AUTHOR "Hans-Kristian Arntzen" + NAME "Timeline semaphore" + DESCRIPTION "Demonstrates use of timeline semaphores to express complex queue dependency graphs" + SHADER_FILES_GLSL + "timeline_semaphore/game_of_life_update.comp" + "timeline_semaphore/game_of_life_mutate.comp" + "timeline_semaphore/game_of_life_init.comp" + "timeline_semaphore/render.vert" + "timeline_semaphore/render.frag") diff --git a/samples/extensions/timeline_semaphore/README.adoc b/samples/extensions/timeline_semaphore/README.adoc index b39ad86168..235096dc35 100644 --- a/samples/extensions/timeline_semaphore/README.adoc +++ b/samples/extensions/timeline_semaphore/README.adoc @@ -1,374 +1,374 @@ -//// -- Copyright (c) 2021-2024, Arm Limited and Contributors -- -- SPDX-License-Identifier: Apache-2.0 -- -- Licensed under the Apache License, Version 2.0 the "License"; -- you may not use this file except in compliance with the License. -- You may obtain a copy of the License at -- -- http://www.apache.org/licenses/LICENSE-2.0 -- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -- See the License for the specific language governing permissions and -- limitations under the License. -- -//// -= Timeline semaphore - -ifdef::site-gen-antora[] -TIP: The source for this sample can be found in the https://github.com/KhronosGroup/Vulkan-Samples/tree/main/samples/extensions/timeline_semaphore[Khronos Vulkan samples github repository]. -endif::[] - - -== Overview - -In Vulkan 1.0, we were introduced to `VkSemaphore` which is able to synchronize work between Vulkan queues. 
-It has some peculiar behavior which makes it somewhat difficult to use in practice. -The timeline semaphore is designed to solve these problems and it also makes the queue synchronization model closer to what we see in D3D12. - -The existing semaphore as-is works fine in normal situations, but as applications learn to take advantage of async compute, async transfer, and other advanced synchronization use cases, there are problems which are hard to ignore. - -=== The binary semaphore problems - -The existing semaphore type is now called a `BINARY` semaphore, as signals and waits must always happen in 1:1 pairs. -Completing a wait for a semaphore on the `VkQueue` also *unsignals it*. -This is problematic for more advanced use cases where we wish to create a single producer, multiple consumers scenario. -To make binary semaphores work, we would have to signal multiple semaphores in a single `vkQueueSubmit`, and then assign one semaphore to each waiting queue. -This is rather awkward, since it might not be obvious at signal time how this scenario will play out, and juggling N semaphores just for this case is not fun. - -When juggling N semaphores, it might also happen that a semaphore was not required after all, and we are now sitting with a signalled semaphore which cannot be recycled and signalled again unless we wait for it first. -The solution here is to just destroy such "hung" semaphores, which is unfortunate. -Ideally we would be able to reset semaphores on the host as well, but no such API exists and submitting a wait to GPU just for the purpose of unsignalling a semaphore is silly. - -There is also an object bloat problem. -Usually, there are many submissions in flight on a GPU, and to be able to synchronize with each submission, we must keep track of a certain number of semaphores which are in-flight at any one time. -This is doable, but inelegant. -There is a similar problem for `VkFence` as well. 
- -The final problem is a lack of out-of-order signal and wait. -This is a somewhat of a niche problem, but in a world with free threaded task graphs, it could make sense to be able to submit work out of order and let synchronization objects take care of synchronization on the GPU. -With binary semaphores, a signal must be submitted before the wait, which guarantees forward progress, but guarantees jank in the engine. -There are certainly good reasons for this restriction, but it removes some flexibility. - -=== Viewing a `VkQueue` as a sequence - thinking in terms of counters - -In order to signal on a `VkQueue`, we wait for everything that happened before we signal anything. -This also means that future signal operations will wait for a superset of the operations in the signal that came before. -In this sense, instead of thinking of synchronizing against individual submissions, we can think about things like "Wait for submission #134 on compute queue to complete", i.e. -we just associate a single monotonically increasing number to a queue. -Submitting to a `VkQueue` can now be considered a simple increment of the monotonically increasing number. - -This is the foundation of timeline semaphores. -A `VkSemaphore` can have a 64-bit counter associated with it and there are two new operations we can do: - -* As a signal semaphore, wait for everything to complete in queue, then *monotonically* bump counter value to `$old_value + $increment`, where `$increment` is usually 1. -* As a wait semaphore, wait for the counter of the semaphore to reach *at least* the wait count value. - -From an application point of view, there is no longer a need to own synchronization objects and applications can instead agree on 64-bit counters. - -=== Out-of-order signal and wait - -Timeline semaphores also adds support for submitting waits before the corresponding signal operation. 
-This hands over the burden to the driver, where it will need to either hold back submissions on its own, or defer this work to the kernel driver. -Either way, the application no longer needs to hold back submissions. - -This can be quite useful when applications have multiple threads which perform queue submission, since ensuring ordering otherwise would require a lot of careful thread synchronization. - -=== Single producer, multiple consumers - -There is no unsignal operation with timeline semaphores, so it's perfectly fine to do something like: - -* Signal graphics queue, value 40 -* Wait async compute queue 0, value 40 -* Wait async compute queue 1, value 39 -* Wait async compute queue 2, value 36 - -Once the counter reaches 40, it will always be at least 40, and we can keep waiting for this counter as long as we wish. - -=== Integration of host signal and wait, good night sweet `VkFence` - -VkFence is somewhat redundant when we have timeline semaphores, since we can now wait for counter values on CPU as well. -There is not even a requirement to externally synchronize `VkSemaphore` objects when doing so, which is very nice! -To synchronize GPU work with CPU, we just need to know the timeline value we signalled with. - -== Using timeline semaphores - -First, we need to create a `VkSemaphore` with `TIMELINE` type. - -[,cpp] ----- -// A timeline semaphore is still a semaphore, but it is of TIMELINE type rather than BINARY. -VkSemaphoreCreateInfo create_info = vkb::initializers::semaphore_create_info(); -VkSemaphoreTypeCreateInfoKHR type_create_info{VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR}; - -type_create_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; -type_create_info.initialValue = 0; -create_info.pNext = &type_create_info; - -VK_CHECK(vkCreateSemaphore(get_device().get_handle(), &create_info, nullptr, &timeline.semaphore)); ----- - -We can signal the timeline in `vkQueueSubmit`. 
- -[,cpp] ----- -VkSubmitInfo submit = vkb::initializers::submit_info(); -submit.pSignalSemaphores = &timeline.semaphore; -submit.signalSemaphoreCount = 1; -submit.pCommandBuffers = &cmd; -submit.commandBufferCount = 1; - -// For every timeline semaphore we signal, we give an auxillary timeline value. -VkTimelineSemaphoreSubmitInfoKHR timeline_info{VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR}; -timeline_info.signalSemaphoreValueCount = 1; -timeline_info.pSignalSemaphoreValues = &timeline.timeline; - -submit.pNext = &timeline_info; - -VK_CHECK(vkQueueSubmit(signal_queue, 1, &submit, VK_NULL_HANDLE)); ----- - -Similarly, we can wait in `vkQueueSubmit`. - -[,cpp] ----- -const VkPipelineStageFlags wait_stages = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - -VkSubmitInfo submit = vkb::initializers::submit_info(); -submit.pWaitSemaphores = &timeline.semaphore; -submit.pWaitDstStageMask = &wait_stages; -submit.waitSemaphoreCount = 1; -submit.pCommandBuffers = &cmd; -submit.commandBufferCount = 1; - -VkTimelineSemaphoreSubmitInfoKHR timeline_info{VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR}; -timeline_info.waitSemaphoreValueCount = 1; -timeline_info.pWaitSemaphoreValues = &timeline.timeline; - -submit.pNext = &timeline_info; - -VK_CHECK(vkQueueSubmit(wait_queue, 1, &submit, VK_NULL_HANDLE)); ----- - -We can wait for one or more semaphores on host as well! - -[,cpp] ----- -VkSemaphoreWaitInfoKHR wait_info{VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR}; -wait_info.pSemaphores = &semaphore; -wait_info.semaphoreCount = 1; -wait_info.pValues = &value; -VK_CHECK(vkWaitSemaphoresKHR(device->get_handle(), &wait_info, UINT64_MAX)); ----- - -A somewhat esoteric feature is to signal a timeline on host, this can be used to "kick" the GPU. 
- -[,cpp] ----- -VkSemaphoreSignalInfoKHR signal_info{VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO_KHR}; -signal_info.semaphore = semaphore; -signal_info.value = value; -VK_CHECK(vkSignalSemaphoreKHR(device->get_handle(), &signal_info)); ----- - -== The sample - -image::./images/sample.png[Sample] - -This sample demonstrates an esoteric way of implementing the well-known "Game of Life". -Through this sample we end up using all the core features of timeline semaphores. - -=== The queues - -In this sample, we make use of two `VkQueues`, an async compute queue which performs simulation, and the main graphics queue which blits to swapchain and presents the results. -The two queues need to carefully synchronize with each other. -This sample could trivially be done with binary semaphores of course, so in this sample we implement it in a difficult way to demonstrate the full API capabilities. - -=== Async worker thread - out-of-order submission - -The key aspects we use to demonstrate out of order submission are dedicated workers thread which perform all work related to simulation on the async compute queue, and drawing on the graphics queue. -They never synchronize with the main thread except at teardown, so the only way to synchronize them is through timeline semaphores. -To avoid issues when running the sample on Windows platforms (particularly when resizing the window), forward progress in the queues is throttled by the main thread (i.e. only allowing the timeline to advance -when a render call is active). - - -=== Data flow - -To simulate "Game of Life", we allocate two images of 64x64 RGBA8. -First, one image is initialized with initial state, and from here there is a ping-pong where image N is updated, while reading from image 1 - N. -After updating image N, the main thread will sample from image N. 
- -The sequential flow of the rendering is something like: - -* Compute thread waits for "prepare" -* Graphics thread waits for prepare -* Main thread signals "prepare" -* Main thread acquires the next swapchain image -* Compute thread prepares command buffer -* Compute thread waits on "submit" -* Graphics thread prepares command buffer -* Graphics thread waits on "submit" -* Main thread signals "submit" -* Main thread waits for "present" -* Compute thread writes image -* Compute thread signals "draw" -* Compute thread waits for next frame -* Graphics thread reads image -* Graphics thread signals "present" -* Graphics thread waits for next frame -* Main thread presents swapchain -* Main thread signals "end of frame" -* Compute thread waits for "prepare" -* Graphics thread waits for prepare - -And so on ... -With out of order signal, we can end up observing this order of submissions instead. - -* Main thread signals "prepare" -* Main thread acquires the next swapchain image -* Main signals "submit" -* Compute thread waits for "prepare" -* Compute thread prepares command buffer -* Compute thread writes image -* Compute thread signals "draw" -* Compute thread waits for next frame -* Graphics thread waits for "prepare" -* Graphics thread prepares command buffer -* Graphics thread waits on "submit" -* Graphics thread reads image -* Graphics thread signals present -* Graphics thread waits for next frame -* Main thread presents swapchain -* Main thread signals end of frame -* Main thread signals "prepare" -* Compute thread waits for "prepare" -* Graphics thread waits for prepare - -When submitting out of order, it is important that you don't just submit work way ahead of where the GPU actually is, since the latency becomes extremely large. -The natural place to keep submission explosion under control here is the place where we wait for the timeline on host, since we need to re-record command buffers anyways. 
- -=== Avoiding deadlocks in `vkDeviceWaitIdle` - -When submitting out-of-order we end up in a situation where a queue cannot see any forward progress until another queue submits. -Calling `vkDeviceWaitIdle` at this point triggers a deadlock of the application since `vkDeviceWaitIdle` will never finish, as there is one queue which cannot make forward progress. -While calling `vkDeviceWaitIdle`, you cannot call `vkQueueSubmit` due to external synchronization rules. - -Instead, just wait for timeline semaphores on host to "drain" the GPU, or if you must use API calls, use `vkQueueWaitIdle` and only wait on queues which you need. - -=== Avoiding deadlocks when tearing down worker thread - -Similar to `vkDeviceWaitIdle`, when tearing down the application, an out-of-order submission might be waiting on work which never comes, and that queue becomes deadlocked. -To alleviate this, we can make use of host signalling of timeline semaphores to unblock everything in one fell swoop. - -From `TimelineSemaphore::finish_timeline_workers()`: - -[,cpp] ----- - graphics_worker.alive = false; - compute_worker.alive = false; - - signal_timeline(Timeline::MAX_STAGES); - - if (graphics_worker.thread.joinable()) - { - graphics_worker.thread.join(); - } - - if (compute_worker.thread.joinable()) - { - compute_worker.thread.join(); - } ----- - -From `TimelineSemaphore::finish_timeline_workers()`: - -[,cpp] ----- - graphics_worker.alive = false; - compute_worker.alive = false; - - signal_timeline(Timeline::MAX_STAGES); - - if (graphics_worker.thread.joinable()) - { - graphics_worker.thread.join(); - } - - if (compute_worker.thread.joinable()) - { - compute_worker.thread.join(); - } ----- - -=== Out-of-order submission fallbacks for single queue implementations - -Since this sample needs to run on all implementations which support timeline semaphores, the sample also demonstrates the limitations of out-of-order queue submissions. 
-It's easy to land in a situation where you deadlock the GPU or driver which only happens on single queue Vulkan implementations. -There are two fixes we need to make this work. - -==== Holding back submissions - -This workaround ensures that submissions happen in-order, where forward progress can always be made. -Since we are using multiple submission threads this sample uses a condition variable to only allow a wait to be submitted if it ensures forward progress. -This is handled by `TimelineSemaphore::update_pending()`: - -[,cpp] ----- -std::lock_guard holder{lock.lock}; -lock.pending_timeline = timeline; -lock.cond.notify_one(); ----- - -and `TimelineSemaphore::wait_pending()`: - -[,cpp] ----- -std::unique_lock holder{lock.lock}; -lock.cond.wait(holder, [&lock, timeline]() -> bool { - return lock.pending_timeline >= timeline; -}); ----- - -Blocking like this only works when multiple threads can submit, but that's what this sample is doing, so it is a simple fix. - -The most robust workaround is probably to not lean too heavily on out-of-order submission unless you know you have all the `VkQueues` you need to resolve the dependencies properly. - -==== Locking `vkQueueSubmit` - -If two threads end up submitting to the same queue at the same time, we need to add locks due to external synchronization requirement of the `VkQueue`. -In this sample, we only add the locks if we're applying workarounds. - -== API limitations - -Currently, the Vulkan WSI swapchain does not support timeline semaphores. -In practice, this isn't too big of a deal as swapchain integration tends to be a "special case" either way in most rendering backends. -The acquire and release semaphores have no analog in other modern APIs. - -Another related issue with WSI swapchains is that when using binary semaphores, it is not possible to use wait-before-signal. 
-The specification states that in order to submit a wait on a binary semaphore all dependencies for that semaphore wait must have been submitted already. -This means that we need to potentially block a bit on host before we can call vkQueuePresentKHR. -The sample does this right before calling `ApiVulkanSample::submit_frame()`. - -[,cpp] ----- -// Before we call present, which uses a binary semaphore, we must ensure that all dependent submissions -// have been submitted, so that the presenting queue is unblocked at the time of calling. -wait_pending(async_compute_timeline_lock, main_thread_timeline.timeline); - -ApiVulkanSample::submit_frame(); ----- - -== Conclusion - -Timeline semaphores grants a lot of flexibility to applications. -With modern approaches of task graphs, many threads and free flowing synchronization, timeline semaphores simplify a lot of things and removes the need for emulating a similar concept with binary semaphores and fences. - -Be careful with out-of-order submissions. -There are various pitfalls with this approach which have been outlined in this sample. -s. - -Be careful with out-of-order submissions. -There are various pitfalls with this approach which have been outlined in this sample. +//// +- Copyright (c) 2021-2024, Arm Limited and Contributors +- +- SPDX-License-Identifier: Apache-2.0 +- +- Licensed under the Apache License, Version 2.0 the "License"; +- you may not use this file except in compliance with the License. +- You may obtain a copy of the License at +- +- http://www.apache.org/licenses/LICENSE-2.0 +- +- Unless required by applicable law or agreed to in writing, software +- distributed under the License is distributed on an "AS IS" BASIS, +- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- See the License for the specific language governing permissions and +- limitations under the License. 
+- +//// += Timeline semaphore + +ifdef::site-gen-antora[] +TIP: The source for this sample can be found in the https://github.com/KhronosGroup/Vulkan-Samples/tree/main/samples/extensions/timeline_semaphore[Khronos Vulkan samples github repository]. +endif::[] + + +== Overview + +In Vulkan 1.0, we were introduced to `VkSemaphore` which is able to synchronize work between Vulkan queues. +It has some peculiar behavior which makes it somewhat difficult to use in practice. +The timeline semaphore is designed to solve these problems and it also makes the queue synchronization model closer to what we see in D3D12. + +The existing semaphore as-is works fine in normal situations, but as applications learn to take advantage of async compute, async transfer, and other advanced synchronization use cases, there are problems which are hard to ignore. + +=== The binary semaphore problems + +The existing semaphore type is now called a `BINARY` semaphore, as signals and waits must always happen in 1:1 pairs. +Completing a wait for a semaphore on the `VkQueue` also *unsignals it*. +This is problematic for more advanced use cases where we wish to create a single producer, multiple consumers scenario. +To make binary semaphores work, we would have to signal multiple semaphores in a single `vkQueueSubmit`, and then assign one semaphore to each waiting queue. +This is rather awkward, since it might not be obvious at signal time how this scenario will play out, and juggling N semaphores just for this case is not fun. + +When juggling N semaphores, it might also happen that a semaphore was not required after all, and we are now sitting with a signalled semaphore which cannot be recycled and signalled again unless we wait for it first. +The solution here is to just destroy such "hung" semaphores, which is unfortunate. 
+Ideally we would be able to reset semaphores on the host as well, but no such API exists and submitting a wait to GPU just for the purpose of unsignalling a semaphore is silly. + +There is also an object bloat problem. +Usually, there are many submissions in flight on a GPU, and to be able to synchronize with each submission, we must keep track of a certain number of semaphores which are in-flight at any one time. +This is doable, but inelegant. +There is a similar problem for `VkFence` as well. + +The final problem is a lack of out-of-order signal and wait. +This is somewhat of a niche problem, but in a world with free threaded task graphs, it could make sense to be able to submit work out of order and let synchronization objects take care of synchronization on the GPU. +With binary semaphores, a signal must be submitted before the wait, which guarantees forward progress, but guarantees jank in the engine. +There are certainly good reasons for this restriction, but it removes some flexibility. + +=== Viewing a `VkQueue` as a sequence - thinking in terms of counters + +In order to signal on a `VkQueue`, we wait for everything that happened before we signal anything. +This also means that future signal operations will wait for a superset of the operations in the signal that came before. +In this sense, instead of thinking of synchronizing against individual submissions, we can think about things like "Wait for submission #134 on compute queue to complete", i.e. +we just associate a single monotonically increasing number to a queue. +Submitting to a `VkQueue` can now be considered a simple increment of the monotonically increasing number. + +This is the foundation of timeline semaphores. +A `VkSemaphore` can have a 64-bit counter associated with it and there are two new operations we can do: + +* As a signal semaphore, wait for everything to complete in queue, then *monotonically* bump counter value to `$old_value + $increment`, where `$increment` is usually 1.
+* As a wait semaphore, wait for the counter of the semaphore to reach *at least* the wait count value. + +From an application point of view, there is no longer a need to own synchronization objects and applications can instead agree on 64-bit counters. + +=== Out-of-order signal and wait + +Timeline semaphores also add support for submitting waits before the corresponding signal operation. +This hands over the burden to the driver, where it will need to either hold back submissions on its own, or defer this work to the kernel driver. +Either way, the application no longer needs to hold back submissions. + +This can be quite useful when applications have multiple threads which perform queue submission, since ensuring ordering otherwise would require a lot of careful thread synchronization. + +=== Single producer, multiple consumers + +There is no unsignal operation with timeline semaphores, so it's perfectly fine to do something like: + +* Signal graphics queue, value 40 +* Wait async compute queue 0, value 40 +* Wait async compute queue 1, value 39 +* Wait async compute queue 2, value 36 + +Once the counter reaches 40, it will always be at least 40, and we can keep waiting for this counter as long as we wish. + +=== Integration of host signal and wait, good night sweet `VkFence` + +VkFence is somewhat redundant when we have timeline semaphores, since we can now wait for counter values on CPU as well. +There is not even a requirement to externally synchronize `VkSemaphore` objects when doing so, which is very nice! +To synchronize GPU work with CPU, we just need to know the timeline value we signalled with. + +== Using timeline semaphores + +First, we need to create a `VkSemaphore` with `TIMELINE` type. + +[,cpp] +---- +// A timeline semaphore is still a semaphore, but it is of TIMELINE type rather than BINARY.
+VkSemaphoreCreateInfo create_info = vkb::initializers::semaphore_create_info(); +VkSemaphoreTypeCreateInfoKHR type_create_info{VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR}; + +type_create_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; +type_create_info.initialValue = 0; +create_info.pNext = &type_create_info; + +VK_CHECK(vkCreateSemaphore(get_device().get_handle(), &create_info, nullptr, &timeline.semaphore)); +---- + +We can signal the timeline in `vkQueueSubmit`. + +[,cpp] +---- +VkSubmitInfo submit = vkb::initializers::submit_info(); +submit.pSignalSemaphores = &timeline.semaphore; +submit.signalSemaphoreCount = 1; +submit.pCommandBuffers = &cmd; +submit.commandBufferCount = 1; + +// For every timeline semaphore we signal, we give an auxiliary timeline value. +VkTimelineSemaphoreSubmitInfoKHR timeline_info{VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR}; +timeline_info.signalSemaphoreValueCount = 1; +timeline_info.pSignalSemaphoreValues = &timeline.timeline; + +submit.pNext = &timeline_info; + +VK_CHECK(vkQueueSubmit(signal_queue, 1, &submit, VK_NULL_HANDLE)); +---- + +Similarly, we can wait in `vkQueueSubmit`. + +[,cpp] +---- +const VkPipelineStageFlags wait_stages = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + +VkSubmitInfo submit = vkb::initializers::submit_info(); +submit.pWaitSemaphores = &timeline.semaphore; +submit.pWaitDstStageMask = &wait_stages; +submit.waitSemaphoreCount = 1; +submit.pCommandBuffers = &cmd; +submit.commandBufferCount = 1; + +VkTimelineSemaphoreSubmitInfoKHR timeline_info{VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR}; +timeline_info.waitSemaphoreValueCount = 1; +timeline_info.pWaitSemaphoreValues = &timeline.timeline; + +submit.pNext = &timeline_info; + +VK_CHECK(vkQueueSubmit(wait_queue, 1, &submit, VK_NULL_HANDLE)); +---- + +We can wait for one or more semaphores on host as well!
+ +[,cpp] +---- +VkSemaphoreWaitInfoKHR wait_info{VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR}; +wait_info.pSemaphores = &semaphore; +wait_info.semaphoreCount = 1; +wait_info.pValues = &value; +VK_CHECK(vkWaitSemaphoresKHR(device->get_handle(), &wait_info, UINT64_MAX)); +---- + +A somewhat esoteric feature is to signal a timeline on host; this can be used to "kick" the GPU. + +[,cpp] +---- +VkSemaphoreSignalInfoKHR signal_info{VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO_KHR}; +signal_info.semaphore = semaphore; +signal_info.value = value; +VK_CHECK(vkSignalSemaphoreKHR(device->get_handle(), &signal_info)); +---- + +== The sample + +image::./images/sample.png[Sample] + +This sample demonstrates an esoteric way of implementing the well-known "Game of Life". +Through this sample we end up using all the core features of timeline semaphores. + +=== The queues + +In this sample, we make use of two `VkQueues`, an async compute queue which performs simulation, and the main graphics queue which blits to swapchain and presents the results. +The two queues need to carefully synchronize with each other. +This sample could trivially be done with binary semaphores of course, so in this sample we implement it in a difficult way to demonstrate the full API capabilities. + +=== Async worker thread - out-of-order submission + +The key aspects we use to demonstrate out of order submission are dedicated worker threads which perform all work related to simulation on the async compute queue, and drawing on the graphics queue. +They never synchronize with the main thread except at teardown, so the only way to synchronize them is through timeline semaphores. +To avoid issues when running the sample on Windows platforms (particularly when resizing the window), forward progress in the queues is throttled by the main thread (i.e. only allowing the timeline to advance +when a render call is active). + + +=== Data flow + +To simulate "Game of Life", we allocate two images of 64x64 RGBA8.
+First, one image is initialized with initial state, and from here there is a ping-pong where image N is updated, while reading from image 1 - N. +After updating image N, the main thread will sample from image N. + +The sequential flow of the rendering is something like: + +* Compute thread waits for "prepare" +* Graphics thread waits for prepare +* Main thread signals "prepare" +* Main thread acquires the next swapchain image +* Compute thread prepares command buffer +* Compute thread waits on "submit" +* Graphics thread prepares command buffer +* Graphics thread waits on "submit" +* Main thread signals "submit" +* Main thread waits for "present" +* Compute thread writes image +* Compute thread signals "draw" +* Compute thread waits for next frame +* Graphics thread reads image +* Graphics thread signals "present" +* Graphics thread waits for next frame +* Main thread presents swapchain +* Main thread signals "end of frame" +* Compute thread waits for "prepare" +* Graphics thread waits for prepare + +And so on ... +With out of order signal, we can end up observing this order of submissions instead. + +* Main thread signals "prepare" +* Main thread acquires the next swapchain image +* Main signals "submit" +* Compute thread waits for "prepare" +* Compute thread prepares command buffer +* Compute thread writes image +* Compute thread signals "draw" +* Compute thread waits for next frame +* Graphics thread waits for "prepare" +* Graphics thread prepares command buffer +* Graphics thread waits on "submit" +* Graphics thread reads image +* Graphics thread signals present +* Graphics thread waits for next frame +* Main thread presents swapchain +* Main thread signals end of frame +* Main thread signals "prepare" +* Compute thread waits for "prepare" +* Graphics thread waits for prepare + +When submitting out of order, it is important that you don't just submit work way ahead of where the GPU actually is, since the latency becomes extremely large. 
+The natural place to keep submission explosion under control here is the place where we wait for the timeline on host, since we need to re-record command buffers anyways. + +=== Avoiding deadlocks in `vkDeviceWaitIdle` + +When submitting out-of-order we end up in a situation where a queue cannot see any forward progress until another queue submits. +Calling `vkDeviceWaitIdle` at this point triggers a deadlock of the application since `vkDeviceWaitIdle` will never finish, as there is one queue which cannot make forward progress. +While calling `vkDeviceWaitIdle`, you cannot call `vkQueueSubmit` due to external synchronization rules. + +Instead, just wait for timeline semaphores on host to "drain" the GPU, or if you must use API calls, use `vkQueueWaitIdle` and only wait on queues which you need. + +=== Avoiding deadlocks when tearing down worker thread + +Similar to `vkDeviceWaitIdle`, when tearing down the application, an out-of-order submission might be waiting on work which never comes, and that queue becomes deadlocked. +To alleviate this, we can make use of host signalling of timeline semaphores to unblock everything in one fell swoop. 
+ +From `TimelineSemaphore::finish_timeline_workers()`: + +[,cpp] +---- + graphics_worker.alive = false; + compute_worker.alive = false; + + signal_timeline(Timeline::MAX_STAGES); + + if (graphics_worker.thread.joinable()) + { + graphics_worker.thread.join(); + } + + if (compute_worker.thread.joinable()) + { + compute_worker.thread.join(); + } +---- + +From `TimelineSemaphore::finish_timeline_workers()`: + +[,cpp] +---- + graphics_worker.alive = false; + compute_worker.alive = false; + + signal_timeline(Timeline::MAX_STAGES); + + if (graphics_worker.thread.joinable()) + { + graphics_worker.thread.join(); + } + + if (compute_worker.thread.joinable()) + { + compute_worker.thread.join(); + } +---- + +=== Out-of-order submission fallbacks for single queue implementations + +Since this sample needs to run on all implementations which support timeline semaphores, the sample also demonstrates the limitations of out-of-order queue submissions. +It's easy to land in a situation where you deadlock the GPU or driver which only happens on single queue Vulkan implementations. +There are two fixes we need to make this work. + +==== Holding back submissions + +This workaround ensures that submissions happen in-order, where forward progress can always be made. +Since we are using multiple submission threads this sample uses a condition variable to only allow a wait to be submitted if it ensures forward progress. +This is handled by `TimelineSemaphore::update_pending()`: + +[,cpp] +---- +std::lock_guard holder{lock.lock}; +lock.pending_timeline = timeline; +lock.cond.notify_one(); +---- + +and `TimelineSemaphore::wait_pending()`: + +[,cpp] +---- +std::unique_lock holder{lock.lock}; +lock.cond.wait(holder, [&lock, timeline]() -> bool { + return lock.pending_timeline >= timeline; +}); +---- + +Blocking like this only works when multiple threads can submit, but that's what this sample is doing, so it is a simple fix. 
+ +The most robust workaround is probably to not lean too heavily on out-of-order submission unless you know you have all the `VkQueues` you need to resolve the dependencies properly. + +==== Locking `vkQueueSubmit` + +If two threads end up submitting to the same queue at the same time, we need to add locks due to the external synchronization requirement of the `VkQueue`. +In this sample, we only add the locks if we're applying workarounds. + +== API limitations + +Currently, the Vulkan WSI swapchain does not support timeline semaphores. +In practice, this isn't too big of a deal as swapchain integration tends to be a "special case" either way in most rendering backends. +The acquire and release semaphores have no analog in other modern APIs. + +Another related issue with WSI swapchains is that when using binary semaphores, it is not possible to use wait-before-signal. +The specification states that in order to submit a wait on a binary semaphore, all dependencies for that semaphore wait must have been submitted already. +This means that we need to potentially block a bit on host before we can call `vkQueuePresentKHR`. +The sample does this right before calling `ApiVulkanSample::submit_frame()`. + +[,cpp] +---- +// Before we call present, which uses a binary semaphore, we must ensure that all dependent submissions +// have been submitted, so that the presenting queue is unblocked at the time of calling. +wait_pending(async_compute_timeline_lock, main_thread_timeline.timeline); + +ApiVulkanSample::submit_frame(); +---- + +== Conclusion + +Timeline semaphores grant a lot of flexibility to applications. +With modern approaches of task graphs, many threads and free-flowing synchronization, timeline semaphores simplify a lot of things and remove the need for emulating a similar concept with binary semaphores and fences. 
+ +Be careful with out-of-order submissions. +There are various pitfalls with this approach which have been outlined in this sample. diff --git a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp index 84b9428b02..2251144e52 100644 --- a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp +++ b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp @@ -219,7 +219,7 @@ void TimelineSemaphore::finish_timeline_workers() } // Signal the timeline from the host. -void TimelineSemaphore::signal_timeline(Timeline::Stages stage) +void TimelineSemaphore::signal_timeline(const Timeline::Stages stage) { VkSemaphoreSignalInfo signalInfo; signalInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO; @@ -231,7 +231,7 @@ void TimelineSemaphore::signal_timeline(Timeline::Stages stage) } // Wait on the timeline from the host. -void TimelineSemaphore::wait_on_timeline(Timeline::Stages stage) +void TimelineSemaphore::wait_on_timeline(const Timeline::Stages stage) { const uint64_t waitValue = get_timeline_stage_value(stage); @@ -278,33 +278,23 @@ void TimelineSemaphore::wait_for_next_frame() } // Calculates the timeline value for the specified stage in the current frame -uint64_t TimelineSemaphore::get_timeline_stage_value(Timeline::Stages stage) +uint64_t TimelineSemaphore::get_timeline_stage_value(const Timeline::Stages stage) { return (timeline.frame * Timeline::MAX_STAGES) + stage; } void TimelineSemaphore::do_compute_work() { - setup_compute_resources(); - setup_compute_pipeline(); - compute.timer.start(); + while (compute_worker.alive) { - wait_on_timeline(Timeline::prepare); + // Wait for the main thread to signal that the workers can prepare and submit their work + wait_on_timeline(Timeline::submit); - if (timeline.frame == 0) - { - // Initialise the game of life on the first frame - build_compute_command_buffers(ComputeResources::init); - } - else - { - auto elapsed = 
static_cast(compute.timer.elapsed()); - auto command_type = (elapsed > 1.0f) ? ComputeResources::update : ComputeResources::mutate; + auto elapsed = static_cast(compute.timer.elapsed()); - build_compute_command_buffers(command_type, elapsed); - } + build_compute_command_buffers(elapsed); uint64_t signal_value = get_timeline_stage_value(Timeline::draw); VkTimelineSemaphoreSubmitInfo timeline_info = create_timeline_submit_info(0, nullptr, 1, &signal_value); @@ -316,9 +306,6 @@ void TimelineSemaphore::do_compute_work() submit_info.signalSemaphoreCount = 1; submit_info.pSignalSemaphores = &timeline.semaphore; - // Wait for the main thread to signal that the workers can submit to their queues - wait_on_timeline(Timeline::submit); - // If the threads are being killed, we need to skip the queue submission to allow the program to exit gracefully if (compute_worker.alive) { @@ -365,49 +352,77 @@ void TimelineSemaphore::setup_compute_resources() VK_CHECK(vkAllocateCommandBuffers(get_device().get_handle(), &alloc_info, &compute.command_buffer)); } -void TimelineSemaphore::build_compute_command_buffers(const ComputeResources::CommandType type, float elapsed) +void TimelineSemaphore::setup_game_of_life() { auto begin_info = vkb::initializers::command_buffer_begin_info(); begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; VK_CHECK(vkResetCommandBuffer(compute.command_buffer, 0)); VK_CHECK(vkBeginCommandBuffer(compute.command_buffer, &begin_info)); - auto frame_index = timeline.frame % NumAsyncFrames; - auto prev_index = (timeline.frame - 1) % NumAsyncFrames; + vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, &shared.storage_images[1], 0, nullptr); - vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, &shared.storage_images[frame_index], 0, nullptr); + // On the first iteration, we initialize the game of life. 
+ vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.init_pipeline); - switch (type) - { - case ComputeResources::init: - { - // On the first iteration, we initialize the game of life. - vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.init_pipeline); - } - break; + VkImageMemoryBarrier image_barrier = vkb::initializers::image_memory_barrier(); + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + image_barrier.image = shared.images[1]->get_handle(); + image_barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; - case ComputeResources::update: - { - vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.update_pipeline); - compute.timer.lap(); + // The semaphore takes care of srcStageMask. + vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); - // Bind previous iteration's texture. - vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 1, 1, &shared.sampled_images[prev_index], 0, nullptr); - } - break; + vkCmdDispatch(compute.command_buffer, grid_width / 8, grid_height / 8, 1); - case ComputeResources::mutate: - { - vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.mutate_pipeline); - vkCmdPushConstants(compute.command_buffer, compute.pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, - 0, sizeof(elapsed), &elapsed); + image_barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + image_barrier.dstAccessMask = 0; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL; + image_barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - // Bind previous iteration's texture. 
- vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 1, 1, &shared.sampled_images[prev_index], 0, nullptr); - } - break; + // The semaphore takes care of dstStageMask. + vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); + + VK_CHECK(vkEndCommandBuffer(compute.command_buffer)); + + VkSubmitInfo submit_info = vkb::initializers::submit_info(); + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &compute.command_buffer; + + VK_CHECK(vkQueueSubmit(compute.queue, 1, &submit_info, VK_NULL_HANDLE)); + + VK_CHECK(get_device().wait_idle()); +} + +void TimelineSemaphore::build_compute_command_buffers(const float elapsed) +{ + auto begin_info = vkb::initializers::command_buffer_begin_info(); + begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + VK_CHECK(vkResetCommandBuffer(compute.command_buffer, 0)); + VK_CHECK(vkBeginCommandBuffer(compute.command_buffer, &begin_info)); + + auto frame_index = timeline.frame % NumAsyncFrames; + auto prev_index = (timeline.frame - 1) % NumAsyncFrames; + + vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, &shared.storage_images[frame_index], 0, nullptr); + + if (elapsed > 1.0f) + { + vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.update_pipeline); + compute.timer.lap(); + } + else + { + vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.mutate_pipeline); + vkCmdPushConstants(compute.command_buffer, compute.pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(elapsed), &elapsed); } + // Bind previous iteration's texture. 
+ vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 1, 1, &shared.sampled_images[prev_index], 0, nullptr); + VkImageMemoryBarrier image_barrier = vkb::initializers::image_memory_barrier(); image_barrier.srcAccessMask = 0; image_barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; @@ -434,18 +449,16 @@ void TimelineSemaphore::build_compute_command_buffers(const ComputeResources::Co void TimelineSemaphore::do_graphics_work() { - setup_graphics_resources(); - setup_graphics_pipeline(); - while (graphics_worker.alive) { - wait_on_timeline(Timeline::prepare); + // Wait for the main thread to signal that the workers can prepare and submit their work + wait_on_timeline(Timeline::submit); build_graphics_command_buffer(); - uint64_t wait_values[] = {0, get_timeline_stage_value(Timeline::draw)}; - VkPipelineStageFlags wait_stage_masks[] = {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT}; - VkSemaphore wait_semaphores[] = {semaphores.acquired_image_ready, timeline.semaphore}; + uint64_t wait_values[] = {get_timeline_stage_value(Timeline::draw), 0}; + VkPipelineStageFlags wait_stage_masks[] = {VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT}; + VkSemaphore wait_semaphores[] = {timeline.semaphore, semaphores.acquired_image_ready}; uint64_t signal_values[] = {get_timeline_stage_value(Timeline::present), 0}; VkSemaphore signal_semaphores[] = {timeline.semaphore, semaphores.render_complete}; VkTimelineSemaphoreSubmitInfo timeline_info = create_timeline_submit_info(2, wait_values, 2, signal_values); @@ -460,9 +473,6 @@ void TimelineSemaphore::do_graphics_work() submit_info.commandBufferCount = 1; submit_info.pCommandBuffers = &graphics.command_buffer; - // Wait for the main thread to signal that the workers can submit to their queues - wait_on_timeline(Timeline::submit); - if (compute.queue == graphics.queue) { // If compute.queue == queue, we need synchronise access to the 
queue AND ensure that submissions are made in order @@ -558,9 +568,10 @@ void TimelineSemaphore::build_graphics_command_buffer() viewport.height = viewport.width; } + VK_CHECK(vkResetCommandBuffer(graphics.command_buffer, 0)); + auto begin_info = vkb::initializers::command_buffer_begin_info(); begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - VK_CHECK(vkResetCommandBuffer(graphics.command_buffer, 0)); VK_CHECK(vkBeginCommandBuffer(graphics.command_buffer, &begin_info)); VkRenderPassBeginInfo render_pass_begin = vkb::initializers::render_pass_begin_info(); @@ -607,11 +618,21 @@ bool TimelineSemaphore::prepare(const vkb::ApplicationOptions &options) } setup_shared_resources(); + + setup_compute_resources(); + setup_compute_pipeline(); + + setup_graphics_resources(); + setup_graphics_pipeline(); + + setup_game_of_life(); + create_timeline_semaphore(); start_timeline_workers(); prepared = true; + return true; } @@ -622,12 +643,9 @@ void TimelineSemaphore::render(float delta_time) return; } - // Signal to the worker threads that they can prepare their command buffers - signal_timeline(Timeline::prepare); - ApiVulkanSample::prepare_frame(); - // Signal to the worker threads that they can submit their work, then wait for the work to complete + // Signal to the worker threads that they can submit their work signal_timeline(Timeline::submit); // Wait for the worker threads to signal that the frame is ready to present diff --git a/samples/extensions/timeline_semaphore/timeline_semaphore.h b/samples/extensions/timeline_semaphore/timeline_semaphore.h index c2012435c6..090b12b515 100644 --- a/samples/extensions/timeline_semaphore/timeline_semaphore.h +++ b/samples/extensions/timeline_semaphore/timeline_semaphore.h @@ -51,13 +51,6 @@ class TimelineSemaphore : public ApiVulkanSample vkb::Timer timer; uint32_t queue_family_index; - - enum CommandType - { - init, - update, - mutate - }; } compute; // Resources used by both workers for storing/sampling images @@ 
-80,10 +73,9 @@ class TimelineSemaphore : public ApiVulkanSample // the stages to be reused without needing to recreate the semaphore. enum Stages { - prepare = 1, // Worker threads can begin preparing command buffers for submission - submit, // Worker threads can submit their command buffers, - draw, // The graphics worker can draw the current frame - present, // The main thread can present the frame to the display + submit = 1, // Worker threads can create and submit their command buffers, + draw, // The graphics worker can draw the current frame + present, // The main thread can present the frame to the display MAX_STAGES }; @@ -109,17 +101,18 @@ class TimelineSemaphore : public ApiVulkanSample void create_timeline_semaphore(); void start_timeline_workers(); void finish_timeline_workers(); + void signal_timeline(const Timeline::Stages stage); + void wait_on_timeline(const Timeline::Stages stage); void signal_next_frame(); void wait_for_next_frame(); - void signal_timeline(Timeline::Stages stage); - void wait_on_timeline(Timeline::Stages stage); - uint64_t get_timeline_stage_value(Timeline::Stages stage); + uint64_t get_timeline_stage_value(const Timeline::Stages stage); // Compute Work void do_compute_work(); void setup_compute_pipeline(); void setup_compute_resources(); - void build_compute_command_buffers(const ComputeResources::CommandType type, const float elapsed = 0.0f); + void setup_game_of_life(); + void build_compute_command_buffers(const float elapsed = 0.0f); // Graphics Work void do_graphics_work(); From f4b6744ef1d2724d9f2a0a9e113dcc6904b092bb Mon Sep 17 00:00:00 2001 From: Bryce Young Date: Fri, 14 Jun 2024 09:59:03 +0100 Subject: [PATCH 3/7] Updated data flow example in readme --- .../extensions/timeline_semaphore/README.adoc | 73 +++++++++---------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/samples/extensions/timeline_semaphore/README.adoc b/samples/extensions/timeline_semaphore/README.adoc index 235096dc35..88e02f1178 100644 
--- a/samples/extensions/timeline_semaphore/README.adoc +++ b/samples/extensions/timeline_semaphore/README.adoc @@ -203,49 +203,44 @@ After updating image N, the main thread will sample from image N. The sequential flow of the rendering is something like: -* Compute thread waits for "prepare" -* Graphics thread waits for prepare -* Main thread signals "prepare" -* Main thread acquires the next swapchain image -* Compute thread prepares command buffer -* Compute thread waits on "submit" -* Graphics thread prepares command buffer -* Graphics thread waits on "submit" -* Main thread signals "submit" -* Main thread waits for "present" -* Compute thread writes image -* Compute thread signals "draw" -* Compute thread waits for next frame -* Graphics thread reads image -* Graphics thread signals "present" -* Graphics thread waits for next frame -* Main thread presents swapchain -* Main thread signals "end of frame" -* Compute thread waits for "prepare" -* Graphics thread waits for prepare +* Compute: wait for "submit" +* Graphics: wait for "submit" +* Main: acquire the swapchain image +* Main: signal "submit" +* Main: wait for "present" +* Graphics: wait for "image_acquired" (binary semaphore) +* Graphics: wait for "draw" +* Compute: write image +* Compute: signal "draw" +* Compute: wait for "end of frame" +* Graphics: read image +* Graphics: signal "present" +* Graphics: wait for "end of frame" +* Main: present swapchain +* Main: signal "end of frame" +* Compute: wait for "submit" +* Graphics: wait for "submit" And so on ... With out of order signal, we can end up observing this order of submissions instead. 
-* Main thread signals "prepare" -* Main thread acquires the next swapchain image -* Main signals "submit" -* Compute thread waits for "prepare" -* Compute thread prepares command buffer -* Compute thread writes image -* Compute thread signals "draw" -* Compute thread waits for next frame -* Graphics thread waits for "prepare" -* Graphics thread prepares command buffer -* Graphics thread waits on "submit" -* Graphics thread reads image -* Graphics thread signals present -* Graphics thread waits for next frame -* Main thread presents swapchain -* Main thread signals end of frame -* Main thread signals "prepare" -* Compute thread waits for "prepare" -* Graphics thread waits for prepare +* Compute: wait for "submit" +* Graphics: wait for "submit" +* Main: acquire the swapchain image +* Main: signal "submit" +* Graphics: wait for "draw" +* Graphics: wait for "image_acquired" (binary semaphore) +* Compute: write image +* Compute: signal "draw" +* Graphics: read image +* Graphics: signal "present" +* Main: wait for "present" +* Main: present swapchain +* Compute: wait for "end of frame" +* Main: signal "end of frame" +* Graphics: wait for "end of frame" +* Compute: wait for "submit" +* Graphics: wait for "submit" When submitting out of order, it is important that you don't just submit work way ahead of where the GPU actually is, since the latency becomes extremely large. The natural place to keep submission explosion under control here is the place where we wait for the timeline on host, since we need to re-record command buffers anyways. 
From 9b1de705df1eeca2792da3cfc0cebc7f67459dc3 Mon Sep 17 00:00:00 2001 From: Bryce Young Date: Fri, 16 Aug 2024 12:33:32 +0100 Subject: [PATCH 4/7] Added changes from review comments --- framework/common/vk_initializers.h | 15 ++++- .../timeline_semaphore/timeline_semaphore.cpp | 65 ++++++++----------- .../timeline_semaphore/timeline_semaphore.h | 5 +- 3 files changed, 41 insertions(+), 44 deletions(-) diff --git a/framework/common/vk_initializers.h b/framework/common/vk_initializers.h index a9362d9337..175dc72813 100644 --- a/framework/common/vk_initializers.h +++ b/framework/common/vk_initializers.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2019-2022, Sascha Willems +/* Copyright (c) 2019-2024, Sascha Willems * * SPDX-License-Identifier: Apache-2.0 * @@ -546,7 +546,7 @@ inline VkPipelineMultisampleStateCreateInfo pipeline_multisample_state_create_in } inline VkPipelineDynamicStateCreateInfo pipeline_dynamic_state_create_info( - const VkDynamicState * dynamic_states, + const VkDynamicState *dynamic_states, uint32_t dynamicStateCount, VkPipelineDynamicStateCreateFlags flags = 0) { @@ -652,5 +652,16 @@ inline VkSpecializationInfo specialization_info(uint32_t map_entry_count, const specialization_info.pData = data; return specialization_info; } + +inline VkTimelineSemaphoreSubmitInfo timeline_semaphore_submit_info(uint32_t wait_value_count, uint64_t *wait_values, uint32_t signal_value_count, uint64_t *signal_values) +{ + return VkTimelineSemaphoreSubmitInfo{ + VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO, + NULL, + wait_value_count, + wait_values, + signal_value_count, + signal_values}; +} } // namespace initializers } // namespace vkb diff --git a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp index 2251144e52..76f46cb8f2 100644 --- a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp +++ b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp @@ -16,6 +16,7 @@ */ #include 
"timeline_semaphore.h" +#include "common/vk_initializers.h" // What we're trying to demonstrate here is: // - Out-of-order submission using threads which synchronize GPU work with each other using timeline semaphores. @@ -30,17 +31,6 @@ namespace static constexpr unsigned grid_width = 64; static constexpr unsigned grid_height = 64; -VkTimelineSemaphoreSubmitInfo create_timeline_submit_info(uint32_t waitValueCount, uint64_t *waitValue, uint32_t signalValueCount, uint64_t *signalValue) -{ - return VkTimelineSemaphoreSubmitInfo{ - VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO, - NULL, - waitValueCount, - waitValue, - signalValueCount, - signalValue}; -} - } // namespace TimelineSemaphore::TimelineSemaphore() @@ -73,6 +63,7 @@ TimelineSemaphore::~TimelineSemaphore() vkDestroyDescriptorSetLayout(vk_device, shared.storage_layout, nullptr); vkDestroyDescriptorSetLayout(vk_device, shared.sampled_layout, nullptr); + vkDestroyDescriptorPool(vk_device, shared.descriptor_pool, nullptr); vkDestroySemaphore(vk_device, timeline.semaphore, nullptr); } @@ -106,28 +97,29 @@ void TimelineSemaphore::setup_shared_resources() // Images and image views { - uint32_t queue_families[2]{}; - uint32_t num_queue_families{}; + const auto present_index = get_device().get_queue_by_present(0).get_family_index(); + std::vector queue_families{compute.queue_family_index}; - // Need CONCURRENT usage here since we will sample from the image - // in both graphics and compute queues. 
- if (get_device().get_queue_family_index(VK_QUEUE_COMPUTE_BIT) != - get_device().get_queue_by_present(0).get_family_index()) + if (graphics.queue_family_index != compute.queue_family_index) { - queue_families[0] = get_device().get_queue_by_present(0).get_family_index(); - queue_families[1] = get_device().get_queue_family_index(VK_QUEUE_COMPUTE_BIT); - num_queue_families = 2; + queue_families.push_back(graphics.queue_family_index); + } + + if (compute.queue_family_index != present_index && graphics.queue_family_index != present_index) + { + queue_families.push_back(present_index); } for (int i = 0; i < NumAsyncFrames; ++i) { + // Need CONCURRENT usage here since we will sample from the image in both graphics and compute queues. shared.images[i] = std::make_unique(get_device(), VkExtent3D{grid_width, grid_height, 1}, VK_FORMAT_R8G8B8A8_UNORM, VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, VMA_MEMORY_USAGE_GPU_ONLY, VK_SAMPLE_COUNT_1_BIT, 1, 1, VK_IMAGE_TILING_OPTIMAL, - 0, num_queue_families, queue_families); + 0, static_cast(queue_families.size()), queue_families.data()); shared.image_views[i] = std::make_unique(*shared.images[i], VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R8G8B8A8_UNORM); } @@ -155,15 +147,15 @@ void TimelineSemaphore::setup_shared_resources() for (int i = 0; i < NumAsyncFrames; ++i) { - VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &storage_alloc_info, &shared.storage_images[i])); - VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &sampled_alloc_info, &shared.sampled_images[i])); + VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &storage_alloc_info, &shared.storage_descriptor_sets[i])); + VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &sampled_alloc_info, &shared.sampled_descriptor_sets[i])); auto general_info = vkb::initializers::descriptor_image_info(VK_NULL_HANDLE, shared.image_views[i]->get_handle(), VK_IMAGE_LAYOUT_GENERAL); auto readonly_info = 
vkb::initializers::descriptor_image_info(VK_NULL_HANDLE, shared.image_views[i]->get_handle(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); const VkWriteDescriptorSet writes[2] = { - vkb::initializers::write_descriptor_set(shared.storage_images[i], VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 0, &general_info), - vkb::initializers::write_descriptor_set(shared.sampled_images[i], VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 0, &readonly_info), + vkb::initializers::write_descriptor_set(shared.storage_descriptor_sets[i], VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 0, &general_info), + vkb::initializers::write_descriptor_set(shared.sampled_descriptor_sets[i], VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 0, &readonly_info), }; vkUpdateDescriptorSets(get_device().get_handle(), 2, writes, 0, nullptr); @@ -173,6 +165,7 @@ void TimelineSemaphore::setup_shared_resources() void TimelineSemaphore::build_command_buffers() { + // Unused, but required to resolve pure virtual function inherited from ApiVulkanSample } void TimelineSemaphore::create_timeline_semaphore() @@ -297,7 +290,7 @@ void TimelineSemaphore::do_compute_work() build_compute_command_buffers(elapsed); uint64_t signal_value = get_timeline_stage_value(Timeline::draw); - VkTimelineSemaphoreSubmitInfo timeline_info = create_timeline_submit_info(0, nullptr, 1, &signal_value); + VkTimelineSemaphoreSubmitInfo timeline_info = vkb::initializers::timeline_semaphore_submit_info(0, nullptr, 1, &signal_value); VkSubmitInfo submit_info = vkb::initializers::submit_info(); submit_info.pNext = &timeline_info; @@ -359,7 +352,7 @@ void TimelineSemaphore::setup_game_of_life() VK_CHECK(vkResetCommandBuffer(compute.command_buffer, 0)); VK_CHECK(vkBeginCommandBuffer(compute.command_buffer, &begin_info)); - vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, &shared.storage_images[1], 0, nullptr); + vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, 
&shared.storage_descriptor_sets[1], 0, nullptr); // On the first iteration, we initialize the game of life. vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.init_pipeline); @@ -406,7 +399,7 @@ void TimelineSemaphore::build_compute_command_buffers(const float elapsed) auto frame_index = timeline.frame % NumAsyncFrames; auto prev_index = (timeline.frame - 1) % NumAsyncFrames; - vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, &shared.storage_images[frame_index], 0, nullptr); + vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, &shared.storage_descriptor_sets[frame_index], 0, nullptr); if (elapsed > 1.0f) { @@ -421,7 +414,7 @@ void TimelineSemaphore::build_compute_command_buffers(const float elapsed) } // Bind previous iteration's texture. - vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 1, 1, &shared.sampled_images[prev_index], 0, nullptr); + vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 1, 1, &shared.sampled_descriptor_sets[prev_index], 0, nullptr); VkImageMemoryBarrier image_barrier = vkb::initializers::image_memory_barrier(); image_barrier.srcAccessMask = 0; @@ -461,7 +454,7 @@ void TimelineSemaphore::do_graphics_work() VkSemaphore wait_semaphores[] = {timeline.semaphore, semaphores.acquired_image_ready}; uint64_t signal_values[] = {get_timeline_stage_value(Timeline::present), 0}; VkSemaphore signal_semaphores[] = {timeline.semaphore, semaphores.render_complete}; - VkTimelineSemaphoreSubmitInfo timeline_info = create_timeline_submit_info(2, wait_values, 2, signal_values); + VkTimelineSemaphoreSubmitInfo timeline_info = vkb::initializers::timeline_semaphore_submit_info(2, wait_values, 2, signal_values); VkSubmitInfo submit_info = vkb::initializers::submit_info(); submit_info.pNext = &timeline_info; 
@@ -592,7 +585,7 @@ void TimelineSemaphore::build_graphics_command_buffer() vkCmdSetViewport(graphics.command_buffer, 0, 1, &viewport); vkCmdSetScissor(graphics.command_buffer, 0, 1, &scissor); - vkCmdBindDescriptorSets(graphics.command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, graphics.pipeline_layout, 0, 1, &shared.sampled_images[frame_index], 0, nullptr); + vkCmdBindDescriptorSets(graphics.command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, graphics.pipeline_layout, 0, 1, &shared.sampled_descriptor_sets[frame_index], 0, nullptr); vkCmdDraw(graphics.command_buffer, 3, 1, 0, 0); draw_ui(graphics.command_buffer); @@ -617,12 +610,11 @@ bool TimelineSemaphore::prepare(const vkb::ApplicationOptions &options) return false; } + setup_compute_resources(); + setup_graphics_resources(); setup_shared_resources(); - setup_compute_resources(); setup_compute_pipeline(); - - setup_graphics_resources(); setup_graphics_pipeline(); setup_game_of_life(); @@ -657,11 +649,6 @@ void TimelineSemaphore::render(float delta_time) signal_next_frame(); } -bool TimelineSemaphore::resize(const uint32_t width, const uint32_t height) -{ - return ApiVulkanSample::resize(width, height); -} - std::unique_ptr create_timeline_semaphore() { return std::make_unique(); diff --git a/samples/extensions/timeline_semaphore/timeline_semaphore.h b/samples/extensions/timeline_semaphore/timeline_semaphore.h index 090b12b515..cfa0a17b29 100644 --- a/samples/extensions/timeline_semaphore/timeline_semaphore.h +++ b/samples/extensions/timeline_semaphore/timeline_semaphore.h @@ -58,8 +58,8 @@ class TimelineSemaphore : public ApiVulkanSample { VkDescriptorSetLayout storage_layout; VkDescriptorSetLayout sampled_layout; - VkDescriptorSet storage_images[NumAsyncFrames]; - VkDescriptorSet sampled_images[NumAsyncFrames]; + VkDescriptorSet storage_descriptor_sets[NumAsyncFrames]; + VkDescriptorSet sampled_descriptor_sets[NumAsyncFrames]; VkDescriptorPool descriptor_pool; std::unique_ptr immutable_sampler; @@ -123,7 +123,6 @@ 
class TimelineSemaphore : public ApiVulkanSample virtual void request_gpu_features(vkb::PhysicalDevice &gpu) override; virtual bool prepare(const vkb::ApplicationOptions &options) override; virtual void render(float delta_time) override; - virtual bool resize(const uint32_t width, const uint32_t height) override; }; std::unique_ptr create_timeline_semaphore(); From b676180cda0e6e43ad0fee68d5a69f75966b001c Mon Sep 17 00:00:00 2001 From: Bryce Young Date: Fri, 16 Aug 2024 13:19:22 +0100 Subject: [PATCH 5/7] Updated setup_shared_resources to use ImageBuilder --- .../timeline_semaphore/timeline_semaphore.cpp | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp index 76f46cb8f2..9770582466 100644 --- a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp +++ b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp @@ -98,6 +98,7 @@ void TimelineSemaphore::setup_shared_resources() // Images and image views { const auto present_index = get_device().get_queue_by_present(0).get_family_index(); + auto sharing_mode = VK_SHARING_MODE_CONCURRENT; std::vector queue_families{compute.queue_family_index}; if (graphics.queue_family_index != compute.queue_family_index) @@ -110,16 +111,24 @@ void TimelineSemaphore::setup_shared_resources() queue_families.push_back(present_index); } + if (queue_families.size() <= 1) + { + sharing_mode = VK_SHARING_MODE_EXCLUSIVE; + } + for (int i = 0; i < NumAsyncFrames; ++i) { // Need CONCURRENT usage here since we will sample from the image in both graphics and compute queues. 
- shared.images[i] = std::make_unique(get_device(), VkExtent3D{grid_width, grid_height, 1}, - VK_FORMAT_R8G8B8A8_UNORM, - VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, - VMA_MEMORY_USAGE_GPU_ONLY, - VK_SAMPLE_COUNT_1_BIT, - 1, 1, VK_IMAGE_TILING_OPTIMAL, - 0, static_cast(queue_families.size()), queue_families.data()); + shared.images[i] = std::make_unique(get_device(), vkb::core::ImageBuilder(VkExtent3D{grid_width, grid_height, 1}) + .with_format(VK_FORMAT_R8G8B8A8_UNORM) + .with_usage(VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT) + .with_vma_usage(VMA_MEMORY_USAGE_GPU_ONLY) + .with_sample_count(VK_SAMPLE_COUNT_1_BIT) + .with_mip_levels(1) + .with_array_layers(1) + .with_tiling(VK_IMAGE_TILING_OPTIMAL) + .with_queue_families(static_cast(queue_families.size()), queue_families.data()) + .with_sharing_mode(sharing_mode)); shared.image_views[i] = std::make_unique(*shared.images[i], VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R8G8B8A8_UNORM); } From 16b7194fdb1ccb33530ec8390631ae55b108c0e7 Mon Sep 17 00:00:00 2001 From: Bryce Young Date: Mon, 19 Aug 2024 11:18:02 +0100 Subject: [PATCH 6/7] Update game_of_life setup to initialise all images --- .../timeline_semaphore/timeline_semaphore.cpp | 61 ++++++++++--------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp index 9770582466..1f38348651 100644 --- a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp +++ b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp @@ -356,46 +356,49 @@ void TimelineSemaphore::setup_compute_resources() void TimelineSemaphore::setup_game_of_life() { - auto begin_info = vkb::initializers::command_buffer_begin_info(); - begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - VK_CHECK(vkResetCommandBuffer(compute.command_buffer, 0)); - VK_CHECK(vkBeginCommandBuffer(compute.command_buffer, &begin_info)); + for (int i = 0; 
i < NumAsyncFrames; ++i) + { + auto begin_info = vkb::initializers::command_buffer_begin_info(); + begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + VK_CHECK(vkResetCommandBuffer(compute.command_buffer, 0)); + VK_CHECK(vkBeginCommandBuffer(compute.command_buffer, &begin_info)); - vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, &shared.storage_descriptor_sets[1], 0, nullptr); + vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, &shared.storage_descriptor_sets[i], 0, nullptr); - // On the first iteration, we initialize the game of life. - vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.init_pipeline); + // On the first iteration, we initialize the game of life. + vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.init_pipeline); - VkImageMemoryBarrier image_barrier = vkb::initializers::image_memory_barrier(); - image_barrier.srcAccessMask = 0; - image_barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - image_barrier.image = shared.images[1]->get_handle(); - image_barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; - image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; - image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + VkImageMemoryBarrier image_barrier = vkb::initializers::image_memory_barrier(); + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + image_barrier.image = shared.images[i]->get_handle(); + image_barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; - // The semaphore takes care of srcStageMask. 
- vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); + // The semaphore takes care of srcStageMask. + vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); - vkCmdDispatch(compute.command_buffer, grid_width / 8, grid_height / 8, 1); + vkCmdDispatch(compute.command_buffer, grid_width / 8, grid_height / 8, 1); - image_barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - image_barrier.dstAccessMask = 0; - image_barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL; - image_barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + image_barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + image_barrier.dstAccessMask = 0; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL; + image_barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - // The semaphore takes care of dstStageMask. - vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); + // The semaphore takes care of dstStageMask. 
+ vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); - VK_CHECK(vkEndCommandBuffer(compute.command_buffer)); + VK_CHECK(vkEndCommandBuffer(compute.command_buffer)); - VkSubmitInfo submit_info = vkb::initializers::submit_info(); - submit_info.commandBufferCount = 1; - submit_info.pCommandBuffers = &compute.command_buffer; + VkSubmitInfo submit_info = vkb::initializers::submit_info(); + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &compute.command_buffer; - VK_CHECK(vkQueueSubmit(compute.queue, 1, &submit_info, VK_NULL_HANDLE)); + VK_CHECK(vkQueueSubmit(compute.queue, 1, &submit_info, VK_NULL_HANDLE)); - VK_CHECK(get_device().wait_idle()); + VK_CHECK(get_device().wait_idle()); + } } void TimelineSemaphore::build_compute_command_buffers(const float elapsed) From 0b32c0184363fee165eb3286199e5a5de59192f6 Mon Sep 17 00:00:00 2001 From: Bryce Young Date: Mon, 19 Aug 2024 15:06:14 +0100 Subject: [PATCH 7/7] Updates from review --- .../timeline_semaphore/timeline_semaphore.cpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp index 1f38348651..fd76a09094 100644 --- a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp +++ b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp @@ -356,13 +356,13 @@ void TimelineSemaphore::setup_compute_resources() void TimelineSemaphore::setup_game_of_life() { + auto begin_info = vkb::initializers::command_buffer_begin_info(); + begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + VK_CHECK(vkResetCommandBuffer(compute.command_buffer, 0)); + VK_CHECK(vkBeginCommandBuffer(compute.command_buffer, &begin_info)); + for (int i = 0; i < NumAsyncFrames; ++i) { - auto begin_info = 
vkb::initializers::command_buffer_begin_info(); - begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - VK_CHECK(vkResetCommandBuffer(compute.command_buffer, 0)); - VK_CHECK(vkBeginCommandBuffer(compute.command_buffer, &begin_info)); - vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, &shared.storage_descriptor_sets[i], 0, nullptr); // On the first iteration, we initialize the game of life. @@ -388,17 +388,17 @@ void TimelineSemaphore::setup_game_of_life() // The semaphore takes care of dstStageMask. vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); + } - VK_CHECK(vkEndCommandBuffer(compute.command_buffer)); + VK_CHECK(vkEndCommandBuffer(compute.command_buffer)); - VkSubmitInfo submit_info = vkb::initializers::submit_info(); - submit_info.commandBufferCount = 1; - submit_info.pCommandBuffers = &compute.command_buffer; + VkSubmitInfo submit_info = vkb::initializers::submit_info(); + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &compute.command_buffer; - VK_CHECK(vkQueueSubmit(compute.queue, 1, &submit_info, VK_NULL_HANDLE)); + VK_CHECK(vkQueueSubmit(compute.queue, 1, &submit_info, VK_NULL_HANDLE)); - VK_CHECK(get_device().wait_idle()); - } + VK_CHECK(get_device().wait_idle()); } void TimelineSemaphore::build_compute_command_buffers(const float elapsed)