diff --git a/framework/common/vk_initializers.h b/framework/common/vk_initializers.h index a9362d9337..175dc72813 100644 --- a/framework/common/vk_initializers.h +++ b/framework/common/vk_initializers.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2019-2022, Sascha Willems +/* Copyright (c) 2019-2024, Sascha Willems * * SPDX-License-Identifier: Apache-2.0 * @@ -546,7 +546,7 @@ inline VkPipelineMultisampleStateCreateInfo pipeline_multisample_state_create_in } inline VkPipelineDynamicStateCreateInfo pipeline_dynamic_state_create_info( - const VkDynamicState * dynamic_states, + const VkDynamicState *dynamic_states, uint32_t dynamicStateCount, VkPipelineDynamicStateCreateFlags flags = 0) { @@ -652,5 +652,16 @@ inline VkSpecializationInfo specialization_info(uint32_t map_entry_count, const specialization_info.pData = data; return specialization_info; } + +inline VkTimelineSemaphoreSubmitInfo timeline_semaphore_submit_info(uint32_t wait_value_count, uint64_t *wait_values, uint32_t signal_value_count, uint64_t *signal_values) +{ + return VkTimelineSemaphoreSubmitInfo{ + VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO, + NULL, + wait_value_count, + wait_values, + signal_value_count, + signal_values}; +} } // namespace initializers } // namespace vkb diff --git a/samples/extensions/timeline_semaphore/CMakeLists.txt b/samples/extensions/timeline_semaphore/CMakeLists.txt index c1fd093acf..4458805124 100644 --- a/samples/extensions/timeline_semaphore/CMakeLists.txt +++ b/samples/extensions/timeline_semaphore/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2021, Arm Limited and Contributors +# Copyright (c) 2021-2024, Arm Limited and Contributors # # SPDX-License-Identifier: Apache-2.0 # @@ -15,24 +15,19 @@ # limitations under the License. # -if (NOT WIN32) - # Not enabled on Windows at this time due to bugs. - # Out-of-order submission in presentation causes kernel level issues, - # and need to be figured out before this sample can be enabled on Windows. 
- get_filename_component(FOLDER_NAME ${CMAKE_CURRENT_LIST_DIR} NAME) - get_filename_component(PARENT_DIR ${CMAKE_CURRENT_LIST_DIR} PATH) - get_filename_component(CATEGORY_NAME ${PARENT_DIR} NAME) +get_filename_component(FOLDER_NAME ${CMAKE_CURRENT_LIST_DIR} NAME) +get_filename_component(PARENT_DIR ${CMAKE_CURRENT_LIST_DIR} PATH) +get_filename_component(CATEGORY_NAME ${PARENT_DIR} NAME) - add_sample_with_tags( - ID ${FOLDER_NAME} - CATEGORY ${CATEGORY_NAME} - AUTHOR "Hans-Kristian Arntzen" - NAME "Timeline semaphore" - DESCRIPTION "Demonstrates use of timeline semaphores to express complex queue dependency graphs" - SHADER_FILES_GLSL - "timeline_semaphore/game_of_life_update.comp" - "timeline_semaphore/game_of_life_mutate.comp" - "timeline_semaphore/game_of_life_init.comp" - "timeline_semaphore/render.vert" - "timeline_semaphore/render.frag") -endif() +add_sample_with_tags( + ID ${FOLDER_NAME} + CATEGORY ${CATEGORY_NAME} + AUTHOR "Hans-Kristian Arntzen" + NAME "Timeline semaphore" + DESCRIPTION "Demonstrates use of timeline semaphores to express complex queue dependency graphs" + SHADER_FILES_GLSL + "timeline_semaphore/game_of_life_update.comp" + "timeline_semaphore/game_of_life_mutate.comp" + "timeline_semaphore/game_of_life_init.comp" + "timeline_semaphore/render.vert" + "timeline_semaphore/render.frag") diff --git a/samples/extensions/timeline_semaphore/README.adoc b/samples/extensions/timeline_semaphore/README.adoc index a8ab58d924..88e02f1178 100644 --- a/samples/extensions/timeline_semaphore/README.adoc +++ b/samples/extensions/timeline_semaphore/README.adoc @@ -1,5 +1,5 @@ //// -- Copyright (c) 2021-2023, Arm Limited and Contributors +- Copyright (c) 2021-2024, Arm Limited and Contributors - - SPDX-License-Identifier: Apache-2.0 - @@ -189,69 +189,58 @@ This sample could trivially be done with binary semaphores of course, so in this === Async worker thread - out-of-order submission -The key aspect we use to demonstrate out of order submission is a dedicated 
worker thread which does all work related to simulation on the async compute queue. -It never synchronizes with the main thread except at teardown, so the only way it synchronizes is through timeline semaphores. -Submission order is completely out-of-order in this case and forward progress in the async queue is generally blocked by the main thread submitting more work. +The key aspects we use to demonstrate out of order submission are dedicated worker threads which perform all work related to simulation on the async compute queue, and drawing on the graphics queue. +They never synchronize with the main thread except at teardown, so the only way to synchronize them is through timeline semaphores. +To avoid issues when running the sample on Windows platforms (particularly when resizing the window), forward progress in the queues is throttled by the main thread (i.e. only allowing the timeline to advance +when a render call is active). + === Data flow To simulate "Game of Life", we allocate two images of 64x64 RGBA8. First, one image is initialized with initial state, and from here there is a ping-pong where image N is updated, while reading from image 1 - N. - After updating image N, the main thread will sample from image N. -Before async compute updates the same image index N again, it must wait for graphics queue to complete. -With the double buffer in play, the async queue can run ahead for a little while and it will be mostly stalled by graphics queue. - -The sequential flow of the rendering is something like, assuming two timeline semaphores A and G: - -* Async compute write image 1. -* Async compute signal A = 1. -* Graphics wait A = 1. -* Graphics read image 1. -* Graphics signal G = 1. -* Async compute wait A = 1. -(Could use pipeline barrier of course, but hey!) -* Async compute write image 0. -* Async compute signal A = 2. -* Graphics wait A = 2. -* Graphics read image 0. -* Graphics signal G = 2. -* Async compute wait G = 1. 
-(Resolve write-after-read hazard) -* Async compute wait A = 2. -(Could use pipeline barrier of course, but hey!) -* Async compute wait host A = 1. -(Wait for command buffer to retire so we can re-record it!) -* Async compute write image 1. -* Async compute signal A = 3. -* Graphics wait A = 3. -* Graphics read image 1. -* Graphics signal G = 3. + +The sequential flow of the rendering is something like: + +* Compute: wait for "submit" +* Graphics: wait for "submit" +* Main: acquires the swapchain image +* Main: signal "submit" +* Main: wait for "present" +* Compute: wait for "image_acquired" (binary semaphore) +* Graphics: wait for "draw" +* Compute: write image +* Compute: signal "draw" +* Compute: wait for "end of frame" +* Graphics: read image +* Graphics: signal "present" +* Graphics: wait for "end of frame" +* Main: present swapchain +* Main: signals "end of frame" +* Compute: wait for "submit" +* Graphics: wait for "submit" And so on ... With out of order signal, we can end up observing this order of submissions instead. -* Async compute write image 1. -* Async compute signal A = 1. -* Async compute wait A = 1. -* Async compute write image 0. -* Async compute signal A = 2. -* Async compute wait G = 1. -(Out of order submission, queue progress is stalled, but we can keep recording) -* Async compute wait A = 2. -* Async compute wait host A = 1. -* Async compute write image 1. -* Async compute signal A = 3. -* Graphics wait A = 1. -* Graphics read image 1. -* Graphics signal G = 1. -(Unblocks queue forward progress) -* Graphics wait A = 2. -* Graphics read image 0. -* Graphics signal G = 2. -* Graphics wait A = 3. -* Graphics read image 1. -* Graphics signal G = 3. 
+* Compute: wait for "submit" +* Graphics: wait for "submit" +* Main: acquires the swapchain image +* Main: signal "submit" +* Graphics: wait for "draw" +* Compute: wait for "image_acquired" (binary semaphore) +* Compute: write image +* Compute: signal "draw" +* Graphics: read image +* Graphics: signal "present" +* Main: wait for "present" +* Main: present swapchain +* Compute: wait for "end of frame" +* Main: signals "end of frame" +* Graphics: wait for "end of frame" +* Compute: wait for "submit" +* Graphics: wait for "submit" When submitting out of order, it is important that you don't just submit work way ahead of where the GPU actually is, since the latency becomes extremely large. The natural place to keep submission explosion under control here is the place where we wait for the timeline on host, since we need to re-record command buffers anyways. @@ -269,37 +258,44 @@ Instead, just wait for timeline semaphores on host to "drain" the GPU, or if you Similar to `vkDeviceWaitIdle`, when tearing down the application, an out-of-order submission might be waiting on work which never comes, and that queue becomes deadlocked. To alleviate this, we can make use of host signalling of timeline semaphores to unblock everything in one fell swoop. -From `TimelineSemaphore::finish()`: +From `TimelineSemaphore::finish_timeline_workers()`: + +[,cpp] +---- + graphics_worker.alive = false; + compute_worker.alive = false; + + signal_timeline(Timeline::MAX_STAGES); + + if (graphics_worker.thread.joinable()) + { + graphics_worker.thread.join(); + } + + if (compute_worker.thread.joinable()) + { + compute_worker.thread.join(); + } +---- + +From `TimelineSemaphore::finish_timeline_workers()`: [,cpp] ---- -// Draining queues which submit out-of-order can be quite tricky, since QueueWaitIdle can deadlock for threads which want to run ahead. 
-// If we call Submit waiting for a semaphore which is yet to be signalled, -// QueueWaitIdle will not finish until a signal in another thread happens. -// Here's an approach we can use to safely tear down the queue. - -// Drain the main thread timeline. -// The async queue might be stalled waiting on the main queue to finish rendering a future frame which it never completes, -// but we might never hit that count, since we're tearing down the application now. -wait_timeline_cpu(main_thread_timeline); - -// Now we're guaranteed that the graphics timeline is at N and the async compute queue is blocked at N + num_frames + 1, waiting for N + 1 to finish. -// Since we're not reading any more in graphics queue, we can jump bump the timeline on CPU towards infinity. -// On the next loop iteration, we will exit the rendering loop and QueueWaitIdle will not be blocked on async thread anymore. -// Just bump the timeline by INT32_MAX which is min-spec for maxTimelineSemaphoreValueDifference. -// This is a useful way to mark a timeline semaphore as "permanently" signalled. -main_thread_timeline.timeline += std::numeric_limits::max(); - -// Order matters here, this works kinda like a condition variable. -// If the timeline update is observed, we should see that the worker is not alive anymore. -async_compute_worker.alive = false; -signal_timeline_cpu(main_thread_timeline, main_thread_timeline_lock); - -// This will now complete in finite time. 
-if (async_compute_worker.thread.joinable()) -{ - async_compute_worker.thread.join(); -} + graphics_worker.alive = false; + compute_worker.alive = false; + + signal_timeline(Timeline::MAX_STAGES); + + if (graphics_worker.thread.joinable()) + { + graphics_worker.thread.join(); + } + + if (compute_worker.thread.joinable()) + { + compute_worker.thread.join(); + } ---- === Out-of-order submission fallbacks for single queue implementations diff --git a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp index ef861642d9..fd76a09094 100644 --- a/samples/extensions/timeline_semaphore/timeline_semaphore.cpp +++ b/samples/extensions/timeline_semaphore/timeline_semaphore.cpp @@ -16,112 +16,168 @@ */ #include "timeline_semaphore.h" +#include "common/vk_initializers.h" -static constexpr unsigned grid_width = 64; -static constexpr unsigned grid_height = 64; +// What we're trying to demonstrate here is: +// - Out-of-order submission using threads which synchronize GPU work with each other using timeline semaphores. +// In this sample we have dedicated worker threads for submitting work to the compute and graphics pipelines respectively, +// and the only synchronization with main thread happens via timeline semaphores. +// - Waiting for timeline semaphore on CPU to replace redundant fence objects. +// - Multiple waits on the same timeline. We don't need to worry about allocating and managing binary semaphores in complex scenarios. +// We can wait on the same timeline values as many times as we want, and we avoid all resource management problems that binary semaphores have. -// A simple variant of std::lock_guard which takes a condition for when to lock. -// We need this since we will only lock vkQueueSubmit when the worker thread needs to submit to the main thread as well. -// Otherwise, we will submit lock-free since we only need to externally synchronize the same VkQueue. 
-class ConditionalLockGuard +namespace { - public: - ConditionalLockGuard(std::mutex &lock_, bool cond_) : - lock(lock_), cond(cond_) - { - if (cond) - { - lock.lock(); - } - } - - ~ConditionalLockGuard() - { - if (cond) - { - lock.unlock(); - } - } +static constexpr unsigned grid_width = 64; +static constexpr unsigned grid_height = 64; - private: - std::mutex &lock; - bool cond; -}; +} // namespace TimelineSemaphore::TimelineSemaphore() { - title = "Timeline semaphore"; + title = "Timeline Semaphore"; - // Need to enable timeline semaphore extension. - add_instance_extension(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); add_device_extension(VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME); + add_instance_extension(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); } TimelineSemaphore::~TimelineSemaphore() { + if (prepared) + { + finish_timeline_workers(); + } + if (has_device()) { VkDevice vk_device = get_device().get_handle(); - vkDestroyPipelineLayout(vk_device, pipelines.compute_pipeline_layout, nullptr); - vkDestroyPipelineLayout(vk_device, pipelines.graphics_pipeline_layout, nullptr); - vkDestroyPipeline(vk_device, pipelines.visualize_pipeline, nullptr); - vkDestroyPipeline(vk_device, pipelines.compute_update_pipeline, nullptr); - vkDestroyPipeline(vk_device, pipelines.compute_mutate_pipeline, nullptr); - vkDestroyPipeline(vk_device, pipelines.compute_init_pipeline, nullptr); - vkDestroyDescriptorSetLayout(vk_device, descriptors.sampled_layout, nullptr); - vkDestroyDescriptorSetLayout(vk_device, descriptors.storage_layout, nullptr); - vkDestroyDescriptorPool(vk_device, descriptors.descriptor_pool, nullptr); - - vkDestroySemaphore(vk_device, main_thread_timeline.semaphore, nullptr); - vkDestroySemaphore(vk_device, async_compute_timeline.semaphore, nullptr); + vkDestroyCommandPool(vk_device, graphics.command_pool, nullptr); + vkDestroyPipelineLayout(vk_device, graphics.pipeline_layout, nullptr); + vkDestroyPipeline(vk_device, graphics.pipeline, 
nullptr); + + vkDestroyCommandPool(vk_device, compute.command_pool, nullptr); + vkDestroyPipelineLayout(vk_device, compute.pipeline_layout, nullptr); + vkDestroyPipeline(vk_device, compute.update_pipeline, nullptr); + vkDestroyPipeline(vk_device, compute.mutate_pipeline, nullptr); + vkDestroyPipeline(vk_device, compute.init_pipeline, nullptr); + + vkDestroyDescriptorSetLayout(vk_device, shared.storage_layout, nullptr); + vkDestroyDescriptorSetLayout(vk_device, shared.sampled_layout, nullptr); + vkDestroyDescriptorPool(vk_device, shared.descriptor_pool, nullptr); + + vkDestroySemaphore(vk_device, timeline.semaphore, nullptr); } } -void TimelineSemaphore::build_command_buffers() +void TimelineSemaphore::setup_shared_resources() { -} + // Descriptor pool + { + VkDescriptorPoolSize pool_sizes[2] = { + {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, NumAsyncFrames}, + {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, NumAsyncFrames}, + }; -void TimelineSemaphore::on_update_ui_overlay(vkb::Drawer &) -{ -} + VkDescriptorPoolCreateInfo pool_info = vkb::initializers::descriptor_pool_create_info(2, pool_sizes, 2 * NumAsyncFrames); + VK_CHECK(vkCreateDescriptorPool(get_device().get_handle(), &pool_info, nullptr, &shared.descriptor_pool)); + } -void TimelineSemaphore::finish() -{ - if (!has_device()) + // Sampler { - return; + auto sampler_create_info = vkb::initializers::sampler_create_info(); + sampler_create_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler_create_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler_create_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler_create_info.minFilter = VK_FILTER_NEAREST; + sampler_create_info.magFilter = VK_FILTER_NEAREST; + sampler_create_info.maxLod = VK_LOD_CLAMP_NONE; + sampler_create_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; + shared.immutable_sampler = std::make_unique(get_device(), sampler_create_info); + } + + // Images and image views + { + const auto present_index = 
get_device().get_queue_by_present(0).get_family_index(); + auto sharing_mode = VK_SHARING_MODE_CONCURRENT; + std::vector queue_families{compute.queue_family_index}; + + if (graphics.queue_family_index != compute.queue_family_index) + { + queue_families.push_back(graphics.queue_family_index); + } + + if (compute.queue_family_index != present_index && graphics.queue_family_index != present_index) + { + queue_families.push_back(present_index); + } + + if (queue_families.size() <= 1) + { + sharing_mode = VK_SHARING_MODE_EXCLUSIVE; + } + + for (int i = 0; i < NumAsyncFrames; ++i) + { + // Need CONCURRENT usage here since we will sample from the image in both graphics and compute queues. + shared.images[i] = std::make_unique(get_device(), vkb::core::ImageBuilder(VkExtent3D{grid_width, grid_height, 1}) + .with_format(VK_FORMAT_R8G8B8A8_UNORM) + .with_usage(VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT) + .with_vma_usage(VMA_MEMORY_USAGE_GPU_ONLY) + .with_sample_count(VK_SAMPLE_COUNT_1_BIT) + .with_mip_levels(1) + .with_array_layers(1) + .with_tiling(VK_IMAGE_TILING_OPTIMAL) + .with_queue_families(static_cast(queue_families.size()), queue_families.data()) + .with_sharing_mode(sharing_mode)); + + shared.image_views[i] = std::make_unique(*shared.images[i], VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R8G8B8A8_UNORM); + } + } + + // Descriptor layouts + { + VkDescriptorSetLayoutBinding storage_binding = vkb::initializers::descriptor_set_layout_binding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT, 0); + VkDescriptorSetLayoutBinding sampled_binding = vkb::initializers::descriptor_set_layout_binding(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_SHADER_STAGE_ALL, 0); + + VkSampler vk_immutable_sampler = shared.immutable_sampler->get_handle(); + sampled_binding.pImmutableSamplers = &vk_immutable_sampler; + + VkDescriptorSetLayoutCreateInfo storage_set_layout_info = vkb::initializers::descriptor_set_layout_create_info(&storage_binding, 1); + 
VkDescriptorSetLayoutCreateInfo sampled_set_layout_info = vkb::initializers::descriptor_set_layout_create_info(&sampled_binding, 1); + + VK_CHECK(vkCreateDescriptorSetLayout(get_device().get_handle(), &storage_set_layout_info, nullptr, &shared.storage_layout)); + VK_CHECK(vkCreateDescriptorSetLayout(get_device().get_handle(), &sampled_set_layout_info, nullptr, &shared.sampled_layout)); } - // Draining queues which submit out-of-order can be quite tricky, since QueueWaitIdle can deadlock for threads which want to run ahead. - // If we call Submit waiting for a semaphore which is yet to be signalled, - // QueueWaitIdle will not finish until a signal in another thread happens. - // Here's an approach we can use to safely tear down the queue. - - // Drain the main thread timeline. - // The async queue might be stalled waiting on the main queue to finish rendering a future frame which it never completes, - // but we might never hit that count, since we're tearing down the application now. - wait_timeline_cpu(main_thread_timeline); - - // Now we're guaranteed that the graphics timeline is at N and the async compute queue is blocked at N + num_frames + 1, waiting for N + 1 to finish. - // Since we're not reading any more in graphics queue, we can bump the timeline on CPU towards infinity. - // On the next loop iteration, we will exit the rendering loop and QueueWaitIdle will not be blocked on async thread anymore. - // Just bump the timeline by INT32_MAX which is min-spec for maxTimelineSemaphoreValueDifference. - // This is a useful way to mark a timeline semaphore as "permanently" signalled. - main_thread_timeline.timeline += std::numeric_limits::max(); - - // Order matters here, this works kinda like a condition variable. - // If the timeline update is observed, we should see that the worker is not alive anymore. - async_compute_worker.alive = false; - signal_timeline_cpu(main_thread_timeline, main_thread_timeline_lock); - - // This will now complete in finite time. 
- if (async_compute_worker.thread.joinable()) + // Descriptor sets { - async_compute_worker.thread.join(); + VkDescriptorSetAllocateInfo storage_alloc_info = vkb::initializers::descriptor_set_allocate_info(shared.descriptor_pool, &shared.storage_layout, 1); + VkDescriptorSetAllocateInfo sampled_alloc_info = vkb::initializers::descriptor_set_allocate_info(shared.descriptor_pool, &shared.sampled_layout, 1); + + for (int i = 0; i < NumAsyncFrames; ++i) + { + VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &storage_alloc_info, &shared.storage_descriptor_sets[i])); + VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &sampled_alloc_info, &shared.sampled_descriptor_sets[i])); + + auto general_info = vkb::initializers::descriptor_image_info(VK_NULL_HANDLE, shared.image_views[i]->get_handle(), VK_IMAGE_LAYOUT_GENERAL); + auto readonly_info = vkb::initializers::descriptor_image_info(VK_NULL_HANDLE, shared.image_views[i]->get_handle(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + + const VkWriteDescriptorSet writes[2] = { + vkb::initializers::write_descriptor_set(shared.storage_descriptor_sets[i], VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 0, &general_info), + vkb::initializers::write_descriptor_set(shared.sampled_descriptor_sets[i], VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 0, &readonly_info), + }; + + vkUpdateDescriptorSets(get_device().get_handle(), 2, writes, 0, nullptr); + } } } -void TimelineSemaphore::create_timeline_semaphore(Timeline &timeline) +void TimelineSemaphore::build_command_buffers() +{ + // Unused, but required to resolve pure virtual function inherited from ApiVulkanSample +} + +void TimelineSemaphore::create_timeline_semaphore() { // A timeline semaphore is still a semaphore, but it is of TIMELINE type rather than BINARY. 
VkSemaphoreCreateInfo create_info = vkb::initializers::semaphore_create_info(); @@ -133,256 +189,197 @@ void TimelineSemaphore::create_timeline_semaphore(Timeline &timeline) VK_CHECK(vkCreateSemaphore(get_device().get_handle(), &create_info, nullptr, &timeline.semaphore)); - timeline.timeline = 0; + timeline.frame = 0; } -void TimelineSemaphore::create_timeline_semaphores() +void TimelineSemaphore::start_timeline_workers() { - create_timeline_semaphore(main_thread_timeline); - create_timeline_semaphore(async_compute_timeline); -} + graphics_worker.alive = true; + graphics_worker.thread = std::thread([this]() { do_graphics_work(); }); -void TimelineSemaphore::create_timeline_worker(TimelineWorker &worker, std::function thread_func) -{ - worker.alive = true; - worker.thread = std::thread(std::move(thread_func)); + compute_worker.alive = true; + compute_worker.thread = std::thread([this]() { do_compute_work(); }); } -// Normally, signal and wait would be merged into a single submit info, -// but this would have made the sample a bit harder to read and reason about. -// For this reason, we split up signals, waits and executions. -void TimelineSemaphore::signal_timeline_gpu(VkQueue signal_queue, const Timeline &timeline, TimelineLock &lock) +void TimelineSemaphore::finish_timeline_workers() { - VkSubmitInfo submit = vkb::initializers::submit_info(); - submit.pSignalSemaphores = &timeline.semaphore; - submit.signalSemaphoreCount = 1; - - // When N semaphores are provided and at least one of them is a timeline semaphore, - // we must pass an auxillary pNext struct which provides which timeline values to use. 
- VkTimelineSemaphoreSubmitInfoKHR timeline_info{VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR}; - timeline_info.signalSemaphoreValueCount = 1; - timeline_info.pSignalSemaphoreValues = &timeline.timeline; + graphics_worker.alive = false; + compute_worker.alive = false; - submit.pNext = &timeline_info; + // The MAX_STAGES value is used to unblock all threads that are waiting on a timeline stage + signal_timeline(Timeline::MAX_STAGES); - // VkQueue needs to be externally synchronized in vkQueueSubmit if async_queue == queue. + if (graphics_worker.thread.joinable()) { - ConditionalLockGuard holder{submission_lock, async_queue == queue}; - VK_CHECK(vkQueueSubmit(signal_queue, 1, &submit, VK_NULL_HANDLE)); + graphics_worker.thread.join(); } - // This is a special case to handle a scenario where async_queue == queue as well. - // Out-of-order submit is not possible with a single queue since the queue will deadlock itself. - // Very few implementations only support one queue, but the sample should run on all implementations. - // We also need this to handle the fact that we currently cannot use out-of-order submissions with swapchain. - update_pending(lock, timeline.timeline); -} - -void TimelineSemaphore::wait_timeline_gpu(VkQueue wait_queue, const Timeline &timeline, TimelineLock &lock) -{ - if (timeline.timeline == 0) + if (compute_worker.thread.joinable()) { - // No-op. - return; + compute_worker.thread.join(); } +} - // This is a special case to handle a scenario where async_queue == queue as well. - // Out-of-order submit is not possible with a single queue since the queue will deadlock itself. - // Very few implementations only support one queue, but the sample should run on all implementations. - wait_pending_in_order_queue(lock, timeline.timeline); - - const VkPipelineStageFlags wait_stages = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; +// Signal the timeline from the host. 
+void TimelineSemaphore::signal_timeline(const Timeline::Stages stage) +{ + VkSemaphoreSignalInfo signalInfo; + signalInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO; + signalInfo.pNext = NULL; + signalInfo.semaphore = timeline.semaphore; + signalInfo.value = get_timeline_stage_value(stage); - VkSubmitInfo submit = vkb::initializers::submit_info(); - submit.pWaitSemaphores = &timeline.semaphore; - submit.pWaitDstStageMask = &wait_stages; - submit.waitSemaphoreCount = 1; + VK_CHECK(vkSignalSemaphoreKHR(get_device().get_handle(), &signalInfo)); +} - VkTimelineSemaphoreSubmitInfoKHR timeline_info{VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR}; - timeline_info.waitSemaphoreValueCount = 1; - timeline_info.pWaitSemaphoreValues = &timeline.timeline; +// Wait on the timeline from the host. +void TimelineSemaphore::wait_on_timeline(const Timeline::Stages stage) +{ + const uint64_t waitValue = get_timeline_stage_value(stage); - submit.pNext = &timeline_info; + VkSemaphoreWaitInfo waitInfo; + waitInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO; + waitInfo.pNext = NULL; + waitInfo.flags = 0; + waitInfo.semaphoreCount = 1; + waitInfo.pSemaphores = &timeline.semaphore; + waitInfo.pValues = &waitValue; - // VkQueue needs to be externally synchronized in vkQueueSubmit if async_queue == queue. - { - ConditionalLockGuard holder{submission_lock, async_queue == queue}; - VK_CHECK(vkQueueSubmit(wait_queue, 1, &submit, VK_NULL_HANDLE)); - } + VK_CHECK(vkWaitSemaphoresKHR(get_device().get_handle(), &waitInfo, UINT64_MAX)); } -void TimelineSemaphore::wait_timeline_cpu(const Timeline &timeline) +// Sends the MAX_STAGES signal for the current frame, then increments the frame counter +void TimelineSemaphore::signal_next_frame() { - // There is no distinction between fences and semaphores anymore. - // We can freely wait for a timeline semaphore on host. - // There is also no external synchronization requirement like with VkFence! 
- // This allows for a free flowing synchronization implementation which makes multithreading even nicer. - - VkSemaphoreWaitInfoKHR wait_info{VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR}; - wait_info.pSemaphores = &timeline.semaphore; - wait_info.semaphoreCount = 1; - wait_info.pValues = &timeline.timeline; - VK_CHECK(vkWaitSemaphoresKHR(get_device().get_handle(), &wait_info, UINT64_MAX)); -} + VkSemaphoreSignalInfo signalInfo; + signalInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO; + signalInfo.pNext = NULL; + signalInfo.semaphore = timeline.semaphore; + signalInfo.value = get_timeline_stage_value(Timeline::MAX_STAGES); -void TimelineSemaphore::signal_timeline_cpu(const Timeline &timeline, TimelineLock &lock) -{ - VkSemaphoreSignalInfoKHR signal_info{VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO_KHR}; - signal_info.semaphore = timeline.semaphore; - signal_info.value = timeline.timeline; - VK_CHECK(vkSignalSemaphoreKHR(get_device().get_handle(), &signal_info)); - - // This is a special case to handle a scenario where async_queue == queue as well. - // Out-of-order submit is not possible with a single queue since the queue will deadlock itself. - // Very few implementations only support one queue, but the sample should run on all implementations. - update_pending(lock, timeline.timeline); + timeline.frame++; + + VK_CHECK(vkSignalSemaphoreKHR(get_device().get_handle(), &signalInfo)); } -void TimelineSemaphore::update_pending(TimelineLock &lock, uint64_t timeline) +// Waits for the timeline to reach MAX_STAGES for the current frame +void TimelineSemaphore::wait_for_next_frame() { - // To support out-of-order signal and wait with a single queue we must do some workarounds. - // Normally, an application should not bother with multiple async queues - // if they have to be hammered onto one VkQueue in the end, - // but it can be useful to know about these problem scenarios up front. 
- // - // The other case where we need to ensure some kind of ordering is when waiting on a binary semaphore. - // Binary semaphores still have the requirement that all dependencies must already have been submitted, - // and we must still use binary semaphores for swapchain. - // - // To make the single queue scenario work, we must be able to guarantee that a wait is submitted after a signal, - // since we cannot signal on a queue once it is blocked by a wait. - // The only way to do this is to hold back submissions and ensure submissions happen in a forward-progress order. - // - // In this sample, we can achieve this with a condition variable where we wait until - // a pending signal has been submitted, but this approach does not work in all cases. - // It works here since we have a dedicated submission thread. - // It is always possible to add submission threads which may or may not be practical. - // - // This is called after signalling the timeline, which lets other submission threads know that it is safe to wait on - // any timeline value that is <= pending_timeline. - std::lock_guard holder{lock.lock}; - lock.pending_timeline = timeline; - lock.cond.notify_one(); + // MAX_STAGES is used as it provides a boundary value between the stages of this frame and the next + const uint64_t waitValue = (timeline.frame + 1) * Timeline::MAX_STAGES; + + VkSemaphoreWaitInfo waitInfo; + waitInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO; + waitInfo.pNext = NULL; + waitInfo.flags = 0; + waitInfo.semaphoreCount = 1; + waitInfo.pSemaphores = &timeline.semaphore; + waitInfo.pValues = &waitValue; + + VK_CHECK(vkWaitSemaphoresKHR(get_device().get_handle(), &waitInfo, UINT64_MAX)); } -void TimelineSemaphore::wait_pending(TimelineLock &lock, uint64_t timeline) +// Calculates the timeline value for the specified stage in the current frame +uint64_t TimelineSemaphore::get_timeline_stage_value(const Timeline::Stages stage) { - // See update_pending(). 
This is called before submitting a wait to the single VkQueue. - std::unique_lock holder{lock.lock}; - lock.cond.wait(holder, [&lock, timeline]() -> bool { - return lock.pending_timeline >= timeline; - }); + return (timeline.frame * Timeline::MAX_STAGES) + stage; } -void TimelineSemaphore::wait_pending_in_order_queue(TimelineLock &lock, uint64_t timeline) +void TimelineSemaphore::do_compute_work() { - if (async_queue == queue) + compute.timer.start(); + + while (compute_worker.alive) { - wait_pending(lock, timeline); + // Wait for the main thread to signal that the workers can prepare and submit their work + wait_on_timeline(Timeline::submit); + + auto elapsed = static_cast(compute.timer.elapsed()); + + build_compute_command_buffers(elapsed); + + uint64_t signal_value = get_timeline_stage_value(Timeline::draw); + VkTimelineSemaphoreSubmitInfo timeline_info = vkb::initializers::timeline_semaphore_submit_info(0, nullptr, 1, &signal_value); + + VkSubmitInfo submit_info = vkb::initializers::submit_info(); + submit_info.pNext = &timeline_info; + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &compute.command_buffer; + submit_info.signalSemaphoreCount = 1; + submit_info.pSignalSemaphores = &timeline.semaphore; + + // If the threads are being killed, we need to skip the queue submission to allow the program to exit gracefully + if (compute_worker.alive) + { + VK_CHECK(vkQueueSubmit(compute.queue, 1, &submit_info, VK_NULL_HANDLE)); + } + + wait_for_next_frame(); } } -// We want to achieve a pipeline where we're doing these in a double-buffered fashion: -// - Do async compute work, write buffer frame % 2, read buffer (frame - 1) % 2. -// - Blit results in main thread, read buffer frame % 2, write swapchain. 
+void TimelineSemaphore::setup_compute_pipeline() +{ + VkDescriptorSetLayout layouts[2] = {shared.storage_layout, shared.sampled_layout}; + auto layout_info = vkb::initializers::pipeline_layout_create_info(layouts, 2); -// What we're trying to demonstrate here is: -// - Out-of-order submission using threads which synchronize GPU work with each other using timeline semaphores. -// In this sample we have a dedicated worker thread which submits work to async compute, -// and the only synchronization with main thread happens via timeline semaphores. -// - Waiting for timeline semaphore on CPU to replace redundant fence objects. -// - Multiple waits on the same timeline. We don't need to worry about allocating and managing binary semaphores in complex scenarios. -// We can wait on the same timeline values as many times as we want, and we avoid all resource management problems that binary semaphores have. + VkPushConstantRange range = vkb::initializers::push_constant_range(VK_SHADER_STAGE_COMPUTE_BIT, sizeof(float), 0); + layout_info.pushConstantRangeCount = 1; + layout_info.pPushConstantRanges = ⦥ -void TimelineSemaphore::async_compute_loop() -{ - uint64_t iteration = 0; + VK_CHECK(vkCreatePipelineLayout(get_device().get_handle(), &layout_info, nullptr, &compute.pipeline_layout)); + VkComputePipelineCreateInfo info = vkb::initializers::compute_pipeline_create_info(compute.pipeline_layout); - vkb::Timer timer; - timer.start(); + info.stage = load_shader("timeline_semaphore/game_of_life_update.comp", VK_SHADER_STAGE_COMPUTE_BIT); + VK_CHECK(vkCreateComputePipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &compute.update_pipeline)); - // We're going to be recording commands on a thread, so make sure we have our own command pool. 
- VkCommandPool pool = get_device().create_command_pool(get_device().get_queue_family_index(VK_QUEUE_COMPUTE_BIT), - VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT); + info.stage = load_shader("timeline_semaphore/game_of_life_mutate.comp", VK_SHADER_STAGE_COMPUTE_BIT); + VK_CHECK(vkCreateComputePipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &compute.mutate_pipeline)); - // Pre-allocate N command buffers. We will however re-record them every iteration. - VkCommandBufferAllocateInfo alloc_info = - vkb::initializers::command_buffer_allocate_info(pool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, NumAsyncFrames); - VkCommandBuffer cmds[NumAsyncFrames]; - VK_CHECK(vkAllocateCommandBuffers(get_device().get_handle(), &alloc_info, cmds)); + info.stage = load_shader("timeline_semaphore/game_of_life_init.comp", VK_SHADER_STAGE_COMPUTE_BIT); + VK_CHECK(vkCreateComputePipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &compute.init_pipeline)); +} - while (async_compute_worker.alive) - { - iteration++; - unsigned frame_index = iteration % NumAsyncFrames; - VkCommandBuffer cmd = cmds[frame_index]; +void TimelineSemaphore::setup_compute_resources() +{ + // Get compute queue + compute.queue_family_index = get_device().get_queue_family_index(VK_QUEUE_COMPUTE_BIT); + vkGetDeviceQueue(get_device().get_handle(), compute.queue_family_index, 0, &compute.queue); - if (iteration >= NumAsyncFrames) - { - // Wait for main thread to be done reading from the buffer, before we clobber it. - wait_timeline_gpu(async_queue, {main_thread_timeline.semaphore, iteration - NumAsyncFrames}, main_thread_timeline_lock); + compute.command_pool = get_device().create_command_pool(compute.queue_family_index, VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT); - // We're going to re-record command buffers, wait on host here. This also ensures we don't endlessly submit commands to the async queues. 
- // The signalling of async compute timeline is gated somewhat on the main thread submitting work to the swapchain. - wait_timeline_cpu({async_compute_timeline.semaphore, iteration - NumAsyncFrames}); - } + VkCommandBufferAllocateInfo alloc_info = + vkb::initializers::command_buffer_allocate_info(compute.command_pool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, 1); - // Wait for last iteration to complete since we're going to read from the results. - // Could use pipeline barrier here certainly, but this is a sample - // where we can show how free-flowing queue synchronization can be. - wait_timeline_gpu(async_queue, async_compute_timeline, async_compute_timeline_lock); + VK_CHECK(vkAllocateCommandBuffers(get_device().get_handle(), &alloc_info, &compute.command_buffer)); +} - auto begin_info = vkb::initializers::command_buffer_begin_info(); - begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - VK_CHECK(vkResetCommandBuffer(cmd, 0)); - VK_CHECK(vkBeginCommandBuffer(cmd, &begin_info)); +void TimelineSemaphore::setup_game_of_life() +{ + auto begin_info = vkb::initializers::command_buffer_begin_info(); + begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + VK_CHECK(vkResetCommandBuffer(compute.command_buffer, 0)); + VK_CHECK(vkBeginCommandBuffer(compute.command_buffer, &begin_info)); - vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines.compute_pipeline_layout, 0, 1, - &descriptors.storage_images[frame_index], - 0, nullptr); + for (int i = 0; i < NumAsyncFrames; ++i) + { + vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, &shared.storage_descriptor_sets[i], 0, nullptr); - if (iteration == 1) - { - // On the first iteration, we initialize the game of life. 
- vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines.compute_init_pipeline); - } - else - { - auto elapsed = static_cast(timer.elapsed()); - - // Either we iterate the game every second, or we mutate it by changing colors gradually - // to make something more aesthetically interesting. - if (elapsed > 1.0f) - { - vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines.compute_update_pipeline); - timer.lap(); - } - else - { - vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines.compute_mutate_pipeline); - vkCmdPushConstants(cmd, pipelines.compute_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, - 0, sizeof(elapsed), &elapsed); - } - - // Bind previous iteration's texture. - vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines.compute_pipeline_layout, 1, 1, - &descriptors.sampled_images[(frame_index + (NumAsyncFrames - 1)) % NumAsyncFrames], - 0, nullptr); - } + // On the first iteration, we initialize the game of life. + vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.init_pipeline); VkImageMemoryBarrier image_barrier = vkb::initializers::image_memory_barrier(); image_barrier.srcAccessMask = 0; image_barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - image_barrier.image = images[frame_index]->get_handle(); + image_barrier.image = shared.images[i]->get_handle(); image_barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; // The semaphore takes care of srcStageMask. 
- vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - 0, 0, nullptr, 0, nullptr, 1, &image_barrier); + vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); - vkCmdDispatch(cmd, grid_width / 8, grid_height / 8, 1); + vkCmdDispatch(compute.command_buffer, grid_width / 8, grid_height / 8, 1); image_barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; image_barrier.dstAccessMask = 0; @@ -390,174 +387,134 @@ void TimelineSemaphore::async_compute_loop() image_barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; // The semaphore takes care of dstStageMask. - vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, - 0, 0, nullptr, 0, nullptr, 1, &image_barrier); - - VK_CHECK(vkEndCommandBuffer(cmd)); - - auto submit_info = vkb::initializers::submit_info(); - submit_info.commandBufferCount = 1; - submit_info.pCommandBuffers = &cmd; + vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); + } - // VkQueue needs to be externally synchronized in vkQueueSubmit if async_queue == queue. - { - ConditionalLockGuard holder{submission_lock, async_queue == queue}; - VK_CHECK(vkQueueSubmit(async_queue, 1, &submit_info, VK_NULL_HANDLE)); - } + VK_CHECK(vkEndCommandBuffer(compute.command_buffer)); - // Kicks shading work in main queue. - async_compute_timeline.timeline = iteration; - signal_timeline_gpu(async_queue, async_compute_timeline, async_compute_timeline_lock); - } + VkSubmitInfo submit_info = vkb::initializers::submit_info(); + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &compute.command_buffer; - // This QueueWaitIdle can be precarious. - // See TimelineSemaphore::finish() comments for why this is the case. 
- { - ConditionalLockGuard holder{submission_lock, async_queue == queue}; - vkQueueWaitIdle(async_queue); - } + VK_CHECK(vkQueueSubmit(compute.queue, 1, &submit_info, VK_NULL_HANDLE)); - // This also frees command buffers allocated from the pool. - vkDestroyCommandPool(get_device().get_handle(), pool, nullptr); + VK_CHECK(get_device().wait_idle()); } -void TimelineSemaphore::create_timeline_workers() +void TimelineSemaphore::build_compute_command_buffers(const float elapsed) { - create_timeline_worker(async_compute_worker, [this]() { async_compute_loop(); }); -} + auto begin_info = vkb::initializers::command_buffer_begin_info(); + begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + VK_CHECK(vkResetCommandBuffer(compute.command_buffer, 0)); + VK_CHECK(vkBeginCommandBuffer(compute.command_buffer, &begin_info)); -void TimelineSemaphore::prepare_queue() -{ - // Attempt to find a queue which is async compute. - // If we cannot find that queue family, at least try to find a queue which is not the "main" queue. - // If we have different queues we can safely use out of order signal and wait which is a core part of this sample. 
+ auto frame_index = timeline.frame % NumAsyncFrames; + auto prev_index = (timeline.frame - 1) % NumAsyncFrames; - auto &device = get_device(); - uint32_t family_index = device.get_queue_family_index(VK_QUEUE_COMPUTE_BIT); - uint32_t num_queues = device.get_num_queues_for_queue_family(family_index); + vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 0, 1, &shared.storage_descriptor_sets[frame_index], 0, nullptr); - for (uint32_t i = 0; i < num_queues; i++) + if (elapsed > 1.0f) { - auto &candidate = device.get_queue(family_index, i); - if (candidate.get_handle() != queue) - { - async_queue = candidate.get_handle(); - break; - } + vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.update_pipeline); + compute.timer.lap(); } - - if (!async_queue) + else { - // Fallback path. Cannot use out-of-order signal and wait here since the queue will deadlock itself. - // If this happens we need to add some locks and condition variables to make things work. - // See comments in TimelineSemaphore::update_pending(). - async_queue = queue; + vkCmdBindPipeline(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.mutate_pipeline); + vkCmdPushConstants(compute.command_buffer, compute.pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(elapsed), &elapsed); } -} -void TimelineSemaphore::create_resources() -{ - uint32_t queue_families[2]{}; - uint32_t num_queue_families{}; + // Bind previous iteration's texture. + vkCmdBindDescriptorSets(compute.command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, compute.pipeline_layout, 1, 1, &shared.sampled_descriptor_sets[prev_index], 0, nullptr); - // Need CONCURRENT usage here since we will sample from the image - // in both graphics and compute queues. 
- if (get_device().get_queue_family_index(VK_QUEUE_COMPUTE_BIT) != - get_device().get_queue_by_present(0).get_family_index()) - { - queue_families[0] = get_device().get_queue_by_present(0).get_family_index(); - queue_families[1] = get_device().get_queue_family_index(VK_QUEUE_COMPUTE_BIT); - num_queue_families = 2; - } + VkImageMemoryBarrier image_barrier = vkb::initializers::image_memory_barrier(); + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + image_barrier.image = shared.images[frame_index]->get_handle(); + image_barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; - for (int i = 0; i < NumAsyncFrames; i++) - { - images[i] = std::make_unique(get_device(), VkExtent3D{grid_width, grid_height, 1}, - VK_FORMAT_R8G8B8A8_UNORM, - VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, - VMA_MEMORY_USAGE_GPU_ONLY, - VK_SAMPLE_COUNT_1_BIT, - 1, 1, VK_IMAGE_TILING_OPTIMAL, - 0, num_queue_families, queue_families); - - image_views[i] = std::make_unique(*images[i], VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R8G8B8A8_UNORM); - } + // The semaphore takes care of srcStageMask. + vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); - // Boilerplate where we create a STORAGE_IMAGE descriptor set and SAMPLED_IMAGE descriptor set. 
- - auto sampler_create_info = vkb::initializers::sampler_create_info(); - sampler_create_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.minFilter = VK_FILTER_NEAREST; - sampler_create_info.magFilter = VK_FILTER_NEAREST; - sampler_create_info.maxLod = VK_LOD_CLAMP_NONE; - sampler_create_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; - immutable_sampler = std::make_unique(get_device(), sampler_create_info); - VkSampler vk_immutable_sampler = immutable_sampler->get_handle(); - - VkDescriptorSetLayoutBinding storage_binding = vkb::initializers::descriptor_set_layout_binding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT, 0); - VkDescriptorSetLayoutBinding sampled_binding = vkb::initializers::descriptor_set_layout_binding(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_SHADER_STAGE_ALL, 0); - sampled_binding.pImmutableSamplers = &vk_immutable_sampler; - VkDescriptorSetLayoutCreateInfo storage_set_layout_info = vkb::initializers::descriptor_set_layout_create_info(&storage_binding, 1); - VkDescriptorSetLayoutCreateInfo sampled_set_layout_info = vkb::initializers::descriptor_set_layout_create_info(&sampled_binding, 1); - - VK_CHECK(vkCreateDescriptorSetLayout(get_device().get_handle(), &storage_set_layout_info, nullptr, &descriptors.storage_layout)); - VK_CHECK(vkCreateDescriptorSetLayout(get_device().get_handle(), &sampled_set_layout_info, nullptr, &descriptors.sampled_layout)); - - VkDescriptorPoolSize pool_sizes[2] = { - {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, NumAsyncFrames}, - {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, NumAsyncFrames}, - }; - VkDescriptorPoolCreateInfo pool_info = vkb::initializers::descriptor_pool_create_info(2, pool_sizes, NumAsyncFrames * 2); - VK_CHECK(vkCreateDescriptorPool(get_device().get_handle(), &pool_info, nullptr, &descriptors.descriptor_pool)); - - 
VkDescriptorSetAllocateInfo storage_alloc_info = vkb::initializers::descriptor_set_allocate_info(descriptors.descriptor_pool, &descriptors.storage_layout, 1); - VkDescriptorSetAllocateInfo sampled_alloc_info = vkb::initializers::descriptor_set_allocate_info(descriptors.descriptor_pool, &descriptors.sampled_layout, 1); - for (int i = 0; i < NumAsyncFrames; i++) - { - VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &storage_alloc_info, &descriptors.storage_images[i])); - VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &sampled_alloc_info, &descriptors.sampled_images[i])); + vkCmdDispatch(compute.command_buffer, grid_width / 8, grid_height / 8, 1); - auto general_info = vkb::initializers::descriptor_image_info(VK_NULL_HANDLE, image_views[i]->get_handle(), VK_IMAGE_LAYOUT_GENERAL); - auto readonly_info = vkb::initializers::descriptor_image_info(VK_NULL_HANDLE, image_views[i]->get_handle(), VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + image_barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + image_barrier.dstAccessMask = 0; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL; + image_barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - const VkWriteDescriptorSet writes[2] = { - vkb::initializers::write_descriptor_set(descriptors.storage_images[i], VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 0, &general_info), - vkb::initializers::write_descriptor_set(descriptors.sampled_images[i], VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 0, &readonly_info), - }; - vkUpdateDescriptorSets(get_device().get_handle(), 2, writes, 0, nullptr); - } + // The semaphore takes care of dstStageMask. 
+ vkCmdPipelineBarrier(compute.command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 1, &image_barrier); + + VK_CHECK(vkEndCommandBuffer(compute.command_buffer)); } -void TimelineSemaphore::create_compute_pipeline() +void TimelineSemaphore::do_graphics_work() { - VkDescriptorSetLayout layouts[2] = {descriptors.storage_layout, descriptors.sampled_layout}; - auto layout_info = vkb::initializers::pipeline_layout_create_info(layouts, 2); + while (graphics_worker.alive) + { + // Wait for the main thread to signal that the workers can prepare and submit their work + wait_on_timeline(Timeline::submit); + + build_graphics_command_buffer(); + + uint64_t wait_values[] = {get_timeline_stage_value(Timeline::draw), 0}; + VkPipelineStageFlags wait_stage_masks[] = {VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT}; + VkSemaphore wait_semaphores[] = {timeline.semaphore, semaphores.acquired_image_ready}; + uint64_t signal_values[] = {get_timeline_stage_value(Timeline::present), 0}; + VkSemaphore signal_semaphores[] = {timeline.semaphore, semaphores.render_complete}; + VkTimelineSemaphoreSubmitInfo timeline_info = vkb::initializers::timeline_semaphore_submit_info(2, wait_values, 2, signal_values); + + VkSubmitInfo submit_info = vkb::initializers::submit_info(); + submit_info.pNext = &timeline_info; + submit_info.waitSemaphoreCount = 2; + submit_info.pWaitSemaphores = wait_semaphores; + submit_info.pWaitDstStageMask = wait_stage_masks; + submit_info.signalSemaphoreCount = 2; + submit_info.pSignalSemaphores = signal_semaphores; + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &graphics.command_buffer; + + if (compute.queue == graphics.queue) + { + // If compute.queue == queue, we need synchronise access to the queue AND ensure that submissions are made in order + // (otherwise the queue will deadlock itself). 
So we wait for the "draw" stage to be signalled on the host, before + // submitting the work. + wait_on_timeline(Timeline::draw); + } - VkPushConstantRange range = vkb::initializers::push_constant_range(VK_SHADER_STAGE_COMPUTE_BIT, sizeof(float), 0); - layout_info.pushConstantRangeCount = 1; - layout_info.pPushConstantRanges = ⦥ + // If the threads are being killed, we need to skip the queue submission to allow the program to exit gracefully + if (graphics_worker.alive) + { + VK_CHECK(vkQueueSubmit(graphics.queue, 1, &submit_info, VK_NULL_HANDLE)); + } + + wait_for_next_frame(); + } +} - VK_CHECK(vkCreatePipelineLayout(get_device().get_handle(), &layout_info, nullptr, &pipelines.compute_pipeline_layout)); - VkComputePipelineCreateInfo info = vkb::initializers::compute_pipeline_create_info(pipelines.compute_pipeline_layout); +void TimelineSemaphore::setup_graphics_resources() +{ + graphics.queue_family_index = get_device().get_queue_family_index(VK_QUEUE_GRAPHICS_BIT); + graphics.queue = queue; - info.stage = load_shader("timeline_semaphore/game_of_life_update.comp", VK_SHADER_STAGE_COMPUTE_BIT); - VK_CHECK(vkCreateComputePipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &pipelines.compute_update_pipeline)); + graphics.command_pool = get_device().create_command_pool(graphics.queue_family_index, VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT); - info.stage = load_shader("timeline_semaphore/game_of_life_mutate.comp", VK_SHADER_STAGE_COMPUTE_BIT); - VK_CHECK(vkCreateComputePipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &pipelines.compute_mutate_pipeline)); + VkCommandBufferAllocateInfo alloc_info = + vkb::initializers::command_buffer_allocate_info(graphics.command_pool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, 1); - info.stage = load_shader("timeline_semaphore/game_of_life_init.comp", VK_SHADER_STAGE_COMPUTE_BIT); - VK_CHECK(vkCreateComputePipelines(get_device().get_handle(), VK_NULL_HANDLE, 
1, &info, nullptr, &pipelines.compute_init_pipeline)); + VK_CHECK(vkAllocateCommandBuffers(get_device().get_handle(), &alloc_info, &graphics.command_buffer)); } -void TimelineSemaphore::create_graphics_pipeline() +void TimelineSemaphore::setup_graphics_pipeline() { - auto layout_info = vkb::initializers::pipeline_layout_create_info(&descriptors.sampled_layout, 1); - VK_CHECK(vkCreatePipelineLayout(get_device().get_handle(), &layout_info, nullptr, &pipelines.graphics_pipeline_layout)); + auto layout_info = vkb::initializers::pipeline_layout_create_info(&shared.sampled_layout, 1); + VK_CHECK(vkCreatePipelineLayout(get_device().get_handle(), &layout_info, nullptr, &graphics.pipeline_layout)); - VkGraphicsPipelineCreateInfo info = vkb::initializers::pipeline_create_info(pipelines.graphics_pipeline_layout, render_pass); + VkGraphicsPipelineCreateInfo info = vkb::initializers::pipeline_create_info(graphics.pipeline_layout, render_pass); VkPipelineVertexInputStateCreateInfo vertex_input_state = vkb::initializers::pipeline_vertex_input_state_create_info(); @@ -595,41 +552,14 @@ void TimelineSemaphore::create_graphics_pipeline() stages[0] = load_shader("timeline_semaphore/render.vert", VK_SHADER_STAGE_VERTEX_BIT); stages[1] = load_shader("timeline_semaphore/render.frag", VK_SHADER_STAGE_FRAGMENT_BIT); - VK_CHECK(vkCreateGraphicsPipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &pipelines.visualize_pipeline)); + VK_CHECK(vkCreateGraphicsPipelines(get_device().get_handle(), VK_NULL_HANDLE, 1, &info, nullptr, &graphics.pipeline)); } -void TimelineSemaphore::create_pipelines() +void TimelineSemaphore::build_graphics_command_buffer() { - create_compute_pipeline(); - create_graphics_pipeline(); -} - -bool TimelineSemaphore::prepare(const vkb::ApplicationOptions &options) -{ - if (!ApiVulkanSample::prepare(options)) - { - return false; - } - - create_resources(); - create_pipelines(); - prepare_queue(); - create_timeline_semaphores(); - 
create_timeline_workers(); - - prepared = true; - return true; -} - -void TimelineSemaphore::render(float delta_time) -{ - ApiVulkanSample::prepare_frame(); - - VK_CHECK(vkWaitForFences(get_device().get_handle(), 1, &wait_fences[current_buffer], VK_TRUE, UINT64_MAX)); - VK_CHECK(vkResetFences(get_device().get_handle(), 1, &wait_fences[current_buffer])); - - VkViewport viewport = {0.0f, 0.0f, static_cast(width), static_cast(height), 0.0f, 1.0f}; - VkRect2D scissor = {{0, 0}, {width, height}}; + auto frame_index = timeline.frame % NumAsyncFrames; + VkViewport viewport = {0.0f, 0.0f, static_cast(width), static_cast(height), 0.0f, 1.0f}; + VkRect2D scissor = {{0, 0}, {width, height}}; // Simple fix for 1:1 pixel aspect ratio. if (viewport.width > viewport.height) @@ -643,11 +573,11 @@ void TimelineSemaphore::render(float delta_time) viewport.height = viewport.width; } - recreate_current_command_buffer(); - auto cmd = draw_cmd_buffers[current_buffer]; + VK_CHECK(vkResetCommandBuffer(graphics.command_buffer, 0)); + auto begin_info = vkb::initializers::command_buffer_begin_info(); begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - vkBeginCommandBuffer(cmd, &begin_info); + VK_CHECK(vkBeginCommandBuffer(graphics.command_buffer, &begin_info)); VkRenderPassBeginInfo render_pass_begin = vkb::initializers::render_pass_begin_info(); render_pass_begin.renderPass = render_pass; @@ -661,54 +591,77 @@ void TimelineSemaphore::render(float delta_time) render_pass_begin.pClearValues = clears; render_pass_begin.framebuffer = framebuffers[current_buffer]; - vkCmdBeginRenderPass(cmd, &render_pass_begin, VK_SUBPASS_CONTENTS_INLINE); + vkCmdBeginRenderPass(graphics.command_buffer, &render_pass_begin, VK_SUBPASS_CONTENTS_INLINE); - vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelines.visualize_pipeline); - vkCmdSetViewport(cmd, 0, 1, &viewport); - vkCmdSetScissor(cmd, 0, 1, &scissor); + vkCmdBindPipeline(graphics.command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, 
graphics.pipeline); + vkCmdSetViewport(graphics.command_buffer, 0, 1, &viewport); + vkCmdSetScissor(graphics.command_buffer, 0, 1, &scissor); - main_thread_timeline.timeline++; - uint32_t frame_index = main_thread_timeline.timeline % NumAsyncFrames; - vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelines.graphics_pipeline_layout, - 0, 1, &descriptors.sampled_images[frame_index], 0, nullptr); - vkCmdDraw(cmd, 3, 1, 0, 0); + vkCmdBindDescriptorSets(graphics.command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, graphics.pipeline_layout, 0, 1, &shared.sampled_descriptor_sets[frame_index], 0, nullptr); + vkCmdDraw(graphics.command_buffer, 3, 1, 0, 0); - draw_ui(cmd); + draw_ui(graphics.command_buffer); - vkCmdEndRenderPass(cmd); + vkCmdEndRenderPass(graphics.command_buffer); - VK_CHECK(vkEndCommandBuffer(cmd)); - submit_info.commandBufferCount = 1; - submit_info.pCommandBuffers = &draw_cmd_buffers[current_buffer]; + VK_CHECK(vkEndCommandBuffer(graphics.command_buffer)); +} - // Wait for the async queue to have completed rendering. - wait_timeline_gpu(queue, {async_compute_timeline.semaphore, main_thread_timeline.timeline}, async_compute_timeline_lock); +void TimelineSemaphore::request_gpu_features(vkb::PhysicalDevice &gpu) +{ + // Need to enable the timelineSemaphore feature. + auto &features = gpu.request_extension_features( + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR); + features.timelineSemaphore = VK_TRUE; +} - // Need to hold the conditional lock during submit_frame as well since vkQueuePresentKHR uses the main queue as well. 
+bool TimelineSemaphore::prepare(const vkb::ApplicationOptions &options) +{ + if (!ApiVulkanSample::prepare(options)) { - ConditionalLockGuard holder{submission_lock, async_queue == queue}; - VK_CHECK(vkQueueSubmit(queue, 1, &submit_info, wait_fences[current_buffer])); + return false; + } - // Before we call present, which uses a binary semaphore, we must ensure that all dependent submissions - // have been submitted, so that the presenting queue is unblocked at the time of calling. - wait_pending(async_compute_timeline_lock, main_thread_timeline.timeline); + setup_compute_resources(); + setup_graphics_resources(); + setup_shared_resources(); - ApiVulkanSample::submit_frame(); - } + setup_compute_pipeline(); + setup_graphics_pipeline(); + + setup_game_of_life(); - // Let async queue know it is safe to clobber the image since main queue is done reading it. - signal_timeline_gpu(queue, main_thread_timeline, main_thread_timeline_lock); + create_timeline_semaphore(); + + start_timeline_workers(); + + prepared = true; + + return true; } -void TimelineSemaphore::request_gpu_features(vkb::PhysicalDevice &gpu) +void TimelineSemaphore::render(float delta_time) { - // Need to enable the timelineSemaphore feature. 
- auto &features = gpu.request_extension_features( - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR); - features.timelineSemaphore = VK_TRUE; + if (!prepared) + { + return; + } + + ApiVulkanSample::prepare_frame(); + + // Signal to the worker threads that they can submit their work + signal_timeline(Timeline::submit); + + // Wait for the worker threads to signal that the frame is ready to present + wait_on_timeline(Timeline::present); + + ApiVulkanSample::submit_frame(); + + // Signal to the worker threads that they can proceed to the next frame's work + signal_next_frame(); } -std::unique_ptr> create_timeline_semaphore() +std::unique_ptr create_timeline_semaphore() { return std::make_unique(); } diff --git a/samples/extensions/timeline_semaphore/timeline_semaphore.h b/samples/extensions/timeline_semaphore/timeline_semaphore.h index 0dc6c9e0f5..cfa0a17b29 100644 --- a/samples/extensions/timeline_semaphore/timeline_semaphore.h +++ b/samples/extensions/timeline_semaphore/timeline_semaphore.h @@ -18,98 +18,111 @@ #pragma once #include "api_vulkan_sample.h" -#include -#include -#include -#include class TimelineSemaphore : public ApiVulkanSample { public: - TimelineSemaphore(); - ~TimelineSemaphore(); + static const uint32_t NumAsyncFrames = 2; - private: - virtual void request_gpu_features(vkb::PhysicalDevice &gpu) override; - virtual void render(float delta_time) override; - virtual void build_command_buffers() override; - virtual void on_update_ui_overlay(vkb::Drawer &drawer) override; - virtual bool prepare(const vkb::ApplicationOptions &options) override; - virtual void finish() override; + // Resources for the graphics worker + struct GraphicsResources + { + VkQueue queue; + VkCommandPool command_pool; + VkCommandBuffer command_buffer; - void create_resources(); - void create_pipelines(); - void create_compute_pipeline(); - void create_graphics_pipeline(); + VkPipelineLayout pipeline_layout; + VkPipeline pipeline; - struct Pipelines - { - 
VkPipelineLayout compute_pipeline_layout{}; - VkPipelineLayout graphics_pipeline_layout{}; - VkPipeline visualize_pipeline{}; - VkPipeline compute_update_pipeline{}; - VkPipeline compute_mutate_pipeline{}; - VkPipeline compute_init_pipeline{}; - } pipelines; - - enum + uint32_t queue_family_index; + } graphics; + + // Resources for the compute worker + struct ComputeResources { - NumAsyncFrames = 2 - }; + VkQueue queue; + VkCommandPool command_pool; + VkCommandBuffer command_buffer; + + VkPipelineLayout pipeline_layout; + VkPipeline init_pipeline; + VkPipeline update_pipeline; + VkPipeline mutate_pipeline; - struct Descriptors + vkb::Timer timer; + uint32_t queue_family_index; + } compute; + + // Resources used by both workers for storing/sampling images + struct SharedResources { VkDescriptorSetLayout storage_layout; VkDescriptorSetLayout sampled_layout; - VkDescriptorSet storage_images[NumAsyncFrames]; - VkDescriptorSet sampled_images[NumAsyncFrames]; + VkDescriptorSet storage_descriptor_sets[NumAsyncFrames]; + VkDescriptorSet sampled_descriptor_sets[NumAsyncFrames]; VkDescriptorPool descriptor_pool; - } descriptors{}; - - std::unique_ptr immutable_sampler; - std::unique_ptr images[NumAsyncFrames]; - std::unique_ptr image_views[NumAsyncFrames]; - VkQueue async_queue{VK_NULL_HANDLE}; - void prepare_queue(); - std::mutex submission_lock; + std::unique_ptr immutable_sampler; + std::unique_ptr images[NumAsyncFrames]; + std::unique_ptr image_views[NumAsyncFrames]; + } shared; struct Timeline { - VkSemaphore semaphore; - uint64_t timeline; - }; - Timeline main_thread_timeline{}, async_compute_timeline{}; + // The stages of the timeline are enumerated, to make it easier to read which stage we are signalling/waiting on, and to allow + // the stages to be reused without needing to recreate the semaphore. 
+ enum Stages + { + submit = 1, // Worker threads can create and submit their command buffers, + draw, // The graphics worker can draw the current frame + present, // The main thread can present the frame to the display + MAX_STAGES + }; - struct TimelineLock - { - std::condition_variable cond; - std::mutex lock; - uint64_t pending_timeline; - }; - TimelineLock main_thread_timeline_lock{}, async_compute_timeline_lock{}; + VkSemaphore semaphore; + uint64_t frame; // Number of iterations through the timeline stages + } timeline; struct TimelineWorker { std::thread thread; std::atomic_bool alive; }; - TimelineWorker async_compute_worker; - void create_timeline_semaphores(); - void create_timeline_semaphore(Timeline &timeline); - - void create_timeline_workers(); - static void create_timeline_worker(TimelineWorker &worker, std::function thread_func); - - void async_compute_loop(); - - void signal_timeline_gpu(VkQueue queue, const Timeline &timeline, TimelineLock &lock); - void wait_timeline_gpu(VkQueue queue, const Timeline &timeline, TimelineLock &lock); - void wait_timeline_cpu(const Timeline &timeline); - void signal_timeline_cpu(const Timeline &timeline, TimelineLock &lock); - void update_pending(TimelineLock &lock, uint64_t timeline); - void wait_pending_in_order_queue(TimelineLock &lock, uint64_t timeline); - void wait_pending(TimelineLock &lock, uint64_t timeline); + + TimelineWorker graphics_worker, compute_worker; + + TimelineSemaphore(); + ~TimelineSemaphore(); + + void setup_shared_resources(); + void build_command_buffers() override; + + // Timeline operations + void create_timeline_semaphore(); + void start_timeline_workers(); + void finish_timeline_workers(); + void signal_timeline(const Timeline::Stages stage); + void wait_on_timeline(const Timeline::Stages stage); + void signal_next_frame(); + void wait_for_next_frame(); + uint64_t get_timeline_stage_value(const Timeline::Stages stage); + + // Compute Work + void do_compute_work(); + void 
setup_compute_pipeline(); + void setup_compute_resources(); + void setup_game_of_life(); + void build_compute_command_buffers(const float elapsed = 0.0f); + + // Graphics Work + void do_graphics_work(); + void setup_graphics_resources(); + void setup_graphics_pipeline(); + void build_graphics_command_buffer(); + + virtual void request_gpu_features(vkb::PhysicalDevice &gpu) override; + virtual bool prepare(const vkb::ApplicationOptions &options) override; + virtual void render(float delta_time) override; }; -std::unique_ptr> create_timeline_semaphore(); +std::unique_ptr create_timeline_semaphore();