From 88168db3e0eaa0665dd119ddf7704d5723267d3c Mon Sep 17 00:00:00 2001 From: Hyunjun Ko Date: Fri, 12 Dec 2025 09:21:52 +0100 Subject: [PATCH 1/2] decode: Add support for cross-queue family transfer operations When using linear output mode (ENABLE_LINEAR_OUTPUT), the decoder needs to copy decoded frames from optimal to linear images using a transfer operation. The original implementation assumed the transfer queue and video decode queue were in the same queue family, recording the copy operation directly in the decode command buffer. On some devices (e.g., Intel ANV), the video decode queue and transfer queue are in different queue families. This requires proper queue family ownership transfer using release/acquire barriers and semaphore synchronization between the queues. --- .../libs/VkVideoDecoder/VkVideoDecoder.cpp | 206 +++++++++++++++++- .../libs/VkVideoDecoder/VkVideoDecoder.h | 9 + 2 files changed, 207 insertions(+), 8 deletions(-) diff --git a/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.cpp b/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.cpp index 5ab91cfc..a9fdf4ab 100644 --- a/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.cpp +++ b/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.cpp @@ -549,6 +549,53 @@ int32_t VkVideoDecoder::StartVideoSequence(VkParserDetectedVideoFormat* pVideoFo // There will be no more than VulkanVideoFrameBuffer::maxImages frames in the queue. m_decodeFramesData.resize(std::max(maxDecodeFramesCount, VulkanVideoFrameBuffer::maxImages)); + // Check if transfer queue is in a different family than video decode queue + // and initialize transfer queue resources if needed + if (m_useTransferOperation == VK_TRUE) { + int32_t txQueueFamilyIdx = m_vkDevCtx->GetTransferQueueFamilyIdx(); + int32_t videoQueueFamilyIdx = m_vkDevCtx->GetVideoDecodeQueueFamilyIdx(); + + if (txQueueFamilyIdx != -1 && txQueueFamilyIdx != videoQueueFamilyIdx) { + m_useSeparateTransferQueue = VK_TRUE; + + // Create command pool for transfer queue if not already created + if (m_transferCommandPool == VK_NULL_HANDLE) { + VkCommandPoolCreateInfo cmdPoolInfo = {}; + cmdPoolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + cmdPoolInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + cmdPoolInfo.queueFamilyIndex = txQueueFamilyIdx; + result = m_vkDevCtx->CreateCommandPool(*m_vkDevCtx, &cmdPoolInfo, nullptr, &m_transferCommandPool); + if (result != VK_SUCCESS) { + fprintf(stderr, "\nERROR: CreateCommandPool() for transfer queue result: 0x%x\n", result); + m_useSeparateTransferQueue = VK_FALSE; + } + } + + // Allocate command buffers for transfer operations + if (m_transferCommandPool != VK_NULL_HANDLE && m_transferCommandBuffers.empty()) { + const uint32_t numTransferCmdBuffers = std::max(maxDecodeFramesCount, VulkanVideoFrameBuffer::maxImages); + VkCommandBufferAllocateInfo cmdInfo = {}; + cmdInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + cmdInfo.commandBufferCount = numTransferCmdBuffers; + cmdInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + cmdInfo.commandPool = m_transferCommandPool; + + m_transferCommandBuffers.resize(numTransferCmdBuffers); + result = m_vkDevCtx->AllocateCommandBuffers(*m_vkDevCtx, &cmdInfo, m_transferCommandBuffers.data()); + if (result != VK_SUCCESS) { + fprintf(stderr, "\nERROR: AllocateCommandBuffers() for transfer queue result: 0x%x\n", result); + m_transferCommandBuffers.clear(); + m_useSeparateTransferQueue = VK_FALSE; + } + } + + if (m_useSeparateTransferQueue) { + std::cout << "\t Using separate transfer queue (family " << txQueueFamilyIdx + << ") for decode output copy (video decode family: " << videoQueueFamilyIdx << ")" << std::endl; + } + } + } + int32_t availableBuffers = (int32_t)m_decodeFramesData.GetBitstreamBuffersQueue(). GetAvailableNodesNumber(); if (availableBuffers < m_numBitstreamBuffersToPreallocate) { @@ -1137,12 +1184,42 @@ int VkVideoDecoder::DecodePictureWithParameters(VkParserPerFrameDecodeParameters assert((pOutputPictureResource != nullptr) && (pOutputPictureResourceInfo != nullptr)); - CopyOptimalToLinearImage(frameDataSlot.commandBuffer, - *pOutputPictureResource, - *pOutputPictureResourceInfo, - *pFrameFilterOutResource, - *pFrameFilterOutResourceInfo, - &frameSynchronizationInfo); + if (m_useSeparateTransferQueue == VK_FALSE) { + // Same queue family - record copy directly in decode command buffer + CopyOptimalToLinearImage(frameDataSlot.commandBuffer, + *pOutputPictureResource, + *pOutputPictureResourceInfo, + *pFrameFilterOutResource, + *pFrameFilterOutResourceInfo, + &frameSynchronizationInfo); + } else { + // Different queue families - add release barrier for queue ownership transfer + int32_t videoQueueFamilyIdx = m_vkDevCtx->GetVideoDecodeQueueFamilyIdx(); + int32_t txQueueFamilyIdx = m_vkDevCtx->GetTransferQueueFamilyIdx(); + + const VkImageSubresourceRange imageSubresourceRange = { + VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, pOutputPictureResourceInfo->baseArrayLayer, 1 + }; + + // Release barrier: release ownership from video decode queue to transfer queue + VkImageMemoryBarrier2KHR releaseBarrier = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2_KHR }; + releaseBarrier.srcStageMask = VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR; + releaseBarrier.srcAccessMask = VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR; + releaseBarrier.dstStageMask = VK_PIPELINE_STAGE_2_NONE; + releaseBarrier.dstAccessMask = 0; + releaseBarrier.oldLayout = pOutputPictureResourceInfo->currentImageLayout; + releaseBarrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + releaseBarrier.srcQueueFamilyIndex = videoQueueFamilyIdx; + releaseBarrier.dstQueueFamilyIndex = txQueueFamilyIdx; + releaseBarrier.image = pOutputPictureResourceInfo->image; + releaseBarrier.subresourceRange = imageSubresourceRange; + + VkDependencyInfoKHR depInfo = { VK_STRUCTURE_TYPE_DEPENDENCY_INFO_KHR }; + depInfo.imageMemoryBarrierCount = 1; + depInfo.pImageMemoryBarriers = &releaseBarrier; + + m_vkDevCtx->CmdPipelineBarrier2KHR(frameDataSlot.commandBuffer, &depInfo); + } } m_vkDevCtx->EndCommandBuffer(frameDataSlot.commandBuffer); @@ -1155,10 +1232,10 @@ int VkVideoDecoder::DecodePictureWithParameters(VkParserPerFrameDecodeParameters videoDecodeCompleteFence = filterCmdBuffer->GetFence(); } - const uint32_t waitSemaphoreMaxCount = 3; + const uint32_t waitSemaphoreMaxCount = 4; VkSemaphoreSubmitInfoKHR waitSemaphoreInfos[waitSemaphoreMaxCount]{}; - const uint32_t signalSemaphoreMaxCount = 3; + const uint32_t signalSemaphoreMaxCount = 4; VkSemaphoreSubmitInfoKHR signalSemaphoreInfos[signalSemaphoreMaxCount]{}; uint32_t waitSemaphoreCount = 0; @@ -1322,6 +1399,101 @@ int VkVideoDecoder::DecodePictureWithParameters(VkParserPerFrameDecodeParameters } m_decodePicCount++; + // Submit transfer queue operation when using separate transfer queue for image copy + if (m_useSeparateTransferQueue == VK_TRUE && m_useTransferOperation == VK_TRUE) { + assert(pOutputPictureResource != nullptr); + assert(pOutputPictureResourceInfo != nullptr); + assert(frameDataSlot.slot < m_transferCommandBuffers.size()); + + VkCommandBuffer txCmdBuffer = m_transferCommandBuffers[frameDataSlot.slot]; + int32_t videoQueueFamilyIdx = m_vkDevCtx->GetVideoDecodeQueueFamilyIdx(); + int32_t txQueueFamilyIdx = m_vkDevCtx->GetTransferQueueFamilyIdx(); + + // Reset and begin transfer command buffer + m_vkDevCtx->ResetCommandBuffer(txCmdBuffer, 0); + + const VkCommandBufferBeginInfo cmdBufBeginInfo = { + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + nullptr, + 0, + nullptr + }; + m_vkDevCtx->BeginCommandBuffer(txCmdBuffer, &cmdBufBeginInfo); + + // Acquire barrier: acquire ownership from video decode queue to transfer queue + const VkImageSubresourceRange imageSubresourceRange = { + VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, pOutputPictureResourceInfo->baseArrayLayer, 1 + }; + + VkImageMemoryBarrier2KHR acquireBarrier = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2_KHR }; + acquireBarrier.srcStageMask = VK_PIPELINE_STAGE_2_NONE; + acquireBarrier.srcAccessMask = 0; + acquireBarrier.dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT; + acquireBarrier.dstAccessMask = VK_ACCESS_2_TRANSFER_READ_BIT; + acquireBarrier.oldLayout = pOutputPictureResourceInfo->currentImageLayout; + acquireBarrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + acquireBarrier.srcQueueFamilyIndex = videoQueueFamilyIdx; + acquireBarrier.dstQueueFamilyIndex = txQueueFamilyIdx; + acquireBarrier.image = pOutputPictureResourceInfo->image; + acquireBarrier.subresourceRange = imageSubresourceRange; + + VkDependencyInfoKHR acquireDependencyInfo = { VK_STRUCTURE_TYPE_DEPENDENCY_INFO_KHR }; + acquireDependencyInfo.imageMemoryBarrierCount = 1; + acquireDependencyInfo.pImageMemoryBarriers = &acquireBarrier; + + m_vkDevCtx->CmdPipelineBarrier2KHR(txCmdBuffer, &acquireDependencyInfo); + + // Perform the image copy in transfer queue + CopyOptimalToLinearImage(txCmdBuffer, + *pOutputPictureResource, + *pOutputPictureResourceInfo, + *pFrameFilterOutResource, + *pFrameFilterOutResourceInfo, + &frameSynchronizationInfo); + + m_vkDevCtx->EndCommandBuffer(txCmdBuffer); + + // Create a fence for this transfer operation + VkFence transferFence = VK_NULL_HANDLE; + const VkFenceCreateInfo fenceInfo = { VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, nullptr }; + result = m_vkDevCtx->CreateFence(*m_vkDevCtx, &fenceInfo, nullptr, &transferFence); + if (result != VK_SUCCESS) { + fprintf(stderr, "\nERROR: CreateFence() for transfer operation failed: 0x%x\n", result); + } + + VkCommandBufferSubmitInfoKHR txCmdBufferInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO_KHR }; + txCmdBufferInfo.commandBuffer = txCmdBuffer; + + VkSubmitInfo2KHR txSubmitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO_2_KHR }; + txSubmitInfo.waitSemaphoreInfoCount = signalSemaphoreCount; + txSubmitInfo.pWaitSemaphoreInfos = signalSemaphoreInfos; + txSubmitInfo.commandBufferInfoCount = 1; + txSubmitInfo.pCommandBufferInfos = &txCmdBufferInfo; + txSubmitInfo.signalSemaphoreInfoCount = 0; + txSubmitInfo.pSignalSemaphoreInfos = nullptr; + + result = m_vkDevCtx->MultiThreadedQueueSubmit(VulkanDeviceContext::TRANSFER, + 0, // transfer queue index + 1, + &txSubmitInfo, + transferFence, + "Transfer Copy", + picNumInDecodeOrder); + if (result != VK_SUCCESS) { + fprintf(stderr, "\nERROR: Transfer queue submit failed: 0x%x\n", result); + } + + result = m_vkDevCtx->WaitForFences(*m_vkDevCtx, 1, &transferFence, true, gFenceTimeout); + if (result != VK_SUCCESS) { + fprintf(stderr, "\nERROR: WaitForFences() for transfer operation failed: 0x%x\n", result); + } + m_vkDevCtx->DestroyFence(*m_vkDevCtx, transferFence, nullptr); + + if (m_dumpDecodeData) { + std::cout << "\t => Transfer queue submitted for frame copy" << std::endl; + } + } + if (m_enableDecodeComputeFilter) { assert(filterCmdBuffer != nullptr); @@ -1527,6 +1699,24 @@ void VkVideoDecoder::Deinitialize() m_hwLoadBalancingTimelineSemaphore = VK_NULL_HANDLE; } + // Wait for transfer queue to complete before cleanup + if (m_useSeparateTransferQueue == VK_TRUE) { + m_vkDevCtx->MultiThreadedQueueWaitIdle(VulkanDeviceContext::TRANSFER, 0); + } + + // Clean up transfer queue resources + if (m_transferCommandPool != VK_NULL_HANDLE) { + if (!m_transferCommandBuffers.empty()) { + m_vkDevCtx->FreeCommandBuffers(*m_vkDevCtx, m_transferCommandPool, + (uint32_t)m_transferCommandBuffers.size(), + m_transferCommandBuffers.data()); + m_transferCommandBuffers.clear(); + } + m_vkDevCtx->DestroyCommandPool(*m_vkDevCtx, m_transferCommandPool, nullptr); + m_transferCommandPool = VK_NULL_HANDLE; + } + m_useSeparateTransferQueue = VK_FALSE; + m_videoFrameBuffer = nullptr; m_decodeFramesData.deinit(); m_videoSession = nullptr; diff --git a/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.h b/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.h index 05314b4a..7243d2c2 100644 --- a/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.h +++ b/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.h @@ -231,6 +231,9 @@ class VkVideoDecoder : public IVulkanVideoDecoderHandler { , m_useTransferOperation(VK_FALSE) , m_resetDecoder(VK_TRUE) , m_dumpDecodeData(VK_FALSE) + , m_useSeparateTransferQueue(VK_FALSE) + , m_transferCommandPool(VK_NULL_HANDLE) + , m_transferCommandBuffers() , m_numImageTypes(1) // At least the decoder requires images for DPB , m_numImageTypesEnabled(DecodeFrameBufferIf::IMAGE_TYPE_MASK_DECODE_DPB) , m_imageSpecsIndex() @@ -333,6 +336,12 @@ class VkVideoDecoder : public IVulkanVideoDecoderHandler { uint32_t m_useTransferOperation : 1; uint32_t m_resetDecoder : 1; uint32_t m_dumpDecodeData : 1; + uint32_t m_useSeparateTransferQueue : 1; // True when video decode and transfer queues are different families + + // Transfer queue resources for cross-queue family image copy operations + VkCommandPool m_transferCommandPool; + std::vector m_transferCommandBuffers; + uint32_t m_numImageTypes; uint32_t m_numImageTypesEnabled; DecodeFrameBufferIf::ImageSpecsIndex m_imageSpecsIndex; From 20ec8d61fc732884fcc66993991633f08cb458a2 Mon Sep 17 00:00:00 2001 From: Hyunjun Ko Date: Tue, 16 Dec 2025 07:52:32 +0100 Subject: [PATCH 2/2] decode: Create transfer queue when video decode queue lacks transfer support Even if without noPresent, if the option -o is provided, we need to try to create a transfer queue when video decode queue lacks transfer support. --- vk_video_decoder/demos/vk-video-dec/Main.cpp | 2 +- vk_video_decoder/test/vulkan-video-dec/Main.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vk_video_decoder/demos/vk-video-dec/Main.cpp b/vk_video_decoder/demos/vk-video-dec/Main.cpp index 281d7e68..cd8067be 100644 --- a/vk_video_decoder/demos/vk-video-dec/Main.cpp +++ b/vk_video_decoder/demos/vk-video-dec/Main.cpp @@ -118,7 +118,7 @@ int main(int argc, const char **argv) vkDevCtxt.CreateVulkanDevice(numDecodeQueues, // numDecodeQueues 0, // num encode queues videoCodecOperation, // videoCodecs - false, // createTransferQueue + ((vkDevCtxt.GetVideoDecodeQueueFlag() & VK_QUEUE_TRANSFER_BIT) == 0), // createTransferQueue true, // createGraphicsQueue true, // createDisplayQueue requestVideoComputeQueueMask != 0 // createComputeQueue diff --git a/vk_video_decoder/test/vulkan-video-dec/Main.cpp b/vk_video_decoder/test/vulkan-video-dec/Main.cpp index fe405cdc..63f7d3c0 100644 --- a/vk_video_decoder/test/vulkan-video-dec/Main.cpp +++ b/vk_video_decoder/test/vulkan-video-dec/Main.cpp @@ -129,7 +129,7 @@ int main(int argc, const char** argv) vkDevCtxt.CreateVulkanDevice(numDecodeQueues, 0, // num encode queues videoCodec, - false, // createTransferQueue + ((vkDevCtxt.GetVideoDecodeQueueFlag() & VK_QUEUE_TRANSFER_BIT) == 0), // createTransferQueue true, // createGraphicsQueue true, // createDisplayQueue requestVideoComputeQueueMask != 0 // createComputeQueue