diff --git a/cmake/XglOptions.cmake b/cmake/XglOptions.cmake index d67546db..614a76b0 100644 --- a/cmake/XglOptions.cmake +++ b/cmake/XglOptions.cmake @@ -42,6 +42,9 @@ macro(xgl_options) option(XGL_BUILD_NAVI23 "Build open source vulkan for Navi23" ON) + option(XGL_BUILD_TESTS "Build all tests?" OFF) + + # Deprecated, use XGL_BUILD_TESTS instead. option(XGL_BUILD_LIT "Build with Lit test?" OFF) option(XGL_BUILD_CACHE_CREATOR "Build cache-creator tools?" OFF) diff --git a/cmake/XglOverrides.cmake b/cmake/XglOverrides.cmake index 8e41acd9..caddce50 100644 --- a/cmake/XglOverrides.cmake +++ b/cmake/XglOverrides.cmake @@ -185,9 +185,13 @@ macro(xgl_overrides_vkgc) set(LLPC_CLIENT_INTERFACE_MAJOR_VERSION ${ICD_LLPC_CLIENT_MAJOR_VERSION} CACHE STRING "${PROJECT_NAME} override." FORCE) if(ICD_BUILD_LLPC) + set(LLPC_BUILD_TESTS ${XGL_BUILD_TESTS} CACHE BOOL "${PROJECT_NAME} override." FORCE) set(LLPC_BUILD_LIT ${XGL_BUILD_LIT} CACHE BOOL "${PROJECT_NAME} override." FORCE) + if(XGL_BUILD_LIT) + message(DEPRECATION "XGL_BUILD_LIT is deprecated, use XGL_BUILD_TESTS instead") + endif() set(LLPC_BUILD_NAVI12 ${XGL_BUILD_NAVI12} CACHE BOOL "${PROJECT_NAME} override." FORCE) set(LLPC_BUILD_NAVI22 ${XGL_BUILD_NAVI22} CACHE BOOL "${PROJECT_NAME} override." FORCE) diff --git a/icd/CMakeLists.txt b/icd/CMakeLists.txt index b3a19396..3d6d61d9 100644 --- a/icd/CMakeLists.txt +++ b/icd/CMakeLists.txt @@ -136,9 +136,6 @@ target_sources(xgl PRIVATE api/vk_descriptor_update_template.cpp api/appopt/barrier_filter_layer.cpp api/appopt/strange_brigade_layer.cpp - api/appopt/async_layer.cpp - api/appopt/async_shader_module.cpp - api/appopt/async_partial_pipeline.cpp api/appopt/g_shader_profile.cpp api/render_state_cache.cpp api/renderpass/renderpass_builder.cpp diff --git a/icd/Loader/LunarG/Lnx/amd-icd.json b/icd/Loader/LunarG/Lnx/amd-icd.json index 025cf513..5a668070 100644 --- a/icd/Loader/LunarG/Lnx/amd-icd.json +++ b/icd/Loader/LunarG/Lnx/amd-icd.json @@ -2,13 +2,13 @@ "file_format_version": "1.0.0", "ICD": { "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.2.188" + "api_version": "1.2.191" }, "layer": { "name": "VK_LAYER_AMD_switchable_graphics_@ISABITS@", "type": "GLOBAL", "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.2.188", + "api_version": "1.2.191", "implementation_version": "1", "description": "AMD switchable graphics layer", "functions": { diff --git a/icd/api/appopt/async_layer.cpp b/icd/api/appopt/async_layer.cpp deleted file mode 100644 index 11cb3ee1..00000000 --- a/icd/api/appopt/async_layer.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_layer.cpp -* @brief Implementation of async compiler layer. -*********************************************************************************************************************** -*/ -#include "async_layer.h" -#include "async_shader_module.h" -#include "async_partial_pipeline.h" - -#include "include/vk_device.h" -#include "include/vk_shader.h" -#include "include/vk_graphics_pipeline.h" -#include "include/vk_compute_pipeline.h" -#include "palListImpl.h" - -namespace vk -{ - -namespace entry -{ - -namespace async -{ - -// ===================================================================================================================== -VKAPI_ATTR VkResult VKAPI_CALL vkCreateShaderModule( - VkDevice device, - const VkShaderModuleCreateInfo* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkShaderModule* pShaderModule) -{ - Device* pDevice = ApiDevice::ObjectFromHandle(device); - const VkAllocationCallbacks* pAllocCB = pAllocator ? pAllocator : pDevice->VkInstance()->GetAllocCallbacks(); - return vk::async::ShaderModule::Create(pDevice, pCreateInfo, pAllocCB, pShaderModule); -} - -// ===================================================================================================================== -VKAPI_ATTR void VKAPI_CALL vkDestroyShaderModule( - VkDevice device, - VkShaderModule shaderModule, - const VkAllocationCallbacks* pAllocator) -{ - if (shaderModule != VK_NULL_HANDLE) - { - Device* pDevice = ApiDevice::ObjectFromHandle(device); - AsyncLayer* pAsyncLayer = pDevice->GetAsyncLayer(); - const VkAllocationCallbacks* pAllocCB = pAllocator ? 
pAllocator : pDevice->VkInstance()->GetAllocCallbacks(); - - pAsyncLayer->SyncAll(); - vk::async::ShaderModule::ObjectFromHandle(shaderModule)->Destroy(pDevice, pAllocCB); - } -} - -// ===================================================================================================================== -VKAPI_ATTR VkResult VKAPI_CALL vkCreateGraphicsPipelines( - VkDevice device, - VkPipelineCache pipelineCache, - uint32_t createInfoCount, - const VkGraphicsPipelineCreateInfo* pCreateInfos, - const VkAllocationCallbacks* pAllocator, - VkPipeline* pPipelines) -{ - Device* pDevice = ApiDevice::ObjectFromHandle(device); - AsyncLayer* pAsyncLayer = pDevice->GetAsyncLayer(); - VkResult result = VK_SUCCESS; - - for (uint32_t i = 0; (i < createInfoCount) && (result == VK_SUCCESS); ++i) - { - VkGraphicsPipelineCreateInfo createInfo = pCreateInfos[i]; - VkPipelineShaderStageCreateInfo stages[ShaderStage::ShaderStageGfxCount]; - VK_ASSERT(createInfo.stageCount <= ShaderStage::ShaderStageGfxCount); - for (uint32_t stage = 0; stage < createInfo.stageCount; ++stage) - { - stages[stage] = createInfo.pStages[stage]; - vk::async::ShaderModule* pModule = vk::async::ShaderModule::ObjectFromHandle(stages[stage].module); - stages[stage].module = pModule->GetNextLayerModule(); - } - createInfo.pStages = stages; - result = ASYNC_CALL_NEXT_LAYER(vkCreateGraphicsPipelines)(device, - pipelineCache, - 1, - &createInfo, - pAllocator, - pPipelines + i); - } - - return result; -} - -// ===================================================================================================================== -VKAPI_ATTR VkResult VKAPI_CALL vkCreateComputePipelines( - VkDevice device, - VkPipelineCache pipelineCache, - uint32_t createInfoCount, - const VkComputePipelineCreateInfo* pCreateInfos, - const VkAllocationCallbacks* pAllocator, - VkPipeline* pPipelines) -{ - Device* pDevice = ApiDevice::ObjectFromHandle(device); - AsyncLayer* pAsyncLayer = pDevice->GetAsyncLayer(); - VkResult result = VK_SUCCESS; - - for (uint32_t i = 0; (i < createInfoCount) && (result == VK_SUCCESS); ++i) - { - VkComputePipelineCreateInfo createInfo = pCreateInfos[i]; - VK_ASSERT(createInfo.stage.module != VK_NULL_HANDLE); - vk::async::ShaderModule* pModule = vk::async::ShaderModule::ObjectFromHandle(createInfo.stage.module); - createInfo.stage.module = pModule->GetNextLayerModule(); - result = ASYNC_CALL_NEXT_LAYER(vkCreateComputePipelines)(device, - pipelineCache, - 1, - &createInfo, - pAllocator, - pPipelines + i); - } - - return result; -} - -} // namespace async - -} // namespace entry - -// ===================================================================================================================== -AsyncLayer::AsyncLayer(Device* pDevice) - : - m_pDevice(pDevice), - m_pModuleTaskThreads(), - m_pPipelineTaskThreads() -{ - Util::SystemInfo sysInfo = {}; - Util::QuerySystemInfo(&sysInfo); - - for (uint32_t i = 0; i < MaxTaskType; ++i) - { - m_taskId[i] = 0; - m_activeThreadCount[i] = Util::Min(MaxThreads, sysInfo.cpuLogicalCoreCount / 2); - } - for (uint32_t i = 0; i < m_activeThreadCount[0]; ++i) - { - m_pModuleTaskThreads[i] = VK_PLACEMENT_NEW(m_moduleTaskThreadBuffer[i]) - async::TaskThread(this, pDevice->VkInstance()->Allocator()); - m_pModuleTaskThreads[i]->Begin(); - - m_pPipelineTaskThreads[i] = VK_PLACEMENT_NEW(m_pipelineTaskThreadBuffer[i]) - async::TaskThread(this, pDevice->VkInstance()->Allocator()); - m_pPipelineTaskThreads[i]->Begin(); - } -} - -// 
===================================================================================================================== -AsyncLayer::~AsyncLayer() -{ - for (uint32_t i = 0; i < m_activeThreadCount[0]; ++i) - { - m_pModuleTaskThreads[i]->SetStop(); - m_pModuleTaskThreads[i]->Join(); - Util::Destructor(m_pModuleTaskThreads[i]); - m_pModuleTaskThreads[i] = nullptr; - - m_pPipelineTaskThreads[i]->SetStop(); - m_pPipelineTaskThreads[i]->Join(); - Util::Destructor(m_pPipelineTaskThreads[i]); - m_pPipelineTaskThreads[i] = nullptr; - } -} - -// ===================================================================================================================== -void AsyncLayer::SyncAll() -{ - for (uint32_t i = 0; i < m_activeThreadCount[0]; ++i) - { - m_pModuleTaskThreads[i]->SyncAll(); - m_pPipelineTaskThreads[i]->SyncAll(); - } -} - -// ===================================================================================================================== -void AsyncLayer::OverrideDispatchTable( - DispatchTable* pDispatchTable) -{ - // Save current device dispatch table to use as the next layer. - m_nextLayer = *pDispatchTable; - - ASYNC_OVERRIDE_ENTRY(vkCreateShaderModule); - ASYNC_OVERRIDE_ENTRY(vkDestroyShaderModule); - ASYNC_OVERRIDE_ENTRY(vkCreateGraphicsPipelines); - ASYNC_OVERRIDE_ENTRY(vkCreateComputePipelines); -} - -} // namespace vk diff --git a/icd/api/appopt/async_layer.h b/icd/api/appopt/async_layer.h deleted file mode 100644 index c7b3bf6e..00000000 --- a/icd/api/appopt/async_layer.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_layer.h -* @brief Declaration of async compiler layer -*********************************************************************************************************************** -*/ - -#ifndef __ASYNC_LAYER_H__ -#define __ASYNC_LAYER_H__ - -#pragma once - -#include "opt_layer.h" -#include "async_task_thread.h" - -namespace vk -{ - -class Device; -class AsyncLayer; -class PalAllocator; - -namespace async { class ShaderModule; class PartialPipeline; } - -// Represents the shader module async compile info -struct ShaderModuleTask -{ - VkShaderModuleCreateInfo info; // Shader module create info - async::ShaderModule* pObj; // Output shader module object -}; - -// Represents the pipeline async compile info -struct PartialPipelineTask -{ - VkShaderModule shaderModuleHandle; // Shader module handle - async::PartialPipeline* pObj; // Output shader module object -}; - -// Thread task type -enum TaskType : uint32_t -{ - ShaderModuleTaskType = 0, - PartialPipelineTaskType, - MaxTaskType, -}; - -// ===================================================================================================================== -// Class that specifies dispatch table override behavior for async compiler layers -class AsyncLayer final : public OptLayer -{ -public: - AsyncLayer(Device* pDevice); - virtual ~AsyncLayer(); - - virtual void OverrideDispatchTable(DispatchTable* pDispatchTable) override; - - Device* GetDevice() { return m_pDevice; } - - void* GetTaskThread(TaskType type) - { - VK_ASSERT(type < MaxTaskType); - if (type == ShaderModuleTaskType) - { - return (m_activeThreadCount[type] > 0) ? - m_pModuleTaskThreads[(m_taskId[type]++) % m_activeThreadCount[type]] : - nullptr; - } - else - { - return (m_activeThreadCount[type] > 0) ? 
- m_pPipelineTaskThreads[(m_taskId[type]++) % m_activeThreadCount[type]] : - nullptr; - } - } - - void SyncAll(); - -protected: - static constexpr uint32_t MaxThreads = 8; // Max thread count for shader module compile - Device* m_pDevice; // Vulkan Device object - async::TaskThread* m_pModuleTaskThreads[MaxThreads]; // Async compiler threads - async::TaskThread* m_pPipelineTaskThreads[MaxThreads]; // Async compiler threads - uint32_t m_taskId[MaxTaskType]; // Hint to select compile thread - uint32_t m_activeThreadCount[MaxTaskType]; // Active thread count - // Internal buffer for m_taskThreadBuffer - uint8_t m_moduleTaskThreadBuffer[MaxThreads][sizeof(async::TaskThread)]; - uint8_t m_pipelineTaskThreadBuffer[MaxThreads] - [sizeof(async::TaskThread)]; - -private: - PAL_DISALLOW_COPY_AND_ASSIGN(AsyncLayer); -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#define ASYNC_OVERRIDE_ALIAS(entry_name, func_name) \ - pDispatchTable->OverrideEntryPoints()->entry_name = vk::entry::async::func_name - -#define ASYNC_OVERRIDE_ENTRY(entry_name) ASYNC_OVERRIDE_ALIAS(entry_name, entry_name) -// Helper function to call the next layer's function by name -#define ASYNC_CALL_NEXT_LAYER(entry_name) \ - pAsyncLayer->GetNextLayer()->GetEntryPoints().entry_name - -} // namespace vk - -#endif /* __OPT_LAYER_H__ */ diff --git a/icd/api/appopt/async_partial_pipeline.cpp b/icd/api/appopt/async_partial_pipeline.cpp deleted file mode 100644 index d1f952a6..00000000 --- a/icd/api/appopt/async_partial_pipeline.cpp +++ /dev/null @@ -1,397 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_partial_pipeline.cpp -* @brief Implementation of class async::PartialPipeline -*********************************************************************************************************************** -*/ -#include "async_layer.h" -#include "async_partial_pipeline.h" - -#include "include/vk_device.h" -#include "include/vk_shader.h" -#include "palListImpl.h" - -#include - -namespace vk -{ - -namespace async -{ -// ===================================================================================================================== -PartialPipeline::PartialPipeline( - const VkAllocationCallbacks* pAllocator) - : - m_pAllocator(pAllocator) -{ -} - -// ===================================================================================================================== -// Creates async partial pipeline object -PartialPipeline* PartialPipeline::Create( - Device* pDevice, - const VkAllocationCallbacks* pAllocator) -{ - const size_t objSize = sizeof(PartialPipeline); - void* pMemory = pAllocator->pfnAllocation( - pAllocator->pUserData, - objSize, - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - - if (pMemory == nullptr) - { - return nullptr; - } - - VK_PLACEMENT_NEW(pMemory) PartialPipeline(pAllocator); - - return static_cast(pMemory); -} - -// ===================================================================================================================== -// Destory async partial pipeline object -VkResult PartialPipeline::Destroy() -{ - m_pAllocator->pfnFree(m_pAllocator->pUserData, this); - - return VK_SUCCESS; -} - -// ===================================================================================================================== -// Builds partial pipeline in async mode -void PartialPipeline::AsyncBuildPartialPipeline( - AsyncLayer* pAsyncLayer, - VkShaderModule asyncShaderModule) -{ - auto pTaskThread = reinterpret_cast*> - (pAsyncLayer->GetTaskThread(PartialPipelineTaskType)); - if (pTaskThread != nullptr) - { - PartialPipelineTask task = {}; - - task.shaderModuleHandle = asyncShaderModule; - task.pObj = this; - pTaskThread->AddTask(&task); - } - else - { - Destroy(); - } -} - -static const uint32_t OffsetStrideInDwords = 12; -// ===================================================================================================================== -// Creat ResourceMappingNode from module data -void PartialPipeline::CreatePipelineLayoutFromModuleData( - AsyncLayer* pAsyncLayer, - Vkgc::ShaderModuleEntryData* pShaderModuleEntryData, - const Vkgc::ResourceMappingRootNode** ppResourceMappingNode, - uint32_t* pMappingNodeCount) -{ - const Vkgc::ResourceNodeData* pResourceNodeData = pShaderModuleEntryData->pResNodeDatas; - uint32_t resNodeDataCount = pShaderModuleEntryData->resNodeDataCount; - uint32_t pushConstSize = pShaderModuleEntryData->pushConstSize; - uint32_t setCount = 0; - uint32_t set = 0; - - if (resNodeDataCount > 0) - { - set = pResourceNodeData[0].set; - setCount = 1; - for (uint32_t i = 1; i < resNodeDataCount; ++i) - { - if (set != pResourceNodeData[i].set) - { - set = pResourceNodeData[i].set; - ++setCount; - } - } - } - - // 1 reperents push constant - uint32_t totalNodes = pushConstSize != 0 ? 
resNodeDataCount + setCount + 1 : resNodeDataCount + setCount; - - auto pSets = static_cast(m_pAllocator->pfnAllocation( - m_pAllocator->pUserData, - totalNodes * sizeof(Vkgc::ResourceMappingRootNode), - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)); - auto pNodes = reinterpret_cast(pSets + setCount + 1); - uint32_t topLevelOffset = 0; - - for (uint32_t i = 0; i < resNodeDataCount; ++i) - { - pNodes[i].type = pResourceNodeData[i].type; - pNodes[i].sizeInDwords = OffsetStrideInDwords * pResourceNodeData[i].arraySize; - pNodes[i].offsetInDwords = pResourceNodeData[i].binding * OffsetStrideInDwords; - pNodes[i].srdRange.set = pResourceNodeData[i].set; - pNodes[i].srdRange.binding = pResourceNodeData[i].binding; - if ((i == 0) || (set != pNodes[i].srdRange.set)) - { - set = pNodes[i].srdRange.set; - pSets[set].node.tablePtr.pNext = &pNodes[i]; - pSets[set].node.type = Vkgc::ResourceMappingNodeType::DescriptorTableVaPtr; - pSets[set].node.sizeInDwords = 1; - pSets[set].node.offsetInDwords = topLevelOffset; - pSets[set].visibility = UINT_MAX; - topLevelOffset += pSets[set].node.sizeInDwords; - } - ++pSets[pResourceNodeData[i].set].node.tablePtr.nodeCount; - } - - // Add UseDynamic options for below cases: - // 1. Force all uniform buffer are dynamic buffer in auto layout pipeline layout - // 2. Force all storage buffer are dynamic buffer in auto layout pipeline layout - - if (pushConstSize) - { - // Add a node for push consts at the end of root descriptor list. - pSets[resNodeDataCount + setCount].node.type = Vkgc::ResourceMappingNodeType::PushConst; - pSets[resNodeDataCount + setCount].node.sizeInDwords = pushConstSize; - pSets[resNodeDataCount + setCount].node.offsetInDwords = topLevelOffset; - } - - *pMappingNodeCount = setCount; - *ppResourceMappingNode = pSets; -} - -// ===================================================================================================================== -// Creat color target from module data -void PartialPipeline::CreateColorTargetFromModuleData( - Vkgc::ShaderModuleDataEx* pShaderModuleDataEx, - Vkgc::ColorTarget* pTarget) -{ - for (uint32_t i = 0; i < pShaderModuleDataEx->extra.fsOutInfoCount; ++i) - { - uint32_t location = pShaderModuleDataEx->extra.pFsOutInfos[i].location; - uint32_t componentCount = pShaderModuleDataEx->extra.pFsOutInfos[i].componentCount; - Vkgc::BasicType basicType = pShaderModuleDataEx->extra.pFsOutInfos[i].basicType; - - VK_ASSERT(location < Vkgc::MaxColorTargets); - pTarget[location].channelWriteMask = (1U << componentCount) - 1; - // Further optimization is app profile for color format according to fsOutInfos. 
- switch (basicType) - { - case Vkgc::BasicType::Float: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R32_SFLOAT, - VK_FORMAT_R32G32_SFLOAT, - VK_FORMAT_R32G32B32_SFLOAT, - VK_FORMAT_R32G32B32A32_SFLOAT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Double: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R64_SFLOAT, - VK_FORMAT_R64G64_SFLOAT, - VK_FORMAT_R64G64B64_SFLOAT, - VK_FORMAT_R64G64B64A64_SFLOAT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Int: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R32_SINT, - VK_FORMAT_R32G32_SINT, - VK_FORMAT_R32G32B32_SINT, - VK_FORMAT_R32G32B32A32_SINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Uint: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R32_UINT, - VK_FORMAT_R32G32_UINT, - VK_FORMAT_R32G32B32_UINT, - VK_FORMAT_R32G32B32A32_UINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Int64: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R64_SINT, - VK_FORMAT_R64G64_SINT, - VK_FORMAT_R64G64B64_SINT, - VK_FORMAT_R64G64B64A64_SINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Uint64: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R64_UINT, - VK_FORMAT_R64G64_UINT, - VK_FORMAT_R64G64B64_UINT, - VK_FORMAT_R64G64B64A64_UINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Float16: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R16_SFLOAT, - VK_FORMAT_R16G16_SFLOAT, - VK_FORMAT_R16G16B16_SFLOAT, - VK_FORMAT_R16G16B16A16_SFLOAT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Int16: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R16_SINT, - VK_FORMAT_R16G16_SINT, - VK_FORMAT_R16G16B16_SINT, - VK_FORMAT_R16G16B16A16_SINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Uint16: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R16_UINT, - VK_FORMAT_R16G16_UINT, - VK_FORMAT_R16G16B16_UINT, - VK_FORMAT_R16G16B16A16_UINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Int8: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R8_SINT, - VK_FORMAT_R8G8_SINT, - VK_FORMAT_R8G8B8_SINT, - VK_FORMAT_R8G8B8A8_SINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Uint8: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R8_UINT, - VK_FORMAT_R8G8_UINT, - VK_FORMAT_R8G8B8_UINT, - VK_FORMAT_R8G8B8A8_UINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - default: - break; - } - } -} - -// ===================================================================================================================== -// Creates partial pipeline with partial pipeline opt enabled. 
-void PartialPipeline::Execute( - AsyncLayer* pAsyncLayer, - PartialPipelineTask* pTask) -{ - Device* pDevice = pAsyncLayer->GetDevice(); - PipelineCompilerType compilerType = pDevice->GetCompiler(0)->GetShaderCacheType(); - if (compilerType != PipelineCompilerTypeLlpc) - { - return; - } - - vk::ShaderModule* pShaderModule = vk::ShaderModule::ObjectFromHandle(pTask->shaderModuleHandle); - void* pShaderModuleData = pShaderModule->GetShaderData(compilerType); - auto pShaderModuleDataEx = reinterpret_cast(pShaderModuleData); - Vkgc::ShaderModuleEntryData* pShaderModuleEntryData = nullptr; - Vkgc::ColorTarget pColorTarget[Vkgc::MaxColorTargets] = {}; - if ((pShaderModuleDataEx->extra.entryCount == 1) && - (pShaderModuleDataEx->extra.entryDatas[0].stage == Vkgc::ShaderStageCompute)) - { - pShaderModuleEntryData = &pShaderModuleDataEx->extra.entryDatas[0]; - } - else - { - for (uint32_t i = 0; i < pShaderModuleDataEx->extra.entryCount; ++i) - { - if (pShaderModuleDataEx->extra.entryDatas[i].stage == Vkgc::ShaderStageFragment) - { - CreateColorTargetFromModuleData(pShaderModuleDataEx, pColorTarget); - if (pColorTarget[0].format == VK_FORMAT_UNDEFINED) - { - break; - } - - pShaderModuleEntryData = &pShaderModuleDataEx->extra.entryDatas[i]; - break; - } - } - } - if (pShaderModuleEntryData != nullptr) - { - for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) - { - const Vkgc::ResourceMappingRootNode* pResourceMappingNode = nullptr; - uint32_t mappingNodeCount = 0; - CreatePipelineLayoutFromModuleData(pAsyncLayer, pShaderModuleEntryData, &pResourceMappingNode, &mappingNodeCount); - - auto result = pDevice->GetCompiler(deviceIdx)->CreatePartialPipelineBinary(deviceIdx, - pShaderModuleData, pShaderModuleEntryData, pResourceMappingNode, mappingNodeCount, pColorTarget); - VK_ASSERT(result == VK_SUCCESS); - m_pAllocator->pfnFree(m_pAllocator->pUserData, (void*)pResourceMappingNode); - } - } - Destroy(); -} - -} // namespace async - -} // namespace vk diff --git a/icd/api/appopt/async_partial_pipeline.h b/icd/api/appopt/async_partial_pipeline.h deleted file mode 100644 index b235d673..00000000 --- a/icd/api/appopt/async_partial_pipeline.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_partial_pipeline.h -* @brief Header file of class async::PartialPipeline -*********************************************************************************************************************** -*/ - -#ifndef __ASYNC_PARTIAL_PIPELINE_H__ -#define __ASYNC_PARTIAL_PIPELINE_H__ - -#pragma once - -#include "include/vk_dispatch.h" -#include "vkgcDefs.h" - -namespace vk -{ - -namespace async -{ - -// ===================================================================================================================== -// Implementation of a async shader module -class PartialPipeline -{ -public: - static PartialPipeline* Create( - Device* pDevice, - const VkAllocationCallbacks* pAllocator); - - VkResult Destroy(); - - void CreatePipelineLayoutFromModuleData( - AsyncLayer* pAsyncLayer, - Vkgc::ShaderModuleEntryData* pShaderModuleEntryData, - const Vkgc::ResourceMappingRootNode** ppResourceMappingNode, - uint32_t* pMappingNodeCount); - - void CreateColorTargetFromModuleData( - Vkgc::ShaderModuleDataEx* pShaderModuleDataEx, - Vkgc::ColorTarget* pTarget); - - void Execute(AsyncLayer* pAsyncLayer, PartialPipelineTask* pTask); - - void AsyncBuildPartialPipeline(AsyncLayer* pAsyncLayer, VkShaderModule asyncShaderModule); - -protected: - PartialPipeline(const VkAllocationCallbacks* pAllocator); - -private: - const VkAllocationCallbacks* m_pAllocator; - - PAL_DISALLOW_COPY_AND_ASSIGN(PartialPipeline); -}; - -} // namespace async - -} // namespace vk - -#endif diff --git a/icd/api/appopt/async_shader_module.cpp b/icd/api/appopt/async_shader_module.cpp deleted file mode 100644 index 2e802662..00000000 --- a/icd/api/appopt/async_shader_module.cpp +++ /dev/null @@ -1,178 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_shader_module.cpp -* @brief Implementation of class async::ShaderModule -*********************************************************************************************************************** -*/ -#include "async_layer.h" -#include "async_shader_module.h" -#include "async_partial_pipeline.h" - -#include "include/vk_device.h" -#include "include/vk_shader.h" -#include "palListImpl.h" - -namespace vk -{ - -namespace async -{ - -// ===================================================================================================================== -ShaderModule::ShaderModule( - VkShaderModule immedModule) - : - m_immedModule(immedModule), - m_asyncModule(VK_NULL_HANDLE) -{ -} - -// ===================================================================================================================== -// Creates async shdaer module object -VkResult ShaderModule::Create( - Device* pDevice, - const VkShaderModuleCreateInfo* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkShaderModule* pShaderModule) -{ - AsyncLayer* pAsyncLayer = pDevice->GetAsyncLayer(); - VkShaderModule immedModule = VK_NULL_HANDLE; - - VK_ASSERT(pCreateInfo->flags == 0); - - // Build shader module with immedidate mode - auto result = ASYNC_CALL_NEXT_LAYER(vkCreateShaderModule)( - VkDevice(ApiDevice::FromObject(pDevice)), - pCreateInfo, - pAllocator, - &immedModule); - - if (result == VK_SUCCESS) - { - const size_t objSize = sizeof(ShaderModule); - void* pMemory = pAllocator->pfnAllocation( - pAllocator->pUserData, - objSize, - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - - if (pMemory == nullptr) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - VK_PLACEMENT_NEW(pMemory) ShaderModule(immedModule); - ShaderModule* pShaderModuleObj = static_cast(pMemory); - *pShaderModule = ShaderModule::HandleFromVoidPointer(pMemory); - - // Build shader module in async mode - pShaderModuleObj->AsyncBuildShaderModule(pDevice->GetAsyncLayer()); - } - - return result; -} - -// ===================================================================================================================== -// Destory async shader module object -VkResult ShaderModule::Destroy( - Device* pDevice, - const VkAllocationCallbacks* pAllocator) -{ - AsyncLayer* pAsyncLayer = pDevice->GetAsyncLayer(); - if (m_asyncModule == VK_NULL_HANDLE) - { - pAsyncLayer->SyncAll(); - } - - if (m_immedModule != VK_NULL_HANDLE) - { - ASYNC_CALL_NEXT_LAYER(vkDestroyShaderModule)( - VkDevice(ApiDevice::FromObject(pDevice)), - m_immedModule, - pAllocator); - } - - if (m_asyncModule != VK_NULL_HANDLE) - { - ASYNC_CALL_NEXT_LAYER(vkDestroyShaderModule)( - VkDevice(ApiDevice::FromObject(pDevice)), - m_asyncModule, - pAllocator); - } - - return VK_SUCCESS; -} - -// ===================================================================================================================== -// Builds shader module in async mode -void ShaderModule::AsyncBuildShaderModule( - AsyncLayer* pAsyncLayer) -{ - auto pTaskThread = reinterpret_cast*> - (pAsyncLayer->GetTaskThread(ShaderModuleTaskType)); - if (pTaskThread != nullptr) - { - vk::ShaderModule* pNextLayerModule = vk::ShaderModule::ObjectFromHandle(m_immedModule); - - ShaderModuleTask task = {}; - task.info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; 
- task.info.pCode = reinterpret_cast(pNextLayerModule->GetCode()); - task.info.codeSize = pNextLayerModule->GetCodeSize(); - task.info.flags = VK_SHADER_MODULE_ENABLE_OPT_BIT; - task.pObj = this; - pTaskThread->AddTask(&task); - } -} - -// ===================================================================================================================== -// Creates shader module with shader module opt enabled. -void ShaderModule::Execute( - AsyncLayer* pAsyncLayer, - ShaderModuleTask* pTask) -{ - Device* pDevice = pAsyncLayer->GetDevice(); - ASYNC_CALL_NEXT_LAYER(vkCreateShaderModule)(VkDevice(ApiDevice::FromObject(pDevice)), - &pTask->info, - nullptr, - &m_asyncModule); - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - if (settings.enablePartialPipelineCompile) - { - const VkAllocationCallbacks* pAllocCB = pDevice->VkInstance()->GetAllocCallbacks(); - auto pPartialPipelineObj = vk::async::PartialPipeline::Create(pDevice, pAllocCB); - - if ((pPartialPipelineObj != nullptr) && (m_asyncModule != VK_NULL_HANDLE)) - { - // Build partial pipeline in async mode - pPartialPipelineObj->AsyncBuildPartialPipeline(pDevice->GetAsyncLayer(), m_asyncModule); - } - } -} - -} // namespace async - -} // namespace vk diff --git a/icd/api/appopt/async_shader_module.h b/icd/api/appopt/async_shader_module.h deleted file mode 100644 index cca54ec6..00000000 --- a/icd/api/appopt/async_shader_module.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_shader_module.h -* @brief Header file of class async::ShaderModule -*********************************************************************************************************************** -*/ - -#ifndef __ASYNC_SHADER_MODULE_H__ -#define __ASYNC_SHADER_MODULE_H__ - -#pragma once - -#include "include/vk_dispatch.h" - -namespace vk -{ - -namespace async -{ - -// ===================================================================================================================== -// Implementation of a async shader module -class ShaderModule final : public vk::NonDispatchable -{ -public: - static VkResult Create( - Device* pDevice, - const VkShaderModuleCreateInfo* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkShaderModule* pShaderModule); - - VkResult Destroy( - Device* pDevice, - const VkAllocationCallbacks* pAllocator); - - VkShaderModule GetNextLayerModule() - { - return (m_asyncModule == VK_NULL_HANDLE) ? m_immedModule : m_asyncModule; - } - - void Execute(AsyncLayer* pAsyncLayer, ShaderModuleTask* pTask); - - void AsyncBuildShaderModule(AsyncLayer* pAsyncLayer); - -protected: - ShaderModule(VkShaderModule immedModule); - - VkShaderModule m_immedModule; // Shader module handle which is compiled with immedidate mode - VkShaderModule m_asyncModule; // Shader module handle which is compiled with async mode - -private: - PAL_DISALLOW_COPY_AND_ASSIGN(ShaderModule); -}; - -} // namespace async - -} // namespace vk - -#endif diff --git a/icd/api/appopt/async_task_thread.h b/icd/api/appopt/async_task_thread.h deleted file mode 100644 index 9a11da2f..00000000 --- a/icd/api/appopt/async_task_thread.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_task_thread.h -* @brief Implementation of template class async::TaskThread -*********************************************************************************************************************** -*/ -#ifndef __ASYNC_TASK_THREAD_H__ -#define __ASYNC_TASK_THREAD_H__ - -#pragma once - -#include "include/vk_alloccb.h" -#include "palThread.h" -#include "palMutex.h" -#include "palList.h" -#include "palEvent.h" - -namespace vk -{ - -class AsyncLayer; -class PalAllocator; - -namespace async -{ - -// ===================================================================================================================== -// Represents the general thread for async shader/pipeline compiler. -template -class TaskThread final : public Util::Thread -{ -public: - TaskThread(AsyncLayer* pAsyncLayer, PalAllocator* pAllocator) - : - m_pAsyncLayer(pAsyncLayer), - m_taskList(pAllocator), - m_stop(false) - { - Util::EventCreateFlags flags = {}; - flags.manualReset = false; - flags.initiallySignaled = false; - m_event.Init(flags); - } - - // Starts a new thread which starts by running function TaskThreadFunc. - void Begin() - { - Util::Thread::Begin(ThreadFunc, this); - } - - // Adds task to list. - void AddTask(Task* pTask) - { - Util::MutexAuto mutexAuto(&m_lock); - m_taskList.PushBack(*pTask); - m_event.Set(); - } - - // Set flag stop and trig event. - void SetStop() - { - m_event.Set(); - m_stop = true; - } - - // Returns until all tasks are executed. - void SyncAll() - { - m_event.Set(); - while (m_taskList.Begin() != m_taskList.End()) - { - Util::YieldThread(); - } - } - -protected: - // Async thread function - static void ThreadFunc( - void* pParam) - { - auto pThis = reinterpret_cast*>(pParam); - pThis->TaskThreadFunc(); - } - - // The implementation of async thread function - void TaskThreadFunc() - { - while (m_stop == false) - { - // Waits for new signal. - m_event.Wait(1.0f); - - Task task; - while (FetchTask(&task)) - { - task.pObj->Execute(m_pAsyncLayer, &task); - } - } - } - - // Fetches task in list, return false if task list is empty. - bool FetchTask(Task* pTask) - { - Util::MutexAuto mutexAuto(&m_lock); - auto beginIt = m_taskList.Begin(); - if (beginIt != m_taskList.End()) - { - *pTask = *(beginIt.Get()); - m_taskList.Erase(&beginIt); - return true; - } - return false; - } - - AsyncLayer* m_pAsyncLayer; // Async compiler layer object - Util::List m_taskList; // Async compile task list - volatile bool m_stop; // Flag to stop the thread - Util::Mutex m_lock; // Lock for accessing task list - Util::Event m_event; // Event to notify async thread -}; - -} // namespace async - -} // namespace vk - -#endif diff --git a/icd/api/barrier_policy.cpp b/icd/api/barrier_policy.cpp index 499d2d54..3e54c63e 100644 --- a/icd/api/barrier_policy.cpp +++ b/icd/api/barrier_policy.cpp @@ -293,7 +293,9 @@ static uint32_t ImageLayoutToCacheMask(VkImageLayout imageLayout) // Converts source access flags to source cache coherency flags. static uint32_t SrcAccessToCacheMask(AccessFlags accessMask, VkImageLayout imageLayout) { - uint32_t cacheMask = 0; + uint32_t cacheMask = (((imageLayout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR) || + (imageLayout == VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR)) ? 
+ Pal::CoherPresent : 0); if (accessMask & VK_ACCESS_SHADER_WRITE_BIT) { @@ -357,7 +359,9 @@ static uint32_t SrcAccessToCacheMask(AccessFlags accessMask, VkImageLayout image // Converts destination access flags to destination cache coherency flags. static uint32_t DstAccessToCacheMask(AccessFlags accessMask, VkImageLayout imageLayout) { - uint32_t cacheMask = 0; + uint32_t cacheMask = (((imageLayout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR) || + (imageLayout == VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR)) ? + Pal::CoherPresent : 0); if (accessMask & VK_ACCESS_INDIRECT_COMMAND_READ_BIT) { @@ -688,7 +692,8 @@ void DeviceBarrierPolicy::InitQueueFamilyPolicies( | Pal::CoherResolve | Pal::CoherClear | Pal::CoherIndirectArgs - | Pal::CoherIndexData; + | Pal::CoherIndexData + | Pal::CoherPresent; policy.supportedLayoutUsageMask |= Pal::LayoutColorTarget | Pal::LayoutDepthStencilTarget | Pal::LayoutShaderRead @@ -982,8 +987,8 @@ void ImageBarrierPolicy::InitImageCachePolicy( { // Initialize supported cache masks based on the usage flags provided. // Always allow CPU and memory reads/writes. - uint32_t supportedOutputCacheMask = Pal::CoherCpu | Pal::CoherMemory; - uint32_t supportedInputCacheMask = Pal::CoherCpu | Pal::CoherMemory; + uint32_t supportedOutputCacheMask = Pal::CoherCpu | Pal::CoherMemory | Pal::CoherPresent; + uint32_t supportedInputCacheMask = Pal::CoherCpu | Pal::CoherMemory | Pal::CoherPresent; if (usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) { diff --git a/icd/api/compiler_solution_llpc.cpp b/icd/api/compiler_solution_llpc.cpp index e4a2052d..0cadb2fa 100644 --- a/icd/api/compiler_solution_llpc.cpp +++ b/icd/api/compiler_solution_llpc.cpp @@ -688,6 +688,12 @@ VkResult CompilerSolutionLlpc::CreateLlpcCompiler( pOptionBuffer += optionLength; bufSize -= optionLength; + if ((m_gfxIp.major == 10) && (m_gfxIp.minor >= 3)) + { + // Enable flat scratch for gfx10.3+ + llpcOptions[numOptions++] = "-amdgpu-enable-flat-scratch"; + } + if (settings.llpcOptions[0] != '\0') { const char* pOptions = &settings.llpcOptions[0]; diff --git a/icd/api/graphics_pipeline_common.cpp b/icd/api/graphics_pipeline_common.cpp index 65cb1dad..0f188321 100644 --- a/icd/api/graphics_pipeline_common.cpp +++ b/icd/api/graphics_pipeline_common.cpp @@ -1298,7 +1298,7 @@ static void BuildExecutablePipelineState( void GraphicsPipelineCommon::BuildPipelineObjectCreateInfo( const Device* pDevice, const VkGraphicsPipelineCreateInfo* pIn, - const VbInfo* pVbInfo, + const VbBindingInfo* pVbInfo, const GraphicsPipelineBinaryInfo* pBinInfo, const PipelineLayout* pPipelineLayout, GraphicsPipelineObjectCreateInfo* pInfo) @@ -1311,7 +1311,7 @@ void GraphicsPipelineCommon::BuildPipelineObjectCreateInfo( pIn->pDynamicState ); - BuildVertexInputInterfaceState(pDevice, pIn, &pVbInfo->bindingInfo, dynamicStateFlags, false, pInfo); + BuildVertexInputInterfaceState(pDevice, pIn, pVbInfo, dynamicStateFlags, false, pInfo); BuildPreRasterizationShaderState(pDevice, pIn, @@ -1354,7 +1354,8 @@ VkResult GraphicsPipelineCommon::BuildPipelineBinaryCreateInfo( const PipelineLayout* pPipelineLayout, GraphicsPipelineBinaryCreateInfo* pBinInfo, GraphicsPipelineShaderStageInfo* pShaderInfo, - VbInfo* pVbInfo, + VbBindingInfo* pVbInfo, + PipelineInternalBufferInfo* pInternalBufferInfo, ShaderModuleHandle* pTempModules) { VkResult result = BuildShaderStageInfo(pDevice, @@ -1371,7 +1372,7 @@ VkResult GraphicsPipelineCommon::BuildPipelineBinaryCreateInfo( if (result == VK_SUCCESS) { result = pDevice->GetCompiler(DefaultDeviceIndex)->ConvertGraphicsPipelineInfo( - 
pDevice, pCreateInfo, pShaderInfo, pPipelineLayout, pBinInfo, pVbInfo);
+            pDevice, pCreateInfo, pShaderInfo, pPipelineLayout, pBinInfo, pVbInfo, pInternalBufferInfo);
     }
 
     return result;
diff --git a/icd/api/include/compiler_solution.h b/icd/api/include/compiler_solution.h
index c4d80dd2..24ce5434 100644
--- a/icd/api/include/compiler_solution.h
+++ b/icd/api/include/compiler_solution.h
@@ -59,6 +59,7 @@ enum FreeCompilerBinary : uint32_t
 // Represents the result of PipelineCompiler::BuildShaderModule
 struct ShaderModuleHandle
 {
+    uint32_t* pRefCount;
     void*     pLlpcShaderModule; // Shader module handle from LLPC
 };
 
diff --git a/icd/api/include/defer_compile_thread.h b/icd/api/include/defer_compile_thread.h
new file mode 100644
index 00000000..95924d06
--- /dev/null
+++ b/icd/api/include/defer_compile_thread.h
@@ -0,0 +1,231 @@
+/*
+ ***********************************************************************************************************************
+ *
+ * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************************************************************************/
+/**
+***********************************************************************************************************************
+* @file  defer_compile_thread.h
+* @brief Implementation of class DeferCompileThread & DeferCompileManager
+***********************************************************************************************************************
+*/
+#ifndef __DEFER_COMPILE_THREAD_H__
+#define __DEFER_COMPILE_THREAD_H__
+
+#pragma once
+
+#include "include/vk_alloccb.h"
+#include "palThread.h"
+#include "palMutex.h"
+#include "palList.h"
+#include "palEvent.h"
+
+namespace vk
+{
+
+class DeferCompileManager;
+class PalAllocator;
+
+struct DeferredCompileWorkload
+{
+    void*        pPayloads;
+    void         (*Execute)(void*); // Function pointer to the call used to execute the workload
+    Util::Event* pEvent;
+};
+
+// =====================================================================================================================
+// Represents a worker thread used for deferred shader/pipeline compilation.
+class DeferCompileThread final : public Util::Thread
+{
+public:
+    DeferCompileThread(PalAllocator* pAllocator)
+        :
+        m_taskList(pAllocator),
+        m_stop(false)
+    {
+        Util::EventCreateFlags flags = {};
+        flags.manualReset       = true;
+        flags.initiallySignaled = false;
+        m_event.Init(flags);
+    }
+
+    // Starts a new thread that begins executing TaskThreadFunc.
+    void Begin()
+    {
+        Util::Thread::Begin(ThreadFunc, this);
+    }
+
+    // Adds a task to the list and signals the worker thread.
+    void AddTask(DeferredCompileWorkload* pTask)
+    {
+        Util::MutexAuto mutexAuto(&m_lock);
+        m_taskList.PushBack(*pTask);
+        m_event.Set();
+    }
+
+    // Sets the stop flag and triggers the event.
+    void SetStop()
+    {
+        m_event.Set();
+        m_stop = true;
+    }
+
+    // Blocks until all queued tasks have been executed.
+    void SyncAll()
+    {
+        m_event.Set();
+        while (m_taskList.Begin() != m_taskList.End())
+        {
+            Util::YieldThread();
+        }
+    }
+
+protected:
+    // Static thread entry point; forwards to TaskThreadFunc().
+    static void ThreadFunc(
+        void* pParam)
+    {
+        auto pThis = reinterpret_cast<DeferCompileThread*>(pParam);
+        pThis->TaskThreadFunc();
+    }
+
+    // Main loop of the deferred-compile worker thread.
+    void TaskThreadFunc()
+    {
+        while (m_stop == false)
+        {
+            // Waits for new signal.
+            m_event.Wait(1.0f);
+            m_event.Reset();
+
+            DeferredCompileWorkload task;
+            while (FetchTask(&task))
+            {
+                task.Execute(task.pPayloads);
+                if (task.pEvent != nullptr)
+                {
+                    task.pEvent->Set();
+                }
+            }
+        }
+    }
+
+    // Fetches a task from the list; returns false if the task list is empty.
+    bool FetchTask(DeferredCompileWorkload* pTask)
+    {
+        Util::MutexAuto mutexAuto(&m_lock);
+        auto beginIt = m_taskList.Begin();
+        if (beginIt != m_taskList.End())
+        {
+            *pTask = *(beginIt.Get());
+            m_taskList.Erase(&beginIt);
+            return true;
+        }
+        return false;
+    }
+
+    Util::List<DeferredCompileWorkload, PalAllocator> m_taskList; // Deferred compile task list
+    volatile bool m_stop;  // Flag to stop the thread
+    Util::Mutex   m_lock;  // Lock for accessing task list
+    Util::Event   m_event; // Event to notify the worker thread
+};
+
+// =====================================================================================================================
+// Class that manages the DeferCompileThread instances.
+class DeferCompileManager
+{
+public:
+    DeferCompileManager()
+        :
+        m_pCompileThreads{},
+        m_taskId(0),
+        m_activeThreadCount(0)
+    {
+    }
+
+    void Init(uint32_t threadCount, PalAllocator* pAllocator)
+    {
+        if (threadCount == 0)
+        {
+            m_activeThreadCount = 0;
+        }
+        else if (threadCount == UINT32_MAX)
+        {
+            Util::SystemInfo sysInfo = {};
+            Util::QuerySystemInfo(&sysInfo);
+            m_activeThreadCount = Util::Min(MaxThreads, sysInfo.cpuLogicalCoreCount / 2);
+        }
+        else
+        {
+            m_activeThreadCount = Util::Min(MaxThreads, threadCount);
+        }
+
+        for (uint32_t i = 0; i < m_activeThreadCount; ++i)
+        {
+            m_pCompileThreads[i] = VK_PLACEMENT_NEW(m_compileThreadBuffer[i])
+                DeferCompileThread(pAllocator);
+            m_pCompileThreads[i]->Begin();
+        }
+    }
+
+    ~DeferCompileManager()
+    {
+        for (uint32_t i = 0; i < m_activeThreadCount; ++i)
+        {
+            m_pCompileThreads[i]->SetStop();
+            m_pCompileThreads[i]->Join();
+            Util::Destructor(m_pCompileThreads[i]);
+            m_pCompileThreads[i] = nullptr;
+        }
+        m_activeThreadCount = 0;
+    }
+
+    void SyncAll()
+    {
+        for (uint32_t i = 0; i < m_activeThreadCount; ++i)
+        {
+            m_pCompileThreads[i]->SyncAll();
+        }
+    }
+
+    DeferCompileThread* GetCompileThread()
+    {
+        return (m_activeThreadCount > 0) ?
+ m_pCompileThreads[(m_taskId++) % m_activeThreadCount] : + nullptr; + } + +protected: + static constexpr uint32_t MaxThreads = 8; // Max thread count for shader module compile + DeferCompileThread* m_pCompileThreads[MaxThreads]; // Async compiler threads + uint32_t m_taskId; // Hint to select compile thread + uint32_t m_activeThreadCount; // Active thread count + + // Internal buffer for m_pCompileThreads + uint8_t m_compileThreadBuffer[MaxThreads][sizeof(DeferCompileThread)]; +private: + PAL_DISALLOW_COPY_AND_ASSIGN(DeferCompileManager); +}; + +} // namespace vk + +#endif diff --git a/icd/api/include/graphics_pipeline_common.h b/icd/api/include/graphics_pipeline_common.h index d0640a96..d0a94aeb 100644 --- a/icd/api/include/graphics_pipeline_common.h +++ b/icd/api/include/graphics_pipeline_common.h @@ -65,19 +65,19 @@ struct VbBindingInfo } bindings[Pal::MaxVertexBuffers]; }; -struct UberFetchShaderBufferInfo +constexpr uint32_t MaxPipelineInternalBufferCount = 4; +struct InternalBufferEntry { - bool requirePerIntanceFetch; - bool requirePerCompFetch; uint32_t userDataOffset; - uint32_t bufferSize; - uint32_t bufferData[Vkgc::MaxFetchShaderInternalBufferSize]; + uint32_t bufferOffset; }; -struct VbInfo +struct PipelineInternalBufferInfo { - VbBindingInfo bindingInfo; - UberFetchShaderBufferInfo uberFetchShaderBuffer; + uint32_t internalBufferCount; + InternalBufferEntry internalBufferEntries[MaxPipelineInternalBufferCount]; + uint32_t dataSize; + void* pData; }; // ===================================================================================================================== @@ -100,6 +100,7 @@ struct GraphicsPipelineObjectImmedInfo Pal::VrsRateParams vrsRateParams; Pal::DepthStencilStateCreateInfo depthStencilCreateInfo; bool rasterizerDiscardEnable; + bool checkDeferCompilePipeline; // Static pipeline parameter token values. These can be used to efficiently redundancy check static pipeline // state programming during pipeline binds. 
@@ -200,14 +201,15 @@ class GraphicsPipelineCommon : public Pipeline
         const PipelineLayout*             pPipelineLayout,
         GraphicsPipelineBinaryCreateInfo* pBinInfo,
         GraphicsPipelineShaderStageInfo*  pShaderInfo,
-        VbInfo*                           pVbInfo,
+        VbBindingInfo*                    pVbInfo,
+        PipelineInternalBufferInfo*       pInternalBufferInfo,
         ShaderModuleHandle*               pTempModules);
 
     // Convert API information into internal create info used to create internal pipeline object
     static void BuildPipelineObjectCreateInfo(
         const Device*                       pDevice,
         const VkGraphicsPipelineCreateInfo* pIn,
-        const VbInfo*                       pVbInfo,
+        const VbBindingInfo*                pVbInfo,
         const GraphicsPipelineBinaryInfo*   pBinInfo,
         const PipelineLayout*               pPipelineLayout,
         GraphicsPipelineObjectCreateInfo*   pObjInfo);
diff --git a/icd/api/include/khronos/GLSL.ext.AMD.h b/icd/api/include/khronos/GLSL.ext.AMD.h
index efe849fe..297a6f98 100644
--- a/icd/api/include/khronos/GLSL.ext.AMD.h
+++ b/icd/api/include/khronos/GLSL.ext.AMD.h
@@ -84,6 +84,35 @@ enum GcnShaderAMD {
     GcnShaderCountAMD
 };
 
+#if VKI_TEXEL_BUFFER_EXPLICIT_FORMAT_SUPPORT
+// SPV_AMD_shader_texel_buffer_explicit_format
+static const Capability CapabilityImageBufferReadWriteWithFormatAMD = static_cast<Capability>(5024);
+
+static const Op OpImageBufferReadAMD  = static_cast<Op>(5025);
+static const Op OpImageBufferWriteAMD = static_cast<Op>(5026);
+
+static const ImageFormat ImageFormatRgb32fAMD          = static_cast<ImageFormat>(5028);
+static const ImageFormat ImageFormatRgb32uiAMD         = static_cast<ImageFormat>(5029);
+static const ImageFormat ImageFormatRgb32iAMD          = static_cast<ImageFormat>(5030);
+static const ImageFormat ImageFormatR10G11B11fAMD      = static_cast<ImageFormat>(5031);
+static const ImageFormat ImageFormatRgb10A2SnormAMD    = static_cast<ImageFormat>(5032);
+static const ImageFormat ImageFormatRgb10A2iAMD        = static_cast<ImageFormat>(5033);
+static const ImageFormat ImageFormatRgba16SscaledAMD   = static_cast<ImageFormat>(5034);
+static const ImageFormat ImageFormatRgb10A2SscaledAMD  = static_cast<ImageFormat>(5035);
+static const ImageFormat ImageFormatRg16SscaledAMD     = static_cast<ImageFormat>(5036);
+static const ImageFormat ImageFormatRgba8SscaledAMD    = static_cast<ImageFormat>(5037);
+static const ImageFormat ImageFormatRg8SscaledAMD      = static_cast<ImageFormat>(5038);
+static const ImageFormat ImageFormatR16SscaledAMD      = static_cast<ImageFormat>(5039);
+static const ImageFormat ImageFormatR8SscaledAMD       = static_cast<ImageFormat>(5040);
+static const ImageFormat ImageFormatRgba16UscaledAMD   = static_cast<ImageFormat>(5041);
+static const ImageFormat ImageFormatRgb10A2UscaledAMD  = static_cast<ImageFormat>(5042);
+static const ImageFormat ImageFormatRg16UscaledAMD     = static_cast<ImageFormat>(5043);
+static const ImageFormat ImageFormatRgba8USscaledAMD   = static_cast<ImageFormat>(5044);
+static const ImageFormat ImageFormatRg8UscaledAMD      = static_cast<ImageFormat>(5045);
+static const ImageFormat ImageFormatR16UscaledAMD      = static_cast<ImageFormat>(5046);
+static const ImageFormat ImageFormatR8UscaledAMD       = static_cast<ImageFormat>(5047);
+#endif
+
 #if VKI_NORMALIZED_TRIG_FUNCTIONS
 // SPV_AMD_normalized_trig - Internal Use Only
 static const Capability CapabilityTrigNormalizedAMD = static_cast<Capability>(5058);
diff --git a/icd/api/include/khronos/sdk-1.2/vulkan_beta.h b/icd/api/include/khronos/sdk-1.2/vulkan_beta.h
index e2337adf..f67fab36 100644
--- a/icd/api/include/khronos/sdk-1.2/vulkan_beta.h
+++ b/icd/api/include/khronos/sdk-1.2/vulkan_beta.h
@@ -90,7 +90,6 @@ typedef enum VkVideoCodingControlFlagBitsKHR {
 typedef VkFlags VkVideoCodingControlFlagsKHR;
 
 typedef enum VkVideoCodingQualityPresetFlagBitsKHR {
-    VK_VIDEO_CODING_QUALITY_PRESET_DEFAULT_BIT_KHR = 0,
     VK_VIDEO_CODING_QUALITY_PRESET_NORMAL_BIT_KHR = 0x00000001,
     VK_VIDEO_CODING_QUALITY_PRESET_POWER_BIT_KHR = 0x00000002,
     VK_VIDEO_CODING_QUALITY_PRESET_QUALITY_BIT_KHR = 0x00000004,
diff
--git a/icd/api/include/khronos/sdk-1.2/vulkan_core.h b/icd/api/include/khronos/sdk-1.2/vulkan_core.h index 0e081aaf..18b302fa 100644 --- a/icd/api/include/khronos/sdk-1.2/vulkan_core.h +++ b/icd/api/include/khronos/sdk-1.2/vulkan_core.h @@ -72,7 +72,7 @@ extern "C" { #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0)// Patch version should always be set to 0 // Version of this file -#define VK_HEADER_VERSION 188 +#define VK_HEADER_VERSION 191 // Complete version of this file #define VK_HEADER_VERSION_COMPLETE VK_MAKE_API_VERSION(0, 1, 2, VK_HEADER_VERSION) @@ -754,6 +754,8 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_FEATURES_NV = 1000277007, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INHERITED_VIEWPORT_SCISSOR_FEATURES_NV = 1000278000, VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_VIEWPORT_SCISSOR_INFO_NV = 1000278001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR = 1000280000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR = 1000280001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT = 1000281000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT = 1000281001, VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_RENDER_PASS_TRANSFORM_INFO_QCOM = 1000282000, @@ -824,6 +826,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_VERTEX_INPUT_BINDING_DESCRIPTION_2_EXT = 1000352001, VK_STRUCTURE_TYPE_VERTEX_INPUT_ATTRIBUTE_DESCRIPTION_2_EXT = 1000352002, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT = 1000353000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVE_TOPOLOGY_LIST_RESTART_FEATURES_EXT = 1000356000, VK_STRUCTURE_TYPE_IMPORT_MEMORY_ZIRCON_HANDLE_INFO_FUCHSIA = 1000364000, VK_STRUCTURE_TYPE_MEMORY_ZIRCON_HANDLE_PROPERTIES_FUCHSIA = 1000364001, VK_STRUCTURE_TYPE_MEMORY_GET_ZIRCON_HANDLE_INFO_FUCHSIA = 1000364002, @@ -843,6 +846,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_EXT = 1000388001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_FEATURES_EXT = 1000392000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_PROPERTIES_EXT = 1000392001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PAGEABLE_DEVICE_LOCAL_MEMORY_FEATURES_EXT = 1000412000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES, VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, @@ -2125,10 +2129,6 @@ typedef enum VkImageViewCreateFlagBits { VK_IMAGE_VIEW_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkImageViewCreateFlagBits; typedef VkFlags VkImageViewCreateFlags; - -typedef enum VkShaderModuleCreateFlagBits { - VK_SHADER_MODULE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF -} VkShaderModuleCreateFlagBits; typedef VkFlags VkShaderModuleCreateFlags; typedef enum VkPipelineCacheCreateFlagBits { @@ -7867,6 +7867,52 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineExecutableInternalRepresentationsKHR #endif +#define VK_KHR_shader_integer_dot_product 1 +#define VK_KHR_SHADER_INTEGER_DOT_PRODUCT_SPEC_VERSION 1 +#define VK_KHR_SHADER_INTEGER_DOT_PRODUCT_EXTENSION_NAME "VK_KHR_shader_integer_dot_product" +typedef struct VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 shaderIntegerDotProduct; +} VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR; + +typedef 
struct VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR { + VkStructureType sType; + void* pNext; + VkBool32 integerDotProduct8BitUnsignedAccelerated; + VkBool32 integerDotProduct8BitSignedAccelerated; + VkBool32 integerDotProduct8BitMixedSignednessAccelerated; + VkBool32 integerDotProduct4x8BitPackedUnsignedAccelerated; + VkBool32 integerDotProduct4x8BitPackedSignedAccelerated; + VkBool32 integerDotProduct4x8BitPackedMixedSignednessAccelerated; + VkBool32 integerDotProduct16BitUnsignedAccelerated; + VkBool32 integerDotProduct16BitSignedAccelerated; + VkBool32 integerDotProduct16BitMixedSignednessAccelerated; + VkBool32 integerDotProduct32BitUnsignedAccelerated; + VkBool32 integerDotProduct32BitSignedAccelerated; + VkBool32 integerDotProduct32BitMixedSignednessAccelerated; + VkBool32 integerDotProduct64BitUnsignedAccelerated; + VkBool32 integerDotProduct64BitSignedAccelerated; + VkBool32 integerDotProduct64BitMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating8BitUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating8BitSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating16BitUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating16BitSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating32BitUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating32BitSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating64BitUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating64BitSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated; +} VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR; + + + #define VK_KHR_pipeline_library 1 #define VK_KHR_PIPELINE_LIBRARY_SPEC_VERSION 1 #define VK_KHR_PIPELINE_LIBRARY_EXTENSION_NAME "VK_KHR_pipeline_library" @@ -12454,6 +12500,18 @@ typedef struct VkPhysicalDeviceDrmPropertiesEXT { +#define VK_EXT_primitive_topology_list_restart 1 +#define VK_EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART_SPEC_VERSION 1 +#define VK_EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART_EXTENSION_NAME "VK_EXT_primitive_topology_list_restart" +typedef struct VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT { + VkStructureType sType; + void* pNext; + VkBool32 primitiveTopologyListRestart; + VkBool32 primitiveTopologyPatchListRestart; +} VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT; + + + #define VK_HUAWEI_subpass_shading 1 #define VK_HUAWEI_SUBPASS_SHADING_SPEC_VERSION 2 #define VK_HUAWEI_SUBPASS_SHADING_EXTENSION_NAME "VK_HUAWEI_subpass_shading" @@ -12675,6 +12733,25 @@ VKAPI_ATTR void VKAPI_CALL vkCmdDrawMultiIndexedEXT( #define VK_EXT_LOAD_STORE_OP_NONE_EXTENSION_NAME "VK_EXT_load_store_op_none" +#define VK_EXT_pageable_device_local_memory 1 +#define VK_EXT_PAGEABLE_DEVICE_LOCAL_MEMORY_SPEC_VERSION 1 +#define VK_EXT_PAGEABLE_DEVICE_LOCAL_MEMORY_EXTENSION_NAME "VK_EXT_pageable_device_local_memory" +typedef struct VkPhysicalDevicePageableDeviceLocalMemoryFeaturesEXT { + VkStructureType sType; + void* pNext; + 
VkBool32 pageableDeviceLocalMemory; +} VkPhysicalDevicePageableDeviceLocalMemoryFeaturesEXT; + +typedef void (VKAPI_PTR *PFN_vkSetDeviceMemoryPriorityEXT)(VkDevice device, VkDeviceMemory memory, float priority); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkSetDeviceMemoryPriorityEXT( + VkDevice device, + VkDeviceMemory memory, + float priority); +#endif + + #define VK_KHR_acceleration_structure 1 VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkAccelerationStructureKHR) #define VK_KHR_ACCELERATION_STRUCTURE_SPEC_VERSION 12 diff --git a/icd/api/include/khronos/vulkan.h b/icd/api/include/khronos/vulkan.h index fc10d0b5..0d91a8aa 100644 --- a/icd/api/include/khronos/vulkan.h +++ b/icd/api/include/khronos/vulkan.h @@ -58,6 +58,9 @@ // Internal (under development) extension definitions #include "devext/vk_amd_gpa_interface.h" +#if VKI_TEXEL_BUFFER_EXPLICIT_FORMAT_SUPPORT +#include "devext/vk_amd_shader_texel_buffer_explicit_format.h" +#endif #define VK_FORMAT_BEGIN_RANGE VK_FORMAT_UNDEFINED #define VK_FORMAT_END_RANGE VK_FORMAT_ASTC_12x12_SRGB_BLOCK diff --git a/icd/api/include/pipeline_compiler.h b/icd/api/include/pipeline_compiler.h index 466d3351..dd7cf824 100644 --- a/icd/api/include/pipeline_compiler.h +++ b/icd/api/include/pipeline_compiler.h @@ -39,6 +39,7 @@ #include "include/vk_shader_code.h" #include "include/vk_conv.h" +#include "include/defer_compile_thread.h" namespace vk { @@ -48,8 +49,7 @@ class PipelineCache; class ShaderModule; class PipelineCompiler; struct VbBindingInfo; -struct VbInfo; -struct UberFetchShaderBufferInfo; +struct PipelineInternalBufferInfo; struct ShaderModuleHandle; class PipelineBinaryCache; @@ -192,7 +192,8 @@ class PipelineCompiler const GraphicsPipelineShaderStageInfo* pShaderInfo, const PipelineLayout* pPipelineLayout, GraphicsPipelineBinaryCreateInfo* pCreateInfo, - VbInfo* pVbInfo); + VbBindingInfo* pVbInfo, + PipelineInternalBufferInfo* pInternalBufferInfo); VkResult ConvertComputePipelineInfo( const Device* pDevice, @@ -212,7 +213,7 @@ class PipelineCompiler void FreeComputePipelineCreateInfo(ComputePipelineBinaryCreateInfo* pCreateInfo); - void FreeGraphicsPipelineCreateInfo(GraphicsPipelineBinaryCreateInfo* pCreateInfo); + void FreeGraphicsPipelineCreateInfo(GraphicsPipelineBinaryCreateInfo* pCreateInfo, bool keepConvertTempMem); #if ICD_GPUOPEN_DEVMODE_BUILD Util::Result RegisterAndLoadReinjectionBinary( @@ -239,10 +240,8 @@ class PipelineCompiler void DestroyPipelineBinaryCache(); - VkResult BuildUberFetchShaderInternalData(PipelineCompilerType compilerType, - const VkPipelineVertexInputStateCreateInfo* pVertexInput, - bool isDynamicStride, - UberFetchShaderBufferInfo* pFetchShaderBufferInfo); + VkResult BuildPipelineInternalBufferData(GraphicsPipelineBinaryCreateInfo* pCreateInfo, + PipelineInternalBufferInfo* pInternalBufferInfo); void GetComputePipelineCacheId( uint32_t deviceIdx, @@ -258,6 +257,24 @@ class PipelineCompiler const Util::MetroHash::Hash& settingsHash, Util::MetroHash::Hash* pCacheId); + static void BuildNggState( + const Device* pDevice, + const VkShaderStageFlagBits activeStages, + const bool isConservativeOverestimation, + GraphicsPipelineBinaryCreateInfo* pCreateInfo); + + static void BuildPipelineShaderInfo( + const Device* pDevice, + const ShaderStageInfo* pShaderInfoIn, + Vkgc::PipelineShaderInfo* pShaderInfoOut, + Vkgc::PipelineOptions* pPipelineOptions, + PipelineOptimizerKey* pOptimizerKey, + Vkgc::NggState* pNggState + ); + + void ExecuteDeferCompile( + DeferredCompileWorkload* pWorkload); + private: 
    PAL_DISALLOW_COPY_AND_ASSIGN(PipelineCompiler);
@@ -302,11 +319,30 @@ class PipelineCompiler
         FreeCompilerBinary*       pFreeCompilerBinary,
         PipelineCreationFeedback* pPipelineFeedback);
 
+    VkResult LoadShaderModuleFromCache(
+        const Device*             pDevice,
+        VkShaderModuleCreateFlags flags,
+        uint32_t                  compilerMask,
+        Util::MetroHash::Hash&    uniqueHash,
+        ShaderModuleHandle*       pShaderModule);
+
+    void StoreShaderModuleToCache(
+        const Device*             pDevice,
+        VkShaderModuleCreateFlags flags,
+        uint32_t                  compilerMask,
+        Util::MetroHash::Hash&    uniqueHash,
+        ShaderModuleHandle*       pShaderModule);
+
+    Util::MetroHash::Hash GetShaderModuleCacheHash(
+        VkShaderModuleCreateFlags flags,
+        uint32_t                  compilerMask,
+        Util::MetroHash::Hash&    uniqueHash);
+
     // -----------------------------------------------------------------------------------------------------------------
 
     PhysicalDevice*    m_pPhysicalDevice;      // Vulkan physical device object
     Vkgc::GfxIpVersion m_gfxIp;                // Graphics IP version info, used by Vkgc
-
+    DeferCompileManager m_deferCompileMgr;     // Defer compile thread manager
     CompilerSolutionLlpc m_compilerSolutionLlpc;
 
     PipelineBinaryCache* m_pBinaryCache;       // Pipeline binary cache object
@@ -320,9 +356,10 @@ class PipelineCompiler
 
     UberFetchShaderFormatInfoMap m_uberFetchShaderInfoFormatMap; // Uber fetch shader format info map
 
-    void GetPipelineCreationInfoNext(
-        const VkStructHeader*                           pHeader,
-        const VkPipelineCreationFeedbackCreateInfoEXT** ppPipelineCreationFeadbackCreateInfo);
+    typedef Util::HashMap<Util::MetroHash::Hash, ShaderModuleHandle, PalAllocator> ShaderModuleHandleMap;
+
+    Util::Mutex           m_shaderModuleCacheLock;
+    ShaderModuleHandleMap m_shaderModuleHandleMap;
 
 }; // class PipelineCompiler
diff --git a/icd/api/include/vk_buffer.h b/icd/api/include/vk_buffer.h
index 0d66f92a..6a3a1054 100644
--- a/icd/api/include/vk_buffer.h
+++ b/icd/api/include/vk_buffer.h
@@ -82,11 +82,6 @@ class Buffer final : public NonDispatchable<Buffer, VkBuffer>
         const Device*         pDevice,
         VkMemoryRequirements* pMemoryRequirements);
 
-    static void CalculateMemoryRequirements(
-        const Device*             pDevice,
-        const VkBufferCreateInfo* pCreateInfo,
-        VkMemoryRequirements*     pMemoryRequirements);
-
     VkDeviceSize GetSize() const
         { return m_size; }
 
@@ -134,7 +129,6 @@ class Buffer final : public NonDispatchable<Buffer, VkBuffer>
     };
 
     Buffer(Device*                      pDevice,
-           const VkAllocationCallbacks* pAllocator,
            const VkBufferCreateInfo*    pCreateInfo,
            Pal::IGpuMemory**            pGpuMemory,
            BufferFlags                  internalFlags);
diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h
index 9cdaef88..bf5116cb 100644
--- a/icd/api/include/vk_cmdbuffer.h
+++ b/icd/api/include/vk_cmdbuffer.h
@@ -519,8 +519,8 @@ class CmdBuffer
         VkPrimitiveTopology primitiveTopology);
 
     void SetLineStippleEXT(
-        const Pal::LineStippleStateParams& params,
-        uint32_t                           staticToken);
+        uint32_t lineStippleFactor,
+        uint16_t lineStipplePattern);
 
     void SetColorWriteEnableEXT(
         uint32_t       attachmentCount,
@@ -707,10 +707,6 @@ class CmdBuffer
         uint32_t counterOffset,
         uint32_t vertexStride);
 
-    void SetLineStippleEXT(
-        uint32_t lineStippleFactor,
-        uint16_t lineStipplePattern);
-
     void CmdSetPerDrawVrsRate(
         const VkExtent2D*                        pFragmentSize,
         const VkFragmentShadingRateCombinerOpKHR combinerOps[2]);
@@ -950,7 +946,7 @@ class CmdBuffer
         const Pal::IDepthStencilState* pState);
 
     void PalCmdSetMsaaQuadSamplePattern(
-        uint32_t                          numSamplesPerPixel,
+        uint32_t                           numSamplesPerPixel,
         const Pal::MsaaQuadSamplePattern& quadSamplePattern);
 
     inline void PalCmdBufferSetUserData(
diff --git a/icd/api/include/vk_conv.h b/icd/api/include/vk_conv.h
index 06ff8d1c..2aea5ce8 100644
--- a/icd/api/include/vk_conv.h
+++ b/icd/api/include/vk_conv.h
@@ -3405,8 +3405,25 @@ struct UberFetchShaderFormatInfo
     };
 };
 
-typedef Util::HashMap<VkFormat, UberFetchShaderFormatInfo, PalAllocator>
-    UberFetchShaderFormatInfoMap;
+// =====================================================================================================================
+class UberFetchShaderFormatInfoMap :
+    public Util::HashMap<VkFormat, UberFetchShaderFormatInfo, PalAllocator>
+{
+public:
+    explicit UberFetchShaderFormatInfoMap(uint32 numBuckets, PalAllocator* const pAllocator)
+        :
+        Util::HashMap<VkFormat, UberFetchShaderFormatInfo, PalAllocator>(numBuckets, pAllocator),
+        m_bufferFormatMask(0)
+    { }
+
+    void SetBufferFormatMask(uint32_t mask) { m_bufferFormatMask = mask; }
+
+    uint32_t GetBufferFormatMask() const { return m_bufferFormatMask; }
+
+private:
+    uint32_t m_bufferFormatMask;
+};
+
 class PhysicalDevice;
 
 // =====================================================================================================================
@@ -3416,7 +3433,8 @@ VkResult InitializeUberFetchShaderFormatTable(
 
 UberFetchShaderFormatInfo GetUberFetchShaderFormatInfo(
     UberFetchShaderFormatInfoMap* pFormatInfoMap,
-    VkFormat                      vkFormat);
+    VkFormat                      vkFormat,
+    bool                          isZeroStride);
 
 } // namespace vk
diff --git a/icd/api/include/vk_descriptor_set.h b/icd/api/include/vk_descriptor_set.h
index 91365156..78d6cae9 100644
--- a/icd/api/include/vk_descriptor_set.h
+++ b/icd/api/include/vk_descriptor_set.h
@@ -125,6 +125,8 @@ class DescriptorSet final : public NonDispatchable<DescriptorSet, VkDescriptorSet>
diff --git a/icd/api/include/vk_device.h b/icd/api/include/vk_device.h
--- a/icd/api/include/vk_device.h
+++ b/icd/api/include/vk_device.h
         { return VkPhysicalDevice(DefaultDeviceIndex)->GetPalHeapFromVkTypeIndex(vkIndex); }
 
-    uint32_t GetUmdFpsCapFrameRate() const
-    {
-        return VkPhysicalDevice(DefaultDeviceIndex)->PalProperties().osProperties.umdFpsCapFrameRate;
-    }
-
     uint64_t TimestampFrequency() const
     {
         return VkPhysicalDevice(DefaultDeviceIndex)->PalProperties().timestampFrequency;
@@ -575,9 +574,6 @@ class Device
     BarrierFilterLayer* GetBarrierFilterLayer()
         { return m_pBarrierFilterLayer; }
 
-    AsyncLayer* GetAsyncLayer()
-        { return m_pAsyncLayer; }
-
 #if VKI_GPU_DECOMPRESS
     GpuDecoderLayer* GetGpuDecoderLayer()
         { return m_pGpuDecoderLayer; }
@@ -632,6 +628,9 @@ class Device
     bool UseCompactDynamicDescriptors() const
         { return !GetRuntimeSettings().enableRelocatableShaders && !GetEnabledFeatures().robustBufferAccess;}
 
+    bool MustWriteImmutableSamplers() const
+        { return GetEnabledFeatures().mustWriteImmutableSamplers; }
+
     bool SupportDepthStencilResolve() const
     {
         return (IsExtensionEnabled(DeviceExtensions::KHR_DEPTH_STENCIL_RESOLVE) ||
@@ -777,7 +776,6 @@ class Device
     const DeviceExtensions::Enabled m_enabledExtensions;   // Enabled device extensions
     DispatchTable                   m_dispatchTable;       // Device dispatch table
     SqttMgr*                        m_pSqttMgr;            // Manager for developer mode SQ thread tracing
-    AsyncLayer*                     m_pAsyncLayer;         // State for async compiler layer, otherwise null
     OptLayer*                       m_pAppOptLayer;        // State for an app-specific layer, otherwise null
     BarrierFilterLayer*             m_pBarrierFilterLayer; // State for enabling barrier filtering, otherwise
                                                            // null
@@ -808,6 +806,9 @@ class Device
     // If set to true, will use a compute queue internally for transfers.
bool m_useComputeAsTransferQueue; + // If set to true, overrides compute queue to universal queue internally + bool m_useUniversalAsComputeQueue; + // The max VRS shading rate supported VkExtent2D m_maxVrsShadingRate; @@ -1152,6 +1153,11 @@ VKAPI_ATTR void VKAPI_CALL vkCmdSetLineStippleEXT( uint32_t lineStippleFactor, uint16_t lineStipplePattern); +VKAPI_ATTR void VKAPI_CALL vkSetDeviceMemoryPriorityEXT( + VkDevice device, + VkDeviceMemory memory, + float priority); + } // namespace entry } // namespace vk diff --git a/icd/api/include/vk_extensions.h b/icd/api/include/vk_extensions.h index a15af0e6..d5342afe 100644 --- a/icd/api/include/vk_extensions.h +++ b/icd/api/include/vk_extensions.h @@ -272,6 +272,7 @@ class DeviceExtensions final : public Extensions KHR_SHADER_DRAW_PARAMETERS, KHR_SHADER_FLOAT16_INT8, KHR_SHADER_FLOAT_CONTROLS, + KHR_SHADER_INTEGER_DOT_PRODUCT, KHR_SHADER_NON_SEMANTIC_INFO, KHR_SHADER_SUBGROUP_EXTENDED_TYPES, KHR_SHADER_SUBGROUP_UNIFORM_CONTROL_FLOW, @@ -312,10 +313,12 @@ class DeviceExtensions final : public Extensions EXT_LOAD_STORE_OP_NONE, EXT_MEMORY_BUDGET, EXT_MEMORY_PRIORITY, + EXT_PAGEABLE_DEVICE_LOCAL_MEMORY, EXT_PCI_BUS_INFO, EXT_PIPELINE_CREATION_CACHE_CONTROL, EXT_PIPELINE_CREATION_FEEDBACK, EXT_POST_DEPTH_COVERAGE, + EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART, EXT_PRIVATE_DATA, EXT_QUEUE_FAMILY_FOREIGN, EXT_ROBUSTNESS2, diff --git a/icd/api/include/vk_graphics_pipeline.h b/icd/api/include/vk_graphics_pipeline.h index d09a3c6f..df537f38 100644 --- a/icd/api/include/vk_graphics_pipeline.h +++ b/icd/api/include/vk_graphics_pipeline.h @@ -49,6 +49,18 @@ class CmdBuffer; class RenderPass; struct CmdBufferRenderState; +// ===================================================================================================================== +// Create info of graphics pipeline deferred compile +struct DeferGraphicsPipelineCreateInfo +{ + Device* pDevice; + PipelineCache* pPipelineCache; + GraphicsPipeline* pPipeline; + GraphicsPipelineBinaryCreateInfo binaryCreateInfo; + GraphicsPipelineShaderStageInfo shaderStageInfo; + GraphicsPipelineObjectCreateInfo objectCreateInfo; +}; + // ===================================================================================================================== // Convert sample location coordinates from [0,1] space (sent by the application) to [-8, 7] space (accepted by PAL) static void ConvertCoordinates( @@ -158,7 +170,7 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch const VkAllocationCallbacks* pAllocator) override; const VbBindingInfo& GetVbBindingInfo() const - { return m_vbInfo.bindingInfo; } + { return m_vbInfo; } void BindToCmdBuffer( CmdBuffer* pCmdBuffer, @@ -197,7 +209,8 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch bool bindInputAssemblyState, bool force1x1ShaderRate, bool customSampleLocations, - const VbInfo& vbInfo, + const VbBindingInfo& vbInfo, + const PipelineInternalBufferInfo* pInternalBuffer, Pal::IMsaaState** pPalMsaa, Pal::IColorBlendState** pPalColorBlend, Pal::IDepthStencilState** pPalDepthStencil, @@ -234,7 +247,8 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch const VkGraphicsPipelineCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, const PipelineLayout* pPipelineLayout, - const VbInfo* pVbInfo, + const VbBindingInfo* pVbInfo, + const PipelineInternalBufferInfo* pInternalBuffer, const size_t* pPipelineBinarySizes, const void** pPipelineBinaries, PipelineCache* pPipelineCache, @@ 
-247,12 +261,54 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch
 private:
     PAL_DISALLOW_COPY_AND_ASSIGN(GraphicsPipeline);
 
+    VkResult DeferCreateOptimizedPipeline(
+        Device*                           pDevice,
+        PipelineCache*                    pPipelineCache,
+        GraphicsPipelineBinaryCreateInfo* pBinaryCreateInfo,
+        GraphicsPipelineShaderStageInfo*  pShaderStageInfo,
+        GraphicsPipelineObjectCreateInfo* pObjectCreateInfo);
+
+    static VkResult CreatePalPipelineObjects(
+        Device*                           pDevice,
+        PipelineCache*                    pPipelineCache,
+        GraphicsPipelineObjectCreateInfo* pObjectCreateInfo,
+        const size_t*                     pPipelineBinarySizes,
+        const void**                      pPipelineBinaries,
+        const Util::MetroHash::Hash*      pCacheIds,
+        void*                             pSystemMem,
+        Pal::IPipeline**                  pPalPipeline);
+
+    void SetOptimizedPipeline(Pal::IPipeline** pPalPipeline);
+
+    bool UseOptimizedPipeline() const
+    {
+        bool result = m_info.checkDeferCompilePipeline;
+        if (result)
+        {
+            Util::MutexAuto pipelineSwitchLock(const_cast<Util::Mutex*>(&m_pipelineSwitchLock));
+            result = m_pOptimizedPipeline[0] != nullptr && m_optimizedPipelineHash != 0;
+        }
+        return result;
+    }
+
+    VkResult BuildDeferCompileWorkload(
+        Device*                           pDevice,
+        PipelineCache*                    pPipelineCache,
+        GraphicsPipelineBinaryCreateInfo* pBinaryCreateInfo,
+        GraphicsPipelineShaderStageInfo*  pShaderStageInfo,
+        GraphicsPipelineObjectCreateInfo* pObjectCreateInfo);
+
+    static void ExecuteDeferCreateOptimizedPipeline(void* pPayload);
+
     GraphicsPipelineObjectImmedInfo m_info;               // Immediate state that will go in CmdSet* functions
     Pal::IMsaaState*         m_pPalMsaa[MaxPalDevices];         // PAL MSAA state object
     Pal::IColorBlendState*   m_pPalColorBlend[MaxPalDevices];   // PAL color blend state object
     Pal::IDepthStencilState* m_pPalDepthStencil[MaxPalDevices]; // PAL depth stencil state object
-    VbInfo                   m_vbInfo;                          // Information about vertex buffer bindings
-
+    VbBindingInfo            m_vbInfo;                          // Information about vertex buffer bindings
+    PipelineInternalBufferInfo m_internalBufferInfo;            // Information about internal buffer
+    Pal::IPipeline*          m_pOptimizedPipeline[MaxPalDevices]; // Optimized PAL pipelines
+    uint64_t                 m_optimizedPipelineHash;           // Pipeline hash of optimized PAL pipelines
+    Util::Mutex              m_pipelineSwitchLock;              // Lock for optimized pipeline and default pipeline
+    DeferredCompileWorkload  m_deferWorkload;                   // Workload of deferred compile
 
     union
     {
         uint8 value;
diff --git a/icd/api/include/vk_image.h b/icd/api/include/vk_image.h
index 9d9e7e8f..d8278745 100644
--- a/icd/api/include/vk_image.h
+++ b/icd/api/include/vk_image.h
@@ -279,7 +279,6 @@ class Image final : public NonDispatchable<Image, VkImage>
 
     Image(
         Device*                      pDevice,
-        const VkAllocationCallbacks* pAllocator,
         VkImageCreateFlags           flags,
         Pal::IImage**                pPalImage,
         Pal::IGpuMemory**            pPalMemory,
diff --git a/icd/api/include/vk_memory.h b/icd/api/include/vk_memory.h
index 98b1f4f9..6ebf5b3f 100644
--- a/icd/api/include/vk_memory.h
+++ b/icd/api/include/vk_memory.h
@@ -71,6 +71,10 @@ union MemoryPriority
     { return ((priority < memPriority.priority) ||
              ((priority == memPriority.priority) && (offset < memPriority.offset))); }
 
+    bool operator!=(const MemoryPriority& memPriority) const
+    { return ((priority != memPriority.priority) ||
+             ((priority == memPriority.priority) && (offset != memPriority.offset))); }
+
     static MemoryPriority FromSetting(uint32_t value);
 
     static MemoryPriority FromVkMemoryPriority(float value);
@@ -133,6 +137,10 @@ class Memory final : public NonDispatchable<Memory, VkDeviceMemory>
 
     void ElevatePriority(MemoryPriority priority);
 
+    void SetPriority(
+        const MemoryPriority priority,
+        const bool           mustBeLower);
+
     Pal::IGpuMemory* PalMemory(uint32_t resourceIndex, uint32_t memoryIndex);
 
     Pal::IGpuMemory* PalMemory(uint32_t resourceIndex) const
diff --git a/icd/api/include/vk_pipeline_layout.h b/icd/api/include/vk_pipeline_layout.h
index 253abb9b..e12b72ad 100644
--- a/icd/api/include/vk_pipeline_layout.h
+++ b/icd/api/include/vk_pipeline_layout.h
@@ -117,7 +117,7 @@ class PipelineLayout final : public NonDispatchable<PipelineLayout, VkPipelineLayout>
diff --git a/icd/api/pipeline_compiler.cpp b/icd/api/pipeline_compiler.cpp
--- a/icd/api/pipeline_compiler.cpp
+++ b/icd/api/pipeline_compiler.cpp
 #include "palFile.h"
 #include "palHashSetImpl.h"
+#include "palListImpl.h"
 
 #include "include/pipeline_binary_cache.h"
@@ -113,6 +114,7 @@ PipelineCompiler::PipelineCompiler(
     , m_totalBinaries(0)
     , m_totalTimeSpent(0)
     , m_uberFetchShaderInfoFormatMap(8, pPhysicalDevice->Manager()->VkInstance()->Allocator())
+    , m_shaderModuleHandleMap(8, pPhysicalDevice->Manager()->VkInstance()->Allocator())
 {
 
 }
@@ -245,7 +247,12 @@ VkResult PipelineCompiler::Initialize()
 
     if (result == VK_SUCCESS)
     {
-        if (settings.enableUberFetchShader)
+        result = PalToVkResult(m_shaderModuleHandleMap.Init());
+    }
+
+    if (result == VK_SUCCESS)
+    {
+        if (settings.enableUberFetchShader || settings.enableEarlyCompile)
         {
             m_uberFetchShaderInfoFormatMap.Init();
 
@@ -253,6 +260,12 @@ VkResult PipelineCompiler::Initialize()
         }
     }
 
+    if (result == VK_SUCCESS)
+    {
+        uint32_t threadCount = settings.deferCompileOptimizedPipeline ? settings.deferCompileThreadCount : 0;
+        m_deferCompileMgr.Init(threadCount, m_pPhysicalDevice->VkInstance()->Allocator());
+    }
+
     return result;
 }
 
@@ -263,6 +276,28 @@ void PipelineCompiler::Destroy()
     m_compilerSolutionLlpc.Destroy();
 
     DestroyPipelineBinaryCache();
+
+    {
+        Util::MutexAuto mutexLock(&m_shaderModuleCacheLock);
+        for (auto it = m_shaderModuleHandleMap.Begin(); it.Get() != nullptr; it.Next())
+        {
+            VK_ASSERT(it.Get()->value.pRefCount != nullptr);
+
+            if (*(it.Get()->value.pRefCount) == 1)
+            {
+                // Force the use of the non-locking version of FreeShaderModule.
+                auto pInstance = m_pPhysicalDevice->Manager()->VkInstance();
+                pInstance->FreeMem(it.Get()->value.pRefCount);
+                it.Get()->value.pRefCount = nullptr;
+                FreeShaderModule(&it.Get()->value);
+            }
+            else
+            {
+                (*(it.Get()->value.pRefCount))--;
+            }
+        }
+        m_shaderModuleHandleMap.Reset();
+    }
 }
 
 // =====================================================================================================================
@@ -331,6 +366,130 @@ bool PipelineCompiler::LoadReplaceShaderBinary(
 
     return findShader;
 }
 
+// =====================================================================================================================
+// Generates the shader module cache hash ID
+Util::MetroHash::Hash PipelineCompiler::GetShaderModuleCacheHash(
+    VkShaderModuleCreateFlags flags,
+    uint32_t                  compilerMask,
+    Util::MetroHash::Hash&    uniqueHash)
+{
+    Util::MetroHash128 hasher;
+    Util::MetroHash::Hash hash;
+    hasher.Update(compilerMask);
+    hasher.Update(uniqueHash);
+    hasher.Update(flags);
+    hasher.Update(m_pPhysicalDevice->GetSettingsLoader()->GetSettingsHash());
+    hasher.Finalize(hash.bytes);
+    return hash;
+}
+
+// =====================================================================================================================
+// Loads a shader module from the cache, including both the run-time cache and the binary cache
+VkResult PipelineCompiler::LoadShaderModuleFromCache(
+    const Device*             pDevice,
+    VkShaderModuleCreateFlags flags,
+    uint32_t                  compilerMask,
+    Util::MetroHash::Hash&    uniqueHash,
+    ShaderModuleHandle*       pShaderModule)
+{
+    bool supportModuleCache = true;
+
+#if ICD_X86_BUILD
+    supportModuleCache = false;
+#endif
+
+    if (compilerMask & (1 << PipelineCompilerTypeLlpc))
+    {
+        // LLPC always defers SPIRV conversion, we needn't cache the result
+        supportModuleCache = false;
+    }
+
+    VK_ASSERT(pShaderModule->pRefCount == nullptr);
+
+    VkResult result = VK_ERROR_INITIALIZATION_FAILED;
+    if (supportModuleCache)
+    {
+        Util::MutexAuto mutexLock(&m_shaderModuleCacheLock);
+
+        Util::MetroHash::Hash shaderModuleCacheHash = GetShaderModuleCacheHash(flags, compilerMask, uniqueHash);
+        auto pHandle = m_shaderModuleHandleMap.FindKey(shaderModuleCacheHash);
+        if (pHandle != nullptr)
+        {
+            VK_ASSERT(pHandle->pRefCount != nullptr);
+            (*(pHandle->pRefCount))++;
+            *pShaderModule = *pHandle;
+            result = VK_SUCCESS;
+        }
+        else if (m_pBinaryCache != nullptr)
+        {
+            if (result == VK_SUCCESS)
+            {
+                auto pInstance = m_pPhysicalDevice->VkInstance();
+                pShaderModule->pRefCount = reinterpret_cast<uint32_t*>(
+                    pInstance->AllocMem(sizeof(uint32_t), VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_CACHE));
+                if (pShaderModule->pRefCount != nullptr)
+                {
+                    // Initialize the reference count to two: one for the runtime cache and one for this shader module.
+                    *pShaderModule->pRefCount = 2;
+                    result = PalToVkResult(m_shaderModuleHandleMap.Insert(shaderModuleCacheHash, *pShaderModule));
+                    VK_ASSERT(result == VK_SUCCESS);
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
+// =====================================================================================================================
+// Stores a shader module to the cache, including both the run-time cache and the binary cache
+void PipelineCompiler::StoreShaderModuleToCache(
+    const Device*             pDevice,
+    VkShaderModuleCreateFlags flags,
+    uint32_t                  compilerMask,
+    Util::MetroHash::Hash&    uniqueHash,
+    ShaderModuleHandle*       pShaderModule)
+{
+
+    VK_ASSERT(pShaderModule->pRefCount == nullptr);
+
+    bool supportModuleCache = true;
+
+#if ICD_X86_BUILD
+    supportModuleCache = false;
+#endif
+
+    if (compilerMask & (1 << PipelineCompilerTypeLlpc))
+    {
+        // LLPC always defers SPIRV conversion, we needn't cache the result
+        supportModuleCache = false;
+    }
+
+    if (supportModuleCache)
+    {
+        Util::MetroHash::Hash shaderModuleCacheHash = GetShaderModuleCacheHash(flags, compilerMask, uniqueHash);
+        auto pInstance = m_pPhysicalDevice->VkInstance();
+        pShaderModule->pRefCount = reinterpret_cast<uint32_t*>(
+            pInstance->AllocMem(sizeof(uint32_t), VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_CACHE));
+        if (pShaderModule->pRefCount != nullptr)
+        {
+            Util::MutexAuto mutexLock(&m_shaderModuleCacheLock);
+            // Initialize the reference count to two: one for the runtime cache and one for this shader module.
+            *pShaderModule->pRefCount = 2;
+            auto palResult = m_shaderModuleHandleMap.Insert(shaderModuleCacheHash, *pShaderModule);
+            if (palResult != Util::Result::Success)
+            {
+                // Reset the reference count to one if the handle could not be added to the runtime cache
+                *pShaderModule->pRefCount = 1;
+            }
+        }
+
+        if (m_pBinaryCache != nullptr)
+        {
+        }
+    }
+}
+
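Taken together, the two helpers above implement a reference-counted run-time cache: a module enters the map with a count of two (one reference owned by the map, one by the creating shader module), each cache hit adds one, FreeShaderModule (below) releases one, and Destroy() finally drops the map's own reference. A standalone sketch of the same ownership scheme, using std::unordered_map in place of Util::HashMap and a plain struct in place of ShaderModuleHandle, illustration only:

    #include <cstdint>
    #include <unordered_map>

    struct Handle { uint32_t* pRefCount; };

    std::unordered_map<uint64_t, Handle> runtimeCache;

    // Cache hit: hand out the shared handle and add a reference.
    bool Load(uint64_t hash, Handle* pOut)
    {
        auto it = runtimeCache.find(hash);
        if (it == runtimeCache.end()) { return false; }
        ++(*it->second.pRefCount);
        *pOut = it->second;
        return true;
    }

    // Cache miss: publish a freshly built handle with a count of two.
    void Store(uint64_t hash, Handle* pHandle)
    {
        pHandle->pRefCount = new uint32_t(2);    // one for the cache, one for the caller
        if (!runtimeCache.emplace(hash, *pHandle).second)
        {
            *pHandle->pRefCount = 1;             // not published: the caller is the only owner
        }
    }

    // Mirror of FreeShaderModule: the final reference is held by the cache itself,
    // so the underlying compiler handle is only destroyed when the count reaches 1
    // and the cache (or its teardown) releases it.
    void Free(Handle* pHandle)
    {
        if (*pHandle->pRefCount > 1) { --(*pHandle->pRefCount); }
        else                         { delete pHandle->pRefCount; /* destroy the payload here */ }
    }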
// =====================================================================================================================
// Builds shader module from SPIR-V binary code.
 VkResult PipelineCompiler::BuildShaderModule(
@@ -344,26 +503,49 @@ VkResult PipelineCompiler::BuildShaderModule(
     auto pInstance = m_pPhysicalDevice->Manager()->VkInstance();
     VkResult result = VK_SUCCESS;
     uint32_t compilerMask = GetCompilerCollectionMask();
-    Util::MetroHash::Hash hash = {};
-    Util::MetroHash64::Hash(reinterpret_cast<const uint8_t*>(pCode), codeSize, hash.bytes);
+    Util::MetroHash::Hash stableHash = {};
+    Util::MetroHash::Hash uniqueHash = {};
+    Util::MetroHash64::Hash(reinterpret_cast<const uint8_t*>(pCode), codeSize, stableHash.bytes);
+    uniqueHash = stableHash;
 
     bool findReplaceShader = false;
 
-    if (pSettings->shaderReplaceMode == ShaderReplaceShaderHash)
+    if ((pSettings->shaderReplaceMode == ShaderReplaceShaderHash) ||
+        (pSettings->shaderReplaceMode == ShaderReplaceShaderHashPipelineBinaryHash))
     {
         size_t replaceCodeSize = 0;
         void* pReplaceCode = nullptr;
-        uint64_t hash64 = Util::MetroHash::Compact64(&hash);
+        uint64_t hash64 = Util::MetroHash::Compact64(&stableHash);
         findReplaceShader = LoadReplaceShaderBinary(hash64, &replaceCodeSize, &pReplaceCode);
         if (findReplaceShader)
         {
             pCode = pReplaceCode;
             codeSize = replaceCodeSize;
+            Util::MetroHash64::Hash(reinterpret_cast<const uint8_t*>(pCode), codeSize, uniqueHash.bytes);
         }
     }
 
-    if (compilerMask & (1 << PipelineCompilerTypeLlpc))
+    result = LoadShaderModuleFromCache(pDevice, flags, compilerMask, uniqueHash, pShaderModule);
+    if (result != VK_SUCCESS)
+    {
+        if (compilerMask & (1 << PipelineCompilerTypeLlpc))
+        {
+            result = m_compilerSolutionLlpc.BuildShaderModule(pDevice, flags, codeSize, pCode, pShaderModule, stableHash);
+        }
+
+        StoreShaderModuleToCache(pDevice, flags, compilerMask, uniqueHash, pShaderModule);
+    }
+    else
     {
-        result = m_compilerSolutionLlpc.BuildShaderModule(pDevice, flags, codeSize, pCode, pShaderModule, hash);
+        if (result == VK_SUCCESS)
+        {
+            if (pSettings->enablePipelineDump)
+            {
+                Vkgc::BinaryData spvBin = {};
+                spvBin.pCode = pCode;
+                spvBin.codeSize = codeSize;
+                Vkgc::IPipelineDumper::DumpSpirvBinary(pSettings->pipelineDumpDir, &spvBin);
+            }
+        }
     }
 
     if (findReplaceShader)
@@ -390,11 +572,28 @@ bool PipelineCompiler::IsValidShaderModule(
 void PipelineCompiler::FreeShaderModule(
     ShaderModuleHandle* pShaderModule)
 {
-    m_compilerSolutionLlpc.FreeShaderModule(pShaderModule);
+    if (pShaderModule->pRefCount != nullptr)
+    {
+        Util::MutexAuto mutexLock(&m_shaderModuleCacheLock);
+        if (*pShaderModule->pRefCount > 1)
+        {
+            (*pShaderModule->pRefCount)--;
+        }
+        else
+        {
+            m_compilerSolutionLlpc.FreeShaderModule(pShaderModule);
+            auto pInstance = m_pPhysicalDevice->Manager()->VkInstance();
+            pInstance->FreeMem(pShaderModule->pRefCount);
+        }
+    }
+    else
+    {
+        m_compilerSolutionLlpc.FreeShaderModule(pShaderModule);
+    }
 }
 
 // =====================================================================================================================
-// Replaces pipeline binary from external replacment file (_repalce.elf)
+// Replaces pipeline binary from external replacement file (_replace.elf)
 template<class PipelineBuildInfo>
 bool PipelineCompiler::ReplacePipelineBinary(
         const PipelineBuildInfo* pPipelineBuildInfo,
@@ -698,6 +897,7 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary(
     int64_t compileTime = 0;
     uint64_t pipelineHash = Vkgc::IPipelineDumper::GetPipelineHash(&pCreateInfo->pipelineInfo);
+    uint64_t optimizedPipelineHash = 0;
 
     void* pPipelineDumpHandle = nullptr;
     const void* moduleDataBaks[ShaderStage::ShaderStageGfxCount];
@@ -713,7 +913,8 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary(
         &pCreateInfo->pipelineInfo.fs,
     };
 
-    if (settings.shaderReplaceMode ==
ShaderReplacePipelineBinaryHash) + if ((settings.shaderReplaceMode == ShaderReplacePipelineBinaryHash) || + (settings.shaderReplaceMode == ShaderReplaceShaderHashPipelineBinaryHash)) { if (ReplacePipelineBinary(&pCreateInfo->pipelineInfo, pPipelineBinarySize, ppPipelineBinary, pipelineHash)) { @@ -742,17 +943,18 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( } } - if (settings.enablePipelineDump) + // Generate optimized pipeline hash if both early compile and defer compile are enabled + if (settings.deferCompileOptimizedPipeline && + (pCreateInfo->pipelineInfo.enableEarlyCompile || pCreateInfo->pipelineInfo.enableUberFetchShader)) { - Vkgc::PipelineDumpOptions dumpOptions = {}; - dumpOptions.pDumpDir = settings.pipelineDumpDir; - dumpOptions.filterPipelineDumpByType = settings.filterPipelineDumpByType; - dumpOptions.filterPipelineDumpByHash = settings.filterPipelineDumpByHash; - dumpOptions.dumpDuplicatePipelines = settings.dumpDuplicatePipelines; + bool enableEarlyCompile = pCreateInfo->pipelineInfo.enableEarlyCompile; + bool enableUberFetchShader = pCreateInfo->pipelineInfo.enableUberFetchShader; - Vkgc::PipelineBuildInfo pipelineInfo = {}; - pipelineInfo.pGraphicsInfo = &pCreateInfo->pipelineInfo; - pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, pipelineInfo, pipelineHash); + pCreateInfo->pipelineInfo.enableEarlyCompile = false; + pCreateInfo->pipelineInfo.enableUberFetchShader = false; + optimizedPipelineHash = Vkgc::IPipelineDumper::GetPipelineHash(&pCreateInfo->pipelineInfo); + pCreateInfo->pipelineInfo.enableEarlyCompile = enableEarlyCompile; + pCreateInfo->pipelineInfo.enableUberFetchShader = enableUberFetchShader; } // PAL Pipeline caching @@ -770,22 +972,73 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( pPipelineBinaryCache = pPipelineCache->GetPipelineCache(); } + int64_t startTime = 0; if (shouldCompile && ((pPipelineBinaryCache != nullptr) || (m_pBinaryCache != nullptr))) { - int64_t startTime = Util::GetPerfCpuTime(); + startTime = Util::GetPerfCpuTime(); - GetGraphicsPipelineCacheId( - deviceIdx, - pCreateInfo, - pipelineHash, - m_pPhysicalDevice->GetSettingsLoader()->GetSettingsHash(), - pCacheId); + // Search optimized pipeline first + if (optimizedPipelineHash != 0) + { + GetGraphicsPipelineCacheId( + deviceIdx, + pCreateInfo, + optimizedPipelineHash, + m_pPhysicalDevice->GetSettingsLoader()->GetSettingsHash(), + pCacheId); + + cacheResult = GetCachedPipelineBinary(pCacheId, pPipelineBinaryCache, pPipelineBinarySize, ppPipelineBinary, + &isUserCacheHit, &isInternalCacheHit, &pCreateInfo->freeCompilerBinary, &pCreateInfo->pipelineFeedback); + if (cacheResult == Util::Result::Success) + { + shouldCompile = false; + // Update pipeline option for optimized pipeline and update dump handle. 
+ pCreateInfo->pipelineInfo.enableEarlyCompile = false; + pCreateInfo->pipelineInfo.enableUberFetchShader = false; - cacheResult = GetCachedPipelineBinary(pCacheId, pPipelineBinaryCache, pPipelineBinarySize, ppPipelineBinary, - &isUserCacheHit, &isInternalCacheHit, &pCreateInfo->freeCompilerBinary, &pCreateInfo->pipelineFeedback); - if (cacheResult == Util::Result::Success) + } + } + } + + if (settings.enablePipelineDump) + { + Vkgc::PipelineDumpOptions dumpOptions = {}; + dumpOptions.pDumpDir = settings.pipelineDumpDir; + dumpOptions.filterPipelineDumpByType = settings.filterPipelineDumpByType; + dumpOptions.filterPipelineDumpByHash = settings.filterPipelineDumpByHash; + dumpOptions.dumpDuplicatePipelines = settings.dumpDuplicatePipelines; + + Vkgc::PipelineBuildInfo pipelineInfo = {}; + pipelineInfo.pGraphicsInfo = &pCreateInfo->pipelineInfo; + uint64_t dumpHash = pipelineHash; + if (optimizedPipelineHash != 0) { - shouldCompile = false; + if (shouldCompile == false) + { + // Current pipeline is optimized pipeline if optimized pipeline is valid and pipeline cache is hit + dumpHash = optimizedPipelineHash; + } + } + pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, pipelineInfo, dumpHash); + } + + if (shouldCompile && ((pPipelineBinaryCache != nullptr) || (m_pBinaryCache != nullptr))) + { + if (shouldCompile) + { + GetGraphicsPipelineCacheId( + deviceIdx, + pCreateInfo, + pipelineHash, + m_pPhysicalDevice->GetSettingsLoader()->GetSettingsHash(), + pCacheId); + + cacheResult = GetCachedPipelineBinary(pCacheId, pPipelineBinaryCache, pPipelineBinarySize, ppPipelineBinary, + &isUserCacheHit, &isInternalCacheHit, &pCreateInfo->freeCompilerBinary, &pCreateInfo->pipelineFeedback); + if (cacheResult == Util::Result::Success) + { + shouldCompile = false; + } } cacheTime = Util::GetPerfCpuTime() - startTime; @@ -904,7 +1157,8 @@ VkResult PipelineCompiler::CreateComputePipelineBinary( ShaderModuleHandle shaderModuleReplaceHandle = {}; bool shaderModuleReplaced = false; - if (settings.shaderReplaceMode == ShaderReplacePipelineBinaryHash) + if ((settings.shaderReplaceMode == ShaderReplacePipelineBinaryHash) || + (settings.shaderReplaceMode == ShaderReplaceShaderHashPipelineBinaryHash)) { if (ReplacePipelineBinary(&pCreateInfo->pipelineInfo, pPipelineBinarySize, ppPipelineBinary, pipelineHash)) { @@ -1373,7 +1627,7 @@ static void BuildViewportState( } // ===================================================================================================================== -static void BuildNggState( +void PipelineCompiler::BuildNggState( const Device* pDevice, const VkShaderStageFlagBits activeStages, const bool isConservativeOverestimation, @@ -1468,7 +1722,7 @@ static void BuildDepthStencilState( } // ===================================================================================================================== -static void BuildPipelineShaderInfo( +void PipelineCompiler::BuildPipelineShaderInfo( const Device* pDevice, const ShaderStageInfo* pShaderInfoIn, Vkgc::PipelineShaderInfo* pShaderInfoOut, @@ -1488,7 +1742,6 @@ static void BuildPipelineShaderInfo( pShaderInfoOut->pSpecializationInfo = pShaderInfoIn->pSpecializationInfo; pShaderInfoOut->pEntryTarget = pShaderInfoIn->pEntryPoint; pShaderInfoOut->entryStage = stage; - pCompiler->ApplyDefaultShaderOptions(stage, &pShaderInfoOut->options ); @@ -1516,7 +1769,7 @@ static VkResult BuildPipelineResourceMapping( const Device* pDevice, const PipelineLayout* pLayout, const uint32_t stageMask, - VbInfo* pVbInfo, + 
VbBindingInfo* pVbInfo, GraphicsPipelineBinaryCreateInfo* pCreateInfo) { VkResult result = VK_SUCCESS; @@ -1612,7 +1865,7 @@ static void BuildPipelineShadersInfo( { if (((shaderMask & (1 << stage)) != 0) && (pShaderInfo->stages[stage].pModuleHandle != nullptr)) { - BuildPipelineShaderInfo(pDevice, + PipelineCompiler::BuildPipelineShaderInfo(pDevice, &pShaderInfo->stages[stage], ppShaderInfoOut[stage], &pCreateInfo->pipelineInfo.options, @@ -1706,7 +1959,7 @@ static void BuildVertexInputInterfaceState( pCreateInfo->pipelineInfo.dynamicVertexStride = true; } - if (pDevice->GetRuntimeSettings().enableUberFetchShader) + if (pDevice->GetRuntimeSettings().enableUberFetchShader || pDevice->GetRuntimeSettings().enableEarlyCompile) { pCreateInfo->pipelineInfo.enableUberFetchShader = true; } @@ -1733,7 +1986,7 @@ static void BuildPreRasterizationShaderState( BuildViewportState(pDevice, pIn->pViewportState, dynamicStateFlags, pCreateInfo); } - BuildNggState(pDevice, activeStages, isConservativeOverestimation, pCreateInfo); + PipelineCompiler::BuildNggState(pDevice, activeStages, isConservativeOverestimation, pCreateInfo); if (activeStages & (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) { @@ -1805,41 +2058,18 @@ static void BuildFragmentOutputInterfaceState( } // ===================================================================================================================== -static VkResult BuildUberFetchShaderInternalData( +static VkResult BuildPipelineInternalBufferData( const Device* pDevice, GraphicsPipelineBinaryCreateInfo* pCreateInfo, - VbInfo* pVbInfo) + PipelineInternalBufferInfo* pInternalBufferInfo) { PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); VK_ASSERT(pCreateInfo->pipelineInfo.enableUberFetchShader); - - auto result = pDefaultCompiler->BuildUberFetchShaderInternalData(pCreateInfo->compilerType, - pCreateInfo->pipelineInfo.pVertexInput, - pCreateInfo->pipelineInfo.dynamicVertexStride, - &pVbInfo->uberFetchShaderBuffer); - - auto pSettings = &pDevice->GetRuntimeSettings(); - - if (pSettings->disablePerInstanceFetch) - { - if (pVbInfo->uberFetchShaderBuffer.requirePerIntanceFetch) - { - pCreateInfo->pipelineInfo.enableUberFetchShader = false; - pVbInfo->uberFetchShaderBuffer.bufferSize = 0; - } - } - - if (pSettings->disablePerCompFetch) - { - if (pVbInfo->uberFetchShaderBuffer.requirePerCompFetch) - { - pCreateInfo->pipelineInfo.enableUberFetchShader = false; - pVbInfo->uberFetchShaderBuffer.bufferSize = 0; - } - } - + auto result = pDefaultCompiler->BuildPipelineInternalBufferData(pCreateInfo, + pInternalBufferInfo); return result; } + // ===================================================================================================================== static VkResult BuildExecutablePipelineState( const Device* pDevice, @@ -1848,7 +2078,8 @@ static VkResult BuildExecutablePipelineState( const PipelineLayout* pPipelineLayout, const uint32_t dynamicStateFlags, GraphicsPipelineBinaryCreateInfo* pCreateInfo, - VbInfo* pVbInfo) + VbBindingInfo* pVbInfo, + PipelineInternalBufferInfo* pInternalBufferInfo) { const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); @@ -1908,8 +2139,7 @@ static VkResult BuildExecutablePipelineState( if (pCreateInfo->pipelineInfo.enableUberFetchShader) { - VK_ASSERT(pVbInfo->uberFetchShaderBuffer.userDataOffset > 0); - result = BuildUberFetchShaderInternalData(pDevice, pCreateInfo, pVbInfo); + 
result = BuildPipelineInternalBufferData(pDevice, pCreateInfo, pInternalBufferInfo); } } @@ -1924,7 +2154,8 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( const GraphicsPipelineShaderStageInfo* pShaderInfo, const PipelineLayout* pPipelineLayout, GraphicsPipelineBinaryCreateInfo* pCreateInfo, - VbInfo* pVbInfo) + VbBindingInfo* pVbInfo, + PipelineInternalBufferInfo* pInternalBufferInfo) { VK_ASSERT(pIn != nullptr); @@ -1938,7 +2169,7 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( pIn->pDynamicState ); - BuildVertexInputInterfaceState(pDevice, pIn, dynamicStateFlags, pCreateInfo, &pVbInfo->bindingInfo); + BuildVertexInputInterfaceState(pDevice, pIn, dynamicStateFlags, pCreateInfo, pVbInfo); BuildPreRasterizationShaderState(pDevice, pIn, pShaderInfo, dynamicStateFlags, activeStages, pCreateInfo); @@ -1955,7 +2186,7 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( { result = BuildExecutablePipelineState( - pDevice, pIn, pShaderInfo, pPipelineLayout, dynamicStateFlags, pCreateInfo, pVbInfo); + pDevice, pIn, pShaderInfo, pPipelineLayout, dynamicStateFlags, pCreateInfo, pVbInfo, pInternalBufferInfo); } return result; @@ -2238,11 +2469,12 @@ void PipelineCompiler::FreeComputePipelineCreateInfo( // ===================================================================================================================== // Free the temp memories in graphics pipeline create info void PipelineCompiler::FreeGraphicsPipelineCreateInfo( - GraphicsPipelineBinaryCreateInfo* pCreateInfo) + GraphicsPipelineBinaryCreateInfo* pCreateInfo, + bool keepConvertTempMemory) { auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); - if (pCreateInfo->pTempBuffer != nullptr) + if ((pCreateInfo->pTempBuffer != nullptr) && (keepConvertTempMemory == false)) { pInstance->FreeMem(pCreateInfo->pTempBuffer); pCreateInfo->pTempBuffer = nullptr; @@ -2393,15 +2625,13 @@ void PipelineCompiler::GetGraphicsPipelineCacheId( } // ===================================================================================================================== -VkResult PipelineCompiler::BuildUberFetchShaderInternalData( - PipelineCompilerType compilerType, - const VkPipelineVertexInputStateCreateInfo* pVertexInput, - bool isDynamicStride, - UberFetchShaderBufferInfo* pFetchShaderBufferInfo) +VkResult PipelineCompiler::BuildPipelineInternalBufferData( + GraphicsPipelineBinaryCreateInfo* pCreateInfo, + PipelineInternalBufferInfo* pInternalBufferInfo) { VkResult result = VK_SUCCESS; - if (compilerType == PipelineCompilerTypeLlpc) + if (pCreateInfo->compilerType == PipelineCompilerTypeLlpc) { VK_NOT_IMPLEMENTED; } @@ -2409,4 +2639,23 @@ VkResult PipelineCompiler::BuildUberFetchShaderInternalData( return result; } +// ===================================================================================================================== +void PipelineCompiler::ExecuteDeferCompile( + DeferredCompileWorkload* pWorkload) +{ + auto pThread = m_deferCompileMgr.GetCompileThread(); + if (pThread != nullptr) + { + pThread->AddTask(pWorkload); + } + else + { + pWorkload->Execute(pWorkload->pPayloads); + if (pWorkload->pEvent != nullptr) + { + pWorkload->pEvent->Set(); + } + } +} + } diff --git a/icd/api/strings/entry_points.txt b/icd/api/strings/entry_points.txt index 28b0ffd7..a03242e5 100644 --- a/icd/api/strings/entry_points.txt +++ b/icd/api/strings/entry_points.txt @@ -362,6 +362,8 @@ vkGetPipelineExecutableInternalRepresentationsKHR @device @dext(KHR_pipeli vkCmdSetLineStippleEXT @device @dext(EXT_line_rasterization) 
+vkSetDeviceMemoryPriorityEXT @device @dext(EXT_pageable_device_local_memory) + vkCreatePrivateDataSlotEXT @device @dext(EXT_private_data) vkDestroyPrivateDataSlotEXT @device @dext(EXT_private_data) vkSetPrivateDataEXT @device @dext(EXT_private_data) diff --git a/icd/api/strings/extensions.txt b/icd/api/strings/extensions.txt index d14325c3..4f2afb95 100644 --- a/icd/api/strings/extensions.txt +++ b/icd/api/strings/extensions.txt @@ -100,6 +100,7 @@ VK_EXT_inline_uniform_block VK_EXT_transform_feedback VK_KHR_shader_float16_int8 VK_EXT_memory_priority +VK_EXT_pageable_device_local_memory VK_EXT_memory_budget VK_KHR_depth_stencil_resolve VK_EXT_host_query_reset @@ -115,6 +116,7 @@ VK_EXT_line_rasterization VK_EXT_shader_atomic_float VK_EXT_shader_atomic_float2 VK_KHR_shader_clock +VK_KHR_shader_integer_dot_product VK_KHR_shader_subgroup_extended_types VK_KHR_spirv_1_4 VK_EXT_texel_buffer_alignment @@ -138,6 +140,7 @@ VK_EXT_4444_formats VK_EXT_color_write_enable VK_KHR_shader_terminate_invocation VK_KHR_synchronization2 +VK_EXT_primitive_topology_list_restart VK_EXT_extended_dynamic_state2 VK_KHR_copy_commands2 VK_EXT_ycbcr_image_arrays diff --git a/icd/api/vk_buffer.cpp b/icd/api/vk_buffer.cpp index b556784a..8dc364f5 100644 --- a/icd/api/vk_buffer.cpp +++ b/icd/api/vk_buffer.cpp @@ -43,7 +43,6 @@ namespace vk // ===================================================================================================================== Buffer::Buffer( Device* pDevice, - const VkAllocationCallbacks* pAllocator, const VkBufferCreateInfo* pCreateInfo, Pal::IGpuMemory** pGpuMemory, BufferFlags internalFlags) @@ -72,7 +71,6 @@ Buffer::Buffer( m_perGpu[deviceIdx].gpuVirtAddr = 0; } } - } // ===================================================================================================================== @@ -83,16 +81,15 @@ VkResult Buffer::Create( const VkAllocationCallbacks* pAllocator, VkBuffer* pBuffer) { - void* pMemory = nullptr; - Pal::IGpuMemory* pGpuMemory[MaxPalDevices] = {}; - - Pal::Result palResult = Pal::Result::Success; + Pal::IGpuMemory* pGpuMemory[MaxPalDevices] = {}; + Pal::GpuMemoryCreateInfo gpuMemoryCreateInfo = {}; - size_t apiSize = ObjectSize(pDevice); + VkResult result = VK_SUCCESS; + size_t apiSize = ObjectSize(pDevice); + size_t palMemSize = 0; + bool isSparse = (pCreateInfo->flags & SparseEnablingFlags) != 0; - bool isSparse = (pCreateInfo->flags & SparseEnablingFlags) != 0; - - if (isSparse && (pCreateInfo->size != 0)) + if (isSparse) { // We need virtual remapping support for all sparse resources VK_ASSERT(pDevice->VkPhysicalDevice(DefaultDeviceIndex)->IsVirtualRemappingSupported()); @@ -103,72 +100,68 @@ VkResult Buffer::Create( VK_ASSERT(pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetPrtFeatures() & Pal::PrtFeatureBuffer); } - size_t palMemSize; - Pal::GpuMemoryCreateInfo info = { }; - - info.alignment = pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties(). 
- gpuMemoryProperties.virtualMemAllocGranularity; - info.size = Util::Pow2Align(pCreateInfo->size, info.alignment); - info.flags.u32All = 0; - info.flags.virtualAlloc = 1; - info.flags.globalGpuVa = pDevice->IsGlobalGpuVaEnabled(); - info.heapAccess = Pal::GpuHeapAccess::GpuHeapAccessExplicit; + gpuMemoryCreateInfo.alignment = pDevice->GetProperties().virtualMemAllocGranularity; + gpuMemoryCreateInfo.size = Util::Pow2Align(pCreateInfo->size, gpuMemoryCreateInfo.alignment); + gpuMemoryCreateInfo.flags.virtualAlloc = 1; + gpuMemoryCreateInfo.flags.globalGpuVa = pDevice->IsGlobalGpuVaEnabled(); + gpuMemoryCreateInfo.heapAccess = Pal::GpuHeapAccess::GpuHeapAccessExplicit; // Virtual resource should return 0 on unmapped read if residencyNonResidentStrict is set. if (pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetPrtFeatures() & Pal::PrtFeatureStrictNull) { - info.virtualAccessMode = Pal::VirtualGpuMemAccessMode::ReadZero; + gpuMemoryCreateInfo.virtualAccessMode = Pal::VirtualGpuMemAccessMode::ReadZero; } - palMemSize = pDevice->PalDevice(DefaultDeviceIndex)->GetGpuMemorySize(info, &palResult); + Pal::Result palResult; + + palMemSize = pDevice->PalDevice(DefaultDeviceIndex)->GetGpuMemorySize(gpuMemoryCreateInfo, &palResult); VK_ASSERT(palResult == Pal::Result::Success); + } - // Allocate enough system memory to also store the VA-only memory object - pMemory = pDevice->AllocApiObject( + // Allocate memory for the dispatchable object and for sparse buffers, the VA-only memory object + void* pMemory = pDevice->AllocApiObject( pAllocator, apiSize + (palMemSize * pDevice->NumPalDevices())); - if (pMemory == nullptr) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - void* pPalMemory = Util::VoidPtrInc(pMemory, apiSize); + if (pMemory == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + else if (isSparse) + { + void* pPalMemory = Util::VoidPtrInc(pMemory, apiSize); + Pal::Result palResult = Pal::Result::Success; for (uint32_t deviceIdx = 0; (deviceIdx < pDevice->NumPalDevices()) && (palResult == Pal::Result::Success); deviceIdx++) { + if (deviceIdx != DefaultDeviceIndex) + { + VK_ASSERT(palMemSize == pDevice->PalDevice(deviceIdx)->GetGpuMemorySize(gpuMemoryCreateInfo, + &palResult)); + VK_ASSERT(palResult == Pal::Result::Success); + } + // Create the VA-only memory object needed for sparse buffer support palResult = pDevice->PalDevice(deviceIdx)->CreateGpuMemory( - info, - (uint8_t*)pPalMemory, + gpuMemoryCreateInfo, + pPalMemory, &pGpuMemory[deviceIdx]); pPalMemory = Util::VoidPtrInc(pPalMemory, palMemSize); } - } - else - { - // Allocate memory only for the dispatchable object - pMemory = pDevice->AllocApiObject( - pAllocator, - apiSize); - if (pMemory == nullptr) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } + result = PalToVkResult(palResult); } - if (palResult == Pal::Result::Success) + if (result == VK_SUCCESS) { BufferFlags bufferFlags; CalculateBufferFlags(pDevice, pCreateInfo, &bufferFlags); // Construct API buffer object. 
VK_PLACEMENT_NEW (pMemory) Buffer (pDevice, - pAllocator, pCreateInfo, pGpuMemory, bufferFlags);
@@ -178,7 +171,7 @@ VkResult Buffer::Create( LogBufferCreate(pCreateInfo, *pBuffer, pDevice); } - return PalToVkResult(palResult); + return result; } // =====================================================================================================================
@@ -289,7 +282,6 @@ VkResult Buffer::Destroy( Device* pDevice, const VkAllocationCallbacks* pAllocator) { - Pal::ResourceDestroyEventData data = {}; data.pObj = this;
@@ -378,20 +370,6 @@ void Buffer::GetMemoryRequirements( GetBufferMemoryRequirements(pDevice, &m_internalFlags, m_size, pMemoryRequirements); } -// ===================================================================================================================== -// Get the buffer's memory requirements from VkBufferCreateInfo -void Buffer::CalculateMemoryRequirements( - const Device* pDevice, - const VkBufferCreateInfo* pCreateInfo, - VkMemoryRequirements* pMemoryRequirements) -{ - BufferFlags bufferFlags; - - CalculateBufferFlags(pDevice, pCreateInfo, &bufferFlags); - - GetBufferMemoryRequirements(pDevice, &bufferFlags, pCreateInfo->size, pMemoryRequirements); -} - // ===================================================================================================================== // Get the buffer's memory requirements void Buffer::GetBufferMemoryRequirements(
@@ -569,36 +547,19 @@ VKAPI_ATTR void VKAPI_CALL vkGetBufferMemoryRequirements2( VkMemoryRequirements2* pMemoryRequirements) { const Device* pDevice = ApiDevice::ObjectFromHandle(device); - VK_ASSERT((pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || - pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); - union - { - const VkStructHeader* pHeader; - const VkBufferMemoryRequirementsInfo2* pRequirementsInfo2; - }; - - pRequirementsInfo2 = pInfo; - pHeader = utils::GetExtensionStructure(pHeader, VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2); - VK_ASSERT(pHeader != nullptr); - if (pHeader != nullptr) - { - Buffer* pBuffer = Buffer::ObjectFromHandle(pRequirementsInfo2->buffer); - VkMemoryRequirements* pRequirements = &pMemoryRequirements->memoryRequirements; - pBuffer->GetMemoryRequirements(pDevice, pRequirements); + Buffer* pBuffer = Buffer::ObjectFromHandle(pInfo->buffer); + VkMemoryRequirements* pRequirements = &pMemoryRequirements->memoryRequirements; + pBuffer->GetMemoryRequirements(pDevice, pRequirements); - if (pMemoryRequirements->sType == VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2) - { - VkMemoryDedicatedRequirements* pMemDedicatedRequirements = - static_cast<VkMemoryDedicatedRequirements*>(pMemoryRequirements->pNext); + VkMemoryDedicatedRequirements* pMemDedicatedRequirements = + static_cast<VkMemoryDedicatedRequirements*>(pMemoryRequirements->pNext); - if ((pMemDedicatedRequirements != nullptr) && - (pMemDedicatedRequirements->sType == VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS)) - { - pMemDedicatedRequirements->prefersDedicatedAllocation = pBuffer->DedicatedMemoryRequired(); - pMemDedicatedRequirements->requiresDedicatedAllocation = pBuffer->DedicatedMemoryRequired(); - } - } + if ((pMemDedicatedRequirements != nullptr) && + (pMemDedicatedRequirements->sType == VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS)) + { + pMemDedicatedRequirements->prefersDedicatedAllocation = pBuffer->DedicatedMemoryRequired(); + pMemDedicatedRequirements->requiresDedicatedAllocation = pBuffer->DedicatedMemoryRequired(); } }
diff --git a/icd/api/vk_cmdbuffer.cpp
index 4cd01858..94ae308f 100644 --- a/icd/api/vk_cmdbuffer.cpp +++ b/icd/api/vk_cmdbuffer.cpp
@@ -5363,6 +5363,8 @@ void CmdBuffer::SetSampleLocations( ConvertToPalMsaaQuadSamplePattern(pSampleLocationsInfo, &locations); PalCmdSetMsaaQuadSamplePattern(sampleLocationsPerPixel, locations); + + m_allGpuState.staticTokens.samplePattern = DynamicRenderStateToken; } // =====================================================================================================================
@@ -5520,11 +5522,10 @@ void CmdBuffer::BeginRenderPass( sizeof(SamplePattern) * subpassCount, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)); - memset(m_renderPassInstance.pSamplePatterns, 0, subpassCount * sizeof(SamplePattern)); - if (m_renderPassInstance.pSamplePatterns != nullptr) { m_renderPassInstance.maxSubpassCount = subpassCount; + memset(m_renderPassInstance.pSamplePatterns, 0, subpassCount * sizeof(SamplePattern)); } else {
@@ -5924,7 +5925,7 @@ void CmdBuffer::RPSyncPoint( const uint32_t sampleCount = attachment.pImage->GetImageSamples(); - if (sampleCount > 1) + if (sampleCount > 0) { if (attachment.pImage->IsSampleLocationsCompatibleDepth() && tr.flags.isInitialLayoutTransition)
@@ -5958,6 +5959,27 @@ void CmdBuffer::RPSyncPoint( m_recordingResult = VK_ERROR_OUT_OF_HOST_MEMORY; } + // Construct a dumb transition to sync cache + const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); + if (settings.enableDumbTransitionSync && (barrier.transitionCount == 0) && (rpBarrier.flags.needsGlobalTransition)) + { + if (pPalTransitions == nullptr) + { + pPalTransitions = pVirtStack->AllocArray<Pal::BarrierTransition>(1); + } + + if (pPalTransitions != nullptr) + { + Pal::BarrierTransition *pDumbTransition = &pPalTransitions[0]; + pDumbTransition->srcCacheMask = 0; + pDumbTransition->dstCacheMask = 0; + pDumbTransition->imageInfo.pImage = nullptr; + + barrier.transitionCount = 1; + barrier.pTransitions = pDumbTransition; + } + } + // Execute the barrier if it actually did anything if ((barrier.waitPoint != Pal::HwPipeBottom) || (barrier.transitionCount > 0) ||
@@ -6531,8 +6553,8 @@ void CmdBuffer::WritePushConstants( // pipeline layout (e.g. at the top of the command buffer) and this register write will be redundant because // a future vkCmdBindPipeline will reprogram the user data registers during the rebase.
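The guard that follows implements this fast path: the register write is issued immediately only when the same pipeline binding owns the registers, the push-constant base matches, and the bound layout's register count covers the full updated range. A minimal sketch of the predicate, with a hypothetical UserDataLayout mirroring just the two fields consulted here:

    #include <cstdint>

    struct UserDataLayout   // hypothetical: only the two fields this check uses
    {
        uint32_t pushConstRegBase;
        uint32_t pushConstRegCount;
    };

    // Issue the user-data write now only if the bound layout fully covers
    // [startInDwords, startInDwords + lengthInDwords).
    static bool CanWritePushConstantsNow(
        const UserDataLayout& bound,
        const UserDataLayout& incoming,
        uint32_t              startInDwords,
        uint32_t              lengthInDwords)
    {
        return (bound.pushConstRegBase  == incoming.pushConstRegBase) &&
               (bound.pushConstRegCount >= (startInDwords + lengthInDwords));
    }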
if (PalPipelineBindingOwnedBy(palBindPoint, apiBindPoint) && - pBindState->userDataLayout.pushConstRegBase == userDataLayout.pushConstRegBase && - pBindState->userDataLayout.pushConstRegCount >= startInDwords + lengthInDwords) + (pBindState->userDataLayout.pushConstRegBase == userDataLayout.pushConstRegBase) && + (pBindState->userDataLayout.pushConstRegCount >= (startInDwords + lengthInDwords))) { utils::IterateMask deviceGroup(m_curDeviceMask); do
@@ -7151,24 +7173,6 @@ void CmdBuffer::DrawIndirectByteCount( while (deviceGroup.IterateNext()); } -// ===================================================================================================================== -void CmdBuffer::SetLineStippleEXT( - const Pal::LineStippleStateParams& params, - uint32_t staticToken) -{ - m_allGpuState.lineStipple = params; - - utils::IterateMask deviceGroup(m_cbBeginDeviceMask); - do - { - const uint32_t deviceIdx = deviceGroup.Index(); - PalCmdBuffer(deviceIdx)->CmdSetLineStippleState(m_allGpuState.lineStipple); - } - while (deviceGroup.IterateNext()); - - m_allGpuState.staticTokens.lineStippleState = staticToken; -} - // ===================================================================================================================== void CmdBuffer::SetLineStippleEXT( uint32_t lineStippleFactor,
diff --git a/icd/api/vk_conv.cpp b/icd/api/vk_conv.cpp index 95a59797..01a4e8f0 100644 --- a/icd/api/vk_conv.cpp +++ b/icd/api/vk_conv.cpp
@@ -1056,6 +1056,8 @@ uint32_t GetBufferSrdFormatInfo( } else { + VK_ASSERT(pPhysicalDevice->PalProperties().gfxipProperties.srdSizes.bufferView == 4 * sizeof(uint32_t)); + uint32_t result[4] = {}; Pal::BufferViewInfo bufferInfo = {}; bufferInfo.gpuAddr = 0x300000000ull;
@@ -1063,20 +1065,23 @@ uint32_t GetBufferSrdFormatInfo( bufferInfo.range = UINT32_MAX; bufferInfo.stride = Pal::Formats::BytesPerPixel(swizzledFormat.format); pPhysicalDevice->PalDevice()->CreateTypedBufferViewSrds(1, &bufferInfo, result); + + // NOTE: Currently, all buffer format info is stored in the fourth DWORD of the buffer SRD. Please update + // both BilVertexFetchManager::IssueUberFetchInst and UberFetchShaderFormatInfo if this ever changes. return result[3]; } } -#define INIT_UBER_FORMATINFO(vkFmt, palFmt, unpckedPalFmt, packed, fixed, compCnt, compSize, align) { \ +#define INIT_UBER_FORMATINFO(vkFmt, palFmt, unpackedPalFmt, packed, fixed, compCnt, compSize, align) { \ UberFetchShaderFormatInfo fmtInfo = {}; \ fmtInfo.swizzledFormat = palFmt; \ - fmtInfo.unpackedFormat = unpckedPalFmt; \ + fmtInfo.unpackedFormat = unpackedPalFmt; \ fmtInfo.isPacked = packed; \ fmtInfo.isFixed = fixed; \ fmtInfo.componentCount = compCnt; \ fmtInfo.componentSize = compSize; \ fmtInfo.bufferFormat = GetBufferSrdFormatInfo(pPhysicalDevice, palFmt); \ - fmtInfo.unpackedBufferFormat = GetBufferSrdFormatInfo(pPhysicalDevice, unpckedPalFmt); \ + fmtInfo.unpackedBufferFormat = GetBufferSrdFormatInfo(pPhysicalDevice, unpackedPalFmt); \ fmtInfo.alignment = align; \ pFormatInfoMap->Insert(VK_FORMAT_##vkFmt, fmtInfo); }
@@ -1290,6 +1295,29 @@ VkResult InitializeUberFetchShaderFormatTable( INIT_UBER_FORMATINFO(UNDEFINED, PalFmt_Undefined, PalFmt_Undefined, 0, 0, 0, 0, 4); + // The OOB flag is in buffer dword3 on Gfx10+, and its value differs when the stride is 0. + // To avoid accessing the exact bit in the buffer SRD, we create the untyped buffer view twice with + // different strides and record the modified bits.
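The code below realizes this by building the same untyped buffer view twice and XOR-ing dword3 of the two SRDs; a self-contained sketch of the trick (illustrative helper names, not PAL APIs):

    #include <cstdint>

    // Bits of SRD dword3 that differ between the stride != 0 and stride == 0 encodings.
    static uint32_t StrideDependentBits(uint32_t srdDword3WithStride, uint32_t srdDword3ZeroStride)
    {
        return srdDword3WithStride ^ srdDword3ZeroStride;
    }

    // Toggle exactly those bits to turn a cached buffer-format word into its zero-stride variant.
    static uint32_t ApplyZeroStride(uint32_t bufferFormat, uint32_t strideMask)
    {
        return bufferFormat ^ strideMask;
    }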
+ + VK_ASSERT(pPhysicalDevice->PalProperties().gfxipProperties.srdSizes.bufferView == 4 * sizeof(uint32_t)); + + uint32_t defaultSrd[4] = {}; + uint32_t zeroStrideSrd[4] = {}; + Pal::BufferViewInfo bufferInfo = {}; + bufferInfo.gpuAddr = 0x300000000ull; + bufferInfo.swizzledFormat = PalFmt_RGBA(32, 32, 32, 32, Float); + bufferInfo.range = UINT32_MAX; + + // Build SRD with non-zero stride + bufferInfo.stride = 16; + pPhysicalDevice->PalDevice()->CreateUntypedBufferViewSrds(1, &bufferInfo, defaultSrd); + + // Build SRD with zero stride + bufferInfo.stride = 0; + pPhysicalDevice->PalDevice()->CreateUntypedBufferViewSrds(1, &bufferInfo, zeroStrideSrd); + + // Save the modified bits in buffer SRD + pFormatInfoMap->SetBufferFormatMask(defaultSrd[3] ^ zeroStrideSrd[3]); return VK_SUCCESS; } #undef INIT_UBER_FORMATINFO
@@ -1297,11 +1325,23 @@ VkResult InitializeUberFetchShaderFormatTable( // ===================================================================================================================== UberFetchShaderFormatInfo GetUberFetchShaderFormatInfo( UberFetchShaderFormatInfoMap* pFormatInfoMap, - VkFormat vkFormat) + VkFormat vkFormat, + bool isZeroStride) { - UberFetchShaderFormatInfo dummyInfo = {}; + UberFetchShaderFormatInfo formatInfo = {}; auto pFormatInfo = pFormatInfoMap->FindKey(vkFormat); - return (pFormatInfo == nullptr) ? dummyInfo : *pFormatInfo; + if (pFormatInfo != nullptr) + { + formatInfo = *pFormatInfo; + if (isZeroStride) + { + // Apply zero stride modified bits, which are calculated in UberFetchShaderFormatInfoMap initialization. + formatInfo.bufferFormat = formatInfo.bufferFormat ^ pFormatInfoMap->GetBufferFormatMask(); + formatInfo.unpackedBufferFormat = formatInfo.unpackedBufferFormat ^ pFormatInfoMap->GetBufferFormatMask(); + } + } + + return formatInfo; } } // namespace vk
diff --git a/icd/api/vk_descriptor_pool.cpp b/icd/api/vk_descriptor_pool.cpp index fa09fed9..30652c00 100644 --- a/icd/api/vk_descriptor_pool.cpp +++ b/icd/api/vk_descriptor_pool.cpp
@@ -347,6 +347,10 @@ VkResult DescriptorPool::AllocDescriptorSets( setGpuMemOffset, m_addresses, pSetAllocHandle); + if (m_pDevice->MustWriteImmutableSamplers()) + { + pSet->WriteImmutableSamplers(); + } } else {
@@ -446,10 +450,21 @@ VkResult DescriptorGpuMemHeap::Init( bool oneShot = (m_usage & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT) == 0; - for (uint32_t i = 0; i < count; ++i) + if (pDevice->GetRuntimeSettings().pipelineLayoutMode == PipelineLayoutAngle) + { + for (uint32_t i = 0; i < count; ++i) + { + m_gpuMemSize += AngleDescPattern::DescriptorSetBindingStride * sizeof(uint32_t) * + pTypeCount[i].descriptorCount; + } + } + else { - m_gpuMemSize += DescriptorSetLayout::GetSingleDescStaticSize(pDevice, pTypeCount[i].type) * - pTypeCount[i].descriptorCount; + for (uint32_t i = 0; i < count; ++i) + { + m_gpuMemSize += DescriptorSetLayout::GetSingleDescStaticSize(pDevice, pTypeCount[i].type) * + pTypeCount[i].descriptorCount; + } } m_gpuMemAddrAlignment = pDevice->GetProperties().descriptorSizes.alignment;
diff --git a/icd/api/vk_descriptor_set.cpp b/icd/api/vk_descriptor_set.cpp index 99d6486a..81de6a4c 100644 --- a/icd/api/vk_descriptor_set.cpp +++ b/icd/api/vk_descriptor_set.cpp
@@ -89,6 +89,27 @@ void DescriptorSet::Reassign( } +// ===================================================================================================================== +// Writes the immutable samplers in the layout to memory.
+template <uint32_t numPalDevices> +void DescriptorSet<numPalDevices>::WriteImmutableSamplers() +{ + for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) + { + for (uint32_t bindingIndex = 0; bindingIndex < Layout()->Info().count; ++bindingIndex) + { + const DescriptorSetLayout::BindingInfo& bindingInfo = Layout()->Binding(bindingIndex); + + if (bindingInfo.imm.dwSize != 0) + { + uint32_t* pSamplerDesc = Layout()->Info().imm.pImmutableSamplerData + bindingInfo.imm.dwOffset; + uint32_t* pDestAddr = StaticCpuAddress(deviceIdx) + Layout()->GetDstStaOffset(bindingInfo, 0); + memcpy(pDestAddr, pSamplerDesc, sizeof(uint32)*bindingInfo.imm.dwSize); + } + } + } +} + // ===================================================================================================================== // Resets a DescriptorSet to an initial state template <uint32_t numPalDevices>
@@ -1048,6 +1069,9 @@ void DescriptorSet<1>::Reassign( template void DescriptorSet<1>::Reset(); +template +void DescriptorSet<1>::WriteImmutableSamplers(); + template DescriptorSet<2>::DescriptorSet(uint32_t heapIndex);
@@ -1061,6 +1085,9 @@ void DescriptorSet<2>::Reassign( template void DescriptorSet<2>::Reset(); +template +void DescriptorSet<2>::WriteImmutableSamplers(); + template DescriptorSet<3>::DescriptorSet(uint32_t heapIndex);
@@ -1074,6 +1101,9 @@ void DescriptorSet<3>::Reassign( template void DescriptorSet<3>::Reset(); +template +void DescriptorSet<3>::WriteImmutableSamplers(); + template DescriptorSet<4>::DescriptorSet(uint32_t heapIndex);
@@ -1087,4 +1117,7 @@ void DescriptorSet<4>::Reassign( template void DescriptorSet<4>::Reset(); +template +void DescriptorSet<4>::WriteImmutableSamplers(); + } // namespace vk
diff --git a/icd/api/vk_descriptor_set_layout.cpp b/icd/api/vk_descriptor_set_layout.cpp index ddf32725..b0f75dbf 100644 --- a/icd/api/vk_descriptor_set_layout.cpp +++ b/icd/api/vk_descriptor_set_layout.cpp
@@ -308,7 +308,7 @@ void DescriptorSetLayout::ConvertBindingInfo( { // Dword offset to this binding - pBindingSectionInfo->dwOffset = Util::Pow2Align(pSectionInfo->dwSize, descAlignmentInDw); + pBindingSectionInfo->dwOffset = Util::RoundUpToMultiple(pSectionInfo->dwSize, descAlignmentInDw); if (pBindingInfo->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
@@ -332,7 +332,7 @@ void DescriptorSetLayout::ConvertBindingInfo( if (pBindingSectionInfo->dwSize > 0) { // Update total section size by how much space this binding takes. - pSectionInfo->dwSize += pBindingSectionInfo->dwSize; + pSectionInfo->dwSize = pBindingSectionInfo->dwOffset + pBindingSectionInfo->dwSize; // Update total number of ResourceMappingNodes required by this binding. pSectionInfo->numRsrcMapNodes++;
@@ -530,7 +535,12 @@ VkResult DescriptorSetLayout::ConvertCreateInfo( // Determine the alignment requirement of descriptors in dwords.
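One note before the alignment computation that follows: the ConvertBindingInfo hunk above swaps Util::Pow2Align for Util::RoundUpToMultiple, presumably because the Angle binding stride introduced below is only asserted to be a multiple of the hardware alignment, not a power of two. A quick standalone illustration (a local re-implementation for demonstration, not the PAL utility itself):

    #include <cassert>
    #include <cstdint>

    // Round value up to the next multiple of 'multiple'; valid for any non-zero multiple,
    // whereas a Pow2Align-style mask trick is only correct for power-of-two alignments.
    static uint32_t RoundUpToMultiple(uint32_t value, uint32_t multiple)
    {
        return ((value + multiple - 1) / multiple) * multiple;
    }

    int main()
    {
        assert(RoundUpToMultiple(17, 12) == 24); // non-power-of-two stride handled correctly
        assert(RoundUpToMultiple(17, 16) == 32); // matches Pow2Align in the power-of-two case
        return 0;
    }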
uint32 descAlignmentInDw = pDevice->GetProperties().descriptorSizes.alignment / sizeof(uint32); - + uint32_t staDescAlignmentInDw = descAlignmentInDw; + if (pDevice->GetRuntimeSettings().pipelineLayoutMode == PipelineLayoutAngle) + { + VK_ASSERT(AngleDescPattern::DescriptorSetBindingStride % descAlignmentInDw == 0); + staDescAlignmentInDw = AngleDescPattern::DescriptorSetBindingStride; + } // If the last binding has the VARIABLE_DESCRIPTOR_COUNT_BIT set, write the varDescDwStride if ((bindingNumber == (pOut->count - 1)) && pBinding->bindingFlags.variableDescriptorCount) { @@ -541,7 +546,7 @@ VkResult DescriptorSetLayout::ConvertCreateInfo( ConvertBindingInfo( &pBinding->info, GetDescStaticSectionDwSize(pDevice, &pBinding->info, pBinding->bindingFlags), - descAlignmentInDw, + staDescAlignmentInDw, &pOut->sta, &pBinding->sta); diff --git a/icd/api/vk_device.cpp b/icd/api/vk_device.cpp index 3a26e103..5141fa1a 100644 --- a/icd/api/vk_device.cpp +++ b/icd/api/vk_device.cpp @@ -71,8 +71,6 @@ #include "sqtt/sqtt_mgr.h" #include "sqtt/sqtt_rgp_annotations.h" -#include "appopt/async_layer.h" - #if VKI_GPU_DECOMPRESS #include "appopt/gpu_decode_layer.h" #endif @@ -256,7 +254,6 @@ Device::Device( m_enabledExtensions(enabledExtensions), m_dispatchTable(DispatchTable::Type::DEVICE, m_pInstance, this), m_pSqttMgr(nullptr), - m_pAsyncLayer(nullptr), m_pAppOptLayer(nullptr), m_pBarrierFilterLayer(nullptr), #if VKI_GPU_DECOMPRESS @@ -264,6 +261,7 @@ Device::Device( #endif m_allocationSizeTracking(m_settings.memoryDeviceOverallocationAllowed ? false : true), m_useComputeAsTransferQueue(useComputeAsTransferQueue), + m_useUniversalAsComputeQueue(pPhysicalDevices[DefaultDeviceIndex]->GetRuntimeSettings().useUniversalAsComputeQueue), m_useGlobalGpuVa(false) , m_pBorderColorUsedIndexes(nullptr) { @@ -312,6 +310,15 @@ Device::Device( m_enabledFeatures.robustBufferAccess = false; } + if (RuntimeSettings().enableRelocatableShaders) + { + m_enabledFeatures.mustWriteImmutableSamplers = true; + } + else + { + m_enabledFeatures.mustWriteImmutableSamplers = false; + } + m_enabledFeatures.scalarBlockLayout = false; m_enabledFeatures.attachmentFragmentShadingRate = false; @@ -356,6 +363,11 @@ static void ConstructQueueCreateInfo( { palQueueType = Pal::QueueType::QueueTypeCompute; } + else if ((palQueueType == Pal::QueueType::QueueTypeCompute) && + (pPhysicalDevices[deviceIdx]->GetRuntimeSettings().useUniversalAsComputeQueue)) + { + palQueueType = Pal::QueueType::QueueTypeUniversal; + } pQueueCreateInfo->tmzOnly = isTmzQueue; @@ -384,8 +396,14 @@ static void ConstructQueueCreateInfo( } else { - pQueueCreateInfo->engineType = - pPhysicalDevices[deviceIdx]->GetQueueFamilyPalEngineType(queueFamilyIndex); + if (palQueueType == Pal::QueueType::QueueTypeUniversal) + { + pQueueCreateInfo->engineType = Pal::EngineType::EngineTypeUniversal; + } + else + { + pQueueCreateInfo->engineType = pPhysicalDevices[deviceIdx]->GetQueueFamilyPalEngineType(queueFamilyIndex); + } if (palQueueType == Pal::QueueType::QueueTypeUniversal) { @@ -467,6 +485,7 @@ VkResult Device::Create( bool scalarBlockLayoutEnabled = false; ExtendedRobustness extendedRobustnessEnabled = { false, false, false }; bool attachmentFragmentShadingRate = false; + bool pageableDeviceLocalMemory = false; uint32 privateDataSlotRequestCount = 0; bool privateDataEnabled = false; @@ -516,7 +535,8 @@ VkResult Device::Create( { VK_ASSERT(pCreateInfo->pEnabledFeatures == nullptr); - // If present, VkPhysicalDeviceFeatures2 controls which features are enabled instead of 
pEnabledFeatures + // If present, VkPhysicalDeviceFeatures2 controls which features are enabled + // instead of pEnabledFeatures. pEnabledFeatures = &reinterpret_cast<const VkPhysicalDeviceFeatures2*>(pHeader)->features; break;
@@ -534,7 +554,19 @@ VkResult Device::Create( case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES: { - if (reinterpret_cast<const VkPhysicalDeviceBufferDeviceAddressFeatures*>(pHeader)->bufferDeviceAddressMultiDevice) + if (reinterpret_cast<const VkPhysicalDeviceBufferDeviceAddressFeatures*>( + pHeader)->bufferDeviceAddressMultiDevice) + { + bufferDeviceAddressMultiDeviceEnabled = true; + } + + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_EXT: + { + if (reinterpret_cast<const VkPhysicalDeviceBufferDeviceAddressFeaturesEXT*>( + pHeader)->bufferDeviceAddressMultiDevice) { bufferDeviceAddressMultiDeviceEnabled = true; }
@@ -562,8 +594,8 @@ VkResult Device::Create( { deviceCoherentMemoryEnabled = enabledDeviceExtensions.IsExtensionEnabled( - DeviceExtensions::AMD_DEVICE_COHERENT_MEMORY) && - reinterpret_cast<const VkPhysicalDeviceCoherentMemoryFeaturesAMD*>(pHeader)->deviceCoherentMemory; + DeviceExtensions::AMD_DEVICE_COHERENT_MEMORY) && + reinterpret_cast<const VkPhysicalDeviceCoherentMemoryFeaturesAMD*>(pHeader)->deviceCoherentMemory; break; }
@@ -582,7 +614,8 @@ VkResult Device::Create( case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_FEATURES_KHR: { - if (reinterpret_cast<const VkPhysicalDeviceFragmentShadingRateFeaturesKHR*>(pHeader)->attachmentFragmentShadingRate) + if (reinterpret_cast<const VkPhysicalDeviceFragmentShadingRateFeaturesKHR*>( + pHeader)->attachmentFragmentShadingRate) { attachmentFragmentShadingRate = true; }
@@ -612,7 +645,8 @@ VkResult Device::Create( case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIVATE_DATA_FEATURES_EXT: { - privateDataEnabled = reinterpret_cast<const VkPhysicalDevicePrivateDataFeaturesEXT*>(pHeader)->privateData; + privateDataEnabled = reinterpret_cast<const VkPhysicalDevicePrivateDataFeaturesEXT*>( + pHeader)->privateData; break; }
@@ -627,6 +661,17 @@ VkResult Device::Create( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PAGEABLE_DEVICE_LOCAL_MEMORY_FEATURES_EXT: + { + if (reinterpret_cast<const VkPhysicalDevicePageableDeviceLocalMemoryFeaturesEXT*>( + pHeader)->pageableDeviceLocalMemory) + { + pageableDeviceLocalMemory = true; + } + + break; + } + default: break; }
@@ -1052,7 +1097,8 @@ VkResult Device::Create( attachmentFragmentShadingRate, scalarBlockLayoutEnabled, extendedRobustnessEnabled, - bufferDeviceAddressMultiDeviceEnabled); + bufferDeviceAddressMultiDeviceEnabled, + pageableDeviceLocalMemory); // If we've failed to Initialize, make sure we destroy anything we might have allocated. if (vkResult != VK_SUCCESS)
@@ -1086,7 +1132,8 @@ VkResult Device::Initialize( const bool attachmentFragmentShadingRate, bool scalarBlockLayoutEnabled, const ExtendedRobustness& extendedRobustnessEnabled, - bool bufferDeviceAddressMultiDeviceEnabled) + bool bufferDeviceAddressMultiDeviceEnabled, + bool pageableDeviceLocalMemory) { // Initialize the internal memory manager VkResult result = m_internalMemMgr.Init();
@@ -1198,6 +1245,12 @@ VkResult Device::Initialize( m_enabledFeatures.robustImageAccessExtended = extendedRobustnessEnabled.robustImageAccess; m_enabledFeatures.nullDescriptorExtended = extendedRobustnessEnabled.nullDescriptor; + if (IsExtensionEnabled(DeviceExtensions::EXT_MEMORY_PRIORITY) || + (IsExtensionEnabled(DeviceExtensions::EXT_PAGEABLE_DEVICE_LOCAL_MEMORY) && pageableDeviceLocalMemory)) + { + m_enabledFeatures.appControlledMemPriority = true; + } + // If VkPhysicalDeviceBufferDeviceAddressFeaturesEXT.bufferDeviceAddressMultiDevice is enabled // and if globalGpuVaSupport is supported and if multiple devices are used set the global GpuVa.
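All of the feature blocks handled in the Device::Create switch above are discovered by walking the VkDeviceCreateInfo pNext chain before the reinterpret_cast to the concrete type; a minimal sketch of that lookup pattern, assuming only the standard Vulkan headers:

    #include <vulkan/vulkan.h>

    // Walk a pNext chain and return the first structure whose sType matches,
    // or nullptr if the chain does not carry it.
    static const VkBaseInStructure* FindFeatureStruct(const void* pNext, VkStructureType sType)
    {
        for (const VkBaseInStructure* pHeader = static_cast<const VkBaseInStructure*>(pNext);
             pHeader != nullptr;
             pHeader = pHeader->pNext)
        {
            if (pHeader->sType == sType)
            {
                return pHeader;
            }
        }
        return nullptr;
    }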
m_useGlobalGpuVa = (bufferDeviceAddressMultiDeviceEnabled && @@ -1331,20 +1384,6 @@ VkResult Device::Initialize( } #endif - if ((result == VK_SUCCESS) && m_settings.enableAsyncCompile) - { - void* pMemory = VkInstance()->AllocMem(sizeof(AsyncLayer), VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (pMemory != nullptr) - { - m_pAsyncLayer = VK_PLACEMENT_NEW(pMemory) AsyncLayer(this); - } - else - { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - } - } - const Pal::DeviceProperties& palProps = pPhysicalDevice->PalProperties(); if (result == VK_SUCCESS) @@ -1401,6 +1440,13 @@ VkResult Device::Initialize( break; } } + else if (enabled.IsExtensionEnabled(DeviceExtensions::ExtensionId::EXT_PAGEABLE_DEVICE_LOCAL_MEMORY) && + pageableDeviceLocalMemory) + { + // Add back-up heaps for device-local heaps + m_overallocationRequestedForPalHeap[Pal::GpuHeap::GpuHeapInvisible] = true; + m_overallocationRequestedForPalHeap[Pal::GpuHeap::GpuHeapLocal] = true; + } else if ((m_settings.overrideHeapChoiceToLocal != 0) && (palProps.gpuType == Pal::GpuType::Discrete)) { // This setting utilizes overallocation behavior's heap size tracking. Overallocation to the local @@ -1484,12 +1530,6 @@ void Device::InitDispatchTable() m_pBarrierFilterLayer->OverrideDispatchTable(&m_dispatchTable); } - // Install the async compile layer if needed - if (m_pAsyncLayer != nullptr) - { - m_pAsyncLayer->OverrideDispatchTable(&m_dispatchTable); - } - #if VKI_GPU_DECOMPRESS if (m_pGpuDecoderLayer != nullptr) { @@ -1715,13 +1755,6 @@ VkResult Device::Destroy(const VkAllocationCallbacks* pAllocator) VkInstance()->FreeMem(m_pAppOptLayer); } - if (m_pAsyncLayer != nullptr) - { - Util::Destructor(m_pAsyncLayer); - - VkInstance()->FreeMem(m_pAsyncLayer); - } - #if VKI_GPU_DECOMPRESS if (m_pGpuDecoderLayer != nullptr) { @@ -1809,6 +1842,11 @@ Pal::QueueType Device::GetQueueFamilyPalQueueType( palQueueType = Pal::QueueType::QueueTypeCompute; } + else if ((palQueueType == Pal::QueueType::QueueTypeCompute) && m_useUniversalAsComputeQueue) + { + palQueueType = Pal::QueueType::QueueTypeUniversal; + } + return palQueueType; } @@ -1823,6 +1861,11 @@ Pal::EngineType Device::GetQueueFamilyPalEngineType( palEngineType = Pal::EngineType::EngineTypeCompute; } + else if ((palEngineType == Pal::EngineType::EngineTypeCompute) && m_useUniversalAsComputeQueue) + { + palEngineType = Pal::EngineType::EngineTypeUniversal; + } + return palEngineType; } @@ -4187,6 +4230,16 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetMemoryHostPointerPropertiesEXT( return result; } +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkSetDeviceMemoryPriorityEXT( + VkDevice device, + VkDeviceMemory memory, + float priority) +{ + Memory* pMemory = Memory::ObjectFromHandle(memory); + pMemory->SetPriority(MemoryPriority::FromVkMemoryPriority(priority), false); +} + } // entry } // vk diff --git a/icd/api/vk_dispatch.cpp b/icd/api/vk_dispatch.cpp index b04411a7..56cbf30d 100644 --- a/icd/api/vk_dispatch.cpp +++ b/icd/api/vk_dispatch.cpp @@ -591,6 +591,8 @@ void DispatchTable::Init() vkResetQueryPool ); INIT_DISPATCH_ENTRY(vkCmdSetLineStippleEXT ); + INIT_DISPATCH_ENTRY(vkSetDeviceMemoryPriorityEXT ); + INIT_DISPATCH_ENTRY(vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ); INIT_DISPATCH_ENTRY(vkGetCalibratedTimestampsEXT ); diff --git a/icd/api/vk_graphics_pipeline.cpp b/icd/api/vk_graphics_pipeline.cpp index 3051e680..253ceb86 100644 --- a/icd/api/vk_graphics_pipeline.cpp +++ 
b/icd/api/vk_graphics_pipeline.cpp @@ -92,7 +92,7 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( for (uint32_t i = 0; (result == VK_SUCCESS) && (i < numPalDevices) ; ++i) { - if (i == DefaultDeviceIndex) + if ((i == DefaultDeviceIndex) || (pCreateInfo == nullptr)) { result = pDevice->GetCompiler(i)->CreateGraphicsPipelineBinary( pDevice, @@ -106,9 +106,10 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( else { GraphicsPipelineBinaryCreateInfo binaryCreateInfoMGPU = {}; - VbInfo vbInfoMGPU = {}; + VbBindingInfo vbInfoMGPU = {}; + PipelineInternalBufferInfo internalBufferInfoMGPU = {}; pDefaultCompiler->ConvertGraphicsPipelineInfo( - pDevice, pCreateInfo, pShaderInfo, pPipelineLayout, &binaryCreateInfoMGPU, &vbInfoMGPU); + pDevice, pCreateInfo, pShaderInfo, pPipelineLayout, &binaryCreateInfoMGPU, &vbInfoMGPU, &internalBufferInfoMGPU); result = pDevice->GetCompiler(i)->CreateGraphicsPipelineBinary( pDevice, @@ -129,13 +130,99 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( binaryCreateInfoMGPU.stageFeedback); } - pDefaultCompiler->FreeGraphicsPipelineCreateInfo(&binaryCreateInfoMGPU); + pDefaultCompiler->FreeGraphicsPipelineCreateInfo(&binaryCreateInfoMGPU, false); } } return result; } +// ===================================================================================================================== +// Creates graphics PAL pipeline objects +VkResult GraphicsPipeline::CreatePalPipelineObjects( + Device* pDevice, + PipelineCache* pPipelineCache, + GraphicsPipelineObjectCreateInfo* pObjectCreateInfo, + const size_t* pPipelineBinarySizes, + const void** pPipelineBinaries, + const Util::MetroHash::Hash* pCacheIds, + void* pSystemMem, + Pal::IPipeline** pPalPipeline) +{ + size_t palSize = 0; + + pObjectCreateInfo->pipeline.pipelineBinarySize = pPipelineBinarySizes[DefaultDeviceIndex]; + pObjectCreateInfo->pipeline.pPipelineBinary = pPipelineBinaries[DefaultDeviceIndex]; + + Pal::Result palResult = Pal::Result::Success; + palSize = + pDevice->PalDevice(DefaultDeviceIndex)->GetGraphicsPipelineSize(pObjectCreateInfo->pipeline, &palResult); + VK_ASSERT(palResult == Pal::Result::Success); + + RenderStateCache* pRSCache = pDevice->GetRenderStateCache(); + const uint32_t numPalDevices = pDevice->NumPalDevices(); + size_t palOffset = 0; + + for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) + { + Pal::IDevice* pPalDevice = pDevice->PalDevice(deviceIdx); + + if (palResult == Pal::Result::Success) + { + // If pPipelineBinaries[DefaultDeviceIndex] is sufficient for all devices, the other pipeline binaries + // won't be created. Otherwise, like if gl_DeviceIndex is used, they will be. + if (pPipelineBinaries[deviceIdx] != nullptr) + { + pObjectCreateInfo->pipeline.pipelineBinarySize = pPipelineBinarySizes[deviceIdx]; + pObjectCreateInfo->pipeline.pPipelineBinary = pPipelineBinaries[deviceIdx]; + } + + palResult = pPalDevice->CreateGraphicsPipeline( + pObjectCreateInfo->pipeline, + Util::VoidPtrInc(pSystemMem, palOffset), + &pPalPipeline[deviceIdx]); + +#if ICD_GPUOPEN_DEVMODE_BUILD + // Temporarily reinject post Pal pipeline creation (when the internal pipeline hash is available). + // The reinjection cache layer can be linked back into the pipeline cache chain once the + // Vulkan pipeline cache key can be stored (and read back) inside the ELF as metadata. 
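For the devmode-gated block that follows, the control flow is: create the pipeline, use its internal hash to look up a replacement binary, and only on a hit destroy and recreate; a NotFound lookup keeps the original. A toy model of that sequence (generic callables stand in for the PAL and compiler entry points):

    #include <cstdint>

    enum class Result { Success, NotFound, ErrorUnknown };

    // 'create' builds the pipeline, 'destroy' tears it down, and 'lookup' searches the
    // reinjection cache by internal pipeline hash; a hit triggers a rebuild.
    template <typename CreateFn, typename DestroyFn, typename LookupFn>
    Result CreateWithReinjection(CreateFn create, DestroyFn destroy, LookupFn lookup, uint64_t internalHash)
    {
        Result result = create();

        if (result == Result::Success)
        {
            const Result found = lookup(internalHash);

            if (found == Result::Success)
            {
                destroy();         // replacement found: rebuild from the injected binary
                result = create();
            }
            else if (found != Result::NotFound)
            {
                result = found;    // propagate real errors; NotFound keeps the original
            }
        }

        return result;
    }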
+ if ((pDevice->VkInstance()->GetDevModeMgr() != nullptr) && + (palResult == Util::Result::Success)) + { + const auto& info = pPalPipeline[deviceIdx]->GetInfo(); + + palResult = pDevice->GetCompiler(deviceIdx)->RegisterAndLoadReinjectionBinary( + &info.internalPipelineHash, + &pCacheIds[deviceIdx], + &pObjectCreateInfo->pipeline.pipelineBinarySize, + &pObjectCreateInfo->pipeline.pPipelineBinary, + pPipelineCache); + + if (palResult == Util::Result::Success) + { + pPalPipeline[deviceIdx]->Destroy(); + + palResult = pPalDevice->CreateGraphicsPipeline( + pObjectCreateInfo->pipeline, + Util::VoidPtrInc(pSystemMem, palOffset), + &pPalPipeline[deviceIdx]); + } + else if (palResult == Util::Result::NotFound) + { + // If a replacement was not found, proceed with the original + palResult = Util::Result::Success; + } + } +#endif + + VK_ASSERT(palSize == pPalDevice->GetGraphicsPipelineSize(pObjectCreateInfo->pipeline, nullptr)); + palOffset += palSize; + } + } + + return PalToVkResult(palResult); +} + // ===================================================================================================================== // Create graphics pipeline objects VkResult GraphicsPipeline::CreatePipelineObjects( @@ -143,7 +230,8 @@ VkResult GraphicsPipeline::CreatePipelineObjects( const VkGraphicsPipelineCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, const PipelineLayout* pPipelineLayout, - const VbInfo* pVbInfo, + const VbBindingInfo* pVbInfo, + const PipelineInternalBufferInfo* pInternalBuffer, const size_t* pPipelineBinarySizes, const void** pPipelineBinaries, PipelineCache* pPipelineCache, @@ -170,16 +258,13 @@ VkResult GraphicsPipeline::CreatePipelineObjects( void* pSystemMem = nullptr; size_t palSize = 0; - pObjectCreateInfo->pipeline.pipelineBinarySize = pPipelineBinarySizes[DefaultDeviceIndex]; - pObjectCreateInfo->pipeline.pPipelineBinary = pPipelineBinaries[DefaultDeviceIndex]; - palSize = pDevice->PalDevice(DefaultDeviceIndex)->GetGraphicsPipelineSize(pObjectCreateInfo->pipeline, &palResult); VK_ASSERT(palResult == Pal::Result::Success); pSystemMem = pDevice->AllocApiObject( pAllocator, - sizeof(GraphicsPipeline) + (palSize * numPalDevices)); + sizeof(GraphicsPipeline) + (palSize * numPalDevices) + pInternalBuffer->dataSize); if (pSystemMem == nullptr) { @@ -190,64 +275,22 @@ VkResult GraphicsPipeline::CreatePipelineObjects( if (result == VK_SUCCESS) { - size_t palOffset = sizeof(GraphicsPipeline); + result = CreatePalPipelineObjects(pDevice, + pPipelineCache, + pObjectCreateInfo, + pPipelineBinarySizes, + pPipelineBinaries, + pCacheIds, + Util::VoidPtrInc(pSystemMem, sizeof(GraphicsPipeline)), + pPalPipeline); + } + if (result == VK_SUCCESS) + { for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) { Pal::IDevice* pPalDevice = pDevice->PalDevice(deviceIdx); - if (palResult == Pal::Result::Success) - { - // If pPipelineBinaries[DefaultDeviceIndex] is sufficient for all devices, the other pipeline binaries - // won't be created. Otherwise, like if gl_DeviceIndex is used, they will be. 
- if (pPipelineBinaries[deviceIdx] != nullptr) - { - pObjectCreateInfo->pipeline.pipelineBinarySize = pPipelineBinarySizes[deviceIdx]; - pObjectCreateInfo->pipeline.pPipelineBinary = pPipelineBinaries[deviceIdx]; - } - - palResult = pPalDevice->CreateGraphicsPipeline( - pObjectCreateInfo->pipeline, - Util::VoidPtrInc(pSystemMem, palOffset), - &pPalPipeline[deviceIdx]); - -#if ICD_GPUOPEN_DEVMODE_BUILD - // Temporarily reinject post Pal pipeline creation (when the internal pipeline hash is available). - // The reinjection cache layer can be linked back into the pipeline cache chain once the - // Vulkan pipeline cache key can be stored (and read back) inside the ELF as metadata. - if ((pDevice->VkInstance()->GetDevModeMgr() != nullptr) && - (palResult == Util::Result::Success)) - { - const auto& info = pPalPipeline[deviceIdx]->GetInfo(); - - palResult = pDevice->GetCompiler(deviceIdx)->RegisterAndLoadReinjectionBinary( - &info.internalPipelineHash, - &pCacheIds[deviceIdx], - &pObjectCreateInfo->pipeline.pipelineBinarySize, - &pObjectCreateInfo->pipeline.pPipelineBinary, - pPipelineCache); - - if (palResult == Util::Result::Success) - { - pPalPipeline[deviceIdx]->Destroy(); - - palResult = pPalDevice->CreateGraphicsPipeline( - pObjectCreateInfo->pipeline, - Util::VoidPtrInc(pSystemMem, palOffset), - &pPalPipeline[deviceIdx]); - } - else if (palResult == Util::Result::NotFound) - { - // If a replacement was not found, proceed with the original - palResult = Util::Result::Success; - } - } -#endif - - VK_ASSERT(palSize == pPalDevice->GetGraphicsPipelineSize(pObjectCreateInfo->pipeline, nullptr)); - palOffset += palSize; - } - // Create the PAL MSAA state object if (palResult == Pal::Result::Success) { @@ -321,6 +364,13 @@ VkResult GraphicsPipeline::CreatePipelineObjects( pCreateInfo->flags, VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT); + PipelineInternalBufferInfo internalBuffer = *pInternalBuffer; + if (pInternalBuffer->dataSize > 0) + { + internalBuffer.pData = Util::VoidPtrInc(pSystemMem, sizeof(GraphicsPipeline) + (palSize * numPalDevices)); + memcpy(internalBuffer.pData, pInternalBuffer->pData, pInternalBuffer->dataSize); + } + VK_PLACEMENT_NEW(pSystemMem) GraphicsPipeline( pDevice, pPalPipeline, @@ -334,6 +384,7 @@ VkResult GraphicsPipeline::CreatePipelineObjects( pObjectCreateInfo->flags.force1x1ShaderRate, pObjectCreateInfo->flags.customSampleLocations, *pVbInfo, + &internalBuffer, pPalMsaa, pPalColorBlend, pPalDepthStencil, @@ -400,15 +451,16 @@ VkResult GraphicsPipeline::Create( VkResult result = AchievePipelineLayout(pDevice, pCreateInfo, pAllocator, &pPipelineLayout, &isTempLayout); // 2. Build pipeline binary create info - GraphicsPipelineBinaryCreateInfo binaryCreateInfo = {}; - GraphicsPipelineShaderStageInfo shaderStageInfo = {}; - VbInfo vbInfo = {}; + GraphicsPipelineBinaryCreateInfo binaryCreateInfo = {}; + GraphicsPipelineShaderStageInfo shaderStageInfo = {}; + VbBindingInfo vbInfo = {}; + PipelineInternalBufferInfo internalBufferInfo = {}; ShaderModuleHandle tempModules[ShaderStage::ShaderStageGfxCount] = {}; if (result == VK_SUCCESS) { result = BuildPipelineBinaryCreateInfo( - pDevice, pCreateInfo, pPipelineLayout, &binaryCreateInfo, &shaderStageInfo, &vbInfo, tempModules); + pDevice, pCreateInfo, pPipelineLayout, &binaryCreateInfo, &shaderStageInfo, &vbInfo, &internalBufferInfo, tempModules); } // 3. 
Create pipeline binaries
@@ -431,19 +483,22 @@ VkResult GraphicsPipeline::Create( } uint64_t pipelineHash = 0; - + GraphicsPipelineObjectCreateInfo objectCreateInfo = {}; + GraphicsPipelineBinaryInfo binaryInfo = {}; if (result == VK_SUCCESS) { pipelineHash = Vkgc::IPipelineDumper::GetPipelineHash(&binaryCreateInfo.pipelineInfo); // 4. Build pipeline object create info - GraphicsPipelineObjectCreateInfo objectCreateInfo = {}; - GraphicsPipelineBinaryInfo binaryInfo = {}; binaryInfo.pOptimizerKey = &binaryCreateInfo.pipelineProfileKey; BuildPipelineObjectCreateInfo( pDevice, pCreateInfo, &vbInfo, &binaryInfo, pPipelineLayout, &objectCreateInfo); + objectCreateInfo.immedInfo.checkDeferCompilePipeline = + pDevice->GetRuntimeSettings().deferCompileOptimizedPipeline && + (binaryCreateInfo.pipelineInfo.enableEarlyCompile || binaryCreateInfo.pipelineInfo.enableUberFetchShader); + // 5. Create pipeline objects result = CreatePipelineObjects( pDevice,
@@ -451,6 +506,7 @@ VkResult GraphicsPipeline::Create( pAllocator, pPipelineLayout, &vbInfo, + &internalBufferInfo, pipelineBinarySizes, pPipelineBinaries, pPipelineCache,
@@ -468,6 +524,11 @@ VkResult GraphicsPipeline::Create( pPipelineLayout->Destroy(pDevice, pAllocator); } + if (internalBufferInfo.pData != nullptr) + { + pDevice->VkInstance()->FreeMem(internalBufferInfo.pData); + internalBufferInfo.pData = nullptr; + } // Free the created pipeline binaries now that the PAL Pipelines/PipelineBinaryInfo have read them. for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) {
@@ -477,7 +538,25 @@ VkResult GraphicsPipeline::Create( &binaryCreateInfo, pPipelineBinaries[deviceIdx], pipelineBinarySizes[deviceIdx]); } } - pDefaultCompiler->FreeGraphicsPipelineCreateInfo(&binaryCreateInfo); + + // Deferred compile will reuse all objects generated in BuildPipelineBinaryCreateInfo, + // i.e.
we need to keep the temp buffers in binaryCreateInfo. + pDefaultCompiler->FreeGraphicsPipelineCreateInfo(&binaryCreateInfo, + objectCreateInfo.immedInfo.checkDeferCompilePipeline); + + if (objectCreateInfo.immedInfo.checkDeferCompilePipeline) + { + GraphicsPipeline* pThis = GraphicsPipeline::ObjectFromHandle(*pPipeline); + result = pThis->BuildDeferCompileWorkload(pDevice, + pPipelineCache, + &binaryCreateInfo, + &shaderStageInfo, + &objectCreateInfo); + if (result == VK_SUCCESS) + { + pDefaultCompiler->ExecuteDeferCompile(&pThis->m_deferWorkload); + } + } if (result == VK_SUCCESS) {
@@ -502,6 +581,326 @@ return result; } +// ===================================================================================================================== +static size_t GetVertexInputStructSize( + const VkPipelineVertexInputStateCreateInfo* pVertexInput) +{ + size_t size = 0; + size += sizeof(VkPipelineVertexInputStateCreateInfo); + size += sizeof(VkVertexInputBindingDescription) * pVertexInput->vertexBindingDescriptionCount; + size += sizeof(VkVertexInputAttributeDescription) * pVertexInput->vertexAttributeDescriptionCount; + + const VkPipelineVertexInputDivisorStateCreateInfoEXT* pVertexDivisor = nullptr; + const vk::VkStructHeader* pStructHeader = + static_cast<const vk::VkStructHeader*>(pVertexInput->pNext); + while (pStructHeader != nullptr) + { + if (pStructHeader->sType == VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT) + { + pVertexDivisor = reinterpret_cast<const VkPipelineVertexInputDivisorStateCreateInfoEXT*>(pStructHeader); + break; + } + else + { + pStructHeader = pStructHeader->pNext; + } + } + + if (pVertexDivisor != nullptr) + { + size += sizeof(VkPipelineVertexInputDivisorStateCreateInfoEXT); + size += sizeof(VkVertexInputBindingDivisorDescriptionEXT) * pVertexDivisor->vertexBindingDivisorCount; + } + + return size; +} + +// ===================================================================================================================== +static void CopyVertexInputStruct( + const VkPipelineVertexInputStateCreateInfo* pSrcVertexInput, + VkPipelineVertexInputStateCreateInfo* pDestVertexInput) +{ + // Copy VkPipelineVertexInputStateCreateInfo + *pDestVertexInput = *pSrcVertexInput; + void* pNext = Util::VoidPtrInc(pDestVertexInput, sizeof(VkPipelineVertexInputStateCreateInfo)); + + // Copy VkVertexInputBindingDescription + pDestVertexInput->pVertexBindingDescriptions = reinterpret_cast<VkVertexInputBindingDescription*>(pNext); + size_t size = sizeof(VkVertexInputBindingDescription) * pSrcVertexInput->vertexBindingDescriptionCount; + memcpy(pNext, pSrcVertexInput->pVertexBindingDescriptions, size); + pNext = Util::VoidPtrInc(pNext, size); + + // Copy VkVertexInputAttributeDescription + pDestVertexInput->pVertexAttributeDescriptions = reinterpret_cast<VkVertexInputAttributeDescription*>(pNext); + size = sizeof(VkVertexInputAttributeDescription) * pSrcVertexInput->vertexAttributeDescriptionCount; + memcpy(pNext, pSrcVertexInput->pVertexAttributeDescriptions, size); + pNext = Util::VoidPtrInc(pNext, size); + + const VkPipelineVertexInputDivisorStateCreateInfoEXT* pSrcVertexDivisor = nullptr; + const vk::VkStructHeader* pStructHeader = + reinterpret_cast<const vk::VkStructHeader*>(pSrcVertexInput->pNext); + while (pStructHeader != nullptr) + { + if (pStructHeader->sType == VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT) + { + pSrcVertexDivisor = reinterpret_cast<const VkPipelineVertexInputDivisorStateCreateInfoEXT*>(pStructHeader); + break; + } + else + { + pStructHeader = pStructHeader->pNext; + } + } + + if (pSrcVertexDivisor != nullptr) + { + // Copy VkPipelineVertexInputDivisorStateCreateInfoEXT + VkPipelineVertexInputDivisorStateCreateInfoEXT*
pDestVertexDivisor = + reinterpret_cast<VkPipelineVertexInputDivisorStateCreateInfoEXT*>(pNext); + *pDestVertexDivisor = *pSrcVertexDivisor; + pDestVertexInput->pNext = pDestVertexDivisor; + pNext = Util::VoidPtrInc(pNext, sizeof(VkPipelineVertexInputDivisorStateCreateInfoEXT)); + + // Copy VkVertexInputBindingDivisorDescriptionEXT + pDestVertexDivisor->pVertexBindingDivisors = reinterpret_cast<VkVertexInputBindingDivisorDescriptionEXT*>(pNext); + size = sizeof(VkVertexInputBindingDivisorDescriptionEXT) * pSrcVertexDivisor->vertexBindingDivisorCount; + memcpy(pNext, pSrcVertexDivisor->pVertexBindingDivisors, size); + pNext = Util::VoidPtrInc(pNext, size); + } +} + +// ===================================================================================================================== +VkResult GraphicsPipeline::BuildDeferCompileWorkload( + Device* pDevice, + PipelineCache* pPipelineCache, + GraphicsPipelineBinaryCreateInfo* pBinaryCreateInfo, + GraphicsPipelineShaderStageInfo* pShaderStageInfo, + GraphicsPipelineObjectCreateInfo* pObjectCreateInfo) +{ + VkResult result = VK_SUCCESS; + DeferGraphicsPipelineCreateInfo* pCreateInfo = nullptr; + + // Calculate payload size + size_t payloadSize = sizeof(DeferGraphicsPipelineCreateInfo) + sizeof(Util::Event); + for (uint32_t i = 0; i < ShaderStage::ShaderStageGfxCount; i++) + { + if (pShaderStageInfo->stages[i].pEntryPoint != nullptr) + { + payloadSize += strlen(pShaderStageInfo->stages[i].pEntryPoint) + 1; + if (pShaderStageInfo->stages[i].pSpecializationInfo != nullptr) + { + auto pSpecializationInfo = pShaderStageInfo->stages[i].pSpecializationInfo; + payloadSize += sizeof(VkSpecializationInfo); + payloadSize += sizeof(VkSpecializationMapEntry) * pSpecializationInfo->mapEntryCount; + payloadSize += pSpecializationInfo->dataSize; + } + } + } + + size_t vertexInputSize = 0; + if ((pShaderStageInfo->stages[ShaderStage::ShaderStageVertex].pEntryPoint != nullptr) && + (pBinaryCreateInfo->pipelineInfo.pVertexInput != nullptr)) + { + vertexInputSize = GetVertexInputStructSize(pBinaryCreateInfo->pipelineInfo.pVertexInput); + payloadSize += vertexInputSize; + } + + size_t memOffset = 0; + Instance* pInstance = pDevice->VkInstance(); + void* pPayloadMem = pInstance->AllocMem(payloadSize, VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (pPayloadMem == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + + if (result == VK_SUCCESS) + { + memset(pPayloadMem, 0, payloadSize); + pCreateInfo = static_cast<DeferGraphicsPipelineCreateInfo*>(pPayloadMem); + memOffset = sizeof(DeferGraphicsPipelineCreateInfo); + + // Fill create info and reset defer compile related options + pCreateInfo->pDevice = pDevice; + pCreateInfo->pPipelineCache = pPipelineCache; + pCreateInfo->pPipeline = this; + pCreateInfo->shaderStageInfo = *pShaderStageInfo; + pCreateInfo->binaryCreateInfo = *pBinaryCreateInfo; + pCreateInfo->objectCreateInfo = *pObjectCreateInfo; + + pCreateInfo->binaryCreateInfo.pipelineInfo.enableEarlyCompile = false; + pCreateInfo->binaryCreateInfo.pipelineInfo.enableUberFetchShader = false; + pCreateInfo->objectCreateInfo.immedInfo.checkDeferCompilePipeline = false; + + PipelineShaderInfo* pShaderInfo[] = + { + &pCreateInfo->binaryCreateInfo.pipelineInfo.vs, + &pCreateInfo->binaryCreateInfo.pipelineInfo.tcs, + &pCreateInfo->binaryCreateInfo.pipelineInfo.tes, + &pCreateInfo->binaryCreateInfo.pipelineInfo.gs, + &pCreateInfo->binaryCreateInfo.pipelineInfo.fs, + }; + + // Do deep copy for binaryCreateInfo members + for (uint32_t i = 0; i < ShaderStage::ShaderStageGfxCount; i++) + { + if (pShaderStageInfo->stages[i].pEntryPoint != nullptr) + { + size_t size = strlen(pShaderStageInfo->stages[i].pEntryPoint) + 1;
+ char* pEntryPoint = reinterpret_cast<char*>(Util::VoidPtrInc(pPayloadMem, memOffset)); + memcpy(pEntryPoint, pShaderStageInfo->stages[i].pEntryPoint, size); + pCreateInfo->shaderStageInfo.stages[i].pEntryPoint = pEntryPoint; + pShaderInfo[i]->pEntryTarget = pEntryPoint; + memOffset += size; + + if (pShaderStageInfo->stages[i].pSpecializationInfo != nullptr) + { + auto pSrcSpecInfo = pShaderStageInfo->stages[i].pSpecializationInfo; + auto pDestSpecInfo = reinterpret_cast<VkSpecializationInfo*>(Util::VoidPtrInc(pPayloadMem, memOffset)); + *pDestSpecInfo = *pSrcSpecInfo; + memOffset += sizeof(VkSpecializationInfo); + + pDestSpecInfo->pMapEntries = reinterpret_cast<VkSpecializationMapEntry*>(Util::VoidPtrInc(pPayloadMem, memOffset)); + memcpy(const_cast<VkSpecializationMapEntry*>(pDestSpecInfo->pMapEntries), + pSrcSpecInfo->pMapEntries, + pSrcSpecInfo->mapEntryCount * sizeof(VkSpecializationMapEntry)); + memOffset += pSrcSpecInfo->mapEntryCount * sizeof(VkSpecializationMapEntry); + + pDestSpecInfo->pData = Util::VoidPtrInc(pPayloadMem, memOffset); + memcpy(const_cast<void*>(pDestSpecInfo->pData), + pSrcSpecInfo->pData, + pSrcSpecInfo->dataSize); + memOffset += pSrcSpecInfo->dataSize; + pCreateInfo->shaderStageInfo.stages[i].pSpecializationInfo = pDestSpecInfo; + pShaderInfo[i]->pSpecializationInfo = pDestSpecInfo; + } + } + } + + if (vertexInputSize != 0) + { + VkPipelineVertexInputStateCreateInfo* pVertexInput = + reinterpret_cast<VkPipelineVertexInputStateCreateInfo*>(Util::VoidPtrInc(pPayloadMem, memOffset)); + pCreateInfo->binaryCreateInfo.pipelineInfo.pVertexInput = pVertexInput; + CopyVertexInputStruct(pBinaryCreateInfo->pipelineInfo.pVertexInput, pVertexInput); + memOffset += vertexInputSize; + } + + // Build defer workload + m_deferWorkload.pPayloads = pPayloadMem; + m_deferWorkload.pEvent = VK_PLACEMENT_NEW(Util::VoidPtrInc(pPayloadMem, memOffset))(Util::Event); + memOffset += sizeof(Util::Event); + VK_ASSERT(memOffset == payloadSize); + + EventCreateFlags flags = {}; + flags.manualReset = true; + m_deferWorkload.pEvent->Init(flags); + m_deferWorkload.Execute = ExecuteDeferCreateOptimizedPipeline; + } + + return result; +} + +// ===================================================================================================================== +void GraphicsPipeline::ExecuteDeferCreateOptimizedPipeline( + void *pPayload) +{ + DeferGraphicsPipelineCreateInfo* pCreateInfo = static_cast<DeferGraphicsPipelineCreateInfo*>(pPayload); + pCreateInfo->pPipeline->DeferCreateOptimizedPipeline(pCreateInfo->pDevice, + pCreateInfo->pPipelineCache, + &pCreateInfo->binaryCreateInfo, + &pCreateInfo->shaderStageInfo, + &pCreateInfo->objectCreateInfo); +} + +// ===================================================================================================================== +VkResult GraphicsPipeline::DeferCreateOptimizedPipeline( + Device* pDevice, + PipelineCache* pPipelineCache, + GraphicsPipelineBinaryCreateInfo* pBinaryCreateInfo, + GraphicsPipelineShaderStageInfo* pShaderStageInfo, + GraphicsPipelineObjectCreateInfo* pObjectCreateInfo) +{ + VkResult result = VK_SUCCESS; + size_t pipelineBinarySizes[MaxPalDevices] = {}; + const void* pPipelineBinaries[MaxPalDevices] = {}; + Util::MetroHash::Hash cacheId[MaxPalDevices] = {}; + Pal::IPipeline* pPalPipeline[MaxPalDevices] = {}; + + Pal::Result palResult = Pal::Result::Success; + size_t palSize = + pDevice->PalDevice(DefaultDeviceIndex)->GetGraphicsPipelineSize(pObjectCreateInfo->pipeline, &palResult); + VK_ASSERT(palResult == Pal::Result::Success); + + uint32_t numPalDevices = pDevice->NumPalDevices(); + void* pSystemMem =
pDevice->VkInstance()->AllocMem( + palSize, VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pSystemMem == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + + if (result == VK_SUCCESS) + { + result = CreatePipelineBinaries(pDevice, + nullptr, + pShaderStageInfo, + nullptr, + pBinaryCreateInfo, + pPipelineCache, + nullptr, + cacheId, + pipelineBinarySizes, + pPipelineBinaries); + } + + if (result == VK_SUCCESS) + { + result = CreatePalPipelineObjects(pDevice, + pPipelineCache, + pObjectCreateInfo, + pipelineBinarySizes, + pPipelineBinaries, + cacheId, + pSystemMem, + pPalPipeline); + } + + if (result == VK_SUCCESS) + { + VK_ASSERT(pSystemMem == pPalPipeline[0]); + SetOptimizedPipeline(pPalPipeline); + } + + pDevice->GetCompiler(DefaultDeviceIndex)->FreeGraphicsPipelineCreateInfo(pBinaryCreateInfo, false); + + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + if (pPipelineBinaries[deviceIdx] != nullptr) + { + pDevice->GetCompiler(deviceIdx)->FreeGraphicsPipelineBinary( + pBinaryCreateInfo, pPipelineBinaries[deviceIdx], pipelineBinarySizes[deviceIdx]); + } + } + return result; +} + +// ===================================================================================================================== +void GraphicsPipeline::SetOptimizedPipeline( + Pal::IPipeline* pPalPipeline[MaxPalDevices]) +{ + const bool optimizedPipeline = true; + Util::MetroHash::Hash hash = {}; + Util::MetroHash64 palPipelineHasher; + palPipelineHasher.Update(PalPipelineHash()); + palPipelineHasher.Update(optimizedPipeline); + palPipelineHasher.Finalize(hash.bytes); + + Util::MutexAuto pipelineSwitchLock(&m_pipelineSwitchLock); + memcpy(m_pOptimizedPipeline, pPalPipeline, sizeof(m_pOptimizedPipeline)); + m_optimizedPipelineHash = hash.qwords[0]; +} + // ===================================================================================================================== GraphicsPipeline::GraphicsPipeline( Device* const pDevice, @@ -515,7 +914,8 @@ GraphicsPipeline::GraphicsPipeline( bool bindInputAssemblyState, bool force1x1ShaderRate, bool customSampleLocations, - const VbInfo& vbInfo, + const VbBindingInfo& vbInfo, + const PipelineInternalBufferInfo* pInternalBuffer, Pal::IMsaaState** pPalMsaa, Pal::IColorBlendState** pPalColorBlend, Pal::IDepthStencilState** pPalDepthStencil, @@ -529,6 +929,10 @@ GraphicsPipeline::GraphicsPipeline( pDevice), m_info(immedInfo), m_vbInfo(vbInfo), + m_internalBufferInfo(*pInternalBuffer), + m_pOptimizedPipeline{}, + m_optimizedPipelineHash(0), + m_deferWorkload{}, m_flags() { Pipeline::Init(pPalPipeline, pLayout, pBinary, staticStateMask, apiHash); @@ -689,8 +1093,33 @@ VkResult GraphicsPipeline::Destroy( Device* pDevice, const VkAllocationCallbacks* pAllocator) { + if (m_deferWorkload.pEvent != nullptr) + { + auto result = m_deferWorkload.pEvent->Wait(10); + if (result == Util::Result::Success) + { + Util::Destructor(m_deferWorkload.pEvent); + pDevice->VkInstance()->FreeMem(m_deferWorkload.pPayloads); + } + m_deferWorkload.pEvent = nullptr; + m_deferWorkload.pPayloads = nullptr; + } + DestroyStaticState(pAllocator); + if (m_pOptimizedPipeline[0] != nullptr) + { + void* pBaseMem = m_pOptimizedPipeline[0]; + for (uint32_t deviceIdx = 0; + (deviceIdx < m_pDevice->NumPalDevices()) && (m_pPalPipeline[deviceIdx] != nullptr); + deviceIdx++) + { + m_pOptimizedPipeline[deviceIdx]->Destroy(); + m_pOptimizedPipeline[deviceIdx] = nullptr; + } + pDevice->VkInstance()->FreeMem(pBaseMem); + } + return Pipeline::Destroy(pDevice, pAllocator); 
}
@@ -920,8 +1349,9 @@ void GraphicsPipeline::BindToCmdBuffer( pRenderState->inputAssemblyState = m_info.inputAssemblyState; } + const bool useOptimizedPipeline = UseOptimizedPipeline(); const uint64_t oldHash = pRenderState->boundGraphicsPipelineHash; - const uint64_t newHash = PalPipelineHash(); + const uint64_t newHash = useOptimizedPipeline ? m_optimizedPipelineHash : PalPipelineHash(); utils::IterateMask deviceGroup(pCmdBuffer->GetDeviceMask()); do
@@ -939,7 +1369,7 @@ void GraphicsPipeline::BindToCmdBuffer( Pal::PipelineBindParams params = {}; params.pipelineBindPoint = Pal::PipelineBindPoint::Graphics; - params.pPipeline = m_pPalPipeline[deviceIdx]; + params.pPipeline = useOptimizedPipeline ? m_pOptimizedPipeline[deviceIdx] : m_pPalPipeline[deviceIdx]; params.graphics = graphicsShaderInfos; params.apiPsoHash = m_apiHash;
@@ -975,9 +1405,9 @@ void GraphicsPipeline::BindToCmdBuffer( Pal::PipelineBindParams params = {}; params.pipelineBindPoint = Pal::PipelineBindPoint::Graphics; - params.pPipeline = m_pPalPipeline[deviceIdx]; + params.pPipeline = useOptimizedPipeline ? m_pOptimizedPipeline[deviceIdx] : m_pPalPipeline[deviceIdx]; params.graphics = graphicsShaderInfos; - params.apiPsoHash = m_apiHash; + params.apiPsoHash = m_apiHash; pPalCmdBuf->CmdBindPipeline(params); }
@@ -1079,16 +1509,21 @@ void GraphicsPipeline::BindToCmdBuffer( pRenderState->dirtyGraphics.vrs = 0; } - if (m_vbInfo.uberFetchShaderBuffer.bufferSize > 0) + if ((useOptimizedPipeline == false) && (m_internalBufferInfo.dataSize > 0)) { - VK_ASSERT(m_vbInfo.uberFetchShaderBuffer.userDataOffset > 0); + VK_ASSERT(m_internalBufferInfo.internalBufferCount > 0); Pal::gpusize gpuAddress = {}; - uint32_t* pCpuAddr = pPalCmdBuf->CmdAllocateEmbeddedData(m_vbInfo.uberFetchShaderBuffer.bufferSize, 1, &gpuAddress); - memcpy(pCpuAddr, m_vbInfo.uberFetchShaderBuffer.bufferData, m_vbInfo.uberFetchShaderBuffer.bufferSize); - pPalCmdBuf->CmdSetUserData(Pal::PipelineBindPoint::Graphics, - m_vbInfo.uberFetchShaderBuffer.userDataOffset, - 2, - reinterpret_cast<const uint32_t*>(&gpuAddress)); + uint32_t* pCpuAddr = pPalCmdBuf->CmdAllocateEmbeddedData(m_internalBufferInfo.dataSize, 1, &gpuAddress); + memcpy(pCpuAddr, m_internalBufferInfo.pData, m_internalBufferInfo.dataSize); + for (uint32_t i = 0; i < m_internalBufferInfo.internalBufferCount; i++) + { + Pal::gpusize bufferAddress = gpuAddress; + bufferAddress += m_internalBufferInfo.internalBufferEntries[i].bufferOffset; + pPalCmdBuf->CmdSetUserData(Pal::PipelineBindPoint::Graphics, + m_internalBufferInfo.internalBufferEntries[i].userDataOffset, + 2, + reinterpret_cast<const uint32_t*>(&bufferAddress)); + } } } while (deviceGroup.IterateNext());
diff --git a/icd/api/vk_image.cpp b/icd/api/vk_image.cpp index 9f268e9b..e74936d8 100644 --- a/icd/api/vk_image.cpp +++ b/icd/api/vk_image.cpp
@@ -131,7 +131,7 @@ void Image::CalcMemoryPriority( m_priority = MemoryPriority::FromSetting(settings.memoryPriorityDefault); - if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_MEMORY_PRIORITY) == false) + if (pDevice->GetEnabledFeatures().appControlledMemPriority == false) { UpgradeToHigherPriority(settings.memoryPriorityImageAny, &m_priority);
@@ -161,7 +161,6 @@ void Image::CalcMemoryPriority( // ===================================================================================================================== Image::Image( Device* pDevice, - const VkAllocationCallbacks* pAllocator, VkImageCreateFlags flags, Pal::IImage** pPalImages, Pal::IGpuMemory** pPalMemory, @@ -245,7 +244,6 @@ Image::Image( } CalcMemoryPriority(pDevice); - }
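A brief note on the CalcMemoryPriority change in this hunk: the driver-side priority heuristics now key off the device's appControlledMemPriority feature rather than the EXT_memory_priority extension alone, so pageable_device_local_memory takes the same path. A minimal sketch of the resulting policy (struct and helper are illustrative, not driver types):

    // Illustrative only: driver-side priority adjustment is skipped once the
    // application controls priorities via EXT_memory_priority or
    // EXT_pageable_device_local_memory (both funnel into one feature bit).
    struct EnabledFeatures
    {
        bool appControlledMemPriority;
    };

    static bool DriverMayAdjustPriority(const EnabledFeatures& features)
    {
        return features.appControlledMemPriority == false;
    }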
// ===================================================================================================================== @@ -342,6 +340,9 @@ static VkResult InitSparseVirtualMemory( pPalImage[DefaultDeviceIndex]->GetGpuMemoryRequirements(&palReqs); + // We need virtual remapping support for all sparse resources + VK_ASSERT(pDevice->VkPhysicalDevice(DefaultDeviceIndex)->IsVirtualRemappingSupported()); + const VkDeviceSize sparseAllocGranularity = pDevice->GetProperties().virtualMemAllocGranularity; memset(pSparseMemCreateInfo, 0, sizeof(*pSparseMemCreateInfo)); @@ -359,20 +360,6 @@ static VkResult InitSparseVirtualMemory( pSparseMemCreateInfo->virtualAccessMode = Pal::VirtualGpuMemAccessMode::ReadZero; } - size_t palMemSize = 0; - - for (uint32_t deviceIdx = 0; (result == VK_SUCCESS) && (deviceIdx < pDevice->NumPalDevices()); deviceIdx++) - { - Pal::Result palResult = Pal::Result::Success; - - palMemSize += pDevice->PalDevice(DefaultDeviceIndex)->GetGpuMemorySize(*pSparseMemCreateInfo, &palResult); - - if (palResult != Pal::Result::Success) - { - result = VK_ERROR_INITIALIZATION_FAILED; - } - } - // If it's a sparse image we should also cache sparse image block dimensions (tile size) to // optimize sparse binding update, keeping in mind that each supported aspect (color, depth, // stencil) is permitted to use different granularity @@ -390,47 +377,53 @@ static VkResult InitSparseVirtualMemory( *pSparseTileSize = sparseFormatProperties.imageGranularity; - void* pPalMemoryObj = nullptr; + Pal::Result palResult; - if (result == VK_SUCCESS) - { - pPalMemoryObj = pAllocator->pfnAllocation( + size_t palMemSize = pDevice->PalDevice(DefaultDeviceIndex)->GetGpuMemorySize(*pSparseMemCreateInfo, &palResult); + VK_ASSERT(palResult == Pal::Result::Success); + + void* pPalMemoryObj = pAllocator->pfnAllocation( pAllocator->pUserData, - palMemSize, + (palMemSize * pDevice->NumPalDevices()), VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (pPalMemoryObj == nullptr) + if (pPalMemoryObj != nullptr) + { + size_t palMemOffset = 0; + + for (uint32_t deviceIdx = 0; + (deviceIdx < pDevice->NumPalDevices()) && (palResult == Pal::Result::Success); + deviceIdx++) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - } - } + if (deviceIdx != DefaultDeviceIndex) + { + Pal::GpuMemoryRequirements deviceReqs = {}; + pPalImage[deviceIdx]->GetGpuMemoryRequirements(&deviceReqs); + VK_ASSERT(memcmp(&palReqs, &deviceReqs, sizeof(deviceReqs)) == 0); - size_t palMemOffset = 0; + VK_ASSERT(palMemSize == pDevice->PalDevice(deviceIdx)->GetGpuMemorySize(*pSparseMemCreateInfo, &palResult)); + VK_ASSERT(palResult == Pal::Result::Success); + } - for (uint32_t deviceIdx = 0; - (deviceIdx < pDevice->NumPalDevices()) && (result == VK_SUCCESS); - deviceIdx++) - { - Pal::Result palResult = pDevice->PalDevice(deviceIdx)->CreateGpuMemory( - *pSparseMemCreateInfo, - Util::VoidPtrInc(pPalMemoryObj, palMemOffset), - &pSparseMemory[deviceIdx]); + palResult = pDevice->PalDevice(deviceIdx)->CreateGpuMemory( + *pSparseMemCreateInfo, + Util::VoidPtrInc(pPalMemoryObj, palMemOffset), + &pSparseMemory[deviceIdx]); - if (palResult == Pal::Result::Success) - { - palResult = pPalImage[deviceIdx]->BindGpuMemory(pSparseMemory[deviceIdx], 0); - } + if (palResult == Pal::Result::Success) + { + palResult = pPalImage[deviceIdx]->BindGpuMemory(pSparseMemory[deviceIdx], 0); + } - if (palResult == Pal::Result::Success) - { - palMemOffset += pDevice->PalDevice(deviceIdx)->GetGpuMemorySize(*pSparseMemCreateInfo, &palResult); + palMemOffset += palMemSize; } - if 
@@ -576,7 +569,9 @@ VkResult Image::Create(
             }
             case VK_STRUCTURE_TYPE_IMAGE_SWAPCHAIN_CREATE_INFO_KHR:
             {
-                VK_NOT_IMPLEMENTED;
+                // Nothing to do. BindSwapchainMemory has access to the swapchain and reinitializes based on it.
+                // Some of that could be pulled here, but validation is needed to be sure the same swapchain is
+                // provided, or else reinitialization would be required anyway.
                 break;
             }
@@ -890,7 +885,6 @@ VkResult Image::Create(
     // Construct API image object.
     VK_PLACEMENT_NEW (pMemory) Image(
         pDevice,
-        pAllocator,
         pCreateInfo->flags,
         pPalImages,
         pSparseMemory,
@@ -1081,7 +1075,6 @@ VkResult Image::CreatePresentableImage(
     // Construct API image object.
     VK_PLACEMENT_NEW (pImgObjMemory) Image(
         pDevice,
-        pAllocator,
         0,
         pPalImage,
         nullptr,
@@ -1127,7 +1120,6 @@ VkResult Image::Destroy(
     Device*                      pDevice,
     const VkAllocationCallbacks* pAllocator)
 {
-
     for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++)
     {
         if (m_perGpu[deviceIdx].pPalImage != nullptr)
@@ -1303,7 +1295,7 @@ VkResult Image::BindMemory(
         // After applying any necessary base address offset, the full GPU address should be aligned
         VK_ASSERT(Util::IsPow2Aligned(baseGpuAddr + baseAddrOffset + memOffset, reqs.alignment));

-        if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_MEMORY_PRIORITY) == false)
+        if (pDevice->GetEnabledFeatures().appControlledMemPriority == false)
         {
             pMemory->ElevatePriority(m_priority);
         }
@@ -1638,6 +1630,16 @@ VkResult Image::GetMemoryRequirements(

     PalImage(DefaultDeviceIndex)->GetGpuMemoryRequirements(&palReqs);

+    for (uint32 deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++)
+    {
+        if (deviceIdx != DefaultDeviceIndex)
+        {
+            Pal::GpuMemoryRequirements deviceReqs = {};
+            PalImage(deviceIdx)->GetGpuMemoryRequirements(&deviceReqs);
+            VK_ASSERT(memcmp(&palReqs, &deviceReqs, sizeof(deviceReqs)) == 0);
+        }
+    }
+
     if (isSparse)
     {
         pReqs->alignment = Util::RoundUpToMultiple(virtualGranularity, palReqs.alignment);
@@ -1673,13 +1675,6 @@ VkResult Image::GetMemoryRequirements(
         pReqs->memoryTypeBits &= pDevice->GetMemoryTypeMaskForExternalSharing();
     }

-    // Optional: if the image is optimally tiled, don't allow it with host visible memory types.
-    if ((m_internalFlags.linear == 0) &&
-        pDevice->GetRuntimeSettings().addHostInvisibleMemoryTypesForOptimalImages)
-    {
-        pReqs->memoryTypeBits &= ~pDevice->GetMemoryTypeMaskMatching(VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
-    }
-
     if (m_internalFlags.isProtected)
     {
         // If the image is protected only keep the protected type
@@ -1843,36 +1838,19 @@ VKAPI_ATTR void VKAPI_CALL vkGetImageMemoryRequirements2(
     VkMemoryRequirements2*                      pMemoryRequirements)
 {
     const Device* pDevice = ApiDevice::ObjectFromHandle(device);
-    VK_ASSERT((pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) ||
-              pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2));

-    union
-    {
-        const VkStructHeader*                 pHeader;
-        const VkImageMemoryRequirementsInfo2* pRequirementsInfo2;
-    };
+    VkMemoryRequirements* pMemReq = &pMemoryRequirements->memoryRequirements;
+    Image* pImage = Image::ObjectFromHandle(pInfo->image);
+    pImage->GetMemoryRequirements(pDevice, pMemReq);

-    pRequirementsInfo2 = pInfo;
-    pHeader = utils::GetExtensionStructure(pHeader, VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2);
+    VkMemoryDedicatedRequirements* pMemDedicatedRequirements =
+        static_cast<VkMemoryDedicatedRequirements*>(pMemoryRequirements->pNext);

-    if (pHeader != nullptr)
+    if ((pMemDedicatedRequirements != nullptr) &&
+        (pMemDedicatedRequirements->sType == VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS))
     {
-        VkMemoryRequirements* pMemReq = &pMemoryRequirements->memoryRequirements;
-        Image* pImage = Image::ObjectFromHandle(pRequirementsInfo2->image);
-        pImage->GetMemoryRequirements(pDevice, pMemReq);
-
-        if (pMemoryRequirements->sType == VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2)
-        {
-            VkMemoryDedicatedRequirements* pMemDedicatedRequirements =
-                static_cast<VkMemoryDedicatedRequirements*>(pMemoryRequirements->pNext);
-
-            if ((pMemDedicatedRequirements != nullptr) &&
-                (pMemDedicatedRequirements->sType == VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS))
-            {
-                pMemDedicatedRequirements->prefersDedicatedAllocation  = pImage->DedicatedMemoryRequired();
-                pMemDedicatedRequirements->requiresDedicatedAllocation = pImage->DedicatedMemoryRequired();
-            }
-        }
+        pMemDedicatedRequirements->prefersDedicatedAllocation  = pImage->DedicatedMemoryRequired();
+        pMemDedicatedRequirements->requiresDedicatedAllocation = pImage->DedicatedMemoryRequired();
     }
 }

@@ -1884,26 +1862,12 @@ VKAPI_ATTR void VKAPI_CALL vkGetImageSparseMemoryRequirements2(
     VkSparseImageMemoryRequirements2*           pSparseMemoryRequirements)
 {
     const Device* pDevice = ApiDevice::ObjectFromHandle(device);
-    VK_ASSERT((pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) ||
-              pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2));
-
-    union
-    {
-        const VkStructHeader*                       pHeader;
-        const VkImageSparseMemoryRequirementsInfo2* pRequirementsInfo2;
-    };

-    pRequirementsInfo2 = pInfo;
-    pHeader = utils::GetExtensionStructure(pHeader, VK_STRUCTURE_TYPE_IMAGE_SPARSE_MEMORY_REQUIREMENTS_INFO_2);
-
-    if (pHeader != nullptr)
-    {
-        Image* pImage = Image::ObjectFromHandle(pRequirementsInfo2->image);
-        auto memReqsView = utils::ArrayView<VkSparseImageMemoryRequirements>(
-            pSparseMemoryRequirements,
-            &pSparseMemoryRequirements->memoryRequirements);
-        pImage->GetSparseMemoryRequirements(pDevice, pSparseMemoryRequirementCount, memReqsView);
-    }
+    Image* pImage = Image::ObjectFromHandle(pInfo->image);
+    auto memReqsView = utils::ArrayView<VkSparseImageMemoryRequirements>(
+        pSparseMemoryRequirements,
+        &pSparseMemoryRequirements->memoryRequirements);
+    pImage->GetSparseMemoryRequirements(pDevice, pSparseMemoryRequirementCount, memReqsView);
 }

 } // namespace entry
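Aside (not part of the patch): the rewritten vkGetImageMemoryRequirements2 only inspects the first struct chained to pNext. A more general chain walk, equivalent to what a GetExtensionStructure-style helper does, looks like this (usage sketch against the public Vulkan API; the helper name is illustrative):

    #include <vulkan/vulkan.h>

    static void* FindStructInChain(void* pNext, VkStructureType sType)
    {
        auto* pHeader = static_cast<VkBaseOutStructure*>(pNext);

        while ((pHeader != nullptr) && (pHeader->sType != sType))
        {
            pHeader = pHeader->pNext;
        }

        return pHeader;
    }

    // Fill dedicated-allocation info if the caller chained it anywhere in pNext.
    void FillDedicatedRequirements(VkMemoryRequirements2* pMemoryRequirements, bool dedicatedRequired)
    {
        auto* pDedicated = static_cast<VkMemoryDedicatedRequirements*>(
            FindStructInChain(pMemoryRequirements->pNext, VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS));

        if (pDedicated != nullptr)
        {
            pDedicated->prefersDedicatedAllocation  = dedicatedRequired ? VK_TRUE : VK_FALSE;
            pDedicated->requiresDedicatedAllocation = dedicatedRequired ? VK_TRUE : VK_FALSE;
        }
    }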
diff --git a/icd/api/vk_instance.cpp b/icd/api/vk_instance.cpp
index e1ca79f8..54189536 100644
--- a/icd/api/vk_instance.cpp
+++ b/icd/api/vk_instance.cpp
@@ -347,7 +347,7 @@ VkResult Instance::Init(
     createInfo.pLogInfo = &callbackInfo;

-#if defined(__unix__)
+#if defined(__unix__)
     createInfo.pSettingsPath = "/etc/amd";
 #else
     createInfo.pSettingsPath = "Vulkan";
diff --git a/icd/api/vk_memory.cpp b/icd/api/vk_memory.cpp
index 3e4eb5d2..67eb3937 100644
--- a/icd/api/vk_memory.cpp
+++ b/icd/api/vk_memory.cpp
@@ -406,10 +406,6 @@ VkResult Memory::Create(
                 &bindData,
                 sizeof(Pal::GpuMemoryResourceBindEventData));
         }
-        else
-        {
-            VK_NEVER_CALLED();
-        }

     // When share a dedicated image, metadata(width/height/mips/...) info is necessary in handle,
     // so driver calls bindMemory here to update metadata at allocation time.
@@ -1175,18 +1171,27 @@ void Memory::ElevatePriority(
     // the new given priority.
     if (m_priority < priority)
     {
-        Util::MutexAuto lock(m_pDevice->GetMemoryMutex());
+        SetPriority(priority, true);
+    }
+}

-        if (m_priority < priority)
+// =====================================================================================================================
+// This function sets a new priority for this memory's allocation.
+void Memory::SetPriority(
+    const MemoryPriority priority,
+    const bool           mustBeLower)
+{
+    Util::MutexAuto lock(m_pDevice->GetMemoryMutex());
+    if (((mustBeLower == false) && (m_priority != priority)) ||
+        ((mustBeLower == true)  && (m_priority < priority)))
+    {
+        for (uint32_t deviceIdx = 0; deviceIdx < m_pDevice->NumPalDevices(); deviceIdx++)
         {
-            for (uint32_t deviceIdx = 0; deviceIdx < m_pDevice->NumPalDevices(); deviceIdx++)
+            if ((PalMemory(deviceIdx) != nullptr) &&
+                (PalMemory(deviceIdx)->SetPriority(priority.PalPriority(), priority.PalOffset()) ==
+                 Pal::Result::Success))
             {
-                if ((PalMemory(deviceIdx) != nullptr) &&
-                    (PalMemory(deviceIdx)->SetPriority(priority.PalPriority(), priority.PalOffset()) ==
-                     Pal::Result::Success))
-                {
-                    m_priority = priority;
-                }
+                m_priority = priority;
             }
         }
     }
@@ -1360,11 +1365,9 @@ VKAPI_ATTR void VKAPI_CALL vkFreeMemory(
         Device* pDevice = ApiDevice::ObjectFromHandle(device);
         Memory* pMemory = Memory::ObjectFromHandle(memory);

-        {
-            const VkAllocationCallbacks* pAllocCB = pAllocator ? pAllocator : pDevice->VkInstance()->GetAllocCallbacks();
+        const VkAllocationCallbacks* pAllocCB = pAllocator ? pAllocator : pDevice->VkInstance()->GetAllocCallbacks();

-            pMemory->Free(pDevice, pAllocCB);
-        }
+        pMemory->Free(pDevice, pAllocCB);
     }
 }
diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp
index cea821da..c3561002 100644
--- a/icd/api/vk_physical_device.cpp
+++ b/icd/api/vk_physical_device.cpp
@@ -741,6 +741,12 @@ VkResult PhysicalDevice::Initialize()
         Pal::GpuHeapGartCacheable
     };

+    if (settings.forceUMA)
+    {
+        heapProperties[Pal::GpuHeapInvisible].heapSize = 0;
+        heapProperties[Pal::GpuHeapLocal].heapSize     = 0;
+    }
+
     const Pal::gpusize invisHeapSize = heapProperties[Pal::GpuHeapInvisible].heapSize;
     const Pal::gpusize localHeapSize = heapProperties[Pal::GpuHeapLocal].heapSize;
@@ -859,29 +865,6 @@ VkResult PhysicalDevice::Initialize()
                 memTypeWantsCoherentMemory[memoryTypeIndex] = true;
             }
         }
-
-        // Optional: if we have exposed a memory type that is host visible, add a backup
-        // memory type that is not host visible. We will use it for optimally tiled images.
-        if (settings.addHostInvisibleMemoryTypesForOptimalImages &&
-            ((pMemoryType->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) != 0) &&
-            // Skip host visible+coherent+cached as we won't need it
-            ((pMemoryType->propertyFlags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) == 0))
-        {
-            memoryTypeIndex = m_memoryProperties.memoryTypeCount++;
-
-            m_memoryTypeMask |= 1 << memoryTypeIndex;
-            m_memoryVkIndexToPalHeap[memoryTypeIndex] = palGpuHeap;
-            m_memoryPalHeapToVkIndexBits[palGpuHeap] |= (1UL << memoryTypeIndex);
-
-            VkMemoryType* pNextMemoryType = &m_memoryProperties.memoryTypes[memoryTypeIndex];
-
-            constexpr VkFlags hostMask = (VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-                                          VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
-                                          VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
-
-            pNextMemoryType->heapIndex     = pMemoryType->heapIndex;
-            pNextMemoryType->propertyFlags = pMemoryType->propertyFlags & ~hostMask;
-        }
     }
 }
@@ -956,6 +939,10 @@ VkResult PhysicalDevice::Initialize()
                     (1UL << memoryTypeIndex);

                 m_memoryTypeMask |= 1 << m_memoryProperties.memoryTypeCount;
+
+                m_memoryVkIndexAddRemoteBackupHeap[m_memoryProperties.memoryTypeCount] =
+                    m_memoryVkIndexAddRemoteBackupHeap[memoryTypeIndex];
+
                 ++m_memoryProperties.memoryTypeCount;
             }
         }
@@ -3613,8 +3600,8 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions(
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(GOOGLE_HLSL_FUNCTIONALITY1));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(GOOGLE_DECORATE_STRING));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SCALAR_BLOCK_LAYOUT));
-    availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_MEMORY_PRIORITY));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_MEMORY_BUDGET));
+    availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_MEMORY_PRIORITY));

     if ((pPhysicalDevice == nullptr) || pPhysicalDevice->PalProperties().gfxipProperties.flags.supportPostDepthCoverage)
     {
@@ -3678,9 +3665,16 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions(
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_TERMINATE_INVOCATION));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_EXTENDED_DYNAMIC_STATE2));
+    availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_INTEGER_DOT_PRODUCT));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_COPY_COMMANDS2));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_SUBGROUP_UNIFORM_CONTROL_FLOW));

+    availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_ATOMIC_FLOAT));
+    if ((pPhysicalDevice == nullptr) || (pPhysicalDevice->PalProperties().gfxLevel > Pal::GfxIpLevel::GfxIp9))
+    {
+        availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_ATOMIC_FLOAT2));
+    }
+
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_4444_FORMATS));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SYNCHRONIZATION2));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_CUSTOM_BORDER_COLOR));
@@ -4905,6 +4899,22 @@ size_t PhysicalDevice::GetFeatures2(
             break;
         }

+        case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR:
+        {
+            if (IsExtensionSupported(DeviceExtensions::KHR_SHADER_INTEGER_DOT_PRODUCT))
+            {
+                auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR*>(pHeader);
+
+                if (updateFeatures)
+                {
+                    pExtInfo->shaderIntegerDotProduct = VK_TRUE;
+                }
+
+                structSize = sizeof(*pExtInfo);
+            }
+            break;
+        }
+
         case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES:
         {
             auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceScalarBlockLayoutFeatures*>(pHeader);
@@ -4977,6 +4987,21 @@ size_t PhysicalDevice::GetFeatures2(
             break;
         }

+        case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVE_TOPOLOGY_LIST_RESTART_FEATURES_EXT:
+        {
+            auto* pExtInfo = reinterpret_cast<VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT*>(pHeader);
+
+            if (updateFeatures)
+            {
+                pExtInfo->primitiveTopologyListRestart      = VK_TRUE;
+                pExtInfo->primitiveTopologyPatchListRestart = VK_FALSE;
+            }
+
+            structSize = sizeof(*pExtInfo);
+
+            break;
+        }
+
         case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_TERMINATE_INVOCATION_FEATURES_KHR:
         {
             auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceShaderTerminateInvocationFeaturesKHR*>(pHeader);
@@ -5089,6 +5114,22 @@ size_t PhysicalDevice::GetFeatures2(
             break;
         }

+        case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_EXT:
+        {
+            auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceBufferDeviceAddressFeaturesEXT*>(pHeader);
+
+            if (updateFeatures)
+            {
+                GetPhysicalDeviceBufferAddressFeatures(
+                    &pExtInfo->bufferDeviceAddress,
+                    &pExtInfo->bufferDeviceAddressCaptureReplay,
+                    &pExtInfo->bufferDeviceAddressMultiDevice);
+            }
+
+            structSize = sizeof(*pExtInfo);
+
+            break;
+        }
         case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_FEATURES_EXT:
         {
             auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceLineRasterizationFeaturesEXT*>(pHeader);
@@ -5527,11 +5568,25 @@ size_t PhysicalDevice::GetFeatures2(
                 {
                     pExtInfo->shaderBufferFloat32Atomics   = VK_TRUE;
                     pExtInfo->shaderBufferFloat32AtomicAdd = VK_FALSE;
-                    pExtInfo->shaderBufferFloat64Atomics   = VK_TRUE;
+                    if (PalProperties().gfxipProperties.flags.support64BitInstructions)
+                    {
+                        pExtInfo->shaderBufferFloat64Atomics = VK_TRUE;
+                    }
+                    else
+                    {
+                        pExtInfo->shaderBufferFloat64Atomics = VK_FALSE;
+                    }
                     pExtInfo->shaderBufferFloat64AtomicAdd = VK_FALSE;
                     pExtInfo->shaderSharedFloat32Atomics   = VK_TRUE;
                     pExtInfo->shaderSharedFloat32AtomicAdd = VK_FALSE;
-                    pExtInfo->shaderSharedFloat64Atomics   = VK_TRUE;
+                    if (PalProperties().gfxipProperties.flags.support64BitInstructions)
+                    {
+                        pExtInfo->shaderSharedFloat64Atomics = VK_TRUE;
+                    }
+                    else
+                    {
+                        pExtInfo->shaderSharedFloat64Atomics = VK_FALSE;
+                    }
                     pExtInfo->shaderSharedFloat64AtomicAdd = VK_FALSE;
                     pExtInfo->shaderImageFloat32Atomics    = VK_TRUE;
                     pExtInfo->shaderImageFloat32AtomicAdd  = VK_FALSE;
@@ -5553,12 +5608,26 @@ size_t PhysicalDevice::GetFeatures2(
                     pExtInfo->shaderBufferFloat16AtomicAdd    = VK_FALSE;
                     pExtInfo->shaderBufferFloat16AtomicMinMax = VK_FALSE;
                     pExtInfo->shaderBufferFloat32AtomicMinMax = VK_TRUE;
-                    pExtInfo->shaderBufferFloat64AtomicMinMax = VK_TRUE;
+                    if (PalProperties().gfxipProperties.flags.support64BitInstructions)
+                    {
+                        pExtInfo->shaderBufferFloat64AtomicMinMax = VK_TRUE;
+                    }
+                    else
+                    {
+                        pExtInfo->shaderBufferFloat64AtomicMinMax = VK_FALSE;
+                    }
                     pExtInfo->shaderSharedFloat16Atomics      = VK_FALSE;
                     pExtInfo->shaderSharedFloat16AtomicAdd    = VK_FALSE;
                     pExtInfo->shaderSharedFloat16AtomicMinMax = VK_FALSE;
                     pExtInfo->shaderSharedFloat32AtomicMinMax = VK_TRUE;
-                    pExtInfo->shaderSharedFloat64AtomicMinMax = VK_TRUE;
+                    if (PalProperties().gfxipProperties.flags.support64BitInstructions)
+                    {
+                        pExtInfo->shaderSharedFloat64AtomicMinMax = VK_TRUE;
+                    }
+                    else
+                    {
+                        pExtInfo->shaderSharedFloat64AtomicMinMax = VK_FALSE;
+                    }
                     pExtInfo->shaderImageFloat32AtomicMinMax  = VK_TRUE;
                     pExtInfo->sparseImageFloat32AtomicMinMax  = VK_TRUE;
                 }
@@ -5567,6 +5636,19 @@ size_t PhysicalDevice::GetFeatures2(
             break;
         }

+        case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PAGEABLE_DEVICE_LOCAL_MEMORY_FEATURES_EXT:
+        {
+            auto* pExtInfo = reinterpret_cast<VkPhysicalDevicePageableDeviceLocalMemoryFeaturesEXT*>(pHeader);
+
+            if (updateFeatures)
+            {
+                pExtInfo->pageableDeviceLocalMemory = VK_TRUE;
+            }
+
+            structSize = sizeof(*pExtInfo);
+            break;
+        }
+
         default:
         {
             // skip any unsupported extension structures
@@ -6113,6 +6195,55 @@ void PhysicalDevice::GetDeviceProperties2(
             break;
         }

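Aside (not part of the patch): pageableDeviceLocalMemory = VK_TRUE means applications can re-prioritize an allocation after the fact via vkSetDeviceMemoryPriorityEXT from VK_EXT_pageable_device_local_memory, which the Memory::SetPriority refactor earlier in this patch appears to back. A minimal app-side usage sketch:

    #include <vulkan/vulkan.h>

    void DemoteWhenIdle(VkDevice device, VkDeviceMemory memory)
    {
        // Requires VK_EXT_pageable_device_local_memory to be enabled on the device.
        auto pfnSetPriority = reinterpret_cast<PFN_vkSetDeviceMemoryPriorityEXT>(
            vkGetDeviceProcAddr(device, "vkSetDeviceMemoryPriorityEXT"));

        if (pfnSetPriority != nullptr)
        {
            // 0.0f is the lowest priority, 1.0f the highest; 0.5f is the default.
            pfnSetPriority(device, memory, 0.0f);
        }
    }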
+        case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR:
+        {
+            auto* pProps = static_cast<VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR*>(pNext);
+
+            const VkBool32 int8DotSupport = PalProperties().gfxipProperties.flags.supportInt8Dot ? VK_TRUE :
+                                                                                                   VK_FALSE;
+            pProps->integerDotProduct8BitUnsignedAccelerated         = int8DotSupport;
+            pProps->integerDotProduct8BitSignedAccelerated           = int8DotSupport;
+            pProps->integerDotProduct4x8BitPackedUnsignedAccelerated = int8DotSupport;
+            pProps->integerDotProduct4x8BitPackedSignedAccelerated   = int8DotSupport;
+
+            {
+                pProps->integerDotProduct8BitMixedSignednessAccelerated         = VK_FALSE;
+                pProps->integerDotProduct4x8BitPackedMixedSignednessAccelerated = VK_FALSE;
+            }
+
+            const VkBool32 int16DotSupport = ((PalProperties().gfxipProperties.flags.support16BitInstructions) &&
+                                              ((GetRuntimeSettings().optOnlyEnableFP16ForGfx9Plus == false) ||
+                                               (PalProperties().gfxLevel >= Pal::GfxIpLevel::GfxIp9))
+                                             ) ? VK_TRUE : VK_FALSE;
+
+            pProps->integerDotProduct16BitUnsignedAccelerated                       = int16DotSupport;
+            pProps->integerDotProduct16BitSignedAccelerated                         = int16DotSupport;
+            pProps->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = int16DotSupport;
+            pProps->integerDotProductAccumulatingSaturating16BitSignedAccelerated   = int16DotSupport;
+
+            pProps->integerDotProduct16BitMixedSignednessAccelerated                                = VK_FALSE;
+            pProps->integerDotProduct32BitUnsignedAccelerated                                       = VK_FALSE;
+            pProps->integerDotProduct32BitSignedAccelerated                                         = VK_FALSE;
+            pProps->integerDotProduct32BitMixedSignednessAccelerated                                = VK_FALSE;
+            pProps->integerDotProduct64BitUnsignedAccelerated                                       = VK_FALSE;
+            pProps->integerDotProduct64BitSignedAccelerated                                         = VK_FALSE;
+            pProps->integerDotProduct64BitMixedSignednessAccelerated                                = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated                  = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating8BitSignedAccelerated                    = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated           = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated          = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated            = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated   = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated          = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated                 = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating32BitSignedAccelerated                   = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated          = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated                 = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating64BitSignedAccelerated                   = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated          = VK_FALSE;
+        }
+        break;

         case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT:
         {
             auto* pProps = static_cast<VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT*>(pNext);
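Aside (not part of the patch): an application probes these features and properties by chaining the KHR structs into the standard query calls. A self-contained usage sketch against the public Vulkan API:

    #include <vulkan/vulkan.h>

    bool SupportsAccelerated8BitDot(VkPhysicalDevice physicalDevice)
    {
        VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR dotFeatures = {};
        dotFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR;

        VkPhysicalDeviceFeatures2 features2 = {};
        features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
        features2.pNext = &dotFeatures;
        vkGetPhysicalDeviceFeatures2(physicalDevice, &features2);

        VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR dotProps = {};
        dotProps.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR;

        VkPhysicalDeviceProperties2 props2 = {};
        props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
        props2.pNext = &dotProps;
        vkGetPhysicalDeviceProperties2(physicalDevice, &props2);

        // The feature says dot-product instructions exist; the property says this
        // particular combination is actually hardware-accelerated.
        return (dotFeatures.shaderIntegerDotProduct == VK_TRUE) &&
               (dotProps.integerDotProduct4x8BitPackedSignedAccelerated == VK_TRUE);
    }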
diff --git a/icd/api/vk_pipeline_layout.cpp b/icd/api/vk_pipeline_layout.cpp
index 18acc405..f73e9a2f 100644
--- a/icd/api/vk_pipeline_layout.cpp
+++ b/icd/api/vk_pipeline_layout.cpp
@@ -141,40 +141,20 @@ VkResult PipelineLayout::ConvertCreateInfo(
     pInfo->userDataLayout.setBindingRegCount = 0;
     pInfo->userDataLayout.setBindingRegBase  = 0;

-    // Reserve an user-data to store the VA of buffer for transform feedback.
-    if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK))
+    if (pDevice->GetRuntimeSettings().enableEarlyCompile)
     {
-        pInfo->userDataLayout.transformFeedbackRegCount = 1;
-        pInfo->userDataRegCount += pInfo->userDataLayout.transformFeedbackRegCount;
-        pPipelineInfo->numUserDataNodes += 1;
+        // Early compile mode implicitly enables the uber-fetch shader and spec constant buffers on the
+        // vertex and fragment shaders, so we need three reserved nodes.
+        pPipelineInfo->numUserDataNodes += 3;
+        pInfo->userDataRegCount         += 6; // Each buffer consumes 2 user data registers now.
     }
-
-    // Reserve one user data nodes for uber-fetch shader.
-    if (pDevice->GetRuntimeSettings().enableUberFetchShader)
+    else if (pDevice->GetRuntimeSettings().enableUberFetchShader)
     {
+        // Reserve one user data node for the uber-fetch shader.
         pPipelineInfo->numUserDataNodes += 1;
+        pInfo->userDataRegCount         += 2;
     }

-    // Calculate the number of bytes needed for push constants
-    uint32_t pushConstantsSizeInBytes = 0;
-
-    for (uint32_t i = 0; i < pIn->pushConstantRangeCount; ++i)
-    {
-        const VkPushConstantRange* pRange = &pIn->pPushConstantRanges[i];
-
-        // Test if this push constant range is active in at least one stage
-        if (pRange->stageFlags != 0)
-        {
-            pushConstantsSizeInBytes = Util::Max(pushConstantsSizeInBytes, pRange->offset + pRange->size);
-        }
-    }
-
-    uint32_t pushConstRegCount = pushConstantsSizeInBytes / sizeof(uint32_t);
-
-    pInfo->userDataLayout.pushConstRegBase  = pInfo->userDataLayout.transformFeedbackRegCount;
-    pInfo->userDataLayout.pushConstRegCount = pushConstRegCount;
-    pInfo->userDataRegCount += pushConstRegCount;
-
     VK_ASSERT(pIn->setLayoutCount <= MaxDescriptorSets);

     // Total number of dynamic descriptors across all descriptor sets
@@ -234,6 +214,17 @@ VkResult PipelineLayout::ConvertCreateInfo(
         // Add the number of user data regs used by this set to the total count for the whole layout
         pInfo->userDataRegCount += pSetUserData->totalRegCount;
+
+        if (pDevice->GetRuntimeSettings().pipelineLayoutMode == PipelineLayoutAngle)
+        {
+            // Force the next set's firstRegOffset to align to AngleDescPattern.
+            if ((i + 1) < Util::ArrayLen(AngleDescPattern::DescriptorSetOffset))
+            {
+                if (pInfo->userDataRegCount < AngleDescPattern::DescriptorSetOffset[i + 1])
+                {
+                    pInfo->userDataRegCount = AngleDescPattern::DescriptorSetOffset[i + 1];
+                }
+            }
+        }
     }

     // Calculate total number of user data regs used for active descriptor set data
@@ -241,6 +232,35 @@ VkResult PipelineLayout::ConvertCreateInfo(

     VK_ASSERT(totalDynDescCount <= MaxDynamicDescriptors);

+    // Calculate the number of bytes needed for push constants
+    uint32_t pushConstantsSizeInBytes = 0;
+
+    for (uint32_t i = 0; i < pIn->pushConstantRangeCount; ++i)
+    {
+        const VkPushConstantRange* pRange = &pIn->pPushConstantRanges[i];
+
+        // Test if this push constant range is active in at least one stage
+        if (pRange->stageFlags != 0)
+        {
+            pushConstantsSizeInBytes = Util::Max(pushConstantsSizeInBytes, pRange->offset + pRange->size);
+        }
+    }
+
+    uint32_t pushConstRegCount = pushConstantsSizeInBytes / sizeof(uint32_t);
+
+    pInfo->userDataLayout.pushConstRegBase  = pInfo->userDataRegCount;
+    pInfo->userDataLayout.pushConstRegCount = pushConstRegCount;
+    pInfo->userDataRegCount += pushConstRegCount;
+
+    // Reserve a user-data register to store the VA of the buffer for transform feedback.
+    if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK))
+    {
+        pInfo->userDataLayout.transformFeedbackRegBase  = pInfo->userDataRegCount;
+        pInfo->userDataLayout.transformFeedbackRegCount = 1;
+        pInfo->userDataRegCount += pInfo->userDataLayout.transformFeedbackRegCount;
+        pPipelineInfo->numUserDataNodes += 1;
+    }
+
     // In case we need an internal vertex buffer table, add nodes required for its entries, and its set pointer.
     pPipelineInfo->numRsrcMapNodes += Pal::MaxVertexBuffers;
@@ -715,7 +735,7 @@ VkResult PipelineLayout::BuildLlpcSetMapping(
 // This function populates the resource mapping node details to the shader-stage specific pipeline info structure.
 VkResult PipelineLayout::BuildLlpcPipelineMapping(
     const uint32_t             stageMask,
-    VbInfo*                    pVbInfo,
+    VbBindingInfo*             pVbInfo,
     void*                      pBuffer,
     bool                       appendFetchShaderCb,
     Vkgc::ResourceMappingData* pResourceMapping
@@ -732,41 +752,51 @@ VkResult PipelineLayout::BuildLlpcPipelineMapping(
     uint32_t mappingNodeCount     = 0; // Number of consumed ResourceMappingNodes (only sub-nodes)
     uint32_t descriptorRangeCount = 0; // Number of consumed StaticResourceValues

-    if (m_info.userDataLayout.transformFeedbackRegCount > 0)
-    {
-        uint32_t xfbStages       = (stageMask & (Vkgc::ShaderStageFragmentBit - 1)) >> 1;
-        uint32_t lastXfbStageBit = Vkgc::ShaderStageVertexBit;
+    constexpr uint32_t InternalCbRegCount = 2;

-        while (xfbStages > 0)
-        {
-            lastXfbStageBit <<= 1;
-            xfbStages       >>= 1;
-        }
+    if (appendFetchShaderCb && pVbInfo != nullptr)
+    {
+        // Append node for uber fetch shader constant buffer
+        auto pFetchShaderCbNode = &pUserDataNodes[userDataNodeCount];
+        pFetchShaderCbNode->node.type             = Vkgc::ResourceMappingNodeType::DescriptorBufferCompact;
+        pFetchShaderCbNode->node.offsetInDwords   = FetchShaderInternalBufferOffset;
+        pFetchShaderCbNode->node.sizeInDwords     = InternalCbRegCount;
+        pFetchShaderCbNode->node.srdRange.set     = Vkgc::InternalDescriptorSetId;
+        pFetchShaderCbNode->node.srdRange.binding = Vkgc::FetchShaderInternalBufferBinding;
+        pFetchShaderCbNode->visibility            = Vkgc::ShaderStageVertexBit;
+
+        userDataNodeCount += 1;
+    }

-        if (lastXfbStageBit != 0)
+    if (m_pDevice->GetRuntimeSettings().enableEarlyCompile)
+    {
+        if (stageMask & Vkgc::ShaderStageVertexBit)
         {
-            auto pTransformFeedbackNode = &pUserDataNodes[userDataNodeCount];
-            pTransformFeedbackNode->node.type           = Vkgc::ResourceMappingNodeType::StreamOutTableVaPtr;
-            pTransformFeedbackNode->node.offsetInDwords = m_info.userDataLayout.transformFeedbackRegBase;
-            pTransformFeedbackNode->node.sizeInDwords   = m_info.userDataLayout.transformFeedbackRegCount;
-            pTransformFeedbackNode->visibility          = lastXfbStageBit;
+            auto pSpecConstVertexCbNode = &pUserDataNodes[userDataNodeCount];
+            pSpecConstVertexCbNode->node.type             = Vkgc::ResourceMappingNodeType::DescriptorBufferCompact;
+            pSpecConstVertexCbNode->node.offsetInDwords   = SpecConstBufferVertexOffset;
+            pSpecConstVertexCbNode->node.sizeInDwords     = InternalCbRegCount;
+            pSpecConstVertexCbNode->node.srdRange.set     = Vkgc::InternalDescriptorSetId;
+            pSpecConstVertexCbNode->node.srdRange.binding = SpecConstVertexInternalBufferBindingId;
+            pSpecConstVertexCbNode->visibility            = Vkgc::ShaderStageVertexBit;

             userDataNodeCount += 1;
         }
-    }

-    // TODO: Build the internal push constant resource mapping
-    if (m_info.userDataLayout.pushConstRegCount > 0)
+        if (stageMask & Vkgc::ShaderStageFragmentBit)
         {
-            auto pPushConstNode = &pUserDataNodes[userDataNodeCount];
-            pPushConstNode->node.type           = Vkgc::ResourceMappingNodeType::PushConst;
-            pPushConstNode->node.offsetInDwords = m_info.userDataLayout.pushConstRegBase;
-            pPushConstNode->node.sizeInDwords   = m_info.userDataLayout.pushConstRegCount;
-            pPushConstNode->node.srdRange.set   = Vkgc::InternalDescriptorSetId;
-            pPushConstNode->visibility          = stageMask;
+            auto pSpecConstFragmentCbNode = &pUserDataNodes[userDataNodeCount];
+            pSpecConstFragmentCbNode->node.type             = Vkgc::ResourceMappingNodeType::DescriptorBufferCompact;
+            pSpecConstFragmentCbNode->node.offsetInDwords   = SpecConstBufferFragmentOffset;
+            pSpecConstFragmentCbNode->node.sizeInDwords     = InternalCbRegCount;
+            pSpecConstFragmentCbNode->node.srdRange.set     = Vkgc::InternalDescriptorSetId;
+            pSpecConstFragmentCbNode->node.srdRange.binding = SpecConstFragmentInternalBufferBindingId;
+            pSpecConstFragmentCbNode->visibility            = Vkgc::ShaderStageFragmentBit;

             userDataNodeCount += 1;
         }
+    }

     // Build descriptor for each set
     for (uint32_t setIndex = 0; (setIndex < m_info.setCount) && (result == VK_SUCCESS); ++setIndex)
     {
@@ -824,6 +854,41 @@ VkResult PipelineLayout::BuildLlpcPipelineMapping(
         }
     }

+    // TODO: Build the internal push constant resource mapping
+    if (m_info.userDataLayout.pushConstRegCount > 0)
+    {
+        auto pPushConstNode = &pUserDataNodes[userDataNodeCount];
+        pPushConstNode->node.type           = Vkgc::ResourceMappingNodeType::PushConst;
+        pPushConstNode->node.offsetInDwords = m_info.userDataLayout.pushConstRegBase;
+        pPushConstNode->node.sizeInDwords   = m_info.userDataLayout.pushConstRegCount;
+        pPushConstNode->node.srdRange.set   = Vkgc::InternalDescriptorSetId;
+        pPushConstNode->visibility          = stageMask;
+
+        userDataNodeCount += 1;
+    }
+
+    if (m_info.userDataLayout.transformFeedbackRegCount > 0)
+    {
+        uint32_t xfbStages       = (stageMask & (Vkgc::ShaderStageFragmentBit - 1)) >> 1;
+        uint32_t lastXfbStageBit = Vkgc::ShaderStageVertexBit;
+
+        while (xfbStages > 0)
+        {
+            lastXfbStageBit <<= 1;
+            xfbStages       >>= 1;
+        }
+
+        if (lastXfbStageBit != 0)
+        {
+            auto pTransformFeedbackNode = &pUserDataNodes[userDataNodeCount];
+            pTransformFeedbackNode->node.type           = Vkgc::ResourceMappingNodeType::StreamOutTableVaPtr;
+            pTransformFeedbackNode->node.offsetInDwords = m_info.userDataLayout.transformFeedbackRegBase;
+            pTransformFeedbackNode->node.sizeInDwords   = m_info.userDataLayout.transformFeedbackRegCount;
+            pTransformFeedbackNode->visibility          = lastXfbStageBit;
+
+            userDataNodeCount += 1;
+        }
+    }

     if ((result == VK_SUCCESS) && (pVbInfo != nullptr))
     {
         // Build the internal vertex buffer table mapping
@@ -836,7 +901,7 @@ VkResult PipelineLayout::BuildLlpcPipelineMapping(
         // Build the table description itself
         const uint32_t srdDwSize = m_pDevice->GetProperties().descriptorSizes.bufferView / sizeof(uint32_t);

-        uint32_t vbTableSize = pVbInfo->bindingInfo.bindingTableSize * srdDwSize;
+        uint32_t vbTableSize = pVbInfo->bindingTableSize * srdDwSize;

         // Add the set pointer node pointing to this table
         auto pVbTblPtrNode = &pUserDataNodes[userDataNodeCount];
@@ -853,32 +918,6 @@ VkResult PipelineLayout::BuildLlpcPipelineMapping(
         {
             result = VK_ERROR_INITIALIZATION_FAILED;
         }
-
-        if (appendFetchShaderCb)
-        {
-            // Append node for uber fetch shader constant buffer
-            constexpr uint32_t FetchShaderCbRegCount = 2;
-            if ((userDataNodeCount + FetchShaderCbRegCount) <=
-                m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties().gfxipProperties.maxUserDataEntries)
-            {
-                auto pFetchShaderCbNode = &pUserDataNodes[userDataNodeCount];
-                pFetchShaderCbNode->node.type             = Vkgc::ResourceMappingNodeType::DescriptorBufferCompact;
-                pFetchShaderCbNode->node.offsetInDwords   = m_info.userDataRegCount + VbTablePtrRegCount;
-                pFetchShaderCbNode->node.sizeInDwords     = FetchShaderCbRegCount;
-                pFetchShaderCbNode->node.srdRange.set     = Vkgc::InternalDescriptorSetId;
-                pFetchShaderCbNode->node.srdRange.binding = Vkgc::FetchShaderInternalBufferBinding;
-                pFetchShaderCbNode->visibility            = Vkgc::ShaderStageVertexBit;
-
-                pVbInfo->uberFetchShaderBuffer.userDataOffset = pFetchShaderCbNode->node.offsetInDwords;
-                userDataNodeCount += 1;
-            }
-            else
-            {
-                VK_NEVER_CALLED();
-                result = VK_ERROR_INITIALIZATION_FAILED;
-            }
-
-        }
     }

     // If you hit these assert, we precomputed an insufficient amount of scratch space during layout creation.
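Aside (not part of the patch): the net effect of this file's changes is a reordering of the user-data register layout, with descriptor sets first, then push constants, then the transform-feedback VA. A condensed, illustrative sketch of the resulting base-offset math (field and parameter names are hypothetical simplifications of the code above):

    #include <cstdint>

    struct UserDataLayout
    {
        uint32_t setBindingRegBase;
        uint32_t pushConstRegBase;
        uint32_t transformFeedbackRegBase;
    };

    UserDataLayout ComputeLayout(const uint32_t* pSetRegCounts, uint32_t setCount,
                                 uint32_t pushConstRegCount, bool hasTransformFeedback)
    {
        UserDataLayout layout   = {};
        uint32_t       regCount = 0;

        // Internal buffers (uber-fetch shader / early-compile spec constants) are
        // reserved before this point in the real code.

        // 1) Descriptor set bindings come first...
        layout.setBindingRegBase = regCount;
        for (uint32_t i = 0; i < setCount; ++i)
        {
            regCount += pSetRegCounts[i];
        }

        // 2) ...then push constants...
        layout.pushConstRegBase = regCount;
        regCount += pushConstRegCount;

        // 3) ...and finally one register for the transform feedback VA.
        if (hasTransformFeedback)
        {
            layout.transformFeedbackRegBase = regCount;
            regCount += 1;
        }

        return layout;
    }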
diff --git a/icd/api/vk_queue.cpp b/icd/api/vk_queue.cpp
index 04461b66..962d0cdc 100644
--- a/icd/api/vk_queue.cpp
+++ b/icd/api/vk_queue.cpp
@@ -1278,7 +1278,6 @@ VkResult Queue::BindSparseEntry(
             Memory* pMemory = Memory::ObjectFromHandle(bind.memory);

             pRealGpuMem = pMemory->PalMemory(resourceDeviceIndex, memoryDeviceIndex);
-
         }

         VK_ASSERT(bind.flags == 0);
@@ -1318,7 +1317,6 @@ VkResult Queue::BindSparseEntry(
                 Memory* pMemory = Memory::ObjectFromHandle(bind.memory);

                 pRealGpuMem = pMemory->PalMemory(resourceDeviceIndex, memoryDeviceIndex);
-
             }

             result = AddVirtualRemapRange(
@@ -1361,7 +1359,6 @@ VkResult Queue::BindSparseEntry(
                 Memory* pMemory = Memory::ObjectFromHandle(bind.memory);

                 pRealGpuMem = pMemory->PalMemory(resourceDeviceIndex, memoryDeviceIndex);
-
             }

             // Get the subresource layout to be able to figure out its offset
diff --git a/icd/api/vk_render_pass.cpp b/icd/api/vk_render_pass.cpp
index 2012d4ef..bc15c2cc 100644
--- a/icd/api/vk_render_pass.cpp
+++ b/icd/api/vk_render_pass.cpp
@@ -634,17 +634,6 @@ static size_t GetRenderPassCreateInfoRequiredMemorySize(
 {
     size_t createInfoSize = 0;

-    createInfoSize += pCreateInfo->attachmentCount * sizeof(AttachmentDescription);
-    createInfoSize += pCreateInfo->subpassCount * sizeof(SubpassDescription);
-    createInfoSize += pCreateInfo->dependencyCount * sizeof(SubpassDependency);
-
-    for (uint32_t subpassIndex = 0; subpassIndex < pCreateInfo->subpassCount; ++subpassIndex)
-    {
-        const auto& subpassDesc = pCreateInfo->pSubpasses[subpassIndex];
-
-        createInfoSize += GetSubpassDescriptionBaseMemorySize(subpassDesc);
-    }
-
     if (renderPassExt.pMultiviewCreateInfo != nullptr)
     {
         createInfoSize += renderPassExt.pMultiviewCreateInfo->correlationMaskCount * sizeof(uint32_t);
@@ -656,6 +645,19 @@ static size_t GetRenderPassCreateInfoRequiredMemorySize(
         createInfoSize += pCreateInfo2->correlatedViewMaskCount * sizeof(uint32_t);
     }

+    createInfoSize += pCreateInfo->attachmentCount * sizeof(AttachmentDescription);
+    // Subpasses need to be aligned
+    createInfoSize = Util::Pow2Align(createInfoSize, alignof(SubpassDescription));
+    createInfoSize += pCreateInfo->subpassCount * sizeof(SubpassDescription);
+    createInfoSize += pCreateInfo->dependencyCount * sizeof(SubpassDependency);
+
+    for (uint32_t subpassIndex = 0; subpassIndex < pCreateInfo->subpassCount; ++subpassIndex)
+    {
+        const auto& subpassDesc = pCreateInfo->pSubpasses[subpassIndex];
+
+        createInfoSize += GetSubpassDescriptionBaseMemorySize(subpassDesc);
+    }
+
     return createInfoSize;
 }
@@ -697,6 +699,8 @@ static void InitRenderPassCreateInfo(
     }
     nextPtr = Util::VoidPtrInc(nextPtr, pCreateInfo->attachmentCount * sizeof(AttachmentDescription));
+    // Struct needs to be aligned
+    nextPtr = Util::VoidPtrAlign(nextPtr, alignof(SubpassDescription));
     VK_ASSERT(Util::VoidPtrDiff(nextPtr, pMemoryPtr) <= memorySize);

     outRenderPassInfo->subpassCount = pCreateInfo->subpassCount;
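Aside (not part of the patch): the render pass change fixes a classic placement-allocation pitfall; when several arrays share one allocated block, each sub-array's offset must be aligned for its element type, both when sizing the block and when carving pointers out of it. A minimal runnable sketch of the sizing half with illustrative types:

    #include <cstddef>
    #include <cstdint>

    struct AttachmentDescription { uint64_t data[4]; };
    struct SubpassDescription    { uint64_t data[8]; };

    size_t ComputeBlockSize(uint32_t attachmentCount, uint32_t subpassCount)
    {
        size_t size = 0;

        size += attachmentCount * sizeof(AttachmentDescription);

        // Round the running size up before the next array, mirroring
        // Util::Pow2Align(createInfoSize, alignof(SubpassDescription)) above.
        const size_t align = alignof(SubpassDescription); // must be a power of two
        size = (size + align - 1) & ~(align - 1);

        size += subpassCount * sizeof(SubpassDescription);

        return size;
    }

The same rounding must be re-applied when walking the block with pointers, which is what the added Util::VoidPtrAlign call in InitRenderPassCreateInfo does.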
diff --git a/icd/api/vk_swapchain.cpp b/icd/api/vk_swapchain.cpp
index 5286892f..34cc902f 100644
--- a/icd/api/vk_swapchain.cpp
+++ b/icd/api/vk_swapchain.cpp
@@ -335,11 +335,17 @@ VkResult SwapChain::Create(
         &palResult);
     VK_ASSERT(palResult == Pal::Result::Success);

-    size_t queueFamilyArraySize = sizeof(uint32_t*) * pCreateInfo->queueFamilyIndexCount;
-    size_t imageArraySize       = sizeof(VkImage) * swapImageCount;
-    size_t memoryArraySize      = sizeof(VkDeviceMemory) * swapImageCount;
-    size_t cmdBufArraySize      = sizeof(Pal::ICmdBuffer*) * swapImageCount;
-    size_t objSize = vkSwapChainSize +
+    properties.queueFamilyIndexCount = ((pCreateInfo->imageSharingMode == VK_SHARING_MODE_CONCURRENT) ?
+                                        pCreateInfo->queueFamilyIndexCount : 0u);
+
+    // If imageSharingMode is VK_SHARING_MODE_CONCURRENT, queueFamilyIndexCount must be greater than 1.
+    VK_ASSERT((pCreateInfo->imageSharingMode != VK_SHARING_MODE_CONCURRENT) || (properties.queueFamilyIndexCount > 1));
+
+    const size_t queueFamilyArraySize = sizeof(uint32_t*) * properties.queueFamilyIndexCount;
+    const size_t imageArraySize       = sizeof(VkImage) * swapImageCount;
+    const size_t memoryArraySize      = sizeof(VkDeviceMemory) * swapImageCount;
+    const size_t cmdBufArraySize      = sizeof(Pal::ICmdBuffer*) * swapImageCount;
+    const size_t objSize = vkSwapChainSize +
         queueFamilyArraySize +
         palSwapChainSize +
         imageArraySize +
@@ -402,26 +408,27 @@ VkResult SwapChain::Create(
             &properties.imageCreateInfo);
     }

+    // Store creation info for image barrier policy
+    properties.usage       = pCreateInfo->imageUsage;
+    properties.sharingMode = pCreateInfo->imageSharingMode;
+    properties.format      = pCreateInfo->imageFormat;
+
     properties.images = static_cast<VkImage*>(Util::VoidPtrInc(pMemory, offset));
     offset += imageArraySize;

     properties.imageMemory = static_cast<VkDeviceMemory*>(Util::VoidPtrInc(pMemory, offset));
     offset += memoryArraySize;

-    properties.pQueueFamilyIndices = static_cast<uint32_t*>(Util::VoidPtrInc(pMemory, offset));
-    offset += queueFamilyArraySize;
+    // memcpy queue family indices
+    if (queueFamilyArraySize > 0u)
+    {
+        properties.pQueueFamilyIndices = static_cast<uint32_t*>(Util::VoidPtrInc(pMemory, offset));
+        offset += queueFamilyArraySize;
+        memcpy(properties.pQueueFamilyIndices, pCreateInfo->pQueueFamilyIndices, queueFamilyArraySize);
+    }

     VK_ASSERT(offset == objSize);

-    // Store creation info for image barrier policy
-    properties.usage                 = pCreateInfo->imageUsage;
-    properties.queueFamilyIndexCount = pCreateInfo->queueFamilyIndexCount;
-    properties.sharingMode           = pCreateInfo->imageSharingMode;
-    properties.format                = pCreateInfo->imageFormat;
-
-    // memcpy queue family indices
-    memcpy(properties.pQueueFamilyIndices, pCreateInfo->pQueueFamilyIndices, queueFamilyArraySize);
-
     for (properties.imageCount = 0; properties.imageCount < swapImageCount; ++properties.imageCount)
     {
         if (result == VK_SUCCESS)
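Aside (not part of the patch): the swapchain change reflects a Vulkan rule the driver can now rely on; pQueueFamilyIndices is only meaningful for VK_SHARING_MODE_CONCURRENT, which in turn requires at least two queue families. An app-side sketch of filling the create info accordingly (usage sketch against the public API; error handling elided):

    #include <vulkan/vulkan.h>

    VkSwapchainCreateInfoKHR MakeSwapchainInfo(
        VkSurfaceKHR    surface,
        const uint32_t* pQueueFamilies, // e.g. { graphicsFamily, presentFamily }
        uint32_t        familyCount)
    {
        VkSwapchainCreateInfoKHR info = {};
        info.sType   = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR;
        info.surface = surface;

        if (familyCount > 1)
        {
            // Concurrent mode requires two or more distinct queue families.
            info.imageSharingMode      = VK_SHARING_MODE_CONCURRENT;
            info.queueFamilyIndexCount = familyCount;
            info.pQueueFamilyIndices   = pQueueFamilies;
        }
        else
        {
            // Exclusive mode ignores the queue family list entirely.
            info.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE;
        }

        return info;
    }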
diff --git a/icd/make/importdefs b/icd/make/importdefs
index f699ea87..e268502f 100644
--- a/icd/make/importdefs
+++ b/icd/make/importdefs
@@ -26,7 +26,7 @@
 # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface
 # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. It must
 # be updated on each PAL promotion after handling all of the interface changes described in palLib.h.
-ICD_PAL_CLIENT_MAJOR_VERSION = 675
+ICD_PAL_CLIENT_MAJOR_VERSION = 678
 ICD_PAL_CLIENT_MINOR_VERSION = 0

 # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. It describes
diff --git a/icd/res/ver.h b/icd/res/ver.h
index 17a4a6d0..5b3c52e1 100644
--- a/icd/res/ver.h
+++ b/icd/res/ver.h
@@ -36,7 +36,7 @@
 #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0"

 // Bump up after each promotion to mainline
-#define VULKAN_ICD_BUILD_VERSION 199
+#define VULKAN_ICD_BUILD_VERSION 201

 // String version is needed with leading zeros and extra termination (unicode)
 #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION
diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp
index eaef6388..0c2195cb 100644
--- a/icd/settings/settings.cpp
+++ b/icd/settings/settings.cpp
@@ -376,13 +376,36 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings(
             // WWZ performs worse with DCC forced on, so just let the PAL heuristics decide what's best for now.
             if (pInfo->gfxLevel >= Pal::GfxIpLevel::GfxIp10_1)
             {
-                m_settings.forceEnableDcc = ForceDccDefault;
+                m_settings.forceEnableDcc = (ForceDccFor64BppShaderStorage |
+                                             ForceDccForNonColorAttachmentShaderStorage |
+                                             ForceDccForColorAttachments |
+                                             ForceDccFor2DShaderStorage);
+            }

             // Mall no alloc setting gives a ~0.82% gain
             if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp10_3)
             {
-                m_settings.mallNoAllocSsrPolicy = MallNoAllocSsrAsSnsr;
+                m_settings.csWaveSize = 64;
+                m_settings.fsWaveSize = 64;
+
+                if (pInfo->revision == Pal::AsicRevision::Navi21)
+                {
+                    m_settings.forceEnableDcc = (ForceDccFor64BppShaderStorage |
+                                                 ForceDccFor32BppShaderStorage |
+                                                 ForceDccForColorAttachments |
+                                                 ForceDccFor3DShaderStorage);
+
+                    m_settings.mallNoAllocCtPolicy = MallNoAllocCtAsSnsr;
+                }
+
+                if (pInfo->revision == Pal::AsicRevision::Navi23)
+                {
+                    m_settings.forceEnableDcc = (ForceDccFor32BppShaderStorage |
+                                                 ForceDccForNonColorAttachmentShaderStorage |
+                                                 ForceDccForColorAttachments |
+                                                 ForceDccFor3DShaderStorage);
+                }
             }

             m_settings.implicitExternalSynchronization = false;
diff --git a/icd/settings/settings_xgl.json b/icd/settings/settings_xgl.json
index e4feca85..cbfcb3fc 100644
--- a/icd/settings/settings_xgl.json
+++ b/icd/settings/settings_xgl.json
@@ -651,6 +651,34 @@
             "Scope": "Driver",
             "Type": "enum"
         },
+        {
+            "Name": "PipelineLayoutMode",
+            "Description": "Control the pipeline descriptor layout for early compile",
+            "Tags": [
+                "Pipeline Options"
+            ],
+            "Defaults": {
+                "Default": "PipelineLayoutDefault"
+            },
+            "ValidValues": {
+                "IsEnum": true,
+                "Values": [
+                    {
+                        "Name": "PipelineLayoutDefault",
+                        "Value": 0,
+                        "Description": "Build descriptor layout with the default layout"
+                    },
+                    {
+                        "Name": "PipelineLayoutAngle",
+                        "Value": 1,
+                        "Description": "Build descriptor layout compatible with ANGLE-based apps"
+                    }
+                ],
+                "Name": "PipelineLayoutMode"
+            },
+            "Scope": "Driver",
+            "Type": "enum"
+        },
         {
             "Name": "PipelineBinningMode",
             "Description": "Specifies whether to override binning setting for pipeline.",
@@ -972,6 +1000,45 @@
             "Scope": "Driver",
             "Type": "bool"
         },
+        {
+            "Name": "EnableEarlyCompile",
+            "Description": "Enable pipeline early compile.",
+            "Tags": [
+                "SPIRV Options"
+            ],
+            "Defaults": {
+                "Default": false
+            },
+            "Scope": "Driver",
+            "Type": "bool"
+        },
+        {
+            "Name": "DeferCompileOptimizedPipeline",
+            "Description": "Whether to enable deferred compilation of optimized pipelines; only affects the EnableUberFetchShader and EnableEarlyCompile options",
+            "Tags": [
+                "SPIRV Options"
+            ],
+            "Defaults": {
+                "Default": false
+            },
+            "Scope": "Driver",
+            "Type": "bool"
+        },
+        {
+            "Name": "DeferCompileThreadCount",
+            "Description": "Assistant thread count for deferred compile operations; if the count is greater than the internal limit, the real thread count will be clamped to that limit.",
+            "Tags": [
+                "SPIRV Options"
+            ],
+            "Defaults": {
+                "Default": "0xFFFFFFFF"
+            },
+            "Scope": "Driver",
+            "Type": "uint32",
+            "Flags": {
+                "IsHex": true
+            }
+        },
         {
             "Name": "DisablePerCompFetch",
             "Description": "Disable per component fetch in uber fetch shader.",
@@ -1154,6 +1221,11 @@
                         "Name": "ShaderReplaceShaderISA",
                         "Value": 4,
                         "Description": "Enable replace ISA shader in the pipeline, For every pipeline in the ShaderReplacementPipelineHashs, would find if there is a file named 0xAAA_replace.txt under ShaderReplacementDir, would be loaded for the replacement the replace shader look like this *----offset: ISACODE----* 848:0x7E120303 1480:0x7E1E0303 2592:0x7E0E030E"
+                    },
+                    {
+                        "Name": "ShaderReplaceShaderHashPipelineBinaryHash",
+                        "Value": 5,
+                        "Description": "Enable both shader hash based shader replacement and pipeline binary hash based pipeline binary replacement. In cases where both a pipeline and one or more of its shaders are replaced, the replacement shader will take precedence and will potentially change the hash of the pipeline. The pipeline will only be replaced if the pipeline replacement file has the new hash."
                     }
                 ],
                 "Name": "ShaderReplaceMode"
@@ -2090,6 +2162,18 @@
             "Type": "bool",
             "Name": "DumpDuplicatePipelines"
         },
+        {
+            "Description": "Re-routes all compute work to a universal queue internally.",
+            "Tags": [
+                "General"
+            ],
+            "Defaults": {
+                "Default": false
+            },
+            "Type": "bool",
+            "Name": "UseUniversalAsComputeQueue",
+            "Scope": "Driver"
+        },
         {
             "Name": "DbgBarrierPostCmdEnable",
             "Description": "Triggers a CmdBarrier call after any command in the given mask. The barrier behavior is controlled by the other DbgBarrierPost* settings in this category. Requires VK_ENABLE_DEBUG_BARRIERS=1 to take effect. 0x8FFFFFFF: All commands (heavyweight option)",
@@ -3310,18 +3394,6 @@
             "Type": "uint32",
             "Name": "TransferGranularityDmaOverride"
         },
-        {
-            "Description": "If this option is enabled, the following changes are introduced: (1) Images with VK_IMAGE_TILING_OPTIMAL may not be bound to memory types with VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT. (2) For each memory type that was previously host visible and usable with optimal images, an another memory type is created that is not host visible and can be used with optimally tiled images.",
-            "Tags": [
-                "Memory"
-            ],
-            "Defaults": {
-                "Default": false
-            },
-            "Scope": "Driver",
-            "Type": "bool",
-            "Name": "AddHostInvisibleMemoryTypesForOptimalImages"
-        },
         {
             "Description": "If this option is enabled, the driver returns an extra image memory requirement. The amount of memory is determined by memoryPaddingFactorForImageMemoryRequirements. This can be used while capturing GFXR traces and can be helpful for DCC tuning",
             "Tags": [
@@ -3622,6 +3694,18 @@
             "Type": "bool",
             "Name": "OverrideHeapGartCacheableToUswc"
         },
+        {
+            "Description": "For APUs, set the local and local invisible heap sizes to 0 to force use of system memory.",
+            "Tags": [
+                "Memory"
+            ],
+            "Defaults": {
+                "Default": false
+            },
+            "Scope": "Driver",
+            "Type": "bool",
+            "Name": "ForceUMA"
+        },
", "Tags": [ @@ -4402,30 +4486,6 @@ "Scope": "Driver", "Type": "enum" }, - { - "Description": "Enable async compile for shader module and pipelines.", - "Tags": [ - "Optimization" - ], - "Defaults": { - "Default": false - }, - "Scope": "Driver", - "Type": "bool", - "Name": "EnableAsyncCompile" - }, - { - "Description": "Enable partial pipeline compile.", - "Tags": [ - "Optimization" - ], - "Defaults": { - "Default": false - }, - "Scope": "Driver", - "Type": "bool", - "Name": "EnablePartialPipelineCompile" - }, { "Description": "Specifies the maximum threshold in bytes for linear transfer commands to use CP DMA, which have less overhead than CS/Gfx copies, but also less throughput for large copies.", "Tags": [ @@ -5423,6 +5483,18 @@ }, "Type": "bool", "Scope": "Driver" + }, + { + "Name": "EnableDumbTransitionSync", + "Description": "Enable synchronizing cache by adding dumb transition in the barrier", + "Tags": [ + "General" + ], + "Defaults": { + "Default": true + }, + "Type": "bool", + "Scope": "Driver" } ] } \ No newline at end of file diff --git a/tools/cache_creator/CMakeLists.txt b/tools/cache_creator/CMakeLists.txt index 37d0b3f5..d3f3ec7f 100644 --- a/tools/cache_creator/CMakeLists.txt +++ b/tools/cache_creator/CMakeLists.txt @@ -75,8 +75,8 @@ target_link_libraries(cache-info PRIVATE cache_creator_lib) # Build cache creator tools whenever we build XGL. add_dependencies(xgl cache-creator cache-info) -if(XGL_BUILD_LIT) - message(STATUS "Building cache creator LIT tests") +if(XGL_BUILD_TESTS OR XGL_BUILD_LIT) + message(STATUS "Building cache creator tests") set(CACHE_CREATOR_TOOLS_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}") add_subdirectory(test "${CMAKE_CURRENT_BINARY_DIR}/test/cache-creator/lit") add_subdirectory(unittests "${CMAKE_CURRENT_BINARY_DIR}/test/cache-creator/unittests")