diff --git a/cmake/XglOptions.cmake b/cmake/XglOptions.cmake index d67546db..614a76b0 100644 --- a/cmake/XglOptions.cmake +++ b/cmake/XglOptions.cmake @@ -42,6 +42,9 @@ macro(xgl_options) option(XGL_BUILD_NAVI23 "Build open source vulkan for Navi23" ON) + option(XGL_BUILD_TESTS "Build all tests?" OFF) + + # Deprecated, use XGL_BUILD_TESTS instead. option(XGL_BUILD_LIT "Build with Lit test?" OFF) option(XGL_BUILD_CACHE_CREATOR "Build cache-creator tools?" OFF) diff --git a/cmake/XglOverrides.cmake b/cmake/XglOverrides.cmake index 8e41acd9..caddce50 100644 --- a/cmake/XglOverrides.cmake +++ b/cmake/XglOverrides.cmake @@ -185,9 +185,13 @@ macro(xgl_overrides_vkgc) set(LLPC_CLIENT_INTERFACE_MAJOR_VERSION ${ICD_LLPC_CLIENT_MAJOR_VERSION} CACHE STRING "${PROJECT_NAME} override." FORCE) if(ICD_BUILD_LLPC) + set(LLPC_BUILD_TESTS ${XGL_BUILD_TESTS} CACHE BOOL "${PROJECT_NAME} override." FORCE) set(LLPC_BUILD_LIT ${XGL_BUILD_LIT} CACHE BOOL "${PROJECT_NAME} override." FORCE) + if(XGL_BUILD_LIT) + message(DEPRECATION "XGL_BUILD_LIT is deprecated, use XGL_BUILD_TESTS instead") + endif() set(LLPC_BUILD_NAVI12 ${XGL_BUILD_NAVI12} CACHE BOOL "${PROJECT_NAME} override." FORCE) set(LLPC_BUILD_NAVI22 ${XGL_BUILD_NAVI22} CACHE BOOL "${PROJECT_NAME} override." FORCE) diff --git a/icd/CMakeLists.txt b/icd/CMakeLists.txt index b3a19396..3d6d61d9 100644 --- a/icd/CMakeLists.txt +++ b/icd/CMakeLists.txt @@ -136,9 +136,6 @@ target_sources(xgl PRIVATE api/vk_descriptor_update_template.cpp api/appopt/barrier_filter_layer.cpp api/appopt/strange_brigade_layer.cpp - api/appopt/async_layer.cpp - api/appopt/async_shader_module.cpp - api/appopt/async_partial_pipeline.cpp api/appopt/g_shader_profile.cpp api/render_state_cache.cpp api/renderpass/renderpass_builder.cpp diff --git a/icd/Loader/LunarG/Lnx/amd-icd.json b/icd/Loader/LunarG/Lnx/amd-icd.json index 025cf513..5a668070 100644 --- a/icd/Loader/LunarG/Lnx/amd-icd.json +++ b/icd/Loader/LunarG/Lnx/amd-icd.json @@ -2,13 +2,13 @@ "file_format_version": "1.0.0", "ICD": { "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.2.188" + "api_version": "1.2.191" }, "layer": { "name": "VK_LAYER_AMD_switchable_graphics_@ISABITS@", "type": "GLOBAL", "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.2.188", + "api_version": "1.2.191", "implementation_version": "1", "description": "AMD switchable graphics layer", "functions": { diff --git a/icd/api/appopt/async_layer.cpp b/icd/api/appopt/async_layer.cpp deleted file mode 100644 index 11cb3ee1..00000000 --- a/icd/api/appopt/async_layer.cpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_layer.cpp -* @brief Implementation of async compiler layer. -*********************************************************************************************************************** -*/ -#include "async_layer.h" -#include "async_shader_module.h" -#include "async_partial_pipeline.h" - -#include "include/vk_device.h" -#include "include/vk_shader.h" -#include "include/vk_graphics_pipeline.h" -#include "include/vk_compute_pipeline.h" -#include "palListImpl.h" - -namespace vk -{ - -namespace entry -{ - -namespace async -{ - -// ===================================================================================================================== -VKAPI_ATTR VkResult VKAPI_CALL vkCreateShaderModule( - VkDevice device, - const VkShaderModuleCreateInfo* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkShaderModule* pShaderModule) -{ - Device* pDevice = ApiDevice::ObjectFromHandle(device); - const VkAllocationCallbacks* pAllocCB = pAllocator ? pAllocator : pDevice->VkInstance()->GetAllocCallbacks(); - return vk::async::ShaderModule::Create(pDevice, pCreateInfo, pAllocCB, pShaderModule); -} - -// ===================================================================================================================== -VKAPI_ATTR void VKAPI_CALL vkDestroyShaderModule( - VkDevice device, - VkShaderModule shaderModule, - const VkAllocationCallbacks* pAllocator) -{ - if (shaderModule != VK_NULL_HANDLE) - { - Device* pDevice = ApiDevice::ObjectFromHandle(device); - AsyncLayer* pAsyncLayer = pDevice->GetAsyncLayer(); - const VkAllocationCallbacks* pAllocCB = pAllocator ? 
pAllocator : pDevice->VkInstance()->GetAllocCallbacks(); - - pAsyncLayer->SyncAll(); - vk::async::ShaderModule::ObjectFromHandle(shaderModule)->Destroy(pDevice, pAllocCB); - } -} - -// ===================================================================================================================== -VKAPI_ATTR VkResult VKAPI_CALL vkCreateGraphicsPipelines( - VkDevice device, - VkPipelineCache pipelineCache, - uint32_t createInfoCount, - const VkGraphicsPipelineCreateInfo* pCreateInfos, - const VkAllocationCallbacks* pAllocator, - VkPipeline* pPipelines) -{ - Device* pDevice = ApiDevice::ObjectFromHandle(device); - AsyncLayer* pAsyncLayer = pDevice->GetAsyncLayer(); - VkResult result = VK_SUCCESS; - - for (uint32_t i = 0; (i < createInfoCount) && (result == VK_SUCCESS); ++i) - { - VkGraphicsPipelineCreateInfo createInfo = pCreateInfos[i]; - VkPipelineShaderStageCreateInfo stages[ShaderStage::ShaderStageGfxCount]; - VK_ASSERT(createInfo.stageCount <= ShaderStage::ShaderStageGfxCount); - for (uint32_t stage = 0; stage < createInfo.stageCount; ++stage) - { - stages[stage] = createInfo.pStages[stage]; - vk::async::ShaderModule* pModule = vk::async::ShaderModule::ObjectFromHandle(stages[stage].module); - stages[stage].module = pModule->GetNextLayerModule(); - } - createInfo.pStages = stages; - result = ASYNC_CALL_NEXT_LAYER(vkCreateGraphicsPipelines)(device, - pipelineCache, - 1, - &createInfo, - pAllocator, - pPipelines + i); - } - - return result; -} - -// ===================================================================================================================== -VKAPI_ATTR VkResult VKAPI_CALL vkCreateComputePipelines( - VkDevice device, - VkPipelineCache pipelineCache, - uint32_t createInfoCount, - const VkComputePipelineCreateInfo* pCreateInfos, - const VkAllocationCallbacks* pAllocator, - VkPipeline* pPipelines) -{ - Device* pDevice = ApiDevice::ObjectFromHandle(device); - AsyncLayer* pAsyncLayer = pDevice->GetAsyncLayer(); - VkResult result = VK_SUCCESS; - - for (uint32_t i = 0; (i < createInfoCount) && (result == VK_SUCCESS); ++i) - { - VkComputePipelineCreateInfo createInfo = pCreateInfos[i]; - VK_ASSERT(createInfo.stage.module != VK_NULL_HANDLE); - vk::async::ShaderModule* pModule = vk::async::ShaderModule::ObjectFromHandle(createInfo.stage.module); - createInfo.stage.module = pModule->GetNextLayerModule(); - result = ASYNC_CALL_NEXT_LAYER(vkCreateComputePipelines)(device, - pipelineCache, - 1, - &createInfo, - pAllocator, - pPipelines + i); - } - - return result; -} - -} // namespace async - -} // namespace entry - -// ===================================================================================================================== -AsyncLayer::AsyncLayer(Device* pDevice) - : - m_pDevice(pDevice), - m_pModuleTaskThreads(), - m_pPipelineTaskThreads() -{ - Util::SystemInfo sysInfo = {}; - Util::QuerySystemInfo(&sysInfo); - - for (uint32_t i = 0; i < MaxTaskType; ++i) - { - m_taskId[i] = 0; - m_activeThreadCount[i] = Util::Min(MaxThreads, sysInfo.cpuLogicalCoreCount / 2); - } - for (uint32_t i = 0; i < m_activeThreadCount[0]; ++i) - { - m_pModuleTaskThreads[i] = VK_PLACEMENT_NEW(m_moduleTaskThreadBuffer[i]) - async::TaskThread(this, pDevice->VkInstance()->Allocator()); - m_pModuleTaskThreads[i]->Begin(); - - m_pPipelineTaskThreads[i] = VK_PLACEMENT_NEW(m_pipelineTaskThreadBuffer[i]) - async::TaskThread(this, pDevice->VkInstance()->Allocator()); - m_pPipelineTaskThreads[i]->Begin(); - } -} - -// 
===================================================================================================================== -AsyncLayer::~AsyncLayer() -{ - for (uint32_t i = 0; i < m_activeThreadCount[0]; ++i) - { - m_pModuleTaskThreads[i]->SetStop(); - m_pModuleTaskThreads[i]->Join(); - Util::Destructor(m_pModuleTaskThreads[i]); - m_pModuleTaskThreads[i] = nullptr; - - m_pPipelineTaskThreads[i]->SetStop(); - m_pPipelineTaskThreads[i]->Join(); - Util::Destructor(m_pPipelineTaskThreads[i]); - m_pPipelineTaskThreads[i] = nullptr; - } -} - -// ===================================================================================================================== -void AsyncLayer::SyncAll() -{ - for (uint32_t i = 0; i < m_activeThreadCount[0]; ++i) - { - m_pModuleTaskThreads[i]->SyncAll(); - m_pPipelineTaskThreads[i]->SyncAll(); - } -} - -// ===================================================================================================================== -void AsyncLayer::OverrideDispatchTable( - DispatchTable* pDispatchTable) -{ - // Save current device dispatch table to use as the next layer. - m_nextLayer = *pDispatchTable; - - ASYNC_OVERRIDE_ENTRY(vkCreateShaderModule); - ASYNC_OVERRIDE_ENTRY(vkDestroyShaderModule); - ASYNC_OVERRIDE_ENTRY(vkCreateGraphicsPipelines); - ASYNC_OVERRIDE_ENTRY(vkCreateComputePipelines); -} - -} // namespace vk diff --git a/icd/api/appopt/async_layer.h b/icd/api/appopt/async_layer.h deleted file mode 100644 index c7b3bf6e..00000000 --- a/icd/api/appopt/async_layer.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_layer.h -* @brief Declaration of async compiler layer -*********************************************************************************************************************** -*/ - -#ifndef __ASYNC_LAYER_H__ -#define __ASYNC_LAYER_H__ - -#pragma once - -#include "opt_layer.h" -#include "async_task_thread.h" - -namespace vk -{ - -class Device; -class AsyncLayer; -class PalAllocator; - -namespace async { class ShaderModule; class PartialPipeline; } - -// Represents the shader module async compile info -struct ShaderModuleTask -{ - VkShaderModuleCreateInfo info; // Shader module create info - async::ShaderModule* pObj; // Output shader module object -}; - -// Represents the pipeline async compile info -struct PartialPipelineTask -{ - VkShaderModule shaderModuleHandle; // Shader module handle - async::PartialPipeline* pObj; // Output shader module object -}; - -// Thread task type -enum TaskType : uint32_t -{ - ShaderModuleTaskType = 0, - PartialPipelineTaskType, - MaxTaskType, -}; - -// ===================================================================================================================== -// Class that specifies dispatch table override behavior for async compiler layers -class AsyncLayer final : public OptLayer -{ -public: - AsyncLayer(Device* pDevice); - virtual ~AsyncLayer(); - - virtual void OverrideDispatchTable(DispatchTable* pDispatchTable) override; - - Device* GetDevice() { return m_pDevice; } - - void* GetTaskThread(TaskType type) - { - VK_ASSERT(type < MaxTaskType); - if (type == ShaderModuleTaskType) - { - return (m_activeThreadCount[type] > 0) ? - m_pModuleTaskThreads[(m_taskId[type]++) % m_activeThreadCount[type]] : - nullptr; - } - else - { - return (m_activeThreadCount[type] > 0) ? 
- m_pPipelineTaskThreads[(m_taskId[type]++) % m_activeThreadCount[type]] : - nullptr; - } - } - - void SyncAll(); - -protected: - static constexpr uint32_t MaxThreads = 8; // Max thread count for shader module compile - Device* m_pDevice; // Vulkan Device object - async::TaskThread* m_pModuleTaskThreads[MaxThreads]; // Async compiler threads - async::TaskThread* m_pPipelineTaskThreads[MaxThreads]; // Async compiler threads - uint32_t m_taskId[MaxTaskType]; // Hint to select compile thread - uint32_t m_activeThreadCount[MaxTaskType]; // Active thread count - // Internal buffer for m_taskThreadBuffer - uint8_t m_moduleTaskThreadBuffer[MaxThreads][sizeof(async::TaskThread)]; - uint8_t m_pipelineTaskThreadBuffer[MaxThreads] - [sizeof(async::TaskThread)]; - -private: - PAL_DISALLOW_COPY_AND_ASSIGN(AsyncLayer); -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#define ASYNC_OVERRIDE_ALIAS(entry_name, func_name) \ - pDispatchTable->OverrideEntryPoints()->entry_name = vk::entry::async::func_name - -#define ASYNC_OVERRIDE_ENTRY(entry_name) ASYNC_OVERRIDE_ALIAS(entry_name, entry_name) -// Helper function to call the next layer's function by name -#define ASYNC_CALL_NEXT_LAYER(entry_name) \ - pAsyncLayer->GetNextLayer()->GetEntryPoints().entry_name - -} // namespace vk - -#endif /* __OPT_LAYER_H__ */ diff --git a/icd/api/appopt/async_partial_pipeline.cpp b/icd/api/appopt/async_partial_pipeline.cpp deleted file mode 100644 index d1f952a6..00000000 --- a/icd/api/appopt/async_partial_pipeline.cpp +++ /dev/null @@ -1,397 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_partial_pipeline.cpp -* @brief Implementation of class async::PartialPipeline -*********************************************************************************************************************** -*/ -#include "async_layer.h" -#include "async_partial_pipeline.h" - -#include "include/vk_device.h" -#include "include/vk_shader.h" -#include "palListImpl.h" - -#include - -namespace vk -{ - -namespace async -{ -// ===================================================================================================================== -PartialPipeline::PartialPipeline( - const VkAllocationCallbacks* pAllocator) - : - m_pAllocator(pAllocator) -{ -} - -// ===================================================================================================================== -// Creates async partial pipeline object -PartialPipeline* PartialPipeline::Create( - Device* pDevice, - const VkAllocationCallbacks* pAllocator) -{ - const size_t objSize = sizeof(PartialPipeline); - void* pMemory = pAllocator->pfnAllocation( - pAllocator->pUserData, - objSize, - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - - if (pMemory == nullptr) - { - return nullptr; - } - - VK_PLACEMENT_NEW(pMemory) PartialPipeline(pAllocator); - - return static_cast(pMemory); -} - -// ===================================================================================================================== -// Destory async partial pipeline object -VkResult PartialPipeline::Destroy() -{ - m_pAllocator->pfnFree(m_pAllocator->pUserData, this); - - return VK_SUCCESS; -} - -// ===================================================================================================================== -// Builds partial pipeline in async mode -void PartialPipeline::AsyncBuildPartialPipeline( - AsyncLayer* pAsyncLayer, - VkShaderModule asyncShaderModule) -{ - auto pTaskThread = reinterpret_cast*> - (pAsyncLayer->GetTaskThread(PartialPipelineTaskType)); - if (pTaskThread != nullptr) - { - PartialPipelineTask task = {}; - - task.shaderModuleHandle = asyncShaderModule; - task.pObj = this; - pTaskThread->AddTask(&task); - } - else - { - Destroy(); - } -} - -static const uint32_t OffsetStrideInDwords = 12; -// ===================================================================================================================== -// Creat ResourceMappingNode from module data -void PartialPipeline::CreatePipelineLayoutFromModuleData( - AsyncLayer* pAsyncLayer, - Vkgc::ShaderModuleEntryData* pShaderModuleEntryData, - const Vkgc::ResourceMappingRootNode** ppResourceMappingNode, - uint32_t* pMappingNodeCount) -{ - const Vkgc::ResourceNodeData* pResourceNodeData = pShaderModuleEntryData->pResNodeDatas; - uint32_t resNodeDataCount = pShaderModuleEntryData->resNodeDataCount; - uint32_t pushConstSize = pShaderModuleEntryData->pushConstSize; - uint32_t setCount = 0; - uint32_t set = 0; - - if (resNodeDataCount > 0) - { - set = pResourceNodeData[0].set; - setCount = 1; - for (uint32_t i = 1; i < resNodeDataCount; ++i) - { - if (set != pResourceNodeData[i].set) - { - set = pResourceNodeData[i].set; - ++setCount; - } - } - } - - // 1 reperents push constant - uint32_t totalNodes = pushConstSize != 0 ? 
resNodeDataCount + setCount + 1 : resNodeDataCount + setCount; - - auto pSets = static_cast(m_pAllocator->pfnAllocation( - m_pAllocator->pUserData, - totalNodes * sizeof(Vkgc::ResourceMappingRootNode), - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)); - auto pNodes = reinterpret_cast(pSets + setCount + 1); - uint32_t topLevelOffset = 0; - - for (uint32_t i = 0; i < resNodeDataCount; ++i) - { - pNodes[i].type = pResourceNodeData[i].type; - pNodes[i].sizeInDwords = OffsetStrideInDwords * pResourceNodeData[i].arraySize; - pNodes[i].offsetInDwords = pResourceNodeData[i].binding * OffsetStrideInDwords; - pNodes[i].srdRange.set = pResourceNodeData[i].set; - pNodes[i].srdRange.binding = pResourceNodeData[i].binding; - if ((i == 0) || (set != pNodes[i].srdRange.set)) - { - set = pNodes[i].srdRange.set; - pSets[set].node.tablePtr.pNext = &pNodes[i]; - pSets[set].node.type = Vkgc::ResourceMappingNodeType::DescriptorTableVaPtr; - pSets[set].node.sizeInDwords = 1; - pSets[set].node.offsetInDwords = topLevelOffset; - pSets[set].visibility = UINT_MAX; - topLevelOffset += pSets[set].node.sizeInDwords; - } - ++pSets[pResourceNodeData[i].set].node.tablePtr.nodeCount; - } - - // Add UseDynamic options for below cases: - // 1. Force all uniform buffer are dynamic buffer in auto layout pipeline layout - // 2. Force all storage buffer are dynamic buffer in auto layout pipeline layout - - if (pushConstSize) - { - // Add a node for push consts at the end of root descriptor list. - pSets[resNodeDataCount + setCount].node.type = Vkgc::ResourceMappingNodeType::PushConst; - pSets[resNodeDataCount + setCount].node.sizeInDwords = pushConstSize; - pSets[resNodeDataCount + setCount].node.offsetInDwords = topLevelOffset; - } - - *pMappingNodeCount = setCount; - *ppResourceMappingNode = pSets; -} - -// ===================================================================================================================== -// Creat color target from module data -void PartialPipeline::CreateColorTargetFromModuleData( - Vkgc::ShaderModuleDataEx* pShaderModuleDataEx, - Vkgc::ColorTarget* pTarget) -{ - for (uint32_t i = 0; i < pShaderModuleDataEx->extra.fsOutInfoCount; ++i) - { - uint32_t location = pShaderModuleDataEx->extra.pFsOutInfos[i].location; - uint32_t componentCount = pShaderModuleDataEx->extra.pFsOutInfos[i].componentCount; - Vkgc::BasicType basicType = pShaderModuleDataEx->extra.pFsOutInfos[i].basicType; - - VK_ASSERT(location < Vkgc::MaxColorTargets); - pTarget[location].channelWriteMask = (1U << componentCount) - 1; - // Further optimization is app profile for color format according to fsOutInfos. 
- switch (basicType) - { - case Vkgc::BasicType::Float: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R32_SFLOAT, - VK_FORMAT_R32G32_SFLOAT, - VK_FORMAT_R32G32B32_SFLOAT, - VK_FORMAT_R32G32B32A32_SFLOAT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Double: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R64_SFLOAT, - VK_FORMAT_R64G64_SFLOAT, - VK_FORMAT_R64G64B64_SFLOAT, - VK_FORMAT_R64G64B64A64_SFLOAT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Int: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R32_SINT, - VK_FORMAT_R32G32_SINT, - VK_FORMAT_R32G32B32_SINT, - VK_FORMAT_R32G32B32A32_SINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Uint: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R32_UINT, - VK_FORMAT_R32G32_UINT, - VK_FORMAT_R32G32B32_UINT, - VK_FORMAT_R32G32B32A32_UINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Int64: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R64_SINT, - VK_FORMAT_R64G64_SINT, - VK_FORMAT_R64G64B64_SINT, - VK_FORMAT_R64G64B64A64_SINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Uint64: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R64_UINT, - VK_FORMAT_R64G64_UINT, - VK_FORMAT_R64G64B64_UINT, - VK_FORMAT_R64G64B64A64_UINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Float16: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R16_SFLOAT, - VK_FORMAT_R16G16_SFLOAT, - VK_FORMAT_R16G16B16_SFLOAT, - VK_FORMAT_R16G16B16A16_SFLOAT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Int16: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R16_SINT, - VK_FORMAT_R16G16_SINT, - VK_FORMAT_R16G16B16_SINT, - VK_FORMAT_R16G16B16A16_SINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Uint16: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R16_UINT, - VK_FORMAT_R16G16_UINT, - VK_FORMAT_R16G16B16_UINT, - VK_FORMAT_R16G16B16A16_UINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Int8: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R8_SINT, - VK_FORMAT_R8G8_SINT, - VK_FORMAT_R8G8B8_SINT, - VK_FORMAT_R8G8B8A8_SINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - case Vkgc::BasicType::Uint8: - { - static const VkFormat formatTable[] = - { - VK_FORMAT_R8_UINT, - VK_FORMAT_R8G8_UINT, - VK_FORMAT_R8G8B8_UINT, - VK_FORMAT_R8G8B8A8_UINT, - }; - pTarget[location].format = formatTable[componentCount - 1]; - break; - } - default: - break; - } - } -} - -// ===================================================================================================================== -// Creates partial pipeline with partial pipeline opt enabled. 
-void PartialPipeline::Execute( - AsyncLayer* pAsyncLayer, - PartialPipelineTask* pTask) -{ - Device* pDevice = pAsyncLayer->GetDevice(); - PipelineCompilerType compilerType = pDevice->GetCompiler(0)->GetShaderCacheType(); - if (compilerType != PipelineCompilerTypeLlpc) - { - return; - } - - vk::ShaderModule* pShaderModule = vk::ShaderModule::ObjectFromHandle(pTask->shaderModuleHandle); - void* pShaderModuleData = pShaderModule->GetShaderData(compilerType); - auto pShaderModuleDataEx = reinterpret_cast(pShaderModuleData); - Vkgc::ShaderModuleEntryData* pShaderModuleEntryData = nullptr; - Vkgc::ColorTarget pColorTarget[Vkgc::MaxColorTargets] = {}; - if ((pShaderModuleDataEx->extra.entryCount == 1) && - (pShaderModuleDataEx->extra.entryDatas[0].stage == Vkgc::ShaderStageCompute)) - { - pShaderModuleEntryData = &pShaderModuleDataEx->extra.entryDatas[0]; - } - else - { - for (uint32_t i = 0; i < pShaderModuleDataEx->extra.entryCount; ++i) - { - if (pShaderModuleDataEx->extra.entryDatas[i].stage == Vkgc::ShaderStageFragment) - { - CreateColorTargetFromModuleData(pShaderModuleDataEx, pColorTarget); - if (pColorTarget[0].format == VK_FORMAT_UNDEFINED) - { - break; - } - - pShaderModuleEntryData = &pShaderModuleDataEx->extra.entryDatas[i]; - break; - } - } - } - if (pShaderModuleEntryData != nullptr) - { - for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) - { - const Vkgc::ResourceMappingRootNode* pResourceMappingNode = nullptr; - uint32_t mappingNodeCount = 0; - CreatePipelineLayoutFromModuleData(pAsyncLayer, pShaderModuleEntryData, &pResourceMappingNode, &mappingNodeCount); - - auto result = pDevice->GetCompiler(deviceIdx)->CreatePartialPipelineBinary(deviceIdx, - pShaderModuleData, pShaderModuleEntryData, pResourceMappingNode, mappingNodeCount, pColorTarget); - VK_ASSERT(result == VK_SUCCESS); - m_pAllocator->pfnFree(m_pAllocator->pUserData, (void*)pResourceMappingNode); - } - } - Destroy(); -} - -} // namespace async - -} // namespace vk diff --git a/icd/api/appopt/async_partial_pipeline.h b/icd/api/appopt/async_partial_pipeline.h deleted file mode 100644 index b235d673..00000000 --- a/icd/api/appopt/async_partial_pipeline.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_partial_pipeline.h -* @brief Header file of class async::PartialPipeline -*********************************************************************************************************************** -*/ - -#ifndef __ASYNC_PARTIAL_PIPELINE_H__ -#define __ASYNC_PARTIAL_PIPELINE_H__ - -#pragma once - -#include "include/vk_dispatch.h" -#include "vkgcDefs.h" - -namespace vk -{ - -namespace async -{ - -// ===================================================================================================================== -// Implementation of a async shader module -class PartialPipeline -{ -public: - static PartialPipeline* Create( - Device* pDevice, - const VkAllocationCallbacks* pAllocator); - - VkResult Destroy(); - - void CreatePipelineLayoutFromModuleData( - AsyncLayer* pAsyncLayer, - Vkgc::ShaderModuleEntryData* pShaderModuleEntryData, - const Vkgc::ResourceMappingRootNode** ppResourceMappingNode, - uint32_t* pMappingNodeCount); - - void CreateColorTargetFromModuleData( - Vkgc::ShaderModuleDataEx* pShaderModuleDataEx, - Vkgc::ColorTarget* pTarget); - - void Execute(AsyncLayer* pAsyncLayer, PartialPipelineTask* pTask); - - void AsyncBuildPartialPipeline(AsyncLayer* pAsyncLayer, VkShaderModule asyncShaderModule); - -protected: - PartialPipeline(const VkAllocationCallbacks* pAllocator); - -private: - const VkAllocationCallbacks* m_pAllocator; - - PAL_DISALLOW_COPY_AND_ASSIGN(PartialPipeline); -}; - -} // namespace async - -} // namespace vk - -#endif diff --git a/icd/api/appopt/async_shader_module.cpp b/icd/api/appopt/async_shader_module.cpp deleted file mode 100644 index 2e802662..00000000 --- a/icd/api/appopt/async_shader_module.cpp +++ /dev/null @@ -1,178 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_shader_module.cpp -* @brief Implementation of class async::ShaderModule -*********************************************************************************************************************** -*/ -#include "async_layer.h" -#include "async_shader_module.h" -#include "async_partial_pipeline.h" - -#include "include/vk_device.h" -#include "include/vk_shader.h" -#include "palListImpl.h" - -namespace vk -{ - -namespace async -{ - -// ===================================================================================================================== -ShaderModule::ShaderModule( - VkShaderModule immedModule) - : - m_immedModule(immedModule), - m_asyncModule(VK_NULL_HANDLE) -{ -} - -// ===================================================================================================================== -// Creates async shdaer module object -VkResult ShaderModule::Create( - Device* pDevice, - const VkShaderModuleCreateInfo* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkShaderModule* pShaderModule) -{ - AsyncLayer* pAsyncLayer = pDevice->GetAsyncLayer(); - VkShaderModule immedModule = VK_NULL_HANDLE; - - VK_ASSERT(pCreateInfo->flags == 0); - - // Build shader module with immedidate mode - auto result = ASYNC_CALL_NEXT_LAYER(vkCreateShaderModule)( - VkDevice(ApiDevice::FromObject(pDevice)), - pCreateInfo, - pAllocator, - &immedModule); - - if (result == VK_SUCCESS) - { - const size_t objSize = sizeof(ShaderModule); - void* pMemory = pAllocator->pfnAllocation( - pAllocator->pUserData, - objSize, - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - - if (pMemory == nullptr) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - VK_PLACEMENT_NEW(pMemory) ShaderModule(immedModule); - ShaderModule* pShaderModuleObj = static_cast(pMemory); - *pShaderModule = ShaderModule::HandleFromVoidPointer(pMemory); - - // Build shader module in async mode - pShaderModuleObj->AsyncBuildShaderModule(pDevice->GetAsyncLayer()); - } - - return result; -} - -// ===================================================================================================================== -// Destory async shader module object -VkResult ShaderModule::Destroy( - Device* pDevice, - const VkAllocationCallbacks* pAllocator) -{ - AsyncLayer* pAsyncLayer = pDevice->GetAsyncLayer(); - if (m_asyncModule == VK_NULL_HANDLE) - { - pAsyncLayer->SyncAll(); - } - - if (m_immedModule != VK_NULL_HANDLE) - { - ASYNC_CALL_NEXT_LAYER(vkDestroyShaderModule)( - VkDevice(ApiDevice::FromObject(pDevice)), - m_immedModule, - pAllocator); - } - - if (m_asyncModule != VK_NULL_HANDLE) - { - ASYNC_CALL_NEXT_LAYER(vkDestroyShaderModule)( - VkDevice(ApiDevice::FromObject(pDevice)), - m_asyncModule, - pAllocator); - } - - return VK_SUCCESS; -} - -// ===================================================================================================================== -// Builds shader module in async mode -void ShaderModule::AsyncBuildShaderModule( - AsyncLayer* pAsyncLayer) -{ - auto pTaskThread = reinterpret_cast*> - (pAsyncLayer->GetTaskThread(ShaderModuleTaskType)); - if (pTaskThread != nullptr) - { - vk::ShaderModule* pNextLayerModule = vk::ShaderModule::ObjectFromHandle(m_immedModule); - - ShaderModuleTask task = {}; - task.info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; 
- task.info.pCode = reinterpret_cast(pNextLayerModule->GetCode()); - task.info.codeSize = pNextLayerModule->GetCodeSize(); - task.info.flags = VK_SHADER_MODULE_ENABLE_OPT_BIT; - task.pObj = this; - pTaskThread->AddTask(&task); - } -} - -// ===================================================================================================================== -// Creates shader module with shader module opt enabled. -void ShaderModule::Execute( - AsyncLayer* pAsyncLayer, - ShaderModuleTask* pTask) -{ - Device* pDevice = pAsyncLayer->GetDevice(); - ASYNC_CALL_NEXT_LAYER(vkCreateShaderModule)(VkDevice(ApiDevice::FromObject(pDevice)), - &pTask->info, - nullptr, - &m_asyncModule); - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - if (settings.enablePartialPipelineCompile) - { - const VkAllocationCallbacks* pAllocCB = pDevice->VkInstance()->GetAllocCallbacks(); - auto pPartialPipelineObj = vk::async::PartialPipeline::Create(pDevice, pAllocCB); - - if ((pPartialPipelineObj != nullptr) && (m_asyncModule != VK_NULL_HANDLE)) - { - // Build partial pipeline in async mode - pPartialPipelineObj->AsyncBuildPartialPipeline(pDevice->GetAsyncLayer(), m_asyncModule); - } - } -} - -} // namespace async - -} // namespace vk diff --git a/icd/api/appopt/async_shader_module.h b/icd/api/appopt/async_shader_module.h deleted file mode 100644 index cca54ec6..00000000 --- a/icd/api/appopt/async_shader_module.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_shader_module.h -* @brief Header file of class async::ShaderModule -*********************************************************************************************************************** -*/ - -#ifndef __ASYNC_SHADER_MODULE_H__ -#define __ASYNC_SHADER_MODULE_H__ - -#pragma once - -#include "include/vk_dispatch.h" - -namespace vk -{ - -namespace async -{ - -// ===================================================================================================================== -// Implementation of a async shader module -class ShaderModule final : public vk::NonDispatchable -{ -public: - static VkResult Create( - Device* pDevice, - const VkShaderModuleCreateInfo* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkShaderModule* pShaderModule); - - VkResult Destroy( - Device* pDevice, - const VkAllocationCallbacks* pAllocator); - - VkShaderModule GetNextLayerModule() - { - return (m_asyncModule == VK_NULL_HANDLE) ? m_immedModule : m_asyncModule; - } - - void Execute(AsyncLayer* pAsyncLayer, ShaderModuleTask* pTask); - - void AsyncBuildShaderModule(AsyncLayer* pAsyncLayer); - -protected: - ShaderModule(VkShaderModule immedModule); - - VkShaderModule m_immedModule; // Shader module handle which is compiled with immedidate mode - VkShaderModule m_asyncModule; // Shader module handle which is compiled with async mode - -private: - PAL_DISALLOW_COPY_AND_ASSIGN(ShaderModule); -}; - -} // namespace async - -} // namespace vk - -#endif diff --git a/icd/api/appopt/async_task_thread.h b/icd/api/appopt/async_task_thread.h deleted file mode 100644 index 9a11da2f..00000000 --- a/icd/api/appopt/async_task_thread.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file async_task_thread.h -* @brief Implementation of template class async::TaskThread -*********************************************************************************************************************** -*/ -#ifndef __ASYNC_TASK_THREAD_H__ -#define __ASYNC_TASK_THREAD_H__ - -#pragma once - -#include "include/vk_alloccb.h" -#include "palThread.h" -#include "palMutex.h" -#include "palList.h" -#include "palEvent.h" - -namespace vk -{ - -class AsyncLayer; -class PalAllocator; - -namespace async -{ - -// ===================================================================================================================== -// Represents the general thread for async shader/pipeline compiler. -template -class TaskThread final : public Util::Thread -{ -public: - TaskThread(AsyncLayer* pAsyncLayer, PalAllocator* pAllocator) - : - m_pAsyncLayer(pAsyncLayer), - m_taskList(pAllocator), - m_stop(false) - { - Util::EventCreateFlags flags = {}; - flags.manualReset = false; - flags.initiallySignaled = false; - m_event.Init(flags); - } - - // Starts a new thread which starts by running function TaskThreadFunc. - void Begin() - { - Util::Thread::Begin(ThreadFunc, this); - } - - // Adds task to list. - void AddTask(Task* pTask) - { - Util::MutexAuto mutexAuto(&m_lock); - m_taskList.PushBack(*pTask); - m_event.Set(); - } - - // Set flag stop and trig event. - void SetStop() - { - m_event.Set(); - m_stop = true; - } - - // Returns until all tasks are executed. - void SyncAll() - { - m_event.Set(); - while (m_taskList.Begin() != m_taskList.End()) - { - Util::YieldThread(); - } - } - -protected: - // Async thread function - static void ThreadFunc( - void* pParam) - { - auto pThis = reinterpret_cast*>(pParam); - pThis->TaskThreadFunc(); - } - - // The implementation of async thread function - void TaskThreadFunc() - { - while (m_stop == false) - { - // Waits for new signal. - m_event.Wait(1.0f); - - Task task; - while (FetchTask(&task)) - { - task.pObj->Execute(m_pAsyncLayer, &task); - } - } - } - - // Fetches task in list, return false if task list is empty. - bool FetchTask(Task* pTask) - { - Util::MutexAuto mutexAuto(&m_lock); - auto beginIt = m_taskList.Begin(); - if (beginIt != m_taskList.End()) - { - *pTask = *(beginIt.Get()); - m_taskList.Erase(&beginIt); - return true; - } - return false; - } - - AsyncLayer* m_pAsyncLayer; // Async compiler layer object - Util::List m_taskList; // Async compile task list - volatile bool m_stop; // Flag to stop the thread - Util::Mutex m_lock; // Lock for accessing task list - Util::Event m_event; // Event to notify async thread -}; - -} // namespace async - -} // namespace vk - -#endif diff --git a/icd/api/barrier_policy.cpp b/icd/api/barrier_policy.cpp index 499d2d54..3e54c63e 100644 --- a/icd/api/barrier_policy.cpp +++ b/icd/api/barrier_policy.cpp @@ -293,7 +293,9 @@ static uint32_t ImageLayoutToCacheMask(VkImageLayout imageLayout) // Converts source access flags to source cache coherency flags. static uint32_t SrcAccessToCacheMask(AccessFlags accessMask, VkImageLayout imageLayout) { - uint32_t cacheMask = 0; + uint32_t cacheMask = (((imageLayout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR) || + (imageLayout == VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR)) ? 
+ Pal::CoherPresent : 0); if (accessMask & VK_ACCESS_SHADER_WRITE_BIT) { @@ -357,7 +359,9 @@ static uint32_t SrcAccessToCacheMask(AccessFlags accessMask, VkImageLayout image // Converts destination access flags to destination cache coherency flags. static uint32_t DstAccessToCacheMask(AccessFlags accessMask, VkImageLayout imageLayout) { - uint32_t cacheMask = 0; + uint32_t cacheMask = (((imageLayout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR) || + (imageLayout == VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR)) ? + Pal::CoherPresent : 0); if (accessMask & VK_ACCESS_INDIRECT_COMMAND_READ_BIT) { @@ -688,7 +692,8 @@ void DeviceBarrierPolicy::InitQueueFamilyPolicies( | Pal::CoherResolve | Pal::CoherClear | Pal::CoherIndirectArgs - | Pal::CoherIndexData; + | Pal::CoherIndexData + | Pal::CoherPresent; policy.supportedLayoutUsageMask |= Pal::LayoutColorTarget | Pal::LayoutDepthStencilTarget | Pal::LayoutShaderRead @@ -982,8 +987,8 @@ void ImageBarrierPolicy::InitImageCachePolicy( { // Initialize supported cache masks based on the usage flags provided. // Always allow CPU and memory reads/writes. - uint32_t supportedOutputCacheMask = Pal::CoherCpu | Pal::CoherMemory; - uint32_t supportedInputCacheMask = Pal::CoherCpu | Pal::CoherMemory; + uint32_t supportedOutputCacheMask = Pal::CoherCpu | Pal::CoherMemory | Pal::CoherPresent; + uint32_t supportedInputCacheMask = Pal::CoherCpu | Pal::CoherMemory | Pal::CoherPresent; if (usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) { diff --git a/icd/api/compiler_solution_llpc.cpp b/icd/api/compiler_solution_llpc.cpp index e4a2052d..0cadb2fa 100644 --- a/icd/api/compiler_solution_llpc.cpp +++ b/icd/api/compiler_solution_llpc.cpp @@ -688,6 +688,12 @@ VkResult CompilerSolutionLlpc::CreateLlpcCompiler( pOptionBuffer += optionLength; bufSize -= optionLength; + if ((m_gfxIp.major == 10) && (m_gfxIp.minor >= 3)) + { + // Enable flat scratch for gfx10.3+ + llpcOptions[numOptions++] = "-amdgpu-enable-flat-scratch"; + } + if (settings.llpcOptions[0] != '\0') { const char* pOptions = &settings.llpcOptions[0]; diff --git a/icd/api/graphics_pipeline_common.cpp b/icd/api/graphics_pipeline_common.cpp index 65cb1dad..0f188321 100644 --- a/icd/api/graphics_pipeline_common.cpp +++ b/icd/api/graphics_pipeline_common.cpp @@ -1298,7 +1298,7 @@ static void BuildExecutablePipelineState( void GraphicsPipelineCommon::BuildPipelineObjectCreateInfo( const Device* pDevice, const VkGraphicsPipelineCreateInfo* pIn, - const VbInfo* pVbInfo, + const VbBindingInfo* pVbInfo, const GraphicsPipelineBinaryInfo* pBinInfo, const PipelineLayout* pPipelineLayout, GraphicsPipelineObjectCreateInfo* pInfo) @@ -1311,7 +1311,7 @@ void GraphicsPipelineCommon::BuildPipelineObjectCreateInfo( pIn->pDynamicState ); - BuildVertexInputInterfaceState(pDevice, pIn, &pVbInfo->bindingInfo, dynamicStateFlags, false, pInfo); + BuildVertexInputInterfaceState(pDevice, pIn, pVbInfo, dynamicStateFlags, false, pInfo); BuildPreRasterizationShaderState(pDevice, pIn, @@ -1354,7 +1354,8 @@ VkResult GraphicsPipelineCommon::BuildPipelineBinaryCreateInfo( const PipelineLayout* pPipelineLayout, GraphicsPipelineBinaryCreateInfo* pBinInfo, GraphicsPipelineShaderStageInfo* pShaderInfo, - VbInfo* pVbInfo, + VbBindingInfo* pVbInfo, + PipelineInternalBufferInfo* pInternalBufferInfo, ShaderModuleHandle* pTempModules) { VkResult result = BuildShaderStageInfo(pDevice, @@ -1371,7 +1372,7 @@ VkResult GraphicsPipelineCommon::BuildPipelineBinaryCreateInfo( if (result == VK_SUCCESS) { result = pDevice->GetCompiler(DefaultDeviceIndex)->ConvertGraphicsPipelineInfo( - 
pDevice, pCreateInfo, pShaderInfo, pPipelineLayout, pBinInfo, pVbInfo);
+            pDevice, pCreateInfo, pShaderInfo, pPipelineLayout, pBinInfo, pVbInfo, pInternalBufferInfo);
     }
 
     return result;
diff --git a/icd/api/include/compiler_solution.h b/icd/api/include/compiler_solution.h
index c4d80dd2..24ce5434 100644
--- a/icd/api/include/compiler_solution.h
+++ b/icd/api/include/compiler_solution.h
@@ -59,6 +59,7 @@ enum FreeCompilerBinary : uint32_t
 // Represents the result of PipelineCompiler::BuildShaderModule
 struct ShaderModuleHandle
 {
+    uint32_t* pRefCount;
     void*     pLlpcShaderModule; // Shader module handle from LLPC
 };
 
diff --git a/icd/api/include/defer_compile_thread.h b/icd/api/include/defer_compile_thread.h
new file mode 100644
index 00000000..95924d06
--- /dev/null
+++ b/icd/api/include/defer_compile_thread.h
@@ -0,0 +1,231 @@
+/*
+ ***********************************************************************************************************************
+ *
+ * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************************************************************************/
+/**
+***********************************************************************************************************************
+* @file  defer_compile_thread.h
+* @brief Implementation of class DeferCompileThread & DeferCompileManager
+***********************************************************************************************************************
+*/
+#ifndef __DEFER_COMPILE_THREAD_H__
+#define __DEFER_COMPILE_THREAD_H__
+
+#pragma once
+
+#include "include/vk_alloccb.h"
+#include "palThread.h"
+#include "palMutex.h"
+#include "palList.h"
+#include "palEvent.h"
+
+namespace vk
+{
+
+class DeferCompileManager;
+class PalAllocator;
+
+struct DeferredCompileWorkload
+{
+    void*        pPayloads;
+    void         (*Execute)(void*); // Function pointer to the call used to execute the workload
+    Util::Event* pEvent;
+};
+
+// =====================================================================================================================
+// Represents a worker thread used for deferred shader/pipeline compilation.
+class DeferCompileThread final : public Util::Thread
+{
+public:
+    DeferCompileThread(PalAllocator* pAllocator)
+        :
+        m_taskList(pAllocator),
+        m_stop(false)
+    {
+        Util::EventCreateFlags flags = {};
+        flags.manualReset       = true;
+        flags.initiallySignaled = false;
+        m_event.Init(flags);
+    }
+
+    // Starts a new thread that begins executing TaskThreadFunc.
+    void Begin()
+    {
+        Util::Thread::Begin(ThreadFunc, this);
+    }
+
+    // Adds a task to the list and signals the worker thread.
+    void AddTask(DeferredCompileWorkload* pTask)
+    {
+        Util::MutexAuto mutexAuto(&m_lock);
+        m_taskList.PushBack(*pTask);
+        m_event.Set();
+    }
+
+    // Sets the stop flag and triggers the event.
+    void SetStop()
+    {
+        m_event.Set();
+        m_stop = true;
+    }
+
+    // Blocks until all queued tasks have been executed.
+    void SyncAll()
+    {
+        m_event.Set();
+        while (m_taskList.Begin() != m_taskList.End())
+        {
+            Util::YieldThread();
+        }
+    }
+
+protected:
+    // Static thread entry point; forwards to TaskThreadFunc().
+    static void ThreadFunc(
+        void* pParam)
+    {
+        auto pThis = reinterpret_cast<DeferCompileThread*>(pParam);
+        pThis->TaskThreadFunc();
+    }
+
+    // Main loop of the deferred-compile worker thread.
+    void TaskThreadFunc()
+    {
+        while (m_stop == false)
+        {
+            // Waits for new signal.
+            m_event.Wait(1.0f);
+            m_event.Reset();
+
+            DeferredCompileWorkload task;
+            while (FetchTask(&task))
+            {
+                task.Execute(task.pPayloads);
+                if (task.pEvent != nullptr)
+                {
+                    task.pEvent->Set();
+                }
+            }
+        }
+    }
+
+    // Fetches a task from the list; returns false if the task list is empty.
+    bool FetchTask(DeferredCompileWorkload* pTask)
+    {
+        Util::MutexAuto mutexAuto(&m_lock);
+        auto beginIt = m_taskList.Begin();
+        if (beginIt != m_taskList.End())
+        {
+            *pTask = *(beginIt.Get());
+            m_taskList.Erase(&beginIt);
+            return true;
+        }
+        return false;
+    }
+
+    Util::List<DeferredCompileWorkload, PalAllocator> m_taskList; // Deferred compile task list
+    volatile bool m_stop;  // Flag to stop the thread
+    Util::Mutex   m_lock;  // Lock for accessing task list
+    Util::Event   m_event; // Event to notify the worker thread
+};
+
+// =====================================================================================================================
+// Class that manages the DeferCompileThread instances.
+class DeferCompileManager
+{
+public:
+    DeferCompileManager()
+        :
+        m_pCompileThreads{},
+        m_taskId(0),
+        m_activeThreadCount(0)
+    {
+    }
+
+    void Init(uint32_t threadCount, PalAllocator* pAllocator)
+    {
+        if (threadCount == 0)
+        {
+            m_activeThreadCount = 0;
+        }
+        else if (threadCount == UINT32_MAX)
+        {
+            Util::SystemInfo sysInfo = {};
+            Util::QuerySystemInfo(&sysInfo);
+            m_activeThreadCount = Util::Min(MaxThreads, sysInfo.cpuLogicalCoreCount / 2);
+        }
+        else
+        {
+            m_activeThreadCount = Util::Min(MaxThreads, threadCount);
+        }
+
+        for (uint32_t i = 0; i < m_activeThreadCount; ++i)
+        {
+            m_pCompileThreads[i] = VK_PLACEMENT_NEW(m_compileThreadBuffer[i])
+                DeferCompileThread(pAllocator);
+            m_pCompileThreads[i]->Begin();
+        }
+    }
+
+    ~DeferCompileManager()
+    {
+        for (uint32_t i = 0; i < m_activeThreadCount; ++i)
+        {
+            m_pCompileThreads[i]->SetStop();
+            m_pCompileThreads[i]->Join();
+            Util::Destructor(m_pCompileThreads[i]);
+            m_pCompileThreads[i] = nullptr;
+        }
+        m_activeThreadCount = 0;
+    }
+
+    void SyncAll()
+    {
+        for (uint32_t i = 0; i < m_activeThreadCount; ++i)
+        {
+            m_pCompileThreads[i]->SyncAll();
+        }
+    }
+
+    DeferCompileThread* GetCompileThread()
+    {
+        return (m_activeThreadCount > 0) ?
+ m_pCompileThreads[(m_taskId++) % m_activeThreadCount] : + nullptr; + } + +protected: + static constexpr uint32_t MaxThreads = 8; // Max thread count for shader module compile + DeferCompileThread* m_pCompileThreads[MaxThreads]; // Async compiler threads + uint32_t m_taskId; // Hint to select compile thread + uint32_t m_activeThreadCount; // Active thread count + + // Internal buffer for m_pCompileThreads + uint8_t m_compileThreadBuffer[MaxThreads][sizeof(DeferCompileThread)]; +private: + PAL_DISALLOW_COPY_AND_ASSIGN(DeferCompileManager); +}; + +} // namespace vk + +#endif diff --git a/icd/api/include/graphics_pipeline_common.h b/icd/api/include/graphics_pipeline_common.h index d0640a96..d0a94aeb 100644 --- a/icd/api/include/graphics_pipeline_common.h +++ b/icd/api/include/graphics_pipeline_common.h @@ -65,19 +65,19 @@ struct VbBindingInfo } bindings[Pal::MaxVertexBuffers]; }; -struct UberFetchShaderBufferInfo +constexpr uint32_t MaxPipelineInternalBufferCount = 4; +struct InternalBufferEntry { - bool requirePerIntanceFetch; - bool requirePerCompFetch; uint32_t userDataOffset; - uint32_t bufferSize; - uint32_t bufferData[Vkgc::MaxFetchShaderInternalBufferSize]; + uint32_t bufferOffset; }; -struct VbInfo +struct PipelineInternalBufferInfo { - VbBindingInfo bindingInfo; - UberFetchShaderBufferInfo uberFetchShaderBuffer; + uint32_t internalBufferCount; + InternalBufferEntry internalBufferEntries[MaxPipelineInternalBufferCount]; + uint32_t dataSize; + void* pData; }; // ===================================================================================================================== @@ -100,6 +100,7 @@ struct GraphicsPipelineObjectImmedInfo Pal::VrsRateParams vrsRateParams; Pal::DepthStencilStateCreateInfo depthStencilCreateInfo; bool rasterizerDiscardEnable; + bool checkDeferCompilePipeline; // Static pipeline parameter token values. These can be used to efficiently redundancy check static pipeline // state programming during pipeline binds. 
@@ -200,14 +201,15 @@ class GraphicsPipelineCommon : public Pipeline
         const PipelineLayout*             pPipelineLayout,
         GraphicsPipelineBinaryCreateInfo* pBinInfo,
         GraphicsPipelineShaderStageInfo*  pShaderInfo,
-        VbInfo*                           pVbInfo,
+        VbBindingInfo*                    pVbInfo,
+        PipelineInternalBufferInfo*       pInternalBufferInfo,
         ShaderModuleHandle*               pTempModules);
 
     // Convert API information into internal create info used to create internal pipeline object
     static void BuildPipelineObjectCreateInfo(
         const Device*                       pDevice,
         const VkGraphicsPipelineCreateInfo* pIn,
-        const VbInfo*                       pVbInfo,
+        const VbBindingInfo*                pVbInfo,
         const GraphicsPipelineBinaryInfo*   pBinInfo,
         const PipelineLayout*               pPipelineLayout,
         GraphicsPipelineObjectCreateInfo*   pObjInfo);
diff --git a/icd/api/include/khronos/GLSL.ext.AMD.h b/icd/api/include/khronos/GLSL.ext.AMD.h
index efe849fe..297a6f98 100644
--- a/icd/api/include/khronos/GLSL.ext.AMD.h
+++ b/icd/api/include/khronos/GLSL.ext.AMD.h
@@ -84,6 +84,35 @@ enum GcnShaderAMD {
     GcnShaderCountAMD
 };
 
+#if VKI_TEXEL_BUFFER_EXPLICIT_FORMAT_SUPPORT
+// SPV_AMD_shader_texel_buffer_explicit_format
+static const Capability CapabilityImageBufferReadWriteWithFormatAMD = static_cast<Capability>(5024);
+
+static const Op OpImageBufferReadAMD  = static_cast<Op>(5025);
+static const Op OpImageBufferWriteAMD = static_cast<Op>(5026);
+
+static const ImageFormat ImageFormatRgb32fAMD          = static_cast<ImageFormat>(5028);
+static const ImageFormat ImageFormatRgb32uiAMD         = static_cast<ImageFormat>(5029);
+static const ImageFormat ImageFormatRgb32iAMD          = static_cast<ImageFormat>(5030);
+static const ImageFormat ImageFormatR10G11B11fAMD      = static_cast<ImageFormat>(5031);
+static const ImageFormat ImageFormatRgb10A2SnormAMD    = static_cast<ImageFormat>(5032);
+static const ImageFormat ImageFormatRgb10A2iAMD        = static_cast<ImageFormat>(5033);
+static const ImageFormat ImageFormatRgba16SscaledAMD   = static_cast<ImageFormat>(5034);
+static const ImageFormat ImageFormatRgb10A2SscaledAMD  = static_cast<ImageFormat>(5035);
+static const ImageFormat ImageFormatRg16SscaledAMD     = static_cast<ImageFormat>(5036);
+static const ImageFormat ImageFormatRgba8SscaledAMD    = static_cast<ImageFormat>(5037);
+static const ImageFormat ImageFormatRg8SscaledAMD      = static_cast<ImageFormat>(5038);
+static const ImageFormat ImageFormatR16SscaledAMD      = static_cast<ImageFormat>(5039);
+static const ImageFormat ImageFormatR8SscaledAMD       = static_cast<ImageFormat>(5040);
+static const ImageFormat ImageFormatRgba16UscaledAMD   = static_cast<ImageFormat>(5041);
+static const ImageFormat ImageFormatRgb10A2UscaledAMD  = static_cast<ImageFormat>(5042);
+static const ImageFormat ImageFormatRg16UscaledAMD     = static_cast<ImageFormat>(5043);
+static const ImageFormat ImageFormatRgba8USscaledAMD   = static_cast<ImageFormat>(5044);
+static const ImageFormat ImageFormatRg8UscaledAMD      = static_cast<ImageFormat>(5045);
+static const ImageFormat ImageFormatR16UscaledAMD      = static_cast<ImageFormat>(5046);
+static const ImageFormat ImageFormatR8UscaledAMD       = static_cast<ImageFormat>(5047);
+#endif
+
 #if VKI_NORMALIZED_TRIG_FUNCTIONS
 // SPV_AMD_normalized_trig - Internal Use Only
 static const Capability CapabilityTrigNormalizedAMD = static_cast<Capability>(5058);
diff --git a/icd/api/include/khronos/sdk-1.2/vulkan_beta.h b/icd/api/include/khronos/sdk-1.2/vulkan_beta.h
index e2337adf..f67fab36 100644
--- a/icd/api/include/khronos/sdk-1.2/vulkan_beta.h
+++ b/icd/api/include/khronos/sdk-1.2/vulkan_beta.h
@@ -90,7 +90,6 @@ typedef enum VkVideoCodingControlFlagBitsKHR {
 typedef VkFlags VkVideoCodingControlFlagsKHR;
 
 typedef enum VkVideoCodingQualityPresetFlagBitsKHR {
-    VK_VIDEO_CODING_QUALITY_PRESET_DEFAULT_BIT_KHR = 0,
     VK_VIDEO_CODING_QUALITY_PRESET_NORMAL_BIT_KHR = 0x00000001,
     VK_VIDEO_CODING_QUALITY_PRESET_POWER_BIT_KHR = 0x00000002,
     VK_VIDEO_CODING_QUALITY_PRESET_QUALITY_BIT_KHR = 0x00000004,
diff
--git a/icd/api/include/khronos/sdk-1.2/vulkan_core.h b/icd/api/include/khronos/sdk-1.2/vulkan_core.h index 0e081aaf..18b302fa 100644 --- a/icd/api/include/khronos/sdk-1.2/vulkan_core.h +++ b/icd/api/include/khronos/sdk-1.2/vulkan_core.h @@ -72,7 +72,7 @@ extern "C" { #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0)// Patch version should always be set to 0 // Version of this file -#define VK_HEADER_VERSION 188 +#define VK_HEADER_VERSION 191 // Complete version of this file #define VK_HEADER_VERSION_COMPLETE VK_MAKE_API_VERSION(0, 1, 2, VK_HEADER_VERSION) @@ -754,6 +754,8 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_FEATURES_NV = 1000277007, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INHERITED_VIEWPORT_SCISSOR_FEATURES_NV = 1000278000, VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_VIEWPORT_SCISSOR_INFO_NV = 1000278001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR = 1000280000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR = 1000280001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT = 1000281000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT = 1000281001, VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_RENDER_PASS_TRANSFORM_INFO_QCOM = 1000282000, @@ -824,6 +826,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_VERTEX_INPUT_BINDING_DESCRIPTION_2_EXT = 1000352001, VK_STRUCTURE_TYPE_VERTEX_INPUT_ATTRIBUTE_DESCRIPTION_2_EXT = 1000352002, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT = 1000353000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVE_TOPOLOGY_LIST_RESTART_FEATURES_EXT = 1000356000, VK_STRUCTURE_TYPE_IMPORT_MEMORY_ZIRCON_HANDLE_INFO_FUCHSIA = 1000364000, VK_STRUCTURE_TYPE_MEMORY_ZIRCON_HANDLE_PROPERTIES_FUCHSIA = 1000364001, VK_STRUCTURE_TYPE_MEMORY_GET_ZIRCON_HANDLE_INFO_FUCHSIA = 1000364002, @@ -843,6 +846,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_EXT = 1000388001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_FEATURES_EXT = 1000392000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_PROPERTIES_EXT = 1000392001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PAGEABLE_DEVICE_LOCAL_MEMORY_FEATURES_EXT = 1000412000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES, VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, @@ -2125,10 +2129,6 @@ typedef enum VkImageViewCreateFlagBits { VK_IMAGE_VIEW_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkImageViewCreateFlagBits; typedef VkFlags VkImageViewCreateFlags; - -typedef enum VkShaderModuleCreateFlagBits { - VK_SHADER_MODULE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF -} VkShaderModuleCreateFlagBits; typedef VkFlags VkShaderModuleCreateFlags; typedef enum VkPipelineCacheCreateFlagBits { @@ -7867,6 +7867,52 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineExecutableInternalRepresentationsKHR #endif +#define VK_KHR_shader_integer_dot_product 1 +#define VK_KHR_SHADER_INTEGER_DOT_PRODUCT_SPEC_VERSION 1 +#define VK_KHR_SHADER_INTEGER_DOT_PRODUCT_EXTENSION_NAME "VK_KHR_shader_integer_dot_product" +typedef struct VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 shaderIntegerDotProduct; +} VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR; + +typedef 
struct VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR { + VkStructureType sType; + void* pNext; + VkBool32 integerDotProduct8BitUnsignedAccelerated; + VkBool32 integerDotProduct8BitSignedAccelerated; + VkBool32 integerDotProduct8BitMixedSignednessAccelerated; + VkBool32 integerDotProduct4x8BitPackedUnsignedAccelerated; + VkBool32 integerDotProduct4x8BitPackedSignedAccelerated; + VkBool32 integerDotProduct4x8BitPackedMixedSignednessAccelerated; + VkBool32 integerDotProduct16BitUnsignedAccelerated; + VkBool32 integerDotProduct16BitSignedAccelerated; + VkBool32 integerDotProduct16BitMixedSignednessAccelerated; + VkBool32 integerDotProduct32BitUnsignedAccelerated; + VkBool32 integerDotProduct32BitSignedAccelerated; + VkBool32 integerDotProduct32BitMixedSignednessAccelerated; + VkBool32 integerDotProduct64BitUnsignedAccelerated; + VkBool32 integerDotProduct64BitSignedAccelerated; + VkBool32 integerDotProduct64BitMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating8BitUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating8BitSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating16BitUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating16BitSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating32BitUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating32BitSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating64BitUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating64BitSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated; +} VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR; + + + #define VK_KHR_pipeline_library 1 #define VK_KHR_PIPELINE_LIBRARY_SPEC_VERSION 1 #define VK_KHR_PIPELINE_LIBRARY_EXTENSION_NAME "VK_KHR_pipeline_library" @@ -12454,6 +12500,18 @@ typedef struct VkPhysicalDeviceDrmPropertiesEXT { +#define VK_EXT_primitive_topology_list_restart 1 +#define VK_EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART_SPEC_VERSION 1 +#define VK_EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART_EXTENSION_NAME "VK_EXT_primitive_topology_list_restart" +typedef struct VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT { + VkStructureType sType; + void* pNext; + VkBool32 primitiveTopologyListRestart; + VkBool32 primitiveTopologyPatchListRestart; +} VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT; + + + #define VK_HUAWEI_subpass_shading 1 #define VK_HUAWEI_SUBPASS_SHADING_SPEC_VERSION 2 #define VK_HUAWEI_SUBPASS_SHADING_EXTENSION_NAME "VK_HUAWEI_subpass_shading" @@ -12675,6 +12733,25 @@ VKAPI_ATTR void VKAPI_CALL vkCmdDrawMultiIndexedEXT( #define VK_EXT_LOAD_STORE_OP_NONE_EXTENSION_NAME "VK_EXT_load_store_op_none" +#define VK_EXT_pageable_device_local_memory 1 +#define VK_EXT_PAGEABLE_DEVICE_LOCAL_MEMORY_SPEC_VERSION 1 +#define VK_EXT_PAGEABLE_DEVICE_LOCAL_MEMORY_EXTENSION_NAME "VK_EXT_pageable_device_local_memory" +typedef struct VkPhysicalDevicePageableDeviceLocalMemoryFeaturesEXT { + VkStructureType sType; + void* pNext; + 
VkBool32 pageableDeviceLocalMemory; +} VkPhysicalDevicePageableDeviceLocalMemoryFeaturesEXT; + +typedef void (VKAPI_PTR *PFN_vkSetDeviceMemoryPriorityEXT)(VkDevice device, VkDeviceMemory memory, float priority); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkSetDeviceMemoryPriorityEXT( + VkDevice device, + VkDeviceMemory memory, + float priority); +#endif + + #define VK_KHR_acceleration_structure 1 VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkAccelerationStructureKHR) #define VK_KHR_ACCELERATION_STRUCTURE_SPEC_VERSION 12 diff --git a/icd/api/include/khronos/vulkan.h b/icd/api/include/khronos/vulkan.h index fc10d0b5..0d91a8aa 100644 --- a/icd/api/include/khronos/vulkan.h +++ b/icd/api/include/khronos/vulkan.h @@ -58,6 +58,9 @@ // Internal (under development) extension definitions #include "devext/vk_amd_gpa_interface.h" +#if VKI_TEXEL_BUFFER_EXPLICIT_FORMAT_SUPPORT +#include "devext/vk_amd_shader_texel_buffer_explicit_format.h" +#endif #define VK_FORMAT_BEGIN_RANGE VK_FORMAT_UNDEFINED #define VK_FORMAT_END_RANGE VK_FORMAT_ASTC_12x12_SRGB_BLOCK diff --git a/icd/api/include/pipeline_compiler.h b/icd/api/include/pipeline_compiler.h index 466d3351..dd7cf824 100644 --- a/icd/api/include/pipeline_compiler.h +++ b/icd/api/include/pipeline_compiler.h @@ -39,6 +39,7 @@ #include "include/vk_shader_code.h" #include "include/vk_conv.h" +#include "include/defer_compile_thread.h" namespace vk { @@ -48,8 +49,7 @@ class PipelineCache; class ShaderModule; class PipelineCompiler; struct VbBindingInfo; -struct VbInfo; -struct UberFetchShaderBufferInfo; +struct PipelineInternalBufferInfo; struct ShaderModuleHandle; class PipelineBinaryCache; @@ -192,7 +192,8 @@ class PipelineCompiler const GraphicsPipelineShaderStageInfo* pShaderInfo, const PipelineLayout* pPipelineLayout, GraphicsPipelineBinaryCreateInfo* pCreateInfo, - VbInfo* pVbInfo); + VbBindingInfo* pVbInfo, + PipelineInternalBufferInfo* pInternalBufferInfo); VkResult ConvertComputePipelineInfo( const Device* pDevice, @@ -212,7 +213,7 @@ class PipelineCompiler void FreeComputePipelineCreateInfo(ComputePipelineBinaryCreateInfo* pCreateInfo); - void FreeGraphicsPipelineCreateInfo(GraphicsPipelineBinaryCreateInfo* pCreateInfo); + void FreeGraphicsPipelineCreateInfo(GraphicsPipelineBinaryCreateInfo* pCreateInfo, bool keepConvertTempMem); #if ICD_GPUOPEN_DEVMODE_BUILD Util::Result RegisterAndLoadReinjectionBinary( @@ -239,10 +240,8 @@ class PipelineCompiler void DestroyPipelineBinaryCache(); - VkResult BuildUberFetchShaderInternalData(PipelineCompilerType compilerType, - const VkPipelineVertexInputStateCreateInfo* pVertexInput, - bool isDynamicStride, - UberFetchShaderBufferInfo* pFetchShaderBufferInfo); + VkResult BuildPipelineInternalBufferData(GraphicsPipelineBinaryCreateInfo* pCreateInfo, + PipelineInternalBufferInfo* pInternalBufferInfo); void GetComputePipelineCacheId( uint32_t deviceIdx, @@ -258,6 +257,24 @@ class PipelineCompiler const Util::MetroHash::Hash& settingsHash, Util::MetroHash::Hash* pCacheId); + static void BuildNggState( + const Device* pDevice, + const VkShaderStageFlagBits activeStages, + const bool isConservativeOverestimation, + GraphicsPipelineBinaryCreateInfo* pCreateInfo); + + static void BuildPipelineShaderInfo( + const Device* pDevice, + const ShaderStageInfo* pShaderInfoIn, + Vkgc::PipelineShaderInfo* pShaderInfoOut, + Vkgc::PipelineOptions* pPipelineOptions, + PipelineOptimizerKey* pOptimizerKey, + Vkgc::NggState* pNggState + ); + + void ExecuteDeferCompile( + DeferredCompileWorkload* pWorkload); + private: 
    PAL_DISALLOW_COPY_AND_ASSIGN(PipelineCompiler);
@@ -302,11 +319,30 @@ class PipelineCompiler
         FreeCompilerBinary*       pFreeCompilerBinary,
         PipelineCreationFeedback* pPipelineFeedback);
 
+    VkResult LoadShaderModuleFromCache(
+        const Device*             pDevice,
+        VkShaderModuleCreateFlags flags,
+        uint32_t                  compilerMask,
+        Util::MetroHash::Hash&    uniqueHash,
+        ShaderModuleHandle*       pShaderModule);
+
+    void StoreShaderModuleToCache(
+        const Device*             pDevice,
+        VkShaderModuleCreateFlags flags,
+        uint32_t                  compilerMask,
+        Util::MetroHash::Hash&    uniqueHash,
+        ShaderModuleHandle*       pShaderModule);
+
+    Util::MetroHash::Hash GetShaderModuleCacheHash(
+        VkShaderModuleCreateFlags flags,
+        uint32_t                  compilerMask,
+        Util::MetroHash::Hash&    uniqueHash);
+
     // -----------------------------------------------------------------------------------------------------------------
 
     PhysicalDevice*    m_pPhysicalDevice;      // Vulkan physical device object
     Vkgc::GfxIpVersion m_gfxIp;                // Graphics IP version info, used by Vkgc
-
+    DeferCompileManager m_deferCompileMgr;     // Defer compile thread manager
     CompilerSolutionLlpc m_compilerSolutionLlpc;
 
     PipelineBinaryCache* m_pBinaryCache;       // Pipeline binary cache object
@@ -320,9 +356,10 @@ class PipelineCompiler
 
     UberFetchShaderFormatInfoMap m_uberFetchShaderInfoFormatMap; // Uber fetch shader format info map
 
-    void GetPipelineCreationInfoNext(
-        const VkStructHeader*                           pHeader,
-        const VkPipelineCreationFeedbackCreateInfoEXT** ppPipelineCreationFeadbackCreateInfo);
+    typedef Util::HashMap<Util::MetroHash::Hash, ShaderModuleHandle, PalAllocator> ShaderModuleHandleMap;
+
+    Util::Mutex           m_shaderModuleCacheLock;
+    ShaderModuleHandleMap m_shaderModuleHandleMap;
 
 }; // class PipelineCompiler
diff --git a/icd/api/include/vk_buffer.h b/icd/api/include/vk_buffer.h
index 0d66f92a..6a3a1054 100644
--- a/icd/api/include/vk_buffer.h
+++ b/icd/api/include/vk_buffer.h
@@ -82,11 +82,6 @@ class Buffer final : public NonDispatchable<Buffer, VkBuffer>
         const Device*         pDevice,
         VkMemoryRequirements* pMemoryRequirements);
 
-    static void CalculateMemoryRequirements(
-        const Device*             pDevice,
-        const VkBufferCreateInfo* pCreateInfo,
-        VkMemoryRequirements*     pMemoryRequirements);
-
     VkDeviceSize GetSize() const
         { return m_size; }
 
@@ -134,7 +129,6 @@ class Buffer final : public NonDispatchable<Buffer, VkBuffer>
     };
 
     Buffer(Device*                      pDevice,
-           const VkAllocationCallbacks* pAllocator,
            const VkBufferCreateInfo*    pCreateInfo,
            Pal::IGpuMemory**            pGpuMemory,
            BufferFlags                  internalFlags);
diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h
index 9cdaef88..bf5116cb 100644
--- a/icd/api/include/vk_cmdbuffer.h
+++ b/icd/api/include/vk_cmdbuffer.h
@@ -519,8 +519,8 @@ class CmdBuffer
         VkPrimitiveTopology primitiveTopology);
 
     void SetLineStippleEXT(
-        const Pal::LineStippleStateParams& params,
-        uint32_t                           staticToken);
+        uint32_t lineStippleFactor,
+        uint16_t lineStipplePattern);
 
     void SetColorWriteEnableEXT(
         uint32_t       attachmentCount,
@@ -707,10 +707,6 @@ class CmdBuffer
         uint32_t counterOffset,
         uint32_t vertexStride);
 
-    void SetLineStippleEXT(
-        uint32_t lineStippleFactor,
-        uint16_t lineStipplePattern);
-
     void CmdSetPerDrawVrsRate(
         const VkExtent2D*                        pFragmentSize,
         const VkFragmentShadingRateCombinerOpKHR combinerOps[2]);
@@ -950,7 +946,7 @@ class CmdBuffer
         const Pal::IDepthStencilState* pState);
 
     void PalCmdSetMsaaQuadSamplePattern(
-        uint32_t                          numSamplesPerPixel,
+        uint32_t                           numSamplesPerPixel,
         const Pal::MsaaQuadSamplePattern& quadSamplePattern);
 
     inline void PalCmdBufferSetUserData(
diff --git a/icd/api/include/vk_conv.h b/icd/api/include/vk_conv.h
index 06ff8d1c..2aea5ce8 100644
--- a/icd/api/include/vk_conv.h
+++ b/icd/api/include/vk_conv.h
@@ -3405,8 +3405,25 @@ struct UberFetchShaderFormatInfo
     };
 };
 
-typedef Util::HashMap<VkFormat, UberFetchShaderFormatInfo, PalAllocator>
-    UberFetchShaderFormatInfoMap;
+// =====================================================================================================================
+class UberFetchShaderFormatInfoMap :
+    public Util::HashMap<VkFormat, UberFetchShaderFormatInfo, PalAllocator>
+{
+public:
+    explicit UberFetchShaderFormatInfoMap(uint32 numBuckets, PalAllocator* const pAllocator)
+        :
+        Util::HashMap<VkFormat, UberFetchShaderFormatInfo, PalAllocator>(numBuckets, pAllocator),
+        m_bufferFormatMask(0)
+    { }
+
+    void SetBufferFormatMask(uint32_t mask) { m_bufferFormatMask = mask; }
+
+    uint32_t GetBufferFormatMask() const { return m_bufferFormatMask; }
+
+private:
+    uint32_t m_bufferFormatMask;
+};
+
 class PhysicalDevice;
 
 // =====================================================================================================================
@@ -3416,7 +3433,8 @@ VkResult InitializeUberFetchShaderFormatTable(
 
 UberFetchShaderFormatInfo GetUberFetchShaderFormatInfo(
     UberFetchShaderFormatInfoMap* pFormatInfoMap,
-    VkFormat                      vkFormat);
+    VkFormat                      vkFormat,
+    bool                          isZeroStride);
 
 } // namespace vk
diff --git a/icd/api/include/vk_descriptor_set.h b/icd/api/include/vk_descriptor_set.h
index 91365156..78d6cae9 100644
--- a/icd/api/include/vk_descriptor_set.h
+++ b/icd/api/include/vk_descriptor_set.h
@@ -125,6 +125,8 @@ class DescriptorSet final : public NonDispatchable<DescriptorSet, VkDescriptorSet>
diff --git a/icd/api/include/vk_device.h b/icd/api/include/vk_device.h
--- a/icd/api/include/vk_device.h
+++ b/icd/api/include/vk_device.h
         { return VkPhysicalDevice(DefaultDeviceIndex)->GetPalHeapFromVkTypeIndex(vkIndex); }
 
-    uint32_t GetUmdFpsCapFrameRate() const
-    {
-        return VkPhysicalDevice(DefaultDeviceIndex)->PalProperties().osProperties.umdFpsCapFrameRate;
-    }
-
     uint64_t TimestampFrequency() const
     {
         return VkPhysicalDevice(DefaultDeviceIndex)->PalProperties().timestampFrequency;
@@ -575,9 +574,6 @@ class Device
     BarrierFilterLayer* GetBarrierFilterLayer()
         { return m_pBarrierFilterLayer; }
 
-    AsyncLayer* GetAsyncLayer()
-        { return m_pAsyncLayer; }
-
 #if VKI_GPU_DECOMPRESS
     GpuDecoderLayer* GetGpuDecoderLayer()
         { return m_pGpuDecoderLayer; }
@@ -632,6 +628,9 @@ class Device
     bool UseCompactDynamicDescriptors() const
         { return !GetRuntimeSettings().enableRelocatableShaders && !GetEnabledFeatures().robustBufferAccess;}
 
+    bool MustWriteImmutableSamplers() const
+        { return GetEnabledFeatures().mustWriteImmutableSamplers; }
+
     bool SupportDepthStencilResolve() const
     {
         return (IsExtensionEnabled(DeviceExtensions::KHR_DEPTH_STENCIL_RESOLVE) ||
@@ -777,7 +776,6 @@ class Device
     const DeviceExtensions::Enabled m_enabledExtensions;   // Enabled device extensions
     DispatchTable                   m_dispatchTable;       // Device dispatch table
     SqttMgr*                        m_pSqttMgr;            // Manager for developer mode SQ thread tracing
-    AsyncLayer*                     m_pAsyncLayer;         // State for async compiler layer, otherwise null
     OptLayer*                       m_pAppOptLayer;        // State for an app-specific layer, otherwise null
     BarrierFilterLayer*             m_pBarrierFilterLayer; // State for enabling barrier filtering, otherwise
                                                            // null
@@ -808,6 +806,9 @@ class Device
     // If set to true, will use a compute queue internally for transfers.
bool m_useComputeAsTransferQueue; + // If set to true, overrides compute queue to universal queue internally + bool m_useUniversalAsComputeQueue; + // The max VRS shading rate supported VkExtent2D m_maxVrsShadingRate; @@ -1152,6 +1153,11 @@ VKAPI_ATTR void VKAPI_CALL vkCmdSetLineStippleEXT( uint32_t lineStippleFactor, uint16_t lineStipplePattern); +VKAPI_ATTR void VKAPI_CALL vkSetDeviceMemoryPriorityEXT( + VkDevice device, + VkDeviceMemory memory, + float priority); + } // namespace entry } // namespace vk diff --git a/icd/api/include/vk_extensions.h b/icd/api/include/vk_extensions.h index a15af0e6..d5342afe 100644 --- a/icd/api/include/vk_extensions.h +++ b/icd/api/include/vk_extensions.h @@ -272,6 +272,7 @@ class DeviceExtensions final : public Extensions KHR_SHADER_DRAW_PARAMETERS, KHR_SHADER_FLOAT16_INT8, KHR_SHADER_FLOAT_CONTROLS, + KHR_SHADER_INTEGER_DOT_PRODUCT, KHR_SHADER_NON_SEMANTIC_INFO, KHR_SHADER_SUBGROUP_EXTENDED_TYPES, KHR_SHADER_SUBGROUP_UNIFORM_CONTROL_FLOW, @@ -312,10 +313,12 @@ class DeviceExtensions final : public Extensions EXT_LOAD_STORE_OP_NONE, EXT_MEMORY_BUDGET, EXT_MEMORY_PRIORITY, + EXT_PAGEABLE_DEVICE_LOCAL_MEMORY, EXT_PCI_BUS_INFO, EXT_PIPELINE_CREATION_CACHE_CONTROL, EXT_PIPELINE_CREATION_FEEDBACK, EXT_POST_DEPTH_COVERAGE, + EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART, EXT_PRIVATE_DATA, EXT_QUEUE_FAMILY_FOREIGN, EXT_ROBUSTNESS2, diff --git a/icd/api/include/vk_graphics_pipeline.h b/icd/api/include/vk_graphics_pipeline.h index d09a3c6f..df537f38 100644 --- a/icd/api/include/vk_graphics_pipeline.h +++ b/icd/api/include/vk_graphics_pipeline.h @@ -49,6 +49,18 @@ class CmdBuffer; class RenderPass; struct CmdBufferRenderState; +// ===================================================================================================================== +// Create info of graphics pipeline deferred compile +struct DeferGraphicsPipelineCreateInfo +{ + Device* pDevice; + PipelineCache* pPipelineCache; + GraphicsPipeline* pPipeline; + GraphicsPipelineBinaryCreateInfo binaryCreateInfo; + GraphicsPipelineShaderStageInfo shaderStageInfo; + GraphicsPipelineObjectCreateInfo objectCreateInfo; +}; + // ===================================================================================================================== // Convert sample location coordinates from [0,1] space (sent by the application) to [-8, 7] space (accepted by PAL) static void ConvertCoordinates( @@ -158,7 +170,7 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch const VkAllocationCallbacks* pAllocator) override; const VbBindingInfo& GetVbBindingInfo() const - { return m_vbInfo.bindingInfo; } + { return m_vbInfo; } void BindToCmdBuffer( CmdBuffer* pCmdBuffer, @@ -197,7 +209,8 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch bool bindInputAssemblyState, bool force1x1ShaderRate, bool customSampleLocations, - const VbInfo& vbInfo, + const VbBindingInfo& vbInfo, + const PipelineInternalBufferInfo* pInternalBuffer, Pal::IMsaaState** pPalMsaa, Pal::IColorBlendState** pPalColorBlend, Pal::IDepthStencilState** pPalDepthStencil, @@ -234,7 +247,8 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch const VkGraphicsPipelineCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, const PipelineLayout* pPipelineLayout, - const VbInfo* pVbInfo, + const VbBindingInfo* pVbInfo, + const PipelineInternalBufferInfo* pInternalBuffer, const size_t* pPipelineBinarySizes, const void** pPipelineBinaries, PipelineCache* pPipelineCache, @@ 
-247,12 +261,54 @@ class GraphicsPipeline final : public GraphicsPipelineCommon, public NonDispatch
 private:
     PAL_DISALLOW_COPY_AND_ASSIGN(GraphicsPipeline);
 
+    VkResult DeferCreateOptimizedPipeline(
+        Device*                           pDevice,
+        PipelineCache*                    pPipelineCache,
+        GraphicsPipelineBinaryCreateInfo* pBinaryCreateInfo,
+        GraphicsPipelineShaderStageInfo*  pShaderStageInfo,
+        GraphicsPipelineObjectCreateInfo* pObjectCreateInfo);
+
+    static VkResult CreatePalPipelineObjects(
+        Device*                           pDevice,
+        PipelineCache*                    pPipelineCache,
+        GraphicsPipelineObjectCreateInfo* pObjectCreateInfo,
+        const size_t*                     pPipelineBinarySizes,
+        const void**                      pPipelineBinaries,
+        const Util::MetroHash::Hash*      pCacheIds,
+        void*                             pSystemMem,
+        Pal::IPipeline**                  pPalPipeline);
+
+    void SetOptimizedPipeline(Pal::IPipeline** pPalPipeline);
+
+    bool UseOptimizedPipeline() const
+    {
+        bool result = m_info.checkDeferCompilePipeline;
+        if (result)
+        {
+            Util::MutexAuto pipelineSwitchLock(const_cast<Util::Mutex*>(&m_pipelineSwitchLock));
+            result = m_pOptimizedPipeline[0] != nullptr && m_optimizedPipelineHash != 0;
+        }
+        return result;
+    }
+
+    VkResult BuildDeferCompileWorkload(
+        Device*                           pDevice,
+        PipelineCache*                    pPipelineCache,
+        GraphicsPipelineBinaryCreateInfo* pBinaryCreateInfo,
+        GraphicsPipelineShaderStageInfo*  pShaderStageInfo,
+        GraphicsPipelineObjectCreateInfo* pObjectCreateInfo);
+
+    static void ExecuteDeferCreateOptimizedPipeline(void* pPayload);
+
     GraphicsPipelineObjectImmedInfo m_info;               // Immediate state that will go in CmdSet* functions
     Pal::IMsaaState*         m_pPalMsaa[MaxPalDevices];         // PAL MSAA state object
     Pal::IColorBlendState*   m_pPalColorBlend[MaxPalDevices];   // PAL color blend state object
     Pal::IDepthStencilState* m_pPalDepthStencil[MaxPalDevices]; // PAL depth stencil state object
-    VbInfo                   m_vbInfo;                          // Information about vertex buffer bindings
-
+    VbBindingInfo            m_vbInfo;                          // Information about vertex buffer bindings
+    PipelineInternalBufferInfo m_internalBufferInfo;            // Information about internal buffer
+    Pal::IPipeline*          m_pOptimizedPipeline[MaxPalDevices]; // Optimized PAL pipelines
+    uint64_t                 m_optimizedPipelineHash;           // Pipeline hash of optimized PAL pipelines
+    Util::Mutex              m_pipelineSwitchLock;              // Lock for optimized pipeline and default pipeline
+    DeferredCompileWorkload  m_deferWorkload;                   // Workload of deferred compile
 
     union
     {
         uint8 value;
diff --git a/icd/api/include/vk_image.h b/icd/api/include/vk_image.h
index 9d9e7e8f..d8278745 100644
--- a/icd/api/include/vk_image.h
+++ b/icd/api/include/vk_image.h
@@ -279,7 +279,6 @@ class Image final : public NonDispatchable<Image, VkImage>
 
     Image(
         Device*                      pDevice,
-        const VkAllocationCallbacks* pAllocator,
         VkImageCreateFlags           flags,
         Pal::IImage**                pPalImage,
         Pal::IGpuMemory**            pPalMemory,
diff --git a/icd/api/include/vk_memory.h b/icd/api/include/vk_memory.h
index 98b1f4f9..6ebf5b3f 100644
--- a/icd/api/include/vk_memory.h
+++ b/icd/api/include/vk_memory.h
@@ -71,6 +71,10 @@ union MemoryPriority
     { return ((priority < memPriority.priority) ||
              ((priority == memPriority.priority) && (offset < memPriority.offset))); }
 
+    bool operator!=(const MemoryPriority& memPriority) const
+    { return ((priority != memPriority.priority) ||
+             ((priority == memPriority.priority) && (offset != memPriority.offset))); }
+
     static MemoryPriority FromSetting(uint32_t value);
 
     static MemoryPriority FromVkMemoryPriority(float value);
@@ -133,6 +137,10 @@ class Memory final : public NonDispatchable<Memory, VkDeviceMemory>
 
     void ElevatePriority(MemoryPriority priority);
 
+    void SetPriority(
+        const MemoryPriority priority,
+        const bool           mustBeLower);
+
     Pal::IGpuMemory* PalMemory(uint32_t resourceIndex, uint32_t memoryIndex);
 
     Pal::IGpuMemory* PalMemory(uint32_t resourceIndex) const
diff --git a/icd/api/include/vk_pipeline_layout.h b/icd/api/include/vk_pipeline_layout.h
index 253abb9b..e12b72ad 100644
--- a/icd/api/include/vk_pipeline_layout.h
+++ b/icd/api/include/vk_pipeline_layout.h
@@ -117,7 +117,7 @@ class PipelineLayout final : public NonDispatchable<PipelineLayout, VkPipelineLayout>
diff --git a/icd/api/pipeline_compiler.cpp b/icd/api/pipeline_compiler.cpp
--- a/icd/api/pipeline_compiler.cpp
+++ b/icd/api/pipeline_compiler.cpp
 #include "palFile.h"
 #include "palHashSetImpl.h"
+#include "palListImpl.h"
 
 #include "include/pipeline_binary_cache.h"
@@ -113,6 +114,7 @@ PipelineCompiler::PipelineCompiler(
     , m_totalBinaries(0)
     , m_totalTimeSpent(0)
     , m_uberFetchShaderInfoFormatMap(8, pPhysicalDevice->Manager()->VkInstance()->Allocator())
+    , m_shaderModuleHandleMap(8, pPhysicalDevice->Manager()->VkInstance()->Allocator())
 {
 
 }
@@ -245,7 +247,12 @@ VkResult PipelineCompiler::Initialize()
 
     if (result == VK_SUCCESS)
     {
-        if (settings.enableUberFetchShader)
+        result = PalToVkResult(m_shaderModuleHandleMap.Init());
+    }
+
+    if (result == VK_SUCCESS)
+    {
+        if (settings.enableUberFetchShader || settings.enableEarlyCompile)
         {
             m_uberFetchShaderInfoFormatMap.Init();
 
@@ -253,6 +260,12 @@ VkResult PipelineCompiler::Initialize()
         }
     }
 
+    if (result == VK_SUCCESS)
+    {
+        uint32_t threadCount = settings.deferCompileOptimizedPipeline ? settings.deferCompileThreadCount : 0;
+        m_deferCompileMgr.Init(threadCount, m_pPhysicalDevice->VkInstance()->Allocator());
+    }
+
     return result;
 }
 
@@ -263,6 +276,28 @@ void PipelineCompiler::Destroy()
     m_compilerSolutionLlpc.Destroy();
 
     DestroyPipelineBinaryCache();
+
+    {
+        Util::MutexAuto mutexLock(&m_shaderModuleCacheLock);
+        for (auto it = m_shaderModuleHandleMap.Begin(); it.Get() != nullptr; it.Next())
+        {
+            VK_ASSERT(it.Get()->value.pRefCount != nullptr);
+
+            if (*(it.Get()->value.pRefCount) == 1)
+            {
+                // Force the use of the non-locking version of FreeShaderModule.
+                auto pInstance = m_pPhysicalDevice->Manager()->VkInstance();
+                pInstance->FreeMem(it.Get()->value.pRefCount);
+                it.Get()->value.pRefCount = nullptr;
+                FreeShaderModule(&it.Get()->value);
+            }
+            else
+            {
+                (*(it.Get()->value.pRefCount))--;
+            }
+        }
+        m_shaderModuleHandleMap.Reset();
+    }
 }
 
 // =====================================================================================================================
@@ -331,6 +366,130 @@ bool PipelineCompiler::LoadReplaceShaderBinary(
 
     return findShader;
 }
 
+// =====================================================================================================================
+// Generates the shader module cache hash ID
+Util::MetroHash::Hash PipelineCompiler::GetShaderModuleCacheHash(
+    VkShaderModuleCreateFlags flags,
+    uint32_t                  compilerMask,
+    Util::MetroHash::Hash&    uniqueHash)
+{
+    Util::MetroHash128 hasher;
+    Util::MetroHash::Hash hash;
+    hasher.Update(compilerMask);
+    hasher.Update(uniqueHash);
+    hasher.Update(flags);
+    hasher.Update(m_pPhysicalDevice->GetSettingsLoader()->GetSettingsHash());
+    hasher.Finalize(hash.bytes);
+    return hash;
+}
+
+// =====================================================================================================================
+// Loads a shader module from the cache, including both the run-time cache and the binary cache
+VkResult PipelineCompiler::LoadShaderModuleFromCache(
+    const Device*             pDevice,
+    VkShaderModuleCreateFlags flags,
+    uint32_t                  compilerMask,
+    Util::MetroHash::Hash&    uniqueHash,
+    ShaderModuleHandle*       pShaderModule)
+{
+    bool supportModuleCache = true;
+
+#if ICD_X86_BUILD
+    supportModuleCache = false;
+#endif
+
+    if (compilerMask & (1 << PipelineCompilerTypeLlpc))
+    {
+        // LLPC always defers SPIRV conversion, we needn't cache the result
+        supportModuleCache = false;
+    }
+
+    VK_ASSERT(pShaderModule->pRefCount == nullptr);
+
+    VkResult result = VK_ERROR_INITIALIZATION_FAILED;
+    if (supportModuleCache)
+    {
+        Util::MutexAuto mutexLock(&m_shaderModuleCacheLock);
+
+        Util::MetroHash::Hash shaderModuleCacheHash = GetShaderModuleCacheHash(flags, compilerMask, uniqueHash);
+        auto pHandle = m_shaderModuleHandleMap.FindKey(shaderModuleCacheHash);
+        if (pHandle != nullptr)
+        {
+            VK_ASSERT(pHandle->pRefCount != nullptr);
+            (*(pHandle->pRefCount))++;
+            *pShaderModule = *pHandle;
+            result = VK_SUCCESS;
+        }
+        else if (m_pBinaryCache != nullptr)
+        {
+            if (result == VK_SUCCESS)
+            {
+                auto pInstance = m_pPhysicalDevice->VkInstance();
+                pShaderModule->pRefCount = reinterpret_cast<uint32_t*>(
+                    pInstance->AllocMem(sizeof(uint32_t), VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_CACHE));
+                if (pShaderModule->pRefCount != nullptr)
+                {
+                    // Initialize the reference count to two: one for the runtime cache and one for this shader module.
+                    *pShaderModule->pRefCount = 2;
+                    result = PalToVkResult(m_shaderModuleHandleMap.Insert(shaderModuleCacheHash, *pShaderModule));
+                    VK_ASSERT(result == VK_SUCCESS);
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
+// =====================================================================================================================
+// Stores a shader module to the cache, including both the run-time cache and the binary cache
+void PipelineCompiler::StoreShaderModuleToCache(
+    const Device*             pDevice,
+    VkShaderModuleCreateFlags flags,
+    uint32_t                  compilerMask,
+    Util::MetroHash::Hash&    uniqueHash,
+    ShaderModuleHandle*       pShaderModule)
+{
+
+    VK_ASSERT(pShaderModule->pRefCount == nullptr);
+
+    bool supportModuleCache = true;
+
+#if ICD_X86_BUILD
+    supportModuleCache = false;
+#endif
+
+    if (compilerMask & (1 << PipelineCompilerTypeLlpc))
+    {
+        // LLPC always defers SPIRV conversion, we needn't cache the result
+        supportModuleCache = false;
+    }
+
+    if (supportModuleCache)
+    {
+        Util::MetroHash::Hash shaderModuleCacheHash = GetShaderModuleCacheHash(flags, compilerMask, uniqueHash);
+        auto pInstance = m_pPhysicalDevice->VkInstance();
+        pShaderModule->pRefCount = reinterpret_cast<uint32_t*>(
+            pInstance->AllocMem(sizeof(uint32_t), VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_CACHE));
+        if (pShaderModule->pRefCount != nullptr)
+        {
+            Util::MutexAuto mutexLock(&m_shaderModuleCacheLock);
+            // Initialize the reference count to two: one for the runtime cache and one for this shader module.
+            *pShaderModule->pRefCount = 2;
+            auto palResult = m_shaderModuleHandleMap.Insert(shaderModuleCacheHash, *pShaderModule);
+            if (palResult != Util::Result::Success)
+            {
+                // Reset the reference count to one if the handle could not be added to the runtime cache
+                *pShaderModule->pRefCount = 1;
+            }
+        }
+
+        if (m_pBinaryCache != nullptr)
+        {
+        }
+    }
+}
+
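Taken together, the two helpers above implement a reference-counted run-time cache: a module enters the map with a count of two (one reference owned by the map, one by the creating shader module), each cache hit adds one, FreeShaderModule (below) releases one, and Destroy() finally drops the map's own reference. A standalone sketch of the same ownership scheme, using std::unordered_map in place of Util::HashMap and a plain struct in place of ShaderModuleHandle, illustration only:

    #include <cstdint>
    #include <unordered_map>

    struct Handle { uint32_t* pRefCount; };

    std::unordered_map<uint64_t, Handle> runtimeCache;

    // Cache hit: hand out the shared handle and add a reference.
    bool Load(uint64_t hash, Handle* pOut)
    {
        auto it = runtimeCache.find(hash);
        if (it == runtimeCache.end()) { return false; }
        ++(*it->second.pRefCount);
        *pOut = it->second;
        return true;
    }

    // Cache miss: publish a freshly built handle with a count of two.
    void Store(uint64_t hash, Handle* pHandle)
    {
        pHandle->pRefCount = new uint32_t(2);    // one for the cache, one for the caller
        if (!runtimeCache.emplace(hash, *pHandle).second)
        {
            *pHandle->pRefCount = 1;             // not published: the caller is the only owner
        }
    }

    // Mirror of FreeShaderModule: the final reference is held by the cache itself,
    // so the underlying compiler handle is only destroyed when the count reaches 1
    // and the cache (or its teardown) releases it.
    void Free(Handle* pHandle)
    {
        if (*pHandle->pRefCount > 1) { --(*pHandle->pRefCount); }
        else                         { delete pHandle->pRefCount; /* destroy the payload here */ }
    }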
// =====================================================================================================================
// Builds shader module from SPIR-V binary code.
 VkResult PipelineCompiler::BuildShaderModule(
@@ -344,26 +503,49 @@ VkResult PipelineCompiler::BuildShaderModule(
     auto pInstance = m_pPhysicalDevice->Manager()->VkInstance();
     VkResult result = VK_SUCCESS;
     uint32_t compilerMask = GetCompilerCollectionMask();
-    Util::MetroHash::Hash hash = {};
-    Util::MetroHash64::Hash(reinterpret_cast<const uint8_t*>(pCode), codeSize, hash.bytes);
+    Util::MetroHash::Hash stableHash = {};
+    Util::MetroHash::Hash uniqueHash = {};
+    Util::MetroHash64::Hash(reinterpret_cast<const uint8_t*>(pCode), codeSize, stableHash.bytes);
+    uniqueHash = stableHash;
 
     bool findReplaceShader = false;
 
-    if (pSettings->shaderReplaceMode == ShaderReplaceShaderHash)
+    if ((pSettings->shaderReplaceMode == ShaderReplaceShaderHash) ||
+        (pSettings->shaderReplaceMode == ShaderReplaceShaderHashPipelineBinaryHash))
     {
         size_t replaceCodeSize = 0;
         void* pReplaceCode = nullptr;
-        uint64_t hash64 = Util::MetroHash::Compact64(&hash);
+        uint64_t hash64 = Util::MetroHash::Compact64(&stableHash);
         findReplaceShader = LoadReplaceShaderBinary(hash64, &replaceCodeSize, &pReplaceCode);
         if (findReplaceShader)
         {
             pCode = pReplaceCode;
             codeSize = replaceCodeSize;
+            Util::MetroHash64::Hash(reinterpret_cast<const uint8_t*>(pCode), codeSize, uniqueHash.bytes);
         }
     }
 
-    if (compilerMask & (1 << PipelineCompilerTypeLlpc))
+    result = LoadShaderModuleFromCache(pDevice, flags, compilerMask, uniqueHash, pShaderModule);
+    if (result != VK_SUCCESS)
+    {
+        if (compilerMask & (1 << PipelineCompilerTypeLlpc))
+        {
+            result = m_compilerSolutionLlpc.BuildShaderModule(pDevice, flags, codeSize, pCode, pShaderModule, stableHash);
+        }
+
+        StoreShaderModuleToCache(pDevice, flags, compilerMask, uniqueHash, pShaderModule);
+    }
+    else
     {
-        result = m_compilerSolutionLlpc.BuildShaderModule(pDevice, flags, codeSize, pCode, pShaderModule, hash);
+        if (result == VK_SUCCESS)
+        {
+            if (pSettings->enablePipelineDump)
+            {
+                Vkgc::BinaryData spvBin = {};
+                spvBin.pCode = pCode;
+                spvBin.codeSize = codeSize;
+                Vkgc::IPipelineDumper::DumpSpirvBinary(pSettings->pipelineDumpDir, &spvBin);
+            }
+        }
     }
 
     if (findReplaceShader)
@@ -390,11 +572,28 @@ bool PipelineCompiler::IsValidShaderModule(
 void PipelineCompiler::FreeShaderModule(
     ShaderModuleHandle* pShaderModule)
 {
-    m_compilerSolutionLlpc.FreeShaderModule(pShaderModule);
+    if (pShaderModule->pRefCount != nullptr)
+    {
+        Util::MutexAuto mutexLock(&m_shaderModuleCacheLock);
+        if (*pShaderModule->pRefCount > 1)
+        {
+            (*pShaderModule->pRefCount)--;
+        }
+        else
+        {
+            m_compilerSolutionLlpc.FreeShaderModule(pShaderModule);
+            auto pInstance = m_pPhysicalDevice->Manager()->VkInstance();
+            pInstance->FreeMem(pShaderModule->pRefCount);
+        }
+    }
+    else
+    {
+        m_compilerSolutionLlpc.FreeShaderModule(pShaderModule);
+    }
 }
 
 // =====================================================================================================================
-// Replaces pipeline binary from external replacment file (_repalce.elf)
+// Replaces pipeline binary from external replacement file (_replace.elf)
 template<class PipelineBuildInfo>
 bool PipelineCompiler::ReplacePipelineBinary(
         const PipelineBuildInfo* pPipelineBuildInfo,
@@ -698,6 +897,7 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary(
     int64_t compileTime = 0;
     uint64_t pipelineHash = Vkgc::IPipelineDumper::GetPipelineHash(&pCreateInfo->pipelineInfo);
+    uint64_t optimizedPipelineHash = 0;
 
     void* pPipelineDumpHandle = nullptr;
     const void* moduleDataBaks[ShaderStage::ShaderStageGfxCount];
@@ -713,7 +913,8 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary(
         &pCreateInfo->pipelineInfo.fs,
     };
 
-    if (settings.shaderReplaceMode ==
ShaderReplacePipelineBinaryHash) + if ((settings.shaderReplaceMode == ShaderReplacePipelineBinaryHash) || + (settings.shaderReplaceMode == ShaderReplaceShaderHashPipelineBinaryHash)) { if (ReplacePipelineBinary(&pCreateInfo->pipelineInfo, pPipelineBinarySize, ppPipelineBinary, pipelineHash)) { @@ -742,17 +943,18 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( } } - if (settings.enablePipelineDump) + // Generate optimized pipeline hash if both early compile and defer compile are enabled + if (settings.deferCompileOptimizedPipeline && + (pCreateInfo->pipelineInfo.enableEarlyCompile || pCreateInfo->pipelineInfo.enableUberFetchShader)) { - Vkgc::PipelineDumpOptions dumpOptions = {}; - dumpOptions.pDumpDir = settings.pipelineDumpDir; - dumpOptions.filterPipelineDumpByType = settings.filterPipelineDumpByType; - dumpOptions.filterPipelineDumpByHash = settings.filterPipelineDumpByHash; - dumpOptions.dumpDuplicatePipelines = settings.dumpDuplicatePipelines; + bool enableEarlyCompile = pCreateInfo->pipelineInfo.enableEarlyCompile; + bool enableUberFetchShader = pCreateInfo->pipelineInfo.enableUberFetchShader; - Vkgc::PipelineBuildInfo pipelineInfo = {}; - pipelineInfo.pGraphicsInfo = &pCreateInfo->pipelineInfo; - pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, pipelineInfo, pipelineHash); + pCreateInfo->pipelineInfo.enableEarlyCompile = false; + pCreateInfo->pipelineInfo.enableUberFetchShader = false; + optimizedPipelineHash = Vkgc::IPipelineDumper::GetPipelineHash(&pCreateInfo->pipelineInfo); + pCreateInfo->pipelineInfo.enableEarlyCompile = enableEarlyCompile; + pCreateInfo->pipelineInfo.enableUberFetchShader = enableUberFetchShader; } // PAL Pipeline caching @@ -770,22 +972,73 @@ VkResult PipelineCompiler::CreateGraphicsPipelineBinary( pPipelineBinaryCache = pPipelineCache->GetPipelineCache(); } + int64_t startTime = 0; if (shouldCompile && ((pPipelineBinaryCache != nullptr) || (m_pBinaryCache != nullptr))) { - int64_t startTime = Util::GetPerfCpuTime(); + startTime = Util::GetPerfCpuTime(); - GetGraphicsPipelineCacheId( - deviceIdx, - pCreateInfo, - pipelineHash, - m_pPhysicalDevice->GetSettingsLoader()->GetSettingsHash(), - pCacheId); + // Search optimized pipeline first + if (optimizedPipelineHash != 0) + { + GetGraphicsPipelineCacheId( + deviceIdx, + pCreateInfo, + optimizedPipelineHash, + m_pPhysicalDevice->GetSettingsLoader()->GetSettingsHash(), + pCacheId); + + cacheResult = GetCachedPipelineBinary(pCacheId, pPipelineBinaryCache, pPipelineBinarySize, ppPipelineBinary, + &isUserCacheHit, &isInternalCacheHit, &pCreateInfo->freeCompilerBinary, &pCreateInfo->pipelineFeedback); + if (cacheResult == Util::Result::Success) + { + shouldCompile = false; + // Update pipeline option for optimized pipeline and update dump handle. 
+ pCreateInfo->pipelineInfo.enableEarlyCompile = false; + pCreateInfo->pipelineInfo.enableUberFetchShader = false; - cacheResult = GetCachedPipelineBinary(pCacheId, pPipelineBinaryCache, pPipelineBinarySize, ppPipelineBinary, - &isUserCacheHit, &isInternalCacheHit, &pCreateInfo->freeCompilerBinary, &pCreateInfo->pipelineFeedback); - if (cacheResult == Util::Result::Success) + } + } + } + + if (settings.enablePipelineDump) + { + Vkgc::PipelineDumpOptions dumpOptions = {}; + dumpOptions.pDumpDir = settings.pipelineDumpDir; + dumpOptions.filterPipelineDumpByType = settings.filterPipelineDumpByType; + dumpOptions.filterPipelineDumpByHash = settings.filterPipelineDumpByHash; + dumpOptions.dumpDuplicatePipelines = settings.dumpDuplicatePipelines; + + Vkgc::PipelineBuildInfo pipelineInfo = {}; + pipelineInfo.pGraphicsInfo = &pCreateInfo->pipelineInfo; + uint64_t dumpHash = pipelineHash; + if (optimizedPipelineHash != 0) { - shouldCompile = false; + if (shouldCompile == false) + { + // Current pipeline is optimized pipeline if optimized pipeline is valid and pipeline cache is hit + dumpHash = optimizedPipelineHash; + } + } + pPipelineDumpHandle = Vkgc::IPipelineDumper::BeginPipelineDump(&dumpOptions, pipelineInfo, dumpHash); + } + + if (shouldCompile && ((pPipelineBinaryCache != nullptr) || (m_pBinaryCache != nullptr))) + { + if (shouldCompile) + { + GetGraphicsPipelineCacheId( + deviceIdx, + pCreateInfo, + pipelineHash, + m_pPhysicalDevice->GetSettingsLoader()->GetSettingsHash(), + pCacheId); + + cacheResult = GetCachedPipelineBinary(pCacheId, pPipelineBinaryCache, pPipelineBinarySize, ppPipelineBinary, + &isUserCacheHit, &isInternalCacheHit, &pCreateInfo->freeCompilerBinary, &pCreateInfo->pipelineFeedback); + if (cacheResult == Util::Result::Success) + { + shouldCompile = false; + } } cacheTime = Util::GetPerfCpuTime() - startTime; @@ -904,7 +1157,8 @@ VkResult PipelineCompiler::CreateComputePipelineBinary( ShaderModuleHandle shaderModuleReplaceHandle = {}; bool shaderModuleReplaced = false; - if (settings.shaderReplaceMode == ShaderReplacePipelineBinaryHash) + if ((settings.shaderReplaceMode == ShaderReplacePipelineBinaryHash) || + (settings.shaderReplaceMode == ShaderReplaceShaderHashPipelineBinaryHash)) { if (ReplacePipelineBinary(&pCreateInfo->pipelineInfo, pPipelineBinarySize, ppPipelineBinary, pipelineHash)) { @@ -1373,7 +1627,7 @@ static void BuildViewportState( } // ===================================================================================================================== -static void BuildNggState( +void PipelineCompiler::BuildNggState( const Device* pDevice, const VkShaderStageFlagBits activeStages, const bool isConservativeOverestimation, @@ -1468,7 +1722,7 @@ static void BuildDepthStencilState( } // ===================================================================================================================== -static void BuildPipelineShaderInfo( +void PipelineCompiler::BuildPipelineShaderInfo( const Device* pDevice, const ShaderStageInfo* pShaderInfoIn, Vkgc::PipelineShaderInfo* pShaderInfoOut, @@ -1488,7 +1742,6 @@ static void BuildPipelineShaderInfo( pShaderInfoOut->pSpecializationInfo = pShaderInfoIn->pSpecializationInfo; pShaderInfoOut->pEntryTarget = pShaderInfoIn->pEntryPoint; pShaderInfoOut->entryStage = stage; - pCompiler->ApplyDefaultShaderOptions(stage, &pShaderInfoOut->options ); @@ -1516,7 +1769,7 @@ static VkResult BuildPipelineResourceMapping( const Device* pDevice, const PipelineLayout* pLayout, const uint32_t stageMask, - VbInfo* pVbInfo, + 
VbBindingInfo* pVbInfo, GraphicsPipelineBinaryCreateInfo* pCreateInfo) { VkResult result = VK_SUCCESS; @@ -1612,7 +1865,7 @@ static void BuildPipelineShadersInfo( { if (((shaderMask & (1 << stage)) != 0) && (pShaderInfo->stages[stage].pModuleHandle != nullptr)) { - BuildPipelineShaderInfo(pDevice, + PipelineCompiler::BuildPipelineShaderInfo(pDevice, &pShaderInfo->stages[stage], ppShaderInfoOut[stage], &pCreateInfo->pipelineInfo.options, @@ -1706,7 +1959,7 @@ static void BuildVertexInputInterfaceState( pCreateInfo->pipelineInfo.dynamicVertexStride = true; } - if (pDevice->GetRuntimeSettings().enableUberFetchShader) + if (pDevice->GetRuntimeSettings().enableUberFetchShader || pDevice->GetRuntimeSettings().enableEarlyCompile) { pCreateInfo->pipelineInfo.enableUberFetchShader = true; } @@ -1733,7 +1986,7 @@ static void BuildPreRasterizationShaderState( BuildViewportState(pDevice, pIn->pViewportState, dynamicStateFlags, pCreateInfo); } - BuildNggState(pDevice, activeStages, isConservativeOverestimation, pCreateInfo); + PipelineCompiler::BuildNggState(pDevice, activeStages, isConservativeOverestimation, pCreateInfo); if (activeStages & (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) { @@ -1805,41 +2058,18 @@ static void BuildFragmentOutputInterfaceState( } // ===================================================================================================================== -static VkResult BuildUberFetchShaderInternalData( +static VkResult BuildPipelineInternalBufferData( const Device* pDevice, GraphicsPipelineBinaryCreateInfo* pCreateInfo, - VbInfo* pVbInfo) + PipelineInternalBufferInfo* pInternalBufferInfo) { PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); VK_ASSERT(pCreateInfo->pipelineInfo.enableUberFetchShader); - - auto result = pDefaultCompiler->BuildUberFetchShaderInternalData(pCreateInfo->compilerType, - pCreateInfo->pipelineInfo.pVertexInput, - pCreateInfo->pipelineInfo.dynamicVertexStride, - &pVbInfo->uberFetchShaderBuffer); - - auto pSettings = &pDevice->GetRuntimeSettings(); - - if (pSettings->disablePerInstanceFetch) - { - if (pVbInfo->uberFetchShaderBuffer.requirePerIntanceFetch) - { - pCreateInfo->pipelineInfo.enableUberFetchShader = false; - pVbInfo->uberFetchShaderBuffer.bufferSize = 0; - } - } - - if (pSettings->disablePerCompFetch) - { - if (pVbInfo->uberFetchShaderBuffer.requirePerCompFetch) - { - pCreateInfo->pipelineInfo.enableUberFetchShader = false; - pVbInfo->uberFetchShaderBuffer.bufferSize = 0; - } - } - + auto result = pDefaultCompiler->BuildPipelineInternalBufferData(pCreateInfo, + pInternalBufferInfo); return result; } + // ===================================================================================================================== static VkResult BuildExecutablePipelineState( const Device* pDevice, @@ -1848,7 +2078,8 @@ static VkResult BuildExecutablePipelineState( const PipelineLayout* pPipelineLayout, const uint32_t dynamicStateFlags, GraphicsPipelineBinaryCreateInfo* pCreateInfo, - VbInfo* pVbInfo) + VbBindingInfo* pVbInfo, + PipelineInternalBufferInfo* pInternalBufferInfo) { const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(DefaultDeviceIndex); @@ -1908,8 +2139,7 @@ static VkResult BuildExecutablePipelineState( if (pCreateInfo->pipelineInfo.enableUberFetchShader) { - VK_ASSERT(pVbInfo->uberFetchShaderBuffer.userDataOffset > 0); - result = BuildUberFetchShaderInternalData(pDevice, pCreateInfo, pVbInfo); + 
result = BuildPipelineInternalBufferData(pDevice, pCreateInfo, pInternalBufferInfo); } } @@ -1924,7 +2154,8 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( const GraphicsPipelineShaderStageInfo* pShaderInfo, const PipelineLayout* pPipelineLayout, GraphicsPipelineBinaryCreateInfo* pCreateInfo, - VbInfo* pVbInfo) + VbBindingInfo* pVbInfo, + PipelineInternalBufferInfo* pInternalBufferInfo) { VK_ASSERT(pIn != nullptr); @@ -1938,7 +2169,7 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( pIn->pDynamicState ); - BuildVertexInputInterfaceState(pDevice, pIn, dynamicStateFlags, pCreateInfo, &pVbInfo->bindingInfo); + BuildVertexInputInterfaceState(pDevice, pIn, dynamicStateFlags, pCreateInfo, pVbInfo); BuildPreRasterizationShaderState(pDevice, pIn, pShaderInfo, dynamicStateFlags, activeStages, pCreateInfo); @@ -1955,7 +2186,7 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( { result = BuildExecutablePipelineState( - pDevice, pIn, pShaderInfo, pPipelineLayout, dynamicStateFlags, pCreateInfo, pVbInfo); + pDevice, pIn, pShaderInfo, pPipelineLayout, dynamicStateFlags, pCreateInfo, pVbInfo, pInternalBufferInfo); } return result; @@ -2238,11 +2469,12 @@ void PipelineCompiler::FreeComputePipelineCreateInfo( // ===================================================================================================================== // Free the temp memories in graphics pipeline create info void PipelineCompiler::FreeGraphicsPipelineCreateInfo( - GraphicsPipelineBinaryCreateInfo* pCreateInfo) + GraphicsPipelineBinaryCreateInfo* pCreateInfo, + bool keepConvertTempMemory) { auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); - if (pCreateInfo->pTempBuffer != nullptr) + if ((pCreateInfo->pTempBuffer != nullptr) && (keepConvertTempMemory == false)) { pInstance->FreeMem(pCreateInfo->pTempBuffer); pCreateInfo->pTempBuffer = nullptr; @@ -2393,15 +2625,13 @@ void PipelineCompiler::GetGraphicsPipelineCacheId( } // ===================================================================================================================== -VkResult PipelineCompiler::BuildUberFetchShaderInternalData( - PipelineCompilerType compilerType, - const VkPipelineVertexInputStateCreateInfo* pVertexInput, - bool isDynamicStride, - UberFetchShaderBufferInfo* pFetchShaderBufferInfo) +VkResult PipelineCompiler::BuildPipelineInternalBufferData( + GraphicsPipelineBinaryCreateInfo* pCreateInfo, + PipelineInternalBufferInfo* pInternalBufferInfo) { VkResult result = VK_SUCCESS; - if (compilerType == PipelineCompilerTypeLlpc) + if (pCreateInfo->compilerType == PipelineCompilerTypeLlpc) { VK_NOT_IMPLEMENTED; } @@ -2409,4 +2639,23 @@ VkResult PipelineCompiler::BuildUberFetchShaderInternalData( return result; } +// ===================================================================================================================== +void PipelineCompiler::ExecuteDeferCompile( + DeferredCompileWorkload* pWorkload) +{ + auto pThread = m_deferCompileMgr.GetCompileThread(); + if (pThread != nullptr) + { + pThread->AddTask(pWorkload); + } + else + { + pWorkload->Execute(pWorkload->pPayloads); + if (pWorkload->pEvent != nullptr) + { + pWorkload->pEvent->Set(); + } + } +} + } diff --git a/icd/api/strings/entry_points.txt b/icd/api/strings/entry_points.txt index 28b0ffd7..a03242e5 100644 --- a/icd/api/strings/entry_points.txt +++ b/icd/api/strings/entry_points.txt @@ -362,6 +362,8 @@ vkGetPipelineExecutableInternalRepresentationsKHR @device @dext(KHR_pipeli vkCmdSetLineStippleEXT @device @dext(EXT_line_rasterization) 
+vkSetDeviceMemoryPriorityEXT @device @dext(EXT_pageable_device_local_memory) + vkCreatePrivateDataSlotEXT @device @dext(EXT_private_data) vkDestroyPrivateDataSlotEXT @device @dext(EXT_private_data) vkSetPrivateDataEXT @device @dext(EXT_private_data) diff --git a/icd/api/strings/extensions.txt b/icd/api/strings/extensions.txt index d14325c3..4f2afb95 100644 --- a/icd/api/strings/extensions.txt +++ b/icd/api/strings/extensions.txt @@ -100,6 +100,7 @@ VK_EXT_inline_uniform_block VK_EXT_transform_feedback VK_KHR_shader_float16_int8 VK_EXT_memory_priority +VK_EXT_pageable_device_local_memory VK_EXT_memory_budget VK_KHR_depth_stencil_resolve VK_EXT_host_query_reset @@ -115,6 +116,7 @@ VK_EXT_line_rasterization VK_EXT_shader_atomic_float VK_EXT_shader_atomic_float2 VK_KHR_shader_clock +VK_KHR_shader_integer_dot_product VK_KHR_shader_subgroup_extended_types VK_KHR_spirv_1_4 VK_EXT_texel_buffer_alignment @@ -138,6 +140,7 @@ VK_EXT_4444_formats VK_EXT_color_write_enable VK_KHR_shader_terminate_invocation VK_KHR_synchronization2 +VK_EXT_primitive_topology_list_restart VK_EXT_extended_dynamic_state2 VK_KHR_copy_commands2 VK_EXT_ycbcr_image_arrays diff --git a/icd/api/vk_buffer.cpp b/icd/api/vk_buffer.cpp index b556784a..8dc364f5 100644 --- a/icd/api/vk_buffer.cpp +++ b/icd/api/vk_buffer.cpp @@ -43,7 +43,6 @@ namespace vk // ===================================================================================================================== Buffer::Buffer( Device* pDevice, - const VkAllocationCallbacks* pAllocator, const VkBufferCreateInfo* pCreateInfo, Pal::IGpuMemory** pGpuMemory, BufferFlags internalFlags) @@ -72,7 +71,6 @@ Buffer::Buffer( m_perGpu[deviceIdx].gpuVirtAddr = 0; } } - } // ===================================================================================================================== @@ -83,16 +81,15 @@ VkResult Buffer::Create( const VkAllocationCallbacks* pAllocator, VkBuffer* pBuffer) { - void* pMemory = nullptr; - Pal::IGpuMemory* pGpuMemory[MaxPalDevices] = {}; - - Pal::Result palResult = Pal::Result::Success; + Pal::IGpuMemory* pGpuMemory[MaxPalDevices] = {}; + Pal::GpuMemoryCreateInfo gpuMemoryCreateInfo = {}; - size_t apiSize = ObjectSize(pDevice); + VkResult result = VK_SUCCESS; + size_t apiSize = ObjectSize(pDevice); + size_t palMemSize = 0; + bool isSparse = (pCreateInfo->flags & SparseEnablingFlags) != 0; - bool isSparse = (pCreateInfo->flags & SparseEnablingFlags) != 0; - - if (isSparse && (pCreateInfo->size != 0)) + if (isSparse) { // We need virtual remapping support for all sparse resources VK_ASSERT(pDevice->VkPhysicalDevice(DefaultDeviceIndex)->IsVirtualRemappingSupported()); @@ -103,72 +100,68 @@ VkResult Buffer::Create( VK_ASSERT(pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetPrtFeatures() & Pal::PrtFeatureBuffer); } - size_t palMemSize; - Pal::GpuMemoryCreateInfo info = { }; - - info.alignment = pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties(). 
- gpuMemoryProperties.virtualMemAllocGranularity; - info.size = Util::Pow2Align(pCreateInfo->size, info.alignment); - info.flags.u32All = 0; - info.flags.virtualAlloc = 1; - info.flags.globalGpuVa = pDevice->IsGlobalGpuVaEnabled(); - info.heapAccess = Pal::GpuHeapAccess::GpuHeapAccessExplicit; + gpuMemoryCreateInfo.alignment = pDevice->GetProperties().virtualMemAllocGranularity; + gpuMemoryCreateInfo.size = Util::Pow2Align(pCreateInfo->size, gpuMemoryCreateInfo.alignment); + gpuMemoryCreateInfo.flags.virtualAlloc = 1; + gpuMemoryCreateInfo.flags.globalGpuVa = pDevice->IsGlobalGpuVaEnabled(); + gpuMemoryCreateInfo.heapAccess = Pal::GpuHeapAccess::GpuHeapAccessExplicit; // Virtual resource should return 0 on unmapped read if residencyNonResidentStrict is set. if (pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetPrtFeatures() & Pal::PrtFeatureStrictNull) { - info.virtualAccessMode = Pal::VirtualGpuMemAccessMode::ReadZero; + gpuMemoryCreateInfo.virtualAccessMode = Pal::VirtualGpuMemAccessMode::ReadZero; } - palMemSize = pDevice->PalDevice(DefaultDeviceIndex)->GetGpuMemorySize(info, &palResult); + Pal::Result palResult; + + palMemSize = pDevice->PalDevice(DefaultDeviceIndex)->GetGpuMemorySize(gpuMemoryCreateInfo, &palResult); VK_ASSERT(palResult == Pal::Result::Success); + } - // Allocate enough system memory to also store the VA-only memory object - pMemory = pDevice->AllocApiObject( + // Allocate memory for the dispatchable object and for sparse buffers, the VA-only memory object + void* pMemory = pDevice->AllocApiObject( pAllocator, apiSize + (palMemSize * pDevice->NumPalDevices())); - if (pMemory == nullptr) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - void* pPalMemory = Util::VoidPtrInc(pMemory, apiSize); + if (pMemory == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + else if (isSparse) + { + void* pPalMemory = Util::VoidPtrInc(pMemory, apiSize); + Pal::Result palResult = Pal::Result::Success; for (uint32_t deviceIdx = 0; (deviceIdx < pDevice->NumPalDevices()) && (palResult == Pal::Result::Success); deviceIdx++) { + if (deviceIdx != DefaultDeviceIndex) + { + VK_ASSERT(palMemSize == pDevice->PalDevice(deviceIdx)->GetGpuMemorySize(gpuMemoryCreateInfo, + &palResult)); + VK_ASSERT(palResult == Pal::Result::Success); + } + // Create the VA-only memory object needed for sparse buffer support palResult = pDevice->PalDevice(deviceIdx)->CreateGpuMemory( - info, - (uint8_t*)pPalMemory, + gpuMemoryCreateInfo, + pPalMemory, &pGpuMemory[deviceIdx]); pPalMemory = Util::VoidPtrInc(pPalMemory, palMemSize); } - } - else - { - // Allocate memory only for the dispatchable object - pMemory = pDevice->AllocApiObject( - pAllocator, - apiSize); - if (pMemory == nullptr) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } + result = PalToVkResult(palResult); } - if (palResult == Pal::Result::Success) + if (result == VK_SUCCESS) { BufferFlags bufferFlags; CalculateBufferFlags(pDevice, pCreateInfo, &bufferFlags); // Construct API buffer object. 
VK_PLACEMENT_NEW (pMemory) Buffer (pDevice, - pAllocator, pCreateInfo, pGpuMemory, bufferFlags);
@@ -178,7 +171,7 @@ VkResult Buffer::Create( LogBufferCreate(pCreateInfo, *pBuffer, pDevice); } - return PalToVkResult(palResult); + return result; } // =====================================================================================================================
@@ -289,7 +282,6 @@ VkResult Buffer::Destroy( Device* pDevice, const VkAllocationCallbacks* pAllocator) { - Pal::ResourceDestroyEventData data = {}; data.pObj = this;
@@ -378,20 +370,6 @@ void Buffer::GetMemoryRequirements( GetBufferMemoryRequirements(pDevice, &m_internalFlags, m_size, pMemoryRequirements); } -// ===================================================================================================================== -// Get the buffer's memory requirements from VkBufferCreateInfo -void Buffer::CalculateMemoryRequirements( - const Device* pDevice, - const VkBufferCreateInfo* pCreateInfo, - VkMemoryRequirements* pMemoryRequirements) -{ - BufferFlags bufferFlags; - - CalculateBufferFlags(pDevice, pCreateInfo, &bufferFlags); - - GetBufferMemoryRequirements(pDevice, &bufferFlags, pCreateInfo->size, pMemoryRequirements); -} - // ===================================================================================================================== // Get the buffer's memory requirements void Buffer::GetBufferMemoryRequirements(
@@ -569,36 +547,19 @@ VKAPI_ATTR void VKAPI_CALL vkGetBufferMemoryRequirements2( VkMemoryRequirements2* pMemoryRequirements) { const Device* pDevice = ApiDevice::ObjectFromHandle(device); - VK_ASSERT((pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || - pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); - union - { - const VkStructHeader* pHeader; - const VkBufferMemoryRequirementsInfo2* pRequirementsInfo2; - }; - - pRequirementsInfo2 = pInfo; - pHeader = utils::GetExtensionStructure(pHeader, VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2); - VK_ASSERT(pHeader != nullptr); - if (pHeader != nullptr) - { - Buffer* pBuffer = Buffer::ObjectFromHandle(pRequirementsInfo2->buffer); - VkMemoryRequirements* pRequirements = &pMemoryRequirements->memoryRequirements; - pBuffer->GetMemoryRequirements(pDevice, pRequirements); + Buffer* pBuffer = Buffer::ObjectFromHandle(pInfo->buffer); + VkMemoryRequirements* pRequirements = &pMemoryRequirements->memoryRequirements; + pBuffer->GetMemoryRequirements(pDevice, pRequirements); - if (pMemoryRequirements->sType == VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2) - { - VkMemoryDedicatedRequirements* pMemDedicatedRequirements = - static_cast<VkMemoryDedicatedRequirements*>(pMemoryRequirements->pNext); + VkMemoryDedicatedRequirements* pMemDedicatedRequirements = + static_cast<VkMemoryDedicatedRequirements*>(pMemoryRequirements->pNext); - if ((pMemDedicatedRequirements != nullptr) && - (pMemDedicatedRequirements->sType == VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS)) - { - pMemDedicatedRequirements->prefersDedicatedAllocation = pBuffer->DedicatedMemoryRequired(); - pMemDedicatedRequirements->requiresDedicatedAllocation = pBuffer->DedicatedMemoryRequired(); - } - } + if ((pMemDedicatedRequirements != nullptr) && + (pMemDedicatedRequirements->sType == VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS)) + { + pMemDedicatedRequirements->prefersDedicatedAllocation = pBuffer->DedicatedMemoryRequired(); + pMemDedicatedRequirements->requiresDedicatedAllocation = pBuffer->DedicatedMemoryRequired(); } }
diff --git a/icd/api/vk_cmdbuffer.cpp
index 4cd01858..94ae308f 100644 --- a/icd/api/vk_cmdbuffer.cpp +++ b/icd/api/vk_cmdbuffer.cpp
@@ -5363,6 +5363,8 @@ void CmdBuffer::SetSampleLocations( ConvertToPalMsaaQuadSamplePattern(pSampleLocationsInfo, &locations); PalCmdSetMsaaQuadSamplePattern(sampleLocationsPerPixel, locations); + + m_allGpuState.staticTokens.samplePattern = DynamicRenderStateToken; } // =====================================================================================================================
@@ -5520,11 +5522,10 @@ void CmdBuffer::BeginRenderPass( sizeof(SamplePattern) * subpassCount, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)); - memset(m_renderPassInstance.pSamplePatterns, 0, subpassCount * sizeof(SamplePattern)); - if (m_renderPassInstance.pSamplePatterns != nullptr) { m_renderPassInstance.maxSubpassCount = subpassCount; + memset(m_renderPassInstance.pSamplePatterns, 0, subpassCount * sizeof(SamplePattern)); } else {
@@ -5924,7 +5925,7 @@ void CmdBuffer::RPSyncPoint( const uint32_t sampleCount = attachment.pImage->GetImageSamples(); - if (sampleCount > 1) + if (sampleCount > 0) { if (attachment.pImage->IsSampleLocationsCompatibleDepth() && tr.flags.isInitialLayoutTransition)
@@ -5958,6 +5959,27 @@ void CmdBuffer::RPSyncPoint( m_recordingResult = VK_ERROR_OUT_OF_HOST_MEMORY; } + // Construct a dumb transition to sync cache + const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); + if (settings.enableDumbTransitionSync && (barrier.transitionCount == 0) && (rpBarrier.flags.needsGlobalTransition)) + { + if (pPalTransitions == nullptr) + { + pPalTransitions = pVirtStack->AllocArray<Pal::BarrierTransition>(1); + } + + if (pPalTransitions != nullptr) + { + Pal::BarrierTransition *pDumbTransition = &pPalTransitions[0]; + pDumbTransition->srcCacheMask = 0; + pDumbTransition->dstCacheMask = 0; + pDumbTransition->imageInfo.pImage = nullptr; + + barrier.transitionCount = 1; + barrier.pTransitions = pDumbTransition; + } + } + // Execute the barrier if it actually did anything if ((barrier.waitPoint != Pal::HwPipeBottom) || (barrier.transitionCount > 0) ||
@@ -6531,8 +6553,8 @@ void CmdBuffer::WritePushConstants( // pipeline layout (e.g. at the top of the command buffer) and this register write will be redundant because // a future vkCmdBindPipeline will reprogram the user data registers during the rebase.
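The guard that follows implements this fast path: the register write is issued immediately only when the same pipeline binding owns the registers, the push-constant base matches, and the bound layout's register count covers the full updated range. A minimal sketch of the predicate, with a hypothetical UserDataLayout mirroring just the two fields consulted here:

    #include <cstdint>

    struct UserDataLayout   // hypothetical: only the two fields this check uses
    {
        uint32_t pushConstRegBase;
        uint32_t pushConstRegCount;
    };

    // Issue the user-data write now only if the bound layout fully covers
    // [startInDwords, startInDwords + lengthInDwords).
    static bool CanWritePushConstantsNow(
        const UserDataLayout& bound,
        const UserDataLayout& incoming,
        uint32_t              startInDwords,
        uint32_t              lengthInDwords)
    {
        return (bound.pushConstRegBase  == incoming.pushConstRegBase) &&
               (bound.pushConstRegCount >= (startInDwords + lengthInDwords));
    }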
if (PalPipelineBindingOwnedBy(palBindPoint, apiBindPoint) && - pBindState->userDataLayout.pushConstRegBase == userDataLayout.pushConstRegBase && - pBindState->userDataLayout.pushConstRegCount >= startInDwords + lengthInDwords) + (pBindState->userDataLayout.pushConstRegBase == userDataLayout.pushConstRegBase) && + (pBindState->userDataLayout.pushConstRegCount >= (startInDwords + lengthInDwords))) { utils::IterateMask deviceGroup(m_curDeviceMask); do
@@ -7151,24 +7173,6 @@ void CmdBuffer::DrawIndirectByteCount( while (deviceGroup.IterateNext()); } -// ===================================================================================================================== -void CmdBuffer::SetLineStippleEXT( - const Pal::LineStippleStateParams& params, - uint32_t staticToken) -{ - m_allGpuState.lineStipple = params; - - utils::IterateMask deviceGroup(m_cbBeginDeviceMask); - do - { - const uint32_t deviceIdx = deviceGroup.Index(); - PalCmdBuffer(deviceIdx)->CmdSetLineStippleState(m_allGpuState.lineStipple); - } - while (deviceGroup.IterateNext()); - - m_allGpuState.staticTokens.lineStippleState = staticToken; -} - // ===================================================================================================================== void CmdBuffer::SetLineStippleEXT( uint32_t lineStippleFactor,
diff --git a/icd/api/vk_conv.cpp b/icd/api/vk_conv.cpp index 95a59797..01a4e8f0 100644 --- a/icd/api/vk_conv.cpp +++ b/icd/api/vk_conv.cpp
@@ -1056,6 +1056,8 @@ uint32_t GetBufferSrdFormatInfo( } else { + VK_ASSERT(pPhysicalDevice->PalProperties().gfxipProperties.srdSizes.bufferView == 4 * sizeof(uint32_t)); + uint32_t result[4] = {}; Pal::BufferViewInfo bufferInfo = {}; bufferInfo.gpuAddr = 0x300000000ull;
@@ -1063,20 +1065,23 @@ uint32_t GetBufferSrdFormatInfo( bufferInfo.range = UINT32_MAX; bufferInfo.stride = Pal::Formats::BytesPerPixel(swizzledFormat.format); pPhysicalDevice->PalDevice()->CreateTypedBufferViewSrds(1, &bufferInfo, result); + + // NOTE: Currently, all buffer format info is stored in the fourth DWORD of the buffer SRD. Please update + // both BilVertexFetchManager::IssueUberFetchInst and UberFetchShaderFormatInfo if this ever changes. return result[3]; } } -#define INIT_UBER_FORMATINFO(vkFmt, palFmt, unpckedPalFmt, packed, fixed, compCnt, compSize, align) { \ +#define INIT_UBER_FORMATINFO(vkFmt, palFmt, unpackedPalFmt, packed, fixed, compCnt, compSize, align) { \ UberFetchShaderFormatInfo fmtInfo = {}; \ fmtInfo.swizzledFormat = palFmt; \ - fmtInfo.unpackedFormat = unpckedPalFmt; \ + fmtInfo.unpackedFormat = unpackedPalFmt; \ fmtInfo.isPacked = packed; \ fmtInfo.isFixed = fixed; \ fmtInfo.componentCount = compCnt; \ fmtInfo.componentSize = compSize; \ fmtInfo.bufferFormat = GetBufferSrdFormatInfo(pPhysicalDevice, palFmt); \ - fmtInfo.unpackedBufferFormat = GetBufferSrdFormatInfo(pPhysicalDevice, unpckedPalFmt); \ + fmtInfo.unpackedBufferFormat = GetBufferSrdFormatInfo(pPhysicalDevice, unpackedPalFmt); \ fmtInfo.alignment = align; \ pFormatInfoMap->Insert(VK_FORMAT_##vkFmt, fmtInfo); }
@@ -1290,6 +1295,29 @@ VkResult InitializeUberFetchShaderFormatTable( INIT_UBER_FORMATINFO(UNDEFINED, PalFmt_Undefined, PalFmt_Undefined, 0, 0, 0, 0, 4); + // The OOB flag is in buffer dword3 on Gfx10+, and its value differs when the stride is 0. + // To avoid accessing the exact bit in the buffer SRD, we create the untyped buffer view twice with + // different strides and record the modified bits.
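The code below realizes this by building the same untyped buffer view twice and XOR-ing dword3 of the two SRDs; a self-contained sketch of the trick (illustrative helper names, not PAL APIs):

    #include <cstdint>

    // Bits of SRD dword3 that differ between the stride != 0 and stride == 0 encodings.
    static uint32_t StrideDependentBits(uint32_t srdDword3WithStride, uint32_t srdDword3ZeroStride)
    {
        return srdDword3WithStride ^ srdDword3ZeroStride;
    }

    // Toggle exactly those bits to turn a cached buffer-format word into its zero-stride variant.
    static uint32_t ApplyZeroStride(uint32_t bufferFormat, uint32_t strideMask)
    {
        return bufferFormat ^ strideMask;
    }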
+ + VK_ASSERT(pPhysicalDevice->PalProperties().gfxipProperties.srdSizes.bufferView == 4 * sizeof(uint32_t)); + + uint32_t defaultSrd[4] = {}; + uint32_t zeroStrideSrd[4] = {}; + Pal::BufferViewInfo bufferInfo = {}; + bufferInfo.gpuAddr = 0x300000000ull; + bufferInfo.swizzledFormat = PalFmt_RGBA(32, 32, 32, 32, Float); + bufferInfo.range = UINT32_MAX; + + // Build SRD with non-zero stride + bufferInfo.stride = 16; + pPhysicalDevice->PalDevice()->CreateUntypedBufferViewSrds(1, &bufferInfo, defaultSrd); + + // Build SRD with zero stride + bufferInfo.stride = 0; + pPhysicalDevice->PalDevice()->CreateUntypedBufferViewSrds(1, &bufferInfo, zeroStrideSrd); + + // Save the modified bits in buffer SRD + pFormatInfoMap->SetBufferFormatMask(defaultSrd[3] ^ zeroStrideSrd[3]); return VK_SUCCESS; } #undef INIT_UBER_FORMATINFO
@@ -1297,11 +1325,23 @@ VkResult InitializeUberFetchShaderFormatTable( // ===================================================================================================================== UberFetchShaderFormatInfo GetUberFetchShaderFormatInfo( UberFetchShaderFormatInfoMap* pFormatInfoMap, - VkFormat vkFormat) + VkFormat vkFormat, + bool isZeroStride) { - UberFetchShaderFormatInfo dummyInfo = {}; + UberFetchShaderFormatInfo formatInfo = {}; auto pFormatInfo = pFormatInfoMap->FindKey(vkFormat); - return (pFormatInfo == nullptr) ? dummyInfo : *pFormatInfo; + if (pFormatInfo != nullptr) + { + formatInfo = *pFormatInfo; + if (isZeroStride) + { + // Apply zero stride modified bits, which are calculated in UberFetchShaderFormatInfoMap initialization. + formatInfo.bufferFormat = formatInfo.bufferFormat ^ pFormatInfoMap->GetBufferFormatMask(); + formatInfo.unpackedBufferFormat = formatInfo.unpackedBufferFormat ^ pFormatInfoMap->GetBufferFormatMask(); + } + } + + return formatInfo; } } // namespace vk
diff --git a/icd/api/vk_descriptor_pool.cpp b/icd/api/vk_descriptor_pool.cpp index fa09fed9..30652c00 100644 --- a/icd/api/vk_descriptor_pool.cpp +++ b/icd/api/vk_descriptor_pool.cpp
@@ -347,6 +347,10 @@ VkResult DescriptorPool::AllocDescriptorSets( setGpuMemOffset, m_addresses, pSetAllocHandle); + if (m_pDevice->MustWriteImmutableSamplers()) + { + pSet->WriteImmutableSamplers(); + } } else {
@@ -446,10 +450,21 @@ VkResult DescriptorGpuMemHeap::Init( bool oneShot = (m_usage & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT) == 0; - for (uint32_t i = 0; i < count; ++i) + if (pDevice->GetRuntimeSettings().pipelineLayoutMode == PipelineLayoutAngle) + { + for (uint32_t i = 0; i < count; ++i) + { + m_gpuMemSize += AngleDescPattern::DescriptorSetBindingStride * sizeof(uint32_t) * + pTypeCount[i].descriptorCount; + } + } + else { - m_gpuMemSize += DescriptorSetLayout::GetSingleDescStaticSize(pDevice, pTypeCount[i].type) * - pTypeCount[i].descriptorCount; + for (uint32_t i = 0; i < count; ++i) + { + m_gpuMemSize += DescriptorSetLayout::GetSingleDescStaticSize(pDevice, pTypeCount[i].type) * + pTypeCount[i].descriptorCount; + } } m_gpuMemAddrAlignment = pDevice->GetProperties().descriptorSizes.alignment;
diff --git a/icd/api/vk_descriptor_set.cpp b/icd/api/vk_descriptor_set.cpp index 99d6486a..81de6a4c 100644 --- a/icd/api/vk_descriptor_set.cpp +++ b/icd/api/vk_descriptor_set.cpp
@@ -89,6 +89,27 @@ void DescriptorSet::Reassign( } +// ===================================================================================================================== +// Writes the immutable samplers in the layout to memory.
+template <uint32_t numPalDevices> +void DescriptorSet<numPalDevices>::WriteImmutableSamplers() +{ + for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) + { + for (uint32_t bindingIndex = 0; bindingIndex < Layout()->Info().count; ++bindingIndex) + { + const DescriptorSetLayout::BindingInfo& bindingInfo = Layout()->Binding(bindingIndex); + + if (bindingInfo.imm.dwSize != 0) + { + uint32_t* pSamplerDesc = Layout()->Info().imm.pImmutableSamplerData + bindingInfo.imm.dwOffset; + uint32_t* pDestAddr = StaticCpuAddress(deviceIdx) + Layout()->GetDstStaOffset(bindingInfo, 0); + memcpy(pDestAddr, pSamplerDesc, sizeof(uint32)*bindingInfo.imm.dwSize); + } + } + } +} + // ===================================================================================================================== // Resets a DescriptorSet to an initial state template <uint32_t numPalDevices>
@@ -1048,6 +1069,9 @@ void DescriptorSet<1>::Reassign( template void DescriptorSet<1>::Reset(); +template +void DescriptorSet<1>::WriteImmutableSamplers(); + template DescriptorSet<2>::DescriptorSet(uint32_t heapIndex);
@@ -1061,6 +1085,9 @@ void DescriptorSet<2>::Reassign( template void DescriptorSet<2>::Reset(); +template +void DescriptorSet<2>::WriteImmutableSamplers(); + template DescriptorSet<3>::DescriptorSet(uint32_t heapIndex);
@@ -1074,6 +1101,9 @@ void DescriptorSet<3>::Reassign( template void DescriptorSet<3>::Reset(); +template +void DescriptorSet<3>::WriteImmutableSamplers(); + template DescriptorSet<4>::DescriptorSet(uint32_t heapIndex);
@@ -1087,4 +1117,7 @@ void DescriptorSet<4>::Reassign( template void DescriptorSet<4>::Reset(); +template +void DescriptorSet<4>::WriteImmutableSamplers(); + } // namespace vk
diff --git a/icd/api/vk_descriptor_set_layout.cpp b/icd/api/vk_descriptor_set_layout.cpp index ddf32725..b0f75dbf 100644 --- a/icd/api/vk_descriptor_set_layout.cpp +++ b/icd/api/vk_descriptor_set_layout.cpp
@@ -308,7 +308,7 @@ void DescriptorSetLayout::ConvertBindingInfo( { // Dword offset to this binding - pBindingSectionInfo->dwOffset = Util::Pow2Align(pSectionInfo->dwSize, descAlignmentInDw); + pBindingSectionInfo->dwOffset = Util::RoundUpToMultiple(pSectionInfo->dwSize, descAlignmentInDw); if (pBindingInfo->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
@@ -332,7 +332,7 @@ void DescriptorSetLayout::ConvertBindingInfo( if (pBindingSectionInfo->dwSize > 0) { // Update total section size by how much space this binding takes. - pSectionInfo->dwSize += pBindingSectionInfo->dwSize; + pSectionInfo->dwSize = pBindingSectionInfo->dwOffset + pBindingSectionInfo->dwSize; // Update total number of ResourceMappingNodes required by this binding. pSectionInfo->numRsrcMapNodes++;
@@ -530,7 +535,12 @@ VkResult DescriptorSetLayout::ConvertCreateInfo( // Determine the alignment requirement of descriptors in dwords.
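One note before the alignment computation that follows: the ConvertBindingInfo hunk above swaps Util::Pow2Align for Util::RoundUpToMultiple, presumably because the Angle binding stride introduced below is only asserted to be a multiple of the hardware alignment, not a power of two. A quick standalone illustration (a local re-implementation for demonstration, not the PAL utility itself):

    #include <cassert>
    #include <cstdint>

    // Round value up to the next multiple of 'multiple'; valid for any non-zero multiple,
    // whereas a Pow2Align-style mask trick is only correct for power-of-two alignments.
    static uint32_t RoundUpToMultiple(uint32_t value, uint32_t multiple)
    {
        return ((value + multiple - 1) / multiple) * multiple;
    }

    int main()
    {
        assert(RoundUpToMultiple(17, 12) == 24); // non-power-of-two stride handled correctly
        assert(RoundUpToMultiple(17, 16) == 32); // matches Pow2Align in the power-of-two case
        return 0;
    }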
uint32 descAlignmentInDw = pDevice->GetProperties().descriptorSizes.alignment / sizeof(uint32); - + uint32_t staDescAlignmentInDw = descAlignmentInDw; + if (pDevice->GetRuntimeSettings().pipelineLayoutMode == PipelineLayoutAngle) + { + VK_ASSERT(AngleDescPattern::DescriptorSetBindingStride % descAlignmentInDw == 0); + staDescAlignmentInDw = AngleDescPattern::DescriptorSetBindingStride; + } // If the last binding has the VARIABLE_DESCRIPTOR_COUNT_BIT set, write the varDescDwStride if ((bindingNumber == (pOut->count - 1)) && pBinding->bindingFlags.variableDescriptorCount) { @@ -541,7 +546,7 @@ VkResult DescriptorSetLayout::ConvertCreateInfo( ConvertBindingInfo( &pBinding->info, GetDescStaticSectionDwSize(pDevice, &pBinding->info, pBinding->bindingFlags), - descAlignmentInDw, + staDescAlignmentInDw, &pOut->sta, &pBinding->sta); diff --git a/icd/api/vk_device.cpp b/icd/api/vk_device.cpp index 3a26e103..5141fa1a 100644 --- a/icd/api/vk_device.cpp +++ b/icd/api/vk_device.cpp @@ -71,8 +71,6 @@ #include "sqtt/sqtt_mgr.h" #include "sqtt/sqtt_rgp_annotations.h" -#include "appopt/async_layer.h" - #if VKI_GPU_DECOMPRESS #include "appopt/gpu_decode_layer.h" #endif @@ -256,7 +254,6 @@ Device::Device( m_enabledExtensions(enabledExtensions), m_dispatchTable(DispatchTable::Type::DEVICE, m_pInstance, this), m_pSqttMgr(nullptr), - m_pAsyncLayer(nullptr), m_pAppOptLayer(nullptr), m_pBarrierFilterLayer(nullptr), #if VKI_GPU_DECOMPRESS @@ -264,6 +261,7 @@ Device::Device( #endif m_allocationSizeTracking(m_settings.memoryDeviceOverallocationAllowed ? false : true), m_useComputeAsTransferQueue(useComputeAsTransferQueue), + m_useUniversalAsComputeQueue(pPhysicalDevices[DefaultDeviceIndex]->GetRuntimeSettings().useUniversalAsComputeQueue), m_useGlobalGpuVa(false) , m_pBorderColorUsedIndexes(nullptr) { @@ -312,6 +310,15 @@ Device::Device( m_enabledFeatures.robustBufferAccess = false; } + if (RuntimeSettings().enableRelocatableShaders) + { + m_enabledFeatures.mustWriteImmutableSamplers = true; + } + else + { + m_enabledFeatures.mustWriteImmutableSamplers = false; + } + m_enabledFeatures.scalarBlockLayout = false; m_enabledFeatures.attachmentFragmentShadingRate = false; @@ -356,6 +363,11 @@ static void ConstructQueueCreateInfo( { palQueueType = Pal::QueueType::QueueTypeCompute; } + else if ((palQueueType == Pal::QueueType::QueueTypeCompute) && + (pPhysicalDevices[deviceIdx]->GetRuntimeSettings().useUniversalAsComputeQueue)) + { + palQueueType = Pal::QueueType::QueueTypeUniversal; + } pQueueCreateInfo->tmzOnly = isTmzQueue; @@ -384,8 +396,14 @@ static void ConstructQueueCreateInfo( } else { - pQueueCreateInfo->engineType = - pPhysicalDevices[deviceIdx]->GetQueueFamilyPalEngineType(queueFamilyIndex); + if (palQueueType == Pal::QueueType::QueueTypeUniversal) + { + pQueueCreateInfo->engineType = Pal::EngineType::EngineTypeUniversal; + } + else + { + pQueueCreateInfo->engineType = pPhysicalDevices[deviceIdx]->GetQueueFamilyPalEngineType(queueFamilyIndex); + } if (palQueueType == Pal::QueueType::QueueTypeUniversal) { @@ -467,6 +485,7 @@ VkResult Device::Create( bool scalarBlockLayoutEnabled = false; ExtendedRobustness extendedRobustnessEnabled = { false, false, false }; bool attachmentFragmentShadingRate = false; + bool pageableDeviceLocalMemory = false; uint32 privateDataSlotRequestCount = 0; bool privateDataEnabled = false; @@ -516,7 +535,8 @@ VkResult Device::Create( { VK_ASSERT(pCreateInfo->pEnabledFeatures == nullptr); - // If present, VkPhysicalDeviceFeatures2 controls which features are enabled instead of 
pEnabledFeatures + // If present, VkPhysicalDeviceFeatures2 controls which features are enabled + // instead of pEnabledFeatures. pEnabledFeatures = &reinterpret_cast<const VkPhysicalDeviceFeatures2*>(pHeader)->features; break;
@@ -534,7 +554,19 @@ VkResult Device::Create( case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES: { - if (reinterpret_cast<const VkPhysicalDeviceBufferDeviceAddressFeatures*>(pHeader)->bufferDeviceAddressMultiDevice) + if (reinterpret_cast<const VkPhysicalDeviceBufferDeviceAddressFeatures*>( + pHeader)->bufferDeviceAddressMultiDevice) + { + bufferDeviceAddressMultiDeviceEnabled = true; + } + + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_EXT: + { + if (reinterpret_cast<const VkPhysicalDeviceBufferDeviceAddressFeaturesEXT*>( + pHeader)->bufferDeviceAddressMultiDevice) { bufferDeviceAddressMultiDeviceEnabled = true; }
@@ -562,8 +594,8 @@ VkResult Device::Create( { deviceCoherentMemoryEnabled = enabledDeviceExtensions.IsExtensionEnabled( - DeviceExtensions::AMD_DEVICE_COHERENT_MEMORY) && - reinterpret_cast<const VkPhysicalDeviceCoherentMemoryFeaturesAMD*>(pHeader)->deviceCoherentMemory; + DeviceExtensions::AMD_DEVICE_COHERENT_MEMORY) && + reinterpret_cast<const VkPhysicalDeviceCoherentMemoryFeaturesAMD*>(pHeader)->deviceCoherentMemory; break; }
@@ -582,7 +614,8 @@ VkResult Device::Create( case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_FEATURES_KHR: { - if (reinterpret_cast<const VkPhysicalDeviceFragmentShadingRateFeaturesKHR*>(pHeader)->attachmentFragmentShadingRate) + if (reinterpret_cast<const VkPhysicalDeviceFragmentShadingRateFeaturesKHR*>( + pHeader)->attachmentFragmentShadingRate) { attachmentFragmentShadingRate = true; }
@@ -612,7 +645,8 @@ VkResult Device::Create( case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIVATE_DATA_FEATURES_EXT: { - privateDataEnabled = reinterpret_cast<const VkPhysicalDevicePrivateDataFeaturesEXT*>(pHeader)->privateData; + privateDataEnabled = reinterpret_cast<const VkPhysicalDevicePrivateDataFeaturesEXT*>( + pHeader)->privateData; break; }
@@ -627,6 +661,17 @@ VkResult Device::Create( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PAGEABLE_DEVICE_LOCAL_MEMORY_FEATURES_EXT: + { + if (reinterpret_cast<const VkPhysicalDevicePageableDeviceLocalMemoryFeaturesEXT*>( + pHeader)->pageableDeviceLocalMemory) + { + pageableDeviceLocalMemory = true; + } + + break; + } + default: break; }
@@ -1052,7 +1097,8 @@ VkResult Device::Create( attachmentFragmentShadingRate, scalarBlockLayoutEnabled, extendedRobustnessEnabled, - bufferDeviceAddressMultiDeviceEnabled); + bufferDeviceAddressMultiDeviceEnabled, + pageableDeviceLocalMemory); // If we've failed to Initialize, make sure we destroy anything we might have allocated. if (vkResult != VK_SUCCESS)
@@ -1086,7 +1132,8 @@ VkResult Device::Initialize( const bool attachmentFragmentShadingRate, bool scalarBlockLayoutEnabled, const ExtendedRobustness& extendedRobustnessEnabled, - bool bufferDeviceAddressMultiDeviceEnabled) + bool bufferDeviceAddressMultiDeviceEnabled, + bool pageableDeviceLocalMemory) { // Initialize the internal memory manager VkResult result = m_internalMemMgr.Init();
@@ -1198,6 +1245,12 @@ VkResult Device::Initialize( m_enabledFeatures.robustImageAccessExtended = extendedRobustnessEnabled.robustImageAccess; m_enabledFeatures.nullDescriptorExtended = extendedRobustnessEnabled.nullDescriptor; + if (IsExtensionEnabled(DeviceExtensions::EXT_MEMORY_PRIORITY) || + (IsExtensionEnabled(DeviceExtensions::EXT_PAGEABLE_DEVICE_LOCAL_MEMORY) && pageableDeviceLocalMemory)) + { + m_enabledFeatures.appControlledMemPriority = true; + } + // If VkPhysicalDeviceBufferDeviceAddressFeaturesEXT.bufferDeviceAddressMultiDevice is enabled // and if globalGpuVaSupport is supported and if multiple devices are used set the global GpuVa.
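All of the feature blocks handled in the Device::Create switch above are discovered by walking the VkDeviceCreateInfo pNext chain before the reinterpret_cast to the concrete type; a minimal sketch of that lookup pattern, assuming only the standard Vulkan headers:

    #include <vulkan/vulkan.h>

    // Walk a pNext chain and return the first structure whose sType matches,
    // or nullptr if the chain does not carry it.
    static const VkBaseInStructure* FindFeatureStruct(const void* pNext, VkStructureType sType)
    {
        for (const VkBaseInStructure* pHeader = static_cast<const VkBaseInStructure*>(pNext);
             pHeader != nullptr;
             pHeader = pHeader->pNext)
        {
            if (pHeader->sType == sType)
            {
                return pHeader;
            }
        }
        return nullptr;
    }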
m_useGlobalGpuVa = (bufferDeviceAddressMultiDeviceEnabled && @@ -1331,20 +1384,6 @@ VkResult Device::Initialize( } #endif - if ((result == VK_SUCCESS) && m_settings.enableAsyncCompile) - { - void* pMemory = VkInstance()->AllocMem(sizeof(AsyncLayer), VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (pMemory != nullptr) - { - m_pAsyncLayer = VK_PLACEMENT_NEW(pMemory) AsyncLayer(this); - } - else - { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - } - } - const Pal::DeviceProperties& palProps = pPhysicalDevice->PalProperties(); if (result == VK_SUCCESS) @@ -1401,6 +1440,13 @@ VkResult Device::Initialize( break; } } + else if (enabled.IsExtensionEnabled(DeviceExtensions::ExtensionId::EXT_PAGEABLE_DEVICE_LOCAL_MEMORY) && + pageableDeviceLocalMemory) + { + // Add back-up heaps for device-local heaps + m_overallocationRequestedForPalHeap[Pal::GpuHeap::GpuHeapInvisible] = true; + m_overallocationRequestedForPalHeap[Pal::GpuHeap::GpuHeapLocal] = true; + } else if ((m_settings.overrideHeapChoiceToLocal != 0) && (palProps.gpuType == Pal::GpuType::Discrete)) { // This setting utilizes overallocation behavior's heap size tracking. Overallocation to the local @@ -1484,12 +1530,6 @@ void Device::InitDispatchTable() m_pBarrierFilterLayer->OverrideDispatchTable(&m_dispatchTable); } - // Install the async compile layer if needed - if (m_pAsyncLayer != nullptr) - { - m_pAsyncLayer->OverrideDispatchTable(&m_dispatchTable); - } - #if VKI_GPU_DECOMPRESS if (m_pGpuDecoderLayer != nullptr) { @@ -1715,13 +1755,6 @@ VkResult Device::Destroy(const VkAllocationCallbacks* pAllocator) VkInstance()->FreeMem(m_pAppOptLayer); } - if (m_pAsyncLayer != nullptr) - { - Util::Destructor(m_pAsyncLayer); - - VkInstance()->FreeMem(m_pAsyncLayer); - } - #if VKI_GPU_DECOMPRESS if (m_pGpuDecoderLayer != nullptr) { @@ -1809,6 +1842,11 @@ Pal::QueueType Device::GetQueueFamilyPalQueueType( palQueueType = Pal::QueueType::QueueTypeCompute; } + else if ((palQueueType == Pal::QueueType::QueueTypeCompute) && m_useUniversalAsComputeQueue) + { + palQueueType = Pal::QueueType::QueueTypeUniversal; + } + return palQueueType; } @@ -1823,6 +1861,11 @@ Pal::EngineType Device::GetQueueFamilyPalEngineType( palEngineType = Pal::EngineType::EngineTypeCompute; } + else if ((palEngineType == Pal::EngineType::EngineTypeCompute) && m_useUniversalAsComputeQueue) + { + palEngineType = Pal::EngineType::EngineTypeUniversal; + } + return palEngineType; } @@ -4187,6 +4230,16 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetMemoryHostPointerPropertiesEXT( return result; } +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkSetDeviceMemoryPriorityEXT( + VkDevice device, + VkDeviceMemory memory, + float priority) +{ + Memory* pMemory = Memory::ObjectFromHandle(memory); + pMemory->SetPriority(MemoryPriority::FromVkMemoryPriority(priority), false); +} + } // entry } // vk diff --git a/icd/api/vk_dispatch.cpp b/icd/api/vk_dispatch.cpp index b04411a7..56cbf30d 100644 --- a/icd/api/vk_dispatch.cpp +++ b/icd/api/vk_dispatch.cpp @@ -591,6 +591,8 @@ void DispatchTable::Init() vkResetQueryPool ); INIT_DISPATCH_ENTRY(vkCmdSetLineStippleEXT ); + INIT_DISPATCH_ENTRY(vkSetDeviceMemoryPriorityEXT ); + INIT_DISPATCH_ENTRY(vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ); INIT_DISPATCH_ENTRY(vkGetCalibratedTimestampsEXT ); diff --git a/icd/api/vk_graphics_pipeline.cpp b/icd/api/vk_graphics_pipeline.cpp index 3051e680..253ceb86 100644 --- a/icd/api/vk_graphics_pipeline.cpp +++ 
b/icd/api/vk_graphics_pipeline.cpp @@ -92,7 +92,7 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( for (uint32_t i = 0; (result == VK_SUCCESS) && (i < numPalDevices) ; ++i) { - if (i == DefaultDeviceIndex) + if ((i == DefaultDeviceIndex) || (pCreateInfo == nullptr)) { result = pDevice->GetCompiler(i)->CreateGraphicsPipelineBinary( pDevice, @@ -106,9 +106,10 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( else { GraphicsPipelineBinaryCreateInfo binaryCreateInfoMGPU = {}; - VbInfo vbInfoMGPU = {}; + VbBindingInfo vbInfoMGPU = {}; + PipelineInternalBufferInfo internalBufferInfoMGPU = {}; pDefaultCompiler->ConvertGraphicsPipelineInfo( - pDevice, pCreateInfo, pShaderInfo, pPipelineLayout, &binaryCreateInfoMGPU, &vbInfoMGPU); + pDevice, pCreateInfo, pShaderInfo, pPipelineLayout, &binaryCreateInfoMGPU, &vbInfoMGPU, &internalBufferInfoMGPU); result = pDevice->GetCompiler(i)->CreateGraphicsPipelineBinary( pDevice, @@ -129,13 +130,99 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( binaryCreateInfoMGPU.stageFeedback); } - pDefaultCompiler->FreeGraphicsPipelineCreateInfo(&binaryCreateInfoMGPU); + pDefaultCompiler->FreeGraphicsPipelineCreateInfo(&binaryCreateInfoMGPU, false); } } return result; } +// ===================================================================================================================== +// Creates graphics PAL pipeline objects +VkResult GraphicsPipeline::CreatePalPipelineObjects( + Device* pDevice, + PipelineCache* pPipelineCache, + GraphicsPipelineObjectCreateInfo* pObjectCreateInfo, + const size_t* pPipelineBinarySizes, + const void** pPipelineBinaries, + const Util::MetroHash::Hash* pCacheIds, + void* pSystemMem, + Pal::IPipeline** pPalPipeline) +{ + size_t palSize = 0; + + pObjectCreateInfo->pipeline.pipelineBinarySize = pPipelineBinarySizes[DefaultDeviceIndex]; + pObjectCreateInfo->pipeline.pPipelineBinary = pPipelineBinaries[DefaultDeviceIndex]; + + Pal::Result palResult = Pal::Result::Success; + palSize = + pDevice->PalDevice(DefaultDeviceIndex)->GetGraphicsPipelineSize(pObjectCreateInfo->pipeline, &palResult); + VK_ASSERT(palResult == Pal::Result::Success); + + RenderStateCache* pRSCache = pDevice->GetRenderStateCache(); + const uint32_t numPalDevices = pDevice->NumPalDevices(); + size_t palOffset = 0; + + for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) + { + Pal::IDevice* pPalDevice = pDevice->PalDevice(deviceIdx); + + if (palResult == Pal::Result::Success) + { + // If pPipelineBinaries[DefaultDeviceIndex] is sufficient for all devices, the other pipeline binaries + // won't be created. Otherwise, like if gl_DeviceIndex is used, they will be. + if (pPipelineBinaries[deviceIdx] != nullptr) + { + pObjectCreateInfo->pipeline.pipelineBinarySize = pPipelineBinarySizes[deviceIdx]; + pObjectCreateInfo->pipeline.pPipelineBinary = pPipelineBinaries[deviceIdx]; + } + + palResult = pPalDevice->CreateGraphicsPipeline( + pObjectCreateInfo->pipeline, + Util::VoidPtrInc(pSystemMem, palOffset), + &pPalPipeline[deviceIdx]); + +#if ICD_GPUOPEN_DEVMODE_BUILD + // Temporarily reinject post Pal pipeline creation (when the internal pipeline hash is available). + // The reinjection cache layer can be linked back into the pipeline cache chain once the + // Vulkan pipeline cache key can be stored (and read back) inside the ELF as metadata. 
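For the devmode-gated block that follows, the control flow is: create the pipeline, use its internal hash to look up a replacement binary, and only on a hit destroy and recreate; a NotFound lookup keeps the original. A toy model of that sequence (generic callables stand in for the PAL and compiler entry points):

    #include <cstdint>

    enum class Result { Success, NotFound, ErrorUnknown };

    // 'create' builds the pipeline, 'destroy' tears it down, and 'lookup' searches the
    // reinjection cache by internal pipeline hash; a hit triggers a rebuild.
    template <typename CreateFn, typename DestroyFn, typename LookupFn>
    Result CreateWithReinjection(CreateFn create, DestroyFn destroy, LookupFn lookup, uint64_t internalHash)
    {
        Result result = create();

        if (result == Result::Success)
        {
            const Result found = lookup(internalHash);

            if (found == Result::Success)
            {
                destroy();         // replacement found: rebuild from the injected binary
                result = create();
            }
            else if (found != Result::NotFound)
            {
                result = found;    // propagate real errors; NotFound keeps the original
            }
        }

        return result;
    }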
+ if ((pDevice->VkInstance()->GetDevModeMgr() != nullptr) && + (palResult == Util::Result::Success)) + { + const auto& info = pPalPipeline[deviceIdx]->GetInfo(); + + palResult = pDevice->GetCompiler(deviceIdx)->RegisterAndLoadReinjectionBinary( + &info.internalPipelineHash, + &pCacheIds[deviceIdx], + &pObjectCreateInfo->pipeline.pipelineBinarySize, + &pObjectCreateInfo->pipeline.pPipelineBinary, + pPipelineCache); + + if (palResult == Util::Result::Success) + { + pPalPipeline[deviceIdx]->Destroy(); + + palResult = pPalDevice->CreateGraphicsPipeline( + pObjectCreateInfo->pipeline, + Util::VoidPtrInc(pSystemMem, palOffset), + &pPalPipeline[deviceIdx]); + } + else if (palResult == Util::Result::NotFound) + { + // If a replacement was not found, proceed with the original + palResult = Util::Result::Success; + } + } +#endif + + VK_ASSERT(palSize == pPalDevice->GetGraphicsPipelineSize(pObjectCreateInfo->pipeline, nullptr)); + palOffset += palSize; + } + } + + return PalToVkResult(palResult); +} + // ===================================================================================================================== // Create graphics pipeline objects VkResult GraphicsPipeline::CreatePipelineObjects( @@ -143,7 +230,8 @@ VkResult GraphicsPipeline::CreatePipelineObjects( const VkGraphicsPipelineCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, const PipelineLayout* pPipelineLayout, - const VbInfo* pVbInfo, + const VbBindingInfo* pVbInfo, + const PipelineInternalBufferInfo* pInternalBuffer, const size_t* pPipelineBinarySizes, const void** pPipelineBinaries, PipelineCache* pPipelineCache, @@ -170,16 +258,13 @@ VkResult GraphicsPipeline::CreatePipelineObjects( void* pSystemMem = nullptr; size_t palSize = 0; - pObjectCreateInfo->pipeline.pipelineBinarySize = pPipelineBinarySizes[DefaultDeviceIndex]; - pObjectCreateInfo->pipeline.pPipelineBinary = pPipelineBinaries[DefaultDeviceIndex]; - palSize = pDevice->PalDevice(DefaultDeviceIndex)->GetGraphicsPipelineSize(pObjectCreateInfo->pipeline, &palResult); VK_ASSERT(palResult == Pal::Result::Success); pSystemMem = pDevice->AllocApiObject( pAllocator, - sizeof(GraphicsPipeline) + (palSize * numPalDevices)); + sizeof(GraphicsPipeline) + (palSize * numPalDevices) + pInternalBuffer->dataSize); if (pSystemMem == nullptr) { @@ -190,64 +275,22 @@ VkResult GraphicsPipeline::CreatePipelineObjects( if (result == VK_SUCCESS) { - size_t palOffset = sizeof(GraphicsPipeline); + result = CreatePalPipelineObjects(pDevice, + pPipelineCache, + pObjectCreateInfo, + pPipelineBinarySizes, + pPipelineBinaries, + pCacheIds, + Util::VoidPtrInc(pSystemMem, sizeof(GraphicsPipeline)), + pPalPipeline); + } + if (result == VK_SUCCESS) + { for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) { Pal::IDevice* pPalDevice = pDevice->PalDevice(deviceIdx); - if (palResult == Pal::Result::Success) - { - // If pPipelineBinaries[DefaultDeviceIndex] is sufficient for all devices, the other pipeline binaries - // won't be created. Otherwise, like if gl_DeviceIndex is used, they will be. 
- if (pPipelineBinaries[deviceIdx] != nullptr) - { - pObjectCreateInfo->pipeline.pipelineBinarySize = pPipelineBinarySizes[deviceIdx]; - pObjectCreateInfo->pipeline.pPipelineBinary = pPipelineBinaries[deviceIdx]; - } - - palResult = pPalDevice->CreateGraphicsPipeline( - pObjectCreateInfo->pipeline, - Util::VoidPtrInc(pSystemMem, palOffset), - &pPalPipeline[deviceIdx]); - -#if ICD_GPUOPEN_DEVMODE_BUILD - // Temporarily reinject post Pal pipeline creation (when the internal pipeline hash is available). - // The reinjection cache layer can be linked back into the pipeline cache chain once the - // Vulkan pipeline cache key can be stored (and read back) inside the ELF as metadata. - if ((pDevice->VkInstance()->GetDevModeMgr() != nullptr) && - (palResult == Util::Result::Success)) - { - const auto& info = pPalPipeline[deviceIdx]->GetInfo(); - - palResult = pDevice->GetCompiler(deviceIdx)->RegisterAndLoadReinjectionBinary( - &info.internalPipelineHash, - &pCacheIds[deviceIdx], - &pObjectCreateInfo->pipeline.pipelineBinarySize, - &pObjectCreateInfo->pipeline.pPipelineBinary, - pPipelineCache); - - if (palResult == Util::Result::Success) - { - pPalPipeline[deviceIdx]->Destroy(); - - palResult = pPalDevice->CreateGraphicsPipeline( - pObjectCreateInfo->pipeline, - Util::VoidPtrInc(pSystemMem, palOffset), - &pPalPipeline[deviceIdx]); - } - else if (palResult == Util::Result::NotFound) - { - // If a replacement was not found, proceed with the original - palResult = Util::Result::Success; - } - } -#endif - - VK_ASSERT(palSize == pPalDevice->GetGraphicsPipelineSize(pObjectCreateInfo->pipeline, nullptr)); - palOffset += palSize; - } - // Create the PAL MSAA state object if (palResult == Pal::Result::Success) { @@ -321,6 +364,13 @@ VkResult GraphicsPipeline::CreatePipelineObjects( pCreateInfo->flags, VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT); + PipelineInternalBufferInfo internalBuffer = *pInternalBuffer; + if (pInternalBuffer->dataSize > 0) + { + internalBuffer.pData = Util::VoidPtrInc(pSystemMem, sizeof(GraphicsPipeline) + (palSize * numPalDevices)); + memcpy(internalBuffer.pData, pInternalBuffer->pData, pInternalBuffer->dataSize); + } + VK_PLACEMENT_NEW(pSystemMem) GraphicsPipeline( pDevice, pPalPipeline, @@ -334,6 +384,7 @@ VkResult GraphicsPipeline::CreatePipelineObjects( pObjectCreateInfo->flags.force1x1ShaderRate, pObjectCreateInfo->flags.customSampleLocations, *pVbInfo, + &internalBuffer, pPalMsaa, pPalColorBlend, pPalDepthStencil, @@ -400,15 +451,16 @@ VkResult GraphicsPipeline::Create( VkResult result = AchievePipelineLayout(pDevice, pCreateInfo, pAllocator, &pPipelineLayout, &isTempLayout); // 2. Build pipeline binary create info - GraphicsPipelineBinaryCreateInfo binaryCreateInfo = {}; - GraphicsPipelineShaderStageInfo shaderStageInfo = {}; - VbInfo vbInfo = {}; + GraphicsPipelineBinaryCreateInfo binaryCreateInfo = {}; + GraphicsPipelineShaderStageInfo shaderStageInfo = {}; + VbBindingInfo vbInfo = {}; + PipelineInternalBufferInfo internalBufferInfo = {}; ShaderModuleHandle tempModules[ShaderStage::ShaderStageGfxCount] = {}; if (result == VK_SUCCESS) { result = BuildPipelineBinaryCreateInfo( - pDevice, pCreateInfo, pPipelineLayout, &binaryCreateInfo, &shaderStageInfo, &vbInfo, tempModules); + pDevice, pCreateInfo, pPipelineLayout, &binaryCreateInfo, &shaderStageInfo, &vbInfo, &internalBufferInfo, tempModules); } // 3. 
Create pipeline binaries
@@ -431,19 +483,22 @@ VkResult GraphicsPipeline::Create( } uint64_t pipelineHash = 0; - + GraphicsPipelineObjectCreateInfo objectCreateInfo = {}; + GraphicsPipelineBinaryInfo binaryInfo = {}; if (result == VK_SUCCESS) { pipelineHash = Vkgc::IPipelineDumper::GetPipelineHash(&binaryCreateInfo.pipelineInfo); // 4. Build pipeline object create info - GraphicsPipelineObjectCreateInfo objectCreateInfo = {}; - GraphicsPipelineBinaryInfo binaryInfo = {}; binaryInfo.pOptimizerKey = &binaryCreateInfo.pipelineProfileKey; BuildPipelineObjectCreateInfo( pDevice, pCreateInfo, &vbInfo, &binaryInfo, pPipelineLayout, &objectCreateInfo); + objectCreateInfo.immedInfo.checkDeferCompilePipeline = + pDevice->GetRuntimeSettings().deferCompileOptimizedPipeline && + (binaryCreateInfo.pipelineInfo.enableEarlyCompile || binaryCreateInfo.pipelineInfo.enableUberFetchShader); + // 5. Create pipeline objects result = CreatePipelineObjects( pDevice,
@@ -451,6 +506,7 @@ VkResult GraphicsPipeline::Create( pAllocator, pPipelineLayout, &vbInfo, + &internalBufferInfo, pipelineBinarySizes, pPipelineBinaries, pPipelineCache,
@@ -468,6 +524,11 @@ VkResult GraphicsPipeline::Create( pPipelineLayout->Destroy(pDevice, pAllocator); } + if (internalBufferInfo.pData != nullptr) + { + pDevice->VkInstance()->FreeMem(internalBufferInfo.pData); + internalBufferInfo.pData = nullptr; + } // Free the created pipeline binaries now that the PAL Pipelines/PipelineBinaryInfo have read them. for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) {
@@ -477,7 +538,25 @@ VkResult GraphicsPipeline::Create( &binaryCreateInfo, pPipelineBinaries[deviceIdx], pipelineBinarySizes[deviceIdx]); } } - pDefaultCompiler->FreeGraphicsPipelineCreateInfo(&binaryCreateInfo); + + // Deferred compile will reuse all objects generated in BuildPipelineBinaryCreateInfo, + // i.e.
we need to keep the temp buffers in binaryCreateInfo. + pDefaultCompiler->FreeGraphicsPipelineCreateInfo(&binaryCreateInfo, + objectCreateInfo.immedInfo.checkDeferCompilePipeline); + + if (objectCreateInfo.immedInfo.checkDeferCompilePipeline) + { + GraphicsPipeline* pThis = GraphicsPipeline::ObjectFromHandle(*pPipeline); + result = pThis->BuildDeferCompileWorkload(pDevice, + pPipelineCache, + &binaryCreateInfo, + &shaderStageInfo, + &objectCreateInfo); + if (result == VK_SUCCESS) + { + pDefaultCompiler->ExecuteDeferCompile(&pThis->m_deferWorkload); + } + } if (result == VK_SUCCESS) {
@@ -502,6 +581,326 @@ return result; } +// ===================================================================================================================== +static size_t GetVertexInputStructSize( + const VkPipelineVertexInputStateCreateInfo* pVertexInput) +{ + size_t size = 0; + size += sizeof(VkPipelineVertexInputStateCreateInfo); + size += sizeof(VkVertexInputBindingDescription) * pVertexInput->vertexBindingDescriptionCount; + size += sizeof(VkVertexInputAttributeDescription) * pVertexInput->vertexAttributeDescriptionCount; + + const VkPipelineVertexInputDivisorStateCreateInfoEXT* pVertexDivisor = nullptr; + const vk::VkStructHeader* pStructHeader = + static_cast<const vk::VkStructHeader*>(pVertexInput->pNext); + while (pStructHeader != nullptr) + { + if (pStructHeader->sType == VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT) + { + pVertexDivisor = reinterpret_cast<const VkPipelineVertexInputDivisorStateCreateInfoEXT*>(pStructHeader); + break; + } + else + { + pStructHeader = pStructHeader->pNext; + } + } + + if (pVertexDivisor != nullptr) + { + size += sizeof(VkPipelineVertexInputDivisorStateCreateInfoEXT); + size += sizeof(VkVertexInputBindingDivisorDescriptionEXT) * pVertexDivisor->vertexBindingDivisorCount; + } + + return size; +} + +// ===================================================================================================================== +static void CopyVertexInputStruct( + const VkPipelineVertexInputStateCreateInfo* pSrcVertexInput, + VkPipelineVertexInputStateCreateInfo* pDestVertexInput) +{ + // Copy VkPipelineVertexInputStateCreateInfo + *pDestVertexInput = *pSrcVertexInput; + void* pNext = Util::VoidPtrInc(pDestVertexInput, sizeof(VkPipelineVertexInputStateCreateInfo)); + + // Copy VkVertexInputBindingDescription + pDestVertexInput->pVertexBindingDescriptions = reinterpret_cast<VkVertexInputBindingDescription*>(pNext); + size_t size = sizeof(VkVertexInputBindingDescription) * pSrcVertexInput->vertexBindingDescriptionCount; + memcpy(pNext, pSrcVertexInput->pVertexBindingDescriptions, size); + pNext = Util::VoidPtrInc(pNext, size); + + // Copy VkVertexInputAttributeDescription + pDestVertexInput->pVertexAttributeDescriptions = reinterpret_cast<VkVertexInputAttributeDescription*>(pNext); + size = sizeof(VkVertexInputAttributeDescription) * pSrcVertexInput->vertexAttributeDescriptionCount; + memcpy(pNext, pSrcVertexInput->pVertexAttributeDescriptions, size); + pNext = Util::VoidPtrInc(pNext, size); + + const VkPipelineVertexInputDivisorStateCreateInfoEXT* pSrcVertexDivisor = nullptr; + const vk::VkStructHeader* pStructHeader = + reinterpret_cast<const vk::VkStructHeader*>(pSrcVertexInput->pNext); + while (pStructHeader != nullptr) + { + if (pStructHeader->sType == VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT) + { + pSrcVertexDivisor = reinterpret_cast<const VkPipelineVertexInputDivisorStateCreateInfoEXT*>(pStructHeader); + break; + } + else + { + pStructHeader = pStructHeader->pNext; + } + } + + if (pSrcVertexDivisor != nullptr) + { + // Copy VkPipelineVertexInputDivisorStateCreateInfoEXT + VkPipelineVertexInputDivisorStateCreateInfoEXT*
pDestVertexDivisor = + reinterpret_cast<VkPipelineVertexInputDivisorStateCreateInfoEXT*>(pNext); + *pDestVertexDivisor = *pSrcVertexDivisor; + pDestVertexInput->pNext = pDestVertexDivisor; + pNext = Util::VoidPtrInc(pNext, sizeof(VkPipelineVertexInputDivisorStateCreateInfoEXT)); + + // Copy VkVertexInputBindingDivisorDescriptionEXT + pDestVertexDivisor->pVertexBindingDivisors = reinterpret_cast<VkVertexInputBindingDivisorDescriptionEXT*>(pNext); + size = sizeof(VkVertexInputBindingDivisorDescriptionEXT) * pSrcVertexDivisor->vertexBindingDivisorCount; + memcpy(pNext, pSrcVertexDivisor->pVertexBindingDivisors, size); + pNext = Util::VoidPtrInc(pNext, size); + } +} + +// ===================================================================================================================== +VkResult GraphicsPipeline::BuildDeferCompileWorkload( + Device* pDevice, + PipelineCache* pPipelineCache, + GraphicsPipelineBinaryCreateInfo* pBinaryCreateInfo, + GraphicsPipelineShaderStageInfo* pShaderStageInfo, + GraphicsPipelineObjectCreateInfo* pObjectCreateInfo) +{ + VkResult result = VK_SUCCESS; + DeferGraphicsPipelineCreateInfo* pCreateInfo = nullptr; + + // Calculate payload size + size_t payloadSize = sizeof(DeferGraphicsPipelineCreateInfo) + sizeof(Util::Event); + for (uint32_t i = 0; i < ShaderStage::ShaderStageGfxCount; i++) + { + if (pShaderStageInfo->stages[i].pEntryPoint != nullptr) + { + payloadSize += strlen(pShaderStageInfo->stages[i].pEntryPoint) + 1; + if (pShaderStageInfo->stages[i].pSpecializationInfo != nullptr) + { + auto pSpecializationInfo = pShaderStageInfo->stages[i].pSpecializationInfo; + payloadSize += sizeof(VkSpecializationInfo); + payloadSize += sizeof(VkSpecializationMapEntry) * pSpecializationInfo->mapEntryCount; + payloadSize += pSpecializationInfo->dataSize; + } + } + } + + size_t vertexInputSize = 0; + if ((pShaderStageInfo->stages[ShaderStage::ShaderStageVertex].pEntryPoint != nullptr) && + (pBinaryCreateInfo->pipelineInfo.pVertexInput != nullptr)) + { + vertexInputSize = GetVertexInputStructSize(pBinaryCreateInfo->pipelineInfo.pVertexInput); + payloadSize += vertexInputSize; + } + + size_t memOffset = 0; + Instance* pInstance = pDevice->VkInstance(); + void* pPayloadMem = pInstance->AllocMem(payloadSize, VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (pPayloadMem == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + + if (result == VK_SUCCESS) + { + memset(pPayloadMem, 0, payloadSize); + pCreateInfo = static_cast<DeferGraphicsPipelineCreateInfo*>(pPayloadMem); + memOffset = sizeof(DeferGraphicsPipelineCreateInfo); + + // Fill create info and reset defer compile related options + pCreateInfo->pDevice = pDevice; + pCreateInfo->pPipelineCache = pPipelineCache; + pCreateInfo->pPipeline = this; + pCreateInfo->shaderStageInfo = *pShaderStageInfo; + pCreateInfo->binaryCreateInfo = *pBinaryCreateInfo; + pCreateInfo->objectCreateInfo = *pObjectCreateInfo; + + pCreateInfo->binaryCreateInfo.pipelineInfo.enableEarlyCompile = false; + pCreateInfo->binaryCreateInfo.pipelineInfo.enableUberFetchShader = false; + pCreateInfo->objectCreateInfo.immedInfo.checkDeferCompilePipeline = false; + + PipelineShaderInfo* pShaderInfo[] = + { + &pCreateInfo->binaryCreateInfo.pipelineInfo.vs, + &pCreateInfo->binaryCreateInfo.pipelineInfo.tcs, + &pCreateInfo->binaryCreateInfo.pipelineInfo.tes, + &pCreateInfo->binaryCreateInfo.pipelineInfo.gs, + &pCreateInfo->binaryCreateInfo.pipelineInfo.fs, + }; + + // Do deep copy for binaryCreateInfo members + for (uint32_t i = 0; i < ShaderStage::ShaderStageGfxCount; i++) + { + if (pShaderStageInfo->stages[i].pEntryPoint != nullptr) + { + size_t size = strlen(pShaderStageInfo->stages[i].pEntryPoint) + 1;
+ char* pEntryPoint = reinterpret_cast<char*>(Util::VoidPtrInc(pPayloadMem, memOffset)); + memcpy(pEntryPoint, pShaderStageInfo->stages[i].pEntryPoint, size); + pCreateInfo->shaderStageInfo.stages[i].pEntryPoint = pEntryPoint; + pShaderInfo[i]->pEntryTarget = pEntryPoint; + memOffset += size; + + if (pShaderStageInfo->stages[i].pSpecializationInfo != nullptr) + { + auto pSrcSpecInfo = pShaderStageInfo->stages[i].pSpecializationInfo; + auto pDestSpecInfo = reinterpret_cast<VkSpecializationInfo*>(Util::VoidPtrInc(pPayloadMem, memOffset)); + *pDestSpecInfo = *pSrcSpecInfo; + memOffset += sizeof(VkSpecializationInfo); + + pDestSpecInfo->pMapEntries = reinterpret_cast<VkSpecializationMapEntry*>(Util::VoidPtrInc(pPayloadMem, memOffset)); + memcpy(const_cast<VkSpecializationMapEntry*>(pDestSpecInfo->pMapEntries), + pSrcSpecInfo->pMapEntries, + pSrcSpecInfo->mapEntryCount * sizeof(VkSpecializationMapEntry)); + memOffset += pSrcSpecInfo->mapEntryCount * sizeof(VkSpecializationMapEntry); + + pDestSpecInfo->pData = Util::VoidPtrInc(pPayloadMem, memOffset); + memcpy(const_cast<void*>(pDestSpecInfo->pData), + pSrcSpecInfo->pData, + pSrcSpecInfo->dataSize); + memOffset += pSrcSpecInfo->dataSize; + pCreateInfo->shaderStageInfo.stages[i].pSpecializationInfo = pDestSpecInfo; + pShaderInfo[i]->pSpecializationInfo = pDestSpecInfo; + } + } + } + + if (vertexInputSize != 0) + { + VkPipelineVertexInputStateCreateInfo* pVertexInput = + reinterpret_cast<VkPipelineVertexInputStateCreateInfo*>(Util::VoidPtrInc(pPayloadMem, memOffset)); + pCreateInfo->binaryCreateInfo.pipelineInfo.pVertexInput = pVertexInput; + CopyVertexInputStruct(pBinaryCreateInfo->pipelineInfo.pVertexInput, pVertexInput); + memOffset += vertexInputSize; + } + + // Build defer workload + m_deferWorkload.pPayloads = pPayloadMem; + m_deferWorkload.pEvent = VK_PLACEMENT_NEW(Util::VoidPtrInc(pPayloadMem, memOffset))(Util::Event); + memOffset += sizeof(Util::Event); + VK_ASSERT(memOffset == payloadSize); + + EventCreateFlags flags = {}; + flags.manualReset = true; + m_deferWorkload.pEvent->Init(flags); + m_deferWorkload.Execute = ExecuteDeferCreateOptimizedPipeline; + } + + return result; +} + +// ===================================================================================================================== +void GraphicsPipeline::ExecuteDeferCreateOptimizedPipeline( + void *pPayload) +{ + DeferGraphicsPipelineCreateInfo* pCreateInfo = static_cast<DeferGraphicsPipelineCreateInfo*>(pPayload); + pCreateInfo->pPipeline->DeferCreateOptimizedPipeline(pCreateInfo->pDevice, + pCreateInfo->pPipelineCache, + &pCreateInfo->binaryCreateInfo, + &pCreateInfo->shaderStageInfo, + &pCreateInfo->objectCreateInfo); +} + +// ===================================================================================================================== +VkResult GraphicsPipeline::DeferCreateOptimizedPipeline( + Device* pDevice, + PipelineCache* pPipelineCache, + GraphicsPipelineBinaryCreateInfo* pBinaryCreateInfo, + GraphicsPipelineShaderStageInfo* pShaderStageInfo, + GraphicsPipelineObjectCreateInfo* pObjectCreateInfo) +{ + VkResult result = VK_SUCCESS; + size_t pipelineBinarySizes[MaxPalDevices] = {}; + const void* pPipelineBinaries[MaxPalDevices] = {}; + Util::MetroHash::Hash cacheId[MaxPalDevices] = {}; + Pal::IPipeline* pPalPipeline[MaxPalDevices] = {}; + + Pal::Result palResult = Pal::Result::Success; + size_t palSize = + pDevice->PalDevice(DefaultDeviceIndex)->GetGraphicsPipelineSize(pObjectCreateInfo->pipeline, &palResult); + VK_ASSERT(palResult == Pal::Result::Success); + + uint32_t numPalDevices = pDevice->NumPalDevices(); + void* pSystemMem =
pDevice->VkInstance()->AllocMem( + palSize, VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pSystemMem == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + + if (result == VK_SUCCESS) + { + result = CreatePipelineBinaries(pDevice, + nullptr, + pShaderStageInfo, + nullptr, + pBinaryCreateInfo, + pPipelineCache, + nullptr, + cacheId, + pipelineBinarySizes, + pPipelineBinaries); + } + + if (result == VK_SUCCESS) + { + result = CreatePalPipelineObjects(pDevice, + pPipelineCache, + pObjectCreateInfo, + pipelineBinarySizes, + pPipelineBinaries, + cacheId, + pSystemMem, + pPalPipeline); + } + + if (result == VK_SUCCESS) + { + VK_ASSERT(pSystemMem == pPalPipeline[0]); + SetOptimizedPipeline(pPalPipeline); + } + + pDevice->GetCompiler(DefaultDeviceIndex)->FreeGraphicsPipelineCreateInfo(pBinaryCreateInfo, false); + + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + if (pPipelineBinaries[deviceIdx] != nullptr) + { + pDevice->GetCompiler(deviceIdx)->FreeGraphicsPipelineBinary( + pBinaryCreateInfo, pPipelineBinaries[deviceIdx], pipelineBinarySizes[deviceIdx]); + } + } + return result; +} + +// ===================================================================================================================== +void GraphicsPipeline::SetOptimizedPipeline( + Pal::IPipeline* pPalPipeline[MaxPalDevices]) +{ + const bool optimizedPipeline = true; + Util::MetroHash::Hash hash = {}; + Util::MetroHash64 palPipelineHasher; + palPipelineHasher.Update(PalPipelineHash()); + palPipelineHasher.Update(optimizedPipeline); + palPipelineHasher.Finalize(hash.bytes); + + Util::MutexAuto pipelineSwitchLock(&m_pipelineSwitchLock); + memcpy(m_pOptimizedPipeline, pPalPipeline, sizeof(m_pOptimizedPipeline)); + m_optimizedPipelineHash = hash.qwords[0]; +} + // ===================================================================================================================== GraphicsPipeline::GraphicsPipeline( Device* const pDevice, @@ -515,7 +914,8 @@ GraphicsPipeline::GraphicsPipeline( bool bindInputAssemblyState, bool force1x1ShaderRate, bool customSampleLocations, - const VbInfo& vbInfo, + const VbBindingInfo& vbInfo, + const PipelineInternalBufferInfo* pInternalBuffer, Pal::IMsaaState** pPalMsaa, Pal::IColorBlendState** pPalColorBlend, Pal::IDepthStencilState** pPalDepthStencil, @@ -529,6 +929,10 @@ GraphicsPipeline::GraphicsPipeline( pDevice), m_info(immedInfo), m_vbInfo(vbInfo), + m_internalBufferInfo(*pInternalBuffer), + m_pOptimizedPipeline{}, + m_optimizedPipelineHash(0), + m_deferWorkload{}, m_flags() { Pipeline::Init(pPalPipeline, pLayout, pBinary, staticStateMask, apiHash); @@ -689,8 +1093,33 @@ VkResult GraphicsPipeline::Destroy( Device* pDevice, const VkAllocationCallbacks* pAllocator) { + if (m_deferWorkload.pEvent != nullptr) + { + auto result = m_deferWorkload.pEvent->Wait(10); + if (result == Util::Result::Success) + { + Util::Destructor(m_deferWorkload.pEvent); + pDevice->VkInstance()->FreeMem(m_deferWorkload.pPayloads); + } + m_deferWorkload.pEvent = nullptr; + m_deferWorkload.pPayloads = nullptr; + } + DestroyStaticState(pAllocator); + if (m_pOptimizedPipeline[0] != nullptr) + { + void* pBaseMem = m_pOptimizedPipeline[0]; + for (uint32_t deviceIdx = 0; + (deviceIdx < m_pDevice->NumPalDevices()) && (m_pPalPipeline[deviceIdx] != nullptr); + deviceIdx++) + { + m_pOptimizedPipeline[deviceIdx]->Destroy(); + m_pOptimizedPipeline[deviceIdx] = nullptr; + } + pDevice->VkInstance()->FreeMem(pBaseMem); + } + return Pipeline::Destroy(pDevice, pAllocator); 
}
@@ -920,8 +1349,9 @@ void GraphicsPipeline::BindToCmdBuffer( pRenderState->inputAssemblyState = m_info.inputAssemblyState; } + const bool useOptimizedPipeline = UseOptimizedPipeline(); const uint64_t oldHash = pRenderState->boundGraphicsPipelineHash; - const uint64_t newHash = PalPipelineHash(); + const uint64_t newHash = useOptimizedPipeline ? m_optimizedPipelineHash : PalPipelineHash(); utils::IterateMask deviceGroup(pCmdBuffer->GetDeviceMask()); do
@@ -939,7 +1369,7 @@ void GraphicsPipeline::BindToCmdBuffer( Pal::PipelineBindParams params = {}; params.pipelineBindPoint = Pal::PipelineBindPoint::Graphics; - params.pPipeline = m_pPalPipeline[deviceIdx]; + params.pPipeline = useOptimizedPipeline ? m_pOptimizedPipeline[deviceIdx] : m_pPalPipeline[deviceIdx]; params.graphics = graphicsShaderInfos; params.apiPsoHash = m_apiHash;
@@ -975,9 +1405,9 @@ void GraphicsPipeline::BindToCmdBuffer( Pal::PipelineBindParams params = {}; params.pipelineBindPoint = Pal::PipelineBindPoint::Graphics; - params.pPipeline = m_pPalPipeline[deviceIdx]; + params.pPipeline = useOptimizedPipeline ? m_pOptimizedPipeline[deviceIdx] : m_pPalPipeline[deviceIdx]; params.graphics = graphicsShaderInfos; - params.apiPsoHash = m_apiHash; + params.apiPsoHash = m_apiHash; pPalCmdBuf->CmdBindPipeline(params); }
@@ -1079,16 +1509,21 @@ void GraphicsPipeline::BindToCmdBuffer( pRenderState->dirtyGraphics.vrs = 0; } - if (m_vbInfo.uberFetchShaderBuffer.bufferSize > 0) + if ((useOptimizedPipeline == false) && (m_internalBufferInfo.dataSize > 0)) { - VK_ASSERT(m_vbInfo.uberFetchShaderBuffer.userDataOffset > 0); + VK_ASSERT(m_internalBufferInfo.internalBufferCount > 0); Pal::gpusize gpuAddress = {}; - uint32_t* pCpuAddr = pPalCmdBuf->CmdAllocateEmbeddedData(m_vbInfo.uberFetchShaderBuffer.bufferSize, 1, &gpuAddress); - memcpy(pCpuAddr, m_vbInfo.uberFetchShaderBuffer.bufferData, m_vbInfo.uberFetchShaderBuffer.bufferSize); - pPalCmdBuf->CmdSetUserData(Pal::PipelineBindPoint::Graphics, - m_vbInfo.uberFetchShaderBuffer.userDataOffset, - 2, - reinterpret_cast<const uint32_t*>(&gpuAddress)); + uint32_t* pCpuAddr = pPalCmdBuf->CmdAllocateEmbeddedData(m_internalBufferInfo.dataSize, 1, &gpuAddress); + memcpy(pCpuAddr, m_internalBufferInfo.pData, m_internalBufferInfo.dataSize); + for (uint32_t i = 0; i < m_internalBufferInfo.internalBufferCount; i++) + { + Pal::gpusize bufferAddress = gpuAddress; + bufferAddress += m_internalBufferInfo.internalBufferEntries[i].bufferOffset; + pPalCmdBuf->CmdSetUserData(Pal::PipelineBindPoint::Graphics, + m_internalBufferInfo.internalBufferEntries[i].userDataOffset, + 2, + reinterpret_cast<const uint32_t*>(&bufferAddress)); + } } } while (deviceGroup.IterateNext());
diff --git a/icd/api/vk_image.cpp b/icd/api/vk_image.cpp index 9f268e9b..e74936d8 100644 --- a/icd/api/vk_image.cpp +++ b/icd/api/vk_image.cpp
@@ -131,7 +131,7 @@ void Image::CalcMemoryPriority( m_priority = MemoryPriority::FromSetting(settings.memoryPriorityDefault); - if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_MEMORY_PRIORITY) == false) + if (pDevice->GetEnabledFeatures().appControlledMemPriority == false) { UpgradeToHigherPriority(settings.memoryPriorityImageAny, &m_priority);
@@ -161,7 +161,6 @@ void Image::CalcMemoryPriority( // ===================================================================================================================== Image::Image( Device* pDevice, - const VkAllocationCallbacks* pAllocator, VkImageCreateFlags flags, Pal::IImage** pPalImages, Pal::IGpuMemory** pPalMemory, @@ -245,7 +244,6 @@ Image::Image( } CalcMemoryPriority(pDevice); - }
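A brief note on the CalcMemoryPriority change in this hunk: the driver-side priority heuristics now key off the device's appControlledMemPriority feature rather than the EXT_memory_priority extension alone, so pageable_device_local_memory takes the same path. A minimal sketch of the resulting policy (struct and helper are illustrative, not driver types):

    // Illustrative only: driver-side priority adjustment is skipped once the
    // application controls priorities via EXT_memory_priority or
    // EXT_pageable_device_local_memory (both funnel into one feature bit).
    struct EnabledFeatures
    {
        bool appControlledMemPriority;
    };

    static bool DriverMayAdjustPriority(const EnabledFeatures& features)
    {
        return features.appControlledMemPriority == false;
    }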
// ===================================================================================================================== @@ -342,6 +340,9 @@ static VkResult InitSparseVirtualMemory( pPalImage[DefaultDeviceIndex]->GetGpuMemoryRequirements(&palReqs); + // We need virtual remapping support for all sparse resources + VK_ASSERT(pDevice->VkPhysicalDevice(DefaultDeviceIndex)->IsVirtualRemappingSupported()); + const VkDeviceSize sparseAllocGranularity = pDevice->GetProperties().virtualMemAllocGranularity; memset(pSparseMemCreateInfo, 0, sizeof(*pSparseMemCreateInfo)); @@ -359,20 +360,6 @@ static VkResult InitSparseVirtualMemory( pSparseMemCreateInfo->virtualAccessMode = Pal::VirtualGpuMemAccessMode::ReadZero; } - size_t palMemSize = 0; - - for (uint32_t deviceIdx = 0; (result == VK_SUCCESS) && (deviceIdx < pDevice->NumPalDevices()); deviceIdx++) - { - Pal::Result palResult = Pal::Result::Success; - - palMemSize += pDevice->PalDevice(DefaultDeviceIndex)->GetGpuMemorySize(*pSparseMemCreateInfo, &palResult); - - if (palResult != Pal::Result::Success) - { - result = VK_ERROR_INITIALIZATION_FAILED; - } - } - // If it's a sparse image we should also cache sparse image block dimensions (tile size) to // optimize sparse binding update, keeping in mind that each supported aspect (color, depth, // stencil) is permitted to use different granularity @@ -390,47 +377,53 @@ static VkResult InitSparseVirtualMemory( *pSparseTileSize = sparseFormatProperties.imageGranularity; - void* pPalMemoryObj = nullptr; + Pal::Result palResult; - if (result == VK_SUCCESS) - { - pPalMemoryObj = pAllocator->pfnAllocation( + size_t palMemSize = pDevice->PalDevice(DefaultDeviceIndex)->GetGpuMemorySize(*pSparseMemCreateInfo, &palResult); + VK_ASSERT(palResult == Pal::Result::Success); + + void* pPalMemoryObj = pAllocator->pfnAllocation( pAllocator->pUserData, - palMemSize, + (palMemSize * pDevice->NumPalDevices()), VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (pPalMemoryObj == nullptr) + if (pPalMemoryObj != nullptr) + { + size_t palMemOffset = 0; + + for (uint32_t deviceIdx = 0; + (deviceIdx < pDevice->NumPalDevices()) && (palResult == Pal::Result::Success); + deviceIdx++) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - } - } + if (deviceIdx != DefaultDeviceIndex) + { + Pal::GpuMemoryRequirements deviceReqs = {}; + pPalImage[deviceIdx]->GetGpuMemoryRequirements(&deviceReqs); + VK_ASSERT(memcmp(&palReqs, &deviceReqs, sizeof(deviceReqs)) == 0); - size_t palMemOffset = 0; + VK_ASSERT(palMemSize == pDevice->PalDevice(deviceIdx)->GetGpuMemorySize(*pSparseMemCreateInfo, &palResult)); + VK_ASSERT(palResult == Pal::Result::Success); + } - for (uint32_t deviceIdx = 0; - (deviceIdx < pDevice->NumPalDevices()) && (result == VK_SUCCESS); - deviceIdx++) - { - Pal::Result palResult = pDevice->PalDevice(deviceIdx)->CreateGpuMemory( - *pSparseMemCreateInfo, - Util::VoidPtrInc(pPalMemoryObj, palMemOffset), - &pSparseMemory[deviceIdx]); + palResult = pDevice->PalDevice(deviceIdx)->CreateGpuMemory( + *pSparseMemCreateInfo, + Util::VoidPtrInc(pPalMemoryObj, palMemOffset), + &pSparseMemory[deviceIdx]); - if (palResult == Pal::Result::Success) - { - palResult = pPalImage[deviceIdx]->BindGpuMemory(pSparseMemory[deviceIdx], 0); - } + if (palResult == Pal::Result::Success) + { + palResult = pPalImage[deviceIdx]->BindGpuMemory(pSparseMemory[deviceIdx], 0); + } - if (palResult == Pal::Result::Success) - { - palMemOffset += pDevice->PalDevice(deviceIdx)->GetGpuMemorySize(*pSparseMemCreateInfo, &palResult); + palMemOffset += palMemSize; } - if 
@@ -576,7 +569,9 @@ VkResult Image::Create(
             }
             case VK_STRUCTURE_TYPE_IMAGE_SWAPCHAIN_CREATE_INFO_KHR:
             {
-                VK_NOT_IMPLEMENTED;
+                // Nothing to do. BindSwapchainMemory has access to the swapchain and reinitializes based on it.
+                // Some of that could be pulled here, but validation is needed to be sure the same swapchain is
+                // provided, or else reinitialization would be required anyway.
                 break;
             }
@@ -890,7 +885,6 @@ VkResult Image::Create(
     // Construct API image object.
     VK_PLACEMENT_NEW (pMemory) Image(
         pDevice,
-        pAllocator,
         pCreateInfo->flags,
         pPalImages,
         pSparseMemory,
@@ -1081,7 +1075,6 @@ VkResult Image::CreatePresentableImage(
     // Construct API image object.
     VK_PLACEMENT_NEW (pImgObjMemory) Image(
         pDevice,
-        pAllocator,
         0,
         pPalImage,
         nullptr,
@@ -1127,7 +1120,6 @@ VkResult Image::Destroy(
     Device*                      pDevice,
     const VkAllocationCallbacks* pAllocator)
 {
-
     for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++)
     {
         if (m_perGpu[deviceIdx].pPalImage != nullptr)
@@ -1303,7 +1295,7 @@ VkResult Image::BindMemory(
         // After applying any necessary base address offset, the full GPU address should be aligned
         VK_ASSERT(Util::IsPow2Aligned(baseGpuAddr + baseAddrOffset + memOffset, reqs.alignment));

-        if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_MEMORY_PRIORITY) == false)
+        if (pDevice->GetEnabledFeatures().appControlledMemPriority == false)
         {
             pMemory->ElevatePriority(m_priority);
         }
@@ -1638,6 +1630,16 @@ VkResult Image::GetMemoryRequirements(

     PalImage(DefaultDeviceIndex)->GetGpuMemoryRequirements(&palReqs);

+    for (uint32 deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++)
+    {
+        if (deviceIdx != DefaultDeviceIndex)
+        {
+            Pal::GpuMemoryRequirements deviceReqs = {};
+            PalImage(deviceIdx)->GetGpuMemoryRequirements(&deviceReqs);
+            VK_ASSERT(memcmp(&palReqs, &deviceReqs, sizeof(deviceReqs)) == 0);
+        }
+    }
+
     if (isSparse)
     {
         pReqs->alignment = Util::RoundUpToMultiple(virtualGranularity, palReqs.alignment);
@@ -1673,13 +1675,6 @@ VkResult Image::GetMemoryRequirements(
         pReqs->memoryTypeBits &= pDevice->GetMemoryTypeMaskForExternalSharing();
     }

-    // Optional: if the image is optimally tiled, don't allow it with host visible memory types.
-    if ((m_internalFlags.linear == 0) &&
-        pDevice->GetRuntimeSettings().addHostInvisibleMemoryTypesForOptimalImages)
-    {
-        pReqs->memoryTypeBits &= ~pDevice->GetMemoryTypeMaskMatching(VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
-    }
-
     if (m_internalFlags.isProtected)
     {
         // If the image is protected only keep the protected type
@@ -1843,36 +1838,19 @@ VKAPI_ATTR void VKAPI_CALL vkGetImageMemoryRequirements2(
     VkMemoryRequirements2*                      pMemoryRequirements)
 {
     const Device* pDevice = ApiDevice::ObjectFromHandle(device);
-    VK_ASSERT((pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) ||
-              pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2));

-    union
-    {
-        const VkStructHeader*                 pHeader;
-        const VkImageMemoryRequirementsInfo2* pRequirementsInfo2;
-    };
+    VkMemoryRequirements* pMemReq = &pMemoryRequirements->memoryRequirements;
+    Image* pImage = Image::ObjectFromHandle(pInfo->image);
+    pImage->GetMemoryRequirements(pDevice, pMemReq);

-    pRequirementsInfo2 = pInfo;
-    pHeader = utils::GetExtensionStructure(pHeader, VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2);
+    VkMemoryDedicatedRequirements* pMemDedicatedRequirements =
+        static_cast<VkMemoryDedicatedRequirements*>(pMemoryRequirements->pNext);

-    if (pHeader != nullptr)
+    if ((pMemDedicatedRequirements != nullptr) &&
+        (pMemDedicatedRequirements->sType == VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS))
     {
-        VkMemoryRequirements* pMemReq = &pMemoryRequirements->memoryRequirements;
-        Image* pImage = Image::ObjectFromHandle(pRequirementsInfo2->image);
-        pImage->GetMemoryRequirements(pDevice, pMemReq);
-
-        if (pMemoryRequirements->sType == VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2)
-        {
-            VkMemoryDedicatedRequirements* pMemDedicatedRequirements =
-                static_cast<VkMemoryDedicatedRequirements*>(pMemoryRequirements->pNext);
-
-            if ((pMemDedicatedRequirements != nullptr) &&
-                (pMemDedicatedRequirements->sType == VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS))
-            {
-                pMemDedicatedRequirements->prefersDedicatedAllocation  = pImage->DedicatedMemoryRequired();
-                pMemDedicatedRequirements->requiresDedicatedAllocation = pImage->DedicatedMemoryRequired();
-            }
-        }
+        pMemDedicatedRequirements->prefersDedicatedAllocation  = pImage->DedicatedMemoryRequired();
+        pMemDedicatedRequirements->requiresDedicatedAllocation = pImage->DedicatedMemoryRequired();
     }
 }

@@ -1884,26 +1862,12 @@ VKAPI_ATTR void VKAPI_CALL vkGetImageSparseMemoryRequirements2(
     VkSparseImageMemoryRequirements2*           pSparseMemoryRequirements)
 {
     const Device* pDevice = ApiDevice::ObjectFromHandle(device);
-    VK_ASSERT((pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) ||
-              pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2));
-
-    union
-    {
-        const VkStructHeader*                       pHeader;
-        const VkImageSparseMemoryRequirementsInfo2* pRequirementsInfo2;
-    };

-    pRequirementsInfo2 = pInfo;
-    pHeader = utils::GetExtensionStructure(pHeader, VK_STRUCTURE_TYPE_IMAGE_SPARSE_MEMORY_REQUIREMENTS_INFO_2);
-
-    if (pHeader != nullptr)
-    {
-        Image* pImage = Image::ObjectFromHandle(pRequirementsInfo2->image);
-        auto memReqsView = utils::ArrayView<VkSparseImageMemoryRequirements>(
-            pSparseMemoryRequirements,
-            &pSparseMemoryRequirements->memoryRequirements);
-        pImage->GetSparseMemoryRequirements(pDevice, pSparseMemoryRequirementCount, memReqsView);
-    }
+    Image* pImage = Image::ObjectFromHandle(pInfo->image);
+    auto memReqsView = utils::ArrayView<VkSparseImageMemoryRequirements>(
+        pSparseMemoryRequirements,
+        &pSparseMemoryRequirements->memoryRequirements);
+    pImage->GetSparseMemoryRequirements(pDevice, pSparseMemoryRequirementCount, memReqsView);
 }

 } // namespace entry
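Aside (not part of the patch): the rewritten vkGetImageMemoryRequirements2 only inspects the first struct chained to pNext. A more general chain walk, equivalent to what a GetExtensionStructure-style helper does, looks like this (usage sketch against the public Vulkan API; the helper name is illustrative):

    #include <vulkan/vulkan.h>

    static void* FindStructInChain(void* pNext, VkStructureType sType)
    {
        auto* pHeader = static_cast<VkBaseOutStructure*>(pNext);

        while ((pHeader != nullptr) && (pHeader->sType != sType))
        {
            pHeader = pHeader->pNext;
        }

        return pHeader;
    }

    // Fill dedicated-allocation info if the caller chained it anywhere in pNext.
    void FillDedicatedRequirements(VkMemoryRequirements2* pMemoryRequirements, bool dedicatedRequired)
    {
        auto* pDedicated = static_cast<VkMemoryDedicatedRequirements*>(
            FindStructInChain(pMemoryRequirements->pNext, VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS));

        if (pDedicated != nullptr)
        {
            pDedicated->prefersDedicatedAllocation  = dedicatedRequired ? VK_TRUE : VK_FALSE;
            pDedicated->requiresDedicatedAllocation = dedicatedRequired ? VK_TRUE : VK_FALSE;
        }
    }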
diff --git a/icd/api/vk_instance.cpp b/icd/api/vk_instance.cpp
index e1ca79f8..54189536 100644
--- a/icd/api/vk_instance.cpp
+++ b/icd/api/vk_instance.cpp
@@ -347,7 +347,7 @@ VkResult Instance::Init(
     createInfo.pLogInfo = &callbackInfo;

-#if defined(__unix__)
+#if defined(__unix__)
     createInfo.pSettingsPath = "/etc/amd";
 #else
     createInfo.pSettingsPath = "Vulkan";
diff --git a/icd/api/vk_memory.cpp b/icd/api/vk_memory.cpp
index 3e4eb5d2..67eb3937 100644
--- a/icd/api/vk_memory.cpp
+++ b/icd/api/vk_memory.cpp
@@ -406,10 +406,6 @@ VkResult Memory::Create(
                 &bindData,
                 sizeof(Pal::GpuMemoryResourceBindEventData));
         }
-        else
-        {
-            VK_NEVER_CALLED();
-        }

     // When share a dedicated image, metadata(width/height/mips/...) info is necessary in handle,
     // so driver calls bindMemory here to update metadata at allocation time.
@@ -1175,18 +1171,27 @@ void Memory::ElevatePriority(
     // the new given priority.
     if (m_priority < priority)
     {
-        Util::MutexAuto lock(m_pDevice->GetMemoryMutex());
+        SetPriority(priority, true);
+    }
+}

-        if (m_priority < priority)
+// =====================================================================================================================
+// This function sets a new priority for this memory's allocation.
+void Memory::SetPriority(
+    const MemoryPriority priority,
+    const bool           mustBeLower)
+{
+    Util::MutexAuto lock(m_pDevice->GetMemoryMutex());
+    if (((mustBeLower == false) && (m_priority != priority)) ||
+        ((mustBeLower == true)  && (m_priority < priority)))
+    {
+        for (uint32_t deviceIdx = 0; deviceIdx < m_pDevice->NumPalDevices(); deviceIdx++)
         {
-            for (uint32_t deviceIdx = 0; deviceIdx < m_pDevice->NumPalDevices(); deviceIdx++)
+            if ((PalMemory(deviceIdx) != nullptr) &&
+                (PalMemory(deviceIdx)->SetPriority(priority.PalPriority(), priority.PalOffset()) ==
+                 Pal::Result::Success))
             {
-                if ((PalMemory(deviceIdx) != nullptr) &&
-                    (PalMemory(deviceIdx)->SetPriority(priority.PalPriority(), priority.PalOffset()) ==
-                     Pal::Result::Success))
-                {
-                    m_priority = priority;
-                }
+                m_priority = priority;
             }
         }
     }
@@ -1360,11 +1365,9 @@ VKAPI_ATTR void VKAPI_CALL vkFreeMemory(
         Device* pDevice = ApiDevice::ObjectFromHandle(device);
         Memory* pMemory = Memory::ObjectFromHandle(memory);

-        {
-            const VkAllocationCallbacks* pAllocCB = pAllocator ? pAllocator : pDevice->VkInstance()->GetAllocCallbacks();
+        const VkAllocationCallbacks* pAllocCB = pAllocator ? pAllocator : pDevice->VkInstance()->GetAllocCallbacks();

-            pMemory->Free(pDevice, pAllocCB);
-        }
+        pMemory->Free(pDevice, pAllocCB);
     }
 }
diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp
index cea821da..c3561002 100644
--- a/icd/api/vk_physical_device.cpp
+++ b/icd/api/vk_physical_device.cpp
@@ -741,6 +741,12 @@ VkResult PhysicalDevice::Initialize()
         Pal::GpuHeapGartCacheable
     };

+    if (settings.forceUMA)
+    {
+        heapProperties[Pal::GpuHeapInvisible].heapSize = 0;
+        heapProperties[Pal::GpuHeapLocal].heapSize     = 0;
+    }
+
     const Pal::gpusize invisHeapSize = heapProperties[Pal::GpuHeapInvisible].heapSize;
     const Pal::gpusize localHeapSize = heapProperties[Pal::GpuHeapLocal].heapSize;
@@ -859,29 +865,6 @@ VkResult PhysicalDevice::Initialize()
                 memTypeWantsCoherentMemory[memoryTypeIndex] = true;
             }
         }
-
-        // Optional: if we have exposed a memory type that is host visible, add a backup
-        // memory type that is not host visible. We will use it for optimally tiled images.
-        if (settings.addHostInvisibleMemoryTypesForOptimalImages &&
-            ((pMemoryType->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) != 0) &&
-            // Skip host visible+coherent+cached as we won't need it
-            ((pMemoryType->propertyFlags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) == 0))
-        {
-            memoryTypeIndex = m_memoryProperties.memoryTypeCount++;
-
-            m_memoryTypeMask |= 1 << memoryTypeIndex;
-            m_memoryVkIndexToPalHeap[memoryTypeIndex] = palGpuHeap;
-            m_memoryPalHeapToVkIndexBits[palGpuHeap] |= (1UL << memoryTypeIndex);
-
-            VkMemoryType* pNextMemoryType = &m_memoryProperties.memoryTypes[memoryTypeIndex];
-
-            constexpr VkFlags hostMask = (VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-                                          VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
-                                          VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
-
-            pNextMemoryType->heapIndex     = pMemoryType->heapIndex;
-            pNextMemoryType->propertyFlags = pMemoryType->propertyFlags & ~hostMask;
-        }
     }
 }
@@ -956,6 +939,10 @@ VkResult PhysicalDevice::Initialize()
                     (1UL << memoryTypeIndex);

                 m_memoryTypeMask |= 1 << m_memoryProperties.memoryTypeCount;
+
+                m_memoryVkIndexAddRemoteBackupHeap[m_memoryProperties.memoryTypeCount] =
+                    m_memoryVkIndexAddRemoteBackupHeap[memoryTypeIndex];
+
                 ++m_memoryProperties.memoryTypeCount;
             }
         }
@@ -3613,8 +3600,8 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions(
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(GOOGLE_HLSL_FUNCTIONALITY1));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(GOOGLE_DECORATE_STRING));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SCALAR_BLOCK_LAYOUT));
-    availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_MEMORY_PRIORITY));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_MEMORY_BUDGET));
+    availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_MEMORY_PRIORITY));

     if ((pPhysicalDevice == nullptr) || pPhysicalDevice->PalProperties().gfxipProperties.flags.supportPostDepthCoverage)
     {
@@ -3678,9 +3665,16 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions(
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_TERMINATE_INVOCATION));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_EXTENDED_DYNAMIC_STATE2));
+    availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_INTEGER_DOT_PRODUCT));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_COPY_COMMANDS2));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SHADER_SUBGROUP_UNIFORM_CONTROL_FLOW));

+    availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_ATOMIC_FLOAT));
+    if ((pPhysicalDevice == nullptr) || (pPhysicalDevice->PalProperties().gfxLevel > Pal::GfxIpLevel::GfxIp9))
+    {
+        availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_ATOMIC_FLOAT2));
+    }
+
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_4444_FORMATS));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_SYNCHRONIZATION2));
     availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_CUSTOM_BORDER_COLOR));
@@ -4905,6 +4899,22 @@ size_t PhysicalDevice::GetFeatures2(
             break;
         }

+        case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR:
+        {
+            if (IsExtensionSupported(DeviceExtensions::KHR_SHADER_INTEGER_DOT_PRODUCT))
+            {
+                auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR*>(pHeader);
+
+                if (updateFeatures)
+                {
+                    pExtInfo->shaderIntegerDotProduct = VK_TRUE;
+                }
+
+                structSize = sizeof(*pExtInfo);
+            }
+            break;
+        }
+
         case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES:
         {
             auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceScalarBlockLayoutFeatures*>(pHeader);
@@ -4977,6 +4987,21 @@ size_t PhysicalDevice::GetFeatures2(
             break;
         }

+        case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVE_TOPOLOGY_LIST_RESTART_FEATURES_EXT:
+        {
+            auto* pExtInfo = reinterpret_cast<VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT*>(pHeader);
+
+            if (updateFeatures)
+            {
+                pExtInfo->primitiveTopologyListRestart      = VK_TRUE;
+                pExtInfo->primitiveTopologyPatchListRestart = VK_FALSE;
+            }
+
+            structSize = sizeof(*pExtInfo);
+
+            break;
+        }
+
         case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_TERMINATE_INVOCATION_FEATURES_KHR:
         {
             auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceShaderTerminateInvocationFeaturesKHR*>(pHeader);
@@ -5089,6 +5114,22 @@ size_t PhysicalDevice::GetFeatures2(
             break;
         }

+        case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_EXT:
+        {
+            auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceBufferDeviceAddressFeaturesEXT*>(pHeader);
+
+            if (updateFeatures)
+            {
+                GetPhysicalDeviceBufferAddressFeatures(
+                    &pExtInfo->bufferDeviceAddress,
+                    &pExtInfo->bufferDeviceAddressCaptureReplay,
+                    &pExtInfo->bufferDeviceAddressMultiDevice);
+            }
+
+            structSize = sizeof(*pExtInfo);
+
+            break;
+        }
         case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_FEATURES_EXT:
         {
             auto* pExtInfo = reinterpret_cast<VkPhysicalDeviceLineRasterizationFeaturesEXT*>(pHeader);
@@ -5527,11 +5568,25 @@ size_t PhysicalDevice::GetFeatures2(
                 {
                     pExtInfo->shaderBufferFloat32Atomics   = VK_TRUE;
                     pExtInfo->shaderBufferFloat32AtomicAdd = VK_FALSE;
-                    pExtInfo->shaderBufferFloat64Atomics   = VK_TRUE;
+                    if (PalProperties().gfxipProperties.flags.support64BitInstructions)
+                    {
+                        pExtInfo->shaderBufferFloat64Atomics = VK_TRUE;
+                    }
+                    else
+                    {
+                        pExtInfo->shaderBufferFloat64Atomics = VK_FALSE;
+                    }
                     pExtInfo->shaderBufferFloat64AtomicAdd = VK_FALSE;
                     pExtInfo->shaderSharedFloat32Atomics   = VK_TRUE;
                     pExtInfo->shaderSharedFloat32AtomicAdd = VK_FALSE;
-                    pExtInfo->shaderSharedFloat64Atomics   = VK_TRUE;
+                    if (PalProperties().gfxipProperties.flags.support64BitInstructions)
+                    {
+                        pExtInfo->shaderSharedFloat64Atomics = VK_TRUE;
+                    }
+                    else
+                    {
+                        pExtInfo->shaderSharedFloat64Atomics = VK_FALSE;
+                    }
                     pExtInfo->shaderSharedFloat64AtomicAdd = VK_FALSE;
                     pExtInfo->shaderImageFloat32Atomics    = VK_TRUE;
                     pExtInfo->shaderImageFloat32AtomicAdd  = VK_FALSE;
@@ -5553,12 +5608,26 @@ size_t PhysicalDevice::GetFeatures2(
                     pExtInfo->shaderBufferFloat16AtomicAdd    = VK_FALSE;
                     pExtInfo->shaderBufferFloat16AtomicMinMax = VK_FALSE;
                     pExtInfo->shaderBufferFloat32AtomicMinMax = VK_TRUE;
-                    pExtInfo->shaderBufferFloat64AtomicMinMax = VK_TRUE;
+                    if (PalProperties().gfxipProperties.flags.support64BitInstructions)
+                    {
+                        pExtInfo->shaderBufferFloat64AtomicMinMax = VK_TRUE;
+                    }
+                    else
+                    {
+                        pExtInfo->shaderBufferFloat64AtomicMinMax = VK_FALSE;
+                    }
                     pExtInfo->shaderSharedFloat16Atomics      = VK_FALSE;
                     pExtInfo->shaderSharedFloat16AtomicAdd    = VK_FALSE;
                     pExtInfo->shaderSharedFloat16AtomicMinMax = VK_FALSE;
                     pExtInfo->shaderSharedFloat32AtomicMinMax = VK_TRUE;
-                    pExtInfo->shaderSharedFloat64AtomicMinMax = VK_TRUE;
+                    if (PalProperties().gfxipProperties.flags.support64BitInstructions)
+                    {
+                        pExtInfo->shaderSharedFloat64AtomicMinMax = VK_TRUE;
+                    }
+                    else
+                    {
+                        pExtInfo->shaderSharedFloat64AtomicMinMax = VK_FALSE;
+                    }
                     pExtInfo->shaderImageFloat32AtomicMinMax  = VK_TRUE;
                     pExtInfo->sparseImageFloat32AtomicMinMax  = VK_TRUE;
                 }
@@ -5567,6 +5636,19 @@ size_t PhysicalDevice::GetFeatures2(
             break;
         }

+        case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PAGEABLE_DEVICE_LOCAL_MEMORY_FEATURES_EXT:
+        {
+            auto* pExtInfo = reinterpret_cast<VkPhysicalDevicePageableDeviceLocalMemoryFeaturesEXT*>(pHeader);
+
+            if (updateFeatures)
+            {
+                pExtInfo->pageableDeviceLocalMemory = VK_TRUE;
+            }
+
+            structSize = sizeof(*pExtInfo);
+            break;
+        }
+
         default:
         {
             // skip any unsupported extension structures
@@ -6113,6 +6195,55 @@ void PhysicalDevice::GetDeviceProperties2(
             break;
         }

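Aside (not part of the patch): pageableDeviceLocalMemory = VK_TRUE means applications can re-prioritize an allocation after the fact via vkSetDeviceMemoryPriorityEXT from VK_EXT_pageable_device_local_memory, which the Memory::SetPriority refactor earlier in this patch appears to back. A minimal app-side usage sketch:

    #include <vulkan/vulkan.h>

    void DemoteWhenIdle(VkDevice device, VkDeviceMemory memory)
    {
        // Requires VK_EXT_pageable_device_local_memory to be enabled on the device.
        auto pfnSetPriority = reinterpret_cast<PFN_vkSetDeviceMemoryPriorityEXT>(
            vkGetDeviceProcAddr(device, "vkSetDeviceMemoryPriorityEXT"));

        if (pfnSetPriority != nullptr)
        {
            // 0.0f is the lowest priority, 1.0f the highest; 0.5f is the default.
            pfnSetPriority(device, memory, 0.0f);
        }
    }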
+        case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR:
+        {
+            auto* pProps = static_cast<VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR*>(pNext);
+
+            const VkBool32 int8DotSupport = PalProperties().gfxipProperties.flags.supportInt8Dot ? VK_TRUE :
+                                                                                                   VK_FALSE;
+            pProps->integerDotProduct8BitUnsignedAccelerated         = int8DotSupport;
+            pProps->integerDotProduct8BitSignedAccelerated           = int8DotSupport;
+            pProps->integerDotProduct4x8BitPackedUnsignedAccelerated = int8DotSupport;
+            pProps->integerDotProduct4x8BitPackedSignedAccelerated   = int8DotSupport;
+
+            {
+                pProps->integerDotProduct8BitMixedSignednessAccelerated         = VK_FALSE;
+                pProps->integerDotProduct4x8BitPackedMixedSignednessAccelerated = VK_FALSE;
+            }
+
+            const VkBool32 int16DotSupport = ((PalProperties().gfxipProperties.flags.support16BitInstructions) &&
+                                              ((GetRuntimeSettings().optOnlyEnableFP16ForGfx9Plus == false) ||
+                                               (PalProperties().gfxLevel >= Pal::GfxIpLevel::GfxIp9))
+                                             ) ? VK_TRUE : VK_FALSE;
+
+            pProps->integerDotProduct16BitUnsignedAccelerated                       = int16DotSupport;
+            pProps->integerDotProduct16BitSignedAccelerated                         = int16DotSupport;
+            pProps->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = int16DotSupport;
+            pProps->integerDotProductAccumulatingSaturating16BitSignedAccelerated   = int16DotSupport;
+
+            pProps->integerDotProduct16BitMixedSignednessAccelerated                                = VK_FALSE;
+            pProps->integerDotProduct32BitUnsignedAccelerated                                       = VK_FALSE;
+            pProps->integerDotProduct32BitSignedAccelerated                                         = VK_FALSE;
+            pProps->integerDotProduct32BitMixedSignednessAccelerated                                = VK_FALSE;
+            pProps->integerDotProduct64BitUnsignedAccelerated                                       = VK_FALSE;
+            pProps->integerDotProduct64BitSignedAccelerated                                         = VK_FALSE;
+            pProps->integerDotProduct64BitMixedSignednessAccelerated                                = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated                  = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating8BitSignedAccelerated                    = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated           = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated          = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated            = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated   = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated          = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated                 = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating32BitSignedAccelerated                   = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated          = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated                 = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating64BitSignedAccelerated                   = VK_FALSE;
+            pProps->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated          = VK_FALSE;
+        }
+        break;

         case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT:
         {
             auto* pProps = static_cast<VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT*>(pNext);
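Aside (not part of the patch): an application probes these features and properties by chaining the KHR structs into the standard query calls. A self-contained usage sketch against the public Vulkan API:

    #include <vulkan/vulkan.h>

    bool SupportsAccelerated8BitDot(VkPhysicalDevice physicalDevice)
    {
        VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR dotFeatures = {};
        dotFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR;

        VkPhysicalDeviceFeatures2 features2 = {};
        features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
        features2.pNext = &dotFeatures;
        vkGetPhysicalDeviceFeatures2(physicalDevice, &features2);

        VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR dotProps = {};
        dotProps.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR;

        VkPhysicalDeviceProperties2 props2 = {};
        props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
        props2.pNext = &dotProps;
        vkGetPhysicalDeviceProperties2(physicalDevice, &props2);

        // The feature says dot-product instructions exist; the property says this
        // particular combination is actually hardware-accelerated.
        return (dotFeatures.shaderIntegerDotProduct == VK_TRUE) &&
               (dotProps.integerDotProduct4x8BitPackedSignedAccelerated == VK_TRUE);
    }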
diff --git a/icd/api/vk_pipeline_layout.cpp b/icd/api/vk_pipeline_layout.cpp
index 18acc405..f73e9a2f 100644
--- a/icd/api/vk_pipeline_layout.cpp
+++ b/icd/api/vk_pipeline_layout.cpp
@@ -141,40 +141,20 @@ VkResult PipelineLayout::ConvertCreateInfo(
     pInfo->userDataLayout.setBindingRegCount = 0;
     pInfo->userDataLayout.setBindingRegBase  = 0;

-    // Reserve an user-data to store the VA of buffer for transform feedback.
-    if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK))
+    if (pDevice->GetRuntimeSettings().enableEarlyCompile)
     {
-        pInfo->userDataLayout.transformFeedbackRegCount = 1;
-        pInfo->userDataRegCount += pInfo->userDataLayout.transformFeedbackRegCount;
-        pPipelineInfo->numUserDataNodes += 1;
+        // Early compile mode implicitly enables the uber-fetch shader and spec constant buffers on the
+        // vertex and fragment shaders, so we need three reserved nodes.
+        pPipelineInfo->numUserDataNodes += 3;
+        pInfo->userDataRegCount         += 6; // Each buffer consumes 2 user data registers now.
     }
-
-    // Reserve one user data nodes for uber-fetch shader.
-    if (pDevice->GetRuntimeSettings().enableUberFetchShader)
+    else if (pDevice->GetRuntimeSettings().enableUberFetchShader)
     {
+        // Reserve one user data node for the uber-fetch shader.
         pPipelineInfo->numUserDataNodes += 1;
+        pInfo->userDataRegCount         += 2;
     }

-    // Calculate the number of bytes needed for push constants
-    uint32_t pushConstantsSizeInBytes = 0;
-
-    for (uint32_t i = 0; i < pIn->pushConstantRangeCount; ++i)
-    {
-        const VkPushConstantRange* pRange = &pIn->pPushConstantRanges[i];
-
-        // Test if this push constant range is active in at least one stage
-        if (pRange->stageFlags != 0)
-        {
-            pushConstantsSizeInBytes = Util::Max(pushConstantsSizeInBytes, pRange->offset + pRange->size);
-        }
-    }
-
-    uint32_t pushConstRegCount = pushConstantsSizeInBytes / sizeof(uint32_t);
-
-    pInfo->userDataLayout.pushConstRegBase  = pInfo->userDataLayout.transformFeedbackRegCount;
-    pInfo->userDataLayout.pushConstRegCount = pushConstRegCount;
-    pInfo->userDataRegCount += pushConstRegCount;
-
     VK_ASSERT(pIn->setLayoutCount <= MaxDescriptorSets);

     // Total number of dynamic descriptors across all descriptor sets
@@ -234,6 +214,17 @@ VkResult PipelineLayout::ConvertCreateInfo(
         // Add the number of user data regs used by this set to the total count for the whole layout
         pInfo->userDataRegCount += pSetUserData->totalRegCount;
+
+        if (pDevice->GetRuntimeSettings().pipelineLayoutMode == PipelineLayoutAngle)
+        {
+            // Force the next set's firstRegOffset to align to AngleDescPattern.
+            if ((i + 1) < Util::ArrayLen(AngleDescPattern::DescriptorSetOffset))
+            {
+                if (pInfo->userDataRegCount < AngleDescPattern::DescriptorSetOffset[i + 1])
+                {
+                    pInfo->userDataRegCount = AngleDescPattern::DescriptorSetOffset[i + 1];
+                }
+            }
+        }
     }

     // Calculate total number of user data regs used for active descriptor set data
@@ -241,6 +232,35 @@ VkResult PipelineLayout::ConvertCreateInfo(

     VK_ASSERT(totalDynDescCount <= MaxDynamicDescriptors);

+    // Calculate the number of bytes needed for push constants
+    uint32_t pushConstantsSizeInBytes = 0;
+
+    for (uint32_t i = 0; i < pIn->pushConstantRangeCount; ++i)
+    {
+        const VkPushConstantRange* pRange = &pIn->pPushConstantRanges[i];
+
+        // Test if this push constant range is active in at least one stage
+        if (pRange->stageFlags != 0)
+        {
+            pushConstantsSizeInBytes = Util::Max(pushConstantsSizeInBytes, pRange->offset + pRange->size);
+        }
+    }
+
+    uint32_t pushConstRegCount = pushConstantsSizeInBytes / sizeof(uint32_t);
+
+    pInfo->userDataLayout.pushConstRegBase  = pInfo->userDataRegCount;
+    pInfo->userDataLayout.pushConstRegCount = pushConstRegCount;
+    pInfo->userDataRegCount += pushConstRegCount;
+
+    // Reserve a user-data register to store the VA of the buffer for transform feedback.
+    if (pDevice->IsExtensionEnabled(DeviceExtensions::EXT_TRANSFORM_FEEDBACK))
+    {
+        pInfo->userDataLayout.transformFeedbackRegBase  = pInfo->userDataRegCount;
+        pInfo->userDataLayout.transformFeedbackRegCount = 1;
+        pInfo->userDataRegCount += pInfo->userDataLayout.transformFeedbackRegCount;
+        pPipelineInfo->numUserDataNodes += 1;
+    }
+
     // In case we need an internal vertex buffer table, add nodes required for its entries, and its set pointer.
     pPipelineInfo->numRsrcMapNodes += Pal::MaxVertexBuffers;
@@ -715,7 +735,7 @@ VkResult PipelineLayout::BuildLlpcSetMapping(
 // This function populates the resource mapping node details to the shader-stage specific pipeline info structure.
 VkResult PipelineLayout::BuildLlpcPipelineMapping(
     const uint32_t             stageMask,
-    VbInfo*                    pVbInfo,
+    VbBindingInfo*             pVbInfo,
     void*                      pBuffer,
     bool                       appendFetchShaderCb,
     Vkgc::ResourceMappingData* pResourceMapping
@@ -732,41 +752,51 @@ VkResult PipelineLayout::BuildLlpcPipelineMapping(
     uint32_t mappingNodeCount     = 0; // Number of consumed ResourceMappingNodes (only sub-nodes)
     uint32_t descriptorRangeCount = 0; // Number of consumed StaticResourceValues

-    if (m_info.userDataLayout.transformFeedbackRegCount > 0)
-    {
-        uint32_t xfbStages       = (stageMask & (Vkgc::ShaderStageFragmentBit - 1)) >> 1;
-        uint32_t lastXfbStageBit = Vkgc::ShaderStageVertexBit;
+    constexpr uint32_t InternalCbRegCount = 2;

-        while (xfbStages > 0)
-        {
-            lastXfbStageBit <<= 1;
-            xfbStages       >>= 1;
-        }
+    if (appendFetchShaderCb && pVbInfo != nullptr)
+    {
+        // Append node for uber fetch shader constant buffer
+        auto pFetchShaderCbNode = &pUserDataNodes[userDataNodeCount];
+        pFetchShaderCbNode->node.type             = Vkgc::ResourceMappingNodeType::DescriptorBufferCompact;
+        pFetchShaderCbNode->node.offsetInDwords   = FetchShaderInternalBufferOffset;
+        pFetchShaderCbNode->node.sizeInDwords     = InternalCbRegCount;
+        pFetchShaderCbNode->node.srdRange.set     = Vkgc::InternalDescriptorSetId;
+        pFetchShaderCbNode->node.srdRange.binding = Vkgc::FetchShaderInternalBufferBinding;
+        pFetchShaderCbNode->visibility            = Vkgc::ShaderStageVertexBit;
+
+        userDataNodeCount += 1;
+    }

-        if (lastXfbStageBit != 0)
+    if (m_pDevice->GetRuntimeSettings().enableEarlyCompile)
+    {
+        if (stageMask & Vkgc::ShaderStageVertexBit)
         {
-            auto pTransformFeedbackNode = &pUserDataNodes[userDataNodeCount];
-            pTransformFeedbackNode->node.type           = Vkgc::ResourceMappingNodeType::StreamOutTableVaPtr;
-            pTransformFeedbackNode->node.offsetInDwords = m_info.userDataLayout.transformFeedbackRegBase;
-            pTransformFeedbackNode->node.sizeInDwords   = m_info.userDataLayout.transformFeedbackRegCount;
-            pTransformFeedbackNode->visibility          = lastXfbStageBit;
+            auto pSpecConstVertexCbNode = &pUserDataNodes[userDataNodeCount];
+            pSpecConstVertexCbNode->node.type             = Vkgc::ResourceMappingNodeType::DescriptorBufferCompact;
+            pSpecConstVertexCbNode->node.offsetInDwords   = SpecConstBufferVertexOffset;
+            pSpecConstVertexCbNode->node.sizeInDwords     = InternalCbRegCount;
+            pSpecConstVertexCbNode->node.srdRange.set     = Vkgc::InternalDescriptorSetId;
+            pSpecConstVertexCbNode->node.srdRange.binding = SpecConstVertexInternalBufferBindingId;
+            pSpecConstVertexCbNode->visibility            = Vkgc::ShaderStageVertexBit;

             userDataNodeCount += 1;
         }
-    }

-    // TODO: Build the internal push constant resource mapping
-    if (m_info.userDataLayout.pushConstRegCount > 0)
+        if (stageMask & Vkgc::ShaderStageFragmentBit)
         {
-            auto pPushConstNode = &pUserDataNodes[userDataNodeCount];
-            pPushConstNode->node.type           = Vkgc::ResourceMappingNodeType::PushConst;
-            pPushConstNode->node.offsetInDwords = m_info.userDataLayout.pushConstRegBase;
-            pPushConstNode->node.sizeInDwords   = m_info.userDataLayout.pushConstRegCount;
-            pPushConstNode->node.srdRange.set   = Vkgc::InternalDescriptorSetId;
-            pPushConstNode->visibility          = stageMask;
+            auto pSpecConstFragmentCbNode = &pUserDataNodes[userDataNodeCount];
+            pSpecConstFragmentCbNode->node.type             = Vkgc::ResourceMappingNodeType::DescriptorBufferCompact;
+            pSpecConstFragmentCbNode->node.offsetInDwords   = SpecConstBufferFragmentOffset;
+            pSpecConstFragmentCbNode->node.sizeInDwords     = InternalCbRegCount;
+            pSpecConstFragmentCbNode->node.srdRange.set     = Vkgc::InternalDescriptorSetId;
+            pSpecConstFragmentCbNode->node.srdRange.binding = SpecConstFragmentInternalBufferBindingId;
+            pSpecConstFragmentCbNode->visibility            = Vkgc::ShaderStageFragmentBit;

             userDataNodeCount += 1;
         }
+    }

     // Build descriptor for each set
     for (uint32_t setIndex = 0; (setIndex < m_info.setCount) && (result == VK_SUCCESS); ++setIndex)
     {
@@ -824,6 +854,41 @@ VkResult PipelineLayout::BuildLlpcPipelineMapping(
         }
     }

+    // TODO: Build the internal push constant resource mapping
+    if (m_info.userDataLayout.pushConstRegCount > 0)
+    {
+        auto pPushConstNode = &pUserDataNodes[userDataNodeCount];
+        pPushConstNode->node.type           = Vkgc::ResourceMappingNodeType::PushConst;
+        pPushConstNode->node.offsetInDwords = m_info.userDataLayout.pushConstRegBase;
+        pPushConstNode->node.sizeInDwords   = m_info.userDataLayout.pushConstRegCount;
+        pPushConstNode->node.srdRange.set   = Vkgc::InternalDescriptorSetId;
+        pPushConstNode->visibility          = stageMask;
+
+        userDataNodeCount += 1;
+    }
+
+    if (m_info.userDataLayout.transformFeedbackRegCount > 0)
+    {
+        uint32_t xfbStages       = (stageMask & (Vkgc::ShaderStageFragmentBit - 1)) >> 1;
+        uint32_t lastXfbStageBit = Vkgc::ShaderStageVertexBit;
+
+        while (xfbStages > 0)
+        {
+            lastXfbStageBit <<= 1;
+            xfbStages       >>= 1;
+        }
+
+        if (lastXfbStageBit != 0)
+        {
+            auto pTransformFeedbackNode = &pUserDataNodes[userDataNodeCount];
+            pTransformFeedbackNode->node.type           = Vkgc::ResourceMappingNodeType::StreamOutTableVaPtr;
+            pTransformFeedbackNode->node.offsetInDwords = m_info.userDataLayout.transformFeedbackRegBase;
+            pTransformFeedbackNode->node.sizeInDwords   = m_info.userDataLayout.transformFeedbackRegCount;
+            pTransformFeedbackNode->visibility          = lastXfbStageBit;
+
+            userDataNodeCount += 1;
+        }
+    }

     if ((result == VK_SUCCESS) && (pVbInfo != nullptr))
     {
         // Build the internal vertex buffer table mapping
@@ -836,7 +901,7 @@ VkResult PipelineLayout::BuildLlpcPipelineMapping(
         // Build the table description itself
         const uint32_t srdDwSize = m_pDevice->GetProperties().descriptorSizes.bufferView / sizeof(uint32_t);

-        uint32_t vbTableSize = pVbInfo->bindingInfo.bindingTableSize * srdDwSize;
+        uint32_t vbTableSize = pVbInfo->bindingTableSize * srdDwSize;

         // Add the set pointer node pointing to this table
         auto pVbTblPtrNode = &pUserDataNodes[userDataNodeCount];
@@ -853,32 +918,6 @@ VkResult PipelineLayout::BuildLlpcPipelineMapping(
         {
             result = VK_ERROR_INITIALIZATION_FAILED;
         }
-
-        if (appendFetchShaderCb)
-        {
-            // Append node for uber fetch shader constant buffer
-            constexpr uint32_t FetchShaderCbRegCount = 2;
-            if ((userDataNodeCount + FetchShaderCbRegCount) <=
-                m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties().gfxipProperties.maxUserDataEntries)
-            {
-                auto pFetchShaderCbNode = &pUserDataNodes[userDataNodeCount];
-                pFetchShaderCbNode->node.type             = Vkgc::ResourceMappingNodeType::DescriptorBufferCompact;
-                pFetchShaderCbNode->node.offsetInDwords   = m_info.userDataRegCount + VbTablePtrRegCount;
-                pFetchShaderCbNode->node.sizeInDwords     = FetchShaderCbRegCount;
-                pFetchShaderCbNode->node.srdRange.set     = Vkgc::InternalDescriptorSetId;
-                pFetchShaderCbNode->node.srdRange.binding = Vkgc::FetchShaderInternalBufferBinding;
-                pFetchShaderCbNode->visibility            = Vkgc::ShaderStageVertexBit;
-
-                pVbInfo->uberFetchShaderBuffer.userDataOffset = pFetchShaderCbNode->node.offsetInDwords;
-                userDataNodeCount += 1;
-            }
-            else
-            {
-                VK_NEVER_CALLED();
-                result = VK_ERROR_INITIALIZATION_FAILED;
-            }
-
-        }
     }

     // If you hit these assert, we precomputed an insufficient amount of scratch space during layout creation.
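Aside (not part of the patch): the net effect of this file's changes is a reordering of the user-data register layout, with descriptor sets first, then push constants, then the transform-feedback VA. A condensed, illustrative sketch of the resulting base-offset math (field and parameter names are hypothetical simplifications of the code above):

    #include <cstdint>

    struct UserDataLayout
    {
        uint32_t setBindingRegBase;
        uint32_t pushConstRegBase;
        uint32_t transformFeedbackRegBase;
    };

    UserDataLayout ComputeLayout(const uint32_t* pSetRegCounts, uint32_t setCount,
                                 uint32_t pushConstRegCount, bool hasTransformFeedback)
    {
        UserDataLayout layout   = {};
        uint32_t       regCount = 0;

        // Internal buffers (uber-fetch shader / early-compile spec constants) are
        // reserved before this point in the real code.

        // 1) Descriptor set bindings come first...
        layout.setBindingRegBase = regCount;
        for (uint32_t i = 0; i < setCount; ++i)
        {
            regCount += pSetRegCounts[i];
        }

        // 2) ...then push constants...
        layout.pushConstRegBase = regCount;
        regCount += pushConstRegCount;

        // 3) ...and finally one register for the transform feedback VA.
        if (hasTransformFeedback)
        {
            layout.transformFeedbackRegBase = regCount;
            regCount += 1;
        }

        return layout;
    }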
diff --git a/icd/api/vk_queue.cpp b/icd/api/vk_queue.cpp
index 04461b66..962d0cdc 100644
--- a/icd/api/vk_queue.cpp
+++ b/icd/api/vk_queue.cpp
@@ -1278,7 +1278,6 @@ VkResult Queue::BindSparseEntry(
             Memory* pMemory = Memory::ObjectFromHandle(bind.memory);

             pRealGpuMem = pMemory->PalMemory(resourceDeviceIndex, memoryDeviceIndex);
-
         }

         VK_ASSERT(bind.flags == 0);
@@ -1318,7 +1317,6 @@ VkResult Queue::BindSparseEntry(
                 Memory* pMemory = Memory::ObjectFromHandle(bind.memory);

                 pRealGpuMem = pMemory->PalMemory(resourceDeviceIndex, memoryDeviceIndex);
-
             }

             result = AddVirtualRemapRange(
@@ -1361,7 +1359,6 @@ VkResult Queue::BindSparseEntry(
                 Memory* pMemory = Memory::ObjectFromHandle(bind.memory);

                 pRealGpuMem = pMemory->PalMemory(resourceDeviceIndex, memoryDeviceIndex);
-
             }

             // Get the subresource layout to be able to figure out its offset
diff --git a/icd/api/vk_render_pass.cpp b/icd/api/vk_render_pass.cpp
index 2012d4ef..bc15c2cc 100644
--- a/icd/api/vk_render_pass.cpp
+++ b/icd/api/vk_render_pass.cpp
@@ -634,17 +634,6 @@ static size_t GetRenderPassCreateInfoRequiredMemorySize(
 {
     size_t createInfoSize = 0;

-    createInfoSize += pCreateInfo->attachmentCount * sizeof(AttachmentDescription);
-    createInfoSize += pCreateInfo->subpassCount * sizeof(SubpassDescription);
-    createInfoSize += pCreateInfo->dependencyCount * sizeof(SubpassDependency);
-
-    for (uint32_t subpassIndex = 0; subpassIndex < pCreateInfo->subpassCount; ++subpassIndex)
-    {
-        const auto& subpassDesc = pCreateInfo->pSubpasses[subpassIndex];
-
-        createInfoSize += GetSubpassDescriptionBaseMemorySize(subpassDesc);
-    }
-
     if (renderPassExt.pMultiviewCreateInfo != nullptr)
     {
         createInfoSize += renderPassExt.pMultiviewCreateInfo->correlationMaskCount * sizeof(uint32_t);
@@ -656,6 +645,19 @@ static size_t GetRenderPassCreateInfoRequiredMemorySize(
         createInfoSize += pCreateInfo2->correlatedViewMaskCount * sizeof(uint32_t);
     }

+    createInfoSize += pCreateInfo->attachmentCount * sizeof(AttachmentDescription);
+    // Subpasses need to be aligned
+    createInfoSize = Util::Pow2Align(createInfoSize, alignof(SubpassDescription));
+    createInfoSize += pCreateInfo->subpassCount * sizeof(SubpassDescription);
+    createInfoSize += pCreateInfo->dependencyCount * sizeof(SubpassDependency);
+
+    for (uint32_t subpassIndex = 0; subpassIndex < pCreateInfo->subpassCount; ++subpassIndex)
+    {
+        const auto& subpassDesc = pCreateInfo->pSubpasses[subpassIndex];
+
+        createInfoSize += GetSubpassDescriptionBaseMemorySize(subpassDesc);
+    }
+
     return createInfoSize;
 }
@@ -697,6 +699,8 @@ static void InitRenderPassCreateInfo(
     }
     nextPtr = Util::VoidPtrInc(nextPtr, pCreateInfo->attachmentCount * sizeof(AttachmentDescription));
+    // Struct needs to be aligned
+    nextPtr = Util::VoidPtrAlign(nextPtr, alignof(SubpassDescription));
     VK_ASSERT(Util::VoidPtrDiff(nextPtr, pMemoryPtr) <= memorySize);

     outRenderPassInfo->subpassCount = pCreateInfo->subpassCount;
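Aside (not part of the patch): the render pass change fixes a classic placement-allocation pitfall; when several arrays share one allocated block, each sub-array's offset must be aligned for its element type, both when sizing the block and when carving pointers out of it. A minimal runnable sketch of the sizing half with illustrative types:

    #include <cstddef>
    #include <cstdint>

    struct AttachmentDescription { uint64_t data[4]; };
    struct SubpassDescription    { uint64_t data[8]; };

    size_t ComputeBlockSize(uint32_t attachmentCount, uint32_t subpassCount)
    {
        size_t size = 0;

        size += attachmentCount * sizeof(AttachmentDescription);

        // Round the running size up before the next array, mirroring
        // Util::Pow2Align(createInfoSize, alignof(SubpassDescription)) above.
        const size_t align = alignof(SubpassDescription); // must be a power of two
        size = (size + align - 1) & ~(align - 1);

        size += subpassCount * sizeof(SubpassDescription);

        return size;
    }

The same rounding must be re-applied when walking the block with pointers, which is what the added Util::VoidPtrAlign call in InitRenderPassCreateInfo does.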
diff --git a/icd/api/vk_swapchain.cpp b/icd/api/vk_swapchain.cpp
index 5286892f..34cc902f 100644
--- a/icd/api/vk_swapchain.cpp
+++ b/icd/api/vk_swapchain.cpp
@@ -335,11 +335,17 @@ VkResult SwapChain::Create(
         &palResult);
     VK_ASSERT(palResult == Pal::Result::Success);

-    size_t queueFamilyArraySize = sizeof(uint32_t*) * pCreateInfo->queueFamilyIndexCount;
-    size_t imageArraySize       = sizeof(VkImage) * swapImageCount;
-    size_t memoryArraySize      = sizeof(VkDeviceMemory) * swapImageCount;
-    size_t cmdBufArraySize      = sizeof(Pal::ICmdBuffer*) * swapImageCount;
-    size_t objSize = vkSwapChainSize +
+    properties.queueFamilyIndexCount = ((pCreateInfo->imageSharingMode == VK_SHARING_MODE_CONCURRENT) ?
+                                        pCreateInfo->queueFamilyIndexCount : 0u);
+
+    // If imageSharingMode is VK_SHARING_MODE_CONCURRENT, queueFamilyIndexCount must be greater than 1.
+    VK_ASSERT((pCreateInfo->imageSharingMode != VK_SHARING_MODE_CONCURRENT) || (properties.queueFamilyIndexCount > 1));
+
+    const size_t queueFamilyArraySize = sizeof(uint32_t*) * properties.queueFamilyIndexCount;
+    const size_t imageArraySize       = sizeof(VkImage) * swapImageCount;
+    const size_t memoryArraySize      = sizeof(VkDeviceMemory) * swapImageCount;
+    const size_t cmdBufArraySize      = sizeof(Pal::ICmdBuffer*) * swapImageCount;
+    const size_t objSize = vkSwapChainSize +
         queueFamilyArraySize +
         palSwapChainSize +
         imageArraySize +
@@ -402,26 +408,27 @@ VkResult SwapChain::Create(
             &properties.imageCreateInfo);
     }

+    // Store creation info for image barrier policy
+    properties.usage       = pCreateInfo->imageUsage;
+    properties.sharingMode = pCreateInfo->imageSharingMode;
+    properties.format      = pCreateInfo->imageFormat;
+
     properties.images = static_cast<VkImage*>(Util::VoidPtrInc(pMemory, offset));
     offset += imageArraySize;

     properties.imageMemory = static_cast<VkDeviceMemory*>(Util::VoidPtrInc(pMemory, offset));
     offset += memoryArraySize;

-    properties.pQueueFamilyIndices = static_cast<uint32_t*>(Util::VoidPtrInc(pMemory, offset));
-    offset += queueFamilyArraySize;
+    // memcpy queue family indices
+    if (queueFamilyArraySize > 0u)
+    {
+        properties.pQueueFamilyIndices = static_cast<uint32_t*>(Util::VoidPtrInc(pMemory, offset));
+        offset += queueFamilyArraySize;
+        memcpy(properties.pQueueFamilyIndices, pCreateInfo->pQueueFamilyIndices, queueFamilyArraySize);
+    }

     VK_ASSERT(offset == objSize);

-    // Store creation info for image barrier policy
-    properties.usage                 = pCreateInfo->imageUsage;
-    properties.queueFamilyIndexCount = pCreateInfo->queueFamilyIndexCount;
-    properties.sharingMode           = pCreateInfo->imageSharingMode;
-    properties.format                = pCreateInfo->imageFormat;
-
-    // memcpy queue family indices
-    memcpy(properties.pQueueFamilyIndices, pCreateInfo->pQueueFamilyIndices, queueFamilyArraySize);
-
     for (properties.imageCount = 0; properties.imageCount < swapImageCount; ++properties.imageCount)
     {
         if (result == VK_SUCCESS)
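Aside (not part of the patch): the swapchain change reflects a Vulkan rule the driver can now rely on; pQueueFamilyIndices is only meaningful for VK_SHARING_MODE_CONCURRENT, which in turn requires at least two queue families. An app-side sketch of filling the create info accordingly (usage sketch against the public API; error handling elided):

    #include <vulkan/vulkan.h>

    VkSwapchainCreateInfoKHR MakeSwapchainInfo(
        VkSurfaceKHR    surface,
        const uint32_t* pQueueFamilies, // e.g. { graphicsFamily, presentFamily }
        uint32_t        familyCount)
    {
        VkSwapchainCreateInfoKHR info = {};
        info.sType   = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR;
        info.surface = surface;

        if (familyCount > 1)
        {
            // Concurrent mode requires two or more distinct queue families.
            info.imageSharingMode      = VK_SHARING_MODE_CONCURRENT;
            info.queueFamilyIndexCount = familyCount;
            info.pQueueFamilyIndices   = pQueueFamilies;
        }
        else
        {
            // Exclusive mode ignores the queue family list entirely.
            info.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE;
        }

        return info;
    }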
diff --git a/icd/make/importdefs b/icd/make/importdefs
index f699ea87..e268502f 100644
--- a/icd/make/importdefs
+++ b/icd/make/importdefs
@@ -26,7 +26,7 @@
 # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface
 # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. It must
 # be updated on each PAL promotion after handling all of the interface changes described in palLib.h.
-ICD_PAL_CLIENT_MAJOR_VERSION = 675
+ICD_PAL_CLIENT_MAJOR_VERSION = 678
 ICD_PAL_CLIENT_MINOR_VERSION = 0

 # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. It describes
diff --git a/icd/res/ver.h b/icd/res/ver.h
index 17a4a6d0..5b3c52e1 100644
--- a/icd/res/ver.h
+++ b/icd/res/ver.h
@@ -36,7 +36,7 @@
 #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0"

 // Bump up after each promotion to mainline
-#define VULKAN_ICD_BUILD_VERSION 199
+#define VULKAN_ICD_BUILD_VERSION 201

 // String version is needed with leading zeros and extra termination (unicode)
 #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION
diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp
index eaef6388..0c2195cb 100644
--- a/icd/settings/settings.cpp
+++ b/icd/settings/settings.cpp
@@ -376,13 +376,36 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings(
             // WWZ performs worse with DCC forced on, so just let the PAL heuristics decide what's best for now.
             if (pInfo->gfxLevel >= Pal::GfxIpLevel::GfxIp10_1)
             {
-                m_settings.forceEnableDcc = ForceDccDefault;
+                m_settings.forceEnableDcc = (ForceDccFor64BppShaderStorage |
+                                             ForceDccForNonColorAttachmentShaderStorage |
+                                             ForceDccForColorAttachments |
+                                             ForceDccFor2DShaderStorage);
+            }

             // Mall no alloc setting gives a ~0.82% gain
             if (pInfo->gfxLevel == Pal::GfxIpLevel::GfxIp10_3)
             {
-                m_settings.mallNoAllocSsrPolicy = MallNoAllocSsrAsSnsr;
+                m_settings.csWaveSize = 64;
+                m_settings.fsWaveSize = 64;
+
+                if (pInfo->revision == Pal::AsicRevision::Navi21)
+                {
+                    m_settings.forceEnableDcc = (ForceDccFor64BppShaderStorage |
+                                                 ForceDccFor32BppShaderStorage |
+                                                 ForceDccForColorAttachments |
+                                                 ForceDccFor3DShaderStorage);
+
+                    m_settings.mallNoAllocCtPolicy = MallNoAllocCtAsSnsr;
+                }
+
+                if (pInfo->revision == Pal::AsicRevision::Navi23)
+                {
+                    m_settings.forceEnableDcc = (ForceDccFor32BppShaderStorage |
+                                                 ForceDccForNonColorAttachmentShaderStorage |
+                                                 ForceDccForColorAttachments |
+                                                 ForceDccFor3DShaderStorage);
+                }
             }

             m_settings.implicitExternalSynchronization = false;
diff --git a/icd/settings/settings_xgl.json b/icd/settings/settings_xgl.json
index e4feca85..cbfcb3fc 100644
--- a/icd/settings/settings_xgl.json
+++ b/icd/settings/settings_xgl.json
@@ -651,6 +651,34 @@
             "Scope": "Driver",
             "Type": "enum"
         },
+        {
+            "Name": "PipelineLayoutMode",
+            "Description": "Control the pipeline descriptor layout for early compile",
+            "Tags": [
+                "Pipeline Options"
+            ],
+            "Defaults": {
+                "Default": "PipelineLayoutDefault"
+            },
+            "ValidValues": {
+                "IsEnum": true,
+                "Values": [
+                    {
+                        "Name": "PipelineLayoutDefault",
+                        "Value": 0,
+                        "Description": "Build descriptor layout with the default layout"
+                    },
+                    {
+                        "Name": "PipelineLayoutAngle",
+                        "Value": 1,
+                        "Description": "Build descriptor layout compatible with ANGLE-based apps"
+                    }
+                ],
+                "Name": "PipelineLayoutMode"
+            },
+            "Scope": "Driver",
+            "Type": "enum"
+        },
         {
             "Name": "PipelineBinningMode",
             "Description": "Specifies whether to override binning setting for pipeline.",
@@ -972,6 +1000,45 @@
             "Scope": "Driver",
             "Type": "bool"
         },
+        {
+            "Name": "EnableEarlyCompile",
+            "Description": "Enable pipeline early compile.",
+            "Tags": [
+                "SPIRV Options"
+            ],
+            "Defaults": {
+                "Default": false
+            },
+            "Scope": "Driver",
+            "Type": "bool"
+        },
+        {
+            "Name": "DeferCompileOptimizedPipeline",
+            "Description": "Whether to enable deferred compilation of optimized pipelines; only affects the EnableUberFetchShader and EnableEarlyCompile options",
+            "Tags": [
+                "SPIRV Options"
+            ],
+            "Defaults": {
+                "Default": false
+            },
+            "Scope": "Driver",
+            "Type": "bool"
+        },
+        {
+            "Name": "DeferCompileThreadCount",
+            "Description": "Assistant thread count for deferred compile operations; if the count is greater than the internal limit, the real thread count will be clamped to that limit.",
+            "Tags": [
+                "SPIRV Options"
+            ],
+            "Defaults": {
+                "Default": "0xFFFFFFFF"
+            },
+            "Scope": "Driver",
+            "Type": "uint32",
+            "Flags": {
+                "IsHex": true
+            }
+        },
         {
             "Name": "DisablePerCompFetch",
             "Description": "Disable per component fetch in uber fetch shader.",
@@ -1154,6 +1221,11 @@
                         "Name": "ShaderReplaceShaderISA",
                         "Value": 4,
                         "Description": "Enable replace ISA shader in the pipeline, For every pipeline in the ShaderReplacementPipelineHashs, would find if there is a file named 0xAAA_replace.txt under ShaderReplacementDir, would be loaded for the replacement the replace shader look like this *----offset: ISACODE----* 848:0x7E120303 1480:0x7E1E0303 2592:0x7E0E030E"
+                    },
+                    {
+                        "Name": "ShaderReplaceShaderHashPipelineBinaryHash",
+                        "Value": 5,
+                        "Description": "Enable both shader hash based shader replacement and pipeline binary hash based pipeline binary replacement. In cases where both a pipeline and one or more of its shaders are replaced, the replacement shader will take precedence and will potentially change the hash of the pipeline. The pipeline will only be replaced if the pipeline replacement file has the new hash."
                     }
                 ],
                 "Name": "ShaderReplaceMode"
@@ -2090,6 +2162,18 @@
             "Type": "bool",
             "Name": "DumpDuplicatePipelines"
         },
+        {
+            "Description": "Re-routes all compute work to a universal queue internally.",
+            "Tags": [
+                "General"
+            ],
+            "Defaults": {
+                "Default": false
+            },
+            "Type": "bool",
+            "Name": "UseUniversalAsComputeQueue",
+            "Scope": "Driver"
+        },
         {
             "Name": "DbgBarrierPostCmdEnable",
             "Description": "Triggers a CmdBarrier call after any command in the given mask. The barrier behavior is controlled by the other DbgBarrierPost* settings in this category. Requires VK_ENABLE_DEBUG_BARRIERS=1 to take effect. 0x8FFFFFFF: All commands (heavyweight option)",
@@ -3310,18 +3394,6 @@
             "Type": "uint32",
             "Name": "TransferGranularityDmaOverride"
         },
-        {
-            "Description": "If this option is enabled, the following changes are introduced: (1) Images with VK_IMAGE_TILING_OPTIMAL may not be bound to memory types with VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT. (2) For each memory type that was previously host visible and usable with optimal images, an another memory type is created that is not host visible and can be used with optimally tiled images.",
-            "Tags": [
-                "Memory"
-            ],
-            "Defaults": {
-                "Default": false
-            },
-            "Scope": "Driver",
-            "Type": "bool",
-            "Name": "AddHostInvisibleMemoryTypesForOptimalImages"
-        },
         {
             "Description": "If this option is enabled, the driver returns an extra image memory requirement. The amount of memory is determined by memoryPaddingFactorForImageMemoryRequirements. This can be used while capturing GFXR traces and can be helpful for DCC tuning",
             "Tags": [
@@ -3622,6 +3694,18 @@
             "Type": "bool",
             "Name": "OverrideHeapGartCacheableToUswc"
         },
+        {
+            "Description": "For APUs, set the local and local invisible heap sizes to 0 to force use of system memory.",
+            "Tags": [
+                "Memory"
+            ],
+            "Defaults": {
+                "Default": false
+            },
+            "Scope": "Driver",
+            "Type": "bool",
+            "Name": "ForceUMA"
+        },
", "Tags": [ @@ -4402,30 +4486,6 @@ "Scope": "Driver", "Type": "enum" }, - { - "Description": "Enable async compile for shader module and pipelines.", - "Tags": [ - "Optimization" - ], - "Defaults": { - "Default": false - }, - "Scope": "Driver", - "Type": "bool", - "Name": "EnableAsyncCompile" - }, - { - "Description": "Enable partial pipeline compile.", - "Tags": [ - "Optimization" - ], - "Defaults": { - "Default": false - }, - "Scope": "Driver", - "Type": "bool", - "Name": "EnablePartialPipelineCompile" - }, { "Description": "Specifies the maximum threshold in bytes for linear transfer commands to use CP DMA, which have less overhead than CS/Gfx copies, but also less throughput for large copies.", "Tags": [ @@ -5423,6 +5483,18 @@ }, "Type": "bool", "Scope": "Driver" + }, + { + "Name": "EnableDumbTransitionSync", + "Description": "Enable synchronizing cache by adding dumb transition in the barrier", + "Tags": [ + "General" + ], + "Defaults": { + "Default": true + }, + "Type": "bool", + "Scope": "Driver" } ] } \ No newline at end of file diff --git a/tools/cache_creator/CMakeLists.txt b/tools/cache_creator/CMakeLists.txt index 37d0b3f5..d3f3ec7f 100644 --- a/tools/cache_creator/CMakeLists.txt +++ b/tools/cache_creator/CMakeLists.txt @@ -75,8 +75,8 @@ target_link_libraries(cache-info PRIVATE cache_creator_lib) # Build cache creator tools whenever we build XGL. add_dependencies(xgl cache-creator cache-info) -if(XGL_BUILD_LIT) - message(STATUS "Building cache creator LIT tests") +if(XGL_BUILD_TESTS OR XGL_BUILD_LIT) + message(STATUS "Building cache creator tests") set(CACHE_CREATOR_TOOLS_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}") add_subdirectory(test "${CMAKE_CURRENT_BINARY_DIR}/test/cache-creator/lit") add_subdirectory(unittests "${CMAKE_CURRENT_BINARY_DIR}/test/cache-creator/unittests")