inc/core/palCmdBuffer.h

/*
 ***********************************************************************************************************************
 *
 *  Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
 *
 *  Permission is hereby granted, free of charge, to any person obtaining a copy
 *  of this software and associated documentation files (the "Software"), to deal
 *  in the Software without restriction, including without limitation the rights
 *  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 *  copies of the Software, and to permit persons to whom the Software is
 *  furnished to do so, subject to the following conditions:
 *
 *  The above copyright notice and this permission notice shall be included in all
 *  copies or substantial portions of the Software.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 *  SOFTWARE.
 *
 **********************************************************************************************************************/
/**
 ***********************************************************************************************************************
 * @file  palCmdBuffer.h
 * @brief Defines the Platform Abstraction Library (PAL) ICmdBuffer interface and related types.
 ***********************************************************************************************************************
 */

#pragma once

#include "pal.h"
#include "palDevice.h"
#include "palGpuMemory.h"
#include "palImage.h"
#include "palMsaaState.h"
#include "palPipeline.h"
#include "palQueryPool.h"

/// HSA kernel dispatch packet typedef
typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t;
/// AMD kernel code typedef
typedef struct amd_kernel_code_s amd_kernel_code_t;
/// AMD kernel descriptor
namespace llvm {namespace amdhsa {struct kernel_descriptor_t;} }

namespace Util
{
class VirtualLinearAllocator;
class Event;
}

namespace Pal
{

// Forward declarations.
class IBorderColorPalette;
class ICmdAllocator;
class ICmdBuffer;
class IColorBlendState;
class IColorTargetView;
class IDepthStencilState;
class IDepthStencilView;
class IGpuEvent;
class IGpuMemory;
class IIndirectCmdGenerator;
class IMsaaState;
class IPerfExperiment;
class IQueue;
class IQueryPool;
enum class PerfTraceMarkerType : uint32;
enum class PointOrigin : uint32;

struct VideoCodecInfo;
struct VideoCodecAuxInfo;

/// Specifies a pipeline bind point (i.e., compute or graphics).
enum class PipelineBindPoint : uint32
{
    Compute     = 0x0,
    Graphics    = 0x1,
    Count
};

/// Fully specifies a type of graphics primitive and vertex ordering for geometry.
enum class PrimitiveTopology : uint8
{
    PointList        = 0x0,
    LineList         = 0x1,
    LineStrip        = 0x2,
    TriangleList     = 0x3,
    TriangleStrip    = 0x4,
    RectList         = 0x5,  ///< Each rect is three 2D axis-aligned rectangle vertices.
    QuadList         = 0x6,
    QuadStrip        = 0x7,
    LineListAdj      = 0x8,
    LineStripAdj     = 0x9,
    TriangleListAdj  = 0xA,
    TriangleStripAdj = 0xB,
    Patch            = 0xC,
    TriangleFan      = 0xD,
    LineLoop         = 0xE,
    Polygon          = 0xF,
    TwoDRectList     = 0x10, ///< Each rect is the bounding box of an arbitrary 2D triangle.
                             ///  Support is optional, see support2DRectList in DeviceProperties.
    Count
};

/// Specifies how triangle primitives should be rasterized.
enum class FillMode : uint8
{
    Points    = 0x0,
    Wireframe = 0x1,
    Solid     = 0x2,
    Count
};

/// Specifies the triangle face direction that should result in culled primitives.
enum class CullMode : uint8
{
    _None         = 0x0,  ///< All triangles are rasterized.
    Front         = 0x1,  ///< Front facing triangles are culled.
    Back          = 0x2,  ///< Back facing triangles are culled.
    FrontAndBack  = 0x3,  ///< All triangles are culled.

    // Unfortunately for Linux clients, X.h includes a "#define None 0" macro.  Clients have their choice of either
    // undefing None before including this header or using _None when dealing with PAL.
#ifndef None
    None = _None,         ///< All triangles are rasterized.
#endif
};

/// Specifies vertex winding order corresponding to a front facing triangle.  @see CullMode.
enum class FaceOrientation : uint8
{
    Ccw = 0x0,  ///< Counter-clockwise vertex winding primitives are front facing.
    Cw  = 0x1   ///< Clockwise vertex winding primitives are front facing.
};

/// Specifies which vertex of a primitive is the _provoking vertex_.  This impacts which vertex's "flat" VS outputs
/// are passed to the PS (i.e., flat shading).
enum class ProvokingVertex : uint8
{
    First = 0x0,
    Last  = 0x1
};

/// Specifies bit size of each element in an index buffer.
enum class IndexType : uint32
{
    Idx8  = 0x0,
    Idx16 = 0x1,
    Idx32 = 0x2,
    Count
};

/// Specifies a memory atomic operation that can be performed from command buffers with ICmdBuffer::CmdMemoryAtomic().
enum class AtomicOp : uint32
{
    AddInt32  = 0x00,
    SubInt32  = 0x01,
    MinUint32 = 0x02,
    MaxUint32 = 0x03,
    MinSint32 = 0x04,
    MaxSint32 = 0x05,
    AndInt32  = 0x06,
    OrInt32   = 0x07,
    XorInt32  = 0x08,
    IncUint32 = 0x09,
    DecUint32 = 0x0A,
    AddInt64  = 0x0B,
    SubInt64  = 0x0C,
    MinUint64 = 0x0D,
    MaxUint64 = 0x0E,
    MinSint64 = 0x0F,
    MaxSint64 = 0x10,
    AndInt64  = 0x11,
    OrInt64   = 0x12,
    XorInt64  = 0x13,
    IncUint64 = 0x14,
    DecUint64 = 0x15,
    Count
};

/// Specifies the point in the GPU pipeline where an action should take place.
///
/// Relevant operations include setting GPU events, waiting on GPU events in hardware, or writing timestamps.
///
/// @note The numeric value of these enums are ordered such that a "newState < oldState" comparison will generally yield
///        true if a stall is necessary to resolve a hazard between those two pipe points.  This guideline does not
///        hold up when comparing PreRasterization or PostPs with PostCs, as CS work is not properly pipelined with
///        graphics shader work.
///
/// @see ICmdBuffer::CmdSetEvent()
/// @see ICmdBuffer::CmdResetEvent()
/// @see ICmdBuffer::CmdPredicateEvent()
/// @see ICmdBuffer::CmdBarrier()
/// @see ICmdBuffer::CmdWriteTimestamp()
/// @see ICmdBuffer::CmdWriteImmediate()
enum HwPipePoint : uint32
{
    HwPipeTop              = 0x0,                   ///< Earliest possible point in the GPU pipeline (CP PFP), can be
                                                    ///  used as wait point for indirect args and index buffer fetch.
    HwPipePostPrefetch     = 0x1,                   ///< Indirect arguments have been fetched for all prior
                                                    ///  draws/dispatches (CP ME).
    HwPipePreRasterization = 0x2,                   ///< All prior generated VS/HS/DS/GS waves have completed, can be
                                                    ///  used as release point for VB/IB fetch and streamout target.
    HwPipePostPs           = 0x3,                   ///< All prior generated PS waves have completed.
                                                    ///  Only valid as a pipe point to wait on (release point).
    HwPipePreColorTarget   = 0x4,                   ///< Represents the same point in pipe to HwPipePostPs, but provides
                                                    ///  clients with a better option to accurately specify the pipeline
                                                    ///  sync request. And PAL uses it as entry-point to add partial
                                                    ///  flushes to prevent write-after-read hazard from corner cases.
                                                    ///  Only valid as a wait point (acquire point).
    HwPipePreIndexBuffer   = HwPipeTop,             ///< As late as possible before index buffer fetches (CP PFP).
    HwPipePostIndexBuffer  = HwPipePreRasterization,///< All prior index buffer fetches have completed.

    // The following points apply to compute-specific work:
    HwPipePreCs            = HwPipePostPrefetch,    ///< As late as possible before CS waves are launched (CP ME).
    HwPipePostCs           = 0x5,                   ///< All prior generated CS waves have completed.

    // The following points apply to BLT-specific work:
    HwPipePreBlt           = HwPipePostPrefetch,    ///< As late as possible before BLT operations are launched.
    HwPipePostBlt          = 0x6,                   ///< All prior requested BLTs have completed.

    HwPipeBottom           = 0x7,                   ///< All prior GPU work (graphics, compute, or BLT) has completed.
    HwPipePointCount
};

/// Bitmask values that can be OR'ed together to specify a synchronization scope.  See srcStageMask and dstStageMask in
/// @ref AcquireReleaseInfo.
///
/// When specifying an execution dependency at a synchronization point where previous operations must *happen-before*
/// future operations, a mask of these flags specifies a *synchronization scope* that restricts which stages of prior
/// draws, dispatches, or BLTs must *happen-before* which stages of future draws, dispatches, or BLTs.
///
/// Note that flag numerical order does not indicate any happens-before or happens-after relationships. Clients should
/// not compare flags numerically to judge execution order, only barriers can guarantee execution ordering.
enum PipelineStageFlag : uint32
{
    PipelineStageTopOfPipe         = 0x00000001,
    PipelineStageFetchIndirectArgs = 0x00000002,
    PipelineStagePostPrefetch      = 0x00000004,
    PipelineStageFetchIndices      = 0x00000008,
    PipelineStageStreamOut         = 0x00000010,
    PipelineStageVs                = 0x00000020,
    PipelineStageHs                = 0x00000040,
    PipelineStageDs                = 0x00000080,
    PipelineStageGs                = 0x00000100,
    PipelineStagePs                = 0x00000200,
    PipelineStageSampleRate        = 0x00000400,
    PipelineStageEarlyDsTarget     = 0x00000800,
    PipelineStageLateDsTarget      = 0x00001000,
    PipelineStageColorTarget       = 0x00002000,
    PipelineStageCs                = 0x00004000,
    PipelineStageBlt               = 0x00008000,
    PipelineStageBottomOfPipe      = 0x00010000,
    PipelineStageDsTarget          = PipelineStageEarlyDsTarget | PipelineStageLateDsTarget,
    PipelineStageAllStages         = 0x0001FFFF
};

/// Bitmask values that can be ORed together to specify all potential usages of an image at a point in time.  Such a
/// mask should be specified in the usages field of ImageLayout.  These combined usages can be examined by PAL to infer
/// the layout (i.e., compression state) of the image.
///
/// @note There is no layout corresponding to CmdClear*().  The layout flags passed to those functions will determine
///       the expected image layout at that time, and the CmdClear*() implementation will execute a clear that keeps the
///       layout the same.
enum ImageLayoutUsageFlags : uint32
{
    LayoutUninitializedTarget     = 0x00000001,  ///< Initial state of any image that can be used as a color or
                                                 ///  depth/stencil target.  A layout transition out of this state will
                                                 ///  likely result in a mask RAM initialization BLT.  If this bit is
                                                 ///  set, no other bits may be set.
    LayoutColorTarget             = 0x00000002,  ///< Color target bound via CmdBindTargets().  This bit is exclusive
                                                 ///  with LayoutDepthStencilTarget.
    LayoutDepthStencilTarget      = 0x00000004,  ///< Depth/stencil target bound via CmdBindTargets().  This bit is
                                                 ///  exclusive with LayoutColorTarget.
    LayoutShaderRead              = 0x00000008,  ///< Any shader read state including texture, UAV, constant buffer,
                                                 ///  vertex buffer.
    LayoutShaderFmaskBasedRead    = 0x00000010,  ///< Images in this state support the load_fptr AMD IL instruction,
                                                 ///  which will read decompressed fmask in order to access compressed
                                                 ///  MSAA color data from a shader.
    LayoutShaderWrite             = 0x00000020,  ///< Writeable UAV.
    LayoutCopySrc                 = 0x00000040,  ///< CmdCopyImage(), CmdCopyImageToMemory(), CmdScaledCopyImage or
                                                 ///  CmdCopyTiledImageToMemory() source image.
    LayoutCopyDst                 = 0x00000080,  ///< CmdCopyImage(), CmdCopyMemoryToImage(), CmdScaledCopyImage or
                                                 ///  CmdCopyMemoryToTiledImage() destination image.
    LayoutResolveSrc              = 0x00000100,  ///< CmdResolveImage() source.
    LayoutResolveDst              = 0x00000200,  ///< CmdResolveImage() destination.
    LayoutPresentWindowed         = 0x00000400,  ///< Windowed-mode IQueue::Present().
    LayoutPresentFullscreen       = 0x00000800,  ///< Fullscreen (flip) present.  Layout must be supported by the
                                                 ///  display engine.
    LayoutUncompressed            = 0x00001000,  ///< Metadata fully decompressed/expanded layout
    LayoutSampleRate              = 0x00002000,  ///< CmdBindSampleRateImage() source.
    LayoutAllUsages               = 0x00003FFF
};

/// Bitmask values that can be ORed together to specify all potential engines an image might be used on.  Such a
/// mask should be specified in the engines field of ImageLayout.
///
/// If the client API is unable to determine which engines might be used, it should specify all possible engines
/// corresponding to the usage flags.
enum ImageLayoutEngineFlags : uint32
{
    LayoutUniversalEngine       = 0x1,
    LayoutComputeEngine         = 0x2,
    LayoutDmaEngine             = 0x4,
    LayoutVideoEncodeEngine     = 0x8,
    LayoutVideoDecodeEngine     = 0x10,
    LayoutVideoJpegDecodeEngine = 0x20,
    LayoutAllEngines            = 0x3F
};

/// Bitmask values that can be ORed together to specify previous output usage and upcoming input usages of an image or
/// GPU memory in a ICmdBuffer::CmdBarrier() call to ensure cache coherency between those usages.
enum CacheCoherencyUsageFlags : uint32
{
    CoherCpu                = 0x00000001,     ///< Data read or written by CPU.
    CoherShaderRead         = 0x00000002,     ///< Data read by a GPU shader.
    CoherShaderWrite        = 0x00000004,     ///< Data written by a GPU shader.
    CoherCopySrc            = 0x00000008,     ///< Source of a ICmdBuffer::CmdCopy*() call.
    CoherCopyDst            = 0x00000010,     ///< Destination of a ICmdBuffer::CmdCopy*() call.
    CoherColorTarget        = 0x00000020,     ///< Color target.
    CoherDepthStencilTarget = 0x00000040,     ///< Depth stencil target.
    CoherResolveSrc         = 0x00000080,     ///< Source of a CmdResolveImage() call.
    CoherResolveDst         = 0x00000100,     ///< Destination of a CmdResolveImage() call.
    CoherClear              = 0x00000200,     ///< Destination of a CmdClear() call.
    CoherIndirectArgs       = 0x00000400,     ///< Source argument data read by CmdDrawIndirect() and similar functions.
    CoherIndexData          = 0x00000800,     ///< Index buffer data.
    CoherQueueAtomic        = 0x00001000,     ///< Destination of a CmdMemoryAtomic() call.
    CoherTimestamp          = 0x00002000,     ///< Destination of a CmdWriteTimestamp() call.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 914
    CoherStreamOut          = 0x00004000,     ///< Data written as stream output.
    CoherMemory             = 0x00008000,     ///< Data read or written directly from/to memory
    CoherSampleRate         = 0x00010000,     ///< CmdBindSampleRateImage() source.
    CoherPresent            = 0x00020000,     ///< Source of present.
    CoherCp                 = 0x00080000,     ///< HW Command Processor (CP) encompassing the front - end command
    CoherAllUsages          = 0x000FFFFF,     ///<  processing of any queue, including SDMA.
#else
    CoherCeLoad             = 0x00004000,     ///< Source of a CmdLoadCeRam() call.
    CoherCeDump             = 0x00008000,     ///< Destination of CmdDumpCeRam() call.
    CoherStreamOut          = 0x00010000,     ///< Data written as stream output.
    CoherMemory             = 0x00020000,     ///< Data read or written directly from/to memory
    CoherSampleRate         = 0x00040000,     ///< CmdBindSampleRateImage() source.
    CoherPresent            = 0x00080000,     ///< Source of present.
    CoherCp                 = 0x00200000,     ///< HW Command Processor (CP) encompassing the front - end command
    CoherAllUsages          = 0x003FFFFF,     ///<  processing of any queue, including SDMA.
#endif

    CoherShader             = CoherShaderRead | CoherShaderWrite,
    CoherCopy               = CoherCopySrc    | CoherCopyDst,
    CoherResolve            = CoherResolveSrc | CoherResolveDst,
};

/// Bitmask values for the flags parameter of ICmdBuffer::CmdClearColorImage().
enum ClearColorImageFlags : uint32
{
    ColorClearAutoSync   = 0x00000001, ///< PAL will automatically insert required CmdBarrier() synchronization before
                                       ///  and after the clear assuming all subresources to be cleared are currently
                                       ///  ready for rendering as a color target (as is required by API convention in
                                       ///  DX12).  Allows reduced sync costs in some situations since PAL knows
                                       ///  the details of how the clear will be performed.
    ColorClearForceSlow  = 0x00000002, ///< Force these to use slow clears.
    ColorClearSkipIfSlow = 0x00000004, ///< Only issue the clear if it is a fast clear.
    ColorClearAllFlags   = 0x00000007  ///< Clients should NOT use it, for internal static_assert purpose only.
};

/// Bitmask values for the flags parameter of ICmdBuffer::CmdClearDepthStencil().
enum ClearDepthStencilFlags : uint32
{
    DsClearAutoSync = 0x00000001,   ///< PAL will automatically insert required CmdBarrier() synchronization before
                                    ///  and after the clear assuming all subresources to be cleared are currently
                                    ///  ready for rendering as a depth/stencil target (as is required by API convention
                                    ///  in DX12).  Allows reduced sync costs in some situations since PAL knows the
                                    ///  details of how the clear will be performed.
    DsClearAllFlags = 0x00000001    ///< Clients should NOT use it, for internal static_assert purpose only.
};

/// Bitmask values for the flags parameter of ICmdBuffer::CmdResolveImage().
enum ResolveImageFlags : uint32
{
    ImageResolveInvertY     = 0x00000001,   ///< PAL will invert the y-axis (flip upside down) of the resolved region to
                                            ///  the destination image.
    ImageResolveDstAsSrgb   = 0x00000002,   ///< If set, a non-srgb destination image will be treated as srgb format.
                                            ///  The flag cannot be set when @ref ImageResolveDstAsNorm is set.
    ImageResolveDstAsNorm   = 0x00000004,   ///< If set, a srgb destination image will be treated as non-srgb format.
                                            ///  The flag cannot be set when @ref ImageResolveDstAsSrgb is set.
    ImageResolveSrcAsNorm   = 0x00000008,   ///< If set, a srgb source image will be treated as non-srgb format.
    ImageResolveAllFlags    = 0x0000000F    ///< Clients should NOT use it, for internal static_assert purpose only.
};

/// Specifies properties for creation of an ICmdBuffer object.  Input structure to IDevice::CreateCmdBuffer().
struct CmdBufferCreateInfo
{
    ICmdAllocator*  pCmdAllocator; ///< The command buffer will use this command allocator to allocate all GPU memory
                                   ///  If the client specifies a null pCmdAllocator, it must call ICmdBuffer::Reset
                                   ///  with a non-null pCmdAllocator before calling ICmdBuffer::Begin.
    QueueType       queueType;     ///< Type of queue commands in this command buffer will target.
                                   ///  This defines the set of allowed actions in the command buffer.
    QueuePriority   queuePriority; ///< Priority level of the queue this command buffer will target.
    EngineType      engineType;    ///< Type of engine the queue commands will run on.

    union
    {
        struct
        {
            /// Indicates that this command buffer will be a "nested" command buffer, instead of a normal, "root"
            /// command buffer.  Nested command buffers differ from root command buffers in how they are sent to the
            /// GPU for execution: root command buffers must be submitted to the hardware by calling
            /// @ref IQueue::Submit, whereas nested command buffers can only be submitted by being executed by a root
            /// command buffer.
            ///
            /// Currently, only Universal and Compute command buffers can be nested. Nesting DMA command buffers is
            /// meaningless and unsupported.  It is an error to attempt to create a nested DMA command buffer.
            ///
            /// @see ICmdBuffer::CmdExecuteNestedCmdBuffers.
            uint32 nested                     :  1;

            /// Dedicated CUs are reserved for this queue. Thus we have to skip CU mask programming.
            uint32 realtimeComputeUnits       :  1;

            /// Target queue uses dispatch tunneling.
            uint32 dispatchTunneling          :  1;

#if PAL_BUILD_GFX12
            /// Indicates that each subsequent Dispatch command is desired to be executed in alternating
            /// order of forward and reverse workgroup walk order. This can improve cache locality when
            /// subsequent Dispatches consume data from the previous Dispatch and the overall footprint
            /// does not fit in cache.
            /// This is a best effort as not all implementations or Queues may support this.
            uint32 dispatchPingPongWalk       :  1;
#else
            uint32 reserved1                  :  1;
#endif

            /// Reserved for future use.
            uint32 reserved                   : 28;
        };

        /// Flags packed as 32-bit uint.
        uint32  u32All;

    }  flags;   ///< Command buffer creation flags.
};

/// Specifies which states will not be bound in a nested command buffer, and instead must be inherited from the calling
/// root-level command buffer.
union InheritedStateFlags
{
    struct
    {
        /// Color and depth target views are inherited from the root-level command buffer. The nested command buffer
        /// should not modify this state.
        uint32 targetViewState :  1;

        /// Occlusion query is inherited from the root-level command buffer. The nested command buffer
        /// should not modify this state.
        uint32 occlusionQuery  :  1;

        /// Predication is inherited from the root-level command buffer. The nested command buffer should not modify
        /// this state.
        uint32  predication    :  1;

        /// Reserved for future usage.
        uint32 reserved        : 29;
    };

    /// Flags packed as 32-bit uint.
    uint32 u32All;
};

/// Specifies parameters inherited from primary command buffer into nested command buffer.
struct InheritedStateParams
{
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 891
    uint32                      colorTargetCount;                            ///< Number of color targets bound in the
                                                                             ///  root-level command buffer.
    SwizzledFormat              colorTargetSwizzledFormats[MaxColorTargets]; ///< Format and swizzle for each color
                                                                             ///  target.
    uint32                      sampleCount[MaxColorTargets];                ///< Sample count for each color target.
#endif
    InheritedStateFlags         stateFlags;                                  ///< States that are inherited from the
                                                                             ///  calling root-level command buffer.
};

/// Specifies optional hints to control command buffer building optimizations.
union CmdBufferBuildFlags
{
    struct
    {
        /// Optimize command buffer building for large sets of draw or dispatch operations that are GPU front-end
        /// limited.  These optimizations include removing redundant PM4 commands and reducing the VGT prim group size.
        /// This flag might increase the CPU overhead of building command buffers.
        uint32 optimizeGpuSmallBatch           :  1;

        /// Optimize command buffer building for exclusive command buffer submission.  Command buffers built with this
        /// flag cannot be submitted if they have already been submitted previously unless the caller guarantees that
        /// they are no longer in use.  This flag allows PAL to modify the contents of command buffers during
        /// submission.
        uint32 optimizeExclusiveSubmit         :  1;

        /// Optimize command buffer building for single command buffer submission.  Command buffers built with this flag
        /// cannot be submitted more than once.  This flag allows PAL to modify the contents of command buffers during
        /// submission.  This flag is a stricter version of optimizeExclusiveSubmit, it is not necessary to set
        /// optimizeExclusiveSubmit if this flag is set.
        uint32 optimizeOneTimeSubmit           :  1;

        /// Indicates that the client is providing custom tessellation distribution settings. If set, it is the clients
        /// responsibility to ensure all 5 (isoline, triangle, quad, donut, trapezoid) factors are provided.
        uint32 optimizeTessDistributionFactors :  1;

        /// Attempt to prefetch shader code into cache before launching draws or dispatches with a freshly bound
        /// pipeline object.  This optimization might increase the CPU overhead of building command buffers and/or
        /// introduce additional front-end GPU bottlenecks.
        uint32 prefetchShaders                 :  1;

        /// Attempt to prefetch the command buffer into cache to avoid bottlenecking the GPU front-end.
        /// This optimization might slightly increase the overhead of some GPU copies and other front-end reads/writes.
        uint32 prefetchCommands                :  1;

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914
        /// Indicates the command buffer will use one or more constant engine commands: CmdLoadCeRam(), CmdDumpCeRam(),
        /// or CmdWriteCeRam()
        uint32 usesCeRamCmds                   :  1;
#else
        uint32 placeholder914                  :  1;
#endif

        /// Indicates that the client would prefer that this nested command buffer not be launched using an IB2 packet.
        /// The calling command buffer will either inline this command buffer into itself or use IB chaining based on if
        /// the optimizeExclusiveSubmit flag is also set. This flag is ignored for root command buffers.
        uint32 disallowNestedLaunchViaIb2      :  1;

        /// placeholder
        uint32 placeholder1                    :  2;

        /// Enable TMZ mode to allow reading TMZ protected allocations. If this command buffer attempts to write
        /// non-TMZ memory, the results are undefined. Only valid for graphics and compute.
        uint32  enableTmz                      :  1;

        uint32 placeholder3                    :  1;

        /// If set, internal operations such as blits, copies, etc. will not affect active Query results.
        /// Otherwise they may affect the results.
        uint32 disableQueryInternalOps         :  1;

        uint32 optimizeContextStatesPerBin     :  1;
        uint32 optimizePersistentStatesPerBin  :  1;

        /// Reserved for future use.
        uint32 reserved                        :  16;
    };

    /// Flags packed as 32-bit uint.
    uint32 u32All;
};

/// Specifies tessellation accum factors.
union TessDistributionFactors
{
    struct
    {
        /// The following 3 factors are used by hardware when distributed tessellation is active: the min tess factors for
        /// each patch processed by a VGT are accumulated. When the sum exceeds this threshold, the next patch is sent to a
        /// different VGT.
        uint32 isoDistributionFactor   : 8;
        uint32 triDistributionFactor   : 8; ///< Recommended to be higher than quad factor.
        uint32 quadDistributionFactor  : 8;
        /// Used by the hardware when distributed tessellation is in DONUT mode: the min tess factor for each patch is
        /// tested against this threshold to determine whether a patch gets split up. If the patch isn't split, it still
        /// increments the accumulator for the Patch distribution factor.
        uint32 donutDistributionFactor : 5;
        /// Used when the distribution mode is TRAPEZOID for quad and tri domain types. The number of donuts in the patch
        /// are compared against this value to detemine whether this donut gets split up into trapezoids (needs the patch to
        /// be in donut mode). A value of 0 or 1 will be treated as 2. The innermost donut is never allowed to be broken
        /// into trapezoids.
        uint32 trapDistributionFactor  : 3;
    };

    /// Values packed as 32-bit uint.
    uint32 u32All;
};

/// Specifies options that direct command buffer building.
struct CmdBufferBuildInfo
{
    /// Command buffer build flags, specifies optional hints to control command buffer build optimizations.
    CmdBufferBuildFlags flags;

    /// Command buffer inherited state and params. If non-null, related state is assumed set in root-level and nested
    /// command buffer should not modify the software states. Any software params that may be needed within nested
    /// command buffer needs to be provided here.
    const InheritedStateParams* pInheritedState;

    /// If non-null, the command buffer will begin with all states set as they are in this previously built command
    /// buffer. Any state specified in pInheritedState is excluded if it is also provided.
    const ICmdBuffer* pStateInheritCmdBuffer;

    /// Optional allocator for PAL to use when allocating temporary memory during command buffer building.  PAL will
    /// stop using this allocator once command building ends.  If no allocator is provided PAL will use an internally
    /// managed allocator instead which may be less efficient.  PAL will use this allocator in two ways:
    /// + Temporary storage within a single command building call.  PAL will rewind the allocator before returning to
    ///   free all memory allocated within the call.
    /// + Temporary storage for the entire command building period.  When Begin() is called, PAL will save the current
    ///   position of the allocator and rewind the allocator to that point when End() is called.  If the client also
    ///   wishes to allocate temporary storage that lasts between command building function calls they must allocate it
    ///   before calling Begin() or PAL will accidentally free it.
    Util::VirtualLinearAllocator* pMemAllocator;

    /// Optional tessellation distribution factors that will overwrite PAL set defaults. Clients must also set the
    /// optimizeTessDistributionFactors flag for these custom factors to take effect.
    TessDistributionFactors clientTessDistributionFactors;

    /// Number of context states per PBB bin.
    /// Client must also set @ref CmdBufferBuildFlags::optimizeContextStatesPerBin for this to take effect.
    uint8 contextStatesPerBin;

    /// Number of persistent states per PBB bin.
    /// Client must also set @ref CmdBufferBuildFlags::optimizePersistentStatesPerBin for this to take effect.
    uint8 persistentStatesPerBin;

    /// Client/app data handle. This can have an arbitrary value and is used to uniquely identify this command buffer.
    uint64 execMarkerClientHandle;
};

/// Specifies info on how a compute shader should use resources.
struct DynamicComputeShaderInfo
{
    float maxWavesPerCu; ///< Limits the number of waves in flight per compute unit.  This can be used to selectively
                         ///  throttle certain workloads that bottleneck multiqueue applications.  For ease of use, a
                         ///  value of zero means no limit is set.  The remaining valid values are in the range (0, 40]
                         ///  and specify the maximum number of waves per compute unit.  If the hardware has one wave
                         ///  limit control for multiple shader stages PAL will select the most strict limit.
                         ///  This option is converted internally to set set HW WavesPerSh setting and the non-integer
                         ///  maxWavesPerCu value provides more flexibility to allow arbitrary WavesPerSh value; for
                         ///  example specify less number of waves than number of CUs per shader array.

    uint32 maxThreadGroupsPerCu; ///< Override the maximum number of threadgroups that a particular CS can run on,
                                 ///  throttling it, to enable more graphics work to complete.  0 disables the limit.

    uint32 tgScheduleCountPerCu; ///< Override the number of threadgroups to schedule on a single compute unit before
                                 ///  moving to the next compute unit. 0 selects optimal default.

    uint32 ldsBytesPerTg; ///< Override the amount of LDS space used per thread-group for this pipeline, in bytes.
                          ///  Zero indicates that the LDS size determined at pipeline-compilation time will be used.
};

/// Specifies info on how a graphics shader should use resources.
struct DynamicGraphicsShaderInfo
{
    float maxWavesPerCu; ///< Limits the number of waves in flight per compute unit.  This can be used to selectively
                         ///  throttle certain workloads that bottleneck multiqueue applications.  For ease of use, a
                         ///  value of zero means no limit is set.  The remaining valid values are in the range (0, 40]
                         ///  and specify the maximum number of waves per compute unit.  If the hardware has one wave
                         ///  limit control for multiple shader stages PAL will select the most strict limit.
                         ///  This option is converted internally to set HW WavesPerSh setting and the non-integer
                         ///  maxWavesPerCu value provides more flexibility to allow arbitrary WavesPerSh value; for
                         ///  example specify less number of waves than number of CUs per shader array.
};

/// Specifies dynamic states of a graphics pipeline
struct DynamicGraphicsState
{
    uint32             colorWriteMask;               ///< Color target write mask. 4b / RT (8 count)
    struct
    {
        uint32         switchWinding              : 1; ///< Whether to reverse vertex ordering for tessellation.
        uint32         depthClipNearEnable        : 1; ///< Enable clipping based on Near Z coordinate.
        uint32         depthClipFarEnable         : 1; ///< Enable clipping based on Far Z coordinate.
        uint32         alphaToCoverageEnable      : 1; ///< Enable alpha to coverage.
        uint32         perpLineEndCapsEnable      : 1; ///< Forces the use of perpendicular line end caps as opposed to
                                                       ///  axis-aligned line end caps during line rasterization.
        uint32         rasterizerDiscardEnable    : 1; ///< Whether to kill all rasterized pixels.
        uint32         dualSourceBlendEnable      : 1; ///< Enable dual source blend
        uint32         vertexBufferCount          : 6; ///< Number vertex buffer slots accessed by this pipeline
        LogicOp        logicOp                    : 4; ///< Logic operation to perform.
        DepthRange     depthRange                 : 1; ///< Specifies Z dimensions of screen space (i.e., post viewport
                                                       ///  transform: 0 to 1 or -1 to 1).
        DepthClampMode depthClampMode             : 2; ///< Depth clamping behavior.
        uint32         reserved1                  : 7; ///< Reserved
        uint32         reserved                   : 5; ///< Reserved for future use.
    };

    union
    {
        struct
        {
            uint32 depthClampMode          :  1;  ///< Whether to enable dynamic state depthClampMode.
            uint32 depthRange              :  1;  ///< Whether to enable dynamic state depthRange.
            uint32 logicOp                 :  1;  ///< Whether to enable dynamic state logicOp.
            uint32 colorWriteMask          :  1;  ///< Whether to enable dynamic state colorWriteMask.
            uint32 switchWinding           :  1;  ///< Whether to enable dynamic state switchWinding.
            uint32 depthClipMode           :  1;  ///< Whether to enable dynamic state depthClipNear/FarEnable.
            uint32 alphaToCoverageEnable   :  1;  ///< Whether to enable dynamic state alphaToCoverageEnable.
            uint32 perpLineEndCapsEnable   :  1;  ///< Whether to enable dynamic state perpLineEndCapsEnable.
            uint32 rasterizerDiscardEnable :  1;  ///< Whether to enable dynamic state rasterizerDiscardEnable.
            uint32 dualSourceBlendEnable   :  1;  ///< Whether to enable dynamic state dualSourceBlendEnable
            uint32 vertexBufferCount       :  1;  ///< Whether to enable dynamic state vertexBufferCount.
            uint32 reserved1               :  1;  ///< Reserved.
            uint32 reserved                : 20;  ///< Reserved for future use.
        };
        uint32     u32All;
    } enable;
};

/// Specifies info on how graphics shaders should use resources.
struct DynamicGraphicsShaderInfos
{
    union
    {
        // VS/HS/DS/GS or TS/MS are active
        struct
        {
            DynamicGraphicsShaderInfo vs;  ///< Dynamic Vertex shader information.
            DynamicGraphicsShaderInfo hs;  ///< Dynamic Hull shader information.
            DynamicGraphicsShaderInfo ds;  ///< Dynamic Domain shader information.
            DynamicGraphicsShaderInfo gs;  ///< Dynamic Geometry shader information.
        };
        struct
        {
            DynamicGraphicsShaderInfo ts;  ///< Dynamic Task shader information.
            DynamicGraphicsShaderInfo ms;  ///< Dynamic Mesh shader information.
        };
    };

    DynamicGraphicsShaderInfo ps;  ///< Dynamic Pixel shader information.

    union
    {
        struct
        {
            uint8 vs       : 1; // If set, there is dynamic VS shader info.
            uint8 hs       : 1; // If set, there is dynamic HS shader info.
            uint8 ds       : 1; // If set, there is dynamic DS shader info.
            uint8 gs       : 1; // If set, there is dynamic GS shader info.
            uint8 ps       : 1; // If set, there is dynamic PS shader info.
            uint8 ts       : 1; // If set, there is dynamic TS shader info.
            uint8 ms       : 1; // If set, there is dynamic MS shader info.
            uint8 reserved : 1; // Reserved.
        };
        uint8 u8All;
    } enable;
};

/// Specifies parameters for binding a pipeline.
/// @see ICmdBuffer::CmdBindPipeline
struct PipelineBindParams
{
    PipelineBindPoint pipelineBindPoint; ///< Specifies which type of pipeline is to be bound (compute or graphics).
    const IPipeline*  pPipeline;         ///< New pipeline to be bound.  Can be null in order to unbind a previously
                                         ///  bound pipeline without binding a new one.
    uint64 apiPsoHash;                   ///< 64-bit identifier provided by client driver based on the Pipeline State
                                         ///  Object. There exists a many-to-one correlation for ApiPsoHash to
                                         ///  internalPipelineHash to map the two.
    union
    {
        DynamicComputeShaderInfo   cs;   ///< Dynamic Compute shader information.

        struct
        {
            DynamicGraphicsShaderInfos gfxShaderInfo;
            DynamicGraphicsState       gfxDynState;
        };
    };
};

/// Specifies per-MRT color target view and current image state.  Used as input to ICmdBuffer::CmdBindTargets().
struct ColorTargetBindInfo
{
    const IColorTargetView* pColorTargetView;  ///< Color target view to bind.
    ImageLayout             imageLayout;       ///< Specifies the current image layout based on bitmasks of currently
                                               ///  allowed operations and engines that may perform those operations.
                                               ///  At minimum, the LayoutColorTarget usage flag and
                                               ///  LayoutUniversalEngine engine flag must be set.
};

/// Specifies depth/stencil view and current image state of the depth and stencil planes.  Used as input to
/// ICmdBuffer::CmdBindTargets().
struct DepthStencilBindInfo
{
    const IDepthStencilView* pDepthStencilView;  ///< Depth/stencil target view to bind.
    ImageLayout              depthLayout;        ///< Specifies the current image layout of the depth plane based on
                                                 ///  bitmasks of currently allowed operations and engines that may
                                                 ///  perform those operations.  At minimum, the
                                                 ///  LayoutDepthStencilTarget usage flag and LayoutUniversalEngine
                                                 ///  engine flag must be set.  Ignored if the specified view does not
                                                 ///  have a depth plane.
    ImageLayout              stencilLayout;      ///< Specifies the current image layout of the stencil plane based on
                                                 ///  bitmasks of currently allowed operations and engines that may
                                                 ///  perform those operations.  At minimum, the
                                                 ///  LayoutDepthStencilTarget usage flag and LayoutUniversalEngine
                                                 ///  engine flag must be set.  Ignored if the specified view does not
                                                 ///  have a stencil plane.
};

/// Represents a GPU memory or image transition as part of a barrier.
///
/// A single transition will ensure cache coherency of dirty data in the specific set of source caches with the
/// specified set of destination caches. The source and destination designation is relative to the barrier itself
/// and does not indicate whether a particular cache is a read or write cache.
///
/// Typically a transition flushes written data from the source caches into the destination caches and thus the source
/// cache mask typically only contains write caches. However, the client is encouraged to include flags for any prior
/// read-only caches accesses as PAL may be able to optimize its cache operations.
///
/// If the both cache masks are zero the client is indicating that no cache coherency operations are required but PAL
/// may still issue cache operations for internal reasons.
///
/// In addition, the client can change an image's layout usage/engine flags which may result in a metadata blt.
///
/// @note There is no range provided to control the range of addresses that will be flushed/invalidated in GPU caches.
struct BarrierTransition
{

    uint32 srcCacheMask; ///< Bitmask of @ref CacheCoherencyUsageFlags describing previous write operations whose
                         ///  results need to be visible for subsequent operations. Flags for prior read operations
                         ///  may be included as well and may be used for internal optimizations.
    uint32 dstCacheMask; ///< Bitmask of @ref CacheCoherencyUsageFlags describing the operations expected to read
                         ///  and/or write data flushed from the caches indicated by the srcCacheMask.

    struct
    {
        const IImage* pImage;      ///< If non-null, indicates this transition only applies to the specified image.
                                   ///  The remaining members of this structure are ignored if this member is null.
        SubresRange   subresRange; ///< Subset of pImage this transition applies to. If newLayout includes @ref
                                   ///  LayoutUninitializedTarget this range must cover all subresources of pImage
                                   ///  unless the perSubresInit image create flag was specified.
        ImageLayout   oldLayout;   ///< Specifies the current image layout based on bitmasks of allowed operations and
                                   ///  engines up to this point.  These masks imply the previous compression state. No
                                   ///  usage flags should ever be set in oldLayout.usages that correspond to usages
                                   ///  that are not supported by the engine that is performing the transition.  The
                                   ///  queue type performing the transition must be set in oldLayout.engines.
        ImageLayout   newLayout;   ///< Specifies the upcoming image layout based on bitmasks of allowed operations and
                                   ///  engines after this point.  These masks imply the upcoming compression state.
                                   ///  point.  This usage mask implies the upcoming compressions state.  A difference
                                   ///  between oldLayoutUsageMask and newLayoutUsageMask may result in a
                                   ///  decompression.

        /// Specifies a custom sample pattern over a 2x2 pixel quad.  The position for each sample is specified on a
        /// grid where the pixel center is <0,0>, the top left corner of the pixel is <-8,-8>, and <7,7> is the maximum
        /// valid position (not quite to the bottom/right border of the pixel).
        /// Specifies a custom sample pattern over a 2x2 pixel quad. Can be left null for non-MSAA images or when
        /// a valid MsaaQuadSamplePattern is bound prior to the CmdBarrier call.
        const MsaaQuadSamplePattern* pQuadSamplePattern;

    } imageInfo; ///< Image-specific transition information.
};

/// Describes a barrier as inserted by a call to ICmdBuffer::CmdBarrier().
///
/// A barrier can be used to 1) stall GPU execution at a specified point to resolve a data hazard, 2) flush/invalidate
/// GPU caches to ensure data coherency, and/or 3) compress/decompress image resources as necessary when changing how
/// the GPU will use the image.
///
/// This structure directly specifies how #1 is performed.  #2 and #3 are managed by the list of @ref BarrierTransition
/// structures passed in pTransitions.
struct BarrierInfo
{
    /// Determine at what point the GPU should stall until all specified waits and transitions have completed.  If the
    /// specified wait point is unavailable, PAL will wait at the closest available earlier point.
    HwPipePoint        waitPoint;

    uint32             pipePointWaitCount;           ///< Number of entries in pPipePoints.
    const HwPipePoint* pPipePoints;                  ///< The barrier will stall until the hardware pipeline has cleared
                                                     ///  up to each point specified in this array.  One entry in this
                                                     ///  array is typically enough, but CS and GFX operate in parallel
                                                     ///  at certain stages.

    uint32             gpuEventWaitCount;            ///< Number of entries in ppGpuEvents.
    const IGpuEvent**  ppGpuEvents;                  ///< The barrier will stall until each GPU event in this array is
                                                     ///  in the set state.

    uint32             rangeCheckedTargetWaitCount;  ///< Number of entries in ppTargets.
    const IImage**     ppTargets;                    ///< The barrier will stall until all previous rendering with any
                                                     ///  color or depth/stencil image in this list bound as a target
                                                     ///  has completed. If one of the targets is a nullptr it will
                                                     ///  perform a full range sync.

    uint32                   transitionCount;        ///< Number of entries in pTransitions.
    const BarrierTransition* pTransitions;           ///< List of image/memory transitions to process.  See
                                                     ///  @ref BarrierTransition. The same subresource should never
                                                     ///  be specified more than once in the list of transitions.
                                                     ///  PAL assumes that all specified subresources are unique.

    uint32  globalSrcCacheMask; ///< This is a global bitmask of @ref CacheCoherencyUsageFlags which is combined
                                ///  (bitwise logical union) with the @ref srcCacheMask field belonging to every
                                ///  element in @ref pTransitions. If this is zero or if there are no transitions,
                                ///  then no global cache flags are applied during every transition.

    uint32  globalDstCacheMask; ///< This is a global bitmask of @ref CacheCoherencyUsageFlags which is combined
                                ///  (bitwise logical union) with the @ref dstCacheMask field belonging to every
                                ///  element in @ref pTransitions. If this is zero or if there are no transitions,
                                ///  then no global cache flags are applied during every transition.

    uint32 reason; ///< The reason that the barrier was invoked.
};

/// Specifies execution dependencies, *availability* and/or *visibility* operations on a section of an IGpuMemory
/// object that does not contain valid IImage data. PAL may assume image data is not present and skip certain
/// cache operations.
///
/// PAL specifies these execution dependencies using pairs of synchronization scope bitmasks of
/// @ref PipelineStageFlag values. The barrier's execution dependencies are only applied to state in this barrier.
/// Memory coherency operations or layout transitions in other barriers will ignore this barrier's execution
/// dependencies.
///
/// PAL specifies these operations using pairs of access scope bitmasks of @ref CacheCoherencyUsageFlags values.
/// The source mask (named srcAccessMask or srcGlobalAccessMask) describes which prior write operations should be made
/// available (i.e., written back from local caches to the LLC). The destination mask (named dstAccessMask or
/// dstGlobalAccessMask) describes which upcoming read/write operations that need visibility (i.e., invalidate
/// corresponding local caches above the LLC). These masks may be zero if no cache operations are needed.
///
/// In general, PAL executes the availability and visibility operations in isolation because the CmdRelease functions
/// require that the destination masks be zero and the CmdAcquire functions require that the source masks be zero.
/// In essence, CmdRelease implements the availability operations and CmdAcquire implements the visibility operations.
/// However, CmdReleaseThenAcquire sees both masks and thus can optimize its cache operations.
///
/// To facilitate cache optimizations, the client is encouraged to add flags corresponding to prior read operations
/// in the relevant source mask(s). Unlike the usual write operation flags, these read flags are entirely optional
/// and do not impact correctness; if they are omitted PAL will simply issue the full set of cache operations.
/// If they are provided PAL may detect cases where future read operations use the same caches as the prior read
/// operations and thus can skip the usual visibility operations.
///
/// Note that,
///   1. If the client does provide read operation flags in a source mask they *must* guarantee that the same flags
///      were provided to a prior barrier's destination mask(s). Incorrect behavior may occur otherwise.
///   2. One @ref MemBarrier or @ImgBarrier object can only be applied to a single resource otherwise PAL's internal
///      optimization may be incorrect. Don't OR multiple resource transitions' stage or access mask into one
///      @ref MemBarrier or @ImgBarrier when making PAL barrier call. However, you are allowed to OR multiple resource
///      transitions' stage or access mask into the global transition mask.
///
/// This struct is used by @ref AcquireReleaseInfo.
struct MemBarrier
{
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914
    union
    {
        struct
        {
            uint32 globallyAvailable :  1; ///< Normally, data made available is in the GPU LLC.  When this bit is
                                           ///  set, available means in memory, available to all clients in the
                                           ///  system.  This is useful for rare cases like mid command buffer
                                           ///  synchronization with the CPU or another external device.
            uint32 reserved          : 31; ///< Reserved for future use.
        };
        uint32 u32All;                     ///< Flags packed as a 32-bit uint.
    } flags;                               ///< Flags controlling the memory barrier.
#endif

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 880
    GpuMemSubAllocInfo memory;             ///< Specifies a portion of an IGpuMemory object this memory barrier affects.
                                           ///  Zero values of memory structure indicate full range barrier operations.
#endif

    uint32             srcStageMask;       ///< Bitmask of PipelineStageFlag values defining the synchronization
                                           ///  scope that must be confirmed complete as part of a release.  Must be
                                           ///  0 when passed in to CmdAcquire or CmdAcquireEvent.
    uint32             dstStageMask;       ///< Bitmask of PipelineStageFlag values defining the synchronization
                                           ///  scope of operations to be performed after the acquire.  Must be
                                           ///  0 when passed in to CmdRelease or CmdReleaseEvent.

    uint32             srcAccessMask;      ///< CacheCoherencyUsageFlags mask which defines the access scope for the
                                           ///  availability operation, as defined in the struct comment header.
                                           ///  This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent.
    uint32             dstAccessMask;      ///< CacheCoherencyUsageFlags mask which defines the access scope for the
                                           ///  visibility operation, as defined in the struct comment header.
                                           ///  This must be 0 when passed to CmdRelease or CmdReleaseEvent.
};

/// Specifies required layout transition, execution dependencies, *availability*, and/or *visibility* operations on a
/// subresource of an IImage object.
///
/// See the header comment on @ref MemBarrier for a full description of the execution dependencies, availability and
/// visibility operations, including what rules the clients must follow when filling out srcAccessMask and
/// dstAccessMask.
///
/// This struct is used by @ref AcquireReleaseInfo.
struct ImgBarrier
{
    const IImage* pImage;        ///< Relevant image resource for this barrier.
    SubresRange   subresRange;   ///< Selects a range of planes/slices/mips the barrier affects.  If newLayout
                                 ///  includes @ref LayoutUninitializedTarget this range must cover all subresources of
                                 ///  pImage unless the perSubresInit image create flag was specified.

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 880
    Box           box;           ///< Restricts the barrier to a sub-section of each subresource.  The Z offset/extent
                                 ///  must be 0 for 1D/2D images, and the Y offset/extent must be 0 for 1D images.  A
                                 ///  box with zero extents will be ignored, and the barrier will affect the entire
                                 ///  subresource range.  This box may be used to restrict ranges of cache flushes or
                                 ///  invalidations, or may restrict what data is decompressed.  However, the
                                 ///  implementation may not be able to optimize particular cases and may expand the
                                 ///  barrier to cover the entire subresource range.  Specifying a subregion with a box
                                 ///  when newLayout includes @ref LayoutUninitializedTarget is not supported.
#endif

    uint32        srcStageMask;  ///< Bitmask of PipelineStageFlag values defining the synchronization
                                 ///  scope that must be confirmed complete as part of a release.  Must be
                                 ///  0 when passed in to CmdAcquire or CmdAcquireEvent.
    uint32        dstStageMask;  ///< Bitmask of PipelineStageFlag values defining the synchronization
                                 ///  scope of operations to be performed after the acquire.  Must be
                                 ///  0 when passed in to CmdRelease or CmdReleaseEvent.

    uint32        srcAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the
                                 ///  availability operation, as defined in the struct comment header.
                                 ///  This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent.
    uint32        dstAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the
                                 ///  visibility operation, as defined in the struct comment header.
                                 ///  This must be 0 when passed to CmdRelease or CmdReleaseEvent.

    ImageLayout   oldLayout;     ///< Specifies the current image layout based on bitmasks of allowed operations and
                                 ///  engines up to this point.  These masks imply the previous compression state. No
                                 ///  usage flags should ever be set in oldLayout.usages that correspond to usages
                                 ///  that are not supported by the engine that is performing the transition.  The
                                 ///  engine type performing the transition must be set in oldLayout.engines. Can set
                                 ///  both oldLayout and newLayout to zero value for no layout transition case.
    ImageLayout   newLayout;     ///< Specifies the upcoming image layout based on bitmasks of allowed operations and
                                 ///  engines after this point.  These masks imply the upcoming compression state.
                                 ///  point.  A difference between oldLayoutUsageMask and newLayoutUsageMask may result
                                 ///  in a decompression.  PAL's implementation will ensure the results of any layout
                                 ///  operations are consistent with the requested availability and visibility
                                 ///  operations. Can set both oldLayout and newLayout to zero value for no layout
                                 ///  transition case.

    /// Specifies a custom sample pattern over a 2x2 pixel quad.  The position for each sample is specified on a grid
    /// where the pixel center is <0,0>, the top left corner of the pixel is <-8,-8>, and <7,7> is the maximum valid
    /// position (not quite to the bottom/right border of the pixel).  Specifies a custom sample pattern over a 2x2
    /// pixel quad. Can be left null for non-MSAA images or when a valid IMsaaState is bound prior to the CmdBarrier
    /// call.
    const MsaaQuadSamplePattern* pQuadSamplePattern;
};

/// Input structure to CmdRelease(), CmdReleaseEvent(), CmdAcquire(), CmdAcquireEvent(), and CmdReleastThenAcquire().
/// It describes the execution dependencies, memory dependencies, and image layout transitions that must be resolved.
///
/// Global transition doesn't have buffer or image info so it will assume the worst case and the barrier operations may
/// not be optimal (e.g. metadata may be misaligned and need issue LLC flush/invalidation). It's suggested that if
/// clients know the buffer or image info, try setting up the barrier call with the full buffer or image transition
/// info (including stageMask and accessMask) instead of global transition for optimal performance.
///
/// Clients may OR multiple MemBarrier into a single MemBarrier on full range barrier cases for simple and saving CPU
/// overhead. To allow more optimization chances (e.g. skip unnecessary stalls for read only transitions) in PAL,
/// it's suggested to split the single grouped MemBarrier into two separate grouped MemBarriers: one is read only
/// MemBarrier and the other is writeable MemBarrier; both are then passed together to the barrier call.
struct AcquireReleaseInfo
{
    uint32               srcGlobalStageMask;  ///< Bitmask of PipelineStageFlag values defining the global
                                              ///  synchronization scope that must be confirmed complete as part of a
                                              ///  release.  Must be 0 when passed in to CmdAcquire or CmdAcquireEvent.
    uint32               dstGlobalStageMask;  ///< Bitmask of PipelineStageFlag values defining the global
                                              ///  synchronization scope of operations to be performed after the
                                              ///  acquire.  Must be 0 when passed in to CmdRelease or CmdReleaseEvent.

    uint32               srcGlobalAccessMask; ///< *Access scope* for the global availability operation.  Serves the
                                              ///  same purpose as srcAccessMask in @ref MemoryBarrier, but will cause
                                              ///  all relevant caches to be flushed without range checking.
                                              ///  This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent.
    uint32               dstGlobalAccessMask; ///< *Access scope* for the global visibility operation.  Serves the
                                              ///  same purpose as dstAccessMask in @ref MemoryBarrier, but will cause
                                              ///  all relevant caches to be invalidated without range checking.
                                              ///  This must be 0 when passed to CmdRelease or CmdReleaseEvent.

    uint32               memoryBarrierCount;  ///< Number of entries in pMemoryBarriers.
    const MemBarrier*    pMemoryBarriers;     ///< Describes memory dependencies specific to a range of a particular
                                              ///  IGpuMemory object.

    uint32               imageBarrierCount;   ///< Number of entries in pImageBarriers.
    const ImgBarrier*    pImageBarriers;      ///  Describes memory dependencies and image layout transitions required
                                              ///  for a subresource range of a particular IImage object.
    uint32 reason;                            ///< The reason that the barrier was invoked.
                                              ///  See @ref Developer::BarrierReason for internal reason codes, though
                                              ///  clients may define their own as well
};

/// Specifies barrier type, global (potentially mixed cases of buffer and image), buffer or image.
enum class BarrierType : uint32
{
    Global,
    Buffer,
    Image
};

/// Number of all HW opaque release token types.
constexpr uint32 NumReleaseTokenTypes = 4;

/// Synchronization token structure for CmdRelease() and CmdAcquire().
///
/// Clients should pass the ReleaseToken returned by CmdRelease() to CmdAcquire() directly without changing the value.
/// If a resource with given subresource range has multiple ReleaseToken, all related ReleaseToken should be passed to
/// CmdAcquire().
///
/// Passing ReleaseToken { .fenceValue = N; .type = T } into CmdAcquire() will wait for all prior releases with
/// .fenceValue <= N for .type == T. Resource with a large number of subresources may introduce lots of ReleaseToken
/// potentially (e.g. released per subresource). No need to track all ReleaseToken for each resource since clients
/// can optimize this based on the fact that release type and fenceValue are exposed for each ReleaseToken: define a
/// ReleaseToken array with size @ref NumReleaseTokenTypes, only track ReleaseToken with the largest fenceValue per
/// each release type; and then passing the tracked array ReleaseToken values to CmdAcquire() is enough.
union ReleaseToken
{
    struct
    {
        uint32 fenceValue : 24; ///< Release fence value per token type.
        uint32 type       : 8;  ///< Release token type (HW opaque). Note that please increase the number of bits if
                                ///  it can't hold all types, see @ref NumReleaseTokenTypes for details.
    };

    uint32 u32All;
};

/// Specifies parameters for a copy from one range of a source GPU memory allocation to a range of the same size in a
/// destination GPU memory allocation.  Used as an input to ICmdBuffer::CmdCopyMemory().
struct MemoryCopyRegion
{
    gpusize srcOffset;  ///< Offset in bytes into the source GPU memory allocation to copy data from.
    gpusize dstOffset;  ///< Offset in bytes into the destination GPU memory allocation to copy data to.
    gpusize copySize;   ///< Amount of data to copy in bytes.
};

/// Specifies parameters for an image copy from one region in a source image subresource to a region of the same size in
/// a destination image subresource.  Used as input to ICmdBuffer::CmdCopyImage().
/// If the region describes a copy between a 2D and a 3D image, extent.depth and numSlices must be equal and may be
/// larger than 1.
struct ImageCopyRegion
{
    SubresId srcSubres;  ///< Selects the source subresource.
    Offset3d srcOffset;  ///< Offset to the start of the chosen region in the source subresource.
    SubresId dstSubres;  ///< Selects the destination subresource.
    Offset3d dstOffset;  ///< Offset to the start of the chosen region in the destination
                         ///  subresource.
    Extent3d extent;     ///< Size of the copy region in pixels.
    uint32   numSlices;  ///< Number of slices the copy will span.
};

/// Specifies parameters for a copy between an image and a GPU memory allocation.  The same structure is used regardless
/// of direction, an input for both ICmdBuffer::CmdCopyImageToMemory() and ICmdBuffer::CmdCopyMemoryToImage().
struct MemoryImageCopyRegion
{
    SubresId imageSubres;         ///< Selects the image subresource.
    Offset3d imageOffset;         ///< Pixel offset to the start of the chosen subresource region.
    Extent3d imageExtent;         ///< Size of the image region in pixels.
    uint32   numSlices;           ///< Number of slices the copy will span.
    gpusize  gpuMemoryOffset;     ///< Offset in bytes to the start of the copy region in the GPU memory allocation.
    gpusize  gpuMemoryRowPitch;   ///< Offset in bytes between the same X position on two consecutive lines.
    gpusize  gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices.
    SwizzledFormat swizzledFormat;///< If not Undefined, reinterpret both subresources using this format and swizzle.
};

/// Specifies parameters for a copy between a PRT and a GPU memory allocation.  The same structure is used regardless
/// of direction, an input for both ICmdBuffer::CmdCopyTiledImageToMemory() and ICmdBuffer::CmdCopyMemoryToTiledImage().
struct MemoryTiledImageCopyRegion
{
    SubresId imageSubres;         ///< Selects the image subresource; must not be a part of the packed mip tail.
    Offset3d imageOffset;         ///< Tile offset to the start of the chosen subresource region.
    Extent3d imageExtent;         ///< Size of the image region in tiles.
    uint32   numSlices;           ///< Number of slices the copy will span.
    gpusize  gpuMemoryOffset;     ///< Offset in bytes to the start of the copy region in the GPU memory allocation.
    gpusize  gpuMemoryRowPitch;   ///< Offset in bytes between the same X position on two consecutive lines.
    gpusize  gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices.
};

/// Used by copy operations to temporarily interpret a range of GPU memory as a "typed buffer".  A typed buffer is
/// essentially a linear image with a caller-defined row pitch and depth pitch.  Typed buffer copies do not require
/// the GPU memory objects to be created with the "typedBuffer" flag.
struct TypedBufferInfo
{
    SwizzledFormat swizzledFormat; ///< The pixels in this buffer have this format.
    gpusize        offset;         ///< Offset in bytes to the start of the copy region in the buffer's GPU memory
                                   ///  allocation.
    gpusize        rowPitch;       ///< Offset in bytes between the same X position on two consecutive lines.
    gpusize        depthPitch;     ///< Offset in bytes between the same X,Y position of two consecutive slices.
};

/// Specifies parameters for a copy from one region of a typed buffer to a region of the same size in a destination
/// typed buffer.  Used as an input to ICmdBuffer::CmdCopyTypedBuffer().
struct TypedBufferCopyRegion
{
    TypedBufferInfo srcBuffer; ///< How to interpret the source GPU memory allocation as a typed buffer.
    TypedBufferInfo dstBuffer; ///< How to interpret the destination GPU memory allocation as a typed buffer.
    Extent3d        extent;    ///< Size of the copy region in pixels.
};

/// Specifies parameters for a scaled copy between an image and a typed buffer.  The same structure is used regardless
/// of direction, an input for ICmdBuffer::CmdScaledCopyTypedBufferToImage().
struct TypedBufferImageScaledCopyRegion
{
    SubresId        imageSubres;    ///< Selects the image subresource.
    Offset2d        imageOffset;    ///< Pixel offset to the start of the chosen subresource region.
    Extent2d        imageExtent;    ///< Size of the image region in pixels.
    TypedBufferInfo bufferInfo;     ///< How to interpret the GPU memory allocation as a typed buffer.
    Extent2d        bufferExtent;   ///< Size of the typed buffer region in pixels.
    SwizzledFormat  swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle.
};

/// Specifies parameters for a scaled image copy from one region in a source image subresource to a region in the
/// destination image subresource.  Used as an input to ICmdBuffer::CmdScaledCopyImage.
struct ImageScaledCopyRegion
{
    SubresId           srcSubres;      ///< Selects the source subresource.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 887
    uint32             srcSlices;      ///< Number of source image slices to read across.
#endif
    union
    {
        Offset3d       srcOffset;      ///< Offset to the start of the chosen region in the source subresource.
        Offset3dFloat  srcOffsetFloat; ///< Alternative representation in floating point.
    };
    union
    {
        SignedExtent3d srcExtent;      ///< Signed size of the source region in pixels.  A negative size indicates
                                       ///  a copy in the reverse direction.
        Extent3dFloat  srcExtentFloat; ///< Alternative representation in floating point.
    };

    SubresId           dstSubres;      ///< Selects the destination subresource.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 887
    uint32             dstSlices;      ///< Number of destination image slices to write.
#endif
    union
    {
        Offset3d       dstOffset;      ///< Offset to the start of the chosen region in the destination subresource.
        Offset3dFloat  dstOffsetFloat; ///< Alternative representation in floating point.
    };
    union
    {
        SignedExtent3d dstExtent;      ///< Signed size of the destination region in pixels.  A negative size
                                       ///  indicates a copy in the reverse direction.
        Extent3dFloat  dstExtentFloat; ///< Alternative representation in floating point.
    };

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 887
    uint32             numSlices;      ///< Number of slices the copy will span.
#endif
    SwizzledFormat     swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle.
                                       ///  The specified format needs to have been included in the "pViewFormats" list
                                       ///  specified at image-creation time, otherwise the result might be incorrect.
};

/// Specifies parameters for a color-space-conversion copy from one region in a source image subresource to a region in
/// a destination image subresource.  Used as an input to ICmdBuffer::CmdColorSpaceConversionCopy.
struct ColorSpaceConversionRegion
{
    Offset2d        srcOffset;      ///< Offset to the start of the chosen region in the source subresource(s).
    SignedExtent2d  srcExtent;      ///< Signed size of the source region in pixels.  A negative size indicates a copy
                                    ///  in the reverse direction.
    Offset2d        dstOffset;      ///< Offset to the start of the chosen region in the destination subresource(s).
    SignedExtent2d  dstExtent;      ///< Signed size of the destination region in pixels.  A negative size indicates a
                                    ///  copy in the reverse direction.
    SubresId        rgbSubres;      ///< Selects the first subresource of the RGB image where the copy will begin.  This
                                    ///  can either be the source or destination of the copy, depending on whether the
                                    ///  copy is performing an RGB->YUV or YUV->RGB conversion.
    uint32          yuvStartSlice;  ///< Array slice of the YUV image where the copy will begin.  All planes of planar
                                    ///  YUV images will be implicitly involved in the copy.  This can either be the
                                    ///  source or destination of the copy, depending on whether the copy is performing
                                    ///  an RGB->YUV or YUV->RGB conversion.
    uint32          sliceCount;     ///< Number of slices the copy will span.
};

/// Specifies the color-space-conversion table used when converting between YUV and RGB Image formats.  Used as an input
/// to ICmdBuffer:CmdColorSpaceConversionCopy.
struct ColorSpaceConversionTable
{
    float table[3][4];  ///< Values forming the conversion table matrix, which has three rows and four columns. For RGB
                        ///  to YUV conversions, the conversion shader uses the following expressions to evaluate the
                        ///  YUV color:
                        ///   Y = dot( [R G B 1], [row #0] )
                        ///   U = dot( [R G B 1], [row #1] )
                        ///   V = dot( [R G B 1], [row #2] )
                        ///  For YUV to RGB conversions, the conversion shader uses the following expressions to
                        ///  evaluate the RGB color:
                        ///   R = dot( [Y U V 1], [row #0] )
                        ///   G = dot( [Y U V 1], [row #1] )
                        ///   B = dot( [Y U V 1], [row #2] )
                        ///  A fourth row is not needed because alpha is copied directly between the RGB and YUV colors.
};

/// Default color-space-conversion table usable by PAL clients when calling ICmdBuffer::CmdColorSpaceConverionCopy
/// to perform a YUV to RGB color space conversion.  Represents the BT.601 standard (standard-definition TV).
extern const ColorSpaceConversionTable DefaultCscTableYuvToRgb;

/// Default color-space-conversion table usable by PAL clients when calling ICmdBuffer::CmdColorSpaceConverionCopy
/// to perform a RGB to YUV color space conversion.  Represents the BT.601 standard (standard-definition TV).
extern const ColorSpaceConversionTable DefaultCscTableRgbToYuv;

/// Specifies flags controlling GPU copy behavior.  Format related flags are ignored by DMA queues.
enum CopyControlFlags : uint32
{
    CopyFormatConversion  = 0x1, ///< Requests that the copy convert between two compatible formats. This is ignored
                                 ///  unless both formats support @ref FormatFeatureFormatConversion.
    CopyRawSwizzle        = 0x2, ///< If possible, raw copies will swizzle from the source channel format into the
                                 ///  destination channel format (e.g., RGBA to BGRA).
    CopyEnableScissorTest = 0x4, ///< If set, do scissor test using the specified scissor rectangle.
    CopyControlAllFlags   = 0x7  ///< Clients should NOT use it, for internal static_assert purpose only.
};

/// Specifies parameters for a resolve of one region in an MSAA source image to a region of the same size in a single
/// sample destination image.  Used as an input to ICmdBuffer::CmdResolveImage().
struct ImageResolveRegion
{
    uint32         srcPlane;        ///< The source color, depth, or stencil plane.
    uint32         srcSlice;        ///< Selects the source starting slice
    Offset3d       srcOffset;       ///< Offset to the start of the chosen region in the source subresource.
    uint32         dstPlane;        ///< The destination color, depth, or stencil plane.
    uint32         dstMipLevel;     ///< Selects destination mip level.
    uint32         dstSlice;        ///< Selects the destination starting slice
    Offset3d       dstOffset;       ///< Offset to the start of the chosen region in the destination subresource.
    Extent3d       extent;          ///< Size of the resolve region in pixels.
    uint32         numSlices;       ///< Number of slices to be resolved
    SwizzledFormat swizzledFormat;  ///< If not Undefined, reinterpret both subresources using this format and swizzle.
                                    ///  The format must match both subresource's native formats.

    const MsaaQuadSamplePattern* pQuadSamplePattern; ///< Specifies sample pattern for MSAA depth image. It must be a
                                                     ///  valid pointer if image was created with sampleLocsAlwaysKnown
                                                     ///  flag set.
};

/// A list of the types of PRT+ resolves that can be performed.
enum class PrtPlusResolveType : uint32
{
    Decode = 0x0, ///< Translate from AMD HW format to format of destination image.
    Encode = 0x1, ///< Translate from source image to AMD HW format
    Count  = 0x2,
};

/// Input structure to the CmdResolvePrtPlusImage function
struct PrtPlusImageResolveRegion
{
    Offset3d  srcOffset;       ///< Offset to the start of the chosen region in the source subresource.
    uint32    srcMipLevel;     ///< Selects source mip level
    uint32    srcSlice;        ///< Selects the source starting slice

    Offset3d  dstOffset;       ///< Offset to the start of the chosen region in the destination subresource.
    uint32    dstMipLevel;     ///< Selects destination mip level.
    uint32    dstSlice;        ///< Selects the destination starting slice

    Extent3d  extent;          ///< Size of the resolve region in pixels.
    uint32    numSlices;       ///< Number of slices to be resolved
};

/// Specifies parameters for a resolve of one region in an MSAA source image to a region of the same size in a single
/// sample destination image.  Used as an input to ICmdBuffer::CmdResolveImage().
enum class ResolveMode : uint32
{
    Average     = 0x0,   ///< Resolve result is an average of all the individual samples
    Minimum     = 0x1,   ///< Resolve result is the minimum value of all individual samples
    Maximum     = 0x2,   ///< Resolve result is the maximum value of all individual samples
    Count       = 0x4,
};

/// Specifies width of immediate data to be written out.
enum class ImmediateDataWidth : uint32
{
    ImmediateData32Bit = 0x0,
    ImmediateData64Bit = 0x1,

    Count              = 0x2,
};

/// Specifies flags controlling GPU query behavior.
union QueryControlFlags
{
    struct
    {
        /// Controls accuracy of query data collection. Available only for occlusion queries.  If set, occlusion query
        /// is guaranteed to return imprecise non-zero value if any samples pass the depth and stencil test.  Using
        /// imprecise occlusion query results could improve rendering performance while an occlusion query is active.
        uint32 impreciseData :  1;
        uint32 reserved      : 31;  ///< Reserved for future use.
    };
    uint32 u32All;                  ///< Flags packed as 32-bit uint.
};

/// Specifies layout of GPU memory used as an input to CmdDrawIndirectMulti.
struct DrawIndirectArgs
{
    uint32 vertexCount;    ///< Number of vertices to draw.
    uint32 instanceCount;  ///< Number of instances to draw.
    uint32 firstVertex;    ///< Starting index value for the draw.  Indices passed to the vertex shader will range from
                           ///  firstVertex to firstVertex + vertexCount - 1.
    uint32 firstInstance;  ///< Starting instance for the draw.  Instace IDs passed to the vertex shader will range from
                           ///  firstInstance to firstInstance + instanceCount - 1.
};

/// Specifies layout of GPU memory used as an input to CmdDrawIndexedIndirectMulti.
///
/// Indices passed to the vertex shader will be:
///
/// + IndexBuffer[firstIndex] + vertexOffset
/// + IndexBuffer[firstIndex + 1] + vertexOffset,
/// + ...
/// + IndexBuffer[firstIndex + indexCount - 1] + vertexOffset
struct DrawIndexedIndirectArgs
{
    uint32 indexCount;     ///< Number of vertices to draw.
    uint32 instanceCount;  ///< Number of instances to draw.
    uint32 firstIndex;     ///< Starting index buffer slot for the draw.
    int32  vertexOffset;   ///< Offset added to the index fetched from the index buffer before it is passed to the
                           ///  vertex shader.
    uint32 firstInstance;  ///< Starting instance for the draw.  Instace IDs passed to the vertex shader will range from
                           ///  firstInstance to firstInstance + instanceCount - 1.
};

/// Specifies layout of GPU memory used as an input to CmdDispatchIndirect.
struct DispatchIndirectArgs
{
    uint32 x;  ///< Threadgroups to dispatch in the X dimension.
    uint32 y;  ///< Threadgroups to dispatch in the Y dimension.
    uint32 z;  ///< Threadgroups to dispatch in the Z dimension.
};

/// Specifies layout of GPU memory used as an input to CmdDispatchMeshIndirect.
using DispatchMeshIndirectArgs = DispatchIndirectArgs;

/// Specifies the GPU virtual address of an array and the stride in bytes between array elements.
struct CpuVirtAddrAndStride
{
    const void*  pCpuVirtAddr;  ///< CPU virtual address of the 0th array element.
    struct
    {
        uint64  stride : 32;    ///< Distance between array elements in bytes.
        uint64  _pad   : 32;    ///< Padding for structure alignment.
    };
};

/// Specifies the GPU virtual address of an array and the stride in bytes between array elements.
struct GpuVirtAddrAndStride
{
    gpusize  gpuVirtAddr;       ///< GPU virtual address of the 0th array element.
    struct
    {
        uint64  stride : 32;    ///< Distance between array elements in bytes.
        uint64  _pad   : 32;    ///< Padding for structure alignment.
    };
};

/// Flags to describe a dispatch
union DispatchInfoFlags
{
    struct
    {
        uint32 devDriverOverlay      : 1;   ///< Flag indicates this dispatch draws the DevDriver overlay
        uint32 reserved              : 31;  ///< Reserved for future use.
    };
    uint32 u32All;                  ///< Flags packed as 32-bit uint.
};

/// Specifies the different stages at which a combiner can choose between different shading rates.
enum class VrsCombinerStage : uint32
{
    ProvokingVertex, ///< Chooses between the shading rate specified by the VrsRateParams struct and the shader
                     ///  rate provided by the provoking vertex.
    Primitive,       ///< Chooses between previous combiner stage and the shader rate associated with the primitive
    Image,           ///< Chooses between previous combiner stage and the shader rate associated with an image
    PsIterSamples,   ///< Chooses between previous combiner stage and the PS_ITER_SAMPLES rate.
    Max
};

/// Specifies the different possible shading rates.  Not all are supported on all HW; see the supportedVrsRates
/// entry in the gfxipProperties structure.
enum class VrsShadingRate : uint32
{
    _16xSsaa  = 0x0,
    _8xSsaa   = 0x1,
    _4xSsaa   = 0x2,
    _2xSsaa   = 0x3,
    _1x1      = 0x4,
    _1x2      = 0x5,
    _2x1      = 0x6,
    _2x2      = 0x7,
    Count
};

/// Indices into the centerOffset array member of the VrsCenterState structure.
enum class VrsCenterRates : uint32
{
    _1x1 = 0x0,
    _1x2 = 0x1,
    _2x1 = 0x2,
    _2x2 = 0x3,
    Max  = 0x4,
};

/// Specifies the different ways in which a combiner can choose between two different shading rate inputs.
enum class VrsCombiner : uint32
{
    Passthrough = 0, ///< Keep previous shading rate.
    Override    = 1, ///< C.xy = B.xy
    Min         = 2, ///< min(A.xy, B.xy)
    Max         = 3, ///< max(A.xy, B.xy)
    Sum         = 4, ///< min(maxRate, A.xy + B.xy)
    Count
};

/// Structure for defining paramters to the CmdSetPerDrawVrsRate function.
struct VrsRateParams
{
    /// The shading rate to be bound to the render state.
    VrsShadingRate  shadingRate;

    /// The state of all the combiners.
    VrsCombiner     combinerState[static_cast<uint32>(VrsCombinerStage::Max)];

    union
    {
        struct
        {
            uint32  exposeVrsPixelsMask :  1; ///< Controls how the shader input mask of a coarse pixel is generated.
                                              ///   0 : Bitwise OR of all fine pixel`s mask
                                              ///   1 : Pack fine pixels` coverage mask into iMask.  Layout based
                                              ///       on VRS rate
            uint32  reserved            : 31;
        };

        uint32  u32All;   ///< Flags packed as 32-bit uint.
    } flags;              ///< Flags controlling VRS rate parameters
};

/// Structure for defininig paramters to the CmdSetVrsCenterState function.
struct VrsCenterState
{
    /// The offset is scaled by the coarse pixel size and then added to the center location
    /// Center offsets are specified as two 4 bits signed integer value representing a location on a 16x16 grid gd.
    /// The offset is scaled by the coarse pixel size and then added to the center location
    /// 1x1, 1x2, 2x1 and 2x2 shading rates can all have their own unique offsets
    Offset2d  centerOffset[static_cast<uint32>(VrsCenterRates::Max)];

    union
    {
        struct
        {
            uint32  overrideCenterSsaa    :  1; ///< Override center interpolants to be evaluated at the sample
                                                ///  position.
            uint32  overrideCentroidSsaa  :  1; ///< Override centroid interpolants to be evaluated at the centroid
                                                ///  of each sample group being iterated (simply the sample position
                                                ///  in the typical case of 1-sample groups).
            uint32  alwaysComputeCentroid :  1; ///< Don't assume the centroid of a fully covered shading region is
                                                ///  the center.  It is possible all samples could be lit but the
                                                ///  center is not lit for certain combinations of centerOffset[]
                                                ///  values and programmable sample positions
            uint32  reserved              : 29; ///< Reserved for future HW
        };

        uint32  u32All;   ///< Flags packed as 32-bit uint.
    } flags;              ///< Flags controlling VRS center state
};

/// @internal
/// Function pointer type definition for setting pipeline-accessible user data entries to the specified values. Each
/// command buffer object has one such callback per pipeline bind point, so the bind point is implicit.
///
/// @see ICmdBuffer::CmdSetUserData().
typedef void (PAL_STDCALL *CmdSetUserDataFunc)(
    ICmdBuffer*   pCmdBuffer,
    uint32        firstEntry,
    uint32        entryCount,
    const uint32* pEntryValues);

/// @internal Function pointer type definition for issuing non-indexed draws.
///
/// @see ICmdBuffer::CmdDraw().
typedef void (PAL_STDCALL *CmdDrawFunc)(
    ICmdBuffer* pCmdBuffer,
    uint32      firstVertex,
    uint32      vertexCount,
    uint32      firstInstance,
    uint32      instanceCount,
    uint32      drawId);

/// @internal Function pointer type definition for issuing draws auto.
///
/// @see ICmdBuffer::CmdDrawOpaque().
typedef void (PAL_STDCALL *CmdDrawOpaqueFunc)(
    ICmdBuffer* pCmdBuffer,
    gpusize     streamOutFilledSizeVa,
    uint32      streamOutOffset,
    uint32      stride,
    uint32      firstInstance,
    uint32      instanceCount);

/// @internal Function pointer type definition for issuing indexed draws.
///
/// @see ICmdBuffer::CmdDrawIndexed().
typedef void (PAL_STDCALL *CmdDrawIndexedFunc)(
    ICmdBuffer* pCmdBuffer,
    uint32      firstIndex,
    uint32      indexCount,
    int32       vertexOffset,
    uint32      firstInstance,
    uint32      instanceCount,
    uint32      drawId);

/// @internal Function pointer type definition for issuing indirect draws.
///
/// @see ICmdBuffer::CmdDrawIndirectMulti().
typedef void (PAL_STDCALL *CmdDrawIndirectMultiFunc)(
    ICmdBuffer*           pCmdBuffer,
    GpuVirtAddrAndStride  gpuVirtAddrAndStride,
    uint32                maximumCount,
    gpusize               countGpuAddr);

/// @internal Function pointer type definition for issuing indexed, indirect draws.
///
/// @see ICmdBuffer::CmdDrawIndexedIndirectMulti().
typedef void (PAL_STDCALL *CmdDrawIndexedIndirectMultiFunc)(
    ICmdBuffer*           pCmdBuffer,
    GpuVirtAddrAndStride  gpuVirtAddrAndStride,
    uint32                maximumCount,
    gpusize               countGpuAddr);

/// @internal Function pointer type definition for issuing direct dispatches.
///
/// @see ICmdBuffer::CmdDispatch().
typedef void (PAL_STDCALL *CmdDispatchFunc)(
    ICmdBuffer*       pCmdBuffer,
    DispatchDims      size,
    DispatchInfoFlags infoFlags);

/// @internal Function pointer type definition for issuing indirect dispatches.
///
/// @see ICmdBuffer::CmdDispatchIndirect().
typedef void (PAL_STDCALL *CmdDispatchIndirectFunc)(
    ICmdBuffer* pCmdBuffer,
    gpusize     gpuVirtAddr);
/// @internal Function pointer type definition for issuing direct dispatches with threadgroup offsets.
///
/// @see ICmdBuffer::CmdDispatchOffset().
typedef void (PAL_STDCALL *CmdDispatchOffsetFunc)(
    ICmdBuffer*  pCmdBuffer,
    DispatchDims offset,
    DispatchDims launchSize,
    DispatchDims logicalSize);

/// @internal Function pointer type definition for issuing direct mesh dispatches.
///
/// @see ICmdBuffer::CmdDispatchMesh().
typedef void (PAL_STDCALL *CmdDispatchMeshFunc)(
    ICmdBuffer*  pCmdBuffer,
    DispatchDims size);

/// @internal Function pointer type definition for issuing indirect mesh dispatches.
///
/// @see ICmdBuffer::CmdDispatchMeshIndirectMulti().
typedef void (PAL_STDCALL *CmdDispatchMeshIndirectMultiFunc)(
    ICmdBuffer*          pCmdBuffer,
    GpuVirtAddrAndStride gpuVirtAddrAndStride,
    uint32               maximumCount,
    gpusize              countGpuAddr);

/// Specifies input assembler state for draws.
/// @see ICmdBuffer::CmdSetInputAssemblyState
struct InputAssemblyStateParams
{
    PrimitiveTopology topology;                     ///< Defines how vertices should be interpretted and rendered by
                                                    ///  the graphics pipeline.
    uint8             patchControlPoints;           ///< # of control points per patch. [0-32] valid. Should be set to
                                                    ///  0 by clients if topology is not PrimitiveTopology::Patch.
    bool              primitiveRestartEnable;       ///< Enables the index specified by primitiveRestartIndex to _cut_
                                                    ///  a primitive (i.e., triangle strip) and begin a new primitive
                                                    ///  with the next index.
    bool              primitiveRestartMatchAllBits; ///< Specifies which bits from primitiveRestartIndex to use.
                                                    ///  false - only check relevant bits based on index type
                                                    ///  true  - check all 32 bits irrespective of index type
    uint32            primitiveRestartIndex;        ///< When primitiveRestartEnable is true, this is the index value
                                                    ///  that will restart a primitive.  When using a 16-bit index
                                                    ///  buffer, the upper 16 bits of this value will be ignored.
};

/// Specifies parameters for controlling triangle rasterization.
/// @see ICmdBuffer::CmdSetTriangleRasterState
struct TriangleRasterStateParams
{
    struct
    {
        FillMode        frontFillMode   : 2; ///< Whether front-facing triangles should be rendered solid or wireframe.
        FillMode        backFillMode    : 2; ///< Whether back-facing triangles should be rendered solid or wireframe.
        CullMode        cullMode        : 2; ///< Specifies which, if any, triangles should be culled based on whether
                                             ///  they are front or back facing.
        FaceOrientation frontFace       : 1; ///< Specifies the vertex winding that results in a front-facing triangle.
        ProvokingVertex provokingVertex : 1; ///< Specifies whether the first or last vertex of a primitive is the
    };                                       ///  provoking vertex as it affects flat shading.
    union
    {
        struct
        {
            uint8 frontDepthBiasEnable : 1; ///< Enable depth bias (i.e. polygon offset) for front-facing
                                            ///  triangle-based primitives
            uint8 backDepthBiasEnable  : 1; ///< Enable depth bias (i.e. polygon offset) for back-facing
                                            ///  triangle-based primitives
            uint8 reserved             : 6; ///< Reserved for future use.
        };
        uint8 u8All;                        ///< Flags packed as 8-bit uint.
    } flags;                                ///< Triangle raster state flags.
};

/// Specifies parameters for controlling point and line rasterization.
/// @see ICmdBuffer::CmdSetPointLineRasterState
struct PointLineRasterStateParams
{
    float pointSize;    ///< Width of a point primitive in pixels.
    float lineWidth;    ///< Width of a line primitive in pixels.
    float pointSizeMin; ///< Minimum width of a point primitive in pixels.
    float pointSizeMax; ///< Maximum width of a point primitive in pixels.
};

/// Specifies parameters for controlling line stippling.
/// @see ICmdBuffer::CmdSetLineStippleState
struct LineStippleStateParams
{
    uint16 lineStippleValue; ///< Line stipple bit pattern.
    uint32 lineStippleScale; ///< Line stipple repeat factor.
};

/// Specifies paramters for setting up depth bias. Depth Bias is used to ensure a primitive can properly be displayed
/// (without Z fighting) in front (or behind) of the previously rendered co-planar primitive.  This is useful for decal
/// or shadow rendering.
/// @see ICmdBuffer::CmdSetDepthBiasState
struct DepthBiasParams
{
    float depthBias;            ///< Base depth bias to be added to each fragment's Z value.  In units of the
                                ///  minimum delta representable in the bound depth buffer.
    float depthBiasClamp;       ///< Maximum allowed depth bias result.  Prevents polygons viewed at a sharp value
                                ///  from generating very large biases.
    float slopeScaledDepthBias; ///< Factor multiplied by the depth slope (change in Z coord per x/y pixel) to
                                ///  create more bias for "steep" polygons.  This result is applied to the final
                                ///  Z value in addition to the base depthBias parameter.
};

/// Specifies parameters for setting the value range to be used for depth bounds testing.
/// @see ICmdBuffer::CmdSetDepthBounds
struct DepthBoundsParams
{
    float min; ///< Minimum depth value in passing range (closest).
    float max; ///< Maximum depth value in passing range (farthest).
};

/// Specifies parameters for setting bit-masks applied to stencil buffer reads and writes.
/// @see ICmdBuffer::CmdSetStencilRefMasks
struct StencilRefMaskParams
{

    uint8 frontRef;        ///< Stencil reference value for front-facing polygons.
    uint8 frontReadMask;   ///< Bitmask to restrict stencil buffer reads for front-facing polygons.
    uint8 frontWriteMask;  ///< Bitmask to restrict stencil buffer writes for front-facing polygons.
    uint8 frontOpValue;    ///< Stencil operation value for front-facing polygons.
                           ///  This is the value used as a parameter for a given stencil operation.
                           ///  For example: StencilOp::IncWrap will use this value when incrementing the current
                           ///  stencil contents. Typically, this would be set to one, but on AMD hardware,
                           ///  this register is 8 bits so there is a greater flexibility.

    uint8 backRef;         ///< Stencil reference value for back-facing polygons.
    uint8 backReadMask;    ///< Bitmask to restrict stencil buffer reads for back-facing polygons.
    uint8 backWriteMask;   ///< Bitmask to restrict stencil buffer writes for back-facing polygons.
    uint8 backOpValue;     ///< Stencil operation value for back-facing polygons - See description of frontOpValue
                           ///  for further details.
    union
    {
        uint8 u8All;                         ///< Flags packed as a 8-bit uint.
        struct
        {
            uint8 updateFrontRef        : 1; ///< Updating reference value for front-facing polygons.
            uint8 updateFrontReadMask   : 1; ///< Updating read mask value for front-facing polygons.
            uint8 updateFrontWriteMask  : 1; ///< Updating write mask value for front-facing polygons.
            uint8 updateFrontOpValue    : 1; ///< Updating stencil op value for front-facing polygons.
            uint8 updateBackRef         : 1; ///< Updating reference value for back-facing polygons.
            uint8 updateBackReadMask    : 1; ///< Updating read mask value for back-facing polygons.
            uint8 updateBackWriteMask   : 1; ///< Updating write mask value for back-facing polygons.
            uint8 updateBackOpValue     : 1; ///< Updating stencil op value for back-facing polygons.
        };
    } flags;               ///< Flags to indicate which of the stencil state values are being updated.
};

/// HiS always exposes two pretests.
constexpr uint32 NumHiSPretests = 2;

/// Hierarchical stencil (HiS) allows work to be discarded by the stencil test at tile rate in certain cases.
/// In order to use HiS, the client will define a set of pretests that will be performed whenever a particular stencil
/// buffer is written.  The stencil image will track the results of the pretest for each 8x8 tile, keeping a record of
/// whether any pixel in the tile "may-pass" or "may-fail" the specified pretest.  When stencil testing is enabled,
/// the hardware may be able to discard whole tiles early based on what it can glean from the HiS pretest states.
///
/// Each stencil image has two pretest slots per mip level.  Pretest slots are reset when an initialization barrier
/// targets their mip level on the stencil plane.  The client can then pass this struct to @ref CmdUpdateHiSPretests
/// to bind one or more valid pretests.  It is legal to bind a pretest over a reset slot at any point.
///
/// @warning Except in special cases, it is illegal to bind a pretest on top of an existing pretest.
///
/// It is only legal to bind a new pretest on top of an existing pretest if:
/// 1. All array slices within the given mip have been reset using an initialization barrier.
/// 2. The client guarantees that they will rewrite all stencil values in all array slices within the given mip
///    before the next draw with stencil testing enabled by doing either:
///      a. One or more calls to @ref CmdClearDepthStencil.
///      b. One or more draws with the stencil test disabled and stencil writes enabled.
///
/// Once pretests are selected via @ref CmdUpdateHiSPretests the client should keep track of which tests were enabled
/// on each stencil image and provide them to every call to @ref CmdClearDepthStencil.  This is optional but PAL will
/// not be able to generate HiS optimized clears unless it is given the current pretests.
///
/// @warning The pretests provided to @ref CmdUpdateHiSPretests are applied to all mips of all subresource ranges.
///          If the client varies pretests between mips they must guarantee that the given pretests were bound to all
///          mips in the given subresource ranges.
///
/// This feature works best if the future stencil test behavior is known, either directly told via an API extension
/// or via an app profile in the client layer. For example, if the application 1) clears stencil, 2) does a pass to
/// write stencil, 3) then does a final pass that masks rendering based on the stencil value being > 0, ideally we
/// would choose a pretest of func=Greater, mask=0xFF, and value=0 so that #2 would update the stencil image with
/// per-tile data that lets #3 be accelerated at maximum effeciency.
///
/// In absence of app-specific knowledge, the following algorithm may be a good generic approach:
/// 1. When the stencil image is cleared, set pretest #0 to func=Equal, mask=0xFF, and value set to the clear value.
/// 2. On the first draw with stencil writes enabled, set pretest #1 with the mask set to the app's current stencil
///    mask, and
///      a. If the stencil op is INC or DEC, set func=GreaterEqual and value the same as in #1.
///      b. If the stencil op is REPLACE, set func=Equal and set value to the app's current stencil ref value.
///
/// Note that HiS can only be beneficial for GPU performance so clients that do not want to implement app profiles or
/// generic heuristics should at least hard-code both tests to something simple.
struct HiSPretests
{
    struct
    {
        CompareFunc func;    ///< This function is used to compare the pretest value with the image's stencil value.
                             ///  The expression is evaluated with the pretest value as the left-hand operand and the
                             ///  image's stencil value as the right-hand operand.
        uint8       mask;    ///< This value is ANDed with both stencil values before evaluating the comparison.
        uint8       value;   ///< The pretest value, used as the left-hand operand in the comparison.
        bool        isValid; ///< True if this pretest contains valid information.  Set to false to skip this test.
    } test[NumHiSPretests];  ///< The set of pretest slots.
};

/// Specifies coordinates for setting up single user clip plane.
/// @see ICmdBuffer::CmdSetUserClipPlanes
struct UserClipPlane
{
    float x; ///< Plane coordinate x
    float y; ///< Plane coordinate y
    float z; ///< Plane coordinate z
    float w; ///< Plane coordinate w
};

/// Specifies parameters for setting the constant factor to be used by the blend hardware when programmed with the
/// Blend::ConstantColor, Blend::OneMinusConstantColor, Blend::ConstantAlpha, or Blend::OneMinusConstantAlpha blend
/// coefficients.
/// @see ICmdBuffer::CmdSetBlendConst
struct BlendConstParams
{
    float blendConst[4];  ///< 4-component RGBA float specifying the new blend constant.
};

/// Specifies the parameters for a single viewport
struct Viewport
{
    float       originX;   ///< X coordinate for the viewport's origin.
    float       originY;   ///< Y coordinate for the viewport's origin.
    float       width;     ///< Width of the viewport.
    float       height;    ///< Height of the viewport.
    float       minDepth;  ///< Minimum depth value of the viewport.  Must be in the [0..1] range.
    float       maxDepth;  ///< Maximum depth value of the viewport.  Must be in the [0..1] range.
    PointOrigin origin;    ///< Origin of the viewport relative to NDC. UpperLeft or LowerLeft.
};

/// Specifies the viewport transform parameters for setting a single viewport.
/// @see ICmdBuffer::CmdSetViewport
struct ViewportParams
{
    uint32     count;                   ///< Number of viewports.
    float      horzDiscardRatio;        ///< The ratio between guardband discard rect width and viewport width.
                                        ///  For all guard band ratio settings, values less than 1.0f are illegal.
                                        ///  Value FLT_MAX opens the guardband as wide as the HW supports.
                                        ///  Value 1.0f disables the guardband.
    float      vertDiscardRatio;        ///< The ratio between guardband discard rect height and viewport height.
    float      horzClipRatio;           ///< The ratio between guardband clip rect width and viewport width.
    float      vertClipRatio;           ///< The ratio between guardband clip rect height and viewport height.
    DepthRange depthRange;              ///< Specifies the target range of Z values
    // Define viewports array at the end of the structure as it is common to only access the first N from the CPU.
    Viewport   viewports[MaxViewports]; ///< Array of desciptors for each viewport.
};

/// Specifies the parameters for specifing the scissor rectangle.
struct ScissorRectParams
{
    uint32 count;                   ///< Number of scissor rectangles.
    Rect   scissors[MaxViewports];  ///< Array of scissor regions corresponding to each viewport.
};

/// Specifies parameters for setting the global scissor rectangle.
/// @see ICmdBuffer::CmdSetGlobalScissor
struct GlobalScissorParams
{
    Rect scissorRegion; ///< Rectangle of the global scissor window.
};

/// Specifies parameters for binding the color targets and depth target.
/// @see ICmdBuffer::CmdBindTargets
struct BindTargetParams
{
    uint32                      colorTargetCount;               ///< Number of color targets to bind.
    ColorTargetBindInfo         colorTargets[MaxColorTargets];  ///< Array of color target descriptors.
    DepthStencilBindInfo        depthTarget;                    ///< Describes the depth target bind info.
};

/// Specifies parameters for binding the stream-output targets.
/// @see ICmdBuffer::CmdBindStreamOutTargets
struct BindStreamOutTargetParams
{
    struct
    {
        gpusize  gpuVirtAddr;       ///< GPU virtual address of this stream-output target.  Must be DWORD-aligned.  If
                                    ///  this is zero, 'size' is ignored and the target is considered un-bound.
        gpusize  size;              ///< Size of this stream-output target, in bytes.  Must be DWORD-aligned.
    } target[MaxStreamOutTargets];  ///< Describes the stream-output target for each buffer slot.
};

/// Specifies the different types of predication ops available.
enum class PredicateType : uint32
{
    Zpass     = 1, ///< Enable occlusion predicate
    PrimCount = 2, ///< Enable streamout predicate
    Boolean64 = 3, ///< CP PFP treats memory as a 64bit integer which is either false (0) or true, DX12 style.
    Boolean32 = 4, ///< CP PFP treats memory as a 32bit integer which is either false (0) or true, Vulkan style.
    Count
};

/// Bitfield structure used to specify masks for functions that operate on depth and/or stencil planes of an image.
union DepthStencilSelectFlags
{
    struct
    {
        /// Select Depth.
        uint32 depth : 1;

        /// Select Stencil.
        uint32 stencil : 1;

        /// Reserved for future usage.
        uint32 reserved : 30;
    };

    /// Flags packed as 32-bit uint.
    uint32 u32All;
};

/// Specifies information related to clearing a bound color target.  Input structure to CmdClearBoundColorTargets().
struct BoundColorTarget
{
    uint32         targetIndex;    ///< Render target index where the target image is currently bound.
    SwizzledFormat swizzledFormat; ///< Format and swizzle of the target image.
    uint32         samples;        ///< Sample count for the target.
    uint32         fragments;      ///< Fragment count for the target.
    ClearColor     clearValue;     ///< clear color value.
};

/// Specifies clear region to clear a bound target. Input structure to CmdClearBoundColorTargets() and
/// CmdClearBoundDepthStencilTargets()
struct ClearBoundTargetRegion
{
    Rect    rect;                       ///< The 2D region to clear.
    uint32  startSlice;                 ///< The starting slice to clear.
    uint32  numSlices;                  ///< The number of slices to clear.
};

/// Specifies flags controlling CmdSaveComputeState and CmdRestoreComputeState.  PAL clients must be aware that saving
/// and restoring specific state in a nested command buffer may not be supported.  The rule is simple: if the client
/// requires that the caller leak the given state to the callee, PAL will not support saving and restoring that state.
enum ComputeStateFlags : uint32
{
    ComputeStatePipelineAndUserData = 0x1, ///< Selects the bound compute pipeline, all non-indirect user data, and all
                                           ///  kernel arguments (if applicable). Note that the current user data will
                                           ///  be invalidated on CmdSaveComputeState.
    ComputeStateBorderColorPalette  = 0x2, ///< Selects the bound border color pallete that affects compute pipelines.
    ComputeStateAll                 = 0x3, ///< Selects all state
};

/// Provides dynamic command buffer flags during submission
/// The following flags are used for Frame Pacing when delay time is configured to be caculated by KMD.
/// (Currently DX clients require this).
/// For clients that do not need Frame Pacing with KMD caculated delay time, they can ignore these flags:
///
/// - frameBegin and frameEnd : Client's presenting queue should track its present state,
///   and set frameBegin flag on the first command buffer after present,
///   set frameEnd flag on the the last command buffer before present. (Could be the Present command buffer itself.)
///   We don't need to set them on queues other than the presenting queue.
/// - P2PCmd : Mark a P2P copy command. KMD could use this flag for adjustments for its frame time calculation.
///   For the current frame time algorithm, clients should only set this flag on SW compositing copy command.
///   But KMD may adjust their algorithm, and clients should update the flag depending on KMD needs.
///
/// The following flags are used for Direct Capture.
///
/// - captureBegin and captureEnd : Direct capture info should be filled if any of these is set. And captureEnd flag
///   also notifies KMD that the on-screen primary is safe to release.
struct CmdBufInfo
{
    union
    {
        struct
        {
            uint32 isValid                 : 1;  ///< Indicate if this CmdBufInfo is valid and should be submitted
            uint32 frameBegin              : 1;  ///< First command buffer after Queue creation or Present.
            uint32 dfSpmTraceBegin         : 1;  ///< This command buffer begins a DF SPM trace.
            uint32 dfSpmTraceEnd           : 1;  ///< This command buffer ends a DF SPM trace.
            uint32 frameEnd                : 1;  ///< Last command buffer before Present.
            uint32 p2pCmd                  : 1;  ///< Is P2P copy command. See CmdBufInfo comments for details.
            uint32 captureBegin            : 1;  ///< This command buffer begins a Direct Capture frame capture.
            uint32 captureEnd              : 1;  ///< This command buffer ends a Direct Capture frame capture.
            uint32 rayTracingExecuted      : 1;  ///< This command buffer contains ray tracing work.
            uint32 preflip                 : 1;  ///< This command buffer has pre-flip access to DirectCapture resource
            uint32 postflip                : 1;  ///< This command buffer has post-flip access to DirectCapture resource
            uint32 privateFlip             : 1;  ///< Need to flip to a private primary surface for DirectCapture feature
            uint32 vpBltExecuted           : 1;  ///< This command buffer comtains VP Blt work.
            uint32 disableDccRejected      : 1;  ///< Reject KMD's DisableDcc request to avoid writing to front buffer.
            uint32 noFlip                  : 1;  ///< No flip when DirectCapture access submission completes
            uint32 frameGenIndex           : 4;  ///< Index of the DirectCapture feature generated frames
            uint32 noRenderPresent         : 1;  ///< Last command buffer before present which is no render present or not
            uint32 motionVectorPropChanged : 1;  ///< Indicates whether motion vector properties changed
            uint32 depthPropChanged        : 1;  ///< Indicates whether depth properties changed
            uint32 cameraPropChanged       : 1;  ///< Indicates whether camera matrix properties changed
            uint32 capturePrimary          : 1;  ///< Has Direct Capture primary surface capture
            uint32 captureMotionVector     : 1;  ///< Has Direct Capture motion vector capture
            uint32 captureDepth            : 1;  ///< Has Direct Capture depth capture
            uint32 captureCamera           : 1;  ///< Has Direct Capture camera matrix capture
            uint32 reserved                : 5;  ///< Reserved for future usage.
        };
        uint32 u32All;                           ///< Flags packed as uint32.
    };

    const IGpuMemory*  pPrimaryMemory;     ///< The primary's gpu memory object used for passing its allocation handle
                                           ///  to KMD for pre-flip primary access (PFPA). If frame metadata flags
                                           ///  specifies that primaryHandle should be sent, clients should set this to
                                           ///  current frame pending primary's IGpuMemory object on the creating GPU
                                           ///  for the frameEnd command. Otherwise set this to nullptr.

    const IGpuMemory*  pDirectCapMemory;   ///< The Direct Capture gpu memory object. It should be set if flag
                                           ///  captureBegin or captureEnd is set. Otherwise set this to nullptr.
    const IGpuMemory*  pPrivFlipMemory;    ///< The gpu memory object of the private flip primary surface for the
                                           ///  DirectCapture feature.
    const Util::Event* pEarlyPresentEvent; ///< The 'early present' event object. This variable can be nullptr.
    uint64             frameIndex;         ///< The frame index of this command buffer. It is only required for the
                                           ///  DirectCapture feature
    uint32             vidPnSourceId;      ///< The display source id for the DirectCapture feature. Clients must set
                                           ///  a valid vidPnSourceId when privateFlip flag is set and pDirectCapMemory
                                           ///  is nullptr.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 865
    uint64             frameId;            ///< Present frame index, incremented at each present
#endif
    const IGpuMemory* pMotionVectorMemory; ///< The motion vector gpu memory object for the DirectCapture feature.
    const IGpuMemory* pDepthMemory;        ///< The depth gpu memory object for the DirectCapture feature.
    const IGpuMemory* pCameraMemory;       ///< The camera gpu memory object for the DirectCapture feature.
};

/// Specifies rotation angle between two images.  Used as input to ICmdBuffer::CmdScaledCopyImage.
enum class ImageRotation : uint32
{
    Ccw0   = 0x0,   ///< Counter clockwise degree 0
    Ccw90  = 0x1,   ///< Counter clockwise degree 90
    Ccw180 = 0x2,   ///< Counter clockwise degree 180
    Ccw270 = 0x3,   ///< Counter clockwise degree 270
    Count
};

/// Describes a color-key value which can control a pixel get copied or ignored during a CmdScaledCopyImage operation.
struct ColorKey
{
    uint32 u32Color[4]; ///< The color value for each channel
};

/// Uniquely identifies the target of the a Present operation (swap chain / destination window / etc.) so that PAL's debug
/// layers can track frames-per-second or other statistics correctly when applications render to multiple displays or
/// windows.  Client drivers which don't care about this can always specify a key value of 0.
using UniquePresentKey = uint64;

/// Convert an OS window handle to a unique present key.
inline UniquePresentKey PresentKeyFromOsWindowHandle(OsWindowHandle handle)
    { return handle.win; }
/// Convert any pointer to a unique present key.
template <typename T>
constexpr inline UniquePresentKey PresentKeyFromPointer(T* ptr) { return reinterpret_cast<UniquePresentKey>(ptr); }

/// Specifies the input parameters for debug overlay's visual confirm. This struct is not functional.
/// The client is expected to default initialize this struct and then fill out any state that makes
/// sense under its presentation model. PAL will process any valid input and ignore fields that are
/// default initialized.
struct CmdPostProcessDebugOverlayInfo
{
    PresentMode presentMode;           ///< The Presentation Mode of the application.
    WsiPlatform wsiPlatform;           ///< The WsiPlatform that Swap Chain works upon
    UniquePresentKey presentKey;       ///< Identifies the window/swap chain, etc. used to present.
};

/// Specifies the input parameters for ICmdBuffer::CmdPostProcessFrame.
struct CmdPostProcessFrameInfo
{
    union
    {
        struct
        {
            uint32 srcIsTypedBuffer :  1; ///< True if the source is a typed buffer instead of an image.
            uint32 reserved         : 31; ///< Reserved for future usage.
        };
        uint32     u32All;                ///< Flags packed as uint32.
    } flags;

    union
    {
        const IImage*     pSrcImage;       ///< The image to postprocess (prior to presenting).
        const IGpuMemory* pSrcTypedBuffer; ///< The typed buffer to postprocess.
                                           ///  Must have been created as a typed buffer.
    };

    CmdPostProcessDebugOverlayInfo      debugOverlay;
    FullScreenFrameMetadataControlFlags fullScreenFrameMetadataControlFlags;

    Pal::ImageLayout srcImageLayout;
};

/// External flags for ScaledCopyImage.
union ScaledCopyFlags
{
    struct
    {
        uint32 srcColorKey    : 1;  ///< If set, enables source color-keying by using the value in the ColorKey member.
                                    ///  That is, any pixel in the source image that matches the color key should not be
                                    ///  copied to the destination image, and all of the source pixels that do not match
                                    ///  the color key should be copied. Mutually exclusive with dstColorKey.
        uint32 dstColorKey    : 1;  ///< If set, enables destination color-keying by using the value in the ColorKey
                                    ///  member. That is, any pixel in the destination image that matches the color key
                                    ///  should be replaced with the corresponding pixel from the source image, and all of
                                    ///  the destination pixels that do not match the color key should not be replaced.
                                    ///  Mutually exclusive with srcColorKey.
        uint32 srcAlpha       : 1;  ///< If set, use alpha channel in source surface as blend factor.
                                    ///  color = src alpha * src color + (1.0 - src alpha) * dst color.
        uint32 dstAsSrgb      : 1;  ///< If set, a non-srgb destination image will be treated as srgb format.
                                    ///  Cannot be set if @ref dstAsNorm is set.
        uint32 dstAsNorm      : 1;  ///< If set, a srgb destination image will be treated as non-srgb format.
                                    ///  Cannot be set if @ref dstAsSrgb is set.
        uint32 scissorTest    : 1;  ///< If set, do scissor test using the specified scissor rectangle.
        uint32 coordsInFloat  : 1;  ///< If set, copy regions are represented in floating point type.
        uint32 srcAsNorm      : 1;  ///< If set, an srgb source image will be treated as non-srgb format.
                                    ///  Cannot be set if @ref srcAsSrgb is set.
        uint32 srcAsSrgb      : 1;  ///< If set, a non-srgb source image will be treated as srgb format.
                                    ///  Cannot be set if @ref srcAsNorm is set.
        uint32 reserved       : 23; ///< reserved for future usage.
    };
    uint32 u32All;                  ///< Flags packed as uint32.
};

/// Input structure to @ref ICmdBuffer::CmdScaledCopyImage. Specifies parameters needed to execute CmdScaledCopyImage.
struct ScaledCopyInfo
{
    const IImage*                   pSrcImage;      ///< The source image to blt from.
    ImageLayout                     srcImageLayout; ///< The source image layout.
    const IImage*                   pDstImage;      ///< The dest image to blt to.
    ImageLayout                     dstImageLayout; ///< The dest image layout.
    uint32                          regionCount;    ///< Copy region array size.
    const ImageScaledCopyRegion*    pRegions;       ///< Region array to copy.
    TexFilter                       filter;         ///< Controlling how a given texture is sampled.
    ImageRotation                   rotation;       ///< Rotation option between two images.
    const ColorKey*                 pColorKey;      ///< Color key value.
    const Rect*                     pScissorRect;   ///< Scissor test rectangle.
    ScaledCopyFlags                 flags;          ///< Copy flags, identifies the type of blt to peform.
};

/// Input structure to @ref ICmdBuffer::CmdGenerateMipmaps. Specifies parameters needed to execute CmdGenerateMipmaps.
struct GenMipmapsInfo
{
    const IImage*  pImage;         ///< Populate mips in this image by reading from existing higher-level mips.
    ImageLayout    baseMipLayout;  ///< The layout of all slices in the read-only base mip; must include LayoutCopySrc.
    ImageLayout    genMipLayout;   ///< The layout of all slices and mips that will be generated; must include
                                   ///  LayoutCopySrc and LayoutCopyDst.
    SubresRange    range;          ///< Which subresources should be generated from earlier mips. The starting mipLevel
                                   ///  must never be zero because there would be no larger mip to read.
    TexFilter      filter;         ///< Controls texture sampling during mip generation. Linear texture filtering is
                                   ///  only supported for images with non-integer formats.
    SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret all subresources using this format and swizzle.
                                   ///  The specified format needs to have been included in the "pViewFormats" list
                                   ///  specified at image-creation time, otherwise the result might be incorrect.
};

/// Defines a single memory range to prefetch using CmdPrimeGpuCaches.
struct PrimeGpuCacheRange
{
    gpusize gpuVirtAddr;          ///< Base GPU virtual address to be prefetched.
    gpusize size;                 ///< Number of bytes to prefetch.  Clients should keep range sizes small relative
                                  ///  to the GPU caches (e.g., tccSizeInBytes); the PAL implementation may clamp
                                  ///  prefetched ranges if they are too large for the cache being prefetched.
    uint32  usageMask;            ///< Bitmask of CacheCoherencyUsageFlags defining the usage to prefetch for.
                                  ///  E.g., if the mask includes CoherShader, then PAL will attempt to prefetch
                                  ///  into caches that are on the shader core's data path.  This mask must be a
                                  ///  subset of the dstCacheMask specified in the last barrier operation executed
                                  ///  on this memory range.  Performing the cache prefetch is considered a read
                                  ///  operation of the specified usage, and so must be properly accounted for
                                  ///  in future barrier memory dependencies for this range.
    bool    addrTranslationOnly;  ///< If set, only the address translation caches (i.e., TLB) will be primed;
                                  ///  no data caches will be affected.  If this is set, the prefetch operation
                                  ///  has no bearing on barrier execution or memory dependencies.
};

/// Magic number tag for payloads in command buffer dumps
constexpr uint32 CmdBufferPayloadSignature = 0x1337F77D;

/// Maximum size, in DWORDs, of payload data in command buffer dumps.
constexpr uint32 MaxPayloadSize = 254;

/// Payload types used in special embedded NOP packets.
enum class CmdBufferPayloadType : uint32
{
    Integer             = 0,    ///< Payload consists of a single 32-bit signed integer.
    UnsignedInteger     = 1,    ///< Payload consists of a single 32-bit unsigned integer.
    Integer64           = 2,    ///< Payload consists of a single 64-bit signed integer.
    UnsignedInteger64   = 3,    ///< Payload consists of a single 64-bit unsigned integer.
    Float               = 4,    ///< Payload consists of a single 32-bit floating point number.
    Double              = 5,    ///< Payload consists of a single 64-bit double precision floating point number.
    Pointer             = 6,    ///< Payload consists of a single 64-bit pointer address.
    String              = 7,    ///< Payload consists of a variable length string. Must contain null-terminator.
    Binary              = 8,    ///< Payload consists of DWORD-aligned binary data.
};

/// Structure layout for embedded CmdBuffer payloads. This can be embedded into the command stream with the
/// @ref ICmdBuffer::CmdNop() function.
struct CmdBufferPayload
{
    uint32               signature;     ///< Magic number tag indicating the structure to follow.
    uint32               payloadSize;   ///< Size of the NOP packet (one DWORD) plus the sizeof this structure and the
                                        ///  payload data to follow.
                                        ///  This value is in DWORDs. Payload size is expected to be under
                                        ///  MaxPayloadSize.
    CmdBufferPayloadType type;          ///< The type of payload.
    uint32               payload[1];    ///< Initial DWORD of payload data with the other data to follow.
};

/// Flags controlling which sub-queue(s) of a command buffer should insert an RGP trace marker.  Zeroing out this
/// union is invalid, because RGP markers must be sent to at least one sub-queue.
union RgpMarkerSubQueueFlags
{
    struct
    {
        uint32  includeMainSubQueue    :  1; ///< If set, includes the main sub-queue in the RGP marker.
        uint32  includeGangedSubQueues :  1; ///< If set, includes any ganged sub-queues in the RGP marker.
        uint32  reserved               : 30; ///< Reserved for future use.
    };
    uint32  u32All; ///< Flags packed into a uint32
};

/**
 ***********************************************************************************************************************
 * @interface ICmdBuffer
 * @brief     Contains GPU rendering and other commands recorded by PAL on the client's behalf.
 *
 * A command buffer can be executed by the GPU multiple times and recycled, provided the command buffer is not pending
 * execution on the GPU when it is recycled.
 *
 * Command buffers are fully independent and there is no persistence of GPU state between submitted command buffers.
 * When a new command buffer is recorded, the state is undefined.  All relevant state must be explicitly set by the
 * client before state-dependent operations such as draws and dispatches.
 *
 * @see IDevice::CreateCmdBuffer()
 ***********************************************************************************************************************
 */
class ICmdBuffer : public IDestroyable
{
public:
    /// Resets the command buffer's previous contents and state, then puts it in the _building_ _state_, allowing new
    /// commands to be recorded.
    ///
    /// If this is a root command buffer, the state will be reset to a "clean slate" with nothing bound.  If this is a
    /// nested command buffer, the state is set to an "undefined" state so that all render state can be inherited from
    /// any root command buffer which executes this one.
    ///
    /// @param [in] info Controls how PAL will generate commands for this command buffer.  E.g., specifies whether the
    ///                  command buffer may be submitted more than once, and controls options for optimizing PM4, etc.
    ///
    /// @returns Success if the command buffer was successfully reset and put into the _building_ _state_.  Otherwise,
    ///          one of the following error codes may be returned:
    ///          + ErrorInvalidFlags if invalid flags are set in the flags parameter.
    ///          + ErrorIncompleteCommandBuffer if the command buffer is already in the _building_ _state_.
    virtual Result Begin(
        const CmdBufferBuildInfo& info) = 0;

    /// Completes recording of a command buffer in the _building_ _state_, making it _executable_.
    ///
    /// @returns Success if the command buffer was successfully made _executable_.  Otherwise, one of the following
    ///          errors may be returned:
    ///          + ErrorIncompleteCommandBuffer if the command buffer is not in the _building_ _state_.
    ///          + ErrorBuildingCommandBuffer if some error occurred while building the command buffer, and it could not
    ///            be made _executable_.  If this error is returned, the command buffer can not be submitted.
    virtual Result End() = 0;

    /// Explicitly resets a command buffer, releasing any internal resources associated with it.
    ///
    /// This call must be used to reset command buffers that have previously reported a ErrorIncompleteCommandBuffer
    /// error.
    ///
    /// @note @ref Begin will implicitly cause a command buffer to be reset in addition to putting it in the
    ///       _building_ _state_.  This method just gives a way to release resources between when the client knows
    ///       it is done with the command buffer and when it is ready to reuse this command buffer object for
    ///       recording new commands.
    ///
    /// @param [in] pCmdAllocator If non-null, all future GPU memory allocations will be done using this allocator.
    ///                           Otherwise the command buffer will continue to use its current command allocator.
    ///
    /// @param [in] returnGpuMemory If true then all GPU memory associated with this command buffer will be returned
    ///                             to the allocator upon reset. If false data chunks will be retained and reused.
    ///                             Note: This flag must be true if changing command allocators.
    ///
    /// @warning If returnGpuMemory is false, the client must guarantee that this command buffer is not queued for
    ///          execution, is not currently being executed, and that all other command buffers that have referenced
    ///          this command buffer in a @ref CmdExecuteNestedCmdBuffers call have also been reset.
    ///
    /// @returns Success if the command buffer was successfully reset.  Otherwise, one of the following errors may be
    ///          returned:
    ///          + ErrorUnknown if an internal PAL error occurs.
    virtual Result Reset(ICmdAllocator* pCmdAllocator, bool returnGpuMemory) = 0;

    /// Queries how many DWORDs of embedded data the command buffer can allocate in one call to CmdAllocateEmbeddedData.
    ///
    /// This a property of the command buffer and its associated command allocator; it may change if the caller
    /// specifies a different command allocator on Reset().
    ///
    /// @returns How many DWORDs of embedded data the command buffer can allocate at once.
    virtual uint32 GetEmbeddedDataLimit() const = 0;

    /// Queries how many DWORDs of embedded data the command buffer can allocate in one call to
    /// CmdAllocateLargeEmbeddedData.
    ///
    /// @returns Number of DWORDs that can be allocated in one call to CmdAllocateLargeEmbeddedData
    virtual uint32 GetLargeEmbeddedDataLimit() const = 0;

    /// Binds a graphics or compute pipeline to the current command buffer state.
    ///
    /// Graphics pipelines must be compiled for the PAL ABI. Compute pipelines must either be compiled for the PAL ABI
    /// or the HSA ABI, if it's supported. HSA ABI support is indicated by supportHsaAbi in @ref DeviceProperties.
    ///
    /// PAL ABI pipelines and HSA ABI pipelines use different mechanisms to bind inputs and outputs. PAL ABI pipelines
    /// use user data entries set by @ref CmdSetUserData. HSA ABI pipelines use kernel arguments set by @ref
    /// CmdSetKernelArguments. Binding or unbinding a compute pipeline can implicitly modify the user data and kernel
    /// argument state, please read the @ref CmdSetUserData and @ref CmdSetKernelArguments documentation for details.
    ///
    /// @param [in] params Parameters necessary to manage dynamic pipeline shader information.
    virtual void CmdBindPipeline(
        const PipelineBindParams& params) = 0;

    /// Binds the specified MSAA state object to the current command buffer state.
    ///
    /// @param [in] pMsaaState New MSAA state to be bound.  Can be null in order to unbind a previously bound MSAA state
    ///                        object without binding a new one.
    virtual void CmdBindMsaaState(
        const IMsaaState* pMsaaState) = 0;

    /// Saves a copy of all of the current command buffer state that is used by graphics workloads. This feature is
    /// intended to give PAL clients a convenient way to issue their own internal graphics workloads without modifying
    /// the application-facing state.
    ///
    /// PAL cannot save multiple layers of state, each call to CmdSaveGraphicsState must be followed by a call to
    /// CmdRestoreGraphicsState before the next call to CmdSaveGraphicsState. Any barriers, resolves, blits, etc are not
    /// allowed while the state is pushed.
    ///
    /// This function can only be called on command buffers that support graphics workloads. All query counters will be
    /// disabled until CmdRestoreGraphicsState is called.
    virtual void CmdSaveGraphicsState() = 0;

    /// Restores all of the command buffer state that is used by graphics workloads. This feature is intended to
    /// give PAL clients a convenient way to issue their own internal graphics workloads without modifying the
    /// application-facing state.
    ///
    /// A call to this function must be preceded by a call to CmdSaveGraphicsState
    ///
    /// This function can only be called on command buffers that support graphics workloads. All previously disabled
    /// query counters will be reactivated.
    virtual void CmdRestoreGraphicsState() = 0;

    /// Sets the shading rate in the command buffer along with the state of the various combiners.
    ///
    /// @param [in] rateParams     Nwe VRS shading rate parameters to be bound.
    virtual void CmdSetPerDrawVrsRate(
        const VrsRateParams&  rateParams) = 0;

    /// Setup parameters regarding how pixel center will be evaluated with VRS.
    ///
    /// @param [in] centerState     Nwe VRS parameters to be bound that control how pixel center is defined.
    virtual void CmdSetVrsCenterState(
        const VrsCenterState&  centerState) = 0;

    /// Binds the shading rate data in the specified image into the pipeline for use with VRS.  Only relevant if the
    /// combiner stage for VrsCombinerStage is set to something other than Passthrough.
    ///
    /// This binding point requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageSampleRate
    /// - CacheCoherency: @ref CoherSampleRate
    /// - ImageLayout:    @ref LayoutSampleRate
    ///
    /// @param [in] pImage   Image that contains sample rate data.  Pointer can be NULL to force 1x1 shading rate.
    virtual void CmdBindSampleRateImage(
        const IImage*  pImage) = 0;

    /// Binds the specified color/blend state object to the current command buffer state.
    ///
    /// @param [in] pColorBlendState New color/blend state to be bound.  Can be null in order to unbind a previously
    ///                              bound color/blend state object without binding a new one.
    virtual void CmdBindColorBlendState(
        const IColorBlendState* pColorBlendState) = 0;

    /// Binds the specified depth/stencil state object to the current command buffer state.
    ///
    /// @param [in] pDepthStencilState New depth/stencil state to be bound.  Can be null in order to unbind a previously
    ///                                bound depth/stencil state object without binding a new one.
    virtual void CmdBindDepthStencilState(
        const IDepthStencilState* pDepthStencilState) = 0;

    /// Sets the value range to be used for depth bounds testing.
    ///
    /// The depth bounds test is enabled in the graphics pipeline.  When enabled, an additional check will be done that
    /// will reject a pixel if the pre-existing depth value stored at its destination location is outside of the
    /// specified bounds.  Applications would typically use this feature to optimize shadow volume rendering.
    ///
    /// @param [in] params  Parameters necessary to set the depth bounds (such as min/max depth).
    virtual void CmdSetDepthBounds(
        const DepthBoundsParams& params) = 0;

    /// Sets pipeline-accessible user data to the specified values.
    ///
    /// The values set in user data entries will be interpreted based on the resource mapping specified for each shader
    /// in the currently bound pipeline.  For example, the client can write virtual addresses of tables containing
    /// SRDs, immediate SRDs that can be loaded without an indirection, or even a small number of immediate ALU
    /// constants.
    ///
    /// The user data values are only used by PAL ABI pipelines. Almost all pipelines used by PAL clients are compiled
    /// for the PAL ABI, but PAL also supports HSA ABI compute pipelines which use @ref CmdSetKernelArguments instead.
    /// When an HSA ABI pipeline is bound the current compute user data entries are saved and will be restored if the
    /// client later binds a PAL ABI compute pipeline.
    ///
    /// @warning It's illegal to set compute user data if an HSA ABI pipeline is currently bound.
    ///
    /// If no compute pipeline is currently bound PAL assumes the client will bind a PAL ABI pipeline and thus accepts
    /// user data bindings. Graphics user data are unaffected by all of this because graphics pipelines can only use
    /// the PAL ABI.
    ///
    /// @see PipelineShaderInfo
    /// @see ResourceMappingNode
    /// @ingroup ResourceBinding
    ///
    /// @param [in] bindPoint    Specifies which type of user-date is to be set (i.e., compute or graphics).
    /// @param [in] firstEntry   First user data entry to be updated.
    /// @param [in] entryCount   Number of user data entries to update; size of the pEntryValues array. Must be greater
    ///                          than zero, and (firstEntry + entryCount) must not extend beyond MaxUserDataEntries.
    /// @param [in] pEntryValues Array of 32-bit values to be copied into user data.
    void CmdSetUserData(
        PipelineBindPoint bindPoint,
        uint32            firstEntry,
        uint32            entryCount,
        const uint32*     pEntryValues)
    { (m_funcTable.pfnCmdSetUserData[static_cast<uint32>(bindPoint)])(this, firstEntry, entryCount, pEntryValues); }

    /// Copies all pipeline-accessible user-data from one bind point to another.  It is invalid if the source and
    /// dest parameters refer to the same bind point.
    ///
    /// @see CmdSetUserData for how the user-date entries will be interpreted by the pipeline.
    ///
    /// @param [in] source  Specifies which bind point to copy from.
    /// @param [in] dest    Specifies which bind point to copy into.
    virtual void CmdDuplicateUserData(
        PipelineBindPoint source,
        PipelineBindPoint dest) = 0;

    /// Sets one or more HSA code object kernel argument values.
    ///
    /// If the currently bound compute pipeline was compiled using the HSA compute ABI this function must be used to
    /// bind that pipeline's arguments. The argument position and value types are static properties of the pipeline
    /// and must be known by the client.
    ///
    /// @note Calling @ref CmdBindPipeline invalidates all prior kernel argument bindings, even if the new pipeline
    ///       also uses the HSA ABI. Any kernel arguments that the client intends to share between pipelines must
    ///       be manually rebound.
    ///
    /// @warning It's illegal to call this function if no compute pipeline is bound or if the bound compute pipeline
    ///          uses a different ABI (e.g., the PAL compute ABI).
    ///
    /// @ingroup ResourceBinding
    ///
    /// @param [in] firstArg  The zero-based position of the first kernel argument to bind.
    /// @param [in] argCount  Number of kernel arguments this call binds.
    /// @param [in] ppValues  Array of pointers to kernel argument values.
    virtual void CmdSetKernelArguments(
        uint32            firstArg,
        uint32            argCount,
        const void*const* ppValues) = 0;

    /// Changes one or more of the command buffer's active vertex buffers.
    ///
    /// @note  If bufferViews.offsetMode is false, PAL will construct SRDs for each bound vertex buffer which are
    ///        equivalent to the client calling @ref IDevice::CreateUntypedBufferViewSrd on each element of the
    ///        pBuffers parameter.
    ///
    /// Note that vertex buffers require use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageVs
    /// - CacheCoherency: @ref CoherShaderRead
    ///
    /// @param [in] bufferViews Vertex buffer view descriptors.This parameter defines which vertex mode is used through
    ///                         @ref VertexBufferViews::offsetMode. VertexBufferViews::pVertexBufferViews or
    ///                         VertexBufferViews::pBufferViewInfos must not be nullptr.
    virtual void CmdSetVertexBuffers(
        const VertexBufferViews& bufferViews) = 0;

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 877
    void CmdSetVertexBuffers(
        uint32                firstBuffer,
        uint32                bufferCount,
        const BufferViewInfo* pBuffers)
    {
        const VertexBufferViews bufferViews =
        {
            .firstBuffer = firstBuffer,
            .bufferCount = bufferCount,
            .offsetMode = false,
            .pBufferViewInfos = pBuffers
        };
        CmdSetVertexBuffers(bufferViews);
    }
#endif

    /// Binds a range of memory for use as index data (i.e., binds an index buffer).
    ///
    /// The GPU virtual address must be index element aligned: 2-byte aligned for 16-bit indices or 4-byte aligned for
    /// 32-bit indices.
    ///
    /// The index buffer binding point requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageFetchIndices
    /// - CacheCoherency: @ref CoherIndexData
    ///
    /// @param [in] gpuAddr    GPU virtual address of the index data.  Can be zero to unbind the previously bound data.
    /// @param [in] indexCount Maximum number of indices in the index data; the GPU may read less indices.
    /// @param [in] indexType  Specifies whether to use 8-bit, 16-bit or 32-bit index data.
    virtual void CmdBindIndexData(
        gpusize   gpuAddr,
        uint32    indexCount,
        IndexType indexType) = 0;

    /// Binds color and depth/stencil targets to the current command buffer state.
    ///
    /// The current layout of each target must also be specified.
    ///
    /// The color target binding points require use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageColorTarget
    /// - CacheCoherency: @ref CoherColorTarget
    /// - ImageLayout:    @ref LayoutColorTarget
    ///
    /// The depth and stencil target binding points require use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget
    /// - CacheCoherency: @ref CoherDepthStencilTarget
    /// - ImageLayout:    @ref LayoutDepthStencilTarget
    ///
    /// @param [in] params Parameters representing the color and depth/stencil targets to bind to the command buffer.
    virtual void CmdBindTargets(
        const BindTargetParams& params) = 0;

    /// Binds stream-output target buffers to the current command buffer state.
    ///
    /// At draw-time, the stream-output targets must be consistent with the soState parameters specified by the
    /// currently bound graphics pipeline.
    ///
    /// The stream-output target buffers require use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageStreamOut
    /// - CacheCoherency: @ref CoherStreamOut
    ///
    /// @param [in] params Parameters representing the stream-output target buffers to bind to the command buffer.
    virtual void CmdBindStreamOutTargets(
        const BindStreamOutTargetParams& params) = 0;

    /// Sets the constant factor to be used by the blend hardware when programmed with the Blend::ConstantColor,
    /// Blend::OneMinusConstantColor, Blend::ConstantAlpha, or Blend::OneMinusConstantAlpha blend coefficients.
    ///
    /// @param [in] params Parameters representing the blend constant factor.
    virtual void CmdSetBlendConst(
        const BlendConstParams& params) = 0;

    /// Sets input assembly state for upcoming draws in this command buffer.
    ///
    /// At draw-time, the topology specified with this method must be consistent with the _topologyInfo_ parameters
    /// specified by the currently bound graphics pipeline.
    ///
    /// @param [in] params Parameters representing the input assembly state for upcoming draws.
    virtual void CmdSetInputAssemblyState(
        const InputAssemblyStateParams& params) = 0;

    /// Sets parameters controlling triangle rasterization.
    ///
    /// @param [in] params Parameters to set the triangle raster state (such as fill/cull mode).
    virtual void CmdSetTriangleRasterState(
        const TriangleRasterStateParams& params) = 0;

    /// Sets parameters controlling point and line rasterization.
    ///
    /// @param [in] params Parameters to set the point and line rasterization state (such as pointSize and lineWidth).
    virtual void CmdSetPointLineRasterState(
        const PointLineRasterStateParams& params) = 0;

    /// Sets parameters controlling line stippling.
    ///
    /// @param [in] params Parameters to set the line stipple state.
    virtual void CmdSetLineStippleState(
        const LineStippleStateParams& params) = 0;

    /// Sets depth bias parameters.
    ///
    /// Depth bias is used to ensure a primitive can properly be displayed (without Z fighting) in front (or behind)
    /// of the previously rendered co-planar primitive.  This is useful for decal or shadow rendering.
    ///
    /// @param [in] params  Parameters for setting the depth bias (such as depth bias, depth bias clamp, and slope
    ///                     scaled depth bias).
    virtual void CmdSetDepthBiasState(
        const DepthBiasParams& params) = 0;

    /// Sets stencil reference values and mask buffer reads and writes in upcoming draws. Separate reference values
    /// can be specified for front-facing and back-facing polygons. Update flags should be set for state which needs to
    /// be updated. All other state will be preserved.
    /// Setting all the values (reference, read/write masks and stencil op) in the StencilRefMaskParams together
    /// takes the faster path.
    /// Setting either the ref value, read/write masks or the stencil op value individually takes the slower
    /// read-modify-write path.
    ///
    /// @param [in] params  Parameters for setting the stencil read and write masks.
    virtual void CmdSetStencilRefMasks(
        const StencilRefMaskParams& params) = 0;

    /// Sets user defined clip planes, should only be called on universal command buffers.
    ///
    /// @param [in] firstPlane The index of first plane in user define clip plane array.
    /// @param [in] planeCount The count of planes in plane array.
    /// @param [in] pPlanes    Pointer to plane array.
    virtual void CmdSetUserClipPlanes(
        uint32               firstPlane,
        uint32               planeCount,
        const UserClipPlane* pPlanes) = 0;

    /// Sets clip rects, should only be called on universal command buffers.
    ///
    /// @param [in] clipRule  16-bit clip rule bits are used to determine if pixel shall be discarded or retained.
    ///                       For each pixel, a 4-bit index is computed based on which clip rects the pixel is
    ///                       inside (bitN represents rectN). Then uses this index to check the corresponding bit
    ///                       in clip rule for this pixel - 0 for discarded, 1 for retained.
    /// @param [in] rectCount The count of rectangles in rect list. This must be less than or equal to
    ///                       MaxClipRects (4).
    /// @param [in] pRectList Pointer to the rect list.
    virtual void CmdSetClipRects(
        uint16      clipRule,
        uint32      rectCount,
        const Rect* pRectList) = 0;

    /// Sets user defined MSAA quad-pixel sample pattern, should only be called on universal command buffers
    /// This should be called before clearing, rendering, barriering and resolving of MSAA DepthStencil image.
    ///
    /// @param [in] numSamplesPerPixel Number of samples per pixel
    /// @param [in] quadSamplePattern  The input msaa sample pattern
    virtual void CmdSetMsaaQuadSamplePattern(
        uint32                       numSamplesPerPixel,
        const MsaaQuadSamplePattern& quadSamplePattern) = 0;

    /// Sets the specified viewports to the current command buffer state.
    ///
    /// @param [in] params  Parameters for setting the specified number of viewports.
    virtual void CmdSetViewports(
        const ViewportParams& params) = 0;

    /// Sets the scissor regions corresponding to each viewport to the current command buffer state.
    ///
    /// @param [in] params  Parameters for setting the specified number of scissor regions.
    virtual void CmdSetScissorRects(
        const ScissorRectParams& params) = 0;

    /// Sets the global scissor rectangle.
    ///
    /// @param [in] params  Parameters for setting the global scissor rectangle from the top left to bottom right
    ///                     coordinate.
    virtual void CmdSetGlobalScissor(
        const GlobalScissorParams& params) = 0;

    /// Inserts a barrier in the current command stream that can stall GPU execution, flush/invalidate caches, or
    /// decompress images before further, dependent work can continue in this command buffer.
    ///
    /// This operation does not honor the command buffer's predication state, if active.
    ///
    /// @param [in] barrierInfo See @ref BarrierInfo for detailed information.
    virtual void CmdBarrier(
        const BarrierInfo& barrierInfo) = 0;

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 857
    /// Perform source pipeline point and cache access optimization based on the legacy barrier interface.
    ///
    /// @param [in]     pipePointWaitCount Number of entries in pPipePoints.
    /// @param [in/out] pPipePoints        Array of @ref HwPipePoint to optimize.
    /// @param [in/out] pCacheMask         A mask of ORed @ref CacheCoherencyUsageFlags to optimize.
    ///
    /// @note HwPipePostBlt will be converted to a more accurate stage based on the underlying implementation of
    ///       outstanding BLTs, but will be left as HwPipePostBlt if the internal outstanding BLTs can't be expressed
    ///       as a client-facing HwPipePoint (e.g., if there are CP DMA BLTs in flight).
    void OptimizeBarrierReleaseInfo(
        uint32       pipePointWaitCount,
        HwPipePoint* pPipePoints,
        uint32*      pCacheMask) const { PAL_NOT_IMPLEMENTED(); }
#endif

    /// Perform source pipeline stage and cache access optimization based on the acquire/release interface.
    ///
    /// @param [in]     barrierType    Barrier transition type @ref BarrierType.
    /// @param [in]     pImage         Image pointer for image transition, required when @ref BarrierType is
    ///                                BarrierType::Image.
    /// @param [in/out] pSrcStageMask  A source mask of ORed @ref PipelineStageFlag to optimize, can't be null.
    /// @param [in/out] pSrcAccessMask A source mask of ORed @ref CacheCoherencyUsageFlags to optimize, can't be null.
    /// @param [in/out] pDstStageMask  A destination mask of ORed @ref PipelineStageFlag to optimize, can't be null.
    /// @param [in/out] pDstAccessMask A destination mask of ORed @ref CacheCoherencyUsageFlags to optimize.
    ///
    /// @returns If need flush and invalidate GL2 cache.
    ///
    /// @note PipelineStageBlt will be converted to more accurate stage(s) based on the underlying implementation of
    ///       outstanding BLTs, but will be left as PipelineStageBlt if the internal outstanding BLTs can't be expressed
    ///       as a client-facing PipelineStage (e.g., if there are CP DMA BLTs in flight).
    virtual bool OptimizeAcqRelReleaseInfo(
        BarrierType   barrierType,
        const IImage* pImage,
        uint32*       pSrcStageMask,
        uint32*       pSrcAccessMask,
        uint32*       pDstStageMask,
        uint32*       pDstAccessMask) const = 0;

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 858
    /// @param [in/out] pSrcStageMask  A mask of ORed @ref PipelineStageFlag to optimize, can't be null.
    /// @param [in/out] pSrcAccessMask A mask of ORed @ref CacheCoherencyUsageFlags to optimize, can't be null.
    void OptimizeAcqRelReleaseInfo(
        uint32*     pSrcStageMask,
        uint32*     pSrcAccessMask) const
    {
        uint32 unusedStageMask  = 0;
        uint32 unusedAccessMask = 0;
        OptimizeAcqRelReleaseInfo(BarrierType::Global, nullptr,
                                  pSrcStageMask, pSrcAccessMask, &unusedStageMask, &unusedAccessMask);
    }
#elif PAL_CLIENT_INTERFACE_MAJOR_VERSION < 864
    /// @param [in/out] barrierType Barrier transition type @ref BarrierType.
    /// @param [in/out] pSrcStageMask  A mask of ORed @ref PipelineStageFlag to optimize, can't be null.
    /// @param [in/out] pSrcAccessMask A mask of ORed @ref CacheCoherencyUsageFlags to optimize, can't be null.
    void OptimizeAcqRelReleaseInfo(
        BarrierType barrierType,
        uint32*     pSrcStageMask,
        uint32*     pSrcAccessMask) const
    {
        uint32 unusedStageMask  = 0;
        uint32 unusedAccessMask = 0;
        OptimizeAcqRelReleaseInfo(barrierType, nullptr,
                                  pSrcStageMask, pSrcAccessMask, &unusedStageMask, &unusedAccessMask);
    }
#elif PAL_CLIENT_INTERFACE_MAJOR_VERSION < 867
    /// @param [in]     barrierType    Barrier transition type @ref BarrierType.
    /// @param [in/out] pSrcStageMask  A source mask of ORed @ref PipelineStageFlag to optimize, can't be null.
    /// @param [in/out] pSrcAccessMask A source mask of ORed @ref CacheCoherencyUsageFlags to optimize, can't be null.
    /// @param [in/out] pDstStageMask  A destination mask of ORed @ref PipelineStageFlag to optimize, can't be null.
    /// @param [in/out] pDstAccessMask A destination mask of ORed @ref CacheCoherencyUsageFlags to optimize.
    void OptimizeAcqRelReleaseInfo(
        BarrierType barrierType,
        uint32*     pSrcStageMask,
        uint32*     pSrcAccessMask,
        uint32*     pDstStageMask,
        uint32*     pDstAccessMask) const
    {
        uint32 unusedStageMask  = 0;
        uint32 unusedAccessMask = 0;
        OptimizeAcqRelReleaseInfo(barrierType, nullptr,
                                  pSrcStageMask, pSrcAccessMask, &unusedStageMask, &unusedAccessMask);
    }
#endif

    /// Performs the release portion of an acquire/release-based barrier.  This releases a set of resources from their
    /// current usage, while CmdAcquire() is expected to be called to acquire access to the resources for future,
    /// different usage.
    ///
    /// Conceptually, this method will:
    ///   - Ensure the specified source synchronization scope has completed.
    ///   - Ensure all specified resources are available in memory.  The availability operation will flush all
    ///     write-back caches to the last-level-cache.
    ///   - Perform any requested layout transitions.
    ///
    /// Once all of these operations are complete, the release issues a timestamp event that signals the operation
    /// completion.  The event type and timestamp value is returned to caller in a packed uint32 token.  A corresponding
    /// CmdAcquire() call is expected to wait on one or a list of such synchronization tokens and perform any necessary
    /// visibility operations and/or layout transitions that could not be predicted at release-time.
    ///
    /// @param [in]  releaseInfo  Describes the synchronization scope, availability operations, and required layout
    ///                           transitions.
    /// @returns Synchronization token for the release operation.  Pass this token to CmdAcquire to confirm completion.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 885
    virtual uint32 CmdRelease(
#else
    virtual ReleaseToken CmdRelease(
#endif
        const AcquireReleaseInfo& releaseInfo) = 0;

    /// Performs the acquire portion of an acquire/release-based barrier.  This acquire a set of resources for a new
    /// set of usages, assuming CmdRelease() was called to release access for the resource's past usage.
    ///
    /// Conceptually, this method will:
    ///   - Ensure all specified resources are visible in memory.  The visibility operation will invalidate all
    ///     relevant caches above the last-level-cache.
    ///   - Perform any requested layout transitions.
    ///   - Ensure the release(s) have completed by waiting on the synchronization token of the release operation.
    ///
    /// @param [in] acquireInfo    Describes the synchronization scope, visibility operations, and the required layout
    ///                            layout transitions.
    /// @param [in] syncTokenCount Number of entries in pSyncTokens, can be zero if no valid release token.
    /// @param [in] pSyncTokens    Array of synchronization tokens, as returned from CmdRelease, to confirm completion.
    ///                            The token value(s) must have been returned by a CmdRelease call in the same command
    ///                            buffer. pSyncTokens can be null if syncTokenCount is 0.
    virtual void CmdAcquire(
        const AcquireReleaseInfo& acquireInfo,
        uint32                    syncTokenCount,
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 885
        const uint32*             pSyncTokens) = 0;
#else
        const ReleaseToken*       pSyncTokens) = 0;
#endif

    /// Performs the release portion of an acquire/release event-based barrier.  This releases a set of resources from
    /// their current usage, while CmdAcquireEvent() is expected to be called to acquire access to the resources for
    /// future, different usage.
    ///
    /// Conceptually, this method will:
    ///   - Ensure the specified source synchronization scope has completed.
    ///   - Ensure all specified resources are available in memory.  The availability operation will flush all
    ///     write-back caches to the last-level-cache.
    ///   - Perform any requested layout transitions.
    ///
    /// Once all of these operations are complete, the specified IGpuEvent object will be signaled.  A corresponding
    /// CmdAcquireEvent() call is expected to wait on this event and perform any necessary visibility operations and/or
    /// layout transitions that could not be predicted at release-time.
    ///
    /// @param [in] releaseInfo Describes the synchronization scope, availability operations, and required layout
    ///                         transitions.
    /// @param [in] pGpuEvent   Event to be signaled once the release has completed.  Must be a valid (non-null) GPU
    ///                         event pointer. Call CmdRelease()/CmdAcquire() pair instead if want to release/acquire
    ///                         something but no GPU event is available.
    virtual void CmdReleaseEvent(
        const AcquireReleaseInfo& releaseInfo,
        const IGpuEvent*          pGpuEvent) = 0;

    /// Performs the acquire portion of an acquire/release event-based barrier.  This acquire a set of resources for a
    /// new set of usages, assuming CmdReleaseEvent() was called to release access for the resource's past usage.
    ///
    /// Conceptually, this method will:
    ///   - Ensure the release(s) have completed by waiting for the specified IGpuEvent early enough in the pipeline to
    ///     support the specified destination synchronization scope.
    ///   - Ensure all specified resources are visible in memory.  The visibility operation will invalidate all
    ///     relevant caches above the last-level-cache.
    ///   - Perform any requested layout transitions.
    ///
    /// @param [in] acquireInfo    Describes the synchronization scope, visibility operations, and the required layout
    ///                            layout transitions.
    /// @param [in] gpuEventCount  Number of entries in pGpuEvents.
    /// @param [in] ppGpuEvents    Array of one or more events to wait on.  Typically these will be set via
    ///                            CmdReleaseEvent(), but it's valid to wait on an event set through a different means
    ///                            like CmdSetEvent() from CPU side. Must be a valid (non-null) pointer to an array of
    ///                            gpuEventCount valid GPU event pointers. Call CmdReleaseThenAcquire() instead if wait
    ///                            to acquire something but no GPU event is available.
    virtual void CmdAcquireEvent(
        const AcquireReleaseInfo& acquireInfo,
        uint32                    gpuEventCount,
        const IGpuEvent* const*   ppGpuEvents) = 0;

    /// Conceptually equivalent to calling CmdRelease() followed immediately by CmdAcquire(),  but it potentially has
    /// better performance than calling CmdRelease()/CmdAcquire() directly.  Can be called in cases where the client/
    /// application cannot detect separate release and acquire points for a transition.
    ///
    /// Effectively equivalent to @ref ICmdBuffer::CmdBarrier.
    ///
    /// @param [in] barrierInfo  Describes the synchronization scopes, availability/visibility operations, and the
    ///                          required layout transitions.
    virtual void CmdReleaseThenAcquire(
        const AcquireReleaseInfo& barrierInfo) = 0;

    /// Issues an instanced, non-indexed draw call using the command buffer's currently bound graphics state.  Results
    /// in instanceCount * vertexCount vertices being processed.
    ///
    /// It is an error if the currently bound pipeline contains a mesh and/or task shader.
    ///
    /// @param [in] firstVertex   Starting index value for the draw.  Indices passed to the vertex shader will range
    ///                           from firstVertex to firstVertex + vertexCount - 1.
    /// @param [in] vertexCount   Number of vertices to draw.  If zero, the draw will be discarded.
    /// @param [in] firstInstance Starting instance for the draw.  Instance IDs passed to the vertex shader will range
    ///                           from firstInstance to firstInstance + instanceCount - 1.
    /// @param [in] instanceCount Number of instances to draw.  If zero, the draw will be discarded.
    /// @param [in] drawId        Draw index for the draw.
    void CmdDraw(
        uint32 firstVertex,
        uint32 vertexCount,
        uint32 firstInstance,
        uint32 instanceCount,
        uint32 drawId)
    {
        m_funcTable.pfnCmdDraw(this, firstVertex, vertexCount, firstInstance, instanceCount, drawId);
    }

    /// Issues draw opaque call using the command buffer's currently bound graphics state.
    /// Uses the stream-out target of a previous draw as the input vertex data.
    /// the number of vertices = (streamOutFilledSize (value of streamOutFilledSizeVa) - streamOutOffset) / stride
    ///
    /// It is an error if the currently bound pipeline contains a mesh and/or task shader.
    ///
    /// @param [in] streamOutFilledSizeVa gpuAddress of streamOut filled size for streamOut buffer.
    /// @param [in] streamOutOffset       the offset of begin of streamOut as vertex.
    /// @param [in] stride                stride for stream data as vertex.
    /// @param [in] firstInstance         Starting instance for the draw. Instance IDs passed to the vertex shader
    ///                                   will range from firstInstance to firstInstance + instanceCount - 1.
    /// @param [in] instanceCount         Number of instances to draw.  If zero, the draw will be discarded.
    void CmdDrawOpaque(
        gpusize streamOutFilledSizeVa,
        uint32  streamOutOffset,
        uint32  stride,
        uint32  firstInstance,
        uint32  instanceCount)
    {
        m_funcTable.pfnCmdDrawOpaque(this,
                                     streamOutFilledSizeVa,
                                     streamOutOffset,
                                     stride,
                                     firstInstance,
                                     instanceCount);
    }

    /// Issues an instanced, indexed draw call using the command buffer's currently bound graphics state.  Results in
    /// instanceCount * indexCount vertices being processed.
    ///
    /// It is an error if the currently bound pipeline contains a mesh and/or task shader.
    ///
    /// Indices passed to the vertex shader will be:
    ///
    /// + IndexBuffer[firstIndex] + vertexOffset
    /// + IndexBuffer[firstIndex + 1] + vertexOffset,
    /// + ...
    /// + IndexBuffer[firstIndex + indexCount - 1] + vertexOffset
    ///
    /// @param [in] firstIndex    Starting index buffer slot for the draw.
    /// @param [in] indexCount    Number of vertices to draw.  If zero, the draw will be discarded.
    /// @param [in] vertexOffset  Offset added to the index fetched from the index buffer before it is passed to the
    ///                           vertex shader.
    /// @param [in] firstInstance Starting instance for the draw.  Instance IDs passed to the vertex shader will range
    ///                           from firstInstance to firstInstance + instanceCount - 1.
    /// @param [in] instanceCount Number of instances to draw.  If zero, the draw will be discarded.
    /// @param [in] drawId        Draw index for the draw.
    void CmdDrawIndexed(
        uint32 firstIndex,
        uint32 indexCount,
        int32  vertexOffset,
        uint32 firstInstance,
        uint32 instanceCount,
        uint32 drawId)
    {
        m_funcTable.pfnCmdDrawIndexed(this, firstIndex, indexCount, vertexOffset, firstInstance, instanceCount, drawId);
    }

    /// Issues instanced, non-indexed draw calls using the command buffer's currently bound graphics state.  The draw
    /// arguments come from GPU memory. This command will issue count draw calls, using the provided stride to find
    /// the next indirect args structure in gpuMemory.  Each draw call will be discarded if its vertexCount or
    /// instanceCount is zero.
    ///
    /// The layout of the argument data is defined in the @ref DrawIndirectArgs structure.
    ///
    /// It is an error if the currently bound pipeline contains a mesh and/or task shader.
    ///
    /// This function requires use of the following barrier flags on the indirect memory:
    /// - PipelineStage:  @ref PipelineStageFetchIndirectArgs
    /// - CacheCoherency: @ref CoherIndirectArgs
    ///
    /// @see CmdDraw
    /// @see DrawIndirectArgs
    ///
    /// @param [in] gpuVirtAddrAndStride  GPU virtual address where the indirect argument data is located and stride in
    ///                                   memory from one structure to another.
    ///                                   The virtual address must be 4 byte aligned.
    /// @param [in] maximumCount          Maximum count of data structures to loop through.  If countGpuAddr
    ///                                   is nonzero, the value at that memory location is clamped to
    ///                                   this maximum. If countGpuAddr is zero, then the number of draws
    ///                                   issued exactly matches this number.
    /// @param [in] countGpuAddr          GPU virtual address where the number of draws is stored.
    ///                                   Must be 4-byte aligned.
    void CmdDrawIndirectMulti(
        GpuVirtAddrAndStride gpuVirtAddrAndStride,
        uint32               maximumCount,
        gpusize              countGpuAddr)
    {
        m_funcTable.pfnCmdDrawIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr);
    }

    /// Issues instanced, indexed draw calls using the command buffer's currently bound graphics state.  The draw
    /// arguments come from GPU memory. This command will issue count draw calls, using the provided stride to find
    /// the next indirect args structure in gpuMemory.  Each draw call will be discarded if its indexCount or
    /// instanceCount is zero.
    ///
    /// The layout of the argument data is defined in the @ref DrawIndexedIndirectArgs structure.
    ///
    /// It is an error if the currently bound pipeline contains a mesh and/or task shader.
    ///
    /// This function requires use of the following barrier flags on the indirect memory:
    /// - PipelineStage:  @ref PipelineStageFetchIndirectArgs
    /// - CacheCoherency: @ref CoherIndirectArgs
    ///
    /// @see CmdDrawIndexed
    /// @see DrawIndexedIndirectArgs
    ///
    /// @param [in] gpuVirtAddrAndStride  GPU virtual address where the indirect argument data is located and stride in
    ///                                   memory from one structure to another.
    ///                                   The virtual address must be 4 byte aligned.
    /// @param [in] maximumCount          Maximum count of data structures to loop through.  If countGpuAddr
    ///                                   is nonzero, the value at that memory location is clamped to
    ///                                   this maximum. If countGpuAddr is zero, then the number of draws
    ///                                   issued exactly matches this number.
    /// @param [in] countGpuAddr          GPU virtual address where the number of draws is stored.
    ///                                   Must be 4-byte aligned.
    void CmdDrawIndexedIndirectMulti(
        GpuVirtAddrAndStride gpuVirtAddrAndStride,
        uint32               maximumCount,
        gpusize              countGpuAddr)
    {
        m_funcTable.pfnCmdDrawIndexedIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr);
    }

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 909
    /// Dispatches a compute workload of the given dimensions using the command buffer's currently bound compute state.
    ///
    /// The thread group size is defined in the compute shader.
    ///
    /// Supports PAL ABI and HSA ABI pipelines.
    ///
    /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded.
    inline void CmdDispatch(
        DispatchDims size)
    {
        m_funcTable.pfnCmdDispatch(this, size, {});
    }
#endif

    /// Dispatches a compute workload of the given dimensions using the command buffer's currently bound compute state.
    ///
    /// The thread group size is defined in the compute shader.
    ///
    /// Supports PAL ABI and HSA ABI pipelines.
    ///
    /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded.
    /// @param [in] infoFlags Additional information about the dispatch.
    void CmdDispatch(
        DispatchDims      size,
        DispatchInfoFlags infoFlags)
    {
        m_funcTable.pfnCmdDispatch(this, size, infoFlags);
    }

    /// Dispatches a compute workload using the command buffer's currently bound compute state.  The dimensions of the
    /// workload come from GPU memory.  The dispatch will be discarded if any of its dimensions are zero.
    ///
    /// The layout of the argument data is defined in the @ref DispatchIndirectArgs structure.
    ///
    /// @warning Does not support HSA ABI pipelines.
    ///
    /// This function requires use of the following barrier flags on the indirect memory:
    /// - PipelineStage:  @ref PipelineStageFetchIndirectArgs
    /// - CacheCoherency: @ref CoherIndirectArgs
    ///
    /// @see CmdDispatch
    /// @see DispatchIndirectArgs
    ///
    /// @param [in] gpuVirtAddr  GPU virtual memory address where the indirect argument data is located.
    ///                          The virtual address must be 4-byte aligned.
    void CmdDispatchIndirect(
        gpusize gpuVirtAddr)
    {
        m_funcTable.pfnCmdDispatchIndirect(this, gpuVirtAddr);
    }

    /// Dispatches a compute workload of the given dimensions and offsets using the command buffer's currently bound
    /// compute state. This command allows targeting regions of thread groups without adding the offset computations in
    /// the shader.
    ///
    /// The caller may also provide a logical thread group count which is larger than the number of groups actually
    /// launched. If the shader reads the dispatch's thread group count from PAL metadata it will see the logical size,
    /// not the launch size.
    ///
    /// The combination of an offset, launch size, and logical size give the caller enough flexibility to take an
    /// incoming dispatch, split it up into sub-dispatches, and execute those sub-dispatches using multiple
    /// CmdDispatchOffset calls in whatever execution pattern they would like. Note that such an optimization
    /// would not work if the shader has global logic that does make assumptions about thread group launch order.
    ///
    /// The thread group size is defined in the compute shader.
    ///
    /// Supports PAL ABI and HSA ABI pipelines.
    ///
    /// @param [in] offset      The thread groups offsets. Set them to zero if you don't want an offset.
    /// @param [in] launchSize  Thread groups to dispatch. If any components are zero the dispatch will be discarded.
    /// @param [in] logicalSize The thread group dimensions reported to the shader via metadata.
    void CmdDispatchOffset(
        DispatchDims offset,
        DispatchDims launchSize,
        DispatchDims logicalSize)
    {
        m_funcTable.pfnCmdDispatchOffset(this, offset, launchSize, logicalSize);
    }

    /// Dispatches a mesh shader workload using the command buffer's currently bound graphics state.  It is an error if
    /// the currently bound graphics pipeline does not contain a mesh and/or task shader.
    ///
    /// The thread group size is defined in the mesh shader or task shader.
    ///
    /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded.
    void CmdDispatchMesh(
        DispatchDims size)
    {
        m_funcTable.pfnCmdDispatchMesh(this, size);
    }

    /// Dispatches a mesh shader workload using the command buffer's currently bound graphics state.  It is an error if
    /// the currently bound graphics pipeline does not contain a mesh shader.  The dimensions of the workload come from
    /// GPU memory.  The dispatch will be discarded if any of its dimensions are zero.
    ///
    /// The layout of the argument data is defined in the @ref DispatchMeshIndirectArgs structure.
    ///
    /// This function requires use of the following barrier flags on the indirect memory:
    /// - PipelineStage:  @ref PipelineStageFetchIndirectArgs
    /// - CacheCoherency: @ref CoherIndirectArgs
    ///
    /// @see CmdDispatchMesh
    /// @see DispatchMeshIndirectArgs
    ///
    /// @param [in] gpuVirtAddrAndStride  GPU virtual address where the indirect argument data is located and stride in
    ///                                   memory from one structure to another.
    ///                                   The virtual address must be 4 byte aligned.
    /// @param [in] maximumCount          Maximum count of data structures to loop through.  If countGpuAddr
    ///                                   is nonzero, the value at that memory location is clamped to
    ///                                   this maximum. If countGpuAddr is zero, then the number of draws
    ///                                   issued exactly matches this number.
    /// @param [in] countGpuAddr          GPU virtual address where the number of draws is stored.
    ///                                   Must be 4-byte aligned.
    void CmdDispatchMeshIndirectMulti(
        GpuVirtAddrAndStride gpuVirtAddrAndStride,
        uint32               maximumCount,
        gpusize              countGpuAddr)
    {
        m_funcTable.pfnCmdDispatchMeshIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr);
    }

    /// Copies multiple regions from one GPU memory allocation to another.
    ///
    /// None of the destination regions are allowed to overlap each other, nor are destination and source regions
    /// allowed to overlap when the source and destination GPU memory allocations are the same.  Any illegal overlapping
    /// will cause undefined results.
    ///
    /// This call should be used for buffer memory copy only; don't use it for image memory.
    ///
    /// For best performance, offsets and copy sizes should be 4-byte aligned.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination.
    ///
    /// @param [in] srcGpuMemory  GPU memory allocation where the source regions are located.
    /// @param [in] dstGpuMemory  GPU memory allocation where the destination regions are located.
    /// @param [in] regionCount   Number of regions to copy; size of the pRegions array.
    /// @param [in] pRegions      Array of copy regions, each entry specifying a source offset, destination offset, and
    ///                           copy size.
    virtual void CmdCopyMemory(
        const IGpuMemory&       srcGpuMemory,
        const IGpuMemory&       dstGpuMemory,
        uint32                  regionCount,
        const MemoryCopyRegion* pRegions) = 0;

    /// Copies multiple regions from one GPU memory virtual address to another.
    ///
    /// @note  The CmdCopyMemory() path should be preferred because it contains more optimizations due to more
    ///        knowledge about the memory itself that is lost when only virtual addresses are passed in.
    ///
    ///
    /// None of the destination regions are allowed to overlap each other, nor are destination and source regions
    /// allowed to overlap when the source and destination GPU memory virtual address are the same.  Any illegal
    /// overlapping will cause undefined results.
    ///
    /// For best performance, addresses, offsets, and copy sizes should be 4-byte aligned.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination.
    ///
    /// @param [in] srcGpuVirtAddr  GPU memory vitrual address where the source regions are located.
    /// @param [in] dstGpuVirtAddr  GPU memory virtual address where the destination regions are located.
    /// @param [in] regionCount     Number of regions to copy; size of the pRegions array.
    /// @param [in] pRegions        Array of copy regions, each entry specifynig a source offset, destination offset,
    ///                             and copy size.
    virtual void CmdCopyMemoryByGpuVa(
        gpusize                 srcGpuVirtAddr,
        gpusize                 dstGpuVirtAddr,
        uint32                  regionCount,
        const MemoryCopyRegion* pRegions) = 0;

    /// Copies multiple regions from one image to another.
    ///
    /// The source and destination subresource of a particular region are not allowed to be the same, and will produce
    /// undefined results.  Additionally, destination subresources cannot be present more than once per CmdCopyImage()
    /// call.
    ///
    /// For compressed images, the compression block size is used as the pixel size.  For compressed images, the image
    /// extents are specified in compression blocks.
    ///
    /// The source and destination images must to be of the same type (1D, 2D or 3D), or optionally 2D and 3D with the
    /// number of slices matching the depth.  MSAA source and destination images must have the same number of samples.
    ///
    /// Each region must satisfy these restrictions.
    /// - srcOffset >= 0 and dstOffset >= 0
    /// - srcOffset + extent <= srcSubres's extent
    /// - dstOffset + extent <= dstSubres's extent
    ///
    /// Images copied via this function must have x/y/z offsets and width/height/depth extents aligned to the minimum
    /// tiled copy alignment specified in @ref DeviceProperties for the engine this function is executed on.  Note that
    /// the DMA engine supports tiled copies regardless of the alignment; the reported minimum tiled copy alignments
    /// are an indication of the minimum alignments for which the copy will be performant.
    ///
    /// When the per-engine capability flag supportsMismatchedTileTokenCopy (@see DeviceProperties) is false,
    /// CmdCopyImage is only valid between two subresources that share the same tileToken (@see SubresLayout).
    ///
    /// Note that the copy can go through clone copy automatically if,
    ///  - Both source and destination images are created with @ref ImageCreateInfo::flags::cloneable = 1
    ///  - Both source and destination images have same @ref ImageCreateInfo
    ///  - Source image's layout is compatible with destination images' layout
    ///  - This is a full image copy
    ///  - Copy flags @ref CopyControlFlags required to be 0.
    /// Basically clone copy clones all subresources' data of one image object in another while preserving the image
    /// layout. It does raw copy on image data and metadata; and tries to keep the metadata (like DCC/HiZ/HiS)
    /// unchanged but may be not true due to different HW design.
#if PAL_BUILD_GFX12
    /// e.g. Client compression (fragment and ZPlane compression) will be missed during the compute based raw copy.
#endif
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination.
    /// - ImageLayout:    @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination.
    ///
    /// @param [in] srcImage       Image where source regions reside.
    /// @param [in] srcImageLayout Current allowed usages and engines for the source image.  These masks must include
    ///                            LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this
    ///                            function is being called on.
    /// @param [in] dstImage       Image where destination regions reside.
    /// @param [in] dstImageLayout Current allowed usages and engines for the destination image.  These masks must
    ///                            include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this
    ///                            function is being called on.
    /// @param [in] regionCount    Number of regions to copy; size of the pRegions array.
    /// @param [in] pRegions       Array of copy regions, each entry specifying a source subresource, destination
    ///                            subresource, source x/y/z offset, destination x/y/z offset, and copy size in the
    ///                            x/y/z dimensions.
    /// @param [in] pScissorRect   Rectangle for scissor test.
    /// @param [in] flags          A mask of ORed @ref CopyControlFlags that can be used to control copy behavior.
    virtual void CmdCopyImage(
        const IImage&          srcImage,
        ImageLayout            srcImageLayout,
        const IImage&          dstImage,
        ImageLayout            dstImageLayout,
        uint32                 regionCount,
        const ImageCopyRegion* pRegions,
        const Rect*            pScissorRect,
        uint32                 flags) = 0;

    /// Copies data directly (without format conversion) from a GPU memory object to an image.
    ///
    /// For compressed images, the extents are specified in compression blocks.
    ///
    /// The size of the data copied from memory is implicitly derived from the image extents.
    ///
    /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes.  A destination
    /// subresource cannot be present more than once per CmdCopyMemoryToImage() call.
    ///
    /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination.
    /// - ImageLayout:    @ref LayoutCopyDst
    ///
    /// @param [in] srcGpuMemory   GPU memory where the source data is located.
    /// @param [in] dstImage       Image where destination data will be written.
    /// @param [in] dstImageLayout Current allowed usages and engines for the destination image.  These masks must
    ///                            include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this
    ///                            function is being called on.
    /// @param [in] regionCount    Number of regions to copy; size of the pRegions array.
    /// @param [in] pRegions       Array of copy regions, each entry specifying a source offset, a destination
    ///                            subresource, destination x/y/z offset, and copy size in the x/y/z dimensions.
    virtual void CmdCopyMemoryToImage(
        const IGpuMemory&            srcGpuMemory,
        const IImage&                dstImage,
        ImageLayout                  dstImageLayout,
        uint32                       regionCount,
        const MemoryImageCopyRegion* pRegions) = 0;

    /// Copies data directly (without format conversion) from an image to a GPU memory object.
    ///
    /// For compressed images, the extents are specified in compression blocks.
    ///
    /// The size of the data copied to memory is implicitly derived from the image extents.
    ///
    /// The destination memory offset has to be aligned to the smaller of the copied texel size or 4 bytes.  A
    /// destination region cannot be present more than once per CmdCopyImageToMemory() call.
    ///
    /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination.
    /// - ImageLayout:    @ref LayoutCopySrc
    ///
    /// @param [in] srcImage       Image where source data will be read from.
    /// @param [in] srcImageLayout Current allowed usages and engines for the source image.  These masks must include
    ///                            LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this
    ///                            function is being called on.
    /// @param [in] dstGpuMemory   GPU memory where the destination data will be written.
    /// @param [in] regionCount    Number of regions to copy; size of the pRegions array.
    /// @param [in] pRegions       Array of copy regions, each entry specifying a destination offset, a source
    ///                            subresource, source x/y/z offset, and copy size in the x/y/z dimensions.
    virtual void CmdCopyImageToMemory(
        const IImage&                srcImage,
        ImageLayout                  srcImageLayout,
        const IGpuMemory&            dstGpuMemory,
        uint32                       regionCount,
        const MemoryImageCopyRegion* pRegions) = 0;

    /// Copies data directly (without format conversion) from a GPU memory object to a PRT.
    ///
    /// The image offset and extents are in units of tiles.  @see ImageMemoryLayout for the size of a tile in texels.
    /// This function always copies entire tiles, even if parts of the tile are internal padding.
    ///
    /// This function cannot be used to copy any subresources stored in the packed mip tail.  Other copy functions that
    /// operate in texels like the generic CmdCopyMemoryToImage() should be used instead.
    ///
    /// The size of the data copied from memory is implicitly derived from the image extents.
    ///
    /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes.  A destination
    /// subresource cannot be present more than once per CmdCopyMemoryToTiledImage() call.
    ///
    /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination.
    /// - ImageLayout:    @ref LayoutCopyDst
    ///
    /// @param [in] srcGpuMemory   GPU memory where the source data is located.
    /// @param [in] dstImage       Image where destination data will be written.  Must have the "prt" flag set.
    /// @param [in] dstImageLayout Current allowed usages and engines for the destination image.  These masks must
    ///                            include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this
    ///                            function is being called on.
    /// @param [in] regionCount    Number of regions to copy; size of the pRegions array.
    /// @param [in] pRegions       Array of copy regions, each entry specifying a source offset, a destination
    ///                            subresource, destination x/y/z offset, and copy size in the x/y/z dimensions.
    virtual void CmdCopyMemoryToTiledImage(
        const IGpuMemory&                 srcGpuMemory,
        const IImage&                     dstImage,
        ImageLayout                       dstImageLayout,
        uint32                            regionCount,
        const MemoryTiledImageCopyRegion* pRegions) = 0;

    /// Copies data directly (without format conversion) from a PRT to a GPU memory object.
    ///
    /// The image offset and extents are in units of tiles.  @see ImageMemoryLayout for the size of a tile in texels.
    /// This function always copies entire tiles, even if parts of the tile are internal padding.
    ///
    /// This function cannot be used to copy any subresources stored in the packed mip tail.  Other copy functions that
    /// operate in texels like the generic CmdCopyImageToMemory() should be used instead.
    ///
    /// The size of the data copied to memory is implicitly derived from the image extents.
    ///
    /// The destination memory offset has to be aligned to the smaller of the copied texel size or 4 bytes.  A
    /// destination region cannot be present more than once per CmdCopyTiledImageToMemory() call.
    ///
    /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination.
    /// - ImageLayout:    @ref LayoutCopySrc
    ///
    /// @param [in] srcImage       Image where source data will be read from.
    /// @param [in] srcImageLayout Current allowed usages and queues for the source image.  These masks must include
    ///                            LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this
    ///                            function is being called on.
    /// @param [in] dstGpuMemory   GPU memory where the destination data will be written.
    /// @param [in] regionCount    Number of regions to copy; size of the pRegions array.
    /// @param [in] pRegions       Array of copy regions, each entry specifying a destination offset, a source
    ///                            subresource, source x/y/z offset, and copy size in the x/y/z dimensions.
    virtual void CmdCopyTiledImageToMemory(
        const IImage&                     srcImage,
        ImageLayout                       srcImageLayout,
        const IGpuMemory&                 dstGpuMemory,
        uint32                            regionCount,
        const MemoryTiledImageCopyRegion* pRegions) = 0;

    /// Copies multiple regions directly (without format conversion) from one typed buffer to another.
    ///
    /// For compressed formats, the extents are specified in compression blocks.
    ///
    /// The buffer memory offsets have to be aligned to the smaller of their texel sizes or 4 bytes.
    ///
    /// None of the destination regions are allowed to overlap each other, nor are destination and source regions
    /// allowed to overlap when the source and destination GPU memory allocations are the same.  Any illegal overlapping
    /// will cause undefined results.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination.
    ///
    /// @param [in] srcGpuMemory GPU memory where the source data is located.
    /// @param [in] dstGpuMemory GPU memory where the destination data will be written.
    /// @param [in] regionCount  Number of regions to copy; size of the pRegions array.
    /// @param [in] pRegions     Array of copy regions, each entry specifying a destination offset, a source offset,
    ///                          and copy size in the x/y/z dimensions.
    virtual void CmdCopyTypedBuffer(
        const IGpuMemory&            srcGpuMemory,
        const IGpuMemory&            dstGpuMemory,
        uint32                       regionCount,
        const TypedBufferCopyRegion* pRegions) = 0;

    /// Copies data directly (without format conversion) from a 2D typed buffer to a 2D image.
    ///
    /// For compressed images, the extents are specified in compression blocks.
    ///
    /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes.  A destination
    /// subresource cannot be present more than once per CmdScaledCopyTypedBufferToImage() call.
    ///
    /// MSAA resource is unsupported. The client must resolve both resources before calling this function.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination.
    /// - ImageLayout:    @ref LayoutCopyDst
    ///
    /// @param [in] srcGpuMemory   GPU memory where the source data is located.
    /// @param [in] dstImage       Image where destination data will be written.
    /// @param [in] dstImageLayout Current allowed usages and engines for the destination image.  These masks must
    ///                            include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this
    ///                            function is being called on.
    /// @param [in] regionCount    Number of regions to copy; size of the pRegions array.
    /// @param [in] pRegions       Array of copy regions, each entry specifying a source offset, copy size of source
    ///                            region, a destination offset, destination subresource, and copy size of destination
    ///                            region.
    virtual void CmdScaledCopyTypedBufferToImage(
        const IGpuMemory&                       srcGpuMemory,
        const IImage&                           dstImage,
        ImageLayout                             dstImageLayout,
        uint32                                  regionCount,
        const TypedBufferImageScaledCopyRegion* pRegions) = 0;

    /// Copies a GPU register content to a GPU memory location.
    ///
    /// The destination memory offset has to be aligned to 4 bytes.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStagePostPrefetch
    /// - CacheCoherency: @ref CoherMemory
    ///
    /// @param [in] srcRegisterOffset Source register offset in bytes
    /// @param [in] dstGpuMemory      GPU memory where the destination data will be written.
    /// @param [in] dstOffset         Destination memory offset in bytes.
    virtual void CmdCopyRegisterToMemory(
        uint32            srcRegisterOffset,
        const IGpuMemory& dstGpuMemory,
        gpusize           dstOffset) = 0;

    /// Copies multiple scaled regions from one image to another.
    ///
    /// The source and destination subresource of a particular region are not allowed to be the same, and will produce
    /// undefined results.  Additionally, destination subresources cannot be present more than once per
    /// CmdScaledCopyImage() call.
    ///
    /// For compressed images, the compression block size is used as the pixel size.  For compressed images, the image
    /// extents are specified in compression blocks.
    ///
    /// The source and destination images must to be of the same type (1D, 2D or 3D). Both single sampled images and
    /// MSAA images are supported.
    ///
    /// Linear texture filtering is only supported for images with non-integer formats.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination.
    /// - ImageLayout:    @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination.
    ///
    /// @param [in] copyInfo       Specifies parameters needed to execute CmdScaledCopyImage. See
    ///                            @ref ScaledCopyInfo for more information.
    virtual void CmdScaledCopyImage(
        const ScaledCopyInfo& copyInfo) = 0;

    /// Automatically generates texture data for a range of subresources such that they may be used as intermediate
    /// images in a mipmap chain. The existing values in mip N are used to generate mip N+1.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the base mip, @ref CoherCopySrc and @ref CoherCopyDst for the others.
    /// - ImageLayout:    @ref LayoutCopySrc for the base mip, @ref LayoutCopySrc and @ref LayoutCopyDst for the others.
    ///
    /// @param [in] genInfo The parameters for CmdGenerateMipmaps. See @ref GenMipmapsInfo for more information.
    virtual void CmdGenerateMipmaps(
        const GenMipmapsInfo& genInfo) = 0;

    /// Copies multiple scaled regions from one image to another, converting between RGB and YUV color spaces during
    /// the copy.  The exact conversion between YUV and RGB is controlled by a caller-specified color-space-conversion
    /// table.
    ///
    /// The source and destination images must both be of the 2D type.  Only single-sampled images are supported.
    /// One of the two images involved must have an RGB color format, and the other must have a YUV color format.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination.
    /// - ImageLayout:    @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination.
    ///
    /// @param [in] srcImage       Images where source region reside.  If this is a YUV image, the destination must be
    ///                            RGB, and this copy will convert YUV to RGB.  Otherwise, the destination must be YUV,
    ///                            and the copy will convert RGB to YUV.
    /// @param [in] srcImageLayout Current allowed usages and engines for the source image.  These masks must include
    ///                            LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this
    ///                            function is being called on.
    /// @param [in] dstImage       Image where destination regions reside.  If this is a YUV image, the source must be
    ///                            RGB, and this copy will convert RGB to YUV.  Otherwise, the source must be YUV and
    ///                            the copy will convert YUV to RGB.
    /// @param [in] dstImageLayout Current allowed usages and engines for the destination image.  These masks must
    ///                            include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this
    ///                            function is being called on.
    /// @param [in] regionCount    Number of regions to copy; size of the pRegions array.
    /// @param [in] pRegions       Array of conversion-copy regions, each entry specifying a source x/y/z offset, source
    ///                            x/y/z extent, destination x/y/z offset, destination x/y/z extent, RGB subresource and
    ///                            YUV subresource(s).
    /// @param [in] filter         Texture filtering for shader sample instruction.
    /// @param [in] cscTable       Color-space-conversion table which controls how YUV data is converted to a specific
    ///                            RGB representation and vice-versa.
    virtual void CmdColorSpaceConversionCopy(
        const IImage&                     srcImage,
        ImageLayout                       srcImageLayout,
        const IImage&                     dstImage,
        ImageLayout                       dstImageLayout,
        uint32                            regionCount,
        const ColorSpaceConversionRegion* pRegions,
        TexFilter                         filter,
        const ColorSpaceConversionTable&  cscTable) = 0;

    /// Clones data of one image object in another while preserving the image layout.
    ///
    /// The source and destination images must be created with identical creation parameters and must specify the
    /// cloneable flag. The clone operation clones all subresources.
    ///
    /// Both resources can be in any layout before the clone operation.  After the clone, the source image state is left
    /// intact and the destination image layout becomes the same as the source.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination.
    ///
    /// @param [in] srcImage Source image.
    /// @param [in] dstImage Destination image.
    virtual void CmdCloneImageData(
        const IImage& srcImage,
        const IImage& dstImage) = 0;

    /// Directly updates a range of GPU memory with a small amount of host data.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopyDst
    ///
    /// The client is responsible for choosing the proper method for optimal performance. If updating data size is less
    /// equal than 8 bytes, CmdWriteImmediate() is preferred.
    ///
    /// @param [in] dstGpuMemory  GPU memory object to be updated.
    /// @param [in] dstOffset     Byte offset into the GPU memory object to be udpated.  Must be a multiple of 4.
    /// @param [in] dataSize      Amount of data to write, in bytes.  Must be a multiple of 4.
    /// @param [in] pData         Pointer to host data to be copied into the GPU memory.
    virtual void CmdUpdateMemory(
        const IGpuMemory& dstGpuMemory,
        gpusize           dstOffset,
        gpusize           dataSize,
        const uint32*     pData) = 0;

    /// Updates marker surface with a DWORD value to indicate an event completion.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStagePostPrefetch
    /// - CacheCoherency: @ref CoherCp
    ///
    /// @param [in] dstGpuMemory  GPU memory object to be updated.
    /// @param [in] offset        Byte offset into marker address
    /// @param [in] value         Marker DWORD value to be copied to the bus addressable or external physical memory.
    virtual void CmdUpdateBusAddressableMemoryMarker(
        const IGpuMemory& dstGpuMemory,
        gpusize           offset,
        uint32            value) = 0;

    /// Fills a range of GPU memory with the provided 32-bit data.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopyDst
    ///
    /// @param [in] dstGpuMemory  GPU memory object to be filled.
    /// @param [in] dstOffset     Byte offset into the GPU memory object to be filled.  Must be a multiple of 4.
    /// @param [in] fillSize      Size to fill, in bytes.  Must be a multiple of 4.
    /// @param [in] data          32-bit value to be repeated in the filled range.
    virtual void CmdFillMemory(
        const IGpuMemory& dstGpuMemory,
        gpusize           dstOffset,
        gpusize           fillSize,
        uint32            data) = 0;

    /// Interprets a range of GPU memory as a color buffer and clears it to the specified clear color.
    ///
    /// The maximum clear range is determined by the buffer offset and buffer extent; if any Ranges are specified they
    /// must be specified in texels with respect to the beginning of the buffer and must not exceed its extent.
    /// With 96-bit formats, bufferOffset must be specified in bytes.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageCs is expected but the more general @ref PipelineStageBlt is also OK.
    /// - CacheCoherency: @ref CoherShader is expected but the more general @ref CoherClear is also OK.
    ///
    /// @param [in] gpuMemory     GPU memory to be cleared.
    /// @param [in] color         Specifies the clear color data and how to interpret it.
    /// @param [in] bufferFormat  The format of the color data in the buffer.
    /// @param [in] bufferOffset  The offset to the beginning of the buffer, in units of texels
    ///                           (or bytes for 96-bit texels).
    /// @param [in] bufferExtent  The extent of the buffer, in units of texels.
    /// @param [in] rangeCount    Number of ranges within the buffer to clear; size of the pRanges array.
    ///                           If zero, the entire view will be cleared and pRanges will be ignored.
    /// @param [in] pRanges       Array of ranges within the GPU memory to clear.
    virtual void CmdClearColorBuffer(
        const IGpuMemory& gpuMemory,
        const ClearColor& color,
        SwizzledFormat    bufferFormat,
        uint32            bufferOffset,
        uint32            bufferExtent,
        uint32            rangeCount = 0,
        const Range*      pRanges    = nullptr) = 0;

    /// Clears the currently bound color targets to the specified clear color.
    ///
    /// This will always result in a slow clear and should only be used when the actual image being cleared is unknown.
    /// In practice, this is the case when vkCmdClearColorAttachments() is called in a secondary command buffer in
    /// Vulkan where the color attachments are inherited.
    ///
    /// This requires regionCount being specified since resource size is for sure to be known. The bound color targets
    /// shouldn't have UndefinedSwizzledFormat as their swizzle format.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageColorTarget
    /// - CacheCoherency: @ref CoherColorTarget
    /// - ImageLayout:    @ref LayoutColorTarget
    ///
    /// @param [in] colorTargetCount      Number of bound color target that needs to be cleared.
    /// @param [in] pBoundColorTargets    Color target information for the bound color targets.
    /// @param [in] regionCount           Number of volumes within the image to clear; size of the pClearRegions array.
    ///                                   This need to be non-zero.
    /// @param [in] pClearRegions         Array of volumes within the subresources to clear.
    virtual void CmdClearBoundColorTargets(
        uint32                          colorTargetCount,
        const BoundColorTarget*         pBoundColorTargets,
        uint32                          regionCount,
        const ClearBoundTargetRegion*   pClearRegions) = 0;

    /// Clears a color image to the specified clear color.
    ///
    /// If any Boxes have been specified, all subresource ranges must contain a single, identical mip level.
    ///
    /// The imageLayout can include any valid layout (e.g. not @ref LayoutUninitializedTarget) but it is wise to stick
    /// to layouts that are likely to support compression like @ref LayoutColorTarget.
    ///
    /// This function requires use of the following barrier flags if @ref flags includes @ref ColorClearAutoSync:
    /// - PipelineStage:  @ref PipelineStageColorTarget
    /// - CacheCoherency: @ref CoherColorTarget
    /// - ImageLayout:    @ref LayoutColorTarget
    /// Otherwise the following barrier flags must be used:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherClear
    ///
    /// @param [in] image       Image to be cleared.
    /// @param [in] imageLayout Current allowed usages and engines for the target image.
    /// @param [in] color       Specifies the clear color data and how to interpret it.
    /// @param [in] clearFormat If clearFormat.format is Undefined (e.g. if UndefinedSwizzledFormat is provided), do not
    ///                         reinterpret the subresources' formats. Otherwise, the subresources' formats will be
    ///                         reinterpreted according to this parameter. The specified format needs to have been
    ///                         included in the "pViewFormats" list specified at image-creation time, otherwise
    ///                         corruption may occur.
    /// @param [in] rangeCount  Number of subresource ranges to clear; size of the pRanges array.
    /// @param [in] pRanges     Array of subresource ranges to clear.
    /// @param [in] boxCount    Number of volumes within the image to clear; size of the pBoxes array.
    ///                         If zero, entire subresources will be cleared and pBoxes will be ignored.
    /// @param [in] pBoxes      Array of volumes within the subresources to clear.
    /// @param [in] flags       Mask of ClearColorImageFlags values controlling behavior of the clear.
    virtual void CmdClearColorImage(
        const IImage&         image,
        ImageLayout           imageLayout,
        const ClearColor&     color,
        const SwizzledFormat& clearFormat,
        uint32                rangeCount,
        const SubresRange*    pRanges,
        uint32                boxCount,
        const Box*            pBoxes,
        uint32                flags) = 0;

    /// Clears the currently bound depth/stencil targets to the specified clear values.
    ///
    /// This will always result in a slow clear and should only be used when the actual image being cleared is unknown.
    /// In practice, this is the case when vkCmdClearColorAttachments() is called in a secondary command buffer in
    /// Vulkan where the color attachments are inherited.
    ///
    /// This requires regionCount being specified since resource size is for sure to be known.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget
    /// - CacheCoherency: @ref CoherDepthStencilTarget
    /// - ImageLayout:    @ref LayoutDepthStencilTarget
    ///
    /// @param [in] depth            Depth clear value.
    /// @param [in] stencil          Stencil clear value.
    /// @param [in] stencilWriteMask Stencil write mask to clear specific stencil planes.
    /// @param [in] samples          Sample count.
    /// @param [in] fragments        Fragment count.
    /// @param [in] flag             Select to depth, stencil or depth and stencil.
    /// @param [in] regionCount      Number of volumes within the bound depth/stencil target to clear.
    /// @param [in] pClearRegions    Array of volumes within the subresources to clear.
    virtual void CmdClearBoundDepthStencilTargets(
        float                         depth,
        uint8                         stencil,
        uint8                         stencilWriteMask,
        uint32                        samples,
        uint32                        fragments,
        DepthStencilSelectFlags       flag,
        uint32                        regionCount,
        const ClearBoundTargetRegion* pClearRegions) = 0;

    /// Clears a depth/stencil image to the specified clear values.
    ///
    /// If any Rects have been specified, all subresource ranges must contain a single, identical mip level.
    ///
    /// The layouts can include any valid layout (e.g. not @ref LayoutUninitializedTarget) but it is wise to stick to
    /// layouts that are likely to support compression like @ref LayoutDepthStencilTarget.
    ///
    /// This function requires use of the following barrier flags if @ref flags includes @ref DsClearAutoSync:
    /// - PipelineStage:  @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget
    /// - CacheCoherency: @ref CoherDepthStencilTarget
    /// - ImageLayout:    @ref LayoutDepthStencilTarget
    /// Otherwise the following barrier flags must be used:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherClear
    ///
    /// @param [in] image            Image to be cleared.
    /// @param [in] depth            Depth clear value.
    /// @param [in] depthLayout      Current allowed usages and engines for the depth plane.
    /// @param [in] stencil          Stencil clear value.
    /// @param [in] stencilWriteMask Write-mask to apply to the stencil subresource ranges during the clear.
    /// @param [in] stencilLayout    Current allowed usages and engines for the stencil plane.
    /// @param [in] rangeCount       Number of subresource ranges to clear; size of the pRanges array.
    /// @param [in] pRanges          Array of subresource ranges to clear.
    /// @param [in] rectCount        Number of areas within the image to clear; size of the pRects array.  If zero,
    ///                              the entire subresources will be cleared and pRects will be ignored.
    /// @param [in] pRects           Array of areas within the subresources to clear.
    /// @param [in] flags            Mask of ClearDepthStencilFlags values controlling behavior of the clear.
    virtual void CmdClearDepthStencil(
        const IImage&      image,
        ImageLayout        depthLayout,
        ImageLayout        stencilLayout,
        float              depth,
        uint8              stencil,
        uint8              stencilWriteMask,
        uint32             rangeCount,
        const SubresRange* pRanges,
        uint32             rectCount,
        const Rect*        pRects,
        uint32             flags) = 0;

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 910
    /// Clears a range of GPU memory to the specified clear color using the specified buffer view SRD.
    ///
    /// The maximum clear range is determined by the view; if any Ranges are specified they must fit within the view's
    /// range. The view must support shader writes.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageCs
    /// - CacheCoherency: @ref CoherShader
    ///
    /// @note You may use the more general @ref PipelineStageBlt and @ref CoherClear if you wish but they may result in
    ///       higher barrier overhead.
    ///
    /// @param [in] gpuMemory      GPU memory to be cleared.
    /// @param [in] color          Specifies the clear color data and how to interpret it.
    /// @param [in] pBufferViewSrd The image view SRD that will be used to interpret the image.
    /// @param [in] rangeCount     Number of ranges within the GPU memory to clear; size of the pRanges array.
    ///                            If zero, the entire view will be cleared and pRanges will be ignored.
    /// @param [in] pRanges        Array of ranges within the GPU memory to clear.
    virtual void CmdClearBufferView(
        const IGpuMemory& gpuMemory,
        const ClearColor& color,
        const void*       pBufferViewSrd,
        uint32            rangeCount = 0,
        const Range*      pRanges    = nullptr) = 0;

    /// Clears an image to the specified clear color using the specified image view SRD.
    ///
    /// The clear subresouce range is determined by the view; if any Rects have been specified, the image view must
    /// contain a single mip level. The view must support shader writes.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageCs but the more general @ref PipelineStageBlt is also OK.
    /// - CacheCoherency: @ref CoherShader but the more general @ref CoherClear is also OK.
    /// - ImageLayout:    @ref LayoutShaderWrite
    ///
    /// @param [in] image         Image to be cleared.
    /// @param [in] imageLayout   Current allowed usages and engines for the image, must include LayoutShaderWrite.
    /// @param [in] color         Specifies the clear color data and how to interpret it.
    /// @param [in] pImageViewSrd The image view SRD that will be used to interpret the image.
    /// @param [in] rectCount     Number of volumes within the image to clear; size of the pRects array.
    ///                           If zero, entire subresources will be cleared and pRects will be ignored.
    /// @param [in] pRects        Array of volumes within the subresources to clear. The begin and end slices to be
    ///                           cleard are from SubresRange in pImageViewSrd.
    virtual void CmdClearImageView(
        const IImage&     image,
        ImageLayout       imageLayout,
        const ClearColor& color,
        const void*       pImageViewSrd,
        uint32            rectCount = 0,
        const Rect*       pRects    = nullptr) = 0;
#endif

    /// Resolves multiple regions of a multisampled image to a single-sampled image.
    ///
    /// The source image must be a 2D multisampled image and the destination must be a single-sampled image.
    /// The formats of the source and destination images must match unless all regions specify a valid format.
    ///
    /// For color images, if the source image has an integer numeric format, a single sample is copied (sample 0).
    ///
    /// For depth/stencil images, the resolve is performed by simply copying sample 0 from every source pixel to the
    /// destination pixel.
    ///
    /// The same subresource may not appear more than once in the specified array of regions.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherResolveSrc for the source and @ref CoherResolveDst for the destination.
    /// - ImageLayout:    @ref LayoutResolveSrc for the source and @ref LayoutResolveDst for the destination.
    ///
    /// @param [in] srcImage       MSAA source image.
    /// @param [in] srcImageLayout Current allowed usages and engines for the source image.  These masks must include
    ///                            LayoutResolveSrc and the ImageLayoutEngineFlags corresponding to the engine this
    ///                            function is being called on.
    /// @param [in] dstImage       Single-sample destination image.
    /// @param [in] dstImageLayout Current allowed usages and engines for the destination image.  These masks must
    ///                            include LayoutResolveDst and the ImageLayoutEngineFlags corresponding to the engine
    ///                            this function is being called on.
    /// @param [in] regionCount    Number of regions to resolve; size of the pRegions array.
    /// @param [in] resolveMode    Resolve mode
    /// @param [in] pRegions       Specifies src/dst subresources and rectangles.
    /// @param [in] flags          Mask of ResolveImageFlags values controlling behavior of the resolve.
    virtual void CmdResolveImage(
        const IImage&             srcImage,
        ImageLayout               srcImageLayout,
        const IImage&             dstImage,
        ImageLayout               dstImageLayout,
        ResolveMode               resolveMode,
        uint32                    regionCount,
        const ImageResolveRegion* pRegions,
        uint32                    flags) = 0;

    virtual void CmdResolvePrtPlusImage(
        const IImage&                    srcImage,
        ImageLayout                      srcImageLayout,
        const IImage&                    dstImage,
        ImageLayout                      dstImageLayout,
        PrtPlusResolveType               resolveType,
        uint32                           regionCount,
        const PrtPlusImageResolveRegion* pRegions) = 0;

    /// Puts the specified event into the _set_ state when all prior GPU work has progressed past the given stages.
    ///
    /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs.
    ///
    /// @param [in] gpuEvent  GPU event to be set.
    /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that
    ///                       restricts which stages of prior GPU work must happen before the event is set. The set
    ///                       will be performed at the earliest possible stage after the prior stages.
    virtual void CmdSetEvent(
        const IGpuEvent& gpuEvent,
        uint32           stageMask) = 0;

    /// Puts the specified event into the _reset_ state when all prior GPU work has progressed past the given stages.
    ///
    /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs.
    ///
    /// @param [in] gpuEvent  GPU event to be reset.
    /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that
    ///                       restricts which stages of prior GPU work must happen before the event is reset. The
    ///                       reset will be performed at the earliest possible stage after the prior stages.
    virtual void CmdResetEvent(
        const IGpuEvent& gpuEvent,
        uint32           stageMask) = 0;

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900
    /// Puts the specified GPU event into the _set_ state when all previous GPU work reaches the specified point in the
    /// pipeline.
    ///
    /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API.
    ///
    /// @param [in] gpuEvent GPU event to be set.
    /// @param [in] setPoint Point in the graphics pipeline where the GPU event will be _set_, indicating all prior
    ///                      issued GPU work has reached at least this point in the pipeline.  If the GPU doesn't
    ///                      support this operation at the exact specified point, the set will be performed at the
    ///                      earliest possible point _after_ the specified point.
    inline void CmdSetEvent(
        const IGpuEvent& gpuEvent,
        HwPipePoint      setPoint)
        { CmdSetEvent(gpuEvent, HwPipePointToStage[setPoint]); }

    /// Puts the specified GPU event into the _reset_ state when all previous GPU work reaches the specified point in
    /// the pipeline.
    ///
    /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API.
    ///
    /// @param [in] gpuEvent   GPU event to be reset.
    /// @param [in] resetPoint Point in the graphics pipeline where the GPU event will be _reset_, indicating all prior
    ///                        issued GPU work has reached at least this point in the pipeline.  If the GPU doesn't
    ///                        support this operation at the exact specified point, the reset will be performed at the
    ///                        earliest possible point _after_ the specified point.
    inline void CmdResetEvent(
        const IGpuEvent& gpuEvent,
        HwPipePoint      resetPoint)
        { CmdResetEvent(gpuEvent, HwPipePointToStage[resetPoint]); }
#endif

    /// Predicate the subsequent jobs in the command buffer if the event is set.
    ///
    /// @param [in] gpuEvent   GPU event to be checked.
    virtual void CmdPredicateEvent(
        const IGpuEvent&  gpuEvent) = 0;

    /// Performs the specified 32- or 64-bit memory operation.  These operations are atomic with respect to shader
    /// atomic operations.
    ///
    /// The data size (32-bit or 64-bit) is determined by the operation type.  For 32-bit atomics, only the lower
    /// 32-bits of srcData is used.
    ///
    /// The destination GPU memory offset must be 4-byte aligned for 32-bit atomics and 8-byte aligned for 64-bit
    /// atomics.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  @ref PipelineStagePostPrefetch
    /// - CacheCoherency: @ref CoherQueueAtomic
    ///
    /// @param [in] dstGpuMemory  Destination GPU memory object.
    /// @param [in] dstOffset     Offset into the memory object where the atomic will be performed.
    /// @param [in] srcData       Source data for the atomic operation.  Use depends on the atomicOp.
    /// @param [in] atomicOp      Specifies which atomic operation to perform.  @see AtomicOp.
    virtual void CmdMemoryAtomic(
        const IGpuMemory& dstGpuMemory,
        gpusize           dstOffset,
        uint64            srcData,
        AtomicOp          atomicOp) = 0;

    /// Starts a query operation for the given slot of a query pool.
    ///
    /// The query slot must have been previously cleared with CmdResetQueryPool() before starting a query.
    ///
    /// @note Queries may not span multiple command buffers.
    ///
    /// @param [in] queryPool  Query pool for this query.
    /// @param [in] queryType  The type of query this operation will produce.
    /// @param [in] slot       Slot in pQueryPool where the results of this query should be accumulated.
    /// @param [in] flags      Flags controlling query behavior.  @see QueryControlFlags.
    virtual void CmdBeginQuery(
        const IQueryPool& queryPool,
        QueryType         queryType,
        uint32            slot,
        QueryControlFlags flags) = 0;

    /// Stops a query operation for the given slot of a query pool.
    ///
    /// The query slot must have an open query on it when this is called.
    ///
    /// @param [in] queryPool  Query pool for this query.
    /// @param [in] queryType  The type of query this operation will produce.
    /// @param [in] slot       Slot in pQueryPool where the query is running.
    virtual void CmdEndQuery(
        const IQueryPool& queryPool,
        QueryType         queryType,
        uint32            slot) = 0;

    /// Resolves the results of a range of queries to the specified query type into the specified GPU memory location.
    ///
    /// This function requires use of the following barrier flags on @ref dstGpuMemory:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopyDst
    ///
    /// This operation does not honor the command buffer's predication state, if active.
    ///
    /// @param [in] queryPool     Query pool holding the source queries.
    /// @param [in] flags         Flags that control the result data layout and how the results are retrieved.
    /// @param [in] queryType     The type of queries this resolve will produce.
    /// @param [in] startQuery    First slot in pQueryPool to resolve.
    /// @param [in] queryCount    Number of query pool slots to resolve.
    /// @param [in] dstGpuMemory  Destination GPU memory object.
    /// @param [in] dstOffset     4-byte aligned offset into pDstGpuMemory where the results should be written.
    /// @param [in] dstStride     4-byte aligned stride between where results are written into pDstGpuMemory.
    virtual void CmdResolveQuery(
        const IQueryPool& queryPool,
        QueryResultFlags  flags,
        QueryType         queryType,
        uint32            startQuery,
        uint32            queryCount,
        const IGpuMemory& dstGpuMemory,
        gpusize           dstOffset,
        gpusize           dstStride) = 0;

    /// Rests a range of slots in a query pool.  A query slot must be reset each time before a query can be started
    /// using that slot.
    ///
    /// @param [in] queryPool  Query pool to be reset.
    /// @param [in] startQuery First slot in pQueryPool to be reset.
    /// @param [in] queryCount Number of slots to reset.
    virtual void CmdResetQueryPool(
        const IQueryPool& queryPool,
        uint32            startQuery,
        uint32            queryCount) = 0;

    /// Writes a GPU performance timestamp to memory when all prior GPU work has progressed past the given stages.
    ///
    /// The timestamp data is a 64-bit value that increments once per clock. @ref timestampFrequency in DeviceProperties
    /// reports the frequency the timestamps are clocked at. Timestamps are only supported by engines that report
    /// @ref supportsTimestamps in DeviceProperties.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  the same flag(s) specified in @ref stageMask.
    /// - CacheCoherency: @ref CoherTimestamp
    ///
    /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs.
    ///
    /// @param [in] stageMask    A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that
    ///                          restricts which stages of prior GPU work must happen before the timestamp is written.
    ///                          The timestamp will be performed at the earliest possible stage after the prior stages.
    ///                          Note that the SDMA engine only supports bottom-of-pipe timestamps.
    /// @param [in] dstGpuMemory GPU memory object where timestamp should be written.
    /// @param [in] dstOffset    Offset into pDstGpuMemory where the timestamp should be written.  Must be aligned to
    ///                          minTimestampAlignment in DeviceProperties.
    virtual void CmdWriteTimestamp(
        uint32            stageMask,
        const IGpuMemory& dstGpuMemory,
        gpusize           dstOffset) = 0;

    /// Writes an immediate value to memory when all prior GPU work has progressed past the given stages.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  the same flag(s) specified in @ref stageMask.
    /// - CacheCoherency: @ref CoherCp
    ///
    /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs.
    ///
    /// @param [in] stageMask          A bitmask of @ref PipelineStageFlag values which defines a synchronization scope
    ///                                that restricts which stages of prior GPU work must happen before the immediate
    ///                                value is written. The write will be occur at the earliest possible stage after
    ///                                the prior stages. Note that the SDMA engine only supports bottom-of-pipe writes.
    /// @param [in] data               Value to be written to gpu address.
    /// @param [in] ImmediateDataWidth Size of the data to be written out.
    /// @param [in] address            GPU address where immediate value should be written.
    virtual void CmdWriteImmediate(
        uint32             stageMask,
        uint64             data,
        ImmediateDataWidth dataSize,
        gpusize            address) = 0;

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900
    /// Writes a HwPipePostPrefetch or HwPipeBottom timestamp to the specified memory location.
    ///
    /// The timestamp data is a 64-bit value that increments once per clock.  timestampFrequency in DeviceProperties
    /// reports the frequency the timestamps are clocked at.
    ///
    /// Timestamps are only supported by engines that report supportsTimestamps in DeviceProperties.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  the same flag specified in @ref pipePoint.
    /// - CacheCoherency: @ref CoherTimestamp
    ///
    /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API.
    ///
    /// @param [in] pipePoint    Specifies where in the pipeline the timestamp should be sampled and written. The only
    ///                          valid choices are HwPipePostPrefetch and HwPipeBottom. HwPipePostPrefetch timestamps
    ///                          are not supported on the SDMA engine, so all timestamps will be executed as
    ///                          bottom-of-pipe.
    /// @param [in] dstGpuMemory GPU memory object where timestamp should be written.
    /// @param [in] dstOffset    Offset into pDstGpuMemory where the timestamp should be written.  Must be aligned to
    ///                          minTimestampAlignment in DeviceProperties.
    inline void CmdWriteTimestamp(
        HwPipePoint       pipePoint,
        const IGpuMemory& dstGpuMemory,
        gpusize           dstOffset)
        { CmdWriteTimestamp(HwPipePointToStage[pipePoint], dstGpuMemory, dstOffset); }

    /// Writes a top-of-pipe or bottom-of-pipe immediate value to the specified memory location.
    ///
    /// This function requires use of the following barrier flags:
    /// - PipelineStage:  the same flag specified in @ref pipePoint.
    /// - CacheCoherency: @ref CoherCp
    ///
    /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API.
    ///
    /// @param [in] pipePoint          Specifies where in the pipeline the timestamp should be sampled and written.
    ///                                The only valid choices are HwPipeTop, HwPipePostPrefetch and HwPipeBottom.
    ///                                Top-of-pipe timestamps are not supported on the SDMA engine, so all timestamps
    ///                                will be executed as bottom-of-pipe.
    /// @param [in] data               Value to be written to gpu address.
    /// @param [in] ImmediateDataWidth Size of the data to be written out.
    /// @param [in] address            GPU address where immediate value should be written.
    inline void CmdWriteImmediate(
        HwPipePoint        pipePoint,
        uint64             data,
        ImmediateDataWidth dataSize,
        gpusize            address)
        { CmdWriteImmediate(HwPipePointToStage[pipePoint], data, dataSize, address); }
#endif

    /// Loads the current stream-out buffer-filled-sizes stored on the GPU from memory, typically from a target of a
    /// prior CmdSaveBufferFilledSizes() call.
    ///
    /// Note that barriers shouldn't be necessary in normal stream-out workflows. However, if the client wishes to use
    /// the @ref gpuVirtAddr allocations in shaders or PAL blts they must use the following barrier flags:
    /// - PipelineStage:  @ref PipelineStagePostPrefetch
    /// - CacheCoherency: @ref CoherCp
    ///
    /// @param [in] gpuVirtAddr Array of GPU virtual addresses to load each counter from.  If any of these are zero,
    ///                         the corresponding filled-size counter is not loaded.
    virtual void CmdLoadBufferFilledSizes(
        const gpusize (&gpuVirtAddr)[MaxStreamOutTargets]) = 0;

    /// Saves the current stream-out buffer-filled-sizes into GPU memory.
    ///
    /// Note that barriers shouldn't be necessary in normal stream-out workflows. However, if the client wishes to use
    /// the @ref gpuVirtAddr allocations in shaders or PAL blts they must use the following barrier flags:
    /// - PipelineStage:  @ref PipelineStagePostPrefetch
    /// - CacheCoherency: @ref CoherCp
    ///
    /// @param [in] gpuVirtAddr Array of GPU virtual addresses to save each counter into.  If any of these are zero,
    ///                         the corresponding filled-size counter is not saved.
    virtual void CmdSaveBufferFilledSizes(
        const gpusize (&gpuVirtAddr)[MaxStreamOutTargets]) = 0;

    /// Set the offset to buffer-filled-size for a stream-out target.
    ///
    /// @param [in] bufferId   Stream-out buffer ID, it could be in the range [0, MaxStreamOutTargets).
    /// @param [in] offset     The value to be written into the buffer filled size counter.
    ///
    virtual void CmdSetBufferFilledSize(
        uint32  bufferId,
        uint32  offset) = 0;

    /// Binds the specified border color palette for use by samplers.
    ///
    /// @param [in] pipelineBindPoint Specifies which pipeline type is affected (i.e., graphics or compute).
    /// @param [in] pPalette          Border color palette object to bind.
    virtual void CmdBindBorderColorPalette(
        PipelineBindPoint          pipelineBindPoint,
        const IBorderColorPalette* pPalette) = 0;

    /// Sets predication for this command buffer to use the specified GPU memory location. Any draw, dispatch or copy
    /// operation between this command and the corresponding reset/disable call will be skipped if the value in spec-
    /// ified location matches the passed-in predicated value
    ///
    /// This function requires use of the following barrier flags on @ref pGpuMemory:
    /// - PipelineStage:  @ref PipelineStageFetchIndirectArgs
    /// - CacheCoherency: @ref CoherIndirectArgs
    ///
    /// @param [in] pQueryPool     pointer to QueryPool obj, not-nullptr means this is a QueryPool based predication
    ///                                - Zpass/Occlusion based predication
    ///                                - or PrimCount/Streamout based predication
    /// @param [in] slot           Slot to use for setting occlusion predication, valid when pQueryPool is not nullptr
    /// @param [in] pGpuMemory     GPU memory object for the predication value, only valid when pQueryPool is nullptr
    /// @param [in] offset         GPU memory offset for the predication value
    /// @param [in] predType       Predication type.
    /// @param [in] predPolarity   Controls the polarity of the predication test
    ///                                true  = draw_if_visible_or_no_overflow
    ///                                false = draw_if_not_visible_or_overflow
    /// @param [in] waitResults    Hint only valid for Zpass/Occlusion.
    ///                                false = wait_until_final_zpass_written
    ///                                true  = draw_if_not_final_zpass_written
    /// @param [in] accumulateData true(1) = allow_accumulation of Zpass and PrimCount across command buffer boundaries.
    ///
    /// pQueryPool and gpuVirtAddr should be exclusively set, when both are nullptr/0, other params will be ignored
    /// and it means to reset/disable predication so that the following commands can perform normally.
    virtual void CmdSetPredication(
        IQueryPool*         pQueryPool,
        uint32              slot,
        const IGpuMemory*   pGpuMemory,
        gpusize             offset,
        PredicateType       predType,
        bool                predPolarity,
        bool                waitResults,
        bool                accumulateData) = 0;

    /// Suspend/resume any active predication for this command buffer
    ///
    /// @param [in] suspend     Controls if predication should be paused
    ///                             true  = suspend active predication
    ///                             false = resume active predication
    ///
    /// Any suspended predication must be resumed prior to disabling predication using CmdSetPredication with pQueryPool
    /// and gpuVirtAddr with nullptr/0. This is only valid on universal and compute command buffers.
    virtual void CmdSuspendPredication(
        bool suspend) = 0;

    /// Begins a conditional block in the current command buffer. All commands between this and the corresponding
    /// CmdEndIf() (or CmdElse() if it is present) command are executed if the specified condition is true.
    ///
    /// This function requires use of the following barrier flags on @ref gpuMemory:
    /// - PipelineStage:  @ref PipelineStageFetchIndirectArgs
    /// - CacheCoherency: @ref CoherIndirectArgs
    ///
    /// @param [in] gpuMemory    GPU memory object containing the memory location to be tested.
    /// @param [in] offset       Offset within the memory object where the tested memory location begins.
    /// @param [in] data         Source data to compare against the value in GPU memory.
    /// @param [in] mask         Mask to apply to the GPU memory (via bitwise AND) prior to comparison.
    /// @param [in] compareFunc  Function controlling how the data operands are compared.
    virtual void CmdIf(
        const IGpuMemory& gpuMemory,
        gpusize           offset,
        uint64            data,
        uint64            mask,
        CompareFunc       compareFunc) = 0;

    /// Begins a conditional block in the current command buffer. All commands between this and the corresponding
    /// CmdEndIf() command are executed if the condition specified in the innermost active conditional block are false.
    virtual void CmdElse() = 0;

    /// Ends the innermost active conditional block in the current command buffer.
    virtual void CmdEndIf() = 0;

    /// Begins a while loop in the current command buffer. All commands between this and the corresponding CmdEndWhile()
    /// command are executed repeatedly as long as the specified condition remains true.
    ///
    /// This function requires use of the following barrier flags on @ref gpuMemory:
    /// - PipelineStage:  @ref PipelineStageFetchIndirectArgs
    /// - CacheCoherency: @ref CoherIndirectArgs
    ///
    /// @param [in] gpuMemory    GPU memory object containing the memory location to be tested.
    /// @param [in] offset       Offset within the memory object where the tested memory location begins.
    /// @param [in] data         Source data to compare against the value in GPU memory.
    /// @param [in] mask         Mask to apply to the GPU memory (via bitwise AND) prior to comparison.
    /// @param [in] compareFunc  Function controlling how the data operands are compared.
    virtual void CmdWhile(
        const IGpuMemory& gpuMemory,
        gpusize           offset,
        uint64            data,
        uint64            mask,
        CompareFunc       compareFunc) = 0;

    /// Ends the innermost active while loop in the current command buffer.
    virtual void CmdEndWhile() = 0;

    /// Stalls a command buffer execution based on a condition that compares an immediate value with value coming from a
    /// GPU register.
    ///
    /// The client (or application) is supposed to do necessary barriers before calling this function, but for now this
    /// is only need to wait some display or timer related registers.
    ///
    /// @param [in] registerOffset The offset in bytes of GPU register to be tested.
    /// @param [in] data           Source data to compare against the value of GPU register.
    /// @param [in] mask           Mask to apply to the GPU memory (via bitwise AND) prior to comparison.
    /// @param [in] compareFunc    Function controlling how the data operands are compared. CompareFunc::Never shouldn't
    ///                            be used as the hardware does not support it.
    virtual void CmdWaitRegisterValue(
        uint32      registerOffset,
        uint32      data,
        uint32      mask,
        CompareFunc compareFunc) = 0;

    /// Stalls a command buffer execution based on a condition that compares an immediate value with value coming from a
    /// GPU memory location.
    ///
    /// The client (or application) is expected to transiton the memory to proper state before calling this function.
    /// The memory location for the condition must be 4-byte aligned.
    /// This function requires use of the following barrier flags on @ref gpuVirtAddr:
    /// - PipelineStage:  @ref PipelineStagePostPrefetch
    /// - CacheCoherency: @ref CoherCp
    ///
    /// @param [in] gpuVirtAddr  GPU memory address containing the data to be tested.
    /// @param [in] data         Source data to compare against the value in GPU memory.
    /// @param [in] mask         Mask to apply to the GPU memory (via bitwise AND) prior to comparison.
    /// @param [in] compareFunc  Function controlling how the data operands are compared. CompareFunc::Never should not
    ///                          be used as the hardware does not support it.
    virtual void CmdWaitMemoryValue(
        gpusize     gpuVirtAddr,
        uint32      data,
        uint32      mask,
        CompareFunc compareFunc) = 0;

    /// Stalls a command buffer execution until an external device writes to the marker surface in the GPU bus
    /// addressable memory location.
    ///
    /// This function requires use of the following barrier flags on @ref gpuMemory:
    /// - PipelineStage:  @ref PipelineStagePostPrefetch
    /// - CacheCoherency: @ref CoherCp
    ///
    /// @param [in] gpuMemory    GPU memory object containing the memory location to be tested.
    /// @param [in] data         Source data to compare against the value in GPU memory.
    /// @param [in] mask         Mask to apply to the GPU memory (via bitwise AND) prior to comparison.
    /// @param [in] compareFunc  Function controlling how the data operands are compared. CompareFunc::Never should not
    ///                          be used as the hardware does not support it.
    virtual void CmdWaitBusAddressableMemoryMarker(
        const IGpuMemory& gpuMemory,
        uint32            data,
        uint32            mask,
        CompareFunc       compareFunc) = 0;

    /// Begins the specified performance experiment.
    ///
    /// @param [in] pPerfExperiment Performance experiment to begin.
    virtual void CmdBeginPerfExperiment(
        IPerfExperiment* pPerfExperiment) = 0;

    /// Updates the sqtt token mask on the specified performance experiment.
    ///
    /// @param [in] pPerfExperiment Performance experiment to update.
    /// @param [in] tokenConfig updated token and reg mask to apply.
    ///
    /// @note: This function is only valid to call if pPerfExperiment is a thread trace experiment that is currently
    //         active.
    virtual void CmdUpdatePerfExperimentSqttTokenMask(
        IPerfExperiment*              pPerfExperiment,
        const ThreadTraceTokenConfig& tokenConfig) = 0;

    /// Updates the sqtt token mask on all running traces, if any.
    ///
    /// @note This may overwrite the stall settings (making them more conservative)
    /// @param [in] tokenConfig updated token and reg mask to apply.
    virtual void CmdUpdateSqttTokenMask(
        const ThreadTraceTokenConfig& tokenConfig) = 0;

    /// Ends the specified performance experiment.
    ///
    /// @param [in] pPerfExperiment Performance experiment to end.
    virtual void CmdEndPerfExperiment(
        IPerfExperiment* pPerfExperiment) = 0;

    /// Inserts a trace marker into the command buffer.
    ///
    /// A trace marker can be inserted to mark particular points of interest in a command buffer to be viewed with the
    /// trace data collected in a performance experiment.
    ///
    /// @param [in] markerType Selects one of two generic marker categories ("A" or "B").
    /// @param [in] markerData 32-bit marker value to be inserted.
    virtual void CmdInsertTraceMarker(
        PerfTraceMarkerType markerType,
        uint32              markerData) = 0;

    /// Inserts a set of SQ thread trace markers for consumption by the Radeon GPU Profiler (RGP).
    ///
    /// Only supported on Universal and Compute engines.
    ///
    /// @param [in] numDwords Number of dwords in pData to be inserted as SQTT markers.
    /// @param [in] pData     SQTT marker data.  See the RGP SQTT Instrumentation Specification for details on how this
    ///                       data should be formatted.
    virtual void CmdInsertRgpTraceMarker(
        RgpMarkerSubQueueFlags subQueueFlags,
        uint32                 numDwords,
        const void*            pData) = 0;

    /// This function is to be used to copy the DF SPM (MALL SPM) data from the output buffers to an accessible buffer.
    /// The buffer that HW outputs to is allocated with a special KMD flag and therefore cannot be the same as the
    /// normal IPerfExperiment buffer so we need a special command to get the data.
    ///
    /// The bulk of the implementation for this is done by the KMD. They are in charge of starting and stopping the
    /// trace as well as all of the register programming. When KMD recieves a dfSpmTraceEnd bit from a CmdBufInfo
    /// flag, they will wait for the command buffer to be completely idle before stopping the trace. Therefore, a
    /// CmdEndPerfExperiment call does not stop this particular sample, the end of a command buffer with a
    /// dfSpmTraceEnd does. This means that calling CmdCopyDfSpmTraceData in the same command buffer as
    /// dfSpmTraceEnd will give you incorrect data. The sample will still be in progress when the copy happens.
    /// You must call CmdCopyDfSpmTraceData in a separate command buffer after one where the dfSpmTraceEnd bit is
    /// set.
    ///
    /// There is also a metadata buffer that does not need a special KMD flag. It is also stored in a separate buffer
    /// and is copied along with the output buffer with this command. It contains a uint32 trace size, a uint32 pad,
    /// a uint64 start trace GPU timestamp and a uint64 stop trace GPU timestamp and is placed at the beginning of the
    /// dstGpuMemory.
    ///
    /// The minimum size of the dstGpuMemory should be the size of the metadata struct plus the size of the DF SPM
    /// ringSize given to the perf experiment. The SPM data may not fill the entire memory, but the client is
    /// responsible for parsing the data.
    ///
    /// This function requires use of the following barrier flags on @ref dstGpuMemory:
    /// - PipelineStage:  @ref PipelineStageBlt
    /// - CacheCoherency: @ref CoherCopyDst
    ///
    /// @param [in] perfExperiment  The perfExperiment that we will be copying the data from
    /// @param [in] dstGpuMemory    The memory location that the DF SPM trace data will be copied to.
    /// @param [in] dstOffset       The offset into the destination memory that the data will be copied to.
    virtual void CmdCopyDfSpmTraceData(
        const IPerfExperiment& perfExperiment,
        const IGpuMemory&      dstGpuMemory,
        gpusize                dstOffset) = 0;

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914
    /// Loads data from the provided GPU Memory object into Constant Engine RAM.
    ///
    /// @param [in] srcGpuMemory  GPU Memory object containing the source data to be loaded to CE RAM.
    /// @param [in] memOffset     Offset within the memory object where the source data is located,
    ///                           must be 32-byte aligned.
    /// @param [in] ramOffset     Byte offset destination in CE RAM where the data should be loaded,
    ///                           must be 32-byte aligned.
    /// @param [in] dwordSize     Number of DWORDs that should be loaded into CE RAM, must be a multiple of 8.
    void CmdLoadCeRam(
        const IGpuMemory& srcGpuMemory,
        gpusize           memOffset,
        uint32            ramOffset,
        uint32            dwordSize) {}

    /// Dumps data from Constant Engine RAM to the provided GPU Memory address which may be located in a GPU ring buffer
    /// managed by the CE. The CE can be used to automatically handle the synchronization between the DE and CE when
    /// manipulating a GPU ring buffer. In order for PAL to instruct the CE to handle this, we need to know the current
    /// position (entry) within the ring buffer being dumped to, as well as the total size (in entries) of the ring.
    ///
    /// @param [in] dstGpuMemory  GPU Memory object destination where the data should be dumped from CE RAM.
    /// @param [in] memOffset     Offset within the memory object where data should be dumped, must be 4 byte aligned.
    /// @param [in] ramOffset     Byte offset source in CE RAM for data that should be dumped, must be 4 byte aligned.
    /// @param [in] dwordSize     Number of DWORDs that should be dumped from CE RAM into GPU Memory
    /// @param [in] currRingPos   Current position (ring entry) in the GPU ring buffer being managed by the CE which the
    ///                           dump location corresponds to.
    /// @param [in] ringSize      Number of entries in the GPU ring buffer being managed by the CE. If the memory being
    ///                           dumped into is not managed in a ring-like fashion, this should be set to zero.
    void CmdDumpCeRam(
        const IGpuMemory& dstGpuMemory,
        gpusize           memOffset,
        uint32            ramOffset,
        uint32            dwordSize,
        uint32            currRingPos,
        uint32            ringSize) {}

    /// Writes CPU data to Constant Engine RAM
    ///
    /// @param [in] pSrcData   Pointer to the source CPU data to be written to CE RAM.
    /// @param [in] ramOffset  Byte offset in CE RAM where the data should be written, must be 4 byte aligned.
    /// @param [in] dwordSize  Number of DWORDs that should be written from pSrcData into CE RAM.
    virtual void CmdWriteCeRam(
        const void* pSrcData,
        uint32      ramOffset,
        uint32      dwordSize) {}
#endif

    /// Allocates a chunk of command space that the client can use to embed constant data directly in the command
    /// buffer's backing memory. The returned CPU address is valid until ICmdBuffer::End() is called. The GPU address
    /// is valid until ICmdBuffer::Reset() or ICmdBuffer::Begin() and must only be referenced by work contained within
    /// this command buffer (e.g., as an SRD table address).
    ///
    /// @param [in]  sizeInDwords       Size of the embedded data space in DWORDs. It must be less than or equal to the
    ///                                 value reported by GetEmbeddedDataLimit().
    /// @param [in]  alignmentInDwords  Minimum GPU address alignment of the embedded space in DWORDs.
    /// @param [out] pGpuAddress        The GPU address of the embedded space.
    ///
    /// @returns The DWORD-aligned CPU address of the embedded space.
    virtual uint32* CmdAllocateEmbeddedData(
        uint32   sizeInDwords,
        uint32   alignmentInDwords,
        gpusize* pGpuAddress) = 0;

    /// Allocates a chunk of command space that the client can use to embed constant data directly in the command
    /// buffer's backing memory. The returned CPU address is valid until ICmdBuffer::End() is called. The GPU address
    /// is valid until ICmdBuffer::Reset() or ICmdBuffer::Begin() and must only be referenced by work contained within
    /// this command buffer (e.g., as an SRD table address).
    ///
    /// @param [in]  sizeInDwords       Size of the embedded data space in DWORDs. It must be less than or equal to the
    ///                                 value reported by GetLargeEmbeddedDataLimit().
    /// @param [in]  alignmentInDwords  Minimum GPU address alignment of the embedded space in DWORDs.
    /// @param [out] pGpuAddress        The GPU address of the embedded space.
    ///
    /// @returns The DWORD-aligned CPU address of the embedded space.
    virtual uint32* CmdAllocateLargeEmbeddedData(
        uint32   sizeInDwords,
        uint32   alignmentInDwords,
        gpusize* pGpuAddress) = 0;

    /// Get memory from scratch memory and bind to GPU event. For now only GpuEventPool and CmdBuffer's internal
    /// GpuEvent use this path to allocate and bind GPU memory. These usecases assume the bound GPU memory is GPU access
    /// only, so client is responsible for resetting the event from GPU, and cannot call Set(), Reset(), GetStatus().
    ///
    /// @param [in]  pGpuEvent  The GPU event that needs to bind a memory. Must not be nullptr.
    ///
    /// @returns Success if the GPU event successfully binds a GPU memory.  Otherwise, one of the following errors may
    ///          be returned:
    ///          + ErrorUnknown if an internal PAL error occurs.
    virtual Result AllocateAndBindGpuMemToEvent(
        IGpuEvent* pGpuEvent) = 0;

    /// Issues commands to prime GPU caches shortly before accessing the specified GPU address range(s).  The benefit of
    /// this prefetching is likely to be platform-dependent based on the GPU's cache hierarchy, memory subsystem,
    /// available prefetching tools in hardware, etc., so caller beware.
    ///
    /// This operation may read data from memory into caches and therefore counts as a general BLT SRC operation with
    /// regard to barrier execution and memory dependencies.
    ///
    /// @param [in] rangeCount  Number of entries in pRanges.
    /// @param [in] pRanges     Array of structs defining a memory range and properties controlling prefetching of that
    ///                         range.
    virtual void CmdPrimeGpuCaches(
        uint32                    rangeCount,
        const PrimeGpuCacheRange* pRanges) = 0;

    /// Issues commands which execute the specified group of nested command buffers.  The observable behavior of this
    /// operation should be indiscernible from directly recording the nested command buffers' commands directly into
    /// this command buffer.  Naturally, the queue type of the nested command buffers must match this command buffer.
    ///
    /// Conceptually, executing a nested command buffer is similar to calling a subroutine: the root command buffer is
    /// like the "caller", while the nested ones are the "callees".
    ///
    /// If any nested command buffers were allocated from a @ref ICmdAllocator with @ref autoMemoryReuse enabled,
    /// resetting or destroying those nested command buffers will render them retroactively uncallable. This effectively
    /// makes the caller command buffer invalid and illegal to submit even if it was otherwise valid and executable in
    /// the past. If the nested command allocator has autoMemoryReuse disabled, the calls to reset nested command
    /// buffers remain valid until the allocator itself is reset.
    ///
    /// State inheritance/leakage between the caller and callee(s) has the following behavior:
    /// + The callee only inherits the state specified in the callee CmdBufferBuildInfo.  It is up to the client to
    ///   bind any default state necessary when they called @ref ICmdBuffer::Begin() to begin building the callee.
    ///   By default no state is inherited and all state must be specified by the client.
    /// + The callee leaks any render and resource-binding state back into the caller after it completes.  It is up to
    ///   the client to rebind the caller's state after this operation completes if they don't want state leakage.
    /// + Both of the above points apply in between callees, if more than one command buffer is being executed by this
    ///   call.
    ///
    /// @param [in]     cmdBufferCount  Number of nested command buffers to execute.  (i.e., size of the ppCmdBuffers
    ///                                 array).  This must be at least one, otherwise making this call is pointless.
    /// @param [in,out] ppCmdBuffers    Array of nested command buffers to execute.  It is an error condition if any
    ///                                 of the following are true: (Debug assertions are used to check them.)
    ///                                 + ppCmdBuffers is null.
    ///                                 + Any member of ppCmdBuffers is null.
    ///                                 + Any member of ppCmdBuffers is a root command buffer, or has a different
    ///                                   queue type than this command buffer.
    virtual void CmdExecuteNestedCmdBuffers(
        uint32            cmdBufferCount,
        ICmdBuffer*const* ppCmdBuffers) = 0;

    /// Saves a copy of some set of the current command buffer state that is used by compute workloads. This feature is
    /// intended to give PAL clients a convenient way to issue their own internal compute workloads without modifying
    /// the application-facing state.
    ///
    /// PAL cannot save multiple layers of state, each call to CmdSaveComputeState must be followed by a call to
    /// CmdRestoreComputeState before the next call to CmdSaveComputeState.
    ///
    /// This function can only be called on command buffers that support compute workloads. All query counters will be
    /// disabled until CmdRestoreComputeState is called.
    ///
    /// @param [in] stateFlags  A mask of ORed @ref ComputeStateFlags indicating which state to save.
    virtual void CmdSaveComputeState(
        uint32 stateFlags) = 0;

    /// Restores some set of the command buffer state that is used by compute workloads. This feature is intended to
    /// give PAL clients a convenient way to issue their own internal compute workloads without modifying the
    /// application-facing state.
    ///
    /// A call to this function must be preceded by a call to CmdSaveComputeState and the save stateFlags must contain
    /// all restore stateFlags, otherwise the values of the restored state are undefined.
    ///
    /// This function can only be called on command buffers that support compute workloads. All previously disabled
    /// query counters will be reactivated.
    ///
    /// @param [in] stateFlags  A mask of ORed @ref ComputeStateFlags indicating which state to restore.
    virtual void CmdRestoreComputeState(
        uint32 stateFlags) = 0;

    /// Issues commands which complete two tasks: using the provided @ref IIndirectCmdGenerator object to translate the
    /// indirect argument buffer into a format understandable by the GPU; and then executing the generated commands.
    ///
    /// The virtual address must be 4-byte aligned.
    ///
    /// The indirect argument data offset in memory must be 4-byte aligned. The expected layout of the argument data
    /// is defined by the @ref IIndirectCmdGenerator object.
    ///
    /// It is unsafe to call this method on a command buffer which was not begun with either the optimizeOneTimeSubmit
    /// or optimizeExclusiveSubmit flags. This is because there is a potential race condition if the same command buffer
    /// is generating indirect commands on multiple Queues simultaneously.
    ///
    /// This function requires use of the following barrier flags on the indirect memory:
    /// - PipelineStage:  @ref PipelineStageFetchIndirectArgs
    /// - CacheCoherency: @ref CoherIndirectArgs
    ///
    /// @param [in] generator     Indirect command generator object which can translate the indirect argument buffer
    ///                           into a command buffer format which the GPU can understand.
    /// @param [in] gpuVirtAddr   Gpu virtual address where the indirect argument data is located.
    /// @param [in] maximumCount  Maximum count of data structures to loop through.  If countGpuAddr is nonzero, the
    ///                           value at that memory location is clamped to this maximum. If countGpuAddr is zero,
    ///                           Then the number of draws issued exactly matches this number.
    /// @param [in] countGpuAddr  GPU virtual address where the number of draws is stored.  Must be 4-byte aligned.
    virtual void CmdExecuteIndirectCmds(
        const IIndirectCmdGenerator& generator,
        gpusize                      gpuVirtAddr,
        uint32                       maximumCount,
        gpusize                      countGpuAddr) = 0;

    /// Updates one or more HiS pretests bound to the given stencil image within a range of mip levels.
    /// See @ref HiSPretests for a summary of HiS.
    ///
    /// @warning Improper use of pretests can cause corruption.  Please see @ref HiSPretests for more information.
    ///
    /// @param [in] image    The stencil image that will receive the new pretest(s).
    /// @param [in] pretests The new pretest(s).
    /// @param [in] firstMip The beginning of the mip range which will receive the new pretest(s).
    /// @param [in] numMips  The number of mips in the mip range which will receive the new pretest(s).
    virtual void CmdUpdateHiSPretests(
        const IImage*      pImage,
        const HiSPretests& pretests,
        uint32             firstMip,
        uint32             numMips) = 0;

    /// Executes any internal postprocessing commands to be performed on a frame, such as drawing the dev driver
    /// overlay.  Calling this prior to presenting (via any path) is a requirement, and must be prior to or
    /// concurrent with frameEnd if FSFM is applicable.  This must be called using the image that will be the
    /// source of the present.
    ///
    /// @param [in]  postProcessInfo  Information about the frame to be postprocessed.
    /// @param [out] pAddedGpuWork    (Optional) Set to true if commands were added as part of this call.
    virtual void CmdPostProcessFrame(
        const CmdPostProcessFrameInfo& postProcessInfo,
        bool*                          pAddedGpuWork) = 0;

    /// Inserts a string embedded inside a NOP packet with a signature that is recognized by tools and can be printed
    /// inside a command buffer disassembly. Note that this is a real NOP that will really be submitted to the GPU
    /// and executed (skipped over) by CP. It will be visible in kernel debugging as well as offline debug dumps.
    ///
    /// The maximum length of a string that may be embedded in the command buffer is currently 128 characters,
    /// including the NUL-terminator. This is defined in the internal command buffer class in MaxCommentStringLength.
    ///
    /// @param [in] pComment        Pointer to NUL-terminated string that will be inserted into the command buffer.
    virtual void CmdCommentString(
        const char* pComment) = 0;

    /// Inserts the specified payload embedded inside a NOP packet. Note that this is a real NOP that will be submitted
    /// to the GPU and executed (skipped over) by CP. It will be visible in kernel debugging as well as offline debug
    /// dumps.
    ///
    /// @param [in] pPayload    Pointer to binary data to embed.
    /// @param [in] payloadSize Size of the payload in DWORDs, expected to be under MaxPayloadSize.
    virtual void CmdNop(
        const void* pPayload,
        uint32      payloadSize) = 0;

    /// Marks the begin or end of a user-defined region of GPU work; analyzed post-mortem in crash-dump analysis tools.
    /// Each 'Begin' marker must be paired with a corresponding 'End' marker; however, markers may be nested by
    /// inserting multiple 'Begin' markers consecutively.
    ///
    /// @warning This function is a no-op if Crash Analysis mode is not enabled.
    ///
    /// @param [in] isBegin         Whether this is a 'Begin' marker (true) or an 'End' marker (false).
    /// @param [in] sourceId        The application layer ID at which the marker is being created:
    ///                               0x0 => Application
    ///                               0x1 => API (e.g. DX12, Vulkan, etc.)
    ///                               0x2 => PAL
    ///                             Developers may use IDs within the range of 10 - 15 to define a custom
    ///                             application layer.
    /// @param [in] pMarkerName     A NULL-terminated string containing a name for this marker, used for annotation
    ///                             purposes in external tools. Only valid for 'Begin' markers, and will be ignored if
    ///                             isBeginMarker is false.
    /// @param [in] markerNameSize  Size of the marker string, in bytes.
    ///
    /// @returns Non-zero counter value of the embedded execution marker.
    ///          If Crash Analysis mode is disabled, this will always return zero.
    virtual uint32 CmdInsertExecutionMarker(
        bool         isBegin,
        uint8        sourceId,
        const char*  pMarkerName,
        uint32       markerNameSize) = 0;

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888
    /// XDMA was retired starting in gfx10 so this function has no use anymore.
    inline void CmdXdmaWaitFlipPending() {}
#endif

    /// Starts thread-trace/counter-collection - used by GPS Shim's OpenShimInterface via DXCP
    /// Only valid for the GPU Profiler layer (which is enabled separately by the GPS Shim during usage of these
    /// functions)
    /// Only valid for per-draw granularity and hence non-RGP thread-trace formats.
    /// The caller is responsible for setting up valid GPU Profiler panel settings.
    virtual void CmdStartGpuProfilerLogging() = 0;

    /// Stops thread-trace/counter-collection - used by GPS Shim's OpenShimInterface via DXCP
    /// Only valid for the GPU Profiler layer (which is enabled separately by the GPS Shim during usage of these
    /// functions)
    /// Only valid for per-draw granularity and hence non-RGP thread-trace formats.
    /// The caller is responsible for setting up valid GPU Profiler panel settings.
    virtual void CmdStopGpuProfilerLogging() = 0;

    /// Set a mask to control which view instances are enabled for subsequent draws, should only be called on
    /// universal command buffers.
    ///
    /// @param [in] mask     The mask to control which view instances are enabled.
    virtual void CmdSetViewInstanceMask(uint32 mask) = 0;

    /// Get used size of all chunks in bytes for given CmdAllocType. For CommandDataAlloc with multi-queue scheme, the
    /// size reported will be the sum of all command streams associated with the command buffer. It's legal to call
    /// this function while in the command building state.
    ///
    /// @param [in] type    Allocation type for ICmdAllocator
    ///
    /// @returns Used allocation data size in bytes for provided CmdAllocType.
    virtual uint32 GetUsedSize(
        CmdAllocType type) const = 0;

    /// Returns the value of the associated arbitrary client data pointer.
    /// Can be used to associate arbitrary data with a particular PAL object.
    ///
    /// @returns Pointer to client data.
    void* GetClientData() const
    {
        return m_pClientData;
    }

    /// Sets the value of the associated arbitrary client data pointer.
    /// Can be used to associate arbitrary data with a particular PAL object.
    ///
    /// @param  [in]    pClientData     A pointer to arbitrary client data.
    void SetClientData(
        void* pClientData)
    {
        m_pClientData = pClientData;
    }

    /// Gets the internal unique Id of the command buffer.
    /// This function is only for internal debugging, and the Id is not unique across different queue types.
    ///
    /// @returns Unique Id of the command buffer
    virtual uint32 UniqueId() const = 0;
protected:
    /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly
    /// called the proper create method.
    ICmdBuffer() : m_pClientData(nullptr)
    {
    }

    /// @internal Destructor.  Prevent use of delete operator on this interface.  Client must destroy objects by
    /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the
    /// object on their own.
    virtual ~ICmdBuffer() { }

    /// Structure for function pointers for the ICmdBuffer::Cmd* functions.
    struct CmdBufferFnTable
    {
        /// CmdSetUserData function pointers for each pipeline bind point.
        CmdSetUserDataFunc              pfnCmdSetUserData[static_cast<uint32>(PipelineBindPoint::Count)];

        CmdDrawFunc                      pfnCmdDraw;                      ///< CmdDraw function pointer.
        CmdDrawOpaqueFunc                pfnCmdDrawOpaque;                ///< CmdDrawOpaque function pointer.
        CmdDrawIndexedFunc               pfnCmdDrawIndexed;               ///< CmdDrawIndexed function pointer.
        CmdDrawIndirectMultiFunc         pfnCmdDrawIndirectMulti;         ///< CmdDrawIndirectMulti function pointer.
        CmdDrawIndexedIndirectMultiFunc  pfnCmdDrawIndexedIndirectMulti;  ///< CmdDrawIndexedIndirectMulti func pointer.
        CmdDispatchFunc                  pfnCmdDispatch;                  ///< CmdDispatch function pointer.
        CmdDispatchIndirectFunc          pfnCmdDispatchIndirect;          ///< CmdDispatchIndirect function pointer.
        CmdDispatchOffsetFunc            pfnCmdDispatchOffset;            ///< CmdDispatchOffset function pointer.
        CmdDispatchMeshFunc              pfnCmdDispatchMesh;              ///< CmdDispatchmesh function pointer.
        CmdDispatchMeshIndirectMultiFunc pfnCmdDispatchMeshIndirectMulti; ///< CmdDispatchMeshIndirect function pointer.
    } m_funcTable;     ///< Function pointer table for Cmd* functions.

private:
    /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData()
    /// and set via SetClientData().
    /// For non-top-layer objects, this will point to the layer above the current object.
    void* m_pClientData;

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900
    /// @internal Some back-compat glue for some of the HwPipePoint interfaces in this file.
    static constexpr uint32 HwPipePointToStage[] =
    {
            PipelineStageTopOfPipe,    // HwPipeTop              = 0x0
            PipelineStagePostPrefetch, // HwPipePostPrefetch     = 0x1
            PipelineStageVs,           // HwPipePreRasterization = 0x2
            PipelineStagePs,           // HwPipePostPs           = 0x3
            PipelineStageLateDsTarget, // HwPipePreColorTarget   = 0x4
            PipelineStageCs,           // HwPipePostCs           = 0x5
            PipelineStageBlt,          // HwPipePostBlt          = 0x6
            PipelineStageBottomOfPipe, // HwPipeBottom           = 0x7
    };
#endif
};

} // Pal