-
Notifications
You must be signed in to change notification settings - Fork 90
/
Copy pathpalCmdBuffer.h
4969 lines (4610 loc) · 281 KB
/
palCmdBuffer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
***********************************************************************************************************************
*
* Copyright (c) 2014-2024 Advanced Micro Devices, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************************************************************************/
/**
***********************************************************************************************************************
* @file palCmdBuffer.h
* @brief Defines the Platform Abstraction Library (PAL) ICmdBuffer interface and related types.
***********************************************************************************************************************
*/
#pragma once
#include "pal.h"
#include "palDevice.h"
#include "palGpuMemory.h"
#include "palImage.h"
#include "palMsaaState.h"
#include "palPipeline.h"
#include "palQueryPool.h"
/// HSA kernel dispatch packet typedef.
/// Only the struct tag is named here, so the type is usable through pointers and references without its definition.
typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t;
/// AMD kernel code typedef.
/// Only the struct tag is named here, so the type is usable through pointers and references without its definition.
typedef struct amd_kernel_code_s amd_kernel_code_t;
// Forward declarations of Util classes so that this header can name them (e.g. as pointer members) without
// requiring their definitions.
namespace Util
{
class VirtualLinearAllocator;
class Event;
}
namespace Pal
{
// Forward declarations.
// Interface classes: declared here so they can be named by pointer or reference without their definitions.
class IBorderColorPalette;
class ICmdAllocator;
class ICmdBuffer;
class IColorBlendState;
class IColorTargetView;
class IDepthStencilState;
class IDepthStencilView;
class IGpuEvent;
class IGpuMemory;
class IIndirectCmdGenerator;
class IMsaaState;
class IPerfExperiment;
class IQueue;
class IQueryPool;
// Opaque declarations of scoped enums; each one fixes its underlying type to uint32.
enum class PerfTraceMarkerType : uint32;
enum class PointOrigin : uint32;
// Structs declared without their definitions.
struct VideoCodecInfo;
struct VideoCodecAuxInfo;
/// Specifies a pipeline bind point (i.e., compute or graphics).
enum class PipelineBindPoint : uint32
{
Compute = 0x0, ///< Compute pipeline bind point.
Graphics = 0x1, ///< Graphics pipeline bind point.
Count ///< Implicitly 0x2: the number of bind points listed above.
};
/// Fully specifies a type of graphics primitive and vertex ordering for geometry.
///
/// The underlying type is uint8 when PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 848 and uint32 for older interface
/// versions; the enumerators and their values are identical in both cases.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 848
enum class PrimitiveTopology : uint8
#else
enum class PrimitiveTopology : uint32
#endif
{
PointList = 0x0,
LineList = 0x1,
LineStrip = 0x2,
TriangleList = 0x3,
TriangleStrip = 0x4,
RectList = 0x5, ///< Each rect is three 2D axis-aligned rectangle vertices.
QuadList = 0x6,
QuadStrip = 0x7,
LineListAdj = 0x8,
LineStripAdj = 0x9,
TriangleListAdj = 0xA,
TriangleStripAdj = 0xB,
Patch = 0xC,
TriangleFan = 0xD,
LineLoop = 0xE,
Polygon = 0xF,
TwoDRectList = 0x10, ///< Each rect is the bounding box of an arbitrary 2D triangle.
/// Support is optional, see support2DRectList in DeviceProperties.
Count ///< Implicitly 0x11: the number of topologies listed above.
};
/// Specifies how triangle primitives should be rasterized.
///
/// The underlying type is uint8 when PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 851 and uint32 for older interface
/// versions; the enumerators and their values are identical in both cases.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 851
enum class FillMode : uint8
#else
enum class FillMode : uint32
#endif
{
Points = 0x0,
Wireframe = 0x1,
Solid = 0x2,
Count ///< Implicitly 0x3: the number of fill modes listed above.
};
/// Specifies the triangle face direction that should result in culled primitives.
///
/// The underlying type is uint8 when PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 851 and uint32 for older interface
/// versions. Unlike the neighbouring enums, this one declares no Count enumerator.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 851
enum class CullMode : uint8
#else
enum class CullMode : uint32
#endif
{
_None = 0x0, ///< All triangles are rasterized.
Front = 0x1, ///< Front facing triangles are culled.
Back = 0x2, ///< Back facing triangles are culled.
FrontAndBack = 0x3, ///< All triangles are culled.
// Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either
// undefing None before including this header or using _None when dealing with PAL.
// The alias below is therefore only declared when no macro named None is defined at this point.
#ifndef None
None = _None, ///< All triangles are rasterized.
#endif
};
/// Specifies vertex winding order corresponding to a front facing triangle. @see CullMode.
///
/// The underlying type is uint8 when PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 851 and uint32 for older interface
/// versions; the enumerators and their values are identical in both cases.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 851
enum class FaceOrientation : uint8
#else
enum class FaceOrientation : uint32
#endif
{
Ccw = 0x0, ///< Counter-clockwise vertex winding primitives are front facing.
Cw = 0x1 ///< Clockwise vertex winding primitives are front facing.
};
/// Specifies which vertex of a primitive is the _provoking vertex_. This impacts which vertex's "flat" VS outputs
/// are passed to the PS (i.e., flat shading).
///
/// The underlying type is uint8 when PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 851 and uint32 for older interface
/// versions; the enumerators and their values are identical in both cases.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 851
enum class ProvokingVertex : uint8
#else
enum class ProvokingVertex : uint32
#endif
{
First = 0x0, ///< The first vertex of the primitive is the provoking vertex.
Last = 0x1 ///< The last vertex of the primitive is the provoking vertex.
};
/// Specifies bit size of each element in an index buffer.
enum class IndexType : uint32
{
Idx8 = 0x0, ///< 8-bit index elements.
Idx16 = 0x1, ///< 16-bit index elements.
Idx32 = 0x2, ///< 32-bit index elements.
Count ///< Implicitly 0x3: the number of index types listed above.
};
/// Specifies a memory atomic operation that can be performed from command buffers with ICmdBuffer::CmdMemoryAtomic().
///
/// The enumerators come in two parallel groups of eleven: 0x00-0x0A carry a "32" suffix and 0x0B-0x15 repeat the same
/// operation names with a "64" suffix.
/// NOTE(review): the suffixes presumably denote the operand width in bits, and Uint/Sint the signedness used by the
/// min/max/inc/dec operations - confirm against the CmdMemoryAtomic() documentation.
enum class AtomicOp : uint32
{
// Operations with a "32" suffix.
AddInt32 = 0x00,
SubInt32 = 0x01,
MinUint32 = 0x02,
MaxUint32 = 0x03,
MinSint32 = 0x04,
MaxSint32 = 0x05,
AndInt32 = 0x06,
OrInt32 = 0x07,
XorInt32 = 0x08,
IncUint32 = 0x09,
DecUint32 = 0x0A,
// Operations with a "64" suffix, in the same order as the group above.
AddInt64 = 0x0B,
SubInt64 = 0x0C,
MinUint64 = 0x0D,
MaxUint64 = 0x0E,
MinSint64 = 0x0F,
MaxSint64 = 0x10,
AndInt64 = 0x11,
OrInt64 = 0x12,
XorInt64 = 0x13,
IncUint64 = 0x14,
DecUint64 = 0x15,
Count ///< Implicitly 0x16: the number of atomic operations listed above.
};
/// Specifies the point in the GPU pipeline where an action should take place.
///
/// Relevant operations include setting GPU events, waiting on GPU events in hardware, or writing timestamps.
///
/// @note The numeric values of these enums are ordered such that a "newState < oldState" comparison will generally
/// yield true if a stall is necessary to resolve a hazard between those two pipe points. This guideline does not
/// hold up when comparing PreRasterization or PostPs with PostCs, as CS work is not properly pipelined with
/// graphics shader work.
///
/// @note This is an unscoped enum, so the enumerators are visible directly in the enclosing namespace. Several
/// enumerators are aliases that share the numeric value of another pipe point (see the initializers below).
///
/// @see ICmdBuffer::CmdSetEvent()
/// @see ICmdBuffer::CmdResetEvent()
/// @see ICmdBuffer::CmdPredicateEvent()
/// @see ICmdBuffer::CmdBarrier()
/// @see ICmdBuffer::CmdWriteTimestamp()
/// @see ICmdBuffer::CmdWriteImmediate()
enum HwPipePoint : uint32
{
HwPipeTop = 0x0, ///< Earliest possible point in the GPU pipeline (CP PFP), can be
/// used as wait point for indirect args and index buffer fetch.
HwPipePostPrefetch = 0x1, ///< Indirect arguments have been fetched for all prior
/// draws/dispatches (CP ME).
HwPipePreRasterization = 0x2, ///< All prior generated VS/HS/DS/GS waves have completed, can be
/// used as release point for VB/IB fetch and streamout target.
HwPipePostPs = 0x3, ///< All prior generated PS waves have completed.
/// Only valid as a pipe point to wait on (release point).
HwPipePreColorTarget = 0x4, ///< Represents the same point in pipe to HwPipePostPs, but provides
/// clients with a better option to accurately specify the pipeline
/// sync request. And PAL uses it as entry-point to add partial
/// flushes to prevent write-after-read hazard from corner cases.
/// Only valid as a wait point (acquire point).
// The following two points are aliases of the graphics points above:
HwPipePreIndexBuffer = HwPipeTop, ///< As late as possible before index buffer fetches (CP PFP).
HwPipePostIndexBuffer = HwPipePreRasterization,///< All prior index buffer fetches have completed.
// The following points apply to compute-specific work:
HwPipePreCs = HwPipePostPrefetch, ///< As late as possible before CS waves are launched (CP ME).
HwPipePostCs = 0x5, ///< All prior generated CS waves have completed.
// The following points apply to BLT-specific work:
HwPipePreBlt = HwPipePostPrefetch, ///< As late as possible before BLT operations are launched.
HwPipePostBlt = 0x6, ///< All prior requested BLTs have completed.
HwPipeBottom = 0x7, ///< All prior GPU work (graphics, compute, or BLT) has completed.
HwPipePointCount ///< Implicitly 0x8: one past HwPipeBottom, i.e. the number of distinct pipe point values.
};
/// Bitmask values that can be OR'ed together to specify a synchronization scope. See srcStageMask and dstStageMask in
/// @ref AcquireReleaseInfo.
///
/// When specifying an execution dependency at a synchronization point where previous operations must *happen-before*
/// future operations, a mask of these flags specifies a *synchronization scope* that restricts which stages of prior
/// draws, dispatches, or BLTs must *happen-before* which stages of future draws, dispatches, or BLTs.
///
/// Note that flag numerical order does not indicate any happens-before or happens-after relationships. Clients should
/// not compare flags numerically to judge execution order, only barriers can guarantee execution ordering.
///
/// Two layouts exist, selected by PAL_CLIENT_INTERFACE_MAJOR_VERSION. The >= 835 layout adds
/// PipelineStagePostPrefetch and PipelineStageSampleRate, which shifts the numeric value of every flag from
/// PipelineStageFetchIndices onwards relative to the older layout.
enum PipelineStageFlag : uint32
{
// These two flags have the same value in both layouts.
PipelineStageTopOfPipe = 0x00000001,
PipelineStageFetchIndirectArgs = 0x00000002,
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 835
// Layout for interface versions >= 835: 17 single-bit flags in total.
PipelineStagePostPrefetch = 0x00000004,
PipelineStageFetchIndices = 0x00000008,
PipelineStageStreamOut = 0x00000010,
PipelineStageVs = 0x00000020,
PipelineStageHs = 0x00000040,
PipelineStageDs = 0x00000080,
PipelineStageGs = 0x00000100,
PipelineStagePs = 0x00000200,
PipelineStageSampleRate = 0x00000400,
PipelineStageEarlyDsTarget = 0x00000800,
PipelineStageLateDsTarget = 0x00001000,
PipelineStageColorTarget = 0x00002000,
PipelineStageCs = 0x00004000,
PipelineStageBlt = 0x00008000,
PipelineStageBottomOfPipe = 0x00010000,
PipelineStageDsTarget = PipelineStageEarlyDsTarget | PipelineStageLateDsTarget, ///< Both depth/stencil target bits.
PipelineStageAllStages = 0x0001FFFF ///< Mask of all 17 single-bit flags in this layout.
#else
// Layout for interface versions < 835: 15 single-bit flags in total.
PipelineStageFetchIndices = 0x00000004,
PipelineStageStreamOut = 0x00000008,
PipelineStageVs = 0x00000010,
PipelineStageHs = 0x00000020,
PipelineStageDs = 0x00000040,
PipelineStageGs = 0x00000080,
PipelineStagePs = 0x00000100,
PipelineStageEarlyDsTarget = 0x00000200,
PipelineStageLateDsTarget = 0x00000400,
PipelineStageColorTarget = 0x00000800,
PipelineStageCs = 0x00001000,
PipelineStageBlt = 0x00002000,
PipelineStageBottomOfPipe = 0x00004000,
PipelineStageDsTarget = PipelineStageEarlyDsTarget | PipelineStageLateDsTarget, ///< Both depth/stencil target bits.
PipelineStageAllStages = 0x00007FFF ///< Mask of all 15 single-bit flags in this layout.
#endif
};
/// Bitmask values that can be ORed together to specify all potential usages of an image at a point in time. Such a
/// mask should be specified in the usages field of ImageLayout. These combined usages can be examined by PAL to infer
/// the layout (i.e., compression state) of the image.
///
/// @note There is no layout corresponding to CmdClear*(). The layout flags passed to those functions will determine
/// the expected image layout at that time, and the CmdClear*() implementation will execute a clear that keeps the
/// layout the same.
enum ImageLayoutUsageFlags : uint32
{
LayoutUninitializedTarget = 0x00000001, ///< Initial state of any image that can be used as a color or
/// depth/stencil target. A layout transition out of this state will
/// likely result in a mask RAM initialization BLT. If this bit is
/// set, no other bits may be set.
LayoutColorTarget = 0x00000002, ///< Color target bound via CmdBindTargets(). This bit is exclusive
/// with LayoutDepthStencilTarget.
LayoutDepthStencilTarget = 0x00000004, ///< Depth/stencil target bound via CmdBindTargets(). This bit is
/// exclusive with LayoutColorTarget.
LayoutShaderRead = 0x00000008, ///< Any shader read state including texture, UAV, constant buffer,
/// vertex buffer.
LayoutShaderFmaskBasedRead = 0x00000010, ///< Images in this state support the load_fptr AMD IL instruction,
/// which will read decompressed fmask in order to access compressed
/// MSAA color data from a shader.
LayoutShaderWrite = 0x00000020, ///< Writeable UAV.
LayoutCopySrc = 0x00000040, ///< CmdCopyImage(), CmdCopyImageToMemory(), CmdScaledCopyImage or
/// CmdCopyTiledImageToMemory() source image.
LayoutCopyDst = 0x00000080, ///< CmdCopyImage(), CmdCopyMemoryToImage(), CmdScaledCopyImage or
/// CmdCopyMemoryToTiledImage() destination image.
LayoutResolveSrc = 0x00000100, ///< CmdResolveImage() source.
LayoutResolveDst = 0x00000200, ///< CmdResolveImage() destination.
LayoutPresentWindowed = 0x00000400, ///< Windowed-mode IQueue::Present().
LayoutPresentFullscreen = 0x00000800, ///< Fullscreen (flip) present. Layout must be supported by the
/// display engine.
LayoutUncompressed = 0x00001000, ///< Metadata fully decompressed/expanded layout.
LayoutSampleRate = 0x00002000, ///< CmdBindSampleRateImage() source.
LayoutAllUsages = 0x00003FFF ///< Mask of all 14 usage bits above (0x00000001 through 0x00002000).
};
/// Bitmask values that can be ORed together to specify all potential engines an image might be used on. Such a
/// mask should be specified in the engines field of ImageLayout.
///
/// If the client API is unable to determine which engines might be used, it should specify all possible engines
/// corresponding to the usage flags.
enum ImageLayoutEngineFlags : uint32
{
// One bit per engine.
LayoutUniversalEngine = 0x1,
LayoutComputeEngine = 0x2,
LayoutDmaEngine = 0x4,
LayoutVideoEncodeEngine = 0x8,
LayoutVideoDecodeEngine = 0x10,
LayoutVideoJpegDecodeEngine = 0x20,
LayoutAllEngines = 0x3F ///< Mask of all six engine bits above.
};
/// Bitmask values that can be ORed together to specify previous output usage and upcoming input usages of an image or
/// GPU memory in a ICmdBuffer::CmdBarrier() call to ensure cache coherency between those usages.
enum CacheCoherencyUsageFlags : uint32
{
CoherCpu = 0x00000001, ///< Data read or written by CPU.
CoherShaderRead = 0x00000002, ///< Data read by a GPU shader.
CoherShaderWrite = 0x00000004, ///< Data written by a GPU shader.
CoherCopySrc = 0x00000008, ///< Source of a ICmdBuffer::CmdCopy*() call.
CoherCopyDst = 0x00000010, ///< Destination of a ICmdBuffer::CmdCopy*() call.
CoherColorTarget = 0x00000020, ///< Color target.
CoherDepthStencilTarget = 0x00000040, ///< Depth stencil target.
CoherResolveSrc = 0x00000080, ///< Source of a CmdResolveImage() call.
CoherResolveDst = 0x00000100, ///< Destination of a CmdResolveImage() call.
CoherClear = 0x00000200, ///< Destination of a CmdClear() call.
CoherIndirectArgs = 0x00000400, ///< Source argument data read by CmdDrawIndirect() and similar functions.
CoherIndexData = 0x00000800, ///< Index buffer data.
CoherQueueAtomic = 0x00001000, ///< Destination of a CmdMemoryAtomic() call.
CoherTimestamp = 0x00002000, ///< Destination of a CmdWriteTimestamp() call.
CoherCeLoad = 0x00004000, ///< Source of a CmdLoadCeRam() call.
CoherCeDump = 0x00008000, ///< Destination of CmdDumpCeRam() call.
CoherStreamOut = 0x00010000, ///< Data written as stream output.
CoherMemory = 0x00020000, ///< Data read or written directly from/to memory.
CoherSampleRate = 0x00040000, ///< CmdBindSampleRateImage() source.
CoherPresent = 0x00080000, ///< Source of present.
// NOTE(review): bit 0x00100000 is not named by any flag in this enum, yet it is part of CoherAllUsages below.
CoherCp = 0x00200000, ///< HW Command Processor (CP) encompassing the front-end command
/// processing of any queue, including SDMA.
CoherAllUsages = 0x003FFFFF, ///< Mask covering bits 0 through 21 (0x00000001 through 0x00200000).
CoherShader = CoherShaderRead | CoherShaderWrite, ///< Shader read or write.
CoherCopy = CoherCopySrc | CoherCopyDst, ///< Copy source or destination.
CoherResolve = CoherResolveSrc | CoherResolveDst, ///< Resolve source or destination.
};
/// Bitmask values for the flags parameter of ICmdBuffer::CmdClearColorImage().
enum ClearColorImageFlags : uint32
{
ColorClearAutoSync = 0x00000001, ///< PAL will automatically insert required CmdBarrier() synchronization before
/// and after the clear assuming all subresources to be cleared are currently
/// ready for rendering as a color target (as is required by API convention in
/// DX12). Allows reduced sync costs in some situations since PAL knows
/// the details of how the clear will be performed.
ColorClearForceSlow = 0x00000002, ///< Force these to use slow clears.
ColorClearSkipIfSlow = 0x00000004, ///< Only issue the clear if it is a fast clear.
ColorClearAllFlags = 0x00000007 ///< Clients should NOT use it, for internal static_assert purpose only.
/// Equals the OR of the three flags above.
};
/// Bitmask values for the flags parameter of ICmdBuffer::CmdClearDepthStencil().
enum ClearDepthStencilFlags : uint32
{
DsClearAutoSync = 0x00000001, ///< PAL will automatically insert required CmdBarrier() synchronization before
/// and after the clear assuming all subresources to be cleared are currently
/// ready for rendering as a depth/stencil target (as is required by API convention
/// in DX12). Allows reduced sync costs in some situations since PAL knows the
/// details of how the clear will be performed.
DsClearAllFlags = 0x00000001 ///< Clients should NOT use it, for internal static_assert purpose only.
/// Has the same value as DsClearAutoSync, the only flag defined above.
};
/// Bitmask values for the flags parameter of ICmdBuffer::CmdResolveImage().
enum ResolveImageFlags : uint32
{
ImageResolveInvertY = 0x00000001, ///< PAL will invert the y-axis (flip upside down) of the resolved region to
/// the destination image.
ImageResolveDstAsSrgb = 0x00000002, ///< If set, a non-srgb destination image will be treated as srgb format.
/// The flag cannot be set when @ref ImageResolveDstAsNorm is set.
ImageResolveDstAsNorm = 0x00000004, ///< If set, a srgb destination image will be treated as non-srgb format.
/// The flag cannot be set when @ref ImageResolveDstAsSrgb is set.
ImageResolveSrcAsNorm = 0x00000008, ///< If set, a srgb source image will be treated as non-srgb format.
ImageResolveAllFlags = 0x0000000F ///< Clients should NOT use it, for internal static_assert purpose only.
/// Equals the OR of the four flags above.
};
/// Specifies properties for creation of an ICmdBuffer object. Input structure to IDevice::CreateCmdBuffer().
struct CmdBufferCreateInfo
{
ICmdAllocator* pCmdAllocator; ///< The command buffer will use this command allocator to allocate all GPU memory.
/// If the client specifies a null pCmdAllocator, it must call ICmdBuffer::Reset
/// with a non-null pCmdAllocator before calling ICmdBuffer::Begin.
QueueType queueType; ///< Type of queue commands in this command buffer will target.
/// This defines the set of allowed actions in the command buffer.
QueuePriority queuePriority; ///< Priority level of the queue this command buffer will target.
EngineType engineType; ///< Type of engine the queue commands will run on.
// The individual flag bits below share storage with u32All; the bit-field widths add up to 32.
union
{
struct
{
/// Indicates that this command buffer will be a "nested" command buffer, instead of a normal, "root"
/// command buffer. Nested command buffers differ from root command buffers in how they are sent to the
/// GPU for execution: root command buffers must be submitted to the hardware by calling
/// @ref IQueue::Submit, whereas nested command buffers can only be submitted by being executed by a root
/// command buffer.
///
/// Currently, only Universal and Compute command buffers can be nested. Nesting DMA command buffers is
/// meaningless and unsupported. It is an error to attempt to create a nested DMA command buffer.
///
/// @see ICmdBuffer::CmdExecuteNestedCmdBuffers.
uint32 nested : 1;
/// Dedicated CUs are reserved for this queue. Thus we have to skip CU mask programming.
uint32 realtimeComputeUnits : 1;
/// Target queue uses dispatch tunneling.
uint32 dispatchTunneling : 1;
/// Single bit with no documented meaning in this header.
uint32 reserved1 : 1;
/// Reserved for future use.
uint32 reserved : 28;
};
/// Flags packed as 32-bit uint.
uint32 u32All;
} flags; ///< Command buffer creation flags.
};
/// Specifies which states will not be bound in a nested command buffer, and instead must be inherited from the calling
/// root-level command buffer.
///
/// The individual flag bits share storage with u32All; the bit-field widths add up to 32.
union InheritedStateFlags
{
struct
{
/// Color and depth target views are inherited from the root-level command buffer. The nested command buffer
/// should not modify this state.
uint32 targetViewState : 1;
/// Occlusion query is inherited from the root-level command buffer. The nested command buffer
/// should not modify this state.
uint32 occlusionQuery : 1;
/// Predication is inherited from the root-level command buffer. The nested command buffer should not modify
/// this state.
uint32 predication : 1;
/// Reserved for future usage.
uint32 reserved : 29;
};
/// Flags packed as 32-bit uint.
uint32 u32All;
};
/// Specifies parameters inherited from primary command buffer into nested command buffer.
///
/// The color target members exist only when PAL_CLIENT_INTERFACE_MAJOR_VERSION < 891; for newer interface versions
/// stateFlags is the only member.
struct InheritedStateParams
{
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 891
uint32 colorTargetCount; ///< Number of color targets bound in the
/// root-level command buffer.
SwizzledFormat colorTargetSwizzledFormats[MaxColorTargets]; ///< Format and swizzle for each color
/// target.
uint32 sampleCount[MaxColorTargets]; ///< Sample count for each color target.
#endif
InheritedStateFlags stateFlags; ///< States that are inherited from the
/// calling root-level command buffer.
};
/// Specifies optional hints to control command buffer building optimizations.
///
/// The individual flag bits share storage with u32All.
/// NOTE(review): the bit-field widths declared below add up to 31, so one bit of u32All is not covered by any named
/// field - confirm whether that is intentional.
union CmdBufferBuildFlags
{
struct
{
/// Optimize command buffer building for large sets of draw or dispatch operations that are GPU front-end
/// limited. These optimizations include removing redundant PM4 commands and reducing the VGT prim group size.
/// This flag might increase the CPU overhead of building command buffers.
uint32 optimizeGpuSmallBatch : 1;
/// Optimize command buffer building for exclusive command buffer submission. Command buffers built with this
/// flag cannot be submitted if they have already been submitted previously unless the caller guarantees that
/// they are no longer in use. This flag allows PAL to modify the contents of command buffers during
/// submission.
uint32 optimizeExclusiveSubmit : 1;
/// Optimize command buffer building for single command buffer submission. Command buffers built with this flag
/// cannot be submitted more than once. This flag allows PAL to modify the contents of command buffers during
/// submission. This flag is a stricter version of optimizeExclusiveSubmit, it is not necessary to set
/// optimizeExclusiveSubmit if this flag is set.
uint32 optimizeOneTimeSubmit : 1;
/// Indicates that the client is providing custom tessellation distribution settings. If set, it is the client's
/// responsibility to ensure all 5 (isoline, triangle, quad, donut, trapezoid) factors are provided.
uint32 optimizeTessDistributionFactors : 1;
/// Attempt to prefetch shader code into cache before launching draws or dispatches with a freshly bound
/// pipeline object. This optimization might increase the CPU overhead of building command buffers and/or
/// introduce additional front-end GPU bottlenecks.
uint32 prefetchShaders : 1;
/// Attempt to prefetch the command buffer into cache to avoid bottlenecking the GPU front-end.
/// This optimization might slightly increase the overhead of some GPU copies and other front-end reads/writes.
uint32 prefetchCommands : 1;
/// Indicates the command buffer will use one or more constant engine commands: CmdLoadCeRam(), CmdDumpCeRam(),
/// or CmdWriteCeRam()
uint32 usesCeRamCmds : 1;
/// Indicates that the client would prefer that this nested command buffer not be launched using an IB2 packet.
/// The calling command buffer will either inline this command buffer into itself or use IB chaining based on if
/// the optimizeExclusiveSubmit flag is also set. This flag is ignored for root command buffers.
uint32 disallowNestedLaunchViaIb2 : 1;
/// placeholder
uint32 placeholder1 : 2;
/// Enable TMZ mode to allow reading TMZ protected allocations. If this command buffer attempts to write
/// non-TMZ memory, the results are undefined. Only valid for graphics and compute.
uint32 enableTmz : 1;
/// placeholder (no documented meaning in this header).
uint32 placeholder3 : 1;
/// If set, internal operations such as blits, copies, etc. will not affect active Query results.
/// Otherwise they may affect the results.
uint32 disableQueryInternalOps : 1;
/// Must be set for @ref CmdBufferBuildInfo::contextStatesPerBin to take effect.
uint32 optimizeContextStatesPerBin : 1;
/// Must be set for @ref CmdBufferBuildInfo::persistentStatesPerBin to take effect.
uint32 optimizePersistentStatesPerBin : 1;
/// Reserved for future use.
uint32 reserved : 16;
};
/// Flags packed as 32-bit uint.
uint32 u32All;
};
/// Specifies tessellation accum factors.
///
/// The five factors are bit-fields that share storage with u32All; their widths (8 + 8 + 8 + 5 + 3) add up to 32.
union TessDistributionFactors
{
struct
{
/// The following 3 factors are used by hardware when distributed tessellation is active: the min tess factors for
/// each patch processed by a VGT are accumulated. When the sum exceeds this threshold, the next patch is sent to a
/// different VGT.
uint32 isoDistributionFactor : 8;
uint32 triDistributionFactor : 8; ///< Recommended to be higher than quad factor.
uint32 quadDistributionFactor : 8;
/// Used by the hardware when distributed tessellation is in DONUT mode: the min tess factor for each patch is
/// tested against this threshold to determine whether a patch gets split up. If the patch isn't split, it still
/// increments the accumulator for the Patch distribution factor.
uint32 donutDistributionFactor : 5;
/// Used when the distribution mode is TRAPEZOID for quad and tri domain types. The number of donuts in the patch
/// are compared against this value to determine whether this donut gets split up into trapezoids (needs the patch
/// to be in donut mode). A value of 0 or 1 will be treated as 2. The innermost donut is never allowed to be broken
/// into trapezoids.
uint32 trapDistributionFactor : 3;
};
/// Values packed as 32-bit uint.
uint32 u32All;
};
/// Specifies options that direct command buffer building.
///
/// All pointer members are optional: each member's description below states what a null value means.
struct CmdBufferBuildInfo
{
/// Command buffer build flags, specifies optional hints to control command buffer build optimizations.
CmdBufferBuildFlags flags;
/// Command buffer inherited state and params. If non-null, related state is assumed set in root-level and nested
/// command buffer should not modify the software states. Any software params that may be needed within nested
/// command buffer needs to be provided here.
const InheritedStateParams* pInheritedState;
/// If non-null, the command buffer will begin with all states set as they are in this previously built command
/// buffer. Any state specified in pInheritedState is excluded if it is also provided.
const ICmdBuffer* pStateInheritCmdBuffer;
/// Optional allocator for PAL to use when allocating temporary memory during command buffer building. PAL will
/// stop using this allocator once command building ends. If no allocator is provided PAL will use an internally
/// managed allocator instead which may be less efficient. PAL will use this allocator in two ways:
/// + Temporary storage within a single command building call. PAL will rewind the allocator before returning to
/// free all memory allocated within the call.
/// + Temporary storage for the entire command building period. When Begin() is called, PAL will save the current
/// position of the allocator and rewind the allocator to that point when End() is called. If the client also
/// wishes to allocate temporary storage that lasts between command building function calls they must allocate it
/// before calling Begin() or PAL will accidentally free it.
Util::VirtualLinearAllocator* pMemAllocator;
/// Optional tessellation distribution factors that will overwrite PAL set defaults. Clients must also set the
/// optimizeTessDistributionFactors flag for these custom factors to take effect.
TessDistributionFactors clientTessDistributionFactors;
/// Number of context states per PBB bin.
/// Client must also set @ref CmdBufferBuildFlags::optimizeContextStatesPerBin for this to take effect.
uint8 contextStatesPerBin;
/// Number of persistent states per PBB bin.
/// Client must also set @ref CmdBufferBuildFlags::optimizePersistentStatesPerBin for this to take effect.
uint8 persistentStatesPerBin;
/// Client/app data handle. This can have an arbitrary value and is used to uniquely identify this command buffer.
uint64 execMarkerClientHandle;
};
/// Specifies info on how a compute shader should use resources.
///
/// For every member, a value of zero selects the default behavior described in that member's comment.
struct DynamicComputeShaderInfo
{
float maxWavesPerCu; ///< Limits the number of waves in flight per compute unit. This can be used to selectively
/// throttle certain workloads that bottleneck multiqueue applications. For ease of use, a
/// value of zero means no limit is set. The remaining valid values are in the range (0, 40]
/// and specify the maximum number of waves per compute unit. If the hardware has one wave
/// limit control for multiple shader stages PAL will select the most strict limit.
/// This option is converted internally to set the HW WavesPerSh setting and the non-integer
/// maxWavesPerCu value provides more flexibility to allow arbitrary WavesPerSh value; for
/// example specify less number of waves than number of CUs per shader array.
uint32 maxThreadGroupsPerCu; ///< Override the maximum number of threadgroups that a particular CS can run on,
/// throttling it, to enable more graphics work to complete. 0 disables the limit.
uint32 tgScheduleCountPerCu; ///< Override the number of threadgroups to schedule on a single compute unit before
/// moving to the next compute unit. 0 selects optimal default.
uint32 ldsBytesPerTg; ///< Override the amount of LDS space used per thread-group for this pipeline, in bytes.
/// Zero indicates that the LDS size determined at pipeline-compilation time will be used.
};
/// Specifies info on how a graphics shader should use resources.
struct DynamicGraphicsShaderInfo
{
float maxWavesPerCu; ///< Limits the number of waves in flight per compute unit. This can be used to selectively
/// throttle certain workloads that bottleneck multiqueue applications. For ease of use, a
/// value of zero means no limit is set. The remaining valid values are in the range (0, 40]
/// and specify the maximum number of waves per compute unit. If the hardware has one wave
/// limit control for multiple shader stages, PAL will select the strictest limit.
/// This option is converted internally to the HW WavesPerSh setting. Allowing a non-integer
/// maxWavesPerCu value makes arbitrary WavesPerSh values reachable; for example, specifying
/// fewer waves than the number of CUs per shader array.
};
/// Specifies dynamic states of a graphics pipeline.
///
/// The state values come first and are followed by the @ref enable union, which carries one flag per dynamic state
/// indicating whether that state is enabled. The layout of the state values depends on the client interface version:
/// before version 842 the enum-typed states are full-size members, while from 842 onward they are declared as
/// bitfields inside an anonymous struct.
struct DynamicGraphicsState
{
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 842
// Legacy layout: enum-typed states are stored as full-size members ahead of the boolean/count bitfields.
DepthClampMode depthClampMode; ///< Depth clamping behavior.
DepthRange depthRange; ///< Specifies Z dimensions of screen space (i.e., post viewport
/// transform: 0 to 1 or -1 to 1).
LogicOp logicOp; ///< Logic operation to perform.
uint32 colorWriteMask; ///< Color target write mask.
uint32 switchWinding : 1; ///< Whether to reverse vertex ordering for tessellation.
uint32 depthClipNearEnable : 1; ///< Enable clipping based on Near Z coordinate.
uint32 depthClipFarEnable : 1; ///< Enable clipping based on Far Z coordinate.
uint32 alphaToCoverageEnable : 1; ///< Enable alpha to coverage.
uint32 perpLineEndCapsEnable : 1; ///< Forces the use of perpendicular line end caps as opposed to
/// axis-aligned line end caps during line rasterization.
uint32 rasterizerDiscardEnable : 1; ///< Whether to kill all rasterized pixels.
uint32 dualSourceBlendEnable : 1; ///< Enable dual source blend.
uint32 vertexBufferCount : 6; ///< Number of vertex buffer slots which are accessed by this pipeline.
uint32 reserved : 19; ///< Reserved for future use.
#else
// Current layout: colorWriteMask is followed by an anonymous struct of bitfields whose declared widths total 32 bits.
uint32 colorWriteMask; ///< Color target write mask: 4 bits per render target (8 targets).
struct
{
uint32 switchWinding : 1; ///< Whether to reverse vertex ordering for tessellation.
uint32 depthClipNearEnable : 1; ///< Enable clipping based on Near Z coordinate.
uint32 depthClipFarEnable : 1; ///< Enable clipping based on Far Z coordinate.
uint32 alphaToCoverageEnable : 1; ///< Enable alpha to coverage.
uint32 perpLineEndCapsEnable : 1; ///< Forces the use of perpendicular line end caps as opposed to
/// axis-aligned line end caps during line rasterization.
uint32 rasterizerDiscardEnable : 1; ///< Whether to kill all rasterized pixels.
uint32 dualSourceBlendEnable : 1; ///< Enable dual source blend.
uint32 vertexBufferCount : 6; ///< Number of vertex buffer slots accessed by this pipeline.
LogicOp logicOp : 4; ///< Logic operation to perform.
DepthRange depthRange : 1; ///< Specifies Z dimensions of screen space (i.e., post viewport
/// transform: 0 to 1 or -1 to 1).
DepthClampMode depthClampMode : 2; ///< Depth clamping behavior.
uint32 reserved1 : 7; ///< Reserved.
uint32 reserved : 5; ///< Reserved for future use.
};
#endif
union
{
struct
{
uint32 depthClampMode : 1; ///< Whether to enable dynamic state depthClampMode.
uint32 depthRange : 1; ///< Whether to enable dynamic state depthRange.
uint32 logicOp : 1; ///< Whether to enable dynamic state logicOp.
uint32 colorWriteMask : 1; ///< Whether to enable dynamic state colorWriteMask.
uint32 switchWinding : 1; ///< Whether to enable dynamic state switchWinding.
uint32 depthClipMode : 1; ///< Whether to enable dynamic state depthClipNearEnable and
/// depthClipFarEnable (one flag covers both).
uint32 alphaToCoverageEnable : 1; ///< Whether to enable dynamic state alphaToCoverageEnable.
uint32 perpLineEndCapsEnable : 1; ///< Whether to enable dynamic state perpLineEndCapsEnable.
uint32 rasterizerDiscardEnable : 1; ///< Whether to enable dynamic state rasterizerDiscardEnable.
uint32 dualSourceBlendEnable : 1; ///< Whether to enable dynamic state dualSourceBlendEnable.
uint32 vertexBufferCount : 1; ///< Whether to enable dynamic state vertexBufferCount.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 842
uint32 reserved : 21; ///< Reserved for future use.
#else
uint32 reserved1 : 1; ///< Reserved.
uint32 reserved : 20; ///< Reserved for future use.
#endif
};
uint32 u32All; ///< All enable flags packed as a 32-bit uint.
} enable; ///< Per-state enable flags for the dynamic states above.
};
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 842
/// Specifies info on how graphics shaders should use resources.
///
/// Layout for client interface versions before 842: every shader stage has its own member, and the dynamic graphics
/// state and bind flags are embedded in this structure.
struct DynamicGraphicsShaderInfos
{
DynamicGraphicsShaderInfo vs; ///< Dynamic Vertex shader information.
DynamicGraphicsShaderInfo hs; ///< Dynamic Hull shader information.
DynamicGraphicsShaderInfo ds; ///< Dynamic Domain shader information.
DynamicGraphicsShaderInfo gs; ///< Dynamic Geometry shader information.
DynamicGraphicsShaderInfo ts; ///< Dynamic Task shader information.
DynamicGraphicsShaderInfo ms; ///< Dynamic Mesh shader information.
DynamicGraphicsShaderInfo ps; ///< Dynamic Pixel shader information.
DynamicGraphicsState dynamicState; ///< Dynamic state of graphics pipeline.
union
{
struct
{
uint32 reserved0 : 8; ///< Reserved.
uint32 reserved : 24; ///< Reserved for future use.
};
uint32 u32All; ///< Flags packed as 32-bit uint.
} flags; ///< BindPipeline flags.
};
#else
/// Specifies info on how graphics shaders should use resources.
///
/// Layout for client interface version 842 and later: the VS/HS/DS/GS members share storage with the TS/MS members,
/// and the @ref enable flags indicate which stages carry dynamic shader info. The dynamic graphics state is not part
/// of this structure in this layout.
struct DynamicGraphicsShaderInfos
{
union
{
// Either VS/HS/DS/GS or TS/MS are active, so the two stage sets overlay the same storage.
struct
{
DynamicGraphicsShaderInfo vs; ///< Dynamic Vertex shader information.
DynamicGraphicsShaderInfo hs; ///< Dynamic Hull shader information.
DynamicGraphicsShaderInfo ds; ///< Dynamic Domain shader information.
DynamicGraphicsShaderInfo gs; ///< Dynamic Geometry shader information.
};
struct
{
DynamicGraphicsShaderInfo ts; ///< Dynamic Task shader information.
DynamicGraphicsShaderInfo ms; ///< Dynamic Mesh shader information.
};
};
DynamicGraphicsShaderInfo ps; ///< Dynamic Pixel shader information.
union
{
struct
{
uint8 vs : 1; ///< If set, there is dynamic VS shader info.
uint8 hs : 1; ///< If set, there is dynamic HS shader info.
uint8 ds : 1; ///< If set, there is dynamic DS shader info.
uint8 gs : 1; ///< If set, there is dynamic GS shader info.
uint8 ps : 1; ///< If set, there is dynamic PS shader info.
uint8 ts : 1; ///< If set, there is dynamic TS shader info.
uint8 ms : 1; ///< If set, there is dynamic MS shader info.
uint8 reserved : 1; ///< Reserved.
};
uint8 u8All; ///< All enable flags packed as a single uint8.
} enable; ///< Per-stage flags indicating which stages have dynamic shader info.
};
#endif
/// Specifies parameters for binding a pipeline.
/// @see ICmdBuffer::CmdBindPipeline
struct PipelineBindParams
{
PipelineBindPoint pipelineBindPoint; ///< Specifies which type of pipeline is to be bound (compute or graphics).
const IPipeline* pPipeline; ///< New pipeline to be bound. Can be null in order to unbind a previously
/// bound pipeline without binding a new one.
uint64 apiPsoHash; ///< 64-bit identifier provided by client driver based on the Pipeline State
/// Object. The mapping from apiPsoHash to internalPipelineHash is
/// many-to-one.
// The compute and graphics dynamic info share storage.
// NOTE(review): presumably pipelineBindPoint selects which union member is read - confirm against the implementation.
union
{
DynamicComputeShaderInfo cs; ///< Dynamic Compute shader information.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 842
DynamicGraphicsShaderInfos graphics; ///< Dynamic Graphics shader information.
#else
struct
{
DynamicGraphicsShaderInfos gfxShaderInfo; ///< Dynamic Graphics shader information.
DynamicGraphicsState gfxDynState; ///< Dynamic state of graphics pipeline.
};
#endif
};
};
/// Specifies a per-MRT color target view and its current image state. Used as input to
/// ICmdBuffer::CmdBindTargets().
struct ColorTargetBindInfo
{
const IColorTargetView* pColorTargetView; ///< Color target view to bind.
ImageLayout imageLayout; ///< Specifies the current image layout based on bitmasks of currently
/// allowed operations and engines that may perform those operations.
/// At minimum, the LayoutColorTarget usage flag and the
/// LayoutUniversalEngine engine flag must be set.
};
/// Specifies a depth/stencil view and the current image state of its depth and stencil planes. Used as input to
/// ICmdBuffer::CmdBindTargets().
struct DepthStencilBindInfo
{
const IDepthStencilView* pDepthStencilView; ///< Depth/stencil target view to bind.
ImageLayout depthLayout; ///< Specifies the current image layout of the depth plane based on
/// bitmasks of currently allowed operations and engines that may
/// perform those operations. At minimum, the
/// LayoutDepthStencilTarget usage flag and the LayoutUniversalEngine
/// engine flag must be set. Ignored if the specified view does not
/// have a depth plane.
ImageLayout stencilLayout; ///< Specifies the current image layout of the stencil plane based on
/// bitmasks of currently allowed operations and engines that may
/// perform those operations. At minimum, the
/// LayoutDepthStencilTarget usage flag and the LayoutUniversalEngine
/// engine flag must be set. Ignored if the specified view does not
/// have a stencil plane.
};
/// Represents a GPU memory or image transition as part of a barrier.
///
/// A single transition will ensure cache coherency of dirty data in the specific set of source caches with the
/// specified set of destination caches. The source and destination designation is relative to the barrier itself
/// and does not indicate whether a particular cache is a read or write cache.
///
/// Typically a transition flushes written data from the source caches into the destination caches and thus the source
/// cache mask typically only contains write caches. However, the client is encouraged to include flags for any prior
/// read-only cache accesses as PAL may be able to optimize its cache operations.
///
/// If both cache masks are zero the client is indicating that no cache coherency operations are required but PAL
/// may still issue cache operations for internal reasons.
///
/// In addition, the client can change an image's layout usage/engine flags which may result in a metadata blt.
///
/// @note There is no range provided to control the range of addresses that will be flushed/invalidated in GPU caches.
struct BarrierTransition
{
uint32 srcCacheMask; ///< Bitmask of @ref CacheCoherencyUsageFlags describing previous write operations whose
/// results need to be visible for subsequent operations. Flags for prior read operations
/// may be included as well and may be used for internal optimizations.
uint32 dstCacheMask; ///< Bitmask of @ref CacheCoherencyUsageFlags describing the operations expected to read
/// and/or write data flushed from the caches indicated by the srcCacheMask.
struct
{
const IImage* pImage; ///< If non-null, indicates this transition only applies to the specified image.
/// The remaining members of this structure are ignored if this member is null.
SubresRange subresRange; ///< Subset of pImage this transition applies to. If newLayout includes @ref
/// LayoutUninitializedTarget this range must cover all subresources of pImage
/// unless the perSubresInit image create flag was specified.
ImageLayout oldLayout; ///< Specifies the current image layout based on bitmasks of allowed operations and
/// engines up to this point. These masks imply the previous compression state. No
/// usage flags should ever be set in oldLayout.usages that correspond to usages
/// that are not supported by the engine that is performing the transition. The
/// queue type performing the transition must be set in oldLayout.engines.
ImageLayout newLayout; ///< Specifies the upcoming image layout based on bitmasks of allowed operations and
/// engines after this point. These masks imply the upcoming compression state. A
/// difference between the usage masks of oldLayout and newLayout may result in a
/// decompression.
/// Specifies a custom sample pattern over a 2x2 pixel quad. The position for each sample is specified on a
/// grid where the pixel center is <0,0>, the top left corner of the pixel is <-8,-8>, and <7,7> is the maximum
/// valid position (not quite to the bottom/right border of the pixel).
/// Can be left null for non-MSAA images or when a valid MsaaQuadSamplePattern is bound prior to the CmdBarrier
/// call.
const MsaaQuadSamplePattern* pQuadSamplePattern;
} imageInfo; ///< Image-specific transition information.
};
/// Describes a barrier as inserted by a call to ICmdBuffer::CmdBarrier().
///
/// A barrier can be used to 1) stall GPU execution at a specified point to resolve a data hazard, 2) flush/invalidate
/// GPU caches to ensure data coherency, and/or 3) compress/decompress image resources as necessary when changing how
/// the GPU will use the image.
///
/// This structure directly specifies how #1 is performed. #2 and #3 are managed by the list of @ref BarrierTransition
/// structures passed in pTransitions.
struct BarrierInfo
{
/// Determines at what point the GPU should stall until all specified waits and transitions have completed. If the
/// specified wait point is unavailable, PAL will wait at the closest available earlier point.
HwPipePoint waitPoint;
uint32 pipePointWaitCount; ///< Number of entries in pPipePoints.
const HwPipePoint* pPipePoints; ///< The barrier will stall until the hardware pipeline has cleared
/// up to each point specified in this array. One entry in this
/// array is typically enough, but CS and GFX operate in parallel
/// at certain stages.
uint32 gpuEventWaitCount; ///< Number of entries in ppGpuEvents.
const IGpuEvent** ppGpuEvents; ///< The barrier will stall until each GPU event in this array is
/// in the set state.
uint32 rangeCheckedTargetWaitCount; ///< Number of entries in ppTargets.
const IImage** ppTargets; ///< The barrier will stall until all previous rendering with any
/// color or depth/stencil image in this list bound as a target
/// has completed. If one of the targets is a nullptr it will
/// perform a full range sync.
uint32 transitionCount; ///< Number of entries in pTransitions.
const BarrierTransition* pTransitions; ///< List of image/memory transitions to process. See
/// @ref BarrierTransition. The same subresource should never
/// be specified more than once in the list of transitions.
/// PAL assumes that all specified subresources are unique.
uint32 globalSrcCacheMask; ///< This is a global bitmask of @ref CacheCoherencyUsageFlags which is combined
/// (bitwise logical union) with the @ref BarrierTransition::srcCacheMask field of
/// every element in @ref pTransitions. If this is zero or if there are no
/// transitions, then no global cache flags are applied to any transition.
uint32 globalDstCacheMask; ///< This is a global bitmask of @ref CacheCoherencyUsageFlags which is combined
/// (bitwise logical union) with the @ref BarrierTransition::dstCacheMask field of
/// every element in @ref pTransitions. If this is zero or if there are no
/// transitions, then no global cache flags are applied to any transition.
uint32 reason; ///< The reason that the barrier was invoked.
};
/// Specifies execution dependencies, *availability* and/or *visibility* operations on a section of an IGpuMemory
/// object that does not contain valid IImage data. PAL may assume image data is not present and skip certain
/// cache operations.
///
/// PAL specifies these execution dependencies using pairs of synchronization scope bitmasks of
/// @ref PipelineStageFlag values. The barrier's execution dependencies are only applied to state in this barrier.
/// Memory coherency operations or layout transitions in other barriers will ignore this barrier's execution
/// dependencies.
///
/// PAL specifies these operations using pairs of access scope bitmasks of @ref CacheCoherencyUsageFlags values.
/// The source mask (named srcAccessMask or srcGlobalAccessMask) describes which prior write operations should be made
/// available (i.e., written back from local caches to the LLC). The destination mask (named dstAccessMask or
/// dstGlobalAccessMask) describes which upcoming read/write operations that need visibility (i.e., invalidate
/// corresponding local caches above the LLC). These masks may be zero if no cache operations are needed.
///
/// In general, PAL executes the availability and visibility operations in isolation because the CmdRelease functions
/// require that the destination masks be zero and the CmdAcquire functions require that the source masks be zero.
/// In essence, CmdRelease implements the availability operations and CmdAcquire implements the visibility operations.
/// However, CmdReleaseThenAcquire sees both masks and thus can optimize its cache operations.
///
/// To facilitate cache optimizations, the client is encouraged to add flags corresponding to prior read operations
/// in the relevant source mask(s). Unlike the usual write operation flags, these read flags are entirely optional
/// and do not impact correctness; if they are omitted PAL will simply issue the full set of cache operations.
/// If they are provided PAL may detect cases where future read operations use the same caches as the prior read
/// operations and thus can skip the usual visibility operations.
///
/// Note that,
/// 1. If the client does provide read operation flags in a source mask they *must* guarantee that the same flags
/// were provided to a prior barrier's destination mask(s). Incorrect behavior may occur otherwise.
/// 2. One @ref MemBarrier or @ref ImgBarrier object can only be applied to a single resource; otherwise PAL's
/// internal optimization may be incorrect. Don't OR multiple resource transitions' stage or access masks into one
/// @ref MemBarrier or @ref ImgBarrier when making a PAL barrier call. However, you are allowed to OR multiple
/// resource transitions' stage or access masks into the global transition mask.
///
/// This struct is used by @ref AcquireReleaseInfo.
struct MemBarrier