Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,27 @@
#error No extension available for FP16.
#endif
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_buffer_reference2 : require
#extension GL_KHR_cooperative_matrix : require
#extension GL_KHR_memory_scope_semantics : require
#extension GL_NV_cooperative_matrix2 : require
#extension GL_EXT_float_e4m3 : require
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

// Forward declaration of the buffer-reference type; its body is defined further down.
layout(buffer_reference) buffer A_buffer_ref;

// Specialization constant 0 (defaults to 0). tensor_layouts() uses it as the
// clamp-mode argument of a tensor layout.
layout(constant_id = 0) const uint Clamp = 0u;

// Buffer-reference block exposing a runtime-sized float16_t array.
// decodeLoad() takes a value of this type as its first parameter.
layout(buffer_reference, std430) buffer A_buffer_ref
{
float16_t data_a[];
};

// Storage buffer at set 0, binding 0. The instance name _157 is what
// load_stores() uses to reach data_a.
layout(set = 0, binding = 0, std430) buffer A_buffer
{
float16_t data_a[];
} _157;

void accum_to_a_cast()
{
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> Accum = coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator>(float16_t(0.0));
Expand All @@ -34,9 +49,99 @@ void value_cast()
// Converts a zero-initialised float16_t accumulator matrix into a floate4m3_t
// matrix with use B by calling saturatedConvertEXT.
void saturated_cast()
{
    coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> Accum = coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator>(float16_t(0.0));
    // The conversion writes into a temporary, which is then copied into B.
    // Only one temporary/B pair may exist: declaring B twice in this scope
    // is a redefinition error.
    coopmat<floate4m3_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseB> _107;
    saturatedConvertEXT(_107, Accum);
    coopmat<floate4m3_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseB> B = _107;
}

// Creates 2-D tensor layouts and passes one of them through the clamp-value,
// block-size, dimension and stride setter built-ins.
void tensor_layouts()
{
    // Layout created with the built-in constant clamp mode.
    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> baseLayout = createTensorLayoutNV(2u, gl_CooperativeMatrixClampModeConstantNV);
    // Layout whose clamp mode is supplied by the Clamp specialization constant.
    tensorLayoutNV<2, (Clamp)> specClampLayout = createTensorLayoutNV(2u, (Clamp));
    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> clampValueLayout = setTensorLayoutClampValueNV(baseLayout, 42u);

    // The repeated setter calls are kept one-for-one.
    baseLayout = setTensorLayoutBlockSizeNV(baseLayout, 1u, 16u);
    baseLayout = setTensorLayoutBlockSizeNV(baseLayout, 1u, 16u);

    baseLayout = setTensorLayoutDimensionNV(baseLayout, 128u, 128u);
    baseLayout = setTensorLayoutDimensionNV(baseLayout, 128u, 128u);
    baseLayout = setTensorLayoutDimensionNV(baseLayout, 128u, 128u);

    baseLayout = setTensorLayoutStrideNV(baseLayout, 1u, 1u);
}

// Decode callback handed to coopMatLoadTensorNV in load_stores().
// Both coordinate arrays are ignored; the result is always element 0 of buf.data_a.
float16_t decodeLoad(const in A_buffer_ref buf, const in uint blockCoord[2], const in uint coordInBlock[2])
{
    float16_t firstElement = buf.data_a[0];
    return firstElement;
}

// Calls coopMatLoadTensorNV / coopMatStoreTensorNV with each argument
// combination used by this shader: layout only, sliced layout, layout + view,
// layout + decode callback, and layout + view + decode callback.
void load_stores()
{
// Passed unchanged as the third argument of every load and store below.
uint offset = 17u;
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> layout1 = createTensorLayoutNV(2u, gl_CooperativeMatrixClampModeConstantNV);
tensorViewNV<2u, false, 0u, 1u> view = createTensorViewNV(2u, false, 0u, 1u);
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseA> A;
// Each load targets its own fresh temporary, which is then copied into A.
// Load with the plain layout.
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseA> _163;
coopMatLoadTensorNV(_163, _157.data_a, offset, layout1);
A = _163;
// Load with a sliced layout.
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseA> _169;
coopMatLoadTensorNV(_169, _157.data_a, offset, sliceTensorLayoutNV(layout1, 16u, 16u, 0u, 16u));
A = _169;
// Load with layout and tensor view.
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseA> _175;
coopMatLoadTensorNV(_175, _157.data_a, offset, layout1, view);
A = _175;
// Load with layout and the decodeLoad callback.
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseA> _180;
coopMatLoadTensorNV(_180, _157.data_a, offset, layout1, decodeLoad);
A = _180;
// Load with layout, tensor view and the decodeLoad callback.
coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseA> _186;
coopMatLoadTensorNV(_186, _157.data_a, offset, layout1, view, decodeLoad);
A = _186;
// Store A back through a sliced layout, without and then with the tensor view.
coopMatStoreTensorNV(A, _157.data_a, offset, sliceTensorLayoutNV(layout1, 16u, 16u, 0u, 16u));
coopMatStoreTensorNV(A, _157.data_a, offset, sliceTensorLayoutNV(layout1, 16u, 16u, 0u, 16u), view);
}

// Combine callback for coopMatReduceNV: returns the larger of x and y.
float16_t maxReduce(const in float16_t x, const in float16_t y)
{
    float16_t larger = max(x, y);
    return larger;
}

// Combine callback that forwards to maxReduce, so the callback itself
// contains a nested function call.
float16_t maxReduceIndirect(const in float16_t x, const in float16_t y)
{
    float16_t forwarded = maxReduce(x, y);
    return forwarded;
}

// Per-element callback for coopMatPerElementNV: returns exp(elem).
// The row and col arguments are not used.
float16_t Exp(const in uint row, const in uint col, const in float16_t elem)
{
    float16_t exponentiated = exp(elem);
    return exponentiated;
}

// Per-element callback taking one extra argument: returns exp(elem) when
// `maybe` is true and elem unchanged otherwise. row and col are not used.
float16_t ExpWithArg(const in uint row, const in uint col, const in float16_t elem, const in bool maybe)
{
    // Pass-through case first.
    if (!maybe)
    {
        return elem;
    }
    return exp(elem);
}

// Calls the callback-taking built-ins: coopMatReduceNV in row, row-and-column
// and 2x2 modes, then coopMatPerElementNV without and with an extra argument.
void callback_functions()
{
    // Declared without an initialiser, as in the original.
    coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> acc;

    // Row reduction of acc.
    coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> rowResult;
    coopMatReduceNV(rowResult, acc, gl_CooperativeMatrixReduceRowNV, maxReduce);
    coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> maxima = rowResult;

    // Row-and-column reduction of the previous result, written back over it.
    coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> rowColResult;
    coopMatReduceNV(rowColResult, maxima, gl_CooperativeMatrixReduceRowAndColumnNV, maxReduce);
    maxima = rowColResult;

    // 2x2 reduction of the 16x16 matrix into an 8x8 matrix.
    coopmat<float16_t, gl_ScopeSubgroup, 8u, 8u, gl_MatrixUseAccumulator> pooledResult;
    coopMatReduceNV(pooledResult, acc, gl_CooperativeMatrixReduce2x2NV, maxReduceIndirect);
    coopmat<float16_t, gl_ScopeSubgroup, 8u, 8u, gl_MatrixUseAccumulator> pooled = pooledResult;

    // Per-element callback with no extra argument.
    coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> expResult;
    coopMatPerElementNV(expResult, acc, Exp);
    acc = expResult;

    // Per-element callback with one extra argument.
    coopmat<float16_t, gl_ScopeSubgroup, 16u, 16u, gl_MatrixUseAccumulator> expArgResult;
    coopMatPerElementNV(expArgResult, acc, ExpWithArg, true);
    acc = expArgResult;
}

void main()
Expand All @@ -45,5 +150,8 @@ void main()
accum_to_b_cast();
value_cast();
saturated_cast();
tensor_layouts();
load_stores();
callback_functions();
}

89 changes: 89 additions & 0 deletions shaders-no-opt/vulkan/comp/nv-coopmat-2.vk.nocompat.spv16.comp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,38 @@
#extension GL_KHR_memory_scope_semantics : require
#extension GL_KHR_cooperative_matrix : require
#extension GL_NV_cooperative_matrix2 : require
#extension GL_EXT_buffer_reference : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_float_e5m2 : require
#extension GL_EXT_float_e4m3 : require

// Workgroup size of 1 in x.
layout(local_size_x = 1) in;
// Specialization constant 0 (defaults to 0); tensor_layouts() uses it as a clamp mode.
layout(constant_id = 0) const uint32_t Clamp = 0;
// Storage buffer at binding 0 with no instance name, so data_a is referenced directly.
layout (binding = 0) buffer A_buffer {float16_t data_a[];};
// Buffer-reference type with the same member; it is the first parameter type of decodeLoad().
layout(buffer_reference, std430, buffer_reference_align = 8) buffer A_buffer_ref {float16_t data_a[];};

// Per-element callback for coopMatPerElementNV: returns exp(elem).
// The row and col arguments are not used.
float16_t Exp(const in uint32_t row, const in uint32_t col, const in float16_t elem)
{
    float16_t exponentiated = exp(elem);
    return exponentiated;
}

// Per-element callback taking one extra argument: returns exp(elem) when
// `maybe` is true and elem unchanged otherwise. row and col are not used.
float16_t ExpWithArg(const in uint32_t row, const in uint32_t col, const in float16_t elem, const in bool maybe)
{
    // Pass-through case first.
    if (!maybe) {
        return elem;
    }
    return exp(elem);
}

// Combine callback for coopMatReduceNV: returns the larger of x and y.
float16_t maxReduce(const in float16_t x, const in float16_t y) {
    float16_t larger = max(x, y);
    return larger;
}

// Combine callback that forwards to maxReduce, so the callback itself
// contains a nested function call.
float16_t maxReduceIndirect(const in float16_t x, const in float16_t y) {
    float16_t forwarded = maxReduce(x, y);
    return forwarded;
}

void accum_to_a_cast()
{
Expand Down Expand Up @@ -41,10 +68,72 @@ void saturated_cast()
saturatedConvertEXT(B, Accum);
}

// Creates 2-D tensor layouts and passes one of them through the clamp-value,
// block-size, dimension and stride setter built-ins, using both literal and
// named-constant arguments.
void tensor_layouts() {
    // Layout created with the built-in constant clamp mode.
    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> baseLayout = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
    // Layout whose clamp mode is supplied by the Clamp specialization constant.
    tensorLayoutNV<2, Clamp> specClampLayout = createTensorLayoutNV(2, Clamp);
    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> clampValueLayout = setTensorLayoutClampValueNV(baseLayout, 42);

    // Block size: once through a named constant, once through a literal.
    const uint32_t blockExtent = 16;
    baseLayout = setTensorLayoutBlockSizeNV(baseLayout, 1, blockExtent);
    baseLayout = setTensorLayoutBlockSizeNV(baseLayout, 1, 16);

    // Dimensions through signed constants; the repeated call is kept as in the original.
    const int dimN = 128;
    const int dimD = 128;
    const int dimKV = 128;
    baseLayout = setTensorLayoutDimensionNV(baseLayout, dimN, dimD);
    baseLayout = setTensorLayoutDimensionNV(baseLayout, dimKV, dimD);
    baseLayout = setTensorLayoutDimensionNV(baseLayout, dimKV, dimD);

    baseLayout = setTensorLayoutStrideNV(baseLayout, 1, 1);
}


// Creates 2-D tensor views, with and without explicit permutation arguments,
// and passes one of them through the clip, dimension and stride setters.
void tensor_views() {
    // View type and constructor both without permutation arguments.
    tensorViewNV<2, false> plainView = createTensorViewNV(2, false);
    // View type spelling out the permutation 0, 1; the constructor call omits it.
    tensorViewNV<2, false, 0, 1> explicitView = createTensorViewNV(2, false);
    // View with the permutation 1, 0 given in both the type and the constructor.
    tensorViewNV<2, false, 1, 0> swappedView = createTensorViewNV(2, false, 1, 0);

    plainView = setTensorViewClipNV(plainView, 0, 16, 0, 16);
    plainView = setTensorViewDimensionsNV(plainView, 256, 256);
    plainView = setTensorViewStrideNV(plainView, 2, 1);
}

// Decode callback handed to coopMatLoadTensorNV in load_stores().
// Both coordinate arrays are ignored; the result is always element 0 of buf.data_a.
float16_t decodeLoad(const in A_buffer_ref buf, const in uint32_t blockCoord[2], const in uint32_t coordInBlock[2]) {
    float16_t firstElement = buf.data_a[0];
    return firstElement;
}

// Calls coopMatLoadTensorNV / coopMatStoreTensorNV with each argument
// combination used by this shader. Every call uses the same matrix, the same
// buffer member and the same offset.
void load_stores() {
    uint32_t elementOffset = 17;
    coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> matA;
    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> baseLayout = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
    tensorViewNV<2, false, 0, 1> identityView = createTensorViewNV(2, false, 0, 1);

    // Loads: layout only, sliced layout, layout + view,
    // layout + decode callback, layout + view + decode callback.
    coopMatLoadTensorNV(matA, data_a, elementOffset, baseLayout);
    coopMatLoadTensorNV(matA, data_a, elementOffset, sliceTensorLayoutNV(baseLayout, 16, 16, 0, 16));
    coopMatLoadTensorNV(matA, data_a, elementOffset, baseLayout, identityView);
    coopMatLoadTensorNV(matA, data_a, elementOffset, baseLayout, decodeLoad);
    coopMatLoadTensorNV(matA, data_a, elementOffset, baseLayout, identityView, decodeLoad);

    // Stores: sliced layout without and then with the view.
    coopMatStoreTensorNV(matA, data_a, elementOffset, sliceTensorLayoutNV(baseLayout, 16, 16, 0, 16));
    coopMatStoreTensorNV(matA, data_a, elementOffset, sliceTensorLayoutNV(baseLayout, 16, 16, 0, 16), identityView);
}

// Calls the callback-taking built-ins: coopMatReduceNV in row, row-and-column
// and 2x2 modes, then coopMatPerElementNV without and with an extra argument.
void callback_functions() {
    // Declared without an initialiser, as in the original.
    coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> acc;
    coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> maxima;

    // Row reduction, then a row-and-column reduction that uses the same
    // matrix as both destination and source.
    coopMatReduceNV(maxima, acc, gl_CooperativeMatrixReduceRowNV, maxReduce);
    coopMatReduceNV(maxima, maxima, gl_CooperativeMatrixReduceRowAndColumnNV, maxReduce);

    // 2x2 reduction of the 16x16 matrix into an 8x8 matrix, through the forwarding callback.
    coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseAccumulator> pooled;
    coopMatReduceNV(pooled, acc, gl_CooperativeMatrixReduce2x2NV, maxReduceIndirect);

    // Per-element callbacks applied in place: no extra argument, then one extra argument.
    coopMatPerElementNV(acc, acc, Exp);
    coopMatPerElementNV(acc, acc, ExpWithArg, true);
}

// Entry point: calls each test function once, in a fixed order.
// NOTE(review): tensor_views() is defined in this shader but is not called here — confirm that is intentional.
void main()
{
accum_to_a_cast();
accum_to_b_cast();
value_cast();
saturated_cast();
tensor_layouts();
load_stores();
callback_functions();
}
21 changes: 21 additions & 0 deletions spirv_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ void join_helper(StringStream<> &stream, T &&t, Ts &&... ts)
}
} // namespace inner

// Largest dimension count a tensor view can have. The spec says of Dims:
// "The value must be greater than zero and less than or equal to 5."
static constexpr size_t TENSOR_VIEW_NV_MAX_DIMS = 5;

class Bitset
{
public:
Expand Down Expand Up @@ -584,6 +587,8 @@ struct SPIRType : IVariant
AccelerationStructure,
RayQuery,
CoopVecNV,
TensorLayoutNV,
TensorViewNV,

// Keep internal types at the end.
ControlPointArray,
Expand Down Expand Up @@ -642,6 +647,19 @@ struct SPIRType : IVariant
uint32_t rank;
uint32_t shape;
} tensor;

struct
{
uint32_t dim_id;
uint32_t clamp_mode_id;
} tensorLayoutNv;

struct
{
uint32_t dim_id;
uint32_t has_dimensions_id;
int32_t dim_ids[TENSOR_VIEW_NV_MAX_DIMS];
} tensorViewNv;
} ext;

spv::StorageClass storage = spv::StorageClassGeneric;
Expand Down Expand Up @@ -1041,6 +1059,9 @@ struct SPIRFunction : IVariant
BlockID entry_block = 0;
SmallVector<BlockID> blocks;
SmallVector<CombinedImageSamplerParameter> combined_parameters;
// SPV_NV_cooperative_matrix2 uses lambdas where all parameters need
// to be annotated as `const in`
bool used_as_lambda = false;

struct EntryLine
{
Expand Down
Loading
Loading