| 1 | +import itertools |
| 2 | + |
| 3 | +import numpy as np |
1 | 4 | import pytest
2 | 5 | import torch
3 | 6 | import pathlib
7 | 10 |
8 | 11 |
9 | 12 | @pytest.mark.parametrize("M, N", [[256, 64], [256, 32], [128, 32], [128, 16], [128, 8], [64, 64], [64, 32], [32, 32]])
| 13 | +class DpasLayout: |
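| | +    """DPAS (dot-product accumulate systolic) layout description; str() renders the #triton_intel_gpu.dpas MLIR attribute spliced into the TTGIR below.""" |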
| 14 | + |
| 15 | +    def __init__(self, repeatCount, systolic_depth, execution_size, ops_per_chan, threads_per_warp, warps_per_cta, |
| 16 | +                 rep_cluster): |
| 17 | +        self.repeatCount = repeatCount |
| 18 | +        self.systolic_depth = systolic_depth |
| 19 | +        self.execution_size = execution_size |
| 20 | +        self.ops_per_chan = ops_per_chan |
| 21 | +        self.threads_per_warp = threads_per_warp |
| 22 | +        self.warps_per_cta = warps_per_cta |
| 23 | +        self.rep_cluster = rep_cluster |
| 24 | + |
| 25 | +    def __str__(self): |
| 26 | +        return f"#triton_intel_gpu.dpas<{{repeatCount={self.repeatCount}, systolicDepth={self.systolic_depth}, executionSize = {self.execution_size}, opsPerChan = {self.ops_per_chan}, threadsPerWarp = {self.threads_per_warp}, warpsPerCTA={self.warps_per_cta}, repCluster={self.rep_cluster}}}>" |
| 27 | + |
| 28 | + |
| 29 | +def warps_per_cta(layout): |
| 30 | +    return layout.warps_per_cta |
| 31 | + |
| 32 | + |
| 33 | +@pytest.mark.parametrize("M, N", [[256, 64], [256, 32], [128, 32], [64, 64], [64, 32], [32, 32]]) |
10 | 34 | @pytest.mark.parametrize("dtype_str", ["float32", "float16", "int8"])
11 | 35 | @pytest.mark.parametrize("transpose", [True, False])
12 | 36 | @pytest.mark.skipif(not is_xpu(), reason="Block load tests are specific to the XPU backend")
@@ -79,3 +103,107 @@ def test_block_load_dpas_layout(M, N, dtype_str, transpose, device, tmp_path: pa
79 | 103 | kernel[(1, 1, 1)](a, x, b, y)
80 | 104 | #import pdb; pdb.set_trace()
81 | 105 | assert torch.equal(a, x) and torch.equal(b.T if transpose else b, y)
| 106 | + |
| 107 | + |
| 108 | +layouts = [ |
| 109 | +    # Layout for Xe2 and Xe2+ |
| 110 | +    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=4, threads_per_warp=16, |
| 111 | +               warps_per_cta=[1, 4], rep_cluster=[1, 2]), |
| 112 | +    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=2, threads_per_warp=16, |
| 113 | +               warps_per_cta=[8, 4], rep_cluster=[4, 2]), |
| 114 | +    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=1, threads_per_warp=16, |
| 115 | +               warps_per_cta=[8, 4], rep_cluster=[1, 1]), |
| 116 | +    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=4, threads_per_warp=32, |
| 117 | +               warps_per_cta=[1, 4], rep_cluster=[1, 2]), |
| 118 | +    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=2, threads_per_warp=32, |
| 119 | +               warps_per_cta=[8, 4], rep_cluster=[4, 2]), |
| 120 | +    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=1, threads_per_warp=32, |
| 121 | +               warps_per_cta=[8, 4], rep_cluster=[1, 1]), |
| 122 | +    # Layout for Xe |
| 123 | +] |
| 124 | + |
| 125 | + |
| 126 | +@pytest.mark.parametrize("M, N", list(itertools.product([32, 64, 128, 256], repeat=2))) |
| 127 | +@pytest.mark.parametrize("dtype_str", ["float32", "float16", "int8"]) |
| 128 | +@pytest.mark.parametrize("layout", layouts) |
| 129 | +@pytest.mark.skipif(not is_xpu(), reason="Block load tests are specific to the XPU backend") |
| 130 | +def test_tensor_pointer_block_load(M, N, dtype_str, layout, device, tmp_path: pathlib.Path): |
| 131 | + |
| 132 | +    warps = warps_per_cta(layout) |
| 133 | +    num_warps = int(np.prod(warps)) |
| 134 | +    threads_per_warp = layout.threads_per_warp |
| 135 | +    ops_per_chan = layout.ops_per_chan |
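| | +    # Operand kWidth for the DPAS dot: B packs ops_per_chan values per channel, A packs half as many |
| | +    # (1 when ops_per_chan == 1); these feed the kWidth of #dot_a / #dot_b in the TTGIR below. |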
| 136 | +    A_width = 1 if ops_per_chan == 1 else ops_per_chan // 2 |
| 137 | +    B_width = ops_per_chan |
| 138 | + |
| 139 | +    ty = {"float32": "f32", "float16": "f16", "int8": "i8"}[dtype_str] |
| 140 | + |
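| | +    # Whether the device supports 2D block IO; this gates the triton_intel_gpu.support_sg_2d_block |
| | +    # module attribute emitted in the TTGIR below. |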
| 141 | +    support_block_io = torch.xpu.get_device_capability()['has_subgroup_2d_block_io'] |
| 142 | + |
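| | +    # TTGIR kernel: build MxN pointer tensors for the A- and B-operand layouts, load each tile with the |
| | +    # triton_intel_gpu.block_io = "row_major" hint, and store it back out so the host can verify the round trip. |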
| 143 | +    ir = f""" |
| 144 | + #mma = {layout} |
| 145 | + #dot_a = #ttg.dot_op<{{opIdx = 0, parent = #mma, kWidth = {A_width}}}> |
| 146 | + #dot_b = #ttg.dot_op<{{opIdx = 1, parent = #mma, kWidth = {B_width}}}> |
| 147 | + module attributes {{triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_bf16_conversion, triton_intel_gpu.support_dpas, {"triton_intel_gpu.support_sg_2d_block," if support_block_io else ""} triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = {num_warps} : i32, ttg.target = "xpu", "ttg.threads-per-warp" = {threads_per_warp} : i32}} {{ |
| 148 | + tt.func public @tensor_pointer_block_load(%arg0: !tt.ptr<{ty}> {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr<{ty}> {{tt.divisibility = 16 : i32}}, %arg6: i32 {{tt.divisibility = 16 : i32}}, %arg2: !tt.ptr<{ty}> {{tt.divisibility = 16: i32}}, %arg3: !tt.ptr<{ty}> {{tt.divisibility = 16: i32}}, %arg7: i32 {{tt.divisibility = 16 : i32}}) attributes {{noinline = false}} {{ |
| 149 | + // A matrix |
| 150 | + %1 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #dot_a}}>> |
| 151 | + %2 = tt.expand_dims %1 {{axis = 1 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #dot_a}}>> -> tensor<{M}x1xi32, #dot_a> |
| 152 | + %3 = tt.splat %arg6 : i32 -> tensor<{M}x1xi32, #dot_a> |
| 153 | + %4 = arith.muli %2, %3 : tensor<{M}x1xi32, #dot_a> |
| 154 | + %5 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #dot_a}}>> |
| 155 | + %6 = tt.expand_dims %5 {{axis = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #dot_a}}>> -> tensor<1x{N}xi32, #dot_a> |
| 156 | + %7 = tt.broadcast %4 : tensor<{M}x1xi32, #dot_a> -> tensor<{M}x{N}xi32, #dot_a> |
| 157 | + %8 = tt.broadcast %6 : tensor<1x{N}xi32, #dot_a> -> tensor<{M}x{N}xi32, #dot_a> |
| 158 | + %9 = arith.addi %7, %8 : tensor<{M}x{N}xi32, #dot_a> |
| 159 | + |
| 160 | + %10 = tt.splat %arg0 : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a> |
| 161 | + %11 = tt.addptr %10, %9 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>, tensor<{M}x{N}xi32, #dot_a> |
| 162 | + %12 = tt.load %11 {{triton_intel_gpu.block_io = "row_major"}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a> |
| 163 | + %13 = tt.splat %arg1 : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a> |
| 164 | + %14 = tt.addptr %13, %9 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>, tensor<{M}x{N}xi32, #dot_a> |
| 165 | + tt.store %14, %12 {{boundaryCheck = array<i32: 0, 1>}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a> |
| 166 | + |
| 167 | + // B matrix |
| 168 | + %22 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #dot_b}}>> |
| 169 | + %44 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #dot_b}}>> |
| 170 | + %46 = tt.expand_dims %44 {{axis = 1 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #dot_b}}>> -> tensor<{M}x1xi32, #dot_b> |
| 171 | + %48 = tt.splat %arg7 : i32 -> tensor<{M}x1xi32, #dot_b> |
| 172 | + %49 = arith.muli %46, %48 : tensor<{M}x1xi32, #dot_b> |
| 173 | + %50 = tt.expand_dims %22 {{axis = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #dot_b}}>> -> tensor<1x{N}xi32, #dot_b> |
| 174 | + %51 = tt.broadcast %49 : tensor<{M}x1xi32, #dot_b> -> tensor<{M}x{N}xi32, #dot_b> |
| 175 | + %52 = tt.broadcast %50 : tensor<1x{N}xi32, #dot_b> -> tensor<{M}x{N}xi32, #dot_b> |
| 176 | + %53 = arith.addi %51, %52 : tensor<{M}x{N}xi32, #dot_b> |
| 177 | + |
| 178 | + %54 = tt.splat %arg2 : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b> |
| 179 | + %55 = tt.addptr %54, %53 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>, tensor<{M}x{N}xi32, #dot_b> |
| 180 | + %56 = tt.load %55 {{triton_intel_gpu.block_io = "row_major"}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b> |
| 181 | + %57 = tt.splat %arg3 : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b> |
| 182 | + %58 = tt.addptr %57, %53 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>, tensor<{M}x{N}xi32, #dot_b> |
| 183 | + tt.store %58, %56 {{boundaryCheck = array<i32: 0, 1>}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b> |
| 184 | + |
| 185 | + tt.return |
| 186 | + }} |
| 187 | + }} |
| 188 | + """ |
| 189 | + |
| 190 | +    torch_dtype = getattr(torch, dtype_str) |
| 191 | +    if torch_dtype.is_floating_point: |
| 192 | +        a = torch.randn((M, N), dtype=torch_dtype, device=device) |
| 193 | +    else: |
| 194 | +        a = torch.randint(low=-127, high=128, size=(M, N), dtype=torch_dtype, device=device) |
| 195 | + |
| 196 | +    x = torch.empty_like(a) |
| 197 | +    y = torch.empty_like(a) |
| 198 | + |
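| | +    # Write the TTGIR to a temp file and compile it directly; Triton picks the starting |
| | +    # compilation stage from the .ttgir suffix. |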
| 199 | +    temp_file = tmp_path / "test_tensor_pointer_block_load.ttgir" |
| 200 | +    temp_file.write_text(ir) |
| 201 | +    kernel = triton.compile(str(temp_file)) |
| 202 | + |
| 203 | +    if support_block_io: |
| 204 | +        # assert '2d block io' in kernel.asm['llir'] |
| 205 | +        pass |
| 206 | + |
| 207 | +    kernel[(1, 1, 1)](a, x, a.stride(0), a, y, a.stride(0)) |
| 208 | + |
| 209 | +    assert torch.equal(a, x) and torch.equal(a, y) |