@@ -59,6 +59,91 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
ret void
}

+ ; Function Attrs: nounwind
+ define dso_local void @__tile_dpbf8ps(ptr %dst, ptr %src1, ptr %src2) #0 {
+ ; CHECK-LABEL: __tile_dpbf8ps:
+ ; CHECK: # %bb.0: # %entry
+ ; CHECK-NEXT: pushq %rbp
+ ; CHECK-NEXT: subq $4976, %rsp # imm = 0x1370
+ ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+ ; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
+ ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
+ ; CHECK-NEXT: movzwl (%rsi), %eax
+ ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+ ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+ ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+ ; CHECK-NEXT: movswq 2(%rdx), %rcx
+ ; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+ ; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+ ; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+ ; CHECK-NEXT: movswq 2(%rsi), %r8
+ ; CHECK-NEXT: movw %r8w, {{[0-9]+}}(%rsp)
+ ; CHECK-NEXT: movzwl %r8w, %r9d
+ ; CHECK-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
+ ; CHECK-NEXT: shrl $2, %r9d
+ ; CHECK-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
+ ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+ ; CHECK-NEXT: addq $64, %rdi
+ ; CHECK-NEXT: tileloadd (%rdi,%rcx), %tmm0
+ ; CHECK-NEXT: addq $64, %rsi
+ ; CHECK-NEXT: tileloadd (%rsi,%r8), %tmm1
+ ; CHECK-NEXT: addq $64, %rdx
+ ; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm2
+ ; CHECK-NEXT: movabsq $64, %rbp
+ ; CHECK-NEXT: tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
+ ; CHECK-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
+ ; CHECK-NEXT: tdpbf8ps %tmm2, %tmm1, %tmm3
+ ; CHECK-NEXT: tilestored %tmm3, (%rdi,%rcx)
+ ; CHECK-NEXT: tilestored %tmm0, 1920(%rsp,%rbp) # 1024-byte Folded Spill
+ ; CHECK-NEXT: tileloadd 1920(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
+ ; CHECK-NEXT: tdpbhf8ps %tmm2, %tmm1, %tmm3
+ ; CHECK-NEXT: tilestored %tmm3, (%rdi,%rcx)
+ ; CHECK-NEXT: tilestored %tmm0, 2944(%rsp,%rbp) # 1024-byte Folded Spill
+ ; CHECK-NEXT: tileloadd 2944(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
+ ; CHECK-NEXT: tdphbf8ps %tmm2, %tmm1, %tmm3
+ ; CHECK-NEXT: tilestored %tmm3, (%rdi,%rcx)
+ ; CHECK-NEXT: tdphf8ps %tmm2, %tmm1, %tmm0
+ ; CHECK-NEXT: tilestored %tmm0, (%rdi,%rcx)
+ ; CHECK-NEXT: addq $4976, %rsp # imm = 0x1370
+ ; CHECK-NEXT: popq %rbp
+ ; CHECK-NEXT: tilerelease
+ ; CHECK-NEXT: vzeroupper
+ ; CHECK-NEXT: retq
+ entry:
+ %0 = load i16, ptr %src1, align 64
+ %col = getelementptr inbounds nuw i8, ptr %src2, i64 2
+ %1 = load i16, ptr %col, align 2
+ %col1 = getelementptr inbounds nuw i8, ptr %src1, i64 2
+ %2 = load i16, ptr %col1, align 2
+ %tile = getelementptr inbounds nuw i8, ptr %dst, i64 64
+ %3 = load <256 x i32>, ptr %tile, align 64
+ %tile2 = getelementptr inbounds nuw i8, ptr %src1, i64 64
+ %4 = load <256 x i32>, ptr %tile2, align 64
+ %tile3 = getelementptr inbounds nuw i8, ptr %src2, i64 64
+ %5 = load <256 x i32>, ptr %tile3, align 64
+ %6 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %3)
+ %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4)
+ %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %5)
+
+ %9 = tail call x86_amx @llvm.x86.tdpbf8ps.internal(i16 %0, i16 %1, i16 %2, x86_amx %6, x86_amx %7, x86_amx %8)
+ %10 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %9)
+ store <256 x i32> %10, ptr %tile, align 64
+
+ %11 = tail call x86_amx @llvm.x86.tdpbhf8ps.internal(i16 %0, i16 %1, i16 %2, x86_amx %6, x86_amx %7, x86_amx %8)
+ %12 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %11)
+ store <256 x i32> %12, ptr %tile, align 64
+
+ %13 = tail call x86_amx @llvm.x86.tdphbf8ps.internal(i16 %0, i16 %1, i16 %2, x86_amx %6, x86_amx %7, x86_amx %8)
+ %14 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13)
+ store <256 x i32> %14, ptr %tile, align 64
+
+ %15 = tail call x86_amx @llvm.x86.tdphf8ps.internal(i16 %0, i16 %1, i16 %2, x86_amx %6, x86_amx %7, x86_amx %8)
+ %16 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %15)
+ store <256 x i32> %16, ptr %tile, align 64
+
+ ret void
+ }
+
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
@@ -67,3 +152,6 @@ declare x86_amx @llvm.x86.tdpbf8ps.internal(i16, i16, i16, x86_amx, x86_amx, x86
declare x86_amx @llvm.x86.tdpbhf8ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdphbf8ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdphf8ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+
+ attributes #0 = { nounwind }
+