buddy-compiler · R-Tars · Mar 3, 2025 · Mar 3, 2025 · Mar 3, 2025 · Mar 16, 2025
diff --git a/examples/MLIRLinalg/linalg-conv2d-f16.mlir b/examples/MLIRLinalg/linalg-conv2d-f16.mlir
@@ -0,0 +1,95 @@
+// RUN: buddy-opt %s \
+// RUN:     -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+// RUN:     -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+// RUN:     -convert-func-to-llvm -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+#map0 = affine_map<(d0, d1) -> (d0 + d1 - 1)>
+module {
+  func.func private @printMemrefF32(memref<*xf32>)
+
+  // Allocates a dynamically shaped 2-D f16 buffer with %arg0 x %arg1 elements
+  // and stores %arg2 into every element. The buffer is not freed here; it is
+  // handed back to the caller as the function result.
+  func.func @alloc_2d_filled_f16(%arg0: index, %arg1: index, %arg2: f16) -> memref<?x?xf16> {
+    %lb = arith.constant 0 : index
+    %step = arith.constant 1 : index
+    %buf = memref.alloc(%arg0, %arg1) : memref<?x?xf16>
+    // Outer loop walks dimension 0, inner loop walks dimension 1.
+    scf.for %row = %lb to %arg0 step %step {
+      scf.for %col = %lb to %arg1 step %step {
+        memref.store %arg2, %buf[%row, %col] : memref<?x?xf16>
+      }
+    }
+    return %buf : memref<?x?xf16>
+  }
+
+  // Thin wrapper around linalg.conv_2d on dynamically shaped f16 memrefs.
+  // The first two arguments are the op's inputs (main passes the image, then
+  // the filter); the third is the output buffer.
+  func.func @conv_2d(%input: memref<?x?xf16>, %kernel: memref<?x?xf16>, %result: memref<?x?xf16>) {
+    linalg.conv_2d
+      ins(%input, %kernel : memref<?x?xf16>, memref<?x?xf16>)
+      outs(%result : memref<?x?xf16>)
+    return
+  }
+
+  // Entry point. Convolves two square f16 images (filled with 2048.0 and
+  // 2049.0) with a 3x3 all-ones filter, widens each 8x8 f16 result to f32 and
+  // prints it through printMemrefF32. The expected output is pinned with
+  // FileCheck directives next to each print call, since the test pipeline
+  // pipes the program output into FileCheck.
+  func.func @main() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+
+    // Image and Output value.
+    %cst = arith.constant 1.000000e+00 : f16
+    %cst_0 = arith.constant 0.000000e+00 : f16
+    %cst_2048 = arith.constant 2048.000000e+00 : f16
+    // 2049.0 has no exact f16 encoding (representable values are 2 apart in
+    // [2048, 4096)), so it is expected to round to 2048.0; both convolutions
+    // are therefore expected to print identical values.
+    %cst_2049 = arith.constant 2049.000000e+00 : f16
+
+    // Image extent follows from #map0: output + filter - 1 = 8 + 3 - 1 = 10.
+    %current_filter = arith.constant 3 : index
+    %current_output = arith.constant 8 : index
+    %current_image = affine.apply #map0(%current_output, %current_filter)
+
+    // Filter.
+    %filter = call @alloc_2d_filled_f16(%current_filter, %current_filter, %cst) : (index, index, f16) -> memref<?x?xf16>
+    // Image.
+    %image1 = call @alloc_2d_filled_f16(%current_image, %current_image, %cst_2048) : (index, index, f16) -> memref<?x?xf16>
+
+    %image2 = call @alloc_2d_filled_f16(%current_image, %current_image, %cst_2049) : (index, index, f16) -> memref<?x?xf16>
+    // Output (zero-initialized).
+    %output1 = call @alloc_2d_filled_f16(%current_output, %current_output, %cst_0) : (index, index, f16) -> memref<?x?xf16>
+
+    %output2 = call @alloc_2d_filled_f16(%current_output, %current_output, %cst_0) : (index, index, f16) -> memref<?x?xf16>
+
+    call @conv_2d(%image1, %filter, %output1) : (memref<?x?xf16>, memref<?x?xf16>, memref<?x?xf16>) -> ()
+
+    call @conv_2d(%image2, %filter, %output2) : (memref<?x?xf16>, memref<?x?xf16>, memref<?x?xf16>) -> ()
+
+    // Convert f16 output to f32 for printing (only an f32 print helper is
+    // declared in this module).
+    %output_f32_1 = memref.alloc(%current_output, %current_output) : memref<?x?xf32>
+    scf.for %i = %c0 to %current_output step %c1 {
+      scf.for %j = %c0 to %current_output step %c1 {
+        %val_f16 = memref.load %output1[%i, %j] : memref<?x?xf16>
+        %val_f32 = arith.extf %val_f16 : f16 to f32
+        memref.store %val_f32, %output_f32_1[%i, %j] : memref<?x?xf32>
+      }
+    }
+    %print_output1 = memref.cast %output_f32_1 : memref<?x?xf32> to memref<*xf32>
+    // Each output element is the sum of nine 2048.0 * 1.0 products = 18432.
+    // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [8, 8] strides = [8, 1] data =
+    // CHECK-COUNT-8: [18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432]
+    call @printMemrefF32(%print_output1) : (memref<*xf32>) -> ()
+
+    %output_f32_2 = memref.alloc(%current_output, %current_output) : memref<?x?xf32>
+    scf.for %i = %c0 to %current_output step %c1 {
+      scf.for %j = %c0 to %current_output step %c1 {
+        %val_f16 = memref.load %output2[%i, %j] : memref<?x?xf16>
+        %val_f32 = arith.extf %val_f16 : f16 to f32
+        memref.store %val_f32, %output_f32_2[%i, %j] : memref<?x?xf32>
+      }
+    }
+    %print_output2 = memref.cast %output_f32_2 : memref<?x?xf32> to memref<*xf32>
+    // Same values as the first result because of the f16 rounding noted above.
+    // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [8, 8] strides = [8, 1] data =
+    // CHECK-COUNT-8: [18432, 18432, 18432, 18432, 18432, 18432, 18432, 18432]
+    call @printMemrefF32(%print_output2) : (memref<*xf32>) -> ()
+
+    // Release every buffer allocated above.
+    memref.dealloc %image1 : memref<?x?xf16>
+    memref.dealloc %image2 : memref<?x?xf16>
+    memref.dealloc %filter : memref<?x?xf16>
+    memref.dealloc %output1 : memref<?x?xf16>
+    memref.dealloc %output2 : memref<?x?xf16>
+    memref.dealloc %output_f32_1 : memref<?x?xf32>
+    memref.dealloc %output_f32_2 : memref<?x?xf32>
+    return
+  }
+}
diff --git a/examples/MLIRLinalg/linalg-matmul-f16.mlir b/examples/MLIRLinalg/linalg-matmul-f16.mlir
@@ -0,0 +1,91 @@
+// RUN: buddy-opt %s \
+// RUN:     -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+// RUN:     -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+// RUN:     -convert-func-to-llvm -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+module {
+    func.func private @printMemrefF32(memref<*xf32>)
+
+    // Builds a dynamically shaped 2-D f16 memref of %arg0 x %arg1 elements with
+    // every element set to %arg2, and returns it without freeing it.
+    func.func @alloc_2d_filled_f16(%arg0: index, %arg1: index, %arg2: f16) -> memref<?x?xf16> {
+        %first = arith.constant 0 : index
+        %stride = arith.constant 1 : index
+        %mem = memref.alloc(%arg0, %arg1) : memref<?x?xf16>
+        // Visit every (r, c) position: dimension 0 outermost, dimension 1 innermost.
+        scf.for %r = %first to %arg0 step %stride {
+            scf.for %c = %first to %arg1 step %stride {
+                memref.store %arg2, %mem[%r, %c] : memref<?x?xf16>
+            }
+        }
+        return %mem : memref<?x?xf16>
+    }
+
+    // Thin wrapper around linalg.matmul on dynamically shaped f16 memrefs:
+    // the first two arguments are the op's inputs, the third its output buffer.
+    func.func @matmul(%lhs : memref<?x?xf16>, %rhs : memref<?x?xf16>, %out : memref<?x?xf16>) {
+        linalg.matmul ins(%lhs, %rhs : memref<?x?xf16>, memref<?x?xf16>)
+                      outs(%out : memref<?x?xf16>)
+        return
+    }
+
+    // Entry point. Multiplies two 2x2 f16 matrices (filled with 2048.0 and
+    // 2049.0) by a 2x2 matrix filled with 2.0, widens both f16 results to f32
+    // and prints them through printMemrefF32. The expected output is pinned
+    // with FileCheck directives next to the print calls, since the test
+    // pipeline pipes the program output into FileCheck.
+    func.func @main() {
+        // Set up dims.
+        %cM = arith.constant 2 : index
+        %cN = arith.constant 2 : index
+        %cK = arith.constant 2 : index
+
+        // Set Init Value.
+        %cf2048 = arith.constant 2048.000000e+00 : f16
+        // 2049.0 has no exact f16 encoding (representable values are 2 apart
+        // in [2048, 4096)), so it is expected to round to 2048.0; both
+        // products are therefore expected to print identical values.
+        %cf2049 = arith.constant 2049.000000e+00 : f16
+        %cf0 = arith.constant 0.000000e+00 : f16
+        %cf2 = arith.constant 2.000000e+00 : f16
+
+        // Allocate and initialize matrices
+        %A1 = call @alloc_2d_filled_f16(%cM, %cK, %cf2048) : (index, index, f16) -> memref<?x?xf16>
+        %A2 = call @alloc_2d_filled_f16(%cM, %cK, %cf2049) : (index, index, f16) -> memref<?x?xf16>
+        %B = call @alloc_2d_filled_f16(%cK, %cN, %cf2) : (index, index, f16) -> memref<?x?xf16>
+        %C1 = call @alloc_2d_filled_f16(%cM, %cN, %cf0) : (index, index, f16) -> memref<?x?xf16>
+        %C2 = call @alloc_2d_filled_f16(%cM, %cN, %cf0) : (index, index, f16) -> memref<?x?xf16>
+
+        call @matmul(%A1, %B, %C1) : (memref<?x?xf16>, memref<?x?xf16>, memref<?x?xf16>) -> ()
+        call @matmul(%A2, %B, %C2) : (memref<?x?xf16>, memref<?x?xf16>, memref<?x?xf16>) -> ()
+
+        // Convert f16 output to f32 for printing (only an f32 print helper is
+        // declared in this module).
+        %C1_f32 = memref.alloc(%cM, %cN) : memref<?x?xf32>
+        %c0 = arith.constant 0 : index
+        %c1 = arith.constant 1 : index
+        scf.for %i = %c0 to %cM step %c1 {
+            scf.for %j = %c0 to %cN step %c1 {
+                %val_f16 = memref.load %C1[%i, %j] : memref<?x?xf16>
+                %val_f32 = arith.extf %val_f16 : f16 to f32
+                memref.store %val_f32, %C1_f32[%i, %j] : memref<?x?xf32>
+            }
+        }
+
+        %C2_f32 = memref.alloc(%cM, %cN) : memref<?x?xf32>
+        scf.for %i = %c0 to %cM step %c1 {
+            scf.for %j = %c0 to %cN step %c1 {
+                %val_f16 = memref.load %C2[%i, %j] : memref<?x?xf16>
+                %val_f32 = arith.extf %val_f16 : f16 to f32
+                memref.store %val_f32, %C2_f32[%i, %j] : memref<?x?xf32>
+            }
+        }
+
+        // Print output.
+        // Each element is 2048.0 * 2.0 summed over K = 2, i.e. 8192, for both
+        // results (see the rounding note on the 2049.0 constant).
+        // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [2, 2] strides = [2, 1] data =
+        // CHECK-COUNT-2: [8192, 8192]
+        // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [2, 2] strides = [2, 1] data =
+        // CHECK-COUNT-2: [8192, 8192]
+        %print_C1 = memref.cast %C1_f32 : memref<?x?xf32> to memref<*xf32>
+        %print_C2 = memref.cast %C2_f32 : memref<?x?xf32> to memref<*xf32>
+        call @printMemrefF32(%print_C1) : (memref<*xf32>) -> ()
+        call @printMemrefF32(%print_C2) : (memref<*xf32>) -> ()
+
+        // Deallocations
+        memref.dealloc %A1 : memref<?x?xf16>
+        memref.dealloc %A2 : memref<?x?xf16>
+        memref.dealloc %B : memref<?x?xf16>
+        memref.dealloc %C1 : memref<?x?xf16>
+        memref.dealloc %C2 : memref<?x?xf16>
+        memref.dealloc %C1_f32 : memref<?x?xf32>
+        memref.dealloc %C2_f32 : memref<?x?xf32>
+        return
+    }
+}
diff --git a/examples/MLIRLinalg/makefile b/examples/MLIRLinalg/makefile
@@ -39,6 +39,13 @@ linalg-conv2d-run:
 		-convert-func-to-llvm -reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
+# Lower linalg-conv2d-f16.mlir through the passes below (order matters) and
+# pipe the result into ${MLIR_CPU_RUNNER}, which executes @main.
+linalg-conv2d-f16-run:
+	@${MLIR_OPT} linalg-conv2d-f16.mlir ${MLIR_OPT_OPTIONS} \
+		-convert-linalg-to-loops \
+		-lower-affine \
+		-convert-scf-to-cf \
+		-convert-vector-to-llvm \
+		-finalize-memref-to-llvm \
+		-convert-arith-to-llvm \
+		-convert-func-to-llvm \
+		-reconcile-unrealized-casts \
+	| ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+		-shared-libs=${MLIR_RUNNER_UTILS} \
+		-shared-libs=${MLIR_C_RUNNER_UTILS}
+
 linalg-conv2d-tiling-lower:
 	@${MLIR_OPT} ./linalg-conv2d.mlir \
 		-test-transform-dialect-interpreter \
@@ -73,7 +80,7 @@ linalg-conv2d_nhwc_fhwc-optimize-run:
 		-convert-func-to-llvm -reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
-		
+
 linalg-conv2d_nhwc_fhwc-tile-optimize-lower:
 	@${BUDDY_OPT} linalg-conv2d_nhwc_fhwc.mlir \
 		-conv-nhwc-fhwc-tile-optimize="vec-size=16 tiling-height=2 tiling-width=3" \
@@ -142,6 +149,13 @@ linalg-matmul-run:
 		-convert-func-to-llvm -reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
+# Lower linalg-matmul-f16.mlir through the passes below (order matters) and
+# pipe the result into ${MLIR_CPU_RUNNER}, which executes @main.
+linalg-matmul-f16-run:
+	@${MLIR_OPT} linalg-matmul-f16.mlir ${MLIR_OPT_OPTIONS} \
+		-convert-linalg-to-loops \
+		-lower-affine \
+		-convert-scf-to-cf \
+		-convert-vector-to-llvm \
+		-finalize-memref-to-llvm \
+		-convert-arith-to-llvm \
+		-convert-func-to-llvm \
+		-reconcile-unrealized-casts \
+	| ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+		-shared-libs=${MLIR_RUNNER_UTILS} \
+		-shared-libs=${MLIR_C_RUNNER_UTILS}
+
 linalg-matmul-optimize-lower:
 	@${BUDDY_OPT} linalg-matmul.mlir ${MLIR_OPT_OPTIONS} \
 		--matmul-optimize="vec-size=16 kernel-m=2 kernel-n=4" \