diff --git a/examples/MLIRLinalg/linalg-conv2d-f16.mlir b/examples/MLIRLinalg/linalg-conv2d-f16.mlir
new file mode 100644
index 0000000000..e91191f3a1
--- /dev/null
+++ b/examples/MLIRLinalg/linalg-conv2d-f16.mlir
@@ -0,0 +1,95 @@
+// RUN: buddy-opt %s \
+// RUN:   -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+// RUN:   -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+// RUN:   -convert-func-to-llvm -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:   -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+#map0 = affine_map<(d0, d1) -> (d0 + d1 - 1)>
+module {
+  func.func private @printMemrefF32(memref<*xf32>)
+
+  func.func @alloc_2d_filled_f16(%arg0: index, %arg1: index, %arg2: f16) -> memref<?x?xf16> {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = memref.alloc(%arg0, %arg1) : memref<?x?xf16>
+    scf.for %arg3 = %c0 to %arg0 step %c1 {
+      scf.for %arg4 = %c0 to %arg1 step %c1 {
+        memref.store %arg2, %0[%arg3, %arg4] : memref<?x?xf16>
+      }
+    }
+    return %0 : memref<?x?xf16>
+  }
+
+  func.func @conv_2d(%arg0: memref<?x?xf16>, %arg1: memref<?x?xf16>, %arg2: memref<?x?xf16>) {
+    linalg.conv_2d ins (%arg0, %arg1: memref<?x?xf16>, memref<?x?xf16>)
+                   outs (%arg2: memref<?x?xf16>)
+    return
+  }
+
+  func.func @main() {
+    %c2 = arith.constant 2 : index
+    %c3 = arith.constant 3 : index
+
+    // Filter, image, and output values.
+    %cst = arith.constant 1.000000e+00 : f16
+    %cst_0 = arith.constant 0.000000e+00 : f16
+    %cst_2048 = arith.constant 2048.000000e+00 : f16
+    %cst_2049 = arith.constant 2049.000000e+00 : f16 // 2049 is not exactly representable in f16; it rounds to 2048.
+
+    %current_filter = arith.constant 3 : index
+    %current_output = arith.constant 8 : index
+    %current_image = affine.apply #map0(%current_output, %current_filter)
+
+    // Filter.
+    %filter = call @alloc_2d_filled_f16(%current_filter, %current_filter, %cst) : (index, index, f16) -> memref<?x?xf16>
+    // Image.
+    %image1 = call @alloc_2d_filled_f16(%current_image, %current_image, %cst_2048) : (index, index, f16) -> memref<?x?xf16>
+
+    %image2 = call @alloc_2d_filled_f16(%current_image, %current_image, %cst_2049) : (index, index, f16) -> memref<?x?xf16>
+    // Output.
+    %output1 = call @alloc_2d_filled_f16(%current_output, %current_output, %cst_0) : (index, index, f16) -> memref<?x?xf16>
+
+    %output2 = call @alloc_2d_filled_f16(%current_output, %current_output, %cst_0) : (index, index, f16) -> memref<?x?xf16>
+
+    call @conv_2d(%image1, %filter, %output1) : (memref<?x?xf16>, memref<?x?xf16>, memref<?x?xf16>) -> ()
+
+    call @conv_2d(%image2, %filter, %output2) : (memref<?x?xf16>, memref<?x?xf16>, memref<?x?xf16>) -> ()
+
+    // Convert the f16 outputs to f32 for printing.
+    %output_f32_1 = memref.alloc(%current_output, %current_output) : memref<?x?xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    scf.for %i = %c0 to %current_output step %c1 {
+      scf.for %j = %c0 to %current_output step %c1 {
+        %val_f16 = memref.load %output1[%i, %j] : memref<?x?xf16>
+        %val_f32 = arith.extf %val_f16 : f16 to f32
+        memref.store %val_f32, %output_f32_1[%i, %j] : memref<?x?xf32>
+      }
+    }
+    %print_output1 = memref.cast %output_f32_1 : memref<?x?xf32> to memref<*xf32>
+    call @printMemrefF32(%print_output1) : (memref<*xf32>) -> ()
+
+    %output_f32_2 = memref.alloc(%current_output, %current_output) : memref<?x?xf32>
+    scf.for %i = %c0 to %current_output step %c1 {
+      scf.for %j = %c0 to %current_output step %c1 {
+        %val_f16 = memref.load %output2[%i, %j] : memref<?x?xf16>
+        %val_f32 = arith.extf %val_f16 : f16 to f32
+        memref.store %val_f32, %output_f32_2[%i, %j] : memref<?x?xf32>
+      }
+    }
+    %print_output2 = memref.cast %output_f32_2 : memref<?x?xf32> to memref<*xf32>
+    call @printMemrefF32(%print_output2) : (memref<*xf32>) -> ()
+
+    memref.dealloc %image1 : memref<?x?xf16>
+    memref.dealloc %image2 : memref<?x?xf16>
+    memref.dealloc %filter : memref<?x?xf16>
+    memref.dealloc %output1 : memref<?x?xf16>
+    memref.dealloc %output2 : memref<?x?xf16>
+    memref.dealloc %output_f32_1 : memref<?x?xf32>
+    memref.dealloc %output_f32_2 : memref<?x?xf32>
+    return
+  }
+}
diff --git a/examples/MLIRLinalg/linalg-matmul-f16.mlir b/examples/MLIRLinalg/linalg-matmul-f16.mlir
new file mode 100644
index 0000000000..082e2b03f0
--- /dev/null
+++ b/examples/MLIRLinalg/linalg-matmul-f16.mlir
@@ -0,0 +1,91 @@
+// RUN: buddy-opt %s \
+// RUN:   -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+// RUN:   -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+// RUN:   -convert-func-to-llvm -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:   -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+module {
+  func.func private @printMemrefF32(memref<*xf32>)
+
+  func.func @alloc_2d_filled_f16(%arg0: index, %arg1: index, %arg2: f16) -> memref<?x?xf16> {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = memref.alloc(%arg0, %arg1) : memref<?x?xf16>
+    scf.for %arg3 = %c0 to %arg0 step %c1 {
+      scf.for %arg4 = %c0 to %arg1 step %c1 {
+        memref.store %arg2, %0[%arg3, %arg4] : memref<?x?xf16>
+      }
+    }
+    return %0 : memref<?x?xf16>
+  }
+
+  func.func @matmul(%a : memref<?x?xf16>, %b : memref<?x?xf16>, %c : memref<?x?xf16>) {
+    linalg.matmul
+      ins(%a, %b: memref<?x?xf16>, memref<?x?xf16>)
+      outs(%c: memref<?x?xf16>)
+    return
+  }
+
+  func.func @main() {
+    // Set up dims.
+    %cM = arith.constant 2 : index
+    %cN = arith.constant 2 : index
+    %cK = arith.constant 2 : index
+
+    // Set init values.
+    %cf2048 = arith.constant 2048.000000e+00 : f16
+    %cf2049 = arith.constant 2049.000000e+00 : f16 // 2049 is not exactly representable in f16; it rounds to 2048.
+    %cf0 = arith.constant 0.000000e+00 : f16
+    %cf2 = arith.constant 2.000000e+00 : f16
+
+    // Allocate and initialize the matrices.
+    %A1 = call @alloc_2d_filled_f16(%cM, %cK, %cf2048) : (index, index, f16) -> memref<?x?xf16>
+    %A2 = call @alloc_2d_filled_f16(%cM, %cK, %cf2049) : (index, index, f16) -> memref<?x?xf16>
+    %B = call @alloc_2d_filled_f16(%cK, %cN, %cf2) : (index, index, f16) -> memref<?x?xf16>
+    %C1 = call @alloc_2d_filled_f16(%cM, %cN, %cf0) : (index, index, f16) -> memref<?x?xf16>
+    %C2 = call @alloc_2d_filled_f16(%cM, %cN, %cf0) : (index, index, f16) -> memref<?x?xf16>
+
+    call @matmul(%A1, %B, %C1) : (memref<?x?xf16>, memref<?x?xf16>, memref<?x?xf16>) -> ()
+    call @matmul(%A2, %B, %C2) : (memref<?x?xf16>, memref<?x?xf16>, memref<?x?xf16>) -> ()
+
+    // Convert the f16 outputs to f32 for printing.
+    %C1_f32 = memref.alloc(%cM, %cN) : memref<?x?xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    scf.for %i = %c0 to %cM step %c1 {
+      scf.for %j = %c0 to %cN step %c1 {
+        %val_f16 = memref.load %C1[%i, %j] : memref<?x?xf16>
+        %val_f32 = arith.extf %val_f16 : f16 to f32
+        memref.store %val_f32, %C1_f32[%i, %j] : memref<?x?xf32>
+      }
+    }
+
+    %C2_f32 = memref.alloc(%cM, %cN) : memref<?x?xf32>
+    scf.for %i = %c0 to %cM step %c1 {
+      scf.for %j = %c0 to %cN step %c1 {
+        %val_f16 = memref.load %C2[%i, %j] : memref<?x?xf16>
+        %val_f32 = arith.extf %val_f16 : f16 to f32
+        memref.store %val_f32, %C2_f32[%i, %j] : memref<?x?xf32>
+      }
+    }
+
+    // Print output.
+    %print_C1 = memref.cast %C1_f32 : memref<?x?xf32> to memref<*xf32>
+    %print_C2 = memref.cast %C2_f32 : memref<?x?xf32> to memref<*xf32>
+    call @printMemrefF32(%print_C1) : (memref<*xf32>) -> ()
+    call @printMemrefF32(%print_C2) : (memref<*xf32>) -> ()
+
+    // Deallocations.
+    memref.dealloc %A1 : memref<?x?xf16>
+    memref.dealloc %A2 : memref<?x?xf16>
+    memref.dealloc %B : memref<?x?xf16>
+    memref.dealloc %C1 : memref<?x?xf16>
+    memref.dealloc %C2 : memref<?x?xf16>
+    memref.dealloc %C1_f32 : memref<?x?xf32>
+    memref.dealloc %C2_f32 : memref<?x?xf32>
+    return
+  }
+}
diff --git a/examples/MLIRLinalg/makefile b/examples/MLIRLinalg/makefile
index 865a0b162c..c4cb61b4d3 100644
--- a/examples/MLIRLinalg/makefile
+++ b/examples/MLIRLinalg/makefile
@@ -39,6 +39,13 @@ linalg-conv2d-run:
 	-convert-func-to-llvm -reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
+linalg-conv2d-f16-run:
+	@${MLIR_OPT} linalg-conv2d-f16.mlir ${MLIR_OPT_OPTIONS} \
+	-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+	-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+	-convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
 linalg-conv2d-tiling-lower:
 	@${MLIR_OPT} ./linalg-conv2d.mlir \
 	-test-transform-dialect-interpreter \
@@ -73,7 +80,7 @@ linalg-conv2d_nhwc_fhwc-optimize-run:
 	-convert-func-to-llvm -reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
- 
+
 linalg-conv2d_nhwc_fhwc-tile-optimize-lower:
 	@${BUDDY_OPT} linalg-conv2d_nhwc_fhwc.mlir \
 	-conv-nhwc-fhwc-tile-optimize="vec-size=16 tiling-height=2 tiling-width=3" \
@@ -142,6 +149,13 @@ linalg-matmul-run:
 	-convert-func-to-llvm -reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
+linalg-matmul-f16-run:
+	@${MLIR_OPT} linalg-matmul-f16.mlir ${MLIR_OPT_OPTIONS} \
+	-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+	-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+	-convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
 linalg-matmul-optimize-lower:
 	@${BUDDY_OPT} linalg-matmul.mlir ${MLIR_OPT_OPTIONS} \
 	--matmul-optimize="vec-size=16 kernel-m=2 kernel-n=4" \