diff --git a/benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/BatchMatMulTransposeB.mlir b/benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/BatchMatMulTransposeB.mlir
new file mode 100644
index 00000000..82e44129
--- /dev/null
+++ b/benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/BatchMatMulTransposeB.mlir
@@ -0,0 +1,6 @@
+func.func @kernel_placeholder(%a : memref<?x?x?xf32>, %b : memref<?x?x?xf32>, %c : memref<?x?x?xf32>) {
+  linalg.batch_matmul_transpose_b
+      ins(%a, %b: memref<?x?x?xf32>, memref<?x?x?xf32>)
+      outs(%c: memref<?x?x?xf32>)
+  return
+}
diff --git a/benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/CMakeLists.txt b/benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/CMakeLists.txt
new file mode 100644
index 00000000..bbecb8b1
--- /dev/null
+++ b/benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/CMakeLists.txt
@@ -0,0 +1,145 @@
+add_executable(dl-op-batch-matmul-transpose-b-benchmark
+  Main.cpp
+)
+target_link_libraries(dl-op-batch-matmul-transpose-b-benchmark GoogleBenchmark)
+
+# CMAKE_C_FLAGS is set when configuring cmake.
+separate_arguments(CLANG_FLAGS_LIST UNIX_COMMAND "${CMAKE_C_FLAGS}")
+
+# Scalar baseline, compiled at -O0.
+add_custom_command(OUTPUT batch_matmul_transpose_b_scalar_O0.o
+  COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/BatchMatMulTransposeB.mlir |
+          sed -e {s/@kernel_placeholder/@batch_matmul_transpose_b_scalar_O0/} |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+            -pass-pipeline
+            "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
+          ${BUDDY_MLIR_BINARY_DIR}/buddy-opt
+            -arith-expand
+            -eliminate-empty-tensors
+            -empty-tensor-to-alloc-tensor
+            -one-shot-bufferize
+            -convert-linalg-to-affine-loops
+            -affine-loop-fusion
+            -affine-parallelize
+            -lower-affine
+            -func-bufferize
+            -arith-bufferize
+            -tensor-bufferize
+            -buffer-deallocation
+            -finalizing-bufferize
+            -convert-vector-to-scf
+            -expand-strided-metadata
+            -convert-vector-to-llvm
+            -memref-expand
+            -arith-expand
+            -convert-arith-to-llvm
+            -finalize-memref-to-llvm
+            -convert-scf-to-cf
+            -llvm-request-c-wrappers
+            -convert-arith-to-llvm
+            -convert-math-to-llvm
+            -convert-math-to-libm
+            -convert-func-to-llvm
+            -reconcile-unrealized-casts |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir -o batch_matmul_transpose_b_scalar_O0.ll
+  COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O0 ${CLANG_FLAGS_LIST} batch_matmul_transpose_b_scalar_O0.ll
+          -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/batch_matmul_transpose_b_scalar_O0.o
+  VERBATIM)
+
+add_library(batch_matmul_transpose_b_scalar_O0 STATIC batch_matmul_transpose_b_scalar_O0.o)
+set_target_properties(batch_matmul_transpose_b_scalar_O0 PROPERTIES LINKER_LANGUAGE CXX)
+target_link_libraries(dl-op-batch-matmul-transpose-b-benchmark
+  batch_matmul_transpose_b_scalar_O0
+)
+
+# Scalar baseline, compiled at -O3.
+add_custom_command(OUTPUT batch_matmul_transpose_b_scalar_O3.o
+  COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/BatchMatMulTransposeB.mlir |
+          sed -e {s/@kernel_placeholder/@batch_matmul_transpose_b_scalar_O3/} |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+            -pass-pipeline
+            "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
+          ${BUDDY_MLIR_BINARY_DIR}/buddy-opt
+            -arith-expand
+            -eliminate-empty-tensors
+            -empty-tensor-to-alloc-tensor
+            -one-shot-bufferize
+            -convert-linalg-to-affine-loops
+            -affine-loop-fusion
+            -affine-parallelize
+            -lower-affine
+            -func-bufferize
+            -arith-bufferize
+            -tensor-bufferize
+            -buffer-deallocation
+            -finalizing-bufferize
+            -convert-vector-to-scf
+            -expand-strided-metadata
+            -convert-vector-to-llvm
+            -memref-expand
+            -arith-expand
+            -convert-arith-to-llvm
+            -finalize-memref-to-llvm
+            -convert-scf-to-cf
+            -llvm-request-c-wrappers
+            -convert-arith-to-llvm
+            -convert-math-to-llvm
+            -convert-math-to-libm
+            -convert-func-to-llvm
+            -reconcile-unrealized-casts |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir -o batch_matmul_transpose_b_scalar_O3.ll
+  COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} batch_matmul_transpose_b_scalar_O3.ll
+          -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/batch_matmul_transpose_b_scalar_O3.o
+  VERBATIM)
+
+add_library(batch_matmul_transpose_b_scalar_O3 STATIC batch_matmul_transpose_b_scalar_O3.o)
+set_target_properties(batch_matmul_transpose_b_scalar_O3 PROPERTIES LINKER_LANGUAGE CXX)
+target_link_directories(dl-op-batch-matmul-transpose-b-benchmark PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_libraries(dl-op-batch-matmul-transpose-b-benchmark
+  batch_matmul_transpose_b_scalar_O3
+)
+
+# Buddy vectorized variant (-batchmatmul-transpose-b-vectorization).
+add_custom_command(OUTPUT batch_matmul_transpose_b_vec.o
+  COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/BatchMatMulTransposeB.mlir |
+          sed -e {s/@kernel_placeholder/@batch_matmul_transpose_b_vec/} |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+            -pass-pipeline
+            "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
+          ${BUDDY_MLIR_BINARY_DIR}/buddy-opt
+            -arith-expand
+            -eliminate-empty-tensors
+            -empty-tensor-to-alloc-tensor
+            -one-shot-bufferize
+            -func-bufferize
+            -arith-bufferize
+            -tensor-bufferize
+            -buffer-deallocation
+            -finalizing-bufferize
+            -batchmatmul-transpose-b-vectorization
+            -convert-linalg-to-affine-loops
+            -affine-loop-fusion
+            -lower-affine
+            -convert-vector-to-scf
+            -expand-strided-metadata
+            -convert-vector-to-llvm
+            -memref-expand
+            -arith-expand
+            -convert-arith-to-llvm
+            -finalize-memref-to-llvm
+            -convert-scf-to-cf
+            -llvm-request-c-wrappers
+            -convert-arith-to-llvm
+            -convert-math-to-llvm
+            -convert-math-to-libm
+            -convert-func-to-llvm
+            -reconcile-unrealized-casts |
+          ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir -o batch_matmul_transpose_b_vec.ll
+  COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} batch_matmul_transpose_b_vec.ll
+          -c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/batch_matmul_transpose_b_vec.o
+  VERBATIM)
+
+add_library(batch_matmul_transpose_b_vec STATIC batch_matmul_transpose_b_vec.o)
+set_target_properties(batch_matmul_transpose_b_vec PROPERTIES LINKER_LANGUAGE CXX)
+target_link_libraries(dl-op-batch-matmul-transpose-b-benchmark
+  batch_matmul_transpose_b_vec
+)
+
+# Build the target for your new method here.
diff --git a/benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/Main.cpp b/benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/Main.cpp
new file mode 100644
index 00000000..3e1cc45e
--- /dev/null
+++ b/benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/Main.cpp
@@ -0,0 +1,134 @@
+//===- Main.cpp -----------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the main file of Batch Matmul TransposeBOp benchmark.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Utils.hpp"
+#include <benchmark/benchmark.h>
+#include <buddy/Core/Container.h>
+
+// -----------------------------------------------------------------------------
+// Benchmark Configuration. You can change the number here as needed.
+// -----------------------------------------------------------------------------
+
+#define _NUM_ITER 1
+#define _SIZE_BATCH 4
+#define _SIZE_N 40
+#define _SIZE_K 256
+#define _SIZE_M 256
+
+// -----------------------------------------------------------------------------
+// Global Variables and Functions. No need to change the code here.
+// -----------------------------------------------------------------------------
+
+// C[b,m,n] = sum_k A[b,m,k] * B[b,n,k]; B is stored pre-transposed as (batch, N, K).
+intptr_t sizesInput1[3] = {_SIZE_BATCH, _SIZE_M, _SIZE_K};
+intptr_t sizesInput2[3] = {_SIZE_BATCH, _SIZE_N, _SIZE_K};
+intptr_t sizesOutput[3] = {_SIZE_BATCH, _SIZE_M, _SIZE_N};
+float *input1 = nullptr;
+float *input2 = nullptr;
+MemRef<float, 3> input1MemRef(sizesInput1);
+MemRef<float, 3> input2MemRef(sizesInput2);
+
+// Runs the provided BatchMatMulTransposeB function for benchmarking.
+template <typename Func>
+void DL_OPS_BATCH_MATMUL_TRANSPOSE_B(benchmark::State &state, Func func) {
+  MemRef<float, 3> outputMemRef(sizesOutput, 0.0);
+  for (auto _ : state) {
+    func(&input1MemRef, &input2MemRef, &outputMemRef);
+  }
+  benchmark::DoNotOptimize(outputMemRef);
+}
+
+using MLIRFunctionType = void (*)(MemRef<float, 3> *, MemRef<float, 3> *,
+                                  MemRef<float, 3> *);
+// Verifies the result of an MLIR-based function against expected output.
+void MLIRVerification(float *outputExpected, MLIRFunctionType MLIRFunc,
+                      const std::string &name) {
+  MemRef<float, 3> outputMemRef(sizesOutput, 0);
+  MLIRFunc(&input1MemRef, &input2MemRef, &outputMemRef);
+  float *outputOptimized = outputMemRef.getData();
+  batch_matmul_transpose_b::verify(outputExpected, outputOptimized,
+                                   _SIZE_BATCH, _SIZE_M * _SIZE_N, name);
+}
+
+// -----------------------------------------------------------------------------
+// MLIR Benchmark. You can compare your new method with other methods here.
+// -----------------------------------------------------------------------------
+
+extern "C" {
+void _mlir_ciface_batch_matmul_transpose_b_scalar_O0(MemRef<float, 3> *A,
+                                                     MemRef<float, 3> *B,
+                                                     MemRef<float, 3> *C);
+void _mlir_ciface_batch_matmul_transpose_b_scalar_O3(MemRef<float, 3> *A,
+                                                     MemRef<float, 3> *B,
+                                                     MemRef<float, 3> *C);
+void _mlir_ciface_batch_matmul_transpose_b_vec(MemRef<float, 3> *A,
+                                               MemRef<float, 3> *B,
+                                               MemRef<float, 3> *C);
+/// [Step 1] Add function of your new method.
+}
+BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL_TRANSPOSE_B, Scalar_O0,
+                  _mlir_ciface_batch_matmul_transpose_b_scalar_O0)
+    ->Unit(benchmark::kMillisecond)
+    ->Iterations(_NUM_ITER);
+BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL_TRANSPOSE_B, Scalar_O3,
+                  _mlir_ciface_batch_matmul_transpose_b_scalar_O3)
+    ->Unit(benchmark::kMillisecond)
+    ->Iterations(_NUM_ITER);
+BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL_TRANSPOSE_B, Vec,
+                  _mlir_ciface_batch_matmul_transpose_b_vec)
+    ->Unit(benchmark::kMillisecond)
+    ->Iterations(_NUM_ITER);
+
+/// [Step 2] Call GoogleBenchmark function to run your new method.
+
+// -----------------------------------------------------------------------------
+// Main Function. You can verify the correctness of your new method here.
+// -----------------------------------------------------------------------------
+
+int main(int argc, char **argv) {
+  // Initialize input data. Allocation sizes must match the memref shapes:
+  // A is (BATCH, M, K); B is (BATCH, N, K) because the op takes B transposed.
+  input1 = batch_matmul_transpose_b::allocArray<float>(_SIZE_BATCH * _SIZE_M,
+                                                       _SIZE_K);
+  input2 = batch_matmul_transpose_b::allocArray<float>(_SIZE_BATCH * _SIZE_N,
+                                                       _SIZE_K);
+  input1MemRef = MemRef<float, 3>(input1, sizesInput1);
+  input2MemRef = MemRef<float, 3>(input2, sizesInput2);
+
+  // Run benchmark.
+  ::benchmark::Initialize(&argc, argv);
+  ::benchmark::RunSpecifiedBenchmarks();
+
+  std::cout << "\033[34m---------- Verification ----------\033[0m" << std::endl;
+  // Attain scalar output results as expected output results in verification.
+  MemRef<float, 3> outputMemrefScalar(sizesOutput, 0);
+  _mlir_ciface_batch_matmul_transpose_b_scalar_O0(&input1MemRef, &input2MemRef,
+                                                  &outputMemrefScalar);
+  float *outputExpected = outputMemrefScalar.getData();
+
+  MLIRVerification(outputExpected,
+                   _mlir_ciface_batch_matmul_transpose_b_scalar_O3,
+                   "Scalar_O3");
+  MLIRVerification(outputExpected, _mlir_ciface_batch_matmul_transpose_b_vec,
+                   "Vec");
+  /// [Step 3] Add your new method for verification.
+
+  delete[] input1;
+  delete[] input2;
+  return 0;
+}
diff --git a/benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/Utils.hpp b/benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/Utils.hpp
new file mode 100644
index 00000000..bc650a09
--- /dev/null
+++ b/benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/Utils.hpp
@@ -0,0 +1,84 @@
+//===- Utils.hpp ----------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements BatchMatMulTransposeBOp helper functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BATCHMATMUL_TRANSPOSE_B_UTILS_HPP
+#define BATCHMATMUL_TRANSPOSE_B_UTILS_HPP
+
+#include <cmath>
+#include <cstdlib>
+#include <ctime>
+#include <iostream>
+#include <string>
+
+// -----------------------------------------------------------------------------
+// Helper Functions
+// -----------------------------------------------------------------------------
+
+namespace batch_matmul_transpose_b {
+
+// Allocates a 1D array with dimensions `rows * cols` and fills it with random
+// values between 0 and 99.
+template <typename DATA_TYPE> DATA_TYPE *allocArray(int rows, int cols) {
+  // Seed the random number generator. NOTE(review): reseeding with time(0) on
+  // every call makes back-to-back calls within one second produce identical
+  // data; harmless for benchmarking, but seed once if that matters.
+  std::srand(static_cast<unsigned>(std::time(0)));
+  // Allocate memory for the array.
+  DATA_TYPE *array = new DATA_TYPE[rows * cols];
+  // Fill the array with random numbers between 0 and 99.
+  for (int i = 0; i < rows; i++) {
+    for (int j = 0; j < cols; j++) {
+      array[i * cols + j] = static_cast<DATA_TYPE>(std::rand() % 100);
+    }
+  }
+  return array;
+}
+
+// Compares two result buffers element-wise and prints PASS or a single FAIL
+// report with the first mismatching batch/index.
+template <typename DATA_TYPE>
+void verify(DATA_TYPE *A, DATA_TYPE *B, int batch, int size,
+            const std::string &name) {
+  const std::string PASS = "\033[32mPASS\033[0m";
+  const std::string FAIL = "\033[31mFAIL\033[0m";
+  // Tolerance for floating point comparison. Inputs are small integers, so the
+  // sums stay exactly representable in float here; widen this if sizes grow.
+  const double epsilon = 1e-6;
+
+  std::cout << name << " ";
+  if (!A || !B) {
+    std::cout << FAIL << " (Null pointer detected)" << std::endl;
+    return;
+  }
+
+  bool isPass = true;
+  for (int i = 0; i < batch; i++) {
+    for (int j = 0; j < size; j++) {
+      int k = i * size + j;
+      if (std::fabs(A[k] - B[k]) > epsilon) {
+        std::cout << FAIL << std::endl;
+        std::cout << "Batch=" << i << " Index=" << j << ":\tA[k]=" << A[k]
+                  << " B[k]=" << B[k] << std::endl;
+        // Report only the first mismatch; a bare `break` here would fall
+        // through to the next batch and print FAIL repeatedly.
+        return;
+      }
+    }
+  }
+  if (isPass) {
+    std::cout << PASS << std::endl;
+  }
+}
+} // namespace batch_matmul_transpose_b
+
+#endif // BATCHMATMUL_TRANSPOSE_B_UTILS_HPP
diff --git a/benchmarks/DeepLearning/Ops/CMakeLists.txt b/benchmarks/DeepLearning/Ops/CMakeLists.txt
index 4548f9e8..0fbc98b6 100644
--- a/benchmarks/DeepLearning/Ops/CMakeLists.txt
+++ b/benchmarks/DeepLearning/Ops/CMakeLists.txt
@@ -18,6 +18,7 @@ add_subdirectory(SoftmaxExpSumDivOp)
 add_subdirectory(Conv2DNhwcFhwcOp)
 add_subdirectory(TransposeOp)
 add_subdirectory(MatMulTransposeBOp)
+add_subdirectory(BatchMatMulTransposeBOp)
 
 if (CROSS_COMPILE_RVV)
 add_subdirectory(Conv2DNhwcFhwcInt32Op)