From bc96e8ca0f9bd06dd0e9ce31813b0b59ccd36732 Mon Sep 17 00:00:00 2001 From: hayden Date: Thu, 17 Apr 2025 14:00:45 +0800 Subject: [PATCH 1/5] TOSA:ReduceSumOP Vectorize Optimization --- examples/BuddyDeepSeekR1/AnalyseDialectOps.py | 87 +++++ examples/BuddyDeepSeekR1/makefile | 22 ++ examples/BuddyNext/compare_outputs.sh | 121 +++++++ examples/BuddyNext/makefile | 250 ++++++++++++++ .../BuddyNext/next-reduce_sum-vec-manual.mlir | 92 +++++ .../next-reduce_sum-vec-manual1.mlir | 87 +++++ examples/BuddyNext/next-reduce_sum.mlir | 73 ++++ examples/BuddyNext/next-reduce_sum1.mlir | 73 ++++ midend/lib/CMakeLists.txt | 1 + midend/lib/Conversion/CMakeLists.txt | 1 + .../TosaVectorization/CMakeLists.txt | 6 + .../ReduceSumVectorization3D.cpp | 324 ++++++++++++++++++ midend/lib/InitAll.cpp | 2 + tools/buddy-opt/CMakeLists.txt | 1 + tools/buddy-opt/buddy-opt.cpp | 2 + 15 files changed, 1142 insertions(+) create mode 100644 examples/BuddyDeepSeekR1/AnalyseDialectOps.py create mode 100644 examples/BuddyDeepSeekR1/makefile create mode 100755 examples/BuddyNext/compare_outputs.sh create mode 100644 examples/BuddyNext/next-reduce_sum-vec-manual.mlir create mode 100644 examples/BuddyNext/next-reduce_sum-vec-manual1.mlir create mode 100644 examples/BuddyNext/next-reduce_sum.mlir create mode 100644 examples/BuddyNext/next-reduce_sum1.mlir create mode 100644 midend/lib/Conversion/TosaVectorization/CMakeLists.txt create mode 100644 midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp diff --git a/examples/BuddyDeepSeekR1/AnalyseDialectOps.py b/examples/BuddyDeepSeekR1/AnalyseDialectOps.py new file mode 100644 index 0000000000..a2cf9983b2 --- /dev/null +++ b/examples/BuddyDeepSeekR1/AnalyseDialectOps.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 + +import os +import re +from collections import Counter, defaultdict +from pathlib import Path + +def extract_dialect_ops(mlir_file_path): + """ + Extract operations from all dialects in an MLIR file and count their occurrences. 
+ + Args: + mlir_file_path (str): Path to the MLIR file + + Returns: + dict: Dictionary containing dialect names as keys and Counter objects as values + """ + # Read the MLIR file + with open(mlir_file_path, 'r') as f: + content = f.read() + + # Find all operations using regex + # This pattern matches lines that contain operation names with dialect prefix + # Excludes numbers and common non-dialect prefixes + op_pattern = r'([a-zA-Z_][a-zA-Z0-9_]*)\.([a-zA-Z_][a-zA-Z0-9_]*)' + all_ops = re.findall(op_pattern, content) + + # Group operations by dialect + dialect_ops = defaultdict(Counter) + for dialect, op in all_ops: + # Skip common non-dialect prefixes + if dialect.lower() in ['func', 'module', 'memref', 'arith', 'builtin']: + continue + dialect_ops[dialect][op] += 1 + + return dialect_ops + +def main(): + # Get the directory of the current script + current_dir = Path(__file__).parent + + # Construct path to subgraph0.mlir + mlir_file = current_dir / 'subgraph0.mlir' + + if not mlir_file.exists(): + print(f"Error: {mlir_file} not found") + return + + # Extract and count operations by dialect + dialect_ops = extract_dialect_ops(str(mlir_file)) + + # Print results + print("\nMLIR Operation Statistics:") + print("=" * 60) + print(f"{'Dialect':<20} {'Operation':<30} {'Count':<10}") + print("=" * 60) + + total_ops = 0 + total_unique_ops = 0 + + # Sort dialects by total operation count + sorted_dialects = sorted( + dialect_ops.items(), + key=lambda x: sum(x[1].values()), + reverse=True + ) + + for dialect, ops in sorted_dialects: + dialect_total = sum(ops.values()) + total_ops += dialect_total + total_unique_ops += len(ops) + + print(f"\n{dialect} (Total: {dialect_total} ops)") + print("-" * 60) + + # Sort operations by count + sorted_ops = sorted(ops.items(), key=lambda x: x[1], reverse=True) + for op, count in sorted_ops: + print(f"{'':<20} {op:<30} {count:<10}") + + print("\n" + "=" * 60) + print(f"Total dialects: {len(dialect_ops)}") + print(f"Total unique 
operations: {total_unique_ops}") + print(f"Total operation instances: {total_ops}") + +if __name__ == "__main__": + main() diff --git a/examples/BuddyDeepSeekR1/makefile b/examples/BuddyDeepSeekR1/makefile new file mode 100644 index 0000000000..baf3c52b82 --- /dev/null +++ b/examples/BuddyDeepSeekR1/makefile @@ -0,0 +1,22 @@ +#!/bin/bash +BUDDY_OPT := ../../build/bin/buddy-opt +MLIR_OPT := ../../llvm/build/bin/mlir-opt +MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate +MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner +LLC := ../../llvm/build/bin/llc +OPT_FLAG := -O0 + +ifeq ($(shell uname),Linux) +MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so +MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so +LIB_OMP := ../../llvm/build/lib/libomp.so +MTRIPLE := x86_64-unknown-linux-gnu +else ifeq ($(shell uname),Darwin) +MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib +MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib +MTRIPLE := x86_64-apple-darwin +endif + +lower-deepseek-r1-tosa: + @${MLIR_OPT} ./subgraph0.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" -o ./subgraph0-lower.mlir \ No newline at end of file diff --git a/examples/BuddyNext/compare_outputs.sh b/examples/BuddyNext/compare_outputs.sh new file mode 100755 index 0000000000..8fd80acc86 --- /dev/null +++ b/examples/BuddyNext/compare_outputs.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# 设置颜色输出 +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' # No Color +YELLOW='\033[1;33m' +BLUE='\033[0;34m' + +# 检查命令行参数 +if [ $# -ne 2 ]; then + echo -e "${YELLOW}Usage: $0 ${NC}" + echo "Example: $0 'next-reduce-sum-run' 'next-reduce-sum-vec-manual-run'" + exit 1 +fi + +CMD1="$1" +CMD2="$2" +RUNS=100 + +# 创建临时文件存储输出 +OUTPUT1=$(mktemp) +OUTPUT2=$(mktemp) +PROCESSED1=$(mktemp) +SPEEDUPS=$(mktemp) + +# 提取时间数据 +extract_time() { + local file="$1" + 
grep -o '[0-9]\+\.[0-9]\+e[-+]\?[0-9]\+\|[0-9]\+\.[0-9]\+' "$file" +} + +# 转换时间为秒 +convert_to_seconds() { + local time_val="$1" + if [[ $time_val =~ e ]]; then + echo "$time_val" | sed 's/e/*10^/' | bc -l + else + printf "%.9f" $time_val + fi +} + +# 计算平均值 +calculate_mean() { + local file="$1" + local sum=0 + local count=0 + while read -r line; do + sum=$(echo "$sum + $line" | bc -l) + count=$((count + 1)) + done < "$file" + if [ $count -gt 0 ]; then + echo "scale=9; $sum / $count" | bc -l + else + echo "0" + fi +} + +echo -e "${BLUE}Running each version $RUNS times...${NC}" + +# 运行两个命令并计算每次的加速比 +for ((i=1; i<=$RUNS; i++)); do + echo -ne "\rRun $i/$RUNS" + + # 运行第一个命令 + TEMP_OUT1=$(mktemp) + make $CMD1 > "$TEMP_OUT1" 2>/dev/null + TIME1=$(extract_time "$TEMP_OUT1") + if [ -n "$TIME1" ]; then + TIME1=$(convert_to_seconds "$TIME1") + fi + + # 运行第二个命令 + TEMP_OUT2=$(mktemp) + make $CMD2 > "$TEMP_OUT2" 2>/dev/null + TIME2=$(extract_time "$TEMP_OUT2") + if [ -n "$TIME2" ]; then + TIME2=$(convert_to_seconds "$TIME2") + fi + + # 保存第一次运行的输出用于比较 + if [ $i -eq 1 ]; then + grep "data =" "$TEMP_OUT1" | sed 's/base@ = [^[:space:]]*/base@ = /g' > "$PROCESSED1" + grep "data =" "$TEMP_OUT2" | sed 's/base@ = [^[:space:]]*/base@ = /g' > "$OUTPUT2" + fi + + # 计算这次运行的加速比 + if [ -n "$TIME1" ] && [ -n "$TIME2" ] && [ "$TIME1" != "0" ] && [ "$TIME2" != "0" ]; then + echo "scale=9; $TIME1/$TIME2" | bc -l >> "$SPEEDUPS" + fi + + rm "$TEMP_OUT1" "$TEMP_OUT2" +done +echo + +# 比较数据输出 +echo -e "\n${BLUE}Comparing output data:${NC}" +if diff "$PROCESSED1" "$OUTPUT2" > /dev/null; then + echo -e "${GREEN}✓ Outputs match! Both versions produce the same results.${NC}" +else + echo -e "${RED}✗ Outputs differ! 
Found differences:${NC}" + echo "----------------------------------------" + diff "$PROCESSED1" "$OUTPUT2" + echo "----------------------------------------" +fi + +# 计算加速比的均值 +echo -e "\n${BLUE}Performance Comparison:${NC}" +SPEEDUP_MEAN=$(calculate_mean "$SPEEDUPS") + +if [ -n "$SPEEDUP_MEAN" ] && [ "$SPEEDUP_MEAN" != "0" ]; then + if [ $(echo "$SPEEDUP_MEAN > 1" | bc -l) -eq 1 ]; then + printf "${GREEN}Second version is %.2fx faster${NC}\n" "$SPEEDUP_MEAN" + else + SLOWDOWN=$(echo "scale=2; 1/$SPEEDUP_MEAN" | bc -l) + printf "${RED}Second version is %.2fx slower${NC}\n" "$SLOWDOWN" + fi +fi + +# 清理临时文件 +rm "$OUTPUT1" "$OUTPUT2" "$PROCESSED1" "$SPEEDUPS" \ No newline at end of file diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile index c7f75e2307..a49563387f 100644 --- a/examples/BuddyNext/makefile +++ b/examples/BuddyNext/makefile @@ -381,6 +381,256 @@ next-transpose-vec-manual-run: -shared-libs=${MLIR_RUNNER_UTILS} \ -shared-libs=${MLIR_C_RUNNER_UTILS} +next-transpose-vec-autoopt-run: + @${MLIR_OPT} ./log-transpose-optimized.mlir \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -lower-affine \ + -convert-arith-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-transpose-vec-auto-run: + @${BUDDY_OPT} next-transpose.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + 
-empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -func-bufferize \ + -arith-bufferize | \ + ${BUDDY_OPT} \ + -genericOp-transpose-vectorization="vector-size=16" \ + -func-bufferize \ + -arith-bufferize \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -lower-affine \ + -convert-arith-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + + +next-reduce-sum-lower: + @${MLIR_OPT} ./next-reduce_sum1.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -func-bufferize \ + -arith-bufferize \ + -o next-log1.mlir + + +next-reduce-sum-run: + @${MLIR_OPT} ./next-reduce_sum.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -func-bufferize \ + -arith-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + 
-reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-reduce-sum1-run: + @${MLIR_OPT} ./next-reduce_sum1.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -func-bufferize \ + -arith-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-reduce-sum-vec-manual1-run: + @${MLIR_OPT} ./next-reduce_sum-vec-manual1.mlir \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-reduce-sum-vec-auto-run: + @${MLIR_OPT} ./next-reduce_sum.mlir \ + -pass-pipeline 
"builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize | \ + ${BUDDY_OPT} \ + -reduce-sum-vectorization-3d="vector-size=16" \ + -func-bufferize \ + -arith-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-reduce-sum1-vec-auto-run: + @${MLIR_OPT} ./next-reduce_sum1.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize | \ + ${BUDDY_OPT} \ + -reduce-sum-vectorization-3d="vector-size=16" \ + -func-bufferize \ + -arith-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + + 
+next-reduce-sum-vec-manual-run: + @${MLIR_OPT} ./next-reduce_sum-vec-manual.mlir \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + next-embedding-lower: @${MLIR_OPT} ./next-embedding.mlir \ -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ diff --git a/examples/BuddyNext/next-reduce_sum-vec-manual.mlir b/examples/BuddyNext/next-reduce_sum-vec-manual.mlir new file mode 100644 index 0000000000..044da55002 --- /dev/null +++ b/examples/BuddyNext/next-reduce_sum-vec-manual.mlir @@ -0,0 +1,92 @@ +// RUN: buddy-opt -reduce-vectorize="vector-size=16" -verify-diagnostics -lower-affine -expand-strided-metadata -convert-vector-to-scf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-arith-to-llvm -convert-func-to-llvm -lower-affine -llvm-request-c_wrappers -convert-arith-to-llvm -reconcile-unrealized-casts %s \ +// RUN: | mlir-cpu-runner -O0 -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 +func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } + +// 创建一个12x40x40的输入张量 +memref.global "private" @A : memref<12x40x40xf32> = dense<3.0> + +func.func @kernel(%a : memref<12x40x40xf32>) { + %t_start = call @rtclock() : () 
-> f64 + + %b = memref.alloc() : memref<12x40xf32> // 输出张量 + + // 初始化常量 + %c0 = arith.constant 0.0 : f32 + %c16 = arith.constant 16 : index + %c12 = arith.constant 12 : index + %c40 = arith.constant 40 : index + %c0_idx = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + + // 使用step 1的外层循环和8x8分块 + affine.for %i0 = 0 to 12 step 1 { + affine.for %j0 = 0 to 40 step 8 { + // 使用1维并行处理 + affine.parallel (%idx) = (0) to (8) { + // 计算j1 + %j1 = arith.remui %idx, %c8 : index + + %j = affine.apply affine_map<(d0, d1) -> (d0 + d1)> (%j0, %j1) + + // 检查是否在有效范围内 + %j_in_range = arith.cmpi slt, %j, %c40 : index + + // 只在有效范围内进行计算 + scf.if %j_in_range { + // 初始化累加器 + %init_acc = arith.constant 0.0 : f32 + + // 在k维度上使用16元素向量化 + %result_acc = affine.for %k = 0 to 40 step 16 iter_args(%acc = %init_acc) -> f32 { + // 预取下一个数据块 + %next_k = arith.addi %k, %c16 : index + %next_valid = arith.cmpi slt, %next_k, %c40 : index + scf.if %next_valid { + memref.prefetch %a[%i0, %j, %next_k], read, locality<3>, data : memref<12x40x40xf32> + } + + // 计算当前块大小和掩码 + %remaining = arith.subi %c40, %k : index + %vl = arith.minsi %remaining, %c16 : index + %mask = vector.create_mask %vl : vector<16xi1> + + // 使用向量化读取数据 + %vec = vector.transfer_read %a[%i0, %j, %k], %c0, %mask : memref<12x40x40xf32>, vector<16xf32> + + // 向量规约求和 + %block_sum = vector.reduction , %vec : vector<16xf32> into f32 + %next_acc = arith.addf %acc, %block_sum : f32 + affine.yield %next_acc : f32 + } + + // 写入结果 + memref.store %result_acc, %b[%i0, %j] : memref<12x40xf32> + } + } + } + } + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + // 打印结果 + %printed_b = memref.cast %b : memref<12x40xf32> to memref<*xf32> + call @printMemrefF32(%printed_b) : (memref<*xf32>) -> () + + // 打印时间 + vector.print %time : f64 + + memref.dealloc %b : memref<12x40xf32> + return +} + +func.func @main() { + %a = memref.get_global @A : memref<12x40x40xf32> + call @kernel(%a) : 
(memref<12x40x40xf32>) -> () + return +} \ No newline at end of file diff --git a/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir b/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir new file mode 100644 index 0000000000..373b427b4d --- /dev/null +++ b/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir @@ -0,0 +1,87 @@ +// RUN: buddy-opt -reduce-vectorize="vector-size=32" -verify-diagnostics -lower-affine -expand-strided-metadata -convert-vector-to-scf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-arith-to-llvm -convert-func-to-llvm -lower-affine -llvm-request-c_wrappers -convert-arith-to-llvm -reconcile-unrealized-casts %s \ +// RUN: | mlir-cpu-runner -O0 -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 +func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } + +// 创建一个1x40x1536的输入张量 +memref.global "private" @A : memref<1x40x1536xf32> = dense<3.0> + +func.func @kernel(%a : memref<1x40x1536xf32>) { + %t_start = call @rtclock() : () -> f64 + + %b = memref.alloc() : memref<1x40xf32> // 输出张量 + + // 初始化常量 + %c0 = arith.constant 0.0 : f32 + %c32 = arith.constant 32 : index + %c1 = arith.constant 1 : index + %c40 = arith.constant 40 : index + %c1536 = arith.constant 1536 : index + %c0_idx = arith.constant 0 : index + %c8 = arith.constant 8 : index + + // 使用分块和向量化处理 + affine.for %j0 = 0 to 40 step 8 { + // 处理8个元素一组 + affine.for %j1 = 0 to 8 { + %j = affine.apply affine_map<(d0, d1) -> (d0 + d1)> (%j0, %j1) + + // 检查是否在有效范围内 + %j_in_range = arith.cmpi slt, %j, %c40 : index + + // 只在有效范围内进行计算 + scf.if %j_in_range { + // 初始化累加器 + %init_acc = arith.constant 0.0 : f32 + + // 在k维度上使用32元素向量化 + %result_acc = affine.for %k = 0 to 1536 step 32 iter_args(%acc = %init_acc) -> f32 { + // 预取下一个数据块 + %next_k = arith.addi %k, %c32 : index + 
%next_valid = arith.cmpi slt, %next_k, %c1536 : index + scf.if %next_valid { + memref.prefetch %a[%c0_idx, %j, %next_k], read, locality<3>, data : memref<1x40x1536xf32> + } + + // 计算当前块大小和掩码 + %remaining = arith.subi %c1536, %k : index + %vl = arith.minsi %remaining, %c32 : index + %mask = vector.create_mask %vl : vector<32xi1> + + // 使用向量化读取数据 + %vec = vector.transfer_read %a[%c0_idx, %j, %k], %c0, %mask : memref<1x40x1536xf32>, vector<32xf32> + + // 向量规约求和 + %block_sum = vector.reduction <add>, %vec : vector<32xf32> into f32 + %next_acc = arith.addf %acc, %block_sum : f32 + affine.yield %next_acc : f32 + } + + // 写入结果 + memref.store %result_acc, %b[%c0_idx, %j] : memref<1x40xf32> + } + } + } + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + // 打印结果 + %printed_b = memref.cast %b : memref<1x40xf32> to memref<*xf32> + call @printMemrefF32(%printed_b) : (memref<*xf32>) -> () + + // 打印时间 + vector.print %time : f64 + + memref.dealloc %b : memref<1x40xf32> + return +} + +func.func @main() { + %a = memref.get_global @A : memref<1x40x1536xf32> + call @kernel(%a) : (memref<1x40x1536xf32>) -> () + return +} diff --git a/examples/BuddyNext/next-reduce_sum.mlir b/examples/BuddyNext/next-reduce_sum.mlir new file mode 100644 index 0000000000..a2732c8d2b --- /dev/null +++ b/examples/BuddyNext/next-reduce_sum.mlir @@ -0,0 +1,73 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: 
-expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 +func.func private @printMemrefF32(%ptr : tensor<*xf32>) + +func.func @kernel(%t0 : tensor<12x40x40xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Perform reduce_sum along axis=2 + %t1 = tosa.reduce_sum %t0 {axis = 2 : i32} : (tensor<12x40x40xf32>) -> tensor<12x40x1xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %t1 : tensor<12x40x1xf32> to tensor<*xf32> + + // Verify the output shape and some sample values + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [12, 40, 1] strides = [40, 1, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [120] + // CHECK-SAME: [120] + // CHECK-SAME: ... 
+ // CHECK-SAME: [120] + // CHECK-SAME: ] + + // Print results + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + // Print timings + vector.print %time : f64 + + return +} + +func.func @main() { + // Create a tensor filled with 3.0 + %c0 = arith.constant dense<3.0> : tensor<12x40x40xf32> + call @kernel(%c0) : (tensor<12x40x40xf32>) -> () + + return +} \ No newline at end of file diff --git a/examples/BuddyNext/next-reduce_sum1.mlir b/examples/BuddyNext/next-reduce_sum1.mlir new file mode 100644 index 0000000000..7b691cc69f --- /dev/null +++ b/examples/BuddyNext/next-reduce_sum1.mlir @@ -0,0 +1,73 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 +func.func private @printMemrefF32(%ptr : 
tensor<*xf32>) + +func.func @kernel(%t0 : tensor<1x40x1536xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Perform reduce_sum along axis=2 + %t1 = tosa.reduce_sum %t0 {axis = 2 : i32} : (tensor<1x40x1536xf32>) -> tensor<1x40x1xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %t1 : tensor<1x40x1xf32> to tensor<*xf32> + + // Verify the output shape and some sample values + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 1] strides = [40, 1, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [4608] + // CHECK-SAME: [4608] + // CHECK-SAME: ... + // CHECK-SAME: [4608] + // CHECK-SAME: ] + + // Print results + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + // Print timings + vector.print %time : f64 + + return +} + +func.func @main() { + // Create a tensor filled with 3.0 + %c0 = arith.constant dense<3.0> : tensor<1x40x1536xf32> + call @kernel(%c0) : (tensor<1x40x1536xf32>) -> () + + return +} \ No newline at end of file diff --git a/midend/lib/CMakeLists.txt b/midend/lib/CMakeLists.txt index cae54478c3..b8b2d18fd4 100644 --- a/midend/lib/CMakeLists.txt +++ b/midend/lib/CMakeLists.txt @@ -27,6 +27,7 @@ set(LinkedLibs MatMulParallelVectorization SchedulingOnDevices TransposeOptimization + TosaVectorization ) diff --git a/midend/lib/Conversion/CMakeLists.txt b/midend/lib/Conversion/CMakeLists.txt index c3c2fa2ddd..1d9e9b63d4 100644 --- a/midend/lib/Conversion/CMakeLists.txt +++ b/midend/lib/Conversion/CMakeLists.txt @@ -16,3 +16,4 @@ add_subdirectory(LowerSche) add_subdirectory(FuncBufferize) add_subdirectory(DepthwiseConvOptimization) add_subdirectory(MLIRGPU) +add_subdirectory(TosaVectorization) diff --git a/midend/lib/Conversion/TosaVectorization/CMakeLists.txt b/midend/lib/Conversion/TosaVectorization/CMakeLists.txt new file mode 100644 index 0000000000..fead1acafb --- /dev/null +++ 
b/midend/lib/Conversion/TosaVectorization/CMakeLists.txt @@ -0,0 +1,6 @@ +add_mlir_library(TosaVectorization + ReduceSumVectorization3D.cpp + + LINK_LIBS PUBLIC + BuddyUtils +) diff --git a/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp b/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp new file mode 100644 index 0000000000..a4bfddf37a --- /dev/null +++ b/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp @@ -0,0 +1,324 @@ +//===- ReduceSumVectorization3D.cpp ----------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements the reduce sum vectorization for 3D tensors. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/IntegerSet.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/TypeRange.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Support/LogicalResult.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace mlir; +using namespace vector; +using namespace affine; + +//===----------------------------------------------------------------------===// +// Rewrite Pattern +//===----------------------------------------------------------------------===// + +namespace { + +class ReduceSumVectorization3DPattern : public ConversionPattern { +public: + explicit ReduceSumVectorization3DPattern(MLIRContext *context, + int64_t affineVectorSizeParam) + : ConversionPattern(linalg::ReduceOp::getOperationName(), 1, context), + affineVectorSize(affineVectorSizeParam) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto reduceOp = cast(op); + + // Check if it's a 3D to 2D reduction + if (!reduceOp.getOperand(0).getType().isa() || + !reduceOp.getOperand(1).getType().isa()) + return failure(); + + auto inputType = reduceOp.getOperand(0).getType().cast(); + auto outputType = reduceOp.getOperand(1).getType().cast(); + + // Verify dimensions + if (inputType.getRank() != 3 || outputType.getRank() != 2) + return failure(); + + // Get input and output + auto input = reduceOp.getOperand(0); + auto output = 
reduceOp.getOperand(1); + auto loc = op->getLoc(); + + // Get element type of input tensor + Type elementType = inputType.getElementType(); + + // Define constants + const Value index0 = + rewriter.create(loc, rewriter.getIndexAttr(0)); + const Value indexVecSize = rewriter.create( + loc, rewriter.getIndexAttr(affineVectorSize)); + // const Value c8 = + // rewriter.create(loc, rewriter.getIndexAttr(8)); + // const Value c1 = + // rewriter.create(loc, rewriter.getIndexAttr(1)); + const Value zeroFloat = rewriter.create( + loc, rewriter.getZeroAttr(elementType)); + + // Get input tensor dimensions + Value dim0 = rewriter.create(loc, input, 0); + Value dim1 = rewriter.create(loc, input, 1); + Value dim2 = rewriter.create(loc, input, 2); + + // Outer loop - first dimension + affine::buildAffineLoopNest( + rewriter, loc, {index0}, {dim0}, 1, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value i0 = ivRange.front(); + + // Middle loop - second dimension, step 8 + affine::buildAffineLoopNest( + builder, loc, {index0}, {dim1}, 8, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value j0 = ivRange.front(); + + // Create parallel op to process 8 blocks + SmallVector reducedValues = + llvm::to_vector<4>(llvm::map_range( + ArrayRef{}, + [](const LoopReduction &red) { return red.value; })); + + AffineParallelOp parallelOp = + builder.create( + loc, ValueRange(reducedValues).getTypes(), ValueRange{}, + ArrayRef{ + builder.getNamedAttr("lowerBoundsGroups", + builder.getI32TensorAttr({1})), + builder.getNamedAttr("upperBoundsGroups", + builder.getI32TensorAttr({1})), + builder.getNamedAttr( + "lowerBoundsMap", + AffineMapAttr::get(AffineMap::get( + 0, 0, {builder.getAffineConstantExpr(0)}, + builder.getContext()))), + builder.getNamedAttr( + "upperBoundsMap", + AffineMapAttr::get(AffineMap::get( + 0, 0, {builder.getAffineConstantExpr(8)}, + builder.getContext()))), + builder.getNamedAttr("steps", + builder.getI64ArrayAttr({1})), + 
builder.getNamedAttr("reductions", + builder.getArrayAttr({}))}); + + // Create parallel block body + Block *parallelBody = new Block(); + builder.setInsertionPointToStart(parallelBody); + parallelBody->addArgument(builder.getIndexType(), loc); + Value idx = parallelBody->getArguments()[0]; + + // Calculate actual j index + Value j = builder.create(loc, j0, idx); + + // Check if j is within valid range + Value j_in_range = builder.create( + loc, arith::CmpIPredicate::slt, j, dim1); + + builder.create( + loc, j_in_range, [&](OpBuilder &builder, Location loc) { + // Initialize accumulator + Value acc = builder.create( + loc, builder.getZeroAttr(elementType)); + + // Vectorized reduction in the innermost dimension + auto lbMap = AffineMap::get( + /*dimCount=*/0, /*symbolCount=*/0, + builder.getAffineConstantExpr(0), + builder.getContext()); + auto ubMap = AffineMap::get( + /*dimCount=*/1, /*symbolCount=*/0, + builder.getAffineDimExpr(0), builder.getContext()); + + affine::AffineForOp reductionLoop = builder.create< + affine::AffineForOp>( + loc, + /*lbOperands=*/ValueRange{}, + /*lbMap=*/lbMap, + /*ubOperands=*/ValueRange{dim2}, + /*ubMap=*/ubMap, + /*step=*/affineVectorSize, + /*iterArgs=*/ValueRange{acc}, + [&](OpBuilder &builder, Location loc, Value iv, + ValueRange iterArgs) { + Value curr_acc = iterArgs[0]; + + // Prefetch next data block + Value next_k = builder.create( + loc, iv, indexVecSize); + Value next_valid = builder.create( + loc, arith::CmpIPredicate::slt, next_k, dim2); + + builder.create( + loc, next_valid, + [&](OpBuilder &builder, Location loc) { + builder.create( + loc, input, ValueRange{i0, j, next_k}, + /*isWrite=*/false, + /*locality=*/3, + /*isDataCache=*/true); + builder.create(loc); + }); + + // Calculate current block size and mask + Value remaining = + builder.create(loc, dim2, iv); + Value vl = builder.create( + loc, remaining, indexVecSize); + Value mask = builder.create( + loc, + VectorType::get({(int64_t)affineVectorSize}, + 
builder.getI1Type()), + ValueRange{vl}); + + // Vectorized read + auto vecType = VectorType::get( + {(int64_t)affineVectorSize}, elementType); + auto map = AffineMap::get( + /*dimCount=*/3, // 3D输入 + /*symbolCount=*/0, + {rewriter.getAffineDimExpr(2)}, // 只映射k维度 + rewriter.getContext()); + Value vec = builder.create( + loc, vecType, input, ValueRange{i0, j, iv}, map, + zeroFloat, mask, + ArrayAttr::get(builder.getContext(), + {builder.getBoolAttr(false)})); + + // Vector reduction sum + Value block_sum = + builder.create( + loc, vector::CombiningKind::ADD, vec); + + // Update accumulator + Value next_acc = builder.create( + loc, curr_acc, block_sum); + + builder.create(loc, + next_acc); + }); + + // Store result + builder.create( + loc, reductionLoop.getResult(0), output, + ValueRange{i0, j}); + + builder.create(loc); + }); + + builder.create(loc); + parallelOp.getRegion().push_back(parallelBody); + }); + }); + + // Remove original operation + rewriter.eraseOp(op); + return success(); + } + +private: + int64_t affineVectorSize; +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// ReduceSumVectorizationPass +//===----------------------------------------------------------------------===// + +namespace { +class ReduceSumVectorizationPass + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ReduceSumVectorizationPass) + + StringRef getArgument() const final { return "reduce-sum-vectorization-3d"; } + + StringRef getDescription() const final { + return "Reduce Sum Vectorization for 3D tensors."; + } + + ReduceSumVectorizationPass() = default; + + ReduceSumVectorizationPass(const ReduceSumVectorizationPass &) {} + + explicit ReduceSumVectorizationPass(int64_t affineVectorSizeParam) { + affineVectorSize = affineVectorSizeParam; + } + + void runOnOperation() override { + MLIRContext *context = &getContext(); + ModuleOp module = getOperation(); + ConversionTarget 
target(*context); + target.addLegalDialect(); + target.addLegalOp(); + RewritePatternSet patterns(context); + patterns.add(context, affineVectorSize); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) + signalPassFailure(); + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + Option affineVectorSize{*this, "vector-size", + llvm::cl::desc("Affine Vector size."), + llvm::cl::init(16)}; +}; +} // namespace + +namespace mlir { +namespace buddy { +void registerReduceSumVectorizationPass() { + PassRegistration(); +} +} // namespace buddy +} // namespace mlir \ No newline at end of file diff --git a/midend/lib/InitAll.cpp b/midend/lib/InitAll.cpp index d6cad2bc1e..f40ffac4ef 100644 --- a/midend/lib/InitAll.cpp +++ b/midend/lib/InitAll.cpp @@ -48,6 +48,7 @@ void registerMatMulParallelVectorizationPass(); void registerMatMulVectorizationPass(); void registerDeviceSchedulePass(); void registerTransposeOptimizationPass(); +void registerReduceSumVectorizationPass(); } // namespace buddy } // namespace mlir @@ -80,4 +81,5 @@ void mlir::buddy::registerAllPasses() { mlir::buddy::registerMatMulVectorizationPass(); mlir::buddy::registerDeviceSchedulePass(); mlir::buddy::registerTransposeOptimizationPass(); + mlir::buddy::registerReduceSumVectorizationPass(); } diff --git a/tools/buddy-opt/CMakeLists.txt b/tools/buddy-opt/CMakeLists.txt index 0abb857fad..bce971dae6 100644 --- a/tools/buddy-opt/CMakeLists.txt +++ b/tools/buddy-opt/CMakeLists.txt @@ -28,6 +28,7 @@ target_link_libraries(buddy-opt BatchMatMulOptimization MatMulParallelVectorization TransposeOptimization + TosaVectorization ConvOptimization DepthwiseConvOptimization VectorExp diff --git a/tools/buddy-opt/buddy-opt.cpp b/tools/buddy-opt/buddy-opt.cpp index 61a0958c72..9e7035edf8 100644 --- a/tools/buddy-opt/buddy-opt.cpp +++ b/tools/buddy-opt/buddy-opt.cpp @@ -71,6 +71,7 @@ void registerMatMulOptimizePass(); void registerMatMulVectorizationPass(); 
void registerMatMulParallelVectorizationPass(); void registerTransposeOptimizationPass(); +void registerReduceSumVectorizationPass(); void registerConvOptimizePass(); void registerConvNhwcFhwcOptimizePass(); void registerConvNhwcFhwcTileOptimizePass(); @@ -118,6 +119,7 @@ int main(int argc, char **argv) { mlir::buddy::registerMatMulVectorizationPass(); mlir::buddy::registerMatMulParallelVectorizationPass(); mlir::buddy::registerTransposeOptimizationPass(); + mlir::buddy::registerReduceSumVectorizationPass(); mlir::buddy::registerConvOptimizePass(); mlir::buddy::registerConvNhwcFhwcOptimizePass(); mlir::buddy::registerConvNhwcFhwcTileOptimizePass(); From edbbc65078d6023d621a84bf4b4ba96e49102895 Mon Sep 17 00:00:00 2001 From: hayden Date: Thu, 17 Apr 2025 21:53:46 +0800 Subject: [PATCH 2/5] Fixed Reduce_sum MLIR File Buddy-Check Error --- examples/BuddyNext/compare_outputs.sh | 4 +-- .../BuddyNext/next-reduce_sum-vec-manual.mlir | 28 +++++++++++++++++-- .../next-reduce_sum-vec-manual1.mlir | 26 ++++++++++++++++- examples/BuddyNext/next-reduce_sum.mlir | 20 ++++++------- examples/BuddyNext/next-reduce_sum1.mlir | 22 ++++++--------- 5 files changed, 70 insertions(+), 30 deletions(-) diff --git a/examples/BuddyNext/compare_outputs.sh b/examples/BuddyNext/compare_outputs.sh index 8fd80acc86..85d3897bf6 100755 --- a/examples/BuddyNext/compare_outputs.sh +++ b/examples/BuddyNext/compare_outputs.sh @@ -16,7 +16,7 @@ fi CMD1="$1" CMD2="$2" -RUNS=100 +RUNS=10 # 创建临时文件存储输出 OUTPUT1=$(mktemp) @@ -118,4 +118,4 @@ if [ -n "$SPEEDUP_MEAN" ] && [ "$SPEEDUP_MEAN" != "0" ]; then fi # 清理临时文件 -rm "$OUTPUT1" "$OUTPUT2" "$PROCESSED1" "$SPEEDUPS" \ No newline at end of file +rm "$OUTPUT1" "$OUTPUT2" "$PROCESSED1" "$SPEEDUPS" diff --git a/examples/BuddyNext/next-reduce_sum-vec-manual.mlir b/examples/BuddyNext/next-reduce_sum-vec-manual.mlir index 044da55002..236c95366f 100644 --- a/examples/BuddyNext/next-reduce_sum-vec-manual.mlir +++ b/examples/BuddyNext/next-reduce_sum-vec-manual.mlir @@ 
-1,4 +1,21 @@ -// RUN: buddy-opt -reduce-vectorize="vector-size=16" -verify-diagnostics -lower-affine -expand-strided-metadata -convert-vector-to-scf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-arith-to-llvm -convert-func-to-llvm -lower-affine -llvm-request-c_wrappers -convert-arith-to-llvm -reconcile-unrealized-casts %s \ +// RUN: buddy-opt %s \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ // RUN: | mlir-cpu-runner -O0 -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ // RUN: | FileCheck %s @@ -76,9 +93,16 @@ func.func @kernel(%a : memref<12x40x40xf32>) { // 打印结果 %printed_b = memref.cast %b : memref<12x40xf32> to memref<*xf32> + + // All the elements of the MemRef are the same, + // only check the first line to verify the correctness. 
+ // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [12, 40] strides = [40, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [120{{(, 120)*}}] + call @printMemrefF32(%printed_b) : (memref<*xf32>) -> () - // 打印时间 + // Print timings vector.print %time : f64 memref.dealloc %b : memref<12x40xf32> diff --git a/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir b/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir index 373b427b4d..6167063c48 100644 --- a/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir +++ b/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir @@ -1,4 +1,21 @@ -// RUN: buddy-opt -reduce-vectorize="vector-size=32" -verify-diagnostics -lower-affine -expand-strided-metadata -convert-vector-to-scf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-arith-to-llvm -convert-func-to-llvm -lower-affine -llvm-request-c_wrappers -convert-arith-to-llvm -reconcile-unrealized-casts %s \ +// RUN: buddy-opt %s \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ // RUN: | mlir-cpu-runner -O0 -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ // RUN: | FileCheck %s @@ -71,6 +88,13 @@ func.func @kernel(%a : memref<1x40x1536xf32>) { // 打印结果 %printed_b = memref.cast %b : memref<1x40xf32> to memref<*xf32> + + // All the elements of the MemRef are the same, + // only check the first line to verify the correctness. 
+ // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [1, 40] strides = [40, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [4608{{(, 4608)*}}] + call @printMemrefF32(%printed_b) : (memref<*xf32>) -> () // 打印时间 diff --git a/examples/BuddyNext/next-reduce_sum.mlir b/examples/BuddyNext/next-reduce_sum.mlir index a2732c8d2b..a5ddd89654 100644 --- a/examples/BuddyNext/next-reduce_sum.mlir +++ b/examples/BuddyNext/next-reduce_sum.mlir @@ -5,14 +5,13 @@ // RUN: -eliminate-empty-tensors \ // RUN: -empty-tensor-to-alloc-tensor \ // RUN: -one-shot-bufferize \ +// RUN: | buddy-opt \ +// RUN: -reduce-sum-vectorization-3d="vector-size=16" \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ // RUN: -convert-linalg-to-affine-loops \ // RUN: -affine-loop-fusion \ // RUN: -lower-affine \ -// RUN: -func-bufferize \ -// RUN: -arith-bufferize \ -// RUN: -tensor-bufferize \ -// RUN: -buffer-deallocation \ -// RUN: -finalizing-bufferize \ // RUN: -convert-vector-to-scf \ // RUN: -expand-strided-metadata \ // RUN: -convert-vector-to-llvm \ @@ -24,7 +23,7 @@ // RUN: -convert-openmp-to-llvm \ // RUN: -convert-arith-to-llvm \ // RUN: -convert-math-to-llvm \ -// RUN: -convert-math-to-libm \ +// RUN: -convert-math-to-libm \ // RUN: -convert-func-to-llvm \ // RUN: -reconcile-unrealized-casts \ // RUN: | mlir-cpu-runner -e main -entry-point-result=void \ @@ -46,15 +45,12 @@ func.func @kernel(%t0 : tensor<12x40x40xf32>) { %tensor_unranked = tensor.cast %t1 : tensor<12x40x1xf32> to tensor<*xf32> - // Verify the output shape and some sample values + // All the elements of the MemRef are the same, + // only check the first line to verify the correctness. // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [12, 40, 1] strides = [40, 1, 1] data = // CHECK-NEXT: [ // CHECK-SAME: [ - // CHECK-SAME: [120] - // CHECK-SAME: [120] - // CHECK-SAME: ... 
- // CHECK-SAME: [120] - // CHECK-SAME: ] + // CHECK-SAME: [120{{(, 120)*}}], // Print results call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () diff --git a/examples/BuddyNext/next-reduce_sum1.mlir b/examples/BuddyNext/next-reduce_sum1.mlir index 7b691cc69f..246052a40c 100644 --- a/examples/BuddyNext/next-reduce_sum1.mlir +++ b/examples/BuddyNext/next-reduce_sum1.mlir @@ -5,14 +5,13 @@ // RUN: -eliminate-empty-tensors \ // RUN: -empty-tensor-to-alloc-tensor \ // RUN: -one-shot-bufferize \ +// RUN: | buddy-opt \ +// RUN: -reduce-sum-vectorization-3d="vector-size=32" \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ // RUN: -convert-linalg-to-affine-loops \ // RUN: -affine-loop-fusion \ // RUN: -lower-affine \ -// RUN: -func-bufferize \ -// RUN: -arith-bufferize \ -// RUN: -tensor-bufferize \ -// RUN: -buffer-deallocation \ -// RUN: -finalizing-bufferize \ // RUN: -convert-vector-to-scf \ // RUN: -expand-strided-metadata \ // RUN: -convert-vector-to-llvm \ @@ -24,7 +23,7 @@ // RUN: -convert-openmp-to-llvm \ // RUN: -convert-arith-to-llvm \ // RUN: -convert-math-to-llvm \ -// RUN: -convert-math-to-libm \ +// RUN: -convert-math-to-libm \ // RUN: -convert-func-to-llvm \ // RUN: -reconcile-unrealized-casts \ // RUN: | mlir-cpu-runner -e main -entry-point-result=void \ @@ -46,15 +45,12 @@ func.func @kernel(%t0 : tensor<1x40x1536xf32>) { %tensor_unranked = tensor.cast %t1 : tensor<1x40x1xf32> to tensor<*xf32> - // Verify the output shape and some sample values - // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [12, 40, 1] strides = [40, 1, 1] data = + // All the elements of the MemRef are the same, + // only check the first line to verify the correctness. + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 1] strides = [40, 1, 1] data = // CHECK-NEXT: [ // CHECK-SAME: [ - // CHECK-SAME: [120] - // CHECK-SAME: [120] - // CHECK-SAME: ... 
- // CHECK-SAME: [120] - // CHECK-SAME: ] + // CHECK-SAME: [4608{{(, 4608)*}}], // Print results call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () From 0dfba62306ea6f1e57cba1f9d4eb52fe9670b482 Mon Sep 17 00:00:00 2001 From: hayden Date: Sun, 25 May 2025 15:29:09 +0800 Subject: [PATCH 3/5] Make corrections based on review results --- examples/BuddyDeepSeekR1/.gitignore | 2 + examples/BuddyDeepSeekR1/AnalyseDialectOps.py | 87 ------------------- examples/BuddyDeepSeekR1/makefile | 22 ----- ...sum.mlir => next-reduce-sum-12x40x40.mlir} | 2 +- ...m1.mlir => next-reduce-sum-1x40x1536.mlir} | 2 +- ... next-reduce-sum-vec-manual-12x40x40.mlir} | 32 +++---- ...next-reduce-sum-vec-manual-1x40x1536.mlir} | 33 +++---- .../ReduceSumVectorization3D.cpp | 2 +- 8 files changed, 38 insertions(+), 144 deletions(-) create mode 100644 examples/BuddyDeepSeekR1/.gitignore delete mode 100644 examples/BuddyDeepSeekR1/AnalyseDialectOps.py delete mode 100644 examples/BuddyDeepSeekR1/makefile rename examples/BuddyNext/{next-reduce_sum.mlir => next-reduce-sum-12x40x40.mlir} (99%) rename examples/BuddyNext/{next-reduce_sum1.mlir => next-reduce-sum-1x40x1536.mlir} (99%) rename examples/BuddyNext/{next-reduce_sum-vec-manual.mlir => next-reduce-sum-vec-manual-12x40x40.mlir} (84%) rename examples/BuddyNext/{next-reduce_sum-vec-manual1.mlir => next-reduce-sum-vec-manual-1x40x1536.mlir} (85%) diff --git a/examples/BuddyDeepSeekR1/.gitignore b/examples/BuddyDeepSeekR1/.gitignore new file mode 100644 index 0000000000..8d6276ca46 --- /dev/null +++ b/examples/BuddyDeepSeekR1/.gitignore @@ -0,0 +1,2 @@ +*.data +*.mlir \ No newline at end of file diff --git a/examples/BuddyDeepSeekR1/AnalyseDialectOps.py b/examples/BuddyDeepSeekR1/AnalyseDialectOps.py deleted file mode 100644 index a2cf9983b2..0000000000 --- a/examples/BuddyDeepSeekR1/AnalyseDialectOps.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python3 - -import os -import re -from collections import Counter, defaultdict -from pathlib 
import Path - -def extract_dialect_ops(mlir_file_path): - """ - Extract operations from all dialects in an MLIR file and count their occurrences. - - Args: - mlir_file_path (str): Path to the MLIR file - - Returns: - dict: Dictionary containing dialect names as keys and Counter objects as values - """ - # Read the MLIR file - with open(mlir_file_path, 'r') as f: - content = f.read() - - # Find all operations using regex - # This pattern matches lines that contain operation names with dialect prefix - # Excludes numbers and common non-dialect prefixes - op_pattern = r'([a-zA-Z_][a-zA-Z0-9_]*)\.([a-zA-Z_][a-zA-Z0-9_]*)' - all_ops = re.findall(op_pattern, content) - - # Group operations by dialect - dialect_ops = defaultdict(Counter) - for dialect, op in all_ops: - # Skip common non-dialect prefixes - if dialect.lower() in ['func', 'module', 'memref', 'arith', 'builtin']: - continue - dialect_ops[dialect][op] += 1 - - return dialect_ops - -def main(): - # Get the directory of the current script - current_dir = Path(__file__).parent - - # Construct path to subgraph0.mlir - mlir_file = current_dir / 'subgraph0.mlir' - - if not mlir_file.exists(): - print(f"Error: {mlir_file} not found") - return - - # Extract and count operations by dialect - dialect_ops = extract_dialect_ops(str(mlir_file)) - - # Print results - print("\nMLIR Operation Statistics:") - print("=" * 60) - print(f"{'Dialect':<20} {'Operation':<30} {'Count':<10}") - print("=" * 60) - - total_ops = 0 - total_unique_ops = 0 - - # Sort dialects by total operation count - sorted_dialects = sorted( - dialect_ops.items(), - key=lambda x: sum(x[1].values()), - reverse=True - ) - - for dialect, ops in sorted_dialects: - dialect_total = sum(ops.values()) - total_ops += dialect_total - total_unique_ops += len(ops) - - print(f"\n{dialect} (Total: {dialect_total} ops)") - print("-" * 60) - - # Sort operations by count - sorted_ops = sorted(ops.items(), key=lambda x: x[1], reverse=True) - for op, count in sorted_ops: - 
print(f"{'':<20} {op:<30} {count:<10}") - - print("\n" + "=" * 60) - print(f"Total dialects: {len(dialect_ops)}") - print(f"Total unique operations: {total_unique_ops}") - print(f"Total operation instances: {total_ops}") - -if __name__ == "__main__": - main() diff --git a/examples/BuddyDeepSeekR1/makefile b/examples/BuddyDeepSeekR1/makefile deleted file mode 100644 index baf3c52b82..0000000000 --- a/examples/BuddyDeepSeekR1/makefile +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -BUDDY_OPT := ../../build/bin/buddy-opt -MLIR_OPT := ../../llvm/build/bin/mlir-opt -MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate -MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner -LLC := ../../llvm/build/bin/llc -OPT_FLAG := -O0 - -ifeq ($(shell uname),Linux) -MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so -MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so -LIB_OMP := ../../llvm/build/lib/libomp.so -MTRIPLE := x86_64-unknown-linux-gnu -else ifeq ($(shell uname),Darwin) -MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib -MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib -MTRIPLE := x86_64-apple-darwin -endif - -lower-deepseek-r1-tosa: - @${MLIR_OPT} ./subgraph0.mlir \ - -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" -o ./subgraph0-lower.mlir \ No newline at end of file diff --git a/examples/BuddyNext/next-reduce_sum.mlir b/examples/BuddyNext/next-reduce-sum-12x40x40.mlir similarity index 99% rename from examples/BuddyNext/next-reduce_sum.mlir rename to examples/BuddyNext/next-reduce-sum-12x40x40.mlir index a5ddd89654..cc3ccf788d 100644 --- a/examples/BuddyNext/next-reduce_sum.mlir +++ b/examples/BuddyNext/next-reduce-sum-12x40x40.mlir @@ -66,4 +66,4 @@ func.func @main() { call @kernel(%c0) : (tensor<12x40x40xf32>) -> () return -} \ No newline at end of file +} diff --git 
a/examples/BuddyNext/next-reduce_sum1.mlir b/examples/BuddyNext/next-reduce-sum-1x40x1536.mlir similarity index 99% rename from examples/BuddyNext/next-reduce_sum1.mlir rename to examples/BuddyNext/next-reduce-sum-1x40x1536.mlir index 246052a40c..b8687d1f6f 100644 --- a/examples/BuddyNext/next-reduce_sum1.mlir +++ b/examples/BuddyNext/next-reduce-sum-1x40x1536.mlir @@ -66,4 +66,4 @@ func.func @main() { call @kernel(%c0) : (tensor<1x40x1536xf32>) -> () return -} \ No newline at end of file +} diff --git a/examples/BuddyNext/next-reduce_sum-vec-manual.mlir b/examples/BuddyNext/next-reduce-sum-vec-manual-12x40x40.mlir similarity index 84% rename from examples/BuddyNext/next-reduce_sum-vec-manual.mlir rename to examples/BuddyNext/next-reduce-sum-vec-manual-12x40x40.mlir index 236c95366f..706a811ee1 100644 --- a/examples/BuddyNext/next-reduce_sum-vec-manual.mlir +++ b/examples/BuddyNext/next-reduce-sum-vec-manual-12x40x40.mlir @@ -23,15 +23,15 @@ func.func private @rtclock() -> f64 func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } -// 创建一个12x40x40的输入张量 +// Create a 12x40x40 input tensor memref.global "private" @A : memref<12x40x40xf32> = dense<3.0> func.func @kernel(%a : memref<12x40x40xf32>) { %t_start = call @rtclock() : () -> f64 - %b = memref.alloc() : memref<12x40xf32> // 输出张量 + %b = memref.alloc() : memref<12x40xf32> // Output tensor - // 初始化常量 + // Initialize constants %c0 = arith.constant 0.0 : f32 %c16 = arith.constant 16 : index %c12 = arith.constant 12 : index @@ -40,48 +40,48 @@ func.func @kernel(%a : memref<12x40x40xf32>) { %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index - // 使用step 1的外层循环和8x8分块 + // Use outer loop with step 1 and 8x8 blocking affine.for %i0 = 0 to 12 step 1 { affine.for %j0 = 0 to 40 step 8 { - // 使用1维并行处理 + // Use 1D parallel processing affine.parallel (%idx) = (0) to (8) { - // 计算j1 + // Compute j1 %j1 = arith.remui %idx, %c8 : index %j = affine.apply affine_map<(d0, d1) -> (d0 + d1)> 
(%j0, %j1) - // 检查是否在有效范围内 + // Check if within valid range %j_in_range = arith.cmpi slt, %j, %c40 : index - // 只在有效范围内进行计算 + // Only compute within valid range scf.if %j_in_range { - // 初始化累加器 + // Initialize accumulator %init_acc = arith.constant 0.0 : f32 - // 在k维度上使用16元素向量化 + // Vectorize along k dimension with 16 elements %result_acc = affine.for %k = 0 to 40 step 16 iter_args(%acc = %init_acc) -> f32 { - // 预取下一个数据块 + // Prefetch next data block %next_k = arith.addi %k, %c16 : index %next_valid = arith.cmpi slt, %next_k, %c40 : index scf.if %next_valid { memref.prefetch %a[%i0, %j, %next_k], read, locality<3>, data : memref<12x40x40xf32> } - // 计算当前块大小和掩码 + // Compute current block size and mask %remaining = arith.subi %c40, %k : index %vl = arith.minsi %remaining, %c16 : index %mask = vector.create_mask %vl : vector<16xi1> - // 使用向量化读取数据 + // Vectorized data read %vec = vector.transfer_read %a[%i0, %j, %k], %c0, %mask : memref<12x40x40xf32>, vector<16xf32> - // 向量规约求和 + // Vector reduction sum %block_sum = vector.reduction <add>, %vec : vector<16xf32> into f32 %next_acc = arith.addf %acc, %block_sum : f32 affine.yield %next_acc : f32 } - // 写入结果 + // Write result memref.store %result_acc, %b[%i0, %j] : memref<12x40xf32> } } @@ -91,7 +91,7 @@ func.func @kernel(%a : memref<12x40x40xf32>) { %t_end = call @rtclock() : () -> f64 %time = arith.subf %t_end, %t_start : f64 - // 打印结果 + // Print result %printed_b = memref.cast %b : memref<12x40xf32> to memref<*xf32> // All the elements of the MemRef are the same, diff --git a/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir b/examples/BuddyNext/next-reduce-sum-vec-manual-1x40x1536.mlir similarity index 85% rename from examples/BuddyNext/next-reduce_sum-vec-manual1.mlir rename to examples/BuddyNext/next-reduce-sum-vec-manual-1x40x1536.mlir index 6167063c48..da6ee6b1ff 100644 --- a/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir +++ b/examples/BuddyNext/next-reduce-sum-vec-manual-1x40x1536.mlir @@ -23,15 +23,15 @@
func.func private @rtclock() -> f64 func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } -// 创建一个1x40x1536的输入张量 +// Create a 1x40x1536 input tensor memref.global "private" @A : memref<1x40x1536xf32> = dense<3.0> func.func @kernel(%a : memref<1x40x1536xf32>) { %t_start = call @rtclock() : () -> f64 - %b = memref.alloc() : memref<1x40xf32> // 输出张量 + %b = memref.alloc() : memref<1x40xf32> // Output tensor - // 初始化常量 + // Initialize constants %c0 = arith.constant 0.0 : f32 %c32 = arith.constant 32 : index %c1 = arith.constant 1 : index @@ -40,44 +40,44 @@ func.func @kernel(%a : memref<1x40x1536xf32>) { %c0_idx = arith.constant 0 : index %c8 = arith.constant 8 : index - // 使用分块和向量化处理 + // Use blocking and vectorization affine.for %j0 = 0 to 40 step 8 { - // 处理8个元素一组 + // Process 8 elements at a time affine.for %j1 = 0 to 8 { %j = affine.apply affine_map<(d0, d1) -> (d0 + d1)> (%j0, %j1) - // 检查是否在有效范围内 + // Check if within valid range %j_in_range = arith.cmpi slt, %j, %c40 : index - // 只在有效范围内进行计算 + // Only compute within valid range scf.if %j_in_range { - // 初始化累加器 + // Initialize accumulator %init_acc = arith.constant 0.0 : f32 - // 在k维度上使用32元素向量化 + // Vectorize along k dimension with 32 elements %result_acc = affine.for %k = 0 to 1536 step 32 iter_args(%acc = %init_acc) -> f32 { - // 预取下一个数据块 + // Prefetch next data block %next_k = arith.addi %k, %c32 : index %next_valid = arith.cmpi slt, %next_k, %c1536 : index scf.if %next_valid { memref.prefetch %a[%c0_idx, %j, %next_k], read, locality<3>, data : memref<1x40x1536xf32> } - // 计算当前块大小和掩码 + // Compute current block size and mask %remaining = arith.subi %c1536, %k : index %vl = arith.minsi %remaining, %c32 : index %mask = vector.create_mask %vl : vector<32xi1> - // 使用向量化读取数据 + // Vectorized data read %vec = vector.transfer_read %a[%c0_idx, %j, %k], %c0, %mask : memref<1x40x1536xf32>, vector<32xf32> - // 向量规约求和 + // Vector reduction sum %block_sum = vector.reduction <add>, %vec : vector<32xf32> into f32 %next_acc = arith.addf %acc, %block_sum : f32 affine.yield %next_acc : f32 } - // 写入结果 + // Write result memref.store %result_acc, %b[%c0_idx, %j] : memref<1x40xf32> } } @@ -86,7 +86,7 @@ func.func @kernel(%a : memref<1x40x1536xf32>) { %t_end = call @rtclock() : () -> f64 %time = arith.subf %t_end, %t_start : f64 - // 打印结果 + // Print result %printed_b = memref.cast %b : memref<1x40xf32> to memref<*xf32> // All the elements of the MemRef are the same, @@ -97,7 +97,7 @@ func.func @kernel(%a : memref<1x40x1536xf32>) { call @printMemrefF32(%printed_b) : (memref<*xf32>) -> () - // 打印时间 + // Print time vector.print %time : f64 memref.dealloc %b : memref<1x40xf32> @@ -109,3 +109,4 @@ func.func @main() { call @kernel(%a) : (memref<1x40x1536xf32>) -> () return } + diff --git a/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp b/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp index a4bfddf37a..4fea724255 100644 --- a/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp +++ b/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp @@ -321,4 +321,4 @@ void registerReduceSumVectorizationPass() { PassRegistration<ReduceSumVectorizationPass>(); } } // namespace buddy -} // namespace mlir \ No newline at end of file +} // namespace mlir From 1c30c7d6c50f7a9fcd43e2f40e106685faa54f00 Mon Sep 17 00:00:00 2001 From: hayden Date: Mon, 26 May 2025 12:54:54 +0800 Subject: [PATCH 4/5] remove compare_outputs.sh --- examples/BuddyNext/.gitignore | 1 + examples/BuddyNext/compare_outputs.sh | 121 -------------------------- 2 files changed, 1 insertion(+), 121 deletions(-) delete mode 100755 examples/BuddyNext/compare_outputs.sh diff --git a/examples/BuddyNext/.gitignore b/examples/BuddyNext/.gitignore index 80a243fa81..e3aa442c78 100644 --- a/examples/BuddyNext/.gitignore +++ b/examples/BuddyNext/.gitignore @@ -1 +1,2 @@ log.* +*.sh diff --git a/examples/BuddyNext/compare_outputs.sh b/examples/BuddyNext/compare_outputs.sh
deleted file mode 100755 index 85d3897bf6..0000000000 --- a/examples/BuddyNext/compare_outputs.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash - -# 设置颜色输出 -GREEN='\033[0;32m' -RED='\033[0;31m' -NC='\033[0m' # No Color -YELLOW='\033[1;33m' -BLUE='\033[0;34m' - -# 检查命令行参数 -if [ $# -ne 2 ]; then - echo -e "${YELLOW}Usage: $0 ${NC}" - echo "Example: $0 'next-reduce-sum-run' 'next-reduce-sum-vec-manual-run'" - exit 1 -fi - -CMD1="$1" -CMD2="$2" -RUNS=10 - -# 创建临时文件存储输出 -OUTPUT1=$(mktemp) -OUTPUT2=$(mktemp) -PROCESSED1=$(mktemp) -SPEEDUPS=$(mktemp) - -# 提取时间数据 -extract_time() { - local file="$1" - grep -o '[0-9]\+\.[0-9]\+e[-+]\?[0-9]\+\|[0-9]\+\.[0-9]\+' "$file" -} - -# 转换时间为秒 -convert_to_seconds() { - local time_val="$1" - if [[ $time_val =~ e ]]; then - echo "$time_val" | sed 's/e/*10^/' | bc -l - else - printf "%.9f" $time_val - fi -} - -# 计算平均值 -calculate_mean() { - local file="$1" - local sum=0 - local count=0 - while read -r line; do - sum=$(echo "$sum + $line" | bc -l) - count=$((count + 1)) - done < "$file" - if [ $count -gt 0 ]; then - echo "scale=9; $sum / $count" | bc -l - else - echo "0" - fi -} - -echo -e "${BLUE}Running each version $RUNS times...${NC}" - -# 运行两个命令并计算每次的加速比 -for ((i=1; i<=$RUNS; i++)); do - echo -ne "\rRun $i/$RUNS" - - # 运行第一个命令 - TEMP_OUT1=$(mktemp) - make $CMD1 > "$TEMP_OUT1" 2>/dev/null - TIME1=$(extract_time "$TEMP_OUT1") - if [ -n "$TIME1" ]; then - TIME1=$(convert_to_seconds "$TIME1") - fi - - # 运行第二个命令 - TEMP_OUT2=$(mktemp) - make $CMD2 > "$TEMP_OUT2" 2>/dev/null - TIME2=$(extract_time "$TEMP_OUT2") - if [ -n "$TIME2" ]; then - TIME2=$(convert_to_seconds "$TIME2") - fi - - # 保存第一次运行的输出用于比较 - if [ $i -eq 1 ]; then - grep "data =" "$TEMP_OUT1" | sed 's/base@ = [^[:space:]]*/base@ = /g' > "$PROCESSED1" - grep "data =" "$TEMP_OUT2" | sed 's/base@ = [^[:space:]]*/base@ = /g' > "$OUTPUT2" - fi - - # 计算这次运行的加速比 - if [ -n "$TIME1" ] && [ -n "$TIME2" ] && [ "$TIME1" != "0" ] && [ "$TIME2" != "0" ]; then - echo "scale=9; $TIME1/$TIME2" | bc 
-l >> "$SPEEDUPS" - fi - - rm "$TEMP_OUT1" "$TEMP_OUT2" -done -echo - -# 比较数据输出 -echo -e "\n${BLUE}Comparing output data:${NC}" -if diff "$PROCESSED1" "$OUTPUT2" > /dev/null; then - echo -e "${GREEN}✓ Outputs match! Both versions produce the same results.${NC}" -else - echo -e "${RED}✗ Outputs differ! Found differences:${NC}" - echo "----------------------------------------" - diff "$PROCESSED1" "$OUTPUT2" - echo "----------------------------------------" -fi - -# 计算加速比的均值 -echo -e "\n${BLUE}Performance Comparison:${NC}" -SPEEDUP_MEAN=$(calculate_mean "$SPEEDUPS") - -if [ -n "$SPEEDUP_MEAN" ] && [ "$SPEEDUP_MEAN" != "0" ]; then - if [ $(echo "$SPEEDUP_MEAN > 1" | bc -l) -eq 1 ]; then - printf "${GREEN}Second version is %.2fx faster${NC}\n" "$SPEEDUP_MEAN" - else - SLOWDOWN=$(echo "scale=2; 1/$SPEEDUP_MEAN" | bc -l) - printf "${RED}Second version is %.2fx slower${NC}\n" "$SLOWDOWN" - fi -fi - -# 清理临时文件 -rm "$OUTPUT1" "$OUTPUT2" "$PROCESSED1" "$SPEEDUPS" From 079a3beac2973b7cef3ce3f851b8f0c46e02b6ec Mon Sep 17 00:00:00 2001 From: hayden Date: Mon, 26 May 2025 12:56:38 +0800 Subject: [PATCH 5/5] modify gitignore --- examples/BuddyNext/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/BuddyNext/.gitignore b/examples/BuddyNext/.gitignore index e3aa442c78..56216edde9 100644 --- a/examples/BuddyNext/.gitignore +++ b/examples/BuddyNext/.gitignore @@ -1,2 +1,2 @@ log.* -*.sh +compare_outputs.sh