From bc96e8ca0f9bd06dd0e9ce31813b0b59ccd36732 Mon Sep 17 00:00:00 2001 From: hayden Date: Thu, 17 Apr 2025 14:00:45 +0800 Subject: [PATCH 1/5] TOSA:ReduceSumOP Vectorize Optimization --- examples/BuddyDeepSeekR1/AnalyseDialectOps.py | 87 +++++ examples/BuddyDeepSeekR1/makefile | 22 ++ examples/BuddyNext/compare_outputs.sh | 121 +++++++ examples/BuddyNext/makefile | 250 ++++++++++++++ .../BuddyNext/next-reduce_sum-vec-manual.mlir | 92 +++++ .../next-reduce_sum-vec-manual1.mlir | 87 +++++ examples/BuddyNext/next-reduce_sum.mlir | 73 ++++ examples/BuddyNext/next-reduce_sum1.mlir | 73 ++++ midend/lib/CMakeLists.txt | 1 + midend/lib/Conversion/CMakeLists.txt | 1 + .../TosaVectorization/CMakeLists.txt | 6 + .../ReduceSumVectorization3D.cpp | 324 ++++++++++++++++++ midend/lib/InitAll.cpp | 2 + tools/buddy-opt/CMakeLists.txt | 1 + tools/buddy-opt/buddy-opt.cpp | 2 + 15 files changed, 1142 insertions(+) create mode 100644 examples/BuddyDeepSeekR1/AnalyseDialectOps.py create mode 100644 examples/BuddyDeepSeekR1/makefile create mode 100755 examples/BuddyNext/compare_outputs.sh create mode 100644 examples/BuddyNext/next-reduce_sum-vec-manual.mlir create mode 100644 examples/BuddyNext/next-reduce_sum-vec-manual1.mlir create mode 100644 examples/BuddyNext/next-reduce_sum.mlir create mode 100644 examples/BuddyNext/next-reduce_sum1.mlir create mode 100644 midend/lib/Conversion/TosaVectorization/CMakeLists.txt create mode 100644 midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp diff --git a/examples/BuddyDeepSeekR1/AnalyseDialectOps.py b/examples/BuddyDeepSeekR1/AnalyseDialectOps.py new file mode 100644 index 0000000000..a2cf9983b2 --- /dev/null +++ b/examples/BuddyDeepSeekR1/AnalyseDialectOps.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 + +import os +import re +from collections import Counter, defaultdict +from pathlib import Path + +def extract_dialect_ops(mlir_file_path): + """ + Extract operations from all dialects in an MLIR file and count their occurrences. 
+ + Args: + mlir_file_path (str): Path to the MLIR file + + Returns: + dict: Dictionary containing dialect names as keys and Counter objects as values + """ + # Read the MLIR file + with open(mlir_file_path, 'r') as f: + content = f.read() + + # Find all operations using regex + # This pattern matches lines that contain operation names with dialect prefix + # Excludes numbers and common non-dialect prefixes + op_pattern = r'([a-zA-Z_][a-zA-Z0-9_]*)\.([a-zA-Z_][a-zA-Z0-9_]*)' + all_ops = re.findall(op_pattern, content) + + # Group operations by dialect + dialect_ops = defaultdict(Counter) + for dialect, op in all_ops: + # Skip common non-dialect prefixes + if dialect.lower() in ['func', 'module', 'memref', 'arith', 'builtin']: + continue + dialect_ops[dialect][op] += 1 + + return dialect_ops + +def main(): + # Get the directory of the current script + current_dir = Path(__file__).parent + + # Construct path to subgraph0.mlir + mlir_file = current_dir / 'subgraph0.mlir' + + if not mlir_file.exists(): + print(f"Error: {mlir_file} not found") + return + + # Extract and count operations by dialect + dialect_ops = extract_dialect_ops(str(mlir_file)) + + # Print results + print("\nMLIR Operation Statistics:") + print("=" * 60) + print(f"{'Dialect':<20} {'Operation':<30} {'Count':<10}") + print("=" * 60) + + total_ops = 0 + total_unique_ops = 0 + + # Sort dialects by total operation count + sorted_dialects = sorted( + dialect_ops.items(), + key=lambda x: sum(x[1].values()), + reverse=True + ) + + for dialect, ops in sorted_dialects: + dialect_total = sum(ops.values()) + total_ops += dialect_total + total_unique_ops += len(ops) + + print(f"\n{dialect} (Total: {dialect_total} ops)") + print("-" * 60) + + # Sort operations by count + sorted_ops = sorted(ops.items(), key=lambda x: x[1], reverse=True) + for op, count in sorted_ops: + print(f"{'':<20} {op:<30} {count:<10}") + + print("\n" + "=" * 60) + print(f"Total dialects: {len(dialect_ops)}") + print(f"Total unique 
operations: {total_unique_ops}") + print(f"Total operation instances: {total_ops}") + +if __name__ == "__main__": + main() diff --git a/examples/BuddyDeepSeekR1/makefile b/examples/BuddyDeepSeekR1/makefile new file mode 100644 index 0000000000..baf3c52b82 --- /dev/null +++ b/examples/BuddyDeepSeekR1/makefile @@ -0,0 +1,22 @@ +#!/bin/bash +BUDDY_OPT := ../../build/bin/buddy-opt +MLIR_OPT := ../../llvm/build/bin/mlir-opt +MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate +MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner +LLC := ../../llvm/build/bin/llc +OPT_FLAG := -O0 + +ifeq ($(shell uname),Linux) +MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so +MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so +LIB_OMP := ../../llvm/build/lib/libomp.so +MTRIPLE := x86_64-unknown-linux-gnu +else ifeq ($(shell uname),Darwin) +MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib +MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib +MTRIPLE := x86_64-apple-darwin +endif + +lower-deepseek-r1-tosa: + @${MLIR_OPT} ./subgraph0.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" -o ./subgraph0-lower.mlir \ No newline at end of file diff --git a/examples/BuddyNext/compare_outputs.sh b/examples/BuddyNext/compare_outputs.sh new file mode 100755 index 0000000000..8fd80acc86 --- /dev/null +++ b/examples/BuddyNext/compare_outputs.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# 设置颜色输出 +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' # No Color +YELLOW='\033[1;33m' +BLUE='\033[0;34m' + +# 检查命令行参数 +if [ $# -ne 2 ]; then + echo -e "${YELLOW}Usage: $0 ${NC}" + echo "Example: $0 'next-reduce-sum-run' 'next-reduce-sum-vec-manual-run'" + exit 1 +fi + +CMD1="$1" +CMD2="$2" +RUNS=100 + +# 创建临时文件存储输出 +OUTPUT1=$(mktemp) +OUTPUT2=$(mktemp) +PROCESSED1=$(mktemp) +SPEEDUPS=$(mktemp) + +# 提取时间数据 +extract_time() { + local file="$1" + 
grep -o '[0-9]\+\.[0-9]\+e[-+]\?[0-9]\+\|[0-9]\+\.[0-9]\+' "$file" +} + +# 转换时间为秒 +convert_to_seconds() { + local time_val="$1" + if [[ $time_val =~ e ]]; then + echo "$time_val" | sed 's/e/*10^/' | bc -l + else + printf "%.9f" $time_val + fi +} + +# 计算平均值 +calculate_mean() { + local file="$1" + local sum=0 + local count=0 + while read -r line; do + sum=$(echo "$sum + $line" | bc -l) + count=$((count + 1)) + done < "$file" + if [ $count -gt 0 ]; then + echo "scale=9; $sum / $count" | bc -l + else + echo "0" + fi +} + +echo -e "${BLUE}Running each version $RUNS times...${NC}" + +# 运行两个命令并计算每次的加速比 +for ((i=1; i<=$RUNS; i++)); do + echo -ne "\rRun $i/$RUNS" + + # 运行第一个命令 + TEMP_OUT1=$(mktemp) + make $CMD1 > "$TEMP_OUT1" 2>/dev/null + TIME1=$(extract_time "$TEMP_OUT1") + if [ -n "$TIME1" ]; then + TIME1=$(convert_to_seconds "$TIME1") + fi + + # 运行第二个命令 + TEMP_OUT2=$(mktemp) + make $CMD2 > "$TEMP_OUT2" 2>/dev/null + TIME2=$(extract_time "$TEMP_OUT2") + if [ -n "$TIME2" ]; then + TIME2=$(convert_to_seconds "$TIME2") + fi + + # 保存第一次运行的输出用于比较 + if [ $i -eq 1 ]; then + grep "data =" "$TEMP_OUT1" | sed 's/base@ = [^[:space:]]*/base@ = /g' > "$PROCESSED1" + grep "data =" "$TEMP_OUT2" | sed 's/base@ = [^[:space:]]*/base@ = /g' > "$OUTPUT2" + fi + + # 计算这次运行的加速比 + if [ -n "$TIME1" ] && [ -n "$TIME2" ] && [ "$TIME1" != "0" ] && [ "$TIME2" != "0" ]; then + echo "scale=9; $TIME1/$TIME2" | bc -l >> "$SPEEDUPS" + fi + + rm "$TEMP_OUT1" "$TEMP_OUT2" +done +echo + +# 比较数据输出 +echo -e "\n${BLUE}Comparing output data:${NC}" +if diff "$PROCESSED1" "$OUTPUT2" > /dev/null; then + echo -e "${GREEN}✓ Outputs match! Both versions produce the same results.${NC}" +else + echo -e "${RED}✗ Outputs differ! 
Found differences:${NC}" + echo "----------------------------------------" + diff "$PROCESSED1" "$OUTPUT2" + echo "----------------------------------------" +fi + +# 计算加速比的均值 +echo -e "\n${BLUE}Performance Comparison:${NC}" +SPEEDUP_MEAN=$(calculate_mean "$SPEEDUPS") + +if [ -n "$SPEEDUP_MEAN" ] && [ "$SPEEDUP_MEAN" != "0" ]; then + if [ $(echo "$SPEEDUP_MEAN > 1" | bc -l) -eq 1 ]; then + printf "${GREEN}Second version is %.2fx faster${NC}\n" "$SPEEDUP_MEAN" + else + SLOWDOWN=$(echo "scale=2; 1/$SPEEDUP_MEAN" | bc -l) + printf "${RED}Second version is %.2fx slower${NC}\n" "$SLOWDOWN" + fi +fi + +# 清理临时文件 +rm "$OUTPUT1" "$OUTPUT2" "$PROCESSED1" "$SPEEDUPS" \ No newline at end of file diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile index c7f75e2307..a49563387f 100644 --- a/examples/BuddyNext/makefile +++ b/examples/BuddyNext/makefile @@ -381,6 +381,256 @@ next-transpose-vec-manual-run: -shared-libs=${MLIR_RUNNER_UTILS} \ -shared-libs=${MLIR_C_RUNNER_UTILS} +next-transpose-vec-autoopt-run: + @${MLIR_OPT} ./log-transpose-optimized.mlir \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -lower-affine \ + -convert-arith-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-transpose-vec-auto-run: + @${BUDDY_OPT} next-transpose.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + 
-empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -func-bufferize \ + -arith-bufferize | \ + ${BUDDY_OPT} \ + -genericOp-transpose-vectorization="vector-size=16" \ + -func-bufferize \ + -arith-bufferize \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -lower-affine \ + -convert-arith-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + + +next-reduce-sum-lower: + @${MLIR_OPT} ./next-reduce_sum1.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -func-bufferize \ + -arith-bufferize \ + -o next-log1.mlir + + +next-reduce-sum-run: + @${MLIR_OPT} ./next-reduce_sum.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -func-bufferize \ + -arith-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + 
-reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-reduce-sum1-run: + @${MLIR_OPT} ./next-reduce_sum1.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -func-bufferize \ + -arith-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-reduce-sum-vec-manual1-run: + @${MLIR_OPT} ./next-reduce_sum-vec-manual1.mlir \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-reduce-sum-vec-auto-run: + @${MLIR_OPT} ./next-reduce_sum.mlir \ + -pass-pipeline 
"builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize | \ + ${BUDDY_OPT} \ + -reduce-sum-vectorization-3d="vector-size=16" \ + -func-bufferize \ + -arith-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-reduce-sum1-vec-auto-run: + @${MLIR_OPT} ./next-reduce_sum1.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize | \ + ${BUDDY_OPT} \ + -reduce-sum-vectorization-3d="vector-size=16" \ + -func-bufferize \ + -arith-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + + 
+next-reduce-sum-vec-manual-run: + @${MLIR_OPT} ./next-reduce_sum-vec-manual.mlir \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} \ + -shared-libs=${MLIR_C_RUNNER_UTILS} + next-embedding-lower: @${MLIR_OPT} ./next-embedding.mlir \ -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ diff --git a/examples/BuddyNext/next-reduce_sum-vec-manual.mlir b/examples/BuddyNext/next-reduce_sum-vec-manual.mlir new file mode 100644 index 0000000000..044da55002 --- /dev/null +++ b/examples/BuddyNext/next-reduce_sum-vec-manual.mlir @@ -0,0 +1,92 @@ +// RUN: buddy-opt -reduce-vectorize="vector-size=16" -verify-diagnostics -lower-affine -expand-strided-metadata -convert-vector-to-scf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-arith-to-llvm -convert-func-to-llvm -lower-affine -llvm-request-c_wrappers -convert-arith-to-llvm -reconcile-unrealized-casts %s \ +// RUN: | mlir-cpu-runner -O0 -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 +func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } + +// 创建一个12x40x40的输入张量 +memref.global "private" @A : memref<12x40x40xf32> = dense<3.0> + +func.func @kernel(%a : memref<12x40x40xf32>) { + %t_start = call @rtclock() : () 
-> f64 + + %b = memref.alloc() : memref<12x40xf32> // 输出张量 + + // 初始化常量 + %c0 = arith.constant 0.0 : f32 + %c16 = arith.constant 16 : index + %c12 = arith.constant 12 : index + %c40 = arith.constant 40 : index + %c0_idx = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + + // 使用step 1的外层循环和8x8分块 + affine.for %i0 = 0 to 12 step 1 { + affine.for %j0 = 0 to 40 step 8 { + // 使用1维并行处理 + affine.parallel (%idx) = (0) to (8) { + // 计算j1 + %j1 = arith.remui %idx, %c8 : index + + %j = affine.apply affine_map<(d0, d1) -> (d0 + d1)> (%j0, %j1) + + // 检查是否在有效范围内 + %j_in_range = arith.cmpi slt, %j, %c40 : index + + // 只在有效范围内进行计算 + scf.if %j_in_range { + // 初始化累加器 + %init_acc = arith.constant 0.0 : f32 + + // 在k维度上使用16元素向量化 + %result_acc = affine.for %k = 0 to 40 step 16 iter_args(%acc = %init_acc) -> f32 { + // 预取下一个数据块 + %next_k = arith.addi %k, %c16 : index + %next_valid = arith.cmpi slt, %next_k, %c40 : index + scf.if %next_valid { + memref.prefetch %a[%i0, %j, %next_k], read, locality<3>, data : memref<12x40x40xf32> + } + + // 计算当前块大小和掩码 + %remaining = arith.subi %c40, %k : index + %vl = arith.minsi %remaining, %c16 : index + %mask = vector.create_mask %vl : vector<16xi1> + + // 使用向量化读取数据 + %vec = vector.transfer_read %a[%i0, %j, %k], %c0, %mask : memref<12x40x40xf32>, vector<16xf32> + + // 向量规约求和 + %block_sum = vector.reduction , %vec : vector<16xf32> into f32 + %next_acc = arith.addf %acc, %block_sum : f32 + affine.yield %next_acc : f32 + } + + // 写入结果 + memref.store %result_acc, %b[%i0, %j] : memref<12x40xf32> + } + } + } + } + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + // 打印结果 + %printed_b = memref.cast %b : memref<12x40xf32> to memref<*xf32> + call @printMemrefF32(%printed_b) : (memref<*xf32>) -> () + + // 打印时间 + vector.print %time : f64 + + memref.dealloc %b : memref<12x40xf32> + return +} + +func.func @main() { + %a = memref.get_global @A : memref<12x40x40xf32> + call @kernel(%a) : 
(memref<12x40x40xf32>) -> () + return +} \ No newline at end of file diff --git a/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir b/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir new file mode 100644 index 0000000000..373b427b4d --- /dev/null +++ b/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir @@ -0,0 +1,87 @@ +// RUN: buddy-opt -reduce-vectorize="vector-size=32" -verify-diagnostics -lower-affine -expand-strided-metadata -convert-vector-to-scf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-arith-to-llvm -convert-func-to-llvm -lower-affine -llvm-request-c_wrappers -convert-arith-to-llvm -reconcile-unrealized-casts %s \ +// RUN: | mlir-cpu-runner -O0 -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 +func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } + +// 创建一个1x40x1536的输入张量 +memref.global "private" @A : memref<1x40x1536xf32> = dense<3.0> + +func.func @kernel(%a : memref<1x40x1536xf32>) { + %t_start = call @rtclock() : () -> f64 + + %b = memref.alloc() : memref<1x40xf32> // 输出张量 + + // 初始化常量 + %c0 = arith.constant 0.0 : f32 + %c32 = arith.constant 32 : index + %c1 = arith.constant 1 : index + %c40 = arith.constant 40 : index + %c1536 = arith.constant 1536 : index + %c0_idx = arith.constant 0 : index + %c8 = arith.constant 8 : index + + // 使用分块和向量化处理 + affine.for %j0 = 0 to 40 step 8 { + // 处理8个元素一组 + affine.for %j1 = 0 to 8 { + %j = affine.apply affine_map<(d0, d1) -> (d0 + d1)> (%j0, %j1) + + // 检查是否在有效范围内 + %j_in_range = arith.cmpi slt, %j, %c40 : index + + // 只在有效范围内进行计算 + scf.if %j_in_range { + // 初始化累加器 + %init_acc = arith.constant 0.0 : f32 + + // 在k维度上使用32元素向量化 + %result_acc = affine.for %k = 0 to 1536 step 32 iter_args(%acc = %init_acc) -> f32 { + // 预取下一个数据块 + %next_k = arith.addi %k, %c32 : index + 
%next_valid = arith.cmpi slt, %next_k, %c1536 : index + scf.if %next_valid { + memref.prefetch %a[%c0_idx, %j, %next_k], read, locality<3>, data : memref<1x40x1536xf32> + } + + // 计算当前块大小和掩码 + %remaining = arith.subi %c1536, %k : index + %vl = arith.minsi %remaining, %c32 : index + %mask = vector.create_mask %vl : vector<32xi1> + + // 使用向量化读取数据 + %vec = vector.transfer_read %a[%c0_idx, %j, %k], %c0, %mask : memref<1x40x1536xf32>, vector<32xf32> + + // 向量规约求和 + %block_sum = vector.reduction <add>, %vec : vector<32xf32> into f32 + %next_acc = arith.addf %acc, %block_sum : f32 + affine.yield %next_acc : f32 + } + + // 写入结果 + memref.store %result_acc, %b[%c0_idx, %j] : memref<1x40xf32> + } + } + } + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + // 打印结果 + %printed_b = memref.cast %b : memref<1x40xf32> to memref<*xf32> + call @printMemrefF32(%printed_b) : (memref<*xf32>) -> () + + // 打印时间 + vector.print %time : f64 + + memref.dealloc %b : memref<1x40xf32> + return +} + +func.func @main() { + %a = memref.get_global @A : memref<1x40x1536xf32> + call @kernel(%a) : (memref<1x40x1536xf32>) -> () + return +} diff --git a/examples/BuddyNext/next-reduce_sum.mlir b/examples/BuddyNext/next-reduce_sum.mlir new file mode 100644 index 0000000000..a2732c8d2b --- /dev/null +++ b/examples/BuddyNext/next-reduce_sum.mlir @@ -0,0 +1,73 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: 
-expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 +func.func private @printMemrefF32(%ptr : tensor<*xf32>) + +func.func @kernel(%t0 : tensor<12x40x40xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Perform reduce_sum along axis=2 + %t1 = tosa.reduce_sum %t0 {axis = 2 : i32} : (tensor<12x40x40xf32>) -> tensor<12x40x1xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %t1 : tensor<12x40x1xf32> to tensor<*xf32> + + // Verify the output shape and some sample values + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [12, 40, 1] strides = [40, 1, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [120] + // CHECK-SAME: [120] + // CHECK-SAME: ... 
+ // CHECK-SAME: [120] + // CHECK-SAME: ] + + // Print results + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + // Print timings + vector.print %time : f64 + + return +} + +func.func @main() { + // Create a tensor filled with 3.0 + %c0 = arith.constant dense<3.0> : tensor<12x40x40xf32> + call @kernel(%c0) : (tensor<12x40x40xf32>) -> () + + return +} \ No newline at end of file diff --git a/examples/BuddyNext/next-reduce_sum1.mlir b/examples/BuddyNext/next-reduce_sum1.mlir new file mode 100644 index 0000000000..7b691cc69f --- /dev/null +++ b/examples/BuddyNext/next-reduce_sum1.mlir @@ -0,0 +1,73 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 +func.func private @printMemrefF32(%ptr : 
tensor<*xf32>) + +func.func @kernel(%t0 : tensor<1x40x1536xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Perform reduce_sum along axis=2 + %t1 = tosa.reduce_sum %t0 {axis = 2 : i32} : (tensor<1x40x1536xf32>) -> tensor<1x40x1xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %t1 : tensor<1x40x1xf32> to tensor<*xf32> + + // Verify the output shape and some sample values + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 1] strides = [40, 1, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [4608] + // CHECK-SAME: [4608] + // CHECK-SAME: ... + // CHECK-SAME: [4608] + // CHECK-SAME: ] + + // Print results + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + // Print timings + vector.print %time : f64 + + return +} + +func.func @main() { + // Create a tensor filled with 3.0 + %c0 = arith.constant dense<3.0> : tensor<1x40x1536xf32> + call @kernel(%c0) : (tensor<1x40x1536xf32>) -> () + + return +} \ No newline at end of file diff --git a/midend/lib/CMakeLists.txt b/midend/lib/CMakeLists.txt index cae54478c3..b8b2d18fd4 100644 --- a/midend/lib/CMakeLists.txt +++ b/midend/lib/CMakeLists.txt @@ -27,6 +27,7 @@ set(LinkedLibs MatMulParallelVectorization SchedulingOnDevices TransposeOptimization + TosaVectorization ) diff --git a/midend/lib/Conversion/CMakeLists.txt b/midend/lib/Conversion/CMakeLists.txt index c3c2fa2ddd..1d9e9b63d4 100644 --- a/midend/lib/Conversion/CMakeLists.txt +++ b/midend/lib/Conversion/CMakeLists.txt @@ -16,3 +16,4 @@ add_subdirectory(LowerSche) add_subdirectory(FuncBufferize) add_subdirectory(DepthwiseConvOptimization) add_subdirectory(MLIRGPU) +add_subdirectory(TosaVectorization) diff --git a/midend/lib/Conversion/TosaVectorization/CMakeLists.txt b/midend/lib/Conversion/TosaVectorization/CMakeLists.txt new file mode 100644 index 0000000000..fead1acafb --- /dev/null +++ 
b/midend/lib/Conversion/TosaVectorization/CMakeLists.txt @@ -0,0 +1,6 @@ +add_mlir_library(TosaVectorization + ReduceSumVectorization3D.cpp + + LINK_LIBS PUBLIC + BuddyUtils +) diff --git a/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp b/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp new file mode 100644 index 0000000000..a4bfddf37a --- /dev/null +++ b/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp @@ -0,0 +1,324 @@ +//===- ReduceSumVectorization3D.cpp ----------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements the reduce sum vectorization for 3D tensors. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/IntegerSet.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/TypeRange.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Support/LogicalResult.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace mlir; +using namespace vector; +using namespace affine; + +//===----------------------------------------------------------------------===// +// Rewrite Pattern +//===----------------------------------------------------------------------===// + +namespace { + +class ReduceSumVectorization3DPattern : public ConversionPattern { +public: + explicit ReduceSumVectorization3DPattern(MLIRContext *context, + int64_t affineVectorSizeParam) + : ConversionPattern(linalg::ReduceOp::getOperationName(), 1, context), + affineVectorSize(affineVectorSizeParam) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto reduceOp = cast(op); + + // Check if it's a 3D to 2D reduction + if (!reduceOp.getOperand(0).getType().isa() || + !reduceOp.getOperand(1).getType().isa()) + return failure(); + + auto inputType = reduceOp.getOperand(0).getType().cast(); + auto outputType = reduceOp.getOperand(1).getType().cast(); + + // Verify dimensions + if (inputType.getRank() != 3 || outputType.getRank() != 2) + return failure(); + + // Get input and output + auto input = reduceOp.getOperand(0); + auto output = 
reduceOp.getOperand(1); + auto loc = op->getLoc(); + + // Get element type of input tensor + Type elementType = inputType.getElementType(); + + // Define constants + const Value index0 = + rewriter.create(loc, rewriter.getIndexAttr(0)); + const Value indexVecSize = rewriter.create( + loc, rewriter.getIndexAttr(affineVectorSize)); + // const Value c8 = + // rewriter.create(loc, rewriter.getIndexAttr(8)); + // const Value c1 = + // rewriter.create(loc, rewriter.getIndexAttr(1)); + const Value zeroFloat = rewriter.create( + loc, rewriter.getZeroAttr(elementType)); + + // Get input tensor dimensions + Value dim0 = rewriter.create(loc, input, 0); + Value dim1 = rewriter.create(loc, input, 1); + Value dim2 = rewriter.create(loc, input, 2); + + // Outer loop - first dimension + affine::buildAffineLoopNest( + rewriter, loc, {index0}, {dim0}, 1, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value i0 = ivRange.front(); + + // Middle loop - second dimension, step 8 + affine::buildAffineLoopNest( + builder, loc, {index0}, {dim1}, 8, + [&](OpBuilder &builder, Location loc, ValueRange ivRange) { + Value j0 = ivRange.front(); + + // Create parallel op to process 8 blocks + SmallVector reducedValues = + llvm::to_vector<4>(llvm::map_range( + ArrayRef{}, + [](const LoopReduction &red) { return red.value; })); + + AffineParallelOp parallelOp = + builder.create( + loc, ValueRange(reducedValues).getTypes(), ValueRange{}, + ArrayRef{ + builder.getNamedAttr("lowerBoundsGroups", + builder.getI32TensorAttr({1})), + builder.getNamedAttr("upperBoundsGroups", + builder.getI32TensorAttr({1})), + builder.getNamedAttr( + "lowerBoundsMap", + AffineMapAttr::get(AffineMap::get( + 0, 0, {builder.getAffineConstantExpr(0)}, + builder.getContext()))), + builder.getNamedAttr( + "upperBoundsMap", + AffineMapAttr::get(AffineMap::get( + 0, 0, {builder.getAffineConstantExpr(8)}, + builder.getContext()))), + builder.getNamedAttr("steps", + builder.getI64ArrayAttr({1})), + 
builder.getNamedAttr("reductions", + builder.getArrayAttr({}))}); + + // Create parallel block body + Block *parallelBody = new Block(); + builder.setInsertionPointToStart(parallelBody); + parallelBody->addArgument(builder.getIndexType(), loc); + Value idx = parallelBody->getArguments()[0]; + + // Calculate actual j index + Value j = builder.create(loc, j0, idx); + + // Check if j is within valid range + Value j_in_range = builder.create( + loc, arith::CmpIPredicate::slt, j, dim1); + + builder.create( + loc, j_in_range, [&](OpBuilder &builder, Location loc) { + // Initialize accumulator + Value acc = builder.create( + loc, builder.getZeroAttr(elementType)); + + // Vectorized reduction in the innermost dimension + auto lbMap = AffineMap::get( + /*dimCount=*/0, /*symbolCount=*/0, + builder.getAffineConstantExpr(0), + builder.getContext()); + auto ubMap = AffineMap::get( + /*dimCount=*/1, /*symbolCount=*/0, + builder.getAffineDimExpr(0), builder.getContext()); + + affine::AffineForOp reductionLoop = builder.create< + affine::AffineForOp>( + loc, + /*lbOperands=*/ValueRange{}, + /*lbMap=*/lbMap, + /*ubOperands=*/ValueRange{dim2}, + /*ubMap=*/ubMap, + /*step=*/affineVectorSize, + /*iterArgs=*/ValueRange{acc}, + [&](OpBuilder &builder, Location loc, Value iv, + ValueRange iterArgs) { + Value curr_acc = iterArgs[0]; + + // Prefetch next data block + Value next_k = builder.create( + loc, iv, indexVecSize); + Value next_valid = builder.create( + loc, arith::CmpIPredicate::slt, next_k, dim2); + + builder.create( + loc, next_valid, + [&](OpBuilder &builder, Location loc) { + builder.create( + loc, input, ValueRange{i0, j, next_k}, + /*isWrite=*/false, + /*locality=*/3, + /*isDataCache=*/true); + builder.create(loc); + }); + + // Calculate current block size and mask + Value remaining = + builder.create(loc, dim2, iv); + Value vl = builder.create( + loc, remaining, indexVecSize); + Value mask = builder.create( + loc, + VectorType::get({(int64_t)affineVectorSize}, + 
builder.getI1Type()), + ValueRange{vl}); + + // Vectorized read + auto vecType = VectorType::get( + {(int64_t)affineVectorSize}, elementType); + auto map = AffineMap::get( + /*dimCount=*/3, // 3D输入 + /*symbolCount=*/0, + {rewriter.getAffineDimExpr(2)}, // 只映射k维度 + rewriter.getContext()); + Value vec = builder.create( + loc, vecType, input, ValueRange{i0, j, iv}, map, + zeroFloat, mask, + ArrayAttr::get(builder.getContext(), + {builder.getBoolAttr(false)})); + + // Vector reduction sum + Value block_sum = + builder.create( + loc, vector::CombiningKind::ADD, vec); + + // Update accumulator + Value next_acc = builder.create( + loc, curr_acc, block_sum); + + builder.create(loc, + next_acc); + }); + + // Store result + builder.create( + loc, reductionLoop.getResult(0), output, + ValueRange{i0, j}); + + builder.create(loc); + }); + + builder.create(loc); + parallelOp.getRegion().push_back(parallelBody); + }); + }); + + // Remove original operation + rewriter.eraseOp(op); + return success(); + } + +private: + int64_t affineVectorSize; +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// ReduceSumVectorizationPass +//===----------------------------------------------------------------------===// + +namespace { +class ReduceSumVectorizationPass + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ReduceSumVectorizationPass) + + StringRef getArgument() const final { return "reduce-sum-vectorization-3d"; } + + StringRef getDescription() const final { + return "Reduce Sum Vectorization for 3D tensors."; + } + + ReduceSumVectorizationPass() = default; + + ReduceSumVectorizationPass(const ReduceSumVectorizationPass &) {} + + explicit ReduceSumVectorizationPass(int64_t affineVectorSizeParam) { + affineVectorSize = affineVectorSizeParam; + } + + void runOnOperation() override { + MLIRContext *context = &getContext(); + ModuleOp module = getOperation(); + ConversionTarget 
target(*context); + target.addLegalDialect(); + target.addLegalOp(); + RewritePatternSet patterns(context); + patterns.add(context, affineVectorSize); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) + signalPassFailure(); + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + Option affineVectorSize{*this, "vector-size", + llvm::cl::desc("Affine Vector size."), + llvm::cl::init(16)}; +}; +} // namespace + +namespace mlir { +namespace buddy { +void registerReduceSumVectorizationPass() { + PassRegistration(); +} +} // namespace buddy +} // namespace mlir \ No newline at end of file diff --git a/midend/lib/InitAll.cpp b/midend/lib/InitAll.cpp index d6cad2bc1e..f40ffac4ef 100644 --- a/midend/lib/InitAll.cpp +++ b/midend/lib/InitAll.cpp @@ -48,6 +48,7 @@ void registerMatMulParallelVectorizationPass(); void registerMatMulVectorizationPass(); void registerDeviceSchedulePass(); void registerTransposeOptimizationPass(); +void registerReduceSumVectorizationPass(); } // namespace buddy } // namespace mlir @@ -80,4 +81,5 @@ void mlir::buddy::registerAllPasses() { mlir::buddy::registerMatMulVectorizationPass(); mlir::buddy::registerDeviceSchedulePass(); mlir::buddy::registerTransposeOptimizationPass(); + mlir::buddy::registerReduceSumVectorizationPass(); } diff --git a/tools/buddy-opt/CMakeLists.txt b/tools/buddy-opt/CMakeLists.txt index 0abb857fad..bce971dae6 100644 --- a/tools/buddy-opt/CMakeLists.txt +++ b/tools/buddy-opt/CMakeLists.txt @@ -28,6 +28,7 @@ target_link_libraries(buddy-opt BatchMatMulOptimization MatMulParallelVectorization TransposeOptimization + TosaVectorization ConvOptimization DepthwiseConvOptimization VectorExp diff --git a/tools/buddy-opt/buddy-opt.cpp b/tools/buddy-opt/buddy-opt.cpp index 61a0958c72..9e7035edf8 100644 --- a/tools/buddy-opt/buddy-opt.cpp +++ b/tools/buddy-opt/buddy-opt.cpp @@ -71,6 +71,7 @@ void registerMatMulOptimizePass(); void registerMatMulVectorizationPass(); 
void registerMatMulParallelVectorizationPass(); void registerTransposeOptimizationPass(); +void registerReduceSumVectorizationPass(); void registerConvOptimizePass(); void registerConvNhwcFhwcOptimizePass(); void registerConvNhwcFhwcTileOptimizePass(); @@ -118,6 +119,7 @@ int main(int argc, char **argv) { mlir::buddy::registerMatMulVectorizationPass(); mlir::buddy::registerMatMulParallelVectorizationPass(); mlir::buddy::registerTransposeOptimizationPass(); + mlir::buddy::registerReduceSumVectorizationPass(); mlir::buddy::registerConvOptimizePass(); mlir::buddy::registerConvNhwcFhwcOptimizePass(); mlir::buddy::registerConvNhwcFhwcTileOptimizePass(); From edbbc65078d6023d621a84bf4b4ba96e49102895 Mon Sep 17 00:00:00 2001 From: hayden Date: Thu, 17 Apr 2025 21:53:46 +0800 Subject: [PATCH 2/5] Fixed Reduce_sum MLIR File Buddy-Check Error --- examples/BuddyNext/compare_outputs.sh | 4 +-- .../BuddyNext/next-reduce_sum-vec-manual.mlir | 28 +++++++++++++++++-- .../next-reduce_sum-vec-manual1.mlir | 26 ++++++++++++++++- examples/BuddyNext/next-reduce_sum.mlir | 20 ++++++------- examples/BuddyNext/next-reduce_sum1.mlir | 22 ++++++--------- 5 files changed, 70 insertions(+), 30 deletions(-) diff --git a/examples/BuddyNext/compare_outputs.sh b/examples/BuddyNext/compare_outputs.sh index 8fd80acc86..85d3897bf6 100755 --- a/examples/BuddyNext/compare_outputs.sh +++ b/examples/BuddyNext/compare_outputs.sh @@ -16,7 +16,7 @@ fi CMD1="$1" CMD2="$2" -RUNS=100 +RUNS=10 # 创建临时文件存储输出 OUTPUT1=$(mktemp) @@ -118,4 +118,4 @@ if [ -n "$SPEEDUP_MEAN" ] && [ "$SPEEDUP_MEAN" != "0" ]; then fi # 清理临时文件 -rm "$OUTPUT1" "$OUTPUT2" "$PROCESSED1" "$SPEEDUPS" \ No newline at end of file +rm "$OUTPUT1" "$OUTPUT2" "$PROCESSED1" "$SPEEDUPS" diff --git a/examples/BuddyNext/next-reduce_sum-vec-manual.mlir b/examples/BuddyNext/next-reduce_sum-vec-manual.mlir index 044da55002..236c95366f 100644 --- a/examples/BuddyNext/next-reduce_sum-vec-manual.mlir +++ b/examples/BuddyNext/next-reduce_sum-vec-manual.mlir @@ 
-1,4 +1,21 @@ -// RUN: buddy-opt -reduce-vectorize="vector-size=16" -verify-diagnostics -lower-affine -expand-strided-metadata -convert-vector-to-scf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-arith-to-llvm -convert-func-to-llvm -lower-affine -llvm-request-c_wrappers -convert-arith-to-llvm -reconcile-unrealized-casts %s \ +// RUN: buddy-opt %s \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ // RUN: | mlir-cpu-runner -O0 -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ // RUN: | FileCheck %s @@ -76,9 +93,16 @@ func.func @kernel(%a : memref<12x40x40xf32>) { // 打印结果 %printed_b = memref.cast %b : memref<12x40xf32> to memref<*xf32> + + // All the elements of the MemRef are the same, + // only check the first line to verify the correctness. 
+ // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [12, 40] strides = [40, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [120{{(, 120)*}}] + call @printMemrefF32(%printed_b) : (memref<*xf32>) -> () - // 打印时间 + // Print timings vector.print %time : f64 memref.dealloc %b : memref<12x40xf32> diff --git a/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir b/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir index 373b427b4d..6167063c48 100644 --- a/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir +++ b/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir @@ -1,4 +1,21 @@ -// RUN: buddy-opt -reduce-vectorize="vector-size=32" -verify-diagnostics -lower-affine -expand-strided-metadata -convert-vector-to-scf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-arith-to-llvm -convert-func-to-llvm -lower-affine -llvm-request-c_wrappers -convert-arith-to-llvm -reconcile-unrealized-casts %s \ +// RUN: buddy-opt %s \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ // RUN: | mlir-cpu-runner -O0 -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ // RUN: | FileCheck %s @@ -71,6 +88,13 @@ func.func @kernel(%a : memref<1x40x1536xf32>) { // 打印结果 %printed_b = memref.cast %b : memref<1x40xf32> to memref<*xf32> + + // All the elements of the MemRef are the same, + // only check the first line to verify the correctness. 
+ // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [1, 40] strides = [40, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [4608{{(, 4608)*}}] + call @printMemrefF32(%printed_b) : (memref<*xf32>) -> () // 打印时间 diff --git a/examples/BuddyNext/next-reduce_sum.mlir b/examples/BuddyNext/next-reduce_sum.mlir index a2732c8d2b..a5ddd89654 100644 --- a/examples/BuddyNext/next-reduce_sum.mlir +++ b/examples/BuddyNext/next-reduce_sum.mlir @@ -5,14 +5,13 @@ // RUN: -eliminate-empty-tensors \ // RUN: -empty-tensor-to-alloc-tensor \ // RUN: -one-shot-bufferize \ +// RUN: | buddy-opt \ +// RUN: -reduce-sum-vectorization-3d="vector-size=16" \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ // RUN: -convert-linalg-to-affine-loops \ // RUN: -affine-loop-fusion \ // RUN: -lower-affine \ -// RUN: -func-bufferize \ -// RUN: -arith-bufferize \ -// RUN: -tensor-bufferize \ -// RUN: -buffer-deallocation \ -// RUN: -finalizing-bufferize \ // RUN: -convert-vector-to-scf \ // RUN: -expand-strided-metadata \ // RUN: -convert-vector-to-llvm \ @@ -24,7 +23,7 @@ // RUN: -convert-openmp-to-llvm \ // RUN: -convert-arith-to-llvm \ // RUN: -convert-math-to-llvm \ -// RUN: -convert-math-to-libm \ +// RUN: -convert-math-to-libm \ // RUN: -convert-func-to-llvm \ // RUN: -reconcile-unrealized-casts \ // RUN: | mlir-cpu-runner -e main -entry-point-result=void \ @@ -46,15 +45,12 @@ func.func @kernel(%t0 : tensor<12x40x40xf32>) { %tensor_unranked = tensor.cast %t1 : tensor<12x40x1xf32> to tensor<*xf32> - // Verify the output shape and some sample values + // All the elements of the MemRef are the same, + // only check the first line to verify the correctness. // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [12, 40, 1] strides = [40, 1, 1] data = // CHECK-NEXT: [ // CHECK-SAME: [ - // CHECK-SAME: [120] - // CHECK-SAME: [120] - // CHECK-SAME: ... 
- // CHECK-SAME: [120] - // CHECK-SAME: ] + // CHECK-SAME: [120{{(, 120)*}}], // Print results call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () diff --git a/examples/BuddyNext/next-reduce_sum1.mlir b/examples/BuddyNext/next-reduce_sum1.mlir index 7b691cc69f..246052a40c 100644 --- a/examples/BuddyNext/next-reduce_sum1.mlir +++ b/examples/BuddyNext/next-reduce_sum1.mlir @@ -5,14 +5,13 @@ // RUN: -eliminate-empty-tensors \ // RUN: -empty-tensor-to-alloc-tensor \ // RUN: -one-shot-bufferize \ +// RUN: | buddy-opt \ +// RUN: -reduce-sum-vectorization-3d="vector-size=32" \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ // RUN: -convert-linalg-to-affine-loops \ // RUN: -affine-loop-fusion \ // RUN: -lower-affine \ -// RUN: -func-bufferize \ -// RUN: -arith-bufferize \ -// RUN: -tensor-bufferize \ -// RUN: -buffer-deallocation \ -// RUN: -finalizing-bufferize \ // RUN: -convert-vector-to-scf \ // RUN: -expand-strided-metadata \ // RUN: -convert-vector-to-llvm \ @@ -24,7 +23,7 @@ // RUN: -convert-openmp-to-llvm \ // RUN: -convert-arith-to-llvm \ // RUN: -convert-math-to-llvm \ -// RUN: -convert-math-to-libm \ +// RUN: -convert-math-to-libm \ // RUN: -convert-func-to-llvm \ // RUN: -reconcile-unrealized-casts \ // RUN: | mlir-cpu-runner -e main -entry-point-result=void \ @@ -46,15 +45,12 @@ func.func @kernel(%t0 : tensor<1x40x1536xf32>) { %tensor_unranked = tensor.cast %t1 : tensor<1x40x1xf32> to tensor<*xf32> - // Verify the output shape and some sample values - // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [12, 40, 1] strides = [40, 1, 1] data = + // All the elements of the MemRef are the same, + // only check the first line to verify the correctness. + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 1] strides = [40, 1, 1] data = // CHECK-NEXT: [ // CHECK-SAME: [ - // CHECK-SAME: [120] - // CHECK-SAME: [120] - // CHECK-SAME: ... 
- // CHECK-SAME: [120] - // CHECK-SAME: ] + // CHECK-SAME: [4608{{(, 4608)*}}], // Print results call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () From 0dfba62306ea6f1e57cba1f9d4eb52fe9670b482 Mon Sep 17 00:00:00 2001 From: hayden Date: Sun, 25 May 2025 15:29:09 +0800 Subject: [PATCH 3/5] Make corrections based on review results --- examples/BuddyDeepSeekR1/.gitignore | 2 + examples/BuddyDeepSeekR1/AnalyseDialectOps.py | 87 ------------------- examples/BuddyDeepSeekR1/makefile | 22 ----- ...sum.mlir => next-reduce-sum-12x40x40.mlir} | 2 +- ...m1.mlir => next-reduce-sum-1x40x1536.mlir} | 2 +- ... next-reduce-sum-vec-manual-12x40x40.mlir} | 32 +++---- ...next-reduce-sum-vec-manual-1x40x1536.mlir} | 33 +++---- .../ReduceSumVectorization3D.cpp | 2 +- 8 files changed, 38 insertions(+), 144 deletions(-) create mode 100644 examples/BuddyDeepSeekR1/.gitignore delete mode 100644 examples/BuddyDeepSeekR1/AnalyseDialectOps.py delete mode 100644 examples/BuddyDeepSeekR1/makefile rename examples/BuddyNext/{next-reduce_sum.mlir => next-reduce-sum-12x40x40.mlir} (99%) rename examples/BuddyNext/{next-reduce_sum1.mlir => next-reduce-sum-1x40x1536.mlir} (99%) rename examples/BuddyNext/{next-reduce_sum-vec-manual.mlir => next-reduce-sum-vec-manual-12x40x40.mlir} (84%) rename examples/BuddyNext/{next-reduce_sum-vec-manual1.mlir => next-reduce-sum-vec-manual-1x40x1536.mlir} (85%) diff --git a/examples/BuddyDeepSeekR1/.gitignore b/examples/BuddyDeepSeekR1/.gitignore new file mode 100644 index 0000000000..8d6276ca46 --- /dev/null +++ b/examples/BuddyDeepSeekR1/.gitignore @@ -0,0 +1,2 @@ +*.data +*.mlir \ No newline at end of file diff --git a/examples/BuddyDeepSeekR1/AnalyseDialectOps.py b/examples/BuddyDeepSeekR1/AnalyseDialectOps.py deleted file mode 100644 index a2cf9983b2..0000000000 --- a/examples/BuddyDeepSeekR1/AnalyseDialectOps.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python3 - -import os -import re -from collections import Counter, defaultdict -from pathlib 
import Path - -def extract_dialect_ops(mlir_file_path): - """ - Extract operations from all dialects in an MLIR file and count their occurrences. - - Args: - mlir_file_path (str): Path to the MLIR file - - Returns: - dict: Dictionary containing dialect names as keys and Counter objects as values - """ - # Read the MLIR file - with open(mlir_file_path, 'r') as f: - content = f.read() - - # Find all operations using regex - # This pattern matches lines that contain operation names with dialect prefix - # Excludes numbers and common non-dialect prefixes - op_pattern = r'([a-zA-Z_][a-zA-Z0-9_]*)\.([a-zA-Z_][a-zA-Z0-9_]*)' - all_ops = re.findall(op_pattern, content) - - # Group operations by dialect - dialect_ops = defaultdict(Counter) - for dialect, op in all_ops: - # Skip common non-dialect prefixes - if dialect.lower() in ['func', 'module', 'memref', 'arith', 'builtin']: - continue - dialect_ops[dialect][op] += 1 - - return dialect_ops - -def main(): - # Get the directory of the current script - current_dir = Path(__file__).parent - - # Construct path to subgraph0.mlir - mlir_file = current_dir / 'subgraph0.mlir' - - if not mlir_file.exists(): - print(f"Error: {mlir_file} not found") - return - - # Extract and count operations by dialect - dialect_ops = extract_dialect_ops(str(mlir_file)) - - # Print results - print("\nMLIR Operation Statistics:") - print("=" * 60) - print(f"{'Dialect':<20} {'Operation':<30} {'Count':<10}") - print("=" * 60) - - total_ops = 0 - total_unique_ops = 0 - - # Sort dialects by total operation count - sorted_dialects = sorted( - dialect_ops.items(), - key=lambda x: sum(x[1].values()), - reverse=True - ) - - for dialect, ops in sorted_dialects: - dialect_total = sum(ops.values()) - total_ops += dialect_total - total_unique_ops += len(ops) - - print(f"\n{dialect} (Total: {dialect_total} ops)") - print("-" * 60) - - # Sort operations by count - sorted_ops = sorted(ops.items(), key=lambda x: x[1], reverse=True) - for op, count in sorted_ops: - 
print(f"{'':<20} {op:<30} {count:<10}") - - print("\n" + "=" * 60) - print(f"Total dialects: {len(dialect_ops)}") - print(f"Total unique operations: {total_unique_ops}") - print(f"Total operation instances: {total_ops}") - -if __name__ == "__main__": - main() diff --git a/examples/BuddyDeepSeekR1/makefile b/examples/BuddyDeepSeekR1/makefile deleted file mode 100644 index baf3c52b82..0000000000 --- a/examples/BuddyDeepSeekR1/makefile +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -BUDDY_OPT := ../../build/bin/buddy-opt -MLIR_OPT := ../../llvm/build/bin/mlir-opt -MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate -MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner -LLC := ../../llvm/build/bin/llc -OPT_FLAG := -O0 - -ifeq ($(shell uname),Linux) -MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so -MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so -LIB_OMP := ../../llvm/build/lib/libomp.so -MTRIPLE := x86_64-unknown-linux-gnu -else ifeq ($(shell uname),Darwin) -MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib -MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib -MTRIPLE := x86_64-apple-darwin -endif - -lower-deepseek-r1-tosa: - @${MLIR_OPT} ./subgraph0.mlir \ - -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" -o ./subgraph0-lower.mlir \ No newline at end of file diff --git a/examples/BuddyNext/next-reduce_sum.mlir b/examples/BuddyNext/next-reduce-sum-12x40x40.mlir similarity index 99% rename from examples/BuddyNext/next-reduce_sum.mlir rename to examples/BuddyNext/next-reduce-sum-12x40x40.mlir index a5ddd89654..cc3ccf788d 100644 --- a/examples/BuddyNext/next-reduce_sum.mlir +++ b/examples/BuddyNext/next-reduce-sum-12x40x40.mlir @@ -66,4 +66,4 @@ func.func @main() { call @kernel(%c0) : (tensor<12x40x40xf32>) -> () return -} \ No newline at end of file +} diff --git 
a/examples/BuddyNext/next-reduce_sum1.mlir b/examples/BuddyNext/next-reduce-sum-1x40x1536.mlir similarity index 99% rename from examples/BuddyNext/next-reduce_sum1.mlir rename to examples/BuddyNext/next-reduce-sum-1x40x1536.mlir index 246052a40c..b8687d1f6f 100644 --- a/examples/BuddyNext/next-reduce_sum1.mlir +++ b/examples/BuddyNext/next-reduce-sum-1x40x1536.mlir @@ -66,4 +66,4 @@ func.func @main() { call @kernel(%c0) : (tensor<1x40x1536xf32>) -> () return -} \ No newline at end of file +} diff --git a/examples/BuddyNext/next-reduce_sum-vec-manual.mlir b/examples/BuddyNext/next-reduce-sum-vec-manual-12x40x40.mlir similarity index 84% rename from examples/BuddyNext/next-reduce_sum-vec-manual.mlir rename to examples/BuddyNext/next-reduce-sum-vec-manual-12x40x40.mlir index 236c95366f..706a811ee1 100644 --- a/examples/BuddyNext/next-reduce_sum-vec-manual.mlir +++ b/examples/BuddyNext/next-reduce-sum-vec-manual-12x40x40.mlir @@ -23,15 +23,15 @@ func.func private @rtclock() -> f64 func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } -// 创建一个12x40x40的输入张量 +// Create a 12x40x40 input tensor memref.global "private" @A : memref<12x40x40xf32> = dense<3.0> func.func @kernel(%a : memref<12x40x40xf32>) { %t_start = call @rtclock() : () -> f64 - %b = memref.alloc() : memref<12x40xf32> // 输出张量 + %b = memref.alloc() : memref<12x40xf32> // Output tensor - // 初始化常量 + // Initialize constants %c0 = arith.constant 0.0 : f32 %c16 = arith.constant 16 : index %c12 = arith.constant 12 : index @@ -40,48 +40,48 @@ func.func @kernel(%a : memref<12x40x40xf32>) { %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index - // 使用step 1的外层循环和8x8分块 + // Use outer loop with step 1 and 8x8 blocking affine.for %i0 = 0 to 12 step 1 { affine.for %j0 = 0 to 40 step 8 { - // 使用1维并行处理 + // Use 1D parallel processing affine.parallel (%idx) = (0) to (8) { - // 计算j1 + // Compute j1 %j1 = arith.remui %idx, %c8 : index %j = affine.apply affine_map<(d0, d1) -> (d0 + d1)> 
(%j0, %j1) - // 检查是否在有效范围内 + // Check if within valid range %j_in_range = arith.cmpi slt, %j, %c40 : index - // 只在有效范围内进行计算 + // Only compute within valid range scf.if %j_in_range { - // 初始化累加器 + // Initialize accumulator %init_acc = arith.constant 0.0 : f32 - // 在k维度上使用16元素向量化 + // Vectorize along k dimension with 16 elements %result_acc = affine.for %k = 0 to 40 step 16 iter_args(%acc = %init_acc) -> f32 { - // 预取下一个数据块 + // Prefetch next data block %next_k = arith.addi %k, %c16 : index %next_valid = arith.cmpi slt, %next_k, %c40 : index scf.if %next_valid { memref.prefetch %a[%i0, %j, %next_k], read, locality<3>, data : memref<12x40x40xf32> } - // 计算当前块大小和掩码 + // Compute current block size and mask %remaining = arith.subi %c40, %k : index %vl = arith.minsi %remaining, %c16 : index %mask = vector.create_mask %vl : vector<16xi1> - // 使用向量化读取数据 + // Vectorized data read %vec = vector.transfer_read %a[%i0, %j, %k], %c0, %mask : memref<12x40x40xf32>, vector<16xf32> - // 向量规约求和 + // Vector reduction sum %block_sum = vector.reduction <add>, %vec : vector<16xf32> into f32 %next_acc = arith.addf %acc, %block_sum : f32 affine.yield %next_acc : f32 } - // 写入结果 + // Write result memref.store %result_acc, %b[%i0, %j] : memref<12x40xf32> } } @@ -91,7 +91,7 @@ func.func @kernel(%a : memref<12x40x40xf32>) { %t_end = call @rtclock() : () -> f64 %time = arith.subf %t_end, %t_start : f64 - // 打印结果 + // Print result %printed_b = memref.cast %b : memref<12x40xf32> to memref<*xf32> // All the elements of the MemRef are the same, diff --git a/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir b/examples/BuddyNext/next-reduce-sum-vec-manual-1x40x1536.mlir similarity index 85% rename from examples/BuddyNext/next-reduce_sum-vec-manual1.mlir rename to examples/BuddyNext/next-reduce-sum-vec-manual-1x40x1536.mlir index 6167063c48..da6ee6b1ff 100644 --- a/examples/BuddyNext/next-reduce_sum-vec-manual1.mlir +++ b/examples/BuddyNext/next-reduce-sum-vec-manual-1x40x1536.mlir @@ -23,15 +23,15 @@
func.func private @rtclock() -> f64 func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } -// 创建一个1x40x1536的输入张量 +// Create a 1x40x1536 input tensor memref.global "private" @A : memref<1x40x1536xf32> = dense<3.0> func.func @kernel(%a : memref<1x40x1536xf32>) { %t_start = call @rtclock() : () -> f64 - %b = memref.alloc() : memref<1x40xf32> // 输出张量 + %b = memref.alloc() : memref<1x40xf32> // Output tensor - // 初始化常量 + // Initialize constants %c0 = arith.constant 0.0 : f32 %c32 = arith.constant 32 : index %c1 = arith.constant 1 : index @@ -40,44 +40,44 @@ func.func @kernel(%a : memref<1x40x1536xf32>) { %c0_idx = arith.constant 0 : index %c8 = arith.constant 8 : index - // 使用分块和向量化处理 + // Use blocking and vectorization affine.for %j0 = 0 to 40 step 8 { - // 处理8个元素一组 + // Process 8 elements at a time affine.for %j1 = 0 to 8 { %j = affine.apply affine_map<(d0, d1) -> (d0 + d1)> (%j0, %j1) - // 检查是否在有效范围内 + // Check if within valid range %j_in_range = arith.cmpi slt, %j, %c40 : index - // 只在有效范围内进行计算 + // Only compute within valid range scf.if %j_in_range { - // 初始化累加器 + // Initialize accumulator %init_acc = arith.constant 0.0 : f32 - // 在k维度上使用32元素向量化 + // Vectorize along k dimension with 32 elements %result_acc = affine.for %k = 0 to 1536 step 32 iter_args(%acc = %init_acc) -> f32 { - // 预取下一个数据块 + // Prefetch next data block %next_k = arith.addi %k, %c32 : index %next_valid = arith.cmpi slt, %next_k, %c1536 : index scf.if %next_valid { memref.prefetch %a[%c0_idx, %j, %next_k], read, locality<3>, data : memref<1x40x1536xf32> } - // 计算当前块大小和掩码 + // Compute current block size and mask %remaining = arith.subi %c1536, %k : index %vl = arith.minsi %remaining, %c32 : index %mask = vector.create_mask %vl : vector<32xi1> - // 使用向量化读取数据 + // Vectorized data read %vec = vector.transfer_read %a[%c0_idx, %j, %k], %c0, %mask : memref<1x40x1536xf32>, vector<32xf32> - // 向量规约求和 + // Vector reduction sum %block_sum = vector.reduction <add>, %vec : vector<32xf32> into f32 %next_acc = arith.addf %acc, %block_sum : f32 affine.yield %next_acc : f32 } - // 写入结果 + // Write result memref.store %result_acc, %b[%c0_idx, %j] : memref<1x40xf32> } } @@ -86,7 +86,7 @@ func.func @kernel(%a : memref<1x40x1536xf32>) { %t_end = call @rtclock() : () -> f64 %time = arith.subf %t_end, %t_start : f64 - // 打印结果 + // Print result %printed_b = memref.cast %b : memref<1x40xf32> to memref<*xf32> // All the elements of the MemRef are the same, @@ -97,7 +97,7 @@ func.func @kernel(%a : memref<1x40x1536xf32>) { call @printMemrefF32(%printed_b) : (memref<*xf32>) -> () - // 打印时间 + // Print time vector.print %time : f64 memref.dealloc %b : memref<1x40xf32> @@ -109,3 +109,4 @@ func.func @main() { call @kernel(%a) : (memref<1x40x1536xf32>) -> () return } + diff --git a/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp b/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp index a4bfddf37a..4fea724255 100644 --- a/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp +++ b/midend/lib/Conversion/TosaVectorization/ReduceSumVectorization3D.cpp @@ -321,4 +321,4 @@ void registerReduceSumVectorizationPass() { PassRegistration<ReduceSumVectorizationPass>(); } } // namespace buddy -} // namespace mlir \ No newline at end of file +} // namespace mlir From 1c30c7d6c50f7a9fcd43e2f40e106685faa54f00 Mon Sep 17 00:00:00 2001 From: hayden Date: Mon, 26 May 2025 12:54:54 +0800 Subject: [PATCH 4/5] remove compare_outputs.sh --- examples/BuddyNext/.gitignore | 1 + examples/BuddyNext/compare_outputs.sh | 121 -------------------------- 2 files changed, 1 insertion(+), 121 deletions(-) delete mode 100755 examples/BuddyNext/compare_outputs.sh diff --git a/examples/BuddyNext/.gitignore b/examples/BuddyNext/.gitignore index 80a243fa81..e3aa442c78 100644 --- a/examples/BuddyNext/.gitignore +++ b/examples/BuddyNext/.gitignore @@ -1 +1,2 @@ log.* +*.sh diff --git a/examples/BuddyNext/compare_outputs.sh b/examples/BuddyNext/compare_outputs.sh
deleted file mode 100755 index 85d3897bf6..0000000000 --- a/examples/BuddyNext/compare_outputs.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash - -# 设置颜色输出 -GREEN='\033[0;32m' -RED='\033[0;31m' -NC='\033[0m' # No Color -YELLOW='\033[1;33m' -BLUE='\033[0;34m' - -# 检查命令行参数 -if [ $# -ne 2 ]; then - echo -e "${YELLOW}Usage: $0 ${NC}" - echo "Example: $0 'next-reduce-sum-run' 'next-reduce-sum-vec-manual-run'" - exit 1 -fi - -CMD1="$1" -CMD2="$2" -RUNS=10 - -# 创建临时文件存储输出 -OUTPUT1=$(mktemp) -OUTPUT2=$(mktemp) -PROCESSED1=$(mktemp) -SPEEDUPS=$(mktemp) - -# 提取时间数据 -extract_time() { - local file="$1" - grep -o '[0-9]\+\.[0-9]\+e[-+]\?[0-9]\+\|[0-9]\+\.[0-9]\+' "$file" -} - -# 转换时间为秒 -convert_to_seconds() { - local time_val="$1" - if [[ $time_val =~ e ]]; then - echo "$time_val" | sed 's/e/*10^/' | bc -l - else - printf "%.9f" $time_val - fi -} - -# 计算平均值 -calculate_mean() { - local file="$1" - local sum=0 - local count=0 - while read -r line; do - sum=$(echo "$sum + $line" | bc -l) - count=$((count + 1)) - done < "$file" - if [ $count -gt 0 ]; then - echo "scale=9; $sum / $count" | bc -l - else - echo "0" - fi -} - -echo -e "${BLUE}Running each version $RUNS times...${NC}" - -# 运行两个命令并计算每次的加速比 -for ((i=1; i<=$RUNS; i++)); do - echo -ne "\rRun $i/$RUNS" - - # 运行第一个命令 - TEMP_OUT1=$(mktemp) - make $CMD1 > "$TEMP_OUT1" 2>/dev/null - TIME1=$(extract_time "$TEMP_OUT1") - if [ -n "$TIME1" ]; then - TIME1=$(convert_to_seconds "$TIME1") - fi - - # 运行第二个命令 - TEMP_OUT2=$(mktemp) - make $CMD2 > "$TEMP_OUT2" 2>/dev/null - TIME2=$(extract_time "$TEMP_OUT2") - if [ -n "$TIME2" ]; then - TIME2=$(convert_to_seconds "$TIME2") - fi - - # 保存第一次运行的输出用于比较 - if [ $i -eq 1 ]; then - grep "data =" "$TEMP_OUT1" | sed 's/base@ = [^[:space:]]*/base@ = /g' > "$PROCESSED1" - grep "data =" "$TEMP_OUT2" | sed 's/base@ = [^[:space:]]*/base@ = /g' > "$OUTPUT2" - fi - - # 计算这次运行的加速比 - if [ -n "$TIME1" ] && [ -n "$TIME2" ] && [ "$TIME1" != "0" ] && [ "$TIME2" != "0" ]; then - echo "scale=9; $TIME1/$TIME2" | bc 
-l >> "$SPEEDUPS" - fi - - rm "$TEMP_OUT1" "$TEMP_OUT2" -done -echo - -# 比较数据输出 -echo -e "\n${BLUE}Comparing output data:${NC}" -if diff "$PROCESSED1" "$OUTPUT2" > /dev/null; then - echo -e "${GREEN}✓ Outputs match! Both versions produce the same results.${NC}" -else - echo -e "${RED}✗ Outputs differ! Found differences:${NC}" - echo "----------------------------------------" - diff "$PROCESSED1" "$OUTPUT2" - echo "----------------------------------------" -fi - -# 计算加速比的均值 -echo -e "\n${BLUE}Performance Comparison:${NC}" -SPEEDUP_MEAN=$(calculate_mean "$SPEEDUPS") - -if [ -n "$SPEEDUP_MEAN" ] && [ "$SPEEDUP_MEAN" != "0" ]; then - if [ $(echo "$SPEEDUP_MEAN > 1" | bc -l) -eq 1 ]; then - printf "${GREEN}Second version is %.2fx faster${NC}\n" "$SPEEDUP_MEAN" - else - SLOWDOWN=$(echo "scale=2; 1/$SPEEDUP_MEAN" | bc -l) - printf "${RED}Second version is %.2fx slower${NC}\n" "$SLOWDOWN" - fi -fi - -# 清理临时文件 -rm "$OUTPUT1" "$OUTPUT2" "$PROCESSED1" "$SPEEDUPS" From 079a3beac2973b7cef3ce3f851b8f0c46e02b6ec Mon Sep 17 00:00:00 2001 From: hayden Date: Mon, 26 May 2025 12:56:38 +0800 Subject: [PATCH 5/5] modify gitignore --- examples/BuddyNext/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/BuddyNext/.gitignore b/examples/BuddyNext/.gitignore index e3aa442c78..56216edde9 100644 --- a/examples/BuddyNext/.gitignore +++ b/examples/BuddyNext/.gitignore @@ -1,2 +1,2 @@ log.* -*.sh +compare_outputs.sh