started collecting benchmarks

lamikr · lamikr · commit 380e01846ebd · 2024-08-07T18:18:59.000-07:00
- created simple smoke check benchmark script to benchmarks folder - started collecting benchmarks to directory under the benchmarks - purpose of these benchmarks is to be able to compare results after component version updates to catch regressions and improvements #63 Signed-off-by: Mika Laitio <lamikr@gmail.com>
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -0,0 +1,10 @@
+# fast verify benchmark
+ 
+- run_and_save_benchmarks.sh will execute 2 
+  relatively fast benchmarks to smoke check and collect results from simple apps
+- todo: add llama.cpp benchmark
+
+# more demanding pytorch-gpu-benchmark
+- https://github.com/lamikr/pytorch-gpu-benchmark
+- please collect the results from the new_results folder after execution
+  and create a merge request to get them saved to the git repository
diff --git a/benchmarks/rocm_sdk_612/pytorch_241/20240807_175108_cpu_vs_gpu_simple.txt b/benchmarks/rocm_sdk_612/pytorch_241/20240807_175108_cpu_vs_gpu_simple.txt
@@ -0,0 +1,11 @@
+Benchmarking CPU and GPUs
+Pytorch version: 2.4.1-rc1
+ROCM HIP version: 6.1.40093-8099c494c
+       Device: cpu-16
+    'CPU time: 23.486 sec
+       Device: AMD Radeon RX 7700S
+    'GPU time: 0.199 sec
+       Device: AMD Radeon 780M
+    'GPU time: 0.191 sec
+Benchmark ready
+
diff --git a/benchmarks/rocm_sdk_612/pytorch_241/20240807_175108_pytorch_dot_products.txt b/benchmarks/rocm_sdk_612/pytorch_241/20240807_175108_pytorch_dot_products.txt
@@ -0,0 +1,51 @@
+Pytorch version: 2.4.1-rc1
+dot product calculation test
+tensor([[[ 0.0769,  1.4105,  0.0824,  0.5644,  0.5710,  0.8619, -0.0698,
+          -0.1378],
+         [-0.0206,  1.3138, -0.5070,  0.3971,  0.5620,  0.8419, -0.2367,
+           0.0135],
+         [-0.1797,  1.3761,  0.0258,  0.5147,  0.5673,  0.7445, -0.0543,
+          -0.0028]],
+
+        [[-0.4074,  0.4956,  0.0553, -0.7740, -0.3718,  1.3344,  0.8070,
+          -0.3321],
+         [-0.5268,  0.5001,  0.0537, -0.6846, -0.3624,  1.1640,  0.6590,
+          -0.2191],
+         [-0.5697,  0.5082,  0.0254, -0.6951, -0.3435,  1.0934,  0.7012,
+          -0.2850]]], device='cuda:0')
+
+Benchmarking cuda and cpu with Default, Math, Flash Attention amd Memory pytorch backends
+Device: AMD Radeon RX 7700S
+    Default cuda:0 benchmark:
+        24404.260 microseconds, 0.02440425969834905 sec
+    Math cuda:0 benchmark:
+        71419.426 microseconds, 0.07141942633703972 sec
+    Flash Attention cuda:0 benchmark:
+        24076.089 microseconds, 0.02407608859939501 sec
+    Memory Efficient cuda:0 benchmark:
+        24541.843 microseconds, 0.024541843199403956 sec
+Device: cpu-16
+    Default cpu benchmark:
+        26995025.818 microseconds, 26.99502581800334 sec
+    Math cpu benchmark:
+        30105574.327 microseconds, 30.105574326997157 sec
+    Flash Attention cpu benchmark:
+        26501703.386 microseconds, 26.501703385991274 sec
+    Memory Efficient cpu benchmark:
+    Memory Efficient cpu is not supported. See warnings for reasons.
+Summary
+
+Pytorch version: 2.4.1-rc1
+ROCM HIP version: 6.1.40093-8099c494c
+Device: AMD Radeon RX 7700S
+               Default cuda:0:            24404.260 ms
+                  Math cuda:0:            71419.426 ms
+       Flash Attention cuda:0:            24076.089 ms
+      Memory Efficient cuda:0:            24541.843 ms
+
+Device: cpu-16
+                  Default cpu:         26995025.818 ms
+                     Math cpu:         30105574.327 ms
+          Flash Attention cpu:         26501703.386 ms
+         Memory Efficient cpu:               -1.000 ms
+
diff --git a/benchmarks/rocm_sdk_612/pytorch_241/notes.txt b/benchmarks/rocm_sdk_612/pytorch_241/notes.txt
@@ -0,0 +1,6 @@
+20240807_175108_pytorch_dot_products.txt
+- pytorch 2.4.1-rc1
+- amdsmi-fix
+- aotriton gfx110* series tuning data
+- latest deepspeed
+- bitsandbytes, triton and torch_migraphx update
diff --git a/benchmarks/run_and_save_benchmarks.sh b/benchmarks/run_and_save_benchmarks.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# rocm-sdk launcher for test application
+# if test fails on AMD GPU, enable AMD_LOG_LEVEL and HIP_VISIBLE_DEVICES=0 variables
+# to get traces to find the failing code part
+if [ -z $ROCM_HOME ]; then
+    echo "Error, make sure that you have executed"
+    echo "    source  /opt/rocm_sdk_612/bin/env_rocm.sh"
+    echo "before running this script"
+    exit 1
+fi
+#AMD_LOG_LEVEL=1 HIP_VISIBLE_DEVICES=0 HIP_LAUNCH_BLOCKING=1
+
+DATE_STR=`date '+%Y%m%d_%H%M%S'`;
+echo "Timestamp for benchmark results: ${DATE_STR}"
+FN_PYTORCH_CPU_VS_GPU_SIMPLE_RES="${DATE_STR}_cpu_vs_gpu_simple.txt"
+FN_PYTORCH_DOT_PRODUCT_FLASH_RES="${DATE_STR}_pytorch_dot_products.txt"
+
+echo "Saving to file: $FN_PYTORCH_CPU_VS_GPU_SIMPLE_RES"
+python ../docs/examples/pytorch/pytorch_cpu_vs_gpu_simple_benchmark.py > ${FN_PYTORCH_CPU_VS_GPU_SIMPLE_RES}
+
+echo "Saving to file: $FN_PYTORCH_DOT_PRODUCT_FLASH_RES"
+python ../docs/examples/pytorch/flash_attention/flash_attention_dot_product_benchmark.py > ${FN_PYTORCH_DOT_PRODUCT_FLASH_RES}