CUDA fixes and automated bash script

ROCm · Oct 30, 2023 · 8c1ef2a · 8c1ef2a
1 parent 305353d
commit 8c1ef2a
Show file tree

Hide file tree

Showing 4 changed files with 82 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -16,6 +16,17 @@ To run the tests, we use the following repositories:
 
 # How to use the tool:
 
+### Easy mode: one bash script:
+Ensure a RUN_COMMAND has been set; this can be any executable or bash script.
+
+**Usage on ROCm:**
+bash automated_parser.sh --run-command "{RUN_COMMAND}" --use-rocm
+
+**Usage on CUDA:**
+bash automated_parser.sh --run-command "{RUN_COMMAND}" 
+
+This will collect the logs from your program automatically and dump out the final csv report.
+
 ### Run application and collect RCCL/NCCL Log:
 
 Firstly, make sure you are running the experiments of a distributed setup of an application.
@@ -35,7 +46,6 @@ For some workloads buffered output can impact the RCCL/NCCL log format which may
 PYTHONUNBUFFERED=x stdbuf -i0 -o0 -e0
 ```
 
-
 ### Automated way:
 
 To gather the performance results once you have the debug log, run the command below.
@@ -57,6 +67,18 @@ python run_parser_and_generate_summary.py --nccl-debug-log nccl_debug_log.txt --
 ```
 python run_parser_and_generate_summary.py --nccl-debug-log nccl_debug_log.txt --cuda
 ```
+
+### Easy mode: one bash script:
+Ensure a RUN_COMMAND has been set; this can be any executable or bash script.
+
+**Usage on ROCm:**
+bash automated_parser.sh --run-command "{RUN_COMMAND}" --use-rocm
+
+**Usage on CUDA:**
+bash automated_parser.sh --run-command "{RUN_COMMAND}" 
+
+This will collect the logs from your program automatically and dump out the final csv report.
+
 ### To run the tool manually step by step:
 
 **Use Parser to dump out the test commands:**

diff --git a/automated_parser.sh b/automated_parser.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Argument parsing
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --run-command) RUN_COMMAND="$2"; shift ;;
+        --use-rocm) USE_ROCM=1 ;;
+        *) echo "Unknown parameter passed: $1"; exit 1 ;;
+    esac
+    shift
+done
+
+# Ensure RUN_COMMAND is set
+if [ -z "$RUN_COMMAND" ]; then
+    echo "Please provide --run-command argument."
+    exit 1
+fi
+
+# Build test repository
+if [ "$USE_ROCM" == "1" ]; then
+    TEST_DIR="rccl-tests"
+else
+    TEST_DIR="nccl-tests"
+fi
+make -C ${TEST_DIR}
+
+# Run code and capture debug log
+if [ "$USE_ROCM" == "1" ]; then
+    PYTHONBUFFERED=x HSA_FORCE_FINE_GRAIN_PCIE=1 NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,COLL stdbuf -i0 -o0 -e0 $RUN_COMMAND 2>&1 | tee nccl_debug_log.txt
+else
+    PYTHONBUFFERED=x NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,COLL stdbuf -i0 -o0 -e0 $RUN_COMMAND 2>&1 | tee nccl_debug_log.txt
+fi
+
+# Dump test commands
+python rccl_nccl_parser.py --nccl-debug-log nccl_debug_log.txt --output-script-name test_commands
+
+# Remove duplicates and count each collective occurence
+awk '!a[$0]++' test_commands.sh > test_commands_unique.sh
+awk '{a[$0]++} END {for (i in a) if (a[i] > i) print i", "a[i]}' test_commands.sh > test_commands_unique_counts.csv
+
+# Copy test commands to tests folder
+cp test_commands_unique.sh ${TEST_DIR}
+cd ${TEST_DIR} && sh test_commands_unique.sh |& tee nccl_perf_data.txt && cd ..
+
+# Generate summary
+python generate_summary.py --log-file ${TEST_DIR}/nccl_perf_data.txt --output-file-name nccl_summary_data --script-file ${TEST_DIR}/test_commands_unique.sh 
+echo "Performance data dumped to nccl-rccl-parser/nccl_summary_data"
+
+sed -i 's/|/,/g' nccl_summary_data.csv
+echo "Performance data converted to csv at nccl-rccl-parser/nccl_summary_data.csv"
+echo "NOTE: counts for each kernel stored at test_commands_unique_counts.csv"
diff --git a/generate_summary.py b/generate_summary.py
@@ -51,14 +51,19 @@ def parse_nccl_performance(useful_lines, commands):
         for i in range(len(split_list)):
             perf_line = perf_line + split_list[i] + "|"
             # Some collectives do not involve a redop
-            if field_index==2 and "reduce" not in commands[j].lower():
+            if (
+                field_index==2 and 
+                "reduce" not in commands[j].lower() and 
+                "none" not in split_list[3] # CUDA will always have redop but set to none if not used
+            ):
                 perf_line = perf_line + "|"
                 field_index = field_index + 1
             # Only broadcast and reduce involve a root
             if (
                field_index==3 and
                re.search(r'\Wreduce_perf', commands[j]) is None and
-               re.search(r'\Wbroadcast_perf', commands[j]) is None
+               re.search(r'\Wbroadcast_perf', commands[j]) is None and
+               "-1" not in split_list[4] # CUDA will always have a root but set to none if not used
             ):
                 perf_line = perf_line + "|"
                 field_index = field_index + 1

diff --git a/run_parser_and_generate_summary.py b/run_parser_and_generate_summary.py
@@ -48,7 +48,7 @@ def main():
             print ("ERROR: unable to run nccl-tests")
             sys.exit(1)
         os.system("mv nccl_perf_log.txt ../")
-        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../"))
+        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__))))
 
         summary_cmd = "python generate_summary.py --log-file nccl_perf_log.txt --script-file net_unique.sh --output-file-name nv_net_summary --count-file net_counts.csv"
         os.system(summary_cmd)