From 8c1ef2a5f4de4f0a3e9255060904bc45b9d6ea09 Mon Sep 17 00:00:00 2001 From: Jack Taylor Date: Mon, 30 Oct 2023 11:15:02 +0000 Subject: [PATCH] CUDA fixes and automated bash script --- README.md | 24 +++++++++++++- automated_parser.sh | 51 ++++++++++++++++++++++++++++++ generate_summary.py | 9 ++++-- run_parser_and_generate_summary.py | 2 +- 4 files changed, 82 insertions(+), 4 deletions(-) create mode 100644 automated_parser.sh diff --git a/README.md b/README.md index 7046d26..65de824 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,17 @@ To run the tests, we use the following repositories: # How to use the tool: +### Easy mode: one bash script: +Ensure a RUN_COMMAND has been set; this can be any executable or bash script. + +**Usage on ROCm:** +bash automated_parser.sh --run-command "{RUN_COMMAND}" --use-rocm + +**Usage on CUDA:** +bash automated_parser.sh --run-command "{RUN_COMMAND}" + +This will collect the logs from your program automatically and dump out the final CSV report. + ### Run application and collect RCCL/NCCL Log:** Firstly, make sure you are running the experiments of a distributed setup of an application. @@ -35,7 +46,6 @@ For some workloads buffered output can impact the RCCL/NCCL log format which may PYTHONBUFFERED=x stdbuf -i0 -o0 -e0 ``` - ### Automated way: To gather the performance results once you have the debug log with you. Run the below command. @@ -57,6 +67,18 @@ python run_parser_and_generate_summary.py --nccl-debug-log nccl_debug_log.txt -- ``` python run_parser_and_generate_summary.py --nccl-debug-log nccl_debug_log.txt --cuda ``` + +### Easy mode: one bash script: +Ensure a RUN_COMMAND has been set; this can be any executable or bash script. + +**Usage on ROCm:** +bash automated_parser.sh --run-command "{RUN_COMMAND}" --use-rocm + +**Usage on CUDA:** +bash automated_parser.sh --run-command "{RUN_COMMAND}" + +This will collect the logs from your program automatically and dump out the final CSV report.
+ ### To run the tool manually step by step: **Use Parser to dump out the test commands:**
diff --git a/automated_parser.sh b/automated_parser.sh
new file mode 100644
index 0000000..87f18b4
--- /dev/null
+++ b/automated_parser.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Capture an NCCL/RCCL debug log from RUN_COMMAND, run the test commands derived from it, and write a CSV summary.
+# Argument parsing: --run-command "<cmd>" is required; --use-rocm selects rccl-tests instead of nccl-tests.
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --run-command) RUN_COMMAND="$2"; shift ;;
+        --use-rocm) USE_ROCM=1 ;;
+        *) echo "Unknown parameter passed: $1"; exit 1 ;;
+    esac
+    shift
+done
+
+# Ensure RUN_COMMAND is set
+if [ -z "$RUN_COMMAND" ]; then
+    echo "Please provide --run-command argument."
+    exit 1
+fi
+
+# Build test repository (stop here if the build fails, since the test commands below run inside it)
+if [ "$USE_ROCM" == "1" ]; then
+    TEST_DIR="rccl-tests"
+else
+    TEST_DIR="nccl-tests"
+fi
+make -C "${TEST_DIR}" || { echo "ERROR: unable to build ${TEST_DIR}"; exit 1; }
+
+# Run code and capture debug log. PYTHONUNBUFFERED (not PYTHONBUFFERED, which Python ignores) and stdbuf turn off output buffering; $RUN_COMMAND is left unquoted on purpose so it splits into command and arguments.
+if [ "$USE_ROCM" == "1" ]; then
+    PYTHONUNBUFFERED=x HSA_FORCE_FINE_GRAIN_PCIE=1 NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,COLL stdbuf -i0 -o0 -e0 $RUN_COMMAND 2>&1 | tee nccl_debug_log.txt
+else
+    PYTHONUNBUFFERED=x NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,COLL stdbuf -i0 -o0 -e0 $RUN_COMMAND 2>&1 | tee nccl_debug_log.txt
+fi
+
+# Dump test commands
+python rccl_nccl_parser.py --nccl-debug-log nccl_debug_log.txt --output-script-name test_commands
+
+# Remove duplicates and count each collective occurrence (one "command, count" row per unique command)
+awk '!a[$0]++' test_commands.sh > test_commands_unique.sh
+awk '{a[$0]++} END {for (i in a) print i", "a[i]}' test_commands.sh > test_commands_unique_counts.csv
+
+# Copy test commands to tests folder and run them there (in a subshell, so the working directory is restored even on failure)
+cp test_commands_unique.sh "${TEST_DIR}"
+(cd "${TEST_DIR}" && sh test_commands_unique.sh 2>&1 | tee nccl_perf_data.txt)
+
+# Generate summary
+python generate_summary.py --log-file "${TEST_DIR}/nccl_perf_data.txt" --output-file-name nccl_summary_data --script-file "${TEST_DIR}/test_commands_unique.sh"
+echo "Performance data dumped to nccl-rccl-parser/nccl_summary_data"
+
+sed -i 's/|/,/g' nccl_summary_data.csv
+echo "Performance data converted to csv at nccl-rccl-parser/nccl_summary_data.csv"
+echo "NOTE: counts for each kernel stored at test_commands_unique_counts.csv"
diff --git a/generate_summary.py b/generate_summary.py index 4e94ca5..692facf 100644 --- a/generate_summary.py +++ b/generate_summary.py @@ -51,14 +51,19 @@ def parse_nccl_performance(useful_lines, commands): for i in range(len(split_list)): perf_line = perf_line + split_list[i] + "|" # Some collectives do not involve a redop - if field_index==2 and "reduce" not in commands[j].lower(): + if ( + field_index==2 and + "reduce" not in commands[j].lower() and + "none" not in split_list[3] # CUDA will always have redop but set to none if not used + ): perf_line = perf_line + "|" field_index = field_index + 1 # Only broadcast and reduce involve a root if ( field_index==3 and re.search(r'\Wreduce_perf', commands[j]) is None and - re.search(r'\Wbroadcast_perf', commands[j]) is None + re.search(r'\Wbroadcast_perf', commands[j]) is None and + "-1" not in split_list[4] # CUDA will always have a root but set to none if not used ): perf_line = perf_line + "|" field_index = field_index + 1 diff --git a/run_parser_and_generate_summary.py b/run_parser_and_generate_summary.py index b1d38ed..442f9f7 100644 --- a/run_parser_and_generate_summary.py +++ b/run_parser_and_generate_summary.py @@ -48,7 +48,7 @@ def main(): print ("ERROR: unable to run nccl-tests") sys.exit(1) os.system("mv nccl_perf_log.txt ../") - os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../")) + os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)))) summary_cmd = "python generate_summary.py --log-file nccl_perf_log.txt --script-file 
net_unique.sh --output-file-name nv_net_summary --count-file net_counts.csv" os.system(summary_cmd)