From cbe9c045bc7bbe9a4e4725d3304c6ee3e9dfbd9e Mon Sep 17 00:00:00 2001 From: sunway513 Date: Thu, 28 Jan 2021 20:47:49 +0000 Subject: [PATCH 1/2] add more operator in the parser --- rccl_nccl_parser.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/rccl_nccl_parser.py b/rccl_nccl_parser.py index e59c02d..3403965 100644 --- a/rccl_nccl_parser.py +++ b/rccl_nccl_parser.py @@ -3,8 +3,15 @@ import argparse coll_op_map = { - "AllReduce": "all_reduce_perf", "Broadcast": "broadcast_perf", + "Reduce": "reduce_perf", + "AllGather": "all_gather_perf", + "ReduceScatter": "reduce_scatter_perf", + "AllReduce": "all_reduce_perf", + "Gather": "gather_perf", + "Scatter": "scatter_perf", + "AllToAll": "alltoall_perf", + "AllToAllv": "alltoallv_perf", } reduction_op_map = { From 69e8b071857cfa4d6b62d0b05bf2c0b58f7c5413 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Tue, 9 Feb 2021 04:24:18 +0000 Subject: [PATCH 2/2] Add support for parsing more collectives. Only AllToAllv is not enabled because it has a counts array argument instead of a numeric count argument, so one cannot reconstruct a rccl-tests command --- generate_summary.py | 26 +++++++++++++++++++++++--- rccl_nccl_parser.py | 16 +++++++++------- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/generate_summary.py b/generate_summary.py index a66f094..6412527 100644 --- a/generate_summary.py +++ b/generate_summary.py @@ -1,6 +1,7 @@ import os import sys import argparse +import re def get_script_commands(script_file): fs = open(script_file, 'r') @@ -35,18 +36,37 @@ def parse_nccl_performance(useful_lines, commands): perf_lines = [] perf_lines.append("sep=|") - perf_lines.append("size|count|type|redop|time-oplace(us)|algbw(gb/s)-oplace|busbw(gb/s)-oplace|error|" + \ - "time-iplace(us)|algbw(gb/s)-iplace|busbw(gb/s)-iplace|error|avg_bus_bw|commands") + header = "size|count|type|redop|root|time-oplace(us)|algbw(gb/s)-oplace|busbw(gb/s)-oplace|error|" + \ + "time-iplace(us)|algbw(gb/s)-iplace|busbw(gb/s)-iplace|error|avg_bus_bw|commands" + #print(header) + num_fields = len(header.split("|")) + perf_lines.append(header) for j in range(len(useful_lines)): line = useful_lines[j] line = line.replace("# Avg bus bandwidth : ", "") split_list = line.split() perf_line = "" + field_index = 0 for i in range(len(split_list)): perf_line = perf_line + split_list[i] + "|" + # Some collectives do not involve a redop + if field_index==2 and "reduce" not in commands[j].lower(): + perf_line = perf_line + "|" + field_index = field_index + 1 + # Only broadcast and reduce involve a root + if ( + field_index==3 and + re.search(r'\Wreduce_perf', commands[j]) is None and + re.search(r'\Wbroadcast_perf', commands[j]) is None + ): + perf_line = perf_line + "|" + field_index = field_index + 1 + field_index = field_index + 1 #print (perf_line + commands[j]) - perf_lines.append(perf_line + commands[j]) + perf_line = perf_line + commands[j] + assert len(perf_line.split("|")) == num_fields + perf_lines.append(perf_line) return perf_lines diff --git a/rccl_nccl_parser.py b/rccl_nccl_parser.py index 3403965..6d9689a 100644 --- a/rccl_nccl_parser.py +++ b/rccl_nccl_parser.py @@ -11,7 +11,9 @@ "Gather": "gather_perf", "Scatter": "scatter_perf", "AllToAll": "alltoall_perf", - "AllToAllv": "alltoallv_perf", +# "AllToAllv": "alltoallv_perf", + "Send": "sendrecv_perf", + "Recv": "sendrecv_perf", } reduction_op_map = { @@ -69,12 +71,12 @@ def parse_nccl_log(nccl_lines): for j in range(len(nccl_lines)): line = nccl_lines[j] split_list = line.split(" ") - comm = split_list[4].replace(":", "") - count = split_list[12] - datatype = split_list[14] - op_type = split_list[16] - root = split_list[18] - nnranks = split_list[21].split("=")[1].replace("]", "") + comm = split_list[split_list.index("INFO") + 1].replace(":", "") + count = split_list[split_list.index("count") + 1] + datatype = split_list[split_list.index("datatype") + 1] + op_type = split_list[split_list.index("op") + 1] + root = split_list[split_list.index("root") + 1] + nnranks = next(item for item in split_list if 'nranks' in item).split("=")[1].replace("]", "") #print (comm) #print (count)