
Commit f9f8b05

various benchmark execution fixes
- Allow specifying the gpu-index parameter in addition to the gpu-count parameter.
- The gpu-index parameter can be used to request that the benchmarks run only on a certain GPU index in the multi-GPU case.
- If there is more than one GPU, run the benchmarks separately for each GPU and then, at the end, run the tests with all GPUs used at the same time.
- Fixes: lamikr/rocm_sdk_builder#63

Signed-off-by: Mika Laitio <[email protected]>
1 parent 32fe071 commit f9f8b05
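For orientation before the diff: a minimal sketch of the device-selection behavior this commit adds to benchmark_models.py, condensed from the diff below. The helper names pick_device and prepare_model are illustrative only; they are not part of the commit.

import torch
import torch.nn as nn


def pick_device(gpu_index: int) -> torch.device:
    # A non-negative gpu_index pins the run to one device ("cuda:<index>");
    # the default of -1 keeps the generic "cuda" device, as in the diff.
    name = "cuda" if gpu_index < 0 else "cuda:" + str(gpu_index)
    return torch.device(name)


def prepare_model(model: nn.Module, gpu_index: int, gpu_count: int) -> nn.Module:
    # With more than one GPU the benchmark wraps the model in DataParallel,
    # mirroring the diff's device_ids=range(GPU_COUNT).
    if gpu_count > 1:
        model = nn.DataParallel(model, device_ids=range(gpu_count))
    return model.to(pick_device(gpu_index))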

File tree

2 files changed: +94 -43 lines changed


benchmark_models.py

+75-38
@@ -11,13 +11,13 @@
 import argparse
 from torch.utils.data import Dataset, DataLoader
 import json
+import sys
 
 torch.backends.cudnn.benchmark = True
 # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
 # This flag allows you to enable the inbuilt cudnn auto-tuner to find the best algorithm to use for your hardware.
 # If you check it using the profile tool, the cnn method such as winograd, fft, etc. is used for the first iteration and the best operation is selected for the device.
-
 MODEL_LIST = {
     models.mnasnet: models.mnasnet.__all__[1:],
     models.resnet: models.resnet.__all__[1:],
@@ -32,24 +32,32 @@
 precisions = ["float", "half", "double"]
 # For post-voltaic architectures, there is a possibility to use tensor-core at half precision.
 # Due to the gradient overflow problem, apex is recommended for practical use.
-device_name = str(torch.cuda.get_device_name(0))
 # Training settings
 parser = argparse.ArgumentParser(description="PyTorch Benchmarking")
 parser.add_argument(
     "--WARM_UP", "-w", type=int, default=5, required=False, help="Num of warm up"
 )
+
 parser.add_argument(
     "--NUM_TEST", "-n", type=int, default=50, required=False, help="Num of Test"
 )
+
 parser.add_argument(
     "--BATCH_SIZE", "-b", type=int, default=12, required=False, help="Num of batch size"
 )
+
 parser.add_argument(
     "--NUM_CLASSES", "-c", type=int, default=1000, required=False, help="Num of class"
 )
+
 parser.add_argument(
-    "--NUM_GPU", "-g", type=int, default=1, required=False, help="Num of gpus"
+    "--GPU_COUNT", "-g", type=int, default=1, required=False, help="Number of gpus used in test"
 )
+
+parser.add_argument(
+    "--GPU_INDEX", "-i", type=int, default=-1, required=False, help="Index for the used gpu"
+)
+
 parser.add_argument(
     "--folder",
     "-f",
@@ -58,9 +66,6 @@
     required=False,
     help="folder to save results",
 )
-args = parser.parse_args()
-args.BATCH_SIZE *= args.NUM_GPU
-
 
 class RandomDataset(Dataset):
     def __init__(self, length):
@@ -73,16 +78,7 @@ def __getitem__(self, index):
     def __len__(self):
         return self.len
 
-
-rand_loader = DataLoader(
-    dataset=RandomDataset(args.BATCH_SIZE * (args.WARM_UP + args.NUM_TEST)),
-    batch_size=args.BATCH_SIZE,
-    shuffle=False,
-    num_workers=8,
-)
-
-
-def train(precision="single"):
+def train(precision="single", gpu_index=-1):
     """use fake image for training speed test"""
     target = torch.LongTensor(args.BATCH_SIZE).random_(args.NUM_CLASSES).cuda()
     criterion = nn.CrossEntropyLoss()
@@ -91,10 +87,15 @@ def train(precision="single"):
         for model_name in MODEL_LIST[model_type]:
             if model_name[-8:] == '_Weights': continue
             model = getattr(model_type, model_name)()
-            if args.NUM_GPU > 1:
-                model = nn.DataParallel(model, device_ids=range(args.NUM_GPU))
+            if args.GPU_COUNT > 1:
+                model = nn.DataParallel(model, device_ids=range(args.GPU_COUNT))
             model = getattr(model, precision)()
-            model = model.to("cuda")
+            torch_device_name = "cuda"
+            if (gpu_index >= 0):
+                torch_device_name = "cuda:" + str(gpu_index)
+            print("torch_device_name: " + torch_device_name)
+            torch_device = torch.device(torch_device_name)
+            model = model.to(torch_device)
             durations = []
             print(f"Benchmarking Training {precision} precision type {model_name} ")
             for step, img in enumerate(rand_loader):
@@ -116,18 +117,22 @@ def train(precision="single"):
         benchmark[model_name] = durations
     return benchmark
 
-
-def inference(precision="float"):
+def inference(precision="float", gpu_index=-1):
     benchmark = {}
     with torch.no_grad():
         for model_type in MODEL_LIST.keys():
             for model_name in MODEL_LIST[model_type]:
                 if model_name[-8:] == '_Weights': continue
                 model = getattr(model_type, model_name)()
-                if args.NUM_GPU > 1:
-                    model = nn.DataParallel(model, device_ids=range(args.NUM_GPU))
+                if args.GPU_COUNT > 1:
+                    model = nn.DataParallel(model, device_ids=range(args.GPU_COUNT))
                 model = getattr(model, precision)()
-                model = model.to("cuda")
+                torch_device_name = "cuda"
+                if (gpu_index >= 0):
+                    torch_device_name = "cuda:" + str(gpu_index)
+                print("torch_device_name: " + torch_device_name)
+                torch_device = torch.device(torch_device_name)
+                model = model.to(torch_device)
                 model.eval()
                 durations = []
                 print(
@@ -149,30 +154,62 @@ def inference(precision="float"):
                 benchmark[model_name] = durations
     return benchmark
 
-
 f"{platform.uname()}\n{psutil.cpu_freq()}\ncpu_count: {psutil.cpu_count()}\nmemory_available: {psutil.virtual_memory().available}"
 
-
 if __name__ == "__main__":
-    folder_name = args.folder
+    args = parser.parse_args()
+    args.BATCH_SIZE *= args.GPU_COUNT
+
+    print("BATCH_SIZE: " + str(args.BATCH_SIZE))
+    rand_loader = DataLoader(
+        dataset=RandomDataset(args.BATCH_SIZE * (args.WARM_UP + args.NUM_TEST)),
+        batch_size=args.BATCH_SIZE,
+        shuffle=False,
+        num_workers=8,
+    )
+    gpu_count = args.GPU_COUNT
+    gpu_index = args.GPU_INDEX
+
+    print("gpu_index: " + str(gpu_index))
+    print("gpu_count: " + str(gpu_count))
+
+    if (gpu_index >= 0):
+        device_name = str(torch.cuda.get_device_name(gpu_index))
+    else:
+        device_name = str(torch.cuda.get_device_name(0))
+    device_name = f"{device_name}"
+    if (args.GPU_COUNT > 1):
+        device_name = device_name + str(gpu_count) + "X"
+    device_name = device_name.replace(" ", "_")
+    device_file_name = device_name + "_"
+    print("device_name: " + device_name)
+
+    if (gpu_index >= 0):
+        folder_name = args.folder + "/" + str(gpu_index) + "/" + device_name
+    else:
+        folder_name = args.folder + "/" + str(gpu_count) + "X"
+    print("folder_name: " + folder_name)
 
-    device_name = f"{device_name}_{args.NUM_GPU}_gpus_"
     system_configs = f"{platform.uname()}\n\
             {psutil.cpu_freq()}\n\
             cpu_count: {psutil.cpu_count()}\n\
             memory_available: {psutil.virtual_memory().available}"
     gpu_configs = [
-        torch.cuda.device_count(),
+        gpu_count,
+        torch.__version__,
+        torch.version.hip,
         torch.version.cuda,
         torch.backends.cudnn.version(),
-        torch.cuda.get_device_name(0),
+        device_name,
     ]
     gpu_configs = list(map(str, gpu_configs))
     temp = [
-        "Number of GPUs on current device : ",
-        "CUDA Version : ",
-        "Cudnn Version : ",
-        "Device Name : ",
+        "GPU_Count: ",
+        "Torch_Version : ",
+        "ROCM_Version: ",
+        "CUDA_Version: ",
+        "Cudnn_Version: ",
+        "Device_Name: ",
     ]
 
     os.makedirs(folder_name, exist_ok=True)
@@ -197,14 +234,14 @@ def inference(precision="float"):
        f.writelines(s + "\n" for s in gpu_configs)
 
    for precision in precisions:
-        train_result = train(precision)
+        train_result = train(precision, gpu_index)
        train_result_df = pd.DataFrame(train_result)
-        path = f"{folder_name}/{device_name}_{precision}_model_train_benchmark.csv"
+        path = f"{folder_name}/{device_file_name}_{precision}_model_train_benchmark.csv"
        train_result_df.to_csv(path, index=False)
 
-        inference_result = inference(precision)
+        inference_result = inference(precision, gpu_index)
        inference_result_df = pd.DataFrame(inference_result)
-        path = f"{folder_name}/{device_name}_{precision}_model_inference_benchmark.csv"
+        path = f"{folder_name}/{device_file_name}_{precision}_model_inference_benchmark.csv"
        inference_result_df.to_csv(path, index=False)
 
    now = datetime.datetime.now()
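To make the new result layout concrete, here is a minimal sketch (not part of the commit) of how the __main__ block above derives the output folder. The helper name result_folder and the device name in the examples are hypothetical.

def result_folder(base: str, device_name: str, gpu_index: int, gpu_count: int) -> str:
    # Per-GPU runs write under <base>/<gpu_index>/<device_name>;
    # multi-GPU runs write under <base>/<gpu_count>X, as in the diff above.
    if gpu_count > 1:
        device_name = device_name + str(gpu_count) + "X"
    device_name = device_name.replace(" ", "_")
    if gpu_index >= 0:
        return base + "/" + str(gpu_index) + "/" + device_name
    return base + "/" + str(gpu_count) + "X"


# Illustrative only (hypothetical device name):
# result_folder("results", "AMD Radeon RX 7900 XTX", 0, 1)  -> "results/0/AMD_Radeon_RX_7900_XTX"
# result_folder("results", "AMD Radeon RX 7900 XTX", -1, 2) -> "results/2X"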

run_benchmarks.sh

+19-5
@@ -1,14 +1,21 @@
 #! /bin/bash
 
 if [ -x "$(command -v rocm-smi)" ]; then
-    count=`rocm-smi --showproductname --json | wc -l`
+    count=`amd-smi list | grep UUID: | wc -l`
 
     echo 'AMD gpu benchmarks starting'
     echo "GPU count: " ${count}
-    for (( c=$count; c>=1; c-- ))
+    for (( ii = 0; ii < $count; ii++ ))
     do
-        python3 benchmark_models.py -g $c&& &>/dev/null
+        # run benchmark for one gpu at a time
+        echo "benchmark gpu index: $ii"
+        python3 benchmark_models.py -i $ii -g 1&& &>/dev/null
     done
+    if (( count > 1 )); then
+        # then if there are more than 1 gpu, run benchmark which allows using all of them
+        echo "multigpu benchmark: $count"
+        python3 benchmark_models.py -g $count&& &>/dev/null
+    fi
     echo 'AMD GPU benchmarks finished'
 fi
 
@@ -17,9 +24,16 @@ if [ -x "$(command -v nvidia-smi)" ]; then
 
     echo 'NVidia gpu benchmarks starting'
     echo "GPU count: " ${count}
-    for (( c=$count; c>=1; c-- ))
+    for (( ii = 0; ii < $count; ii++ ))
     do
-        python3 benchmark_models.py -g $c&& &>/dev/null
+        # run benchmark for one gpu at a time
+        python3 benchmark_models.py -i $ii -g 1&& &>/dev/null
+        echo $ii
     done
+    if (( count > 1 )); then
+        # then if there are more than 1 gpu, run benchmark which allows using all of them
+        echo "multigpu benchmark: $count"
+        python3 benchmark_models.py -g $count&& &>/dev/null
+    fi
     echo 'Nvidia GPU benchmarks finished'
 fi
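For readers more comfortable with Python, the same per-GPU-then-all-GPUs flow that run_benchmarks.sh implements can be sketched as follows. This is not part of the commit; it assumes torch is installed and benchmark_models.py is in the current directory.

import subprocess

import torch

# Benchmark each GPU on its own, then all GPUs together when there is more than one.
count = torch.cuda.device_count()
for ii in range(count):
    print(f"benchmark gpu index: {ii}")
    subprocess.run(["python3", "benchmark_models.py", "-i", str(ii), "-g", "1"], check=False)
if count > 1:
    print(f"multigpu benchmark: {count}")
    subprocess.run(["python3", "benchmark_models.py", "-g", str(count)], check=False)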
