
Commit f9f8b05

various benchmark execution fixes
- Allow specifying the gpu-index parameter in addition to the gpu-count parameter.
- The gpu-index parameter can be used to request that the benchmarks run only on a certain GPU index in the multi-GPU case.
- If there is more than one GPU, run the benchmarks separately for each GPU and then, at the end, run the tests with all GPUs used at the same time.
- Fixes: lamikr/rocm_sdk_builder#63

Signed-off-by: Mika Laitio <[email protected]>
1 parent 32fe071 commit f9f8b05
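For orientation before the diff: a minimal sketch of the device-selection behavior this commit adds to benchmark_models.py, condensed from the diff below. The helper names pick_device and prepare_model are illustrative only; they are not part of the commit.

import torch
import torch.nn as nn


def pick_device(gpu_index: int) -> torch.device:
    # A non-negative gpu_index pins the run to one device ("cuda:<index>");
    # the default of -1 keeps the generic "cuda" device, as in the diff.
    name = "cuda" if gpu_index < 0 else "cuda:" + str(gpu_index)
    return torch.device(name)


def prepare_model(model: nn.Module, gpu_index: int, gpu_count: int) -> nn.Module:
    # With more than one GPU the benchmark wraps the model in DataParallel,
    # mirroring the diff's device_ids=range(GPU_COUNT).
    if gpu_count > 1:
        model = nn.DataParallel(model, device_ids=range(gpu_count))
    return model.to(pick_device(gpu_index))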

File tree

2 files changed: +94 -43 lines changed


benchmark_models.py

+75-38
@@ -11,13 +11,13 @@
 import argparse
 from torch.utils.data import Dataset, DataLoader
 import json
+import sys
 
 torch.backends.cudnn.benchmark = True
 # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
 # This flag allows you to enable the inbuilt cudnn auto-tuner to find the best algorithm to use for your hardware.
 # If you check it using the profile tool, the cnn method such as winograd, fft, etc. is used for the first iteration and the best operation is selected for the device.
-
 MODEL_LIST = {
     models.mnasnet: models.mnasnet.__all__[1:],
     models.resnet: models.resnet.__all__[1:],
@@ -32,24 +32,32 @@
 precisions = ["float", "half", "double"]
 # For post-voltaic architectures, there is a possibility to use tensor-core at half precision.
 # Due to the gradient overflow problem, apex is recommended for practical use.
-device_name = str(torch.cuda.get_device_name(0))
 # Training settings
 parser = argparse.ArgumentParser(description="PyTorch Benchmarking")
 parser.add_argument(
     "--WARM_UP", "-w", type=int, default=5, required=False, help="Num of warm up"
 )
+
 parser.add_argument(
     "--NUM_TEST", "-n", type=int, default=50, required=False, help="Num of Test"
 )
+
 parser.add_argument(
     "--BATCH_SIZE", "-b", type=int, default=12, required=False, help="Num of batch size"
 )
+
 parser.add_argument(
     "--NUM_CLASSES", "-c", type=int, default=1000, required=False, help="Num of class"
 )
+
 parser.add_argument(
-    "--NUM_GPU", "-g", type=int, default=1, required=False, help="Num of gpus"
+    "--GPU_COUNT", "-g", type=int, default=1, required=False, help="Number of gpus used in test"
 )
+
+parser.add_argument(
+    "--GPU_INDEX", "-i", type=int, default=-1, required=False, help="Index for the used gpu"
+)
+
 parser.add_argument(
     "--folder",
     "-f",
@@ -58,9 +66,6 @@
     required=False,
     help="folder to save results",
 )
-args = parser.parse_args()
-args.BATCH_SIZE *= args.NUM_GPU
-
 
 class RandomDataset(Dataset):
     def __init__(self, length):
@@ -73,16 +78,7 @@ def __getitem__(self, index):
     def __len__(self):
         return self.len
 
-
-rand_loader = DataLoader(
-    dataset=RandomDataset(args.BATCH_SIZE * (args.WARM_UP + args.NUM_TEST)),
-    batch_size=args.BATCH_SIZE,
-    shuffle=False,
-    num_workers=8,
-)
-
-
-def train(precision="single"):
+def train(precision="single", gpu_index=-1):
     """use fake image for training speed test"""
     target = torch.LongTensor(args.BATCH_SIZE).random_(args.NUM_CLASSES).cuda()
     criterion = nn.CrossEntropyLoss()
@@ -91,10 +87,15 @@ def train(precision="single"):
         for model_name in MODEL_LIST[model_type]:
             if model_name[-8:] == '_Weights': continue
             model = getattr(model_type, model_name)()
-            if args.NUM_GPU > 1:
-                model = nn.DataParallel(model, device_ids=range(args.NUM_GPU))
+            if args.GPU_COUNT > 1:
+                model = nn.DataParallel(model, device_ids=range(args.GPU_COUNT))
             model = getattr(model, precision)()
-            model = model.to("cuda")
+            torch_device_name = "cuda"
+            if (gpu_index >= 0):
+                torch_device_name = "cuda:" + str(gpu_index)
+            print("torch_device_name: " + torch_device_name)
+            torch_device = torch.device(torch_device_name)
+            model = model.to(torch_device)
             durations = []
             print(f"Benchmarking Training {precision} precision type {model_name} ")
             for step, img in enumerate(rand_loader):
@@ -116,18 +117,22 @@ def train(precision="single"):
         benchmark[model_name] = durations
     return benchmark
 
-
-def inference(precision="float"):
+def inference(precision="float", gpu_index=-1):
     benchmark = {}
     with torch.no_grad():
         for model_type in MODEL_LIST.keys():
             for model_name in MODEL_LIST[model_type]:
                 if model_name[-8:] == '_Weights': continue
                 model = getattr(model_type, model_name)()
-                if args.NUM_GPU > 1:
-                    model = nn.DataParallel(model, device_ids=range(args.NUM_GPU))
+                if args.GPU_COUNT > 1:
+                    model = nn.DataParallel(model, device_ids=range(args.GPU_COUNT))
                 model = getattr(model, precision)()
-                model = model.to("cuda")
+                torch_device_name = "cuda"
+                if (gpu_index >= 0):
+                    torch_device_name = "cuda:" + str(gpu_index)
+                print("torch_device_name: " + torch_device_name)
+                torch_device = torch.device(torch_device_name)
+                model = model.to(torch_device)
                 model.eval()
                 durations = []
                 print(
@@ -149,30 +154,62 @@ def inference(precision="float"):
                 benchmark[model_name] = durations
     return benchmark
 
-
 f"{platform.uname()}\n{psutil.cpu_freq()}\ncpu_count: {psutil.cpu_count()}\nmemory_available: {psutil.virtual_memory().available}"
 
-
 if __name__ == "__main__":
-    folder_name = args.folder
+    args = parser.parse_args()
+    args.BATCH_SIZE *= args.GPU_COUNT
+
+    print("BATCH_SIZE: " + str(args.BATCH_SIZE))
+    rand_loader = DataLoader(
+        dataset=RandomDataset(args.BATCH_SIZE * (args.WARM_UP + args.NUM_TEST)),
+        batch_size=args.BATCH_SIZE,
+        shuffle=False,
+        num_workers=8,
+    )
+    gpu_count = args.GPU_COUNT
+    gpu_index = args.GPU_INDEX
+
+    print("gpu_index: " + str(gpu_index))
+    print("gpu_count: " + str(gpu_count))
+
+    if (gpu_index >= 0):
+        device_name = str(torch.cuda.get_device_name(gpu_index))
+    else:
+        device_name = str(torch.cuda.get_device_name(0))
+    device_name = f"{device_name}"
+    if (args.GPU_COUNT > 1):
+        device_name = device_name + str(gpu_count) + "X"
+    device_name = device_name.replace(" ", "_")
+    device_file_name = device_name + "_"
+    print("device_name: " + device_name)
+
+    if (gpu_index >= 0):
+        folder_name = args.folder + "/" + str(gpu_index) + "/" + device_name
+    else:
+        folder_name = args.folder + "/" + str(gpu_count) + "X"
+    print("folder_name: " + folder_name)
 
-    device_name = f"{device_name}_{args.NUM_GPU}_gpus_"
     system_configs = f"{platform.uname()}\n\
             {psutil.cpu_freq()}\n\
             cpu_count: {psutil.cpu_count()}\n\
             memory_available: {psutil.virtual_memory().available}"
     gpu_configs = [
-        torch.cuda.device_count(),
+        gpu_count,
+        torch.__version__,
+        torch.version.hip,
         torch.version.cuda,
         torch.backends.cudnn.version(),
-        torch.cuda.get_device_name(0),
+        device_name,
     ]
     gpu_configs = list(map(str, gpu_configs))
     temp = [
-        "Number of GPUs on current device : ",
-        "CUDA Version : ",
-        "Cudnn Version : ",
-        "Device Name : ",
+        "GPU_Count: ",
+        "Torch_Version : ",
+        "ROCM_Version: ",
+        "CUDA_Version: ",
+        "Cudnn_Version: ",
+        "Device_Name: ",
     ]
 
     os.makedirs(folder_name, exist_ok=True)
@@ -197,14 +234,14 @@ def inference(precision="float"):
        f.writelines(s + "\n" for s in gpu_configs)
 
    for precision in precisions:
-        train_result = train(precision)
+        train_result = train(precision, gpu_index)
        train_result_df = pd.DataFrame(train_result)
-        path = f"{folder_name}/{device_name}_{precision}_model_train_benchmark.csv"
+        path = f"{folder_name}/{device_file_name}_{precision}_model_train_benchmark.csv"
        train_result_df.to_csv(path, index=False)
 
-        inference_result = inference(precision)
+        inference_result = inference(precision, gpu_index)
        inference_result_df = pd.DataFrame(inference_result)
-        path = f"{folder_name}/{device_name}_{precision}_model_inference_benchmark.csv"
+        path = f"{folder_name}/{device_file_name}_{precision}_model_inference_benchmark.csv"
        inference_result_df.to_csv(path, index=False)
 
    now = datetime.datetime.now()
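To make the new result layout concrete, here is a minimal sketch (not part of the commit) of how the __main__ block above derives the output folder. The helper name result_folder and the device name in the examples are hypothetical.

def result_folder(base: str, device_name: str, gpu_index: int, gpu_count: int) -> str:
    # Per-GPU runs write under <base>/<gpu_index>/<device_name>;
    # multi-GPU runs write under <base>/<gpu_count>X, as in the diff above.
    if gpu_count > 1:
        device_name = device_name + str(gpu_count) + "X"
    device_name = device_name.replace(" ", "_")
    if gpu_index >= 0:
        return base + "/" + str(gpu_index) + "/" + device_name
    return base + "/" + str(gpu_count) + "X"


# Illustrative only (hypothetical device name):
# result_folder("results", "AMD Radeon RX 7900 XTX", 0, 1)  -> "results/0/AMD_Radeon_RX_7900_XTX"
# result_folder("results", "AMD Radeon RX 7900 XTX", -1, 2) -> "results/2X"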

run_benchmarks.sh

+19-5
@@ -1,14 +1,21 @@
 #! /bin/bash
 
 if [ -x "$(command -v rocm-smi)" ]; then
-    count=`rocm-smi --showproductname --json | wc -l`
+    count=`amd-smi list | grep UUID: | wc -l`
 
     echo 'AMD gpu benchmarks starting'
     echo "GPU count: " ${count}
-    for (( c=$count; c>=1; c-- ))
+    for (( ii = 0; ii < $count; ii++ ))
     do
-        python3 benchmark_models.py -g $c&& &>/dev/null
+        # run benchmark for one gpu at a time
+        echo "benchmark gpu index: $ii"
+        python3 benchmark_models.py -i $ii -g 1&& &>/dev/null
     done
+    if (( count > 1 )); then
+        # then if there are more than 1 gpu, run benchmark which allows using all of them
+        echo "multigpu benchmark: $count"
+        python3 benchmark_models.py -g $count&& &>/dev/null
+    fi
     echo 'AMD GPU benchmarks finished'
 fi
 
@@ -17,9 +24,16 @@ if [ -x "$(command -v nvidia-smi)" ]; then
 
     echo 'NVidia gpu benchmarks starting'
     echo "GPU count: " ${count}
-    for (( c=$count; c>=1; c-- ))
+    for (( ii = 0; ii < $count; ii++ ))
     do
-        python3 benchmark_models.py -g $c&& &>/dev/null
+        # run benchmark for one gpu at a time
+        python3 benchmark_models.py -i $ii -g 1&& &>/dev/null
+        echo $ii
     done
+    if (( count > 1 )); then
+        # then if there are more than 1 gpu, run benchmark which allows using all of them
+        echo "multigpu benchmark: $count"
+        python3 benchmark_models.py -g $count&& &>/dev/null
+    fi
     echo 'Nvidia GPU benchmarks finished'
 fi
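For readers more comfortable with Python, the same per-GPU-then-all-GPUs flow that run_benchmarks.sh implements can be sketched as follows. This is not part of the commit; it assumes torch is installed and benchmark_models.py is in the current directory.

import subprocess

import torch

# Benchmark each GPU on its own, then all GPUs together when there is more than one.
count = torch.cuda.device_count()
for ii in range(count):
    print(f"benchmark gpu index: {ii}")
    subprocess.run(["python3", "benchmark_models.py", "-i", str(ii), "-g", "1"], check=False)
if count > 1:
    print(f"multigpu benchmark: {count}")
    subprocess.run(["python3", "benchmark_models.py", "-g", str(count)], check=False)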
