
Commit 4fca54f

Wong4j authored and nv-kkudrynski committed
[RN50/Paddle] Fix 2308 compatibility issue
1 parent 0131db6 commit 4fca54f

File tree

4 files changed: +144 −98 lines

PaddlePaddle/Classification/RN50v1.5/Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/paddlepaddle:23.06-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/paddlepaddle:23.09-py3
 FROM ${FROM_IMAGE_NAME}
 
 ADD requirements.txt /workspace/

PaddlePaddle/Classification/RN50v1.5/README.md

Lines changed: 2 additions & 2 deletions

@@ -233,7 +233,7 @@ The following section lists the requirements you need to meet to start training
 This repository contains a Dockerfile that extends the CUDA NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
 
 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-* [PaddlePaddle 22.05-py3 NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/paddlepaddle) or newer
+* [PaddlePaddle 23.09-py3 NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/paddlepaddle) or newer
 * Supported GPUs:
   * [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
 
@@ -289,7 +289,7 @@ docker build . -t nvidia_resnet50
 
 ### 4. Start an interactive session in the NGC container to run training/inference.
 ```bash
-nvidia-docker run --rm -it -v <path to imagenet>:/imagenet --ipc=host nvidia_resnet50
+nvidia-docker run --rm -it -v <path to imagenet>:/imagenet --ipc=host -e FLAGS_apply_pass_to_program=1 nvidia_resnet50
 ```
 
 ### 5. Start training
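In the updated command, docker's `-e` switch exports `FLAGS_apply_pass_to_program=1` into the container environment so Paddle picks the flag up when it initializes. As a hedged alternative (an illustration of mine, not part of this commit), the same flag can be set from Python before `paddle` is imported:

```python
# Hypothetical alternative to passing -e on the docker command line: export the
# flag from the training script itself, before paddle is imported, so that
# Paddle sees it during initialization.
import os

os.environ.setdefault("FLAGS_apply_pass_to_program", "1")

import paddle  # noqa: E402  (imported only after the environment variable is set)

print(paddle.__version__)
```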

PaddlePaddle/Classification/RN50v1.5/program.py

Lines changed: 83 additions & 66 deletions
@@ -12,26 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import time
 import logging
-
+import time
 from profile import Profiler
+
+import dllogger
+import models
 import numpy as np
-from optimizer import build_optimizer
 from lr_scheduler import build_lr_scheduler
+from optimizer import build_optimizer
 from utils.misc import AverageMeter
 from utils.mode import Mode, RunScope
 from utils.utility import get_num_trainers
-import models
-
-import dllogger
 
 import paddle
 import paddle.nn.functional as F
 from paddle.distributed import fleet
 from paddle.distributed.fleet import DistributedStrategy
-from paddle.static import sparsity
 from paddle.distributed.fleet.meta_optimizers.common import CollectiveHelper
+from paddle.incubate import asp as sparsity
 
 
 def create_feeds(image_shape):
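The functionally significant line in the import hunk above is the sparsity import: the ASP utilities move from `paddle.static.sparsity` to `paddle.incubate.asp`. For a script that still has to run inside older NGC containers, a minimal compatibility shim (my suggestion, not part of this commit) could look like this:

```python
# Hypothetical shim: prefer the new ASP location (Paddle >= 2.5 / NGC 23.09)
# and fall back to the legacy module on older releases where it still exists.
try:
    from paddle.incubate import asp as sparsity
except ImportError:
    from paddle.static import sparsity
```

The rest of the hunk is an import reordering with no behavioral effect.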
@@ -45,11 +44,13 @@ def create_feeds(image_shape):
         key (string): Name of variable to feed.
         Value (tuple): paddle.static.data.
     """
-    feeds = dict()
+    feeds = {}
     feeds['data'] = paddle.static.data(
-        name="data", shape=[None] + image_shape, dtype="float32")
+        name="data", shape=[None] + image_shape, dtype="float32"
+    )
     feeds['label'] = paddle.static.data(
-        name="label", shape=[None, 1], dtype="int64")
+        name="label", shape=[None, 1], dtype="int64"
+    )
 
     return feeds
 
@@ -70,16 +71,15 @@ def create_fetchs(out, feeds, class_num, label_smoothing=0, mode=Mode.TRAIN):
         key (string): Name of variable to fetch.
         Value (tuple): (variable, AverageMeter).
     """
-    fetchs = dict()
+    fetchs = {}
     target = paddle.reshape(feeds['label'], [-1, 1])
 
     if mode == Mode.TRAIN:
         if label_smoothing == 0:
             loss = F.cross_entropy(out, target)
         else:
             label_one_hot = F.one_hot(target, class_num)
-            soft_target = F.label_smooth(
-                label_one_hot, epsilon=label_smoothing)
+            soft_target = F.label_smooth(label_one_hot, epsilon=label_smoothing)
             soft_target = paddle.reshape(soft_target, shape=[-1, class_num])
             log_softmax = -F.log_softmax(out, axis=-1)
             loss = paddle.sum(log_softmax * soft_target, axis=-1)
@@ -94,19 +94,23 @@ def create_fetchs(out, feeds, class_num, label_smoothing=0, mode=Mode.TRAIN):
 
     acc_top1 = paddle.metric.accuracy(input=out, label=target, k=1)
     acc_top5 = paddle.metric.accuracy(input=out, label=target, k=5)
-    metric_dict = dict()
+    metric_dict = {}
     metric_dict["top1"] = acc_top1
     metric_dict["top5"] = acc_top5
 
     for key in metric_dict:
         if mode != Mode.TRAIN and paddle.distributed.get_world_size() > 1:
             paddle.distributed.all_reduce(
-                metric_dict[key], op=paddle.distributed.ReduceOp.SUM)
-            metric_dict[key] = metric_dict[
-                key] / paddle.distributed.get_world_size()
+                metric_dict[key], op=paddle.distributed.ReduceOp.SUM
+            )
+            metric_dict[key] = (
+                metric_dict[key] / paddle.distributed.get_world_size()
+            )
 
-        fetchs[key] = (metric_dict[key], AverageMeter(
-            key, '7.4f', need_avg=True))
+        fetchs[key] = (
+            metric_dict[key],
+            AverageMeter(key, '7.4f', need_avg=True),
+        )
 
     return fetchs
 
@@ -127,13 +131,16 @@ def create_strategy(args, is_train=True):
     exec_strategy = paddle.static.ExecutionStrategy()
 
     exec_strategy.num_threads = 1
-    exec_strategy.num_iteration_per_drop_scope = (10000 if args.amp and
-                                                  args.use_pure_fp16 else 10)
-
-    paddle.set_flags({
-        'FLAGS_cudnn_exhaustive_search': True,
-        'FLAGS_conv_workspace_size_limit': 4096
-    })
+    exec_strategy.num_iteration_per_drop_scope = (
+        10000 if args.amp and args.use_pure_fp16 else 10
+    )
+
+    paddle.set_flags(
+        {
+            'FLAGS_cudnn_exhaustive_search': True,
+            'FLAGS_conv_workspace_size_limit': 4096,
+        }
+    )
 
     if not is_train:
         build_strategy.fix_op_run_order = True
@@ -177,7 +184,7 @@ def dist_optimizer(args, optimizer):
         dist_strategy.amp_configs = {
             "init_loss_scaling": args.scale_loss,
             "use_dynamic_loss_scaling": args.use_dynamic_loss_scaling,
-            "use_pure_fp16": args.use_pure_fp16
+            "use_pure_fp16": args.use_pure_fp16,
         }
 
     dist_strategy.asp = args.asp
@@ -223,14 +230,16 @@ def build(args, main_prog, startup_prog, step_each_epoch, is_train=True):
             input_image_channel=input_image_channel,
             data_format=data_format,
             use_pure_fp16=use_pure_fp16,
-            bn_weight_decay=bn_weight_decay)
+            bn_weight_decay=bn_weight_decay,
+        )
         out = model(feeds["data"])
 
         fetchs = create_fetchs(
-            out, feeds, class_num, args.label_smoothing, mode=mode)
+            out, feeds, class_num, args.label_smoothing, mode=mode
+        )
 
         if args.asp:
-            sparsity.set_excluded_layers(main_prog, [model.fc.weight.name])
+            sparsity.set_excluded_layers(main_program=main_prog, param_names=[model.fc.weight.name])
 
         lr_scheduler = None
         optimizer = None
@@ -244,10 +253,13 @@ def build(args, main_prog, startup_prog, step_each_epoch, is_train=True):
         # This is a workaround to "Communicator of ring id 0 has not been initialized.".
         # Since Paddle's design, the initialization would be done inside train program,
         # eval_only need to manually call initialization.
-        if args.run_scope == RunScope.EVAL_ONLY and \
-                paddle.distributed.get_world_size() > 1:
+        if (
+            args.run_scope == RunScope.EVAL_ONLY
+            and paddle.distributed.get_world_size() > 1
+        ):
             collective_helper = CollectiveHelper(
-                role_maker=fleet.PaddleCloudRoleMaker(is_collective=True))
+                role_maker=fleet.PaddleCloudRoleMaker(is_collective=True)
+            )
             collective_helper.update_startup_program(startup_prog)
 
     return fetchs, lr_scheduler, feeds, optimizer
@@ -270,22 +282,22 @@ def compile_prog(args, program, loss_name=None, is_train=True):
     build_strategy, exec_strategy = create_strategy(args, is_train)
 
     compiled_program = paddle.static.CompiledProgram(
-        program).with_data_parallel(
-            loss_name=loss_name,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
+        program, build_strategy=build_strategy
+    )
 
     return compiled_program
 
 
-def run(args,
-        dataloader,
-        exe,
-        program,
-        fetchs,
-        epoch,
-        mode=Mode.TRAIN,
-        lr_scheduler=None):
+def run(
+    args,
+    dataloader,
+    exe,
+    program,
+    fetchs,
+    epoch,
+    mode=Mode.TRAIN,
+    lr_scheduler=None,
+):
     """
     Execute program.
 
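The hunk above drops the `with_data_parallel()` chain, which no longer exists in the 23.09 release; `paddle.static.CompiledProgram` now receives the build strategy directly. A self-contained sketch of that pattern on a toy program (my own example under that assumption, not code from this repository):

```python
# Minimal sketch: compile a static program with a BuildStrategy and run it.
import numpy as np
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name="x", shape=[None, 8], dtype="float32")
    y = paddle.static.nn.fc(x, size=4)
    loss = paddle.mean(y)

build_strategy = paddle.static.BuildStrategy()
compiled = paddle.static.CompiledProgram(main_prog, build_strategy=build_strategy)

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)
out, = exe.run(
    compiled,
    feed={"x": np.random.rand(2, 8).astype("float32")},
    fetch_list=[loss],
)
print(out)
```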
@@ -312,11 +324,11 @@ def run(args,
         if fetchs[k][1] is not None:
             metric_dict[k] = fetchs[k][1]
 
-    metric_dict["batch_time"] = AverageMeter(
-        'batch_time', '.5f', postfix=" s,")
+    metric_dict["batch_time"] = AverageMeter('batch_time', '.5f', postfix=" s,")
     metric_dict["data_time"] = AverageMeter('data_time', '.5f', postfix=" s,")
     metric_dict["compute_time"] = AverageMeter(
-        'compute_time', '.5f', postfix=" s,")
+        'compute_time', '.5f', postfix=" s,"
+    )
 
     for m in metric_dict.values():
         m.reset()
@@ -328,8 +340,7 @@ def run(args,
     batch_size = None
     latency = []
 
-    total_benchmark_steps = \
-        args.benchmark_steps + args.benchmark_warmup_steps
+    total_benchmark_steps = args.benchmark_steps + args.benchmark_warmup_steps
 
     dataloader.reset()
     while True:
@@ -361,11 +372,12 @@ def run(args,
         batch_size = batch[0]["data"].shape()[0]
         feed_dict = batch[0]
 
-        with profiler.profile_tag(idx, "Training"
-                                  if mode == Mode.TRAIN else "Evaluation"):
-            results = exe.run(program=program,
-                              feed=feed_dict,
-                              fetch_list=fetch_list)
+        with profiler.profile_tag(
+            idx, "Training" if mode == Mode.TRAIN else "Evaluation"
+        ):
+            results = exe.run(
+                program=program, feed=feed_dict, fetch_list=fetch_list
+            )
 
         for name, m in zip(fetchs.keys(), results):
             if name in metric_dict:
@@ -382,15 +394,16 @@ def run(args,
             tic = time.perf_counter()
 
         if idx % args.print_interval == 0:
-            log_msg = dict()
+            log_msg = {}
             log_msg['loss'] = metric_dict['loss'].val.item()
             log_msg['top1'] = metric_dict['top1'].val.item()
             log_msg['top5'] = metric_dict['top5'].val.item()
             log_msg['data_time'] = metric_dict['data_time'].val
             log_msg['compute_time'] = metric_dict['compute_time'].val
             log_msg['batch_time'] = metric_dict['batch_time'].val
-            log_msg['ips'] = \
+            log_msg['ips'] = (
                 batch_size * num_trainers / metric_dict['batch_time'].val
+            )
             if mode == Mode.TRAIN:
                 log_msg['lr'] = metric_dict['lr'].val
             log_info((epoch, idx), log_msg, mode)
@@ -406,10 +419,10 @@ def run(args,
                 logging.info("Begin benchmark at step %d", idx + 1)
 
             if idx == total_benchmark_steps:
-                benchmark_data = dict()
-                benchmark_data[
-                    'ips'] = batch_size * num_trainers / metric_dict[
-                        'batch_time'].avg
+                benchmark_data = {}
+                benchmark_data['ips'] = (
+                    batch_size * num_trainers / metric_dict['batch_time'].avg
+                )
                 if mode == mode.EVAL:
                     latency = np.array(latency) * 1000
                     quantile = np.quantile(latency, [0.9, 0.95, 0.99])
@@ -422,15 +435,19 @@ def run(args,
                 logging.info("End benchmark at epoch step %d", idx)
                 return benchmark_data
 
-    epoch_data = dict()
+    epoch_data = {}
     epoch_data['loss'] = metric_dict['loss'].avg.item()
     epoch_data['epoch_time'] = metric_dict['batch_time'].total
-    epoch_data['ips'] = batch_size * num_trainers * \
-        metric_dict["batch_time"].count / metric_dict["batch_time"].sum
+    epoch_data['ips'] = (
+        batch_size
+        * num_trainers
+        * metric_dict["batch_time"].count
+        / metric_dict["batch_time"].sum
+    )
     if mode == Mode.EVAL:
         epoch_data['top1'] = metric_dict['top1'].avg.item()
         epoch_data['top5'] = metric_dict['top5'].avg.item()
-    log_info((epoch, ), epoch_data, mode)
+    log_info((epoch,), epoch_data, mode)
 
     return epoch_data
 
@@ -445,7 +462,7 @@ def log_info(step, metrics, mode):
         mode(utils.Mode): Train or eval mode.
     """
     prefix = 'train' if mode == Mode.TRAIN else 'val'
-    dllogger_iter_data = dict()
+    dllogger_iter_data = {}
     for key in metrics:
         dllogger_iter_data[f"{prefix}.{key}"] = metrics[key]
     dllogger.log(step=step, data=dllogger_iter_data)
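For completeness, `log_info()` only emits records if dllogger has been initialized with at least one backend elsewhere in the scripts. A minimal, hedged sketch of such an initialization (file name and step values are placeholders of mine, not taken from this repository):

```python
# Assumed dllogger setup: one human-readable backend and one JSON file backend.
import dllogger
from dllogger import JSONStreamBackend, StdOutBackend, Verbosity

dllogger.init(
    backends=[
        StdOutBackend(Verbosity.DEFAULT),
        JSONStreamBackend(Verbosity.VERBOSE, "train_log.json"),
    ]
)

dllogger.log(step=(0, 0), data={"train.loss": 6.9})
dllogger.flush()
```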
