From 48062eb6617cab50ab65b362929cdbd491b5b5f1 Mon Sep 17 00:00:00 2001 From: Farhad Ramezanghorbani Date: Thu, 16 Jan 2025 00:52:30 +0000 Subject: [PATCH 1/4] lr multiplier for given keys Signed-off-by: Farhad Ramezanghorbani --- .../src/bionemo/esm2/scripts/finetune_esm2.py | 27 ++++++++++++++++++- .../esm2/scripts/test_finetune_esm2.py | 4 +++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/scripts/finetune_esm2.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/scripts/finetune_esm2.py index 1b35f169f..5d7cda3a7 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/scripts/finetune_esm2.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/scripts/finetune_esm2.py @@ -76,6 +76,8 @@ def train_model( experiment_name: str, resume_if_exists: bool, precision: PrecisionTypes, + scale_lr_layer: Optional[str] = None, + lr_multiplier: float = 1.0, wandb_entity: Optional[str] = None, wandb_project: Optional[str] = None, wandb_offline: bool = False, @@ -125,6 +127,8 @@ def train_model( result_dir that stores the logs and checkpoints. resume_if_exists (bool): attempt to resume if the checkpoint exists [FIXME @skothenhill this doesn't work yet] precision (PrecisionTypes): Precision type for training (e.g., float16, float32) + scale_lr_layer (Optional[str]): layer names for which the lr is scaled by lr_multiplier + lr_multiplier (float): lr multiplier for parameters in scale_lr_layer wandb_entity (Optional[str]): The team posting this run (default: your username or your default team) wandb_project (Optional[str]): The name of the project to which this run will belong wandb_offline (bool): Run offline (data can be streamed later to wandb servers). @@ -271,8 +275,13 @@ def train_model( weight_decay=0.01, adam_beta1=0.9, adam_beta2=0.98, - ) + ), ) + # fiddle cannot serialize lambda functions, so the scale_lr_cond lambda is set on the optimizer here + # rather than being serialized as part of the optimizer configuration + if scale_lr_layer: + optimizer.scale_lr_cond = lambda name, param: scale_lr_layer in name + optimizer.lr_mult = lr_multiplier module = biobert_lightning_module(config=config, tokenizer=tokenizer, optimizer=optimizer) @@ -342,6 +351,8 @@ def finetune_esm2_entrypoint(): tensor_model_parallel_size=args.tensor_model_parallel_size, accumulate_grad_batches=args.accumulate_grad_batches, precision=args.precision, + scale_lr_layer=args.scale_lr_layer, + lr_multiplier=args.lr_multiplier, experiment_name=args.experiment_name, resume_if_exists=args.resume_if_exists, restore_from_checkpoint_path=args.restore_from_checkpoint_path, @@ -394,6 +405,20 @@ def get_parser(): default=4e-4, help="Learning rate for training. Default is 4e-4", ) + parser.add_argument( + "--scale-lr-layer", + type=str, + required=False, + default=None, + help="Layer name for which we scale the lr by lr-multiplier", + ) + parser.add_argument( + "--lr-multiplier", + type=float, + required=False, + default=1.0, + help="Learning rate multiplier for layers with scale-lr-layer in their name", + ) parser.add_argument( "--create-tensorboard-logger", action="store_true", default=False, help="Create a tensorboard logger." 
) diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py index bc929a1ab..251635c7c 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py @@ -69,6 +69,8 @@ def test_esm2_finetune_token_classifier( log_every_n_steps=n_steps_train // 2, num_dataset_workers=10, lr=1e-5, + scale_lr_layer="classification_head", + lr_multiplier=1e2, micro_batch_size=4, accumulate_grad_batches=1, resume_if_exists=False, @@ -114,6 +116,8 @@ def test_esm2_finetune_regressor( log_every_n_steps=n_steps_train // 2, num_dataset_workers=10, lr=1e-5, + scale_lr_layer="regression_head", + lr_multiplier=1e2, micro_batch_size=4, accumulate_grad_batches=1, resume_if_exists=False, From 83a79974ac7fac378a25d478d3b0fc6c0322ac74 Mon Sep 17 00:00:00 2001 From: Farhad Ramezanghorbani Date: Fri, 17 Jan 2025 17:39:45 +0000 Subject: [PATCH 2/4] updated docs Signed-off-by: Farhad Ramezanghorbani --- .../user-guide/examples/bionemo-esm2/finetune.ipynb | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb b/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb index aa26cd72f..5996e873b 100644 --- a/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb +++ b/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb @@ -435,7 +435,14 @@ "```bash\n", "finetune_esm2 --help \n", "```\n", - "For a detailed description of training loop and the arguments please refer to the [ESM-2 Pretraining](./pretrain.md) tutorial." + "\n", + "For a detailed description of the training loop and the arguments, please refer to the [ESM-2 Pretraining](./pretrain.md) tutorial.\n", + "\n", + "#### Scaled LR for fine-tuning head parameters\n", + "We can assign a different LR for specific layers (e.g. 
task head) during fine-tuning by specifying the name of the target layer along with an LR multiplier.\n", + "\n", + "- `--lr-multiplier`: a float that scales the base `--lr`\n", + "- `--scale-lr-layer`: the name of the layer(s) whose LR is scaled by `--lr-multiplier`" ] }, { @@ -522,6 +529,8 @@ " --val-check-interval 10 \\\n", " --log-every-n-steps 10 \\\n", " --lr 5e-3 \\\n", + " --lr-multiplier 1e2 \\\n", + " --scale-lr-layer \"regression_head\" \\\n", " --result-dir {work_dir} \\\n", " --micro-batch-size 2 \\\n", " --num-gpus 1 \\\n", @@ -689,6 +698,8 @@ " --val-check-interval 10 \\\n", " --log-every-n-steps 10 \\\n", " --lr 5e-3 \\\n", + " --lr-multiplier 1e2 \\\n", + " --scale-lr-layer \"classification_head\" \\\n", " --result-dir {work_dir} \\\n", " --micro-batch-size 2 \\\n", " --num-gpus 1 \\\n", From 19fbdc7bb6a078192290a83ba87a713000e41055 Mon Sep 17 00:00:00 2001 From: Farhad Ramezanghorbani Date: Fri, 17 Jan 2025 19:05:35 +0000 Subject: [PATCH 3/4] undo grad overlap reduce Signed-off-by: Farhad Ramezanghorbani --- .../src/bionemo/esm2/scripts/finetune_esm2.py | 16 +++++++++--- .../esm2/scripts/test_finetune_esm2.py | 26 ++++++++++++++++--- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/scripts/finetune_esm2.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/scripts/finetune_esm2.py index 5d7cda3a7..93c55a5f3 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/scripts/finetune_esm2.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/scripts/finetune_esm2.py @@ -76,6 +76,7 @@ def train_model( experiment_name: str, resume_if_exists: bool, precision: PrecisionTypes, + encoder_frozen: bool = False, scale_lr_layer: Optional[str] = None, lr_multiplier: float = 1.0, wandb_entity: Optional[str] = None, @@ -100,7 +101,7 @@ def train_model( dataset_class: Type[InMemoryProteinDataset] = InMemorySingleValueDataset, config_class: Type[BioBertConfig] = ESM2FineTuneSeqConfig, metric_tracker: Callback | None = None, - overlap_grad_reduce: bool = True, + overlap_grad_reduce: bool = False, # Default to False to avoid a communication issue in the gradient synchronization step overlap_param_gather: bool = True, average_in_collective: bool = True, grad_reduce_in_fp32: bool = False, @@ -127,6 +128,7 @@ def train_model( result_dir that stores the logs and checkpoints. resume_if_exists (bool): attempt to resume if the checkpoint exists [FIXME @skothenhill this doesn't work yet] precision (PrecisionTypes): Precision type for training (e.g., float16, float32) + encoder_frozen (bool): Freeze the encoder parameters. Default is False. 
scale_lr_layer (Optional[str]): layer names for which the lr is scaled by lr_multiplier lr_multiplier (float): lr multiplier for parameters in scale_lr_layer wandb_entity (Optional[str]): The team posting this run (default: your username or your default team) @@ -258,6 +260,7 @@ def train_model( ) # Configure the model config = config_class( + encoder_frozen=encoder_frozen, params_dtype=get_autocast_dtype(precision), pipeline_dtype=get_autocast_dtype(precision), autocast_dtype=get_autocast_dtype(precision), # setting this speeds things up a lot @@ -351,6 +354,7 @@ def finetune_esm2_entrypoint(): tensor_model_parallel_size=args.tensor_model_parallel_size, accumulate_grad_batches=args.accumulate_grad_batches, precision=args.precision, + encoder_frozen=args.encoder_frozen, scale_lr_layer=args.scale_lr_layer, lr_multiplier=args.lr_multiplier, experiment_name=args.experiment_name, @@ -365,7 +369,7 @@ def finetune_esm2_entrypoint(): nsys_ranks=args.nsys_ranks, dataset_class=args.dataset_class, config_class=args.config_class, - overlap_grad_reduce=not args.no_overlap_grad_reduce, + overlap_grad_reduce=args.overlap_grad_reduce, overlap_param_gather=not args.no_overlap_param_gather, average_in_collective=not args.no_average_in_collective, grad_reduce_in_fp32=args.grad_reduce_in_fp32, @@ -398,6 +402,12 @@ def get_parser(): default="bf16-mixed", help="Precision type to use for training.", ) + parser.add_argument( + "--encoder-frozen", + action="store_true", + default=False, + help="Freeze the encoder parameters", + ) parser.add_argument( "--lr", type=float, @@ -596,7 +606,7 @@ def get_parser(): ) # DDP config parser.add_argument( - "--no-overlap-grad-reduce", + "--overlap-grad-reduce", action="store_true", default=False, ) diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py index 251635c7c..03e01c3cc 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_finetune_esm2.py @@ -46,9 +46,11 @@ def data_to_csv(data, tmp_path): @pytest.mark.needs_gpu +@pytest.mark.parametrize("encoder_frozen", [True, False]) def test_esm2_finetune_token_classifier( tmp_path, dummy_data_per_token_classification_ft, + encoder_frozen, n_steps_train: int = 50, seed: int = 42, ): @@ -75,6 +77,7 @@ def test_esm2_finetune_token_classifier( accumulate_grad_batches=1, resume_if_exists=False, precision="bf16-mixed", + encoder_frozen=encoder_frozen, dataset_class=InMemoryPerTokenValueDataset, config_class=ESM2FineTuneTokenConfig, metric_tracker=MetricTracker(metrics_to_track_val=["loss"], metrics_to_track_train=["loss"]), @@ -89,13 +92,17 @@ def test_esm2_finetune_token_classifier( encoder_requires_grad = [ p.requires_grad for name, p in trainer.model.named_parameters() if "classification_head" not in name ] - assert not all(encoder_requires_grad), "Pretrained model is not fully frozen during fine-tuning" + assert ( + not all(encoder_requires_grad) == encoder_frozen + ), f"Conflict in param requires_grad when encoder_frozen={encoder_frozen}" @pytest.mark.needs_gpu +@pytest.mark.parametrize("encoder_frozen", [True, False]) def test_esm2_finetune_regressor( tmp_path, dummy_data_single_value_regression_ft, + encoder_frozen, n_steps_train: int = 50, seed: int = 42, ): @@ -122,6 +129,7 @@ def test_esm2_finetune_regressor( accumulate_grad_batches=1, resume_if_exists=False, precision="bf16-mixed", + encoder_frozen=encoder_frozen, 
dataset_class=InMemorySingleValueDataset, config_class=ESM2FineTuneSeqConfig, metric_tracker=MetricTracker(metrics_to_track_val=["loss"], metrics_to_track_train=["loss"]), @@ -136,7 +144,9 @@ def test_esm2_finetune_regressor( encoder_requires_grad = [ p.requires_grad for name, p in trainer.model.named_parameters() if "regression_head" not in name ] - assert not all(encoder_requires_grad), "Pretrained model is not fully frozen during fine-tuning" + assert ( + not all(encoder_requires_grad) == encoder_frozen + ), f"Conflict in param requires_grad when encoder_frozen={encoder_frozen}" @pytest.fixture @@ -262,7 +272,7 @@ def test_get_parser(): "--nsys-ranks", "0", "1", - "--no-overlap-grad-reduce", + "--overlap-grad-reduce", "--no-overlap-param-gather", "--no-average-in-collective", "--grad-reduce-in-fp32", @@ -270,6 +280,11 @@ def test_get_parser(): "InMemoryPerTokenValueDataset", "--config-class", "ESM2FineTuneTokenConfig", + "--encoder-frozen", + "--lr-multiplier", + "1e2", + "--scale-lr-layer", + "dummy_layer", ] ) @@ -311,9 +326,12 @@ def test_get_parser(): assert args.nsys_start_step == 10 assert args.nsys_end_step == 50 assert args.nsys_ranks == [0, 1] - assert args.no_overlap_grad_reduce is True + assert args.overlap_grad_reduce is True assert args.no_overlap_param_gather is True assert args.no_average_in_collective is True assert args.grad_reduce_in_fp32 is True assert args.dataset_class == InMemoryPerTokenValueDataset assert args.config_class == ESM2FineTuneTokenConfig + assert args.encoder_frozen is True + assert args.lr_multiplier == 100 + assert args.scale_lr_layer == "dummy_layer" From 987dd9248fab06bfc1528be9c4e57d1e737d0f6c Mon Sep 17 00:00:00 2001 From: Farhad Ramezanghorbani Date: Fri, 17 Jan 2025 19:10:08 +0000 Subject: [PATCH 4/4] expose encoder_frozen Signed-off-by: Farhad Ramezanghorbani --- docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb b/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb index 5996e873b..359eb0b7f 100644 --- a/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb +++ b/docs/docs/user-guide/examples/bionemo-esm2/finetune.ipynb @@ -528,6 +528,7 @@ " --num-gpus 1 \\\n", " --val-check-interval 10 \\\n", " --log-every-n-steps 10 \\\n", + " --encoder-frozen \\\n", " --lr 5e-3 \\\n", " --lr-multiplier 1e2 \\\n", " --scale-lr-layer \"regression_head\" \\\n", @@ -697,6 +698,7 @@ " --num-gpus 1 \\\n", " --val-check-interval 10 \\\n", " --log-every-n-steps 10 \\\n", + " --encoder-frozen \\\n", " --lr 5e-3 \\\n", " --lr-multiplier 1e2 \\\n", " --scale-lr-layer \"classification_head\" \\\n",
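Note on the LR-scaling mechanism (illustrative, not part of the patch series): PATCH 1 attaches `scale_lr_cond` and `lr_mult` to the optimizer so that any parameter whose name contains the `--scale-lr-layer` string is trained with the base LR times `lr_multiplier`. The sketch below shows the same idea in plain PyTorch using explicit parameter groups; the toy model, layer names, and values are assumptions for illustration only and do not come from the BioNeMo code.

```python
import torch
from torch import nn

# Toy stand-in for the fine-tuning setup: an encoder plus a task head.
model = nn.ModuleDict(
    {
        "encoder": nn.Linear(8, 8),
        "regression_head": nn.Linear(8, 1),
    }
)

base_lr = 5e-3
lr_multiplier = 1e2
scale_lr_layer = "regression_head"  # substring matched against parameter names

# Equivalent of the scale_lr_cond lambda: select parameters whose name contains the target substring.
scaled, regular = [], []
for name, param in model.named_parameters():
    (scaled if scale_lr_layer in name else regular).append(param)

# Two parameter groups: the head trains at base_lr * lr_multiplier, everything else at base_lr.
optimizer = torch.optim.Adam(
    [
        {"params": regular, "lr": base_lr},
        {"params": scaled, "lr": base_lr * lr_multiplier},
    ]
)

for group in optimizer.param_groups:
    n_params = sum(p.numel() for p in group["params"])
    print(f"lr={group['lr']:g}  n_params={n_params}")
```

With these values the sketch mirrors the notebook commands (`--lr 5e-3 --lr-multiplier 1e2 --scale-lr-layer "regression_head"`): the head parameters train at an LR of 0.5 while the rest of the model stays at 5e-3.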