diff --git a/transformers_doc/en/pytorch/training.ipynb b/transformers_doc/en/pytorch/training.ipynb index 6826183e..c8e5d9b0 100644 --- a/transformers_doc/en/pytorch/training.ipynb +++ b/transformers_doc/en/pytorch/training.ipynb @@ -237,6 +237,121 @@ "trainer.push_to_hub()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Native PyTorch Training Loop with Gradient Clipping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you prefer more control over training, you can write a native PyTorch training loop instead of using [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer).\n", + "\n", + "One important technique to include is **gradient clipping** (`torch.nn.utils.clip_grad_norm_`), which prevents exploding gradients — a common problem when fine-tuning large models. Exploding gradients cause the loss to become `NaN` or the model weights to grow uncontrollably, destabilizing training.\n", + "\n", + "First, set up the DataLoader, optimizer, and learning rate scheduler." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch.utils.data import DataLoader\n", + "from transformers import AdamW, get_scheduler\n", + "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n", + "from datasets import load_dataset\n", + "\n", + "# Load dataset and tokenizer\n", + "dataset = load_dataset(\"yelp_review_full\")\n", + "tokenizer = AutoTokenizer.from_pretrained(\"google-bert/bert-base-cased\")\n", + "\n", + "def tokenize(examples):\n", + " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", + "\n", + "tokenized_dataset = dataset.map(tokenize, batched=True)\n", + "\n", + "# Use a small subset to keep things fast\n", + "small_train = tokenized_dataset[\"train\"].shuffle(seed=42).select(range(1000))\n", + "small_eval = tokenized_dataset[\"test\"].shuffle(seed=42).select(range(200))\n", + "\n", + "# Format datasets for PyTorch\n", + "small_train.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"label\"])\n", + "small_eval.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"label\"])\n", + "\n", + "train_dataloader = DataLoader(small_train, shuffle=True, batch_size=8)\n", + "eval_dataloader = DataLoader(small_eval, batch_size=8)\n", + "\n", + "# Load model\n", + "model = AutoModelForSequenceClassification.from_pretrained(\"google-bert/bert-base-cased\", num_labels=5)\n", + "\n", + "# Set up optimizer and scheduler\n", + "optimizer = AdamW(model.parameters(), lr=5e-5)\n", + "num_epochs = 3\n", + "num_training_steps = num_epochs * len(train_dataloader)\n", + "lr_scheduler = get_scheduler(\n", + " name=\"linear\",\n", + " optimizer=optimizer,\n", + " num_warmup_steps=0,\n", + " num_training_steps=num_training_steps,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now write the training loop. Notice the `torch.nn.utils.clip_grad_norm_()` call **after** `loss.backward()` and **before** `optimizer.step()`. This is the correct position — gradients must be computed first before they can be clipped." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.auto import tqdm\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "model.to(device)\n", + "\n", + "progress_bar = tqdm(range(num_training_steps))\n", + "\n", + "model.train()\n", + "for epoch in range(num_epochs):\n", + " for batch in train_dataloader:\n", + " batch = {k: v.to(device) for k, v in batch.items()}\n", + "\n", + " # Forward pass\n", + " outputs = model(**batch)\n", + " loss = outputs.loss\n", + "\n", + " # Backward pass\n", + " loss.backward()\n", + "\n", + " # Gradient clipping — prevents exploding gradients\n", + " # max_norm=1.0 is a safe default for most fine-tuning tasks\n", + " torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n", + "\n", + " optimizer.step()\n", + " lr_scheduler.step()\n", + " optimizer.zero_grad()\n", + " progress_bar.update(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> [!TIP]\n", + "> `max_norm=1.0` is the standard default and works well for most fine-tuning scenarios. If your loss is still unstable, try lowering it to `0.5`. If training is too slow to converge, try raising it to `2.0`." + ] + }, { "cell_type": "markdown", "metadata": {},