3 changes: 3 additions & 0 deletions .idea/.gitignore


28 changes: 28 additions & 0 deletions .idea/inspectionProfiles/Project_Default.xml


6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml


7 changes: 7 additions & 0 deletions .idea/misc.xml


8 changes: 8 additions & 0 deletions .idea/modules.xml


12 changes: 12 additions & 0 deletions .idea/tiny-grpo.iml


6 changes: 6 additions & 0 deletions .idea/vcs.xml


19 changes: 14 additions & 5 deletions README.md
@@ -1,6 +1,6 @@
 # Minimal GRPO implementation
 
-Goal: Working toy implementation of llama-3.2-3b locally RL training with GRPO. Understanding the algorithm & hyper parameters. Just running everything locally on a single node.
+Since I had a smaller 12 GB GPU, I tested this with a smaller number of samples and a smaller instruct model than originally proposed.
+Goal: Working toy implementation of local RL training of HuggingFaceTB/SmolLM-135M-Instruct with GRPO. Understanding the algorithm & hyperparameters. Just running everything locally on a single node.

### Setup

@@ -16,16 +16,25 @@ conda activate grpo
 ```
 pip install -r requirements.txt
 pip install flash-attn --no-build-isolation
+
+# May need a newer nvcc (check `nvcc --version`) for flash-attn to build
 ```

-3. Play with the source in `train.py`
+3. Play with the source in `train_ds2.py`
+Since I had only one 12 GB 3060 GPU, I modified the code to run on a single GPU instead of distributed.
 ```
-python train.py
+python train_ds2.py
 ```
 
 with multiple gpu
 
 ```
 torchrun --nproc_per_node=8 train.py
 ```

 ### Inspiration
 
+- [tiny-grpo](https://github.com/open-thought/tiny-grpo)
 - [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF)
 - [Spinning Up in Deep RL](https://spinningup.openai.com/en/latest/)

Binary file added __pycache__/ckpt_utils.cpython-312.pyc
Binary file added __pycache__/loss.cpython-312.pyc
Binary file added __pycache__/replay_buffer.cpython-312.pyc
34 changes: 34 additions & 0 deletions ckpt_utils.py
@@ -0,0 +1,34 @@
from torch.distributed.checkpoint.state_dict import (
    set_optimizer_state_dict,
    set_model_state_dict,
    get_model_state_dict,
    get_optimizer_state_dict,
)
import torch.distributed.checkpoint as dcp
import torch.distributed as dist


def save_checkpoint(model, optimizer, path):
    """Save model and optimizer state using distributed checkpoint"""
    model_state = get_model_state_dict(model=model)
    optimizer_state = get_optimizer_state_dict(model=model, optimizers=optimizer)

    state_dict = {"model": model_state, "optimizer": optimizer_state}

    dcp.save(state_dict=state_dict, storage_writer=dcp.FileSystemWriter(path))


def load_checkpoint(model, optimizer, path):
    """Load model and optimizer state using distributed checkpoint"""
    # DCP loads in place into a state dict with the right shapes, so build one
    # from the live model/optimizer first, then flush the loaded values back.
    dcp_state_dict = {
        "model": get_model_state_dict(model=model),
        "optimizer": get_optimizer_state_dict(model=model, optimizers=optimizer),
    }

    dcp.load(dcp_state_dict, storage_reader=dcp.FileSystemReader(path))

    set_model_state_dict(model=model, model_state_dict=dcp_state_dict["model"])
    set_optimizer_state_dict(
        model=model, optimizers=optimizer, optim_state_dict=dcp_state_dict["optimizer"]
    )