+from typing import Mapping, Any
 import collections
 import copy
 import functools
@@ -125,14 +126,88 @@ def call(*args, **kwargs):
       return jitted(self.params, self.buffers, *args, **kwargs)
 
     self._jitted[key] = call
 
-
+
   def cpu_state_dict(self, *args, **kwargs):
+    """
+    Wrapper for state_dict.
+
+    This function makes sure to transfer all the parameters to CPU,
+    making it easier to save the state dict with torch.save.
+
+    Returns:
+      Mapping[str, Any]: A mapping of parameter names to their values (as torch CPU tensors)
+    """
     state_dict = super().state_dict(*args, **kwargs)
+    state_dict = pytree.tree_map(lambda t: t.cpu(), state_dict)
+    return state_dict
+
+  def load_state_dict(self,
+                      state_dict: Mapping[str, Any],
+                      strict: bool = True,
+                      assign: bool = False):
+    """
+    Wrapper for load_state_dict.
+
+    This function assumes a torch CPU state dict and will transfer the parameters
+    to the correct device and dtype before loading them into the model.
+
+    Args:
+      state_dict (Mapping[str, Any]): A mapping of parameter names to their values (as torch CPU tensors)
+      strict (bool, optional): whether to strictly enforce that the keys
+        in :attr:`state_dict` match the keys returned by this module's
+        :meth:`~torch.nn.Module.state_dict` function. Default: ``True``
+      assign (bool, optional): When set to ``False``, the properties of the tensors
+        in the current module are preserved, whereas setting it to ``True`` preserves
+        properties of the Tensors in the state dict. The only
+        exception is the ``requires_grad`` field of :class:`~torch.nn.Parameter`s,
+        for which the value from the module is preserved.
+        Default: ``False``
+
+    Returns:
+      ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
+        * **missing_keys** is a list of str containing any keys that are expected
+          by this module but missing from the provided ``state_dict``.
+        * **unexpected_keys** is a list of str containing the keys that are not
+          expected by this module but present in the provided ``state_dict``.
+    """
+    # Move tensors to JAX to have an easier time extracting sharding information
+    current_state_dict = super().state_dict()
+    current_state_dict = jax_view(current_state_dict)
+
+    # Create out shardings that either reuse the current state dict's sharding or replicate the weights
+    def extract_sharding_or_replicate(name):
+      if name in current_state_dict:
+        return current_state_dict[name].sharding
+      return jax.sharding.PartitionSpec()
+
+    output_shards = {
+        name: extract_sharding_or_replicate(name) for name in state_dict
+    }
+
+    def convert_to_xla_tensor_if_needed(t):
+      is_torch_tensor = isinstance(t, torch.Tensor)
+      is_xla_tensor = isinstance(t, torchax.tensor.Tensor)
+      if is_xla_tensor:
+        t = jax_view(t)
+      elif is_torch_tensor:
+        # Convert to a JAX array
+        t = tensor.t2j(t)
+      return t
+
+    # Convert the state dict to JAX and shard it accordingly
     state_dict = pytree.tree_map(
-        lambda t: t.cpu(),
-        state_dict
+        tensor.t2j,
+        state_dict,
     )
-    return state_dict
+    # Convert the ordered dict to a regular dict for pjit's type-safety checks
+    state_dict = dict(state_dict)
+    jitted = jax_jit(
+        lambda t: t, kwargs_for_jax_jit={"out_shardings": output_shards})
+    state_dict = jitted(state_dict)
+    # View the results as torch tensors again, so we can use ``assign`` if needed
+    state_dict = torch_view(state_dict)
+
+    return super().load_state_dict(state_dict, strict, assign)
 
 
 class CompileMixin:
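
Taken together, the two new methods give a plain torch.save / torch.load checkpoint flow. Below is a minimal usage sketch; the wrapper class name (JittableModule), its import path, and the nn.Linear stand-in model are assumptions, since the diff does not show the enclosing class.

# Illustrative sketch only: "JittableModule" and the torchax.interop import
# path are assumed; the diff does not show the enclosing class.
import torch
from torchax.interop import JittableModule  # assumed import path

model = JittableModule(torch.nn.Linear(8, 8))  # stand-in wrapped module

# Saving: cpu_state_dict() moves every parameter to CPU first, so the
# result can be written with plain torch.save.
torch.save(model.cpu_state_dict(), "checkpoint.pt")

# Loading: load_state_dict() expects a torch CPU state dict, converts it
# to JAX, re-shards it to match the current parameters (replicating keys
# it has no sharding for), then delegates to the normal torch loading.
cpu_state = torch.load("checkpoint.pt", map_location="cpu")
result = model.load_state_dict(cpu_state, strict=True)
print(result.missing_keys, result.unexpected_keys)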
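
The re-sharding step in load_state_dict relies on a standard JAX placement pattern: jit an identity function with out_shardings so each incoming array is committed to a target sharding, or fully replicated when no existing sharding is known. The following is a self-contained sketch of that mechanism in plain JAX, independent of torchax; the names "weight" and "bias" and the mesh layout are illustrative only.

import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec

# Build a 1-D mesh over whatever devices are available.
devices = jax.devices()
mesh = Mesh(devices, axis_names=("data",))

# Target placement for each entry of a flat "state dict": reuse a known
# sharding for "weight", replicate "bias" (empty PartitionSpec).
out_shardings = {
    "weight": NamedSharding(mesh, PartitionSpec("data")),
    "bias": NamedSharding(mesh, PartitionSpec()),
}

# Identity function jitted with out_shardings: the outputs come back
# placed according to the requested shardings, which is what the
# load_state_dict code above does for the converted checkpoint.
place = jax.jit(lambda tree: tree, out_shardings=out_shardings)

state = {
    "weight": jnp.ones((len(devices) * 2, 4)),
    "bias": jnp.zeros((4,)),
}
placed = place(state)
print(placed["weight"].sharding)  # NamedSharding over the "data" axis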