Skip to content

Commit 807edfa

Browse files
authored
feat: add disco103 - meta learned update rule (#186)
1 parent fe9de0a commit 807edfa

11 files changed

Lines changed: 1157 additions & 41 deletions

File tree

pyproject.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ dependencies = [
158158
"tqdm>=4.67.1",
159159
"wandb>=0.19.8",
160160
"playground>=0.0.5",
161-
"protobuf==3.20.3"
161+
"protobuf==3.20.3",
162162
]
163163

164164
[dependency-groups]
@@ -171,6 +171,11 @@ dev = [
171171
"testfixtures",
172172
]
173173

174+
[project.optional-dependencies]
175+
disco = [
176+
"disco_rl @ git+https://github.com/google-deepmind/disco_rl.git@main ; python_version >= '3.11'",
177+
]
178+
174179
[project.urls]
175180
"Homepage" = "https://github.com/EdanToledo/Stoix"
176181
"Bug Tracker" = "https://github.com/EdanToledo/Stoix/issues"
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
defaults:
2+
- logger: logger
3+
- arch: anakin
4+
- system: disco_rl/ff_disco103
5+
- network: specialised/disco_rl
6+
- env: gymnax/cartpole
7+
- _self_
8+
9+
hydra:
10+
searchpath:
11+
- file://stoix/configs
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# ---MLP DiscoRL Networks---
2+
agent_network:
3+
shared_torso:
4+
_target_: stoix.networks.torso.MLPTorso
5+
layer_sizes: [512, 512]
6+
use_layer_norm: False
7+
activation: relu
8+
9+
action_conditional_torso:
10+
_target_: stoix.networks.specialised.disco103.LSTMActionConditionedTorso
11+
lstm_size: 128
12+
activation: relu
13+
14+
logits_head:
15+
_target_: stoix.networks.base.chained_torsos
16+
_recursive_: false
17+
torso_cfgs:
18+
- _target_: stoix.networks.torso.MLPTorso
19+
layer_sizes: [128]
20+
use_layer_norm: False
21+
activation: relu
22+
- _target_: stoix.networks.heads.LinearHead
23+
24+
y_head:
25+
_target_: stoix.networks.base.chained_torsos
26+
_recursive_: false
27+
torso_cfgs:
28+
- _target_: stoix.networks.torso.MLPTorso
29+
layer_sizes: [128]
30+
use_layer_norm: False
31+
activation: relu
32+
- _target_: stoix.networks.heads.LinearHead
33+
34+
z_head:
35+
_target_: stoix.networks.base.chained_torsos
36+
_recursive_: false
37+
torso_cfgs:
38+
- _target_: stoix.networks.torso.MLPTorso
39+
layer_sizes: [128]
40+
use_layer_norm: False
41+
activation: relu
42+
- _target_: stoix.networks.heads.LinearHead
43+
44+
q_head:
45+
_target_: stoix.networks.base.chained_torsos
46+
_recursive_: false
47+
torso_cfgs:
48+
- _target_: stoix.networks.torso.MLPTorso
49+
layer_sizes: [128]
50+
use_layer_norm: False
51+
activation: relu
52+
- _target_: stoix.networks.heads.LinearHead
53+
54+
aux_pi_head:
55+
_target_: stoix.networks.base.chained_torsos
56+
_recursive_: false
57+
torso_cfgs:
58+
- _target_: stoix.networks.torso.MLPTorso
59+
layer_sizes: [128]
60+
use_layer_norm: False
61+
activation: relu
62+
- _target_: stoix.networks.heads.LinearHead
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# --- Defaults FF-Disco103 ---
2+
3+
system_name: ff_disco103 # Name of the system.
4+
5+
# --- RL hyperparameters ---
6+
rollout_length: 128 # Number of environment steps per training step.
7+
epochs: 4 # Number of epochs to train on the collected data.
8+
num_minibatches: 8 # Number of minibatches to split the data into.
9+
gamma: 0.997 # Discounting factor.
10+
lr: 3e-4 # Learning rate.
11+
max_abs_update: 1.0 # Maximum abs values for a weight update.
12+
reward_scale: 1.0 # Scaling factor for the reward.
13+
decay_learning_rates: False # Whether learning rates should be linearly decayed during training.
14+
15+
# DiscoRL HyperParams - These are passed to the agent_loss functions
16+
disco_hyperparams:
17+
pi_cost: 1.0 # Weight for the policy loss.
18+
y_cost: 1.0 # Weight for the 'y' auxiliary loss.
19+
z_cost: 1.0 # Weight for the 'z' auxiliary loss.
20+
aux_policy_cost: 1.0 # Weight for the auxiliary policy loss.
21+
value_cost: 0.2 # Weight for the value function loss.
22+
value_fn_td_lambda: 0.95 # Lambda for TD(lambda) updates of the value function.
23+
target_params_coeff: 0.9 # Polyak averaging coeff for target net
24+
25+
# DiscoRL UpdateRule Config - These values are passed to the DiscoUpdateRule constructor
26+
disco_rule:
27+
value_discount: ${system.gamma} # Discount factor for the value function.
28+
max_abs_value: 300.0 # Maximum absolute value for categorical value transforms.
29+
num_bins: 601 # Number of bins for the Q-network's categorical distribution.
30+
moving_average_decay: 0.99 # Decay rate for the moving average of statistics.
31+
# Config for the meta-network (meta_nets.LSTM)
32+
net:
33+
name: "lstm" # Name of the network architecture.
34+
prediction_size: 600 # Size of y and z auxiliary outputs
35+
hidden_size: 256 # Size of the hidden layers in the network.
36+
embedding_size: [16, 1] # Size of the embeddings.
37+
policy_channels: [16, 2] # Number of channels in the policy network layers.
38+
policy_target_channels: [16] # Number of channels in the policy target network layers.
39+
output_stddev: 0.3 # Standard deviation for the output distribution.
40+
aux_stddev: 0.3 # Standard deviation for the auxiliary output distributions.
41+
policy_target_stddev: 0.3 # Standard deviation for the policy target distribution.
42+
state_stddev: 1.0 # Standard deviation for the state normalization.
43+
# Config for the inner MetaLSTM
44+
meta_rnn_kwargs:
45+
hidden_size: 128 # Size of the hidden layers in the meta-network.
46+
embedding_size: [16] # Size of the embeddings in the meta-network.
47+
pred_embedding_size: [16, 1] # Size of the prediction embeddings in the meta-network.
48+
policy_channels: [16, 2] # Number of channels in the policy layers of the meta-network.

stoix/networks/base.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import functools
2+
import inspect
23
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
34

45
import chex
@@ -221,14 +222,31 @@ def __call__(
221222
return critic_hidden_state, critic_output
222223

223224

224-
def chained_torsos(torso_cfgs: List[Dict[str, Any]], **kwargs: Any) -> nn.Module:
    """Create a network by chaining multiple torsos together using a list of configs.

    This makes use of hydra to instantiate the modules and the composite network
    to chain them together. Be careful when using kwargs: if two torsos accept
    the same argument name, the value will be passed to both torsos.

    Args:
        torso_cfgs: List of dictionaries containing the configuration for each torso.
            These configs should use the same format as the individual torso configs.
        **kwargs: Additional keyword arguments to pass to each torso during
            instantiation. Each torso only receives the arguments its
            constructor accepts.

    Returns:
        A CompositeNetwork applying the instantiated torsos in sequence.
    """
    torso_modules = []
    for torso_cfg in torso_cfgs:
        # Resolve the target class so we can inspect its constructor signature.
        target_class = hydra.utils.get_class(torso_cfg["_target_"])
        parameters = inspect.signature(target_class).parameters

        if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in parameters.values()):
            # The constructor accepts **kwargs itself, so every extra argument
            # is valid — forward all of them rather than filtering by name.
            current_kwargs = dict(kwargs)
        else:
            # Only forward the arguments this constructor explicitly declares;
            # anything else would raise a TypeError at instantiation time.
            current_kwargs = {k: v for k, v in kwargs.items() if k in parameters}

        # Instantiate with the filtered kwargs.
        torso_modules.append(hydra.utils.instantiate(torso_cfg, **current_kwargs))

    return CompositeNetwork(torso_modules)

stoix/networks/heads.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,11 +294,21 @@ def __call__(self, embedding: chex.Array) -> Tuple[distrax.EpsilonGreedy, chex.A
294294
class LinearHead(nn.Module):
    """A linear projection head with an optional leading output shape.

    Projects the embedding to ``prod(pre_shape + (output_dim,))`` units and,
    when ``pre_shape`` is given, reshapes the trailing axis into
    ``pre_shape + (output_dim,)``.
    """

    output_dim: int
    kernel_init: Initializer = orthogonal(0.01)
    pre_shape: Optional[Tuple[int, ...]] = None

    def setup(self) -> None:
        # Full trailing shape of the output; just (output_dim,) when no
        # pre_shape is configured.
        trailing_shape = (
            (self.output_dim,) if self.pre_shape is None else self.pre_shape + (self.output_dim,)
        )
        self.shape = trailing_shape
        self.output_size = int(np.prod(trailing_shape))

    @nn.compact
    def __call__(self, embedding: chex.Array) -> chex.Array:
        projected = nn.Dense(self.output_size, kernel_init=self.kernel_init)(embedding)
        if self.pre_shape is not None:
            # Restore the configured leading structure on the last axis.
            projected = projected.reshape(projected.shape[:-1] + self.shape)
        return projected
302312

303313

304314
class MultiDiscreteHead(nn.Module):
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
from typing import Tuple
2+
3+
import chex
4+
import flax.linen as nn
5+
import jax
6+
import jax.numpy as jnp
7+
from flax.linen.initializers import Initializer, orthogonal
8+
9+
from stoix.networks.utils import parse_activation_fn
10+
from stoix.systems.disco_rl.disco_rl_types import AgentOutput
11+
12+
13+
class LSTMActionConditionedTorso(nn.Module):
    """LSTM-based action-conditional torso inspired by Muesli/MuZero.

    This torso creates a root embedding from the observation, then performs
    an LSTM transition for all possible actions in parallel, producing
    action-conditional hidden states of shape [batch, num_actions, lstm_size].

    Attributes:
        num_actions: Number of discrete actions.
        lstm_size: Size of the LSTM hidden state.
        root_mlp_sizes: Sizes of MLP layers for the root embedding. If empty,
            only a single linear layer is used.
        activation: Activation function for the root MLP.
        kernel_init: Kernel initializer for linear layers.
    """

    num_actions: int
    lstm_size: int
    root_mlp_sizes: Tuple[int, ...] = ()
    activation: str = "relu"
    kernel_init: Initializer = orthogonal(1.0)

    @nn.compact
    def __call__(self, observation: chex.Array) -> chex.Array:
        """Forward pass.

        Args:
            observation: Input observation of shape [batch, ...].

        Returns:
            Action-conditional hidden states of shape [batch, num_actions, lstm_size].
        """
        batch_size = observation.shape[0]

        # 1. Create root embedding (LSTM carry) from observation
        root_carry = self._root_embedding(observation)  # ([batch, lstm_size], [batch, lstm_size])

        # 2. Perform LSTM transition for all actions
        action_hidden_states = self._model_transition_all_actions(
            root_carry, batch_size
        )  # [batch, num_actions, lstm_size]

        return action_hidden_states

    def _root_embedding(self, observation: chex.Array) -> Tuple[chex.Array, chex.Array]:
        """Constructs a root LSTM carry from the observation.

        Args:
            observation: Input observation of shape [batch, ...].

        Returns:
            LSTM carry as a ``(hidden, cell)`` tuple, each of shape
            [batch, lstm_size]. The hidden state is ``tanh(cell)``.
        """
        # Simply use the observation as input
        x = observation

        # Apply optional MLP layers
        if self.root_mlp_sizes:
            for size in self.root_mlp_sizes:
                x = nn.Dense(size, kernel_init=self.kernel_init)(x)
                x = parse_activation_fn(self.activation)(x)

        # Final linear layer to get cell state
        cell = nn.Dense(self.lstm_size, kernel_init=self.kernel_init, name="root_cell")(x)
        # Create hidden state as tanh(cell)
        hidden = jnp.tanh(cell)
        return (hidden, cell)

    def _model_transition_all_actions(
        self, root_carry: Tuple[chex.Array, chex.Array], batch_size: int
    ) -> chex.Array:
        """Performs LSTM transition for all actions in parallel.

        Args:
            root_carry: Root ``(hidden, cell)`` carry, each of shape [batch, lstm_size].
            batch_size: Batch size.

        Returns:
            LSTM outputs for all actions of shape [batch, num_actions, lstm_size].
        """
        # Create one-hot encodings for all actions
        # Shape: [num_actions, num_actions]
        one_hot_actions = jnp.eye(self.num_actions, dtype=root_carry[0].dtype)

        # Repeat for each batch element
        # Shape: [batch * num_actions, num_actions]
        batched_one_hot_actions = jnp.tile(one_hot_actions, [batch_size, 1])

        # Repeat the root carry for each action. Together with the tiled
        # one-hot actions above, this forms the batch x action cartesian
        # product: row i pairs batch i // num_actions with action i % num_actions.
        # This uses jax.tree.map to handle the (hidden, cell) tuple.
        initial_carry = jax.tree.map(
            lambda x: jnp.repeat(x, repeats=self.num_actions, axis=0), root_carry
        )

        # Apply LSTM
        lstm_cell = nn.LSTMCell(features=self.lstm_size, name="action_cond_lstm")
        _, lstm_output = lstm_cell(initial_carry, batched_one_hot_actions)

        # Reshape output from [batch * num_actions, lstm_size] to [batch, num_actions, lstm_size]
        action_hidden_states = lstm_output.reshape(batch_size, self.num_actions, self.lstm_size)

        return action_hidden_states
112+
113+
class DiscoAgentNetwork(nn.Module):
    """
    A network for the DiscoRL agent.

    This network has a shared torso and five separate heads, matching
    the architecture required by the DiscoUpdateRule:
    1. logits (Policy)
    2. q (Categorical Value)
    3. y (Auxiliary)
    4. z (Auxiliary)
    5. aux_pi (Auxiliary Policy)
    """

    shared_torso: nn.Module
    action_conditional_torso: nn.Module
    logits_head: nn.Module
    q_head: nn.Module
    y_head: nn.Module
    z_head: nn.Module
    aux_pi_head: nn.Module

    def __call__(self, obs: chex.Array) -> AgentOutput:
        """Compute all five agent outputs from a single observation."""
        # Shared representation used by every head.
        embedding = self.shared_torso(obs)

        # Heads conditioned directly on the shared embedding.
        logits = self.logits_head(embedding)
        y = self.y_head(embedding)

        # Action-conditional branch: expand the embedding per action via the
        # action-conditional torso, then apply the q, z and aux_pi heads to
        # the expanded representation.
        per_action_embedding = self.action_conditional_torso(embedding)
        q = self.q_head(per_action_embedding)
        z = self.z_head(per_action_embedding)
        aux_pi = self.aux_pi_head(per_action_embedding)

        return AgentOutput(logits=logits, q=q, y=y, z=z, aux_pi=aux_pi)

0 commit comments

Comments
 (0)