[nnx] improve Module docs
cgarciae committed Jan 22, 2025
1 parent e4418e2 commit 9b3ea6b
Showing 9 changed files with 247 additions and 241 deletions.
2 changes: 1 addition & 1 deletion flax/nnx/nn/attention.py
@@ -244,7 +244,7 @@ class MultiHeadAttention(Module):
>>> assert (layer(q) == layer(q, q)).all()
>>> assert (layer(q) == layer(q, q, q)).all()
- Attributes:
+ Args:
num_heads: number of attention heads. Features (i.e. inputs_q.shape[-1])
should be divisible by the number of heads.
in_features: int or tuple with number of input features.
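For context, a minimal usage sketch of the layer documented in this file; the argument values and shapes below are illustrative assumptions, not taken from the diff:

import jax.numpy as jnp
from flax import nnx

# Self-attention: a single input is used as query, key and value, so
# layer(q), layer(q, q) and layer(q, q, q) all agree.
layer = nnx.MultiHeadAttention(num_heads=8, in_features=5, qkv_features=16,
                               decode=False, rngs=nnx.Rngs(0))
q = jnp.ones((4, 3, 5))  # (batch, length, in_features); qkv_features divisible by num_heads
out = layer(q)           # (4, 3, 5)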
12 changes: 6 additions & 6 deletions flax/nnx/nn/linear.py
@@ -119,7 +119,7 @@ class LinearGeneral(Module):
>>> y.shape
(16, 4, 5)
- Attributes:
+ Args:
in_features: int or tuple with number of input features.
out_features: int or tuple with number of output features.
axis: int or tuple with axes to apply the transformation on. For instance,
@@ -301,7 +301,7 @@ class Linear(Module):
)
})
- Attributes:
+ Args:
in_features: the number of input features.
out_features: the number of output features.
use_bias: whether to add a bias to the output (default: True).
@@ -393,7 +393,7 @@ class Einsum(Module):
>>> y.shape
(16, 11, 8, 4)
- Attributes:
+ Args:
einsum_str: a string to denote the einsum equation. The equation must
have exactly two operands, the lhs being the input passed in, and
the rhs being the learnable kernel. Exactly one of ``einsum_str``
@@ -572,7 +572,7 @@ class Conv(Module):
... mask=mask, padding='VALID', rngs=rngs)
>>> out = layer(x)
- Attributes:
+ Args:
in_features: int or tuple with number of input features.
out_features: int or tuple with number of output features.
kernel_size: shape of the convolutional kernel. For 1D convolution,
@@ -823,7 +823,7 @@ class ConvTranspose(Module):
... mask=mask, padding='VALID', rngs=rngs)
>>> out = layer(x)
- Attributes:
+ Args:
in_features: int or tuple with number of input features.
out_features: int or tuple with number of output features.
kernel_size: shape of the convolutional kernel. For 1D convolution,
@@ -1092,7 +1092,7 @@ class Embed(Module):
broadcast the ``embedding`` matrix to input shape with ``features``
dimension appended.
- Attributes:
+ Args:
num_embeddings: number of embeddings / vocab size.
features: number of feature dimensions for each embedding.
dtype: the dtype of the embedding vectors (default: same as embedding).
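As a reading aid, a short sketch exercising a few of the layers whose docstrings are edited in this file; the shapes and argument values are assumptions chosen for illustration:

import jax.numpy as jnp
from flax import nnx

rngs = nnx.Rngs(0)

# Dense layer: projects the last axis from in_features to out_features.
linear = nnx.Linear(in_features=9, out_features=4, use_bias=True, rngs=rngs)
y = linear(jnp.ones((16, 9)))            # (16, 4)

# 1D convolution over (batch, length, features) inputs.
conv = nnx.Conv(in_features=3, out_features=8, kernel_size=(3,), rngs=rngs)
z = conv(jnp.ones((16, 32, 3)))          # (16, 32, 8) with the default 'SAME' padding

# Embedding lookup: appends a `features` dimension to integer inputs.
embed = nnx.Embed(num_embeddings=100, features=16, rngs=rngs)
e = embed(jnp.array([[1, 2, 3]]))        # (1, 3, 16)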
4 changes: 2 additions & 2 deletions flax/nnx/nn/lora.py
@@ -61,7 +61,7 @@ class LoRA(Module):
>>> y.shape
(16, 4)
- Attributes:
+ Args:
in_features: the number of input features.
lora_rank: the rank of the LoRA dimension.
out_features: the number of output features.
@@ -133,7 +133,7 @@ class LoRALinear(Linear):
>>> y.shape
(16, 4)
- Attributes:
+ Args:
in_features: the number of input features.
out_features: the number of output features.
lora_rank: the rank of the LoRA dimension.
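A minimal sketch of the two LoRA layers above, assuming the constructors take the parameters listed in their Args sections; the concrete values are illustrative:

import jax.numpy as jnp
from flax import nnx

# Standalone low-rank adapter of rank `lora_rank`.
lora = nnx.LoRA(in_features=3, lora_rank=2, out_features=4, rngs=nnx.Rngs(0))
y = lora(jnp.ones((16, 3)))              # (16, 4)

# Linear layer whose output is augmented by a LoRA adapter.
lora_linear = nnx.LoRALinear(in_features=3, out_features=4, lora_rank=2,
                             rngs=nnx.Rngs(0))
y = lora_linear(jnp.ones((16, 3)))       # (16, 4)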
8 changes: 4 additions & 4 deletions flax/nnx/nn/normalization.py
@@ -236,7 +236,7 @@ class BatchNorm(Module):
>>> assert (batch_stats2['mean'].value == batch_stats3['mean'].value).all()
>>> assert (batch_stats2['var'].value == batch_stats3['var'].value).all()
- Attributes:
+ Args:
num_features: the number of input features.
use_running_average: if True, the stored batch statistics will be
used instead of computing the batch statistics on the input.
@@ -407,7 +407,7 @@ class LayerNorm(Module):
>>> y = layer(x)
- Attributes:
+ Args:
num_features: the number of input features.
epsilon: A small float added to variance to avoid dividing by zero.
dtype: the dtype of the result (default: infer from input and params).
@@ -539,7 +539,7 @@ class RMSNorm(Module):
>>> y = layer(x)
- Attributes:
+ Args:
num_features: the number of input features.
epsilon: A small float added to variance to avoid dividing by zero.
dtype: the dtype of the result (default: infer from input and params).
@@ -670,7 +670,7 @@ class GroupNorm(Module):
>>> y2 = nnx.LayerNorm(num_features=6, reduction_axes=(1, 2, 3), rngs=nnx.Rngs(0))(x)
>>> np.testing.assert_allclose(y, y2)
- Attributes:
+ Args:
num_features: the number of input features/channels.
num_groups: the total number of channel groups. The default value of 32 is
proposed by the original group normalization paper.
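For reference, a short sketch of the four normalization layers touched above; the feature count and input shape are assumptions for illustration:

import jax.numpy as jnp
from flax import nnx

rngs = nnx.Rngs(0)
x = jnp.ones((4, 8, 6))                  # (batch, length, num_features)

# BatchNorm keeps running statistics; use_running_average picks train vs. eval behaviour.
bn = nnx.BatchNorm(num_features=6, use_running_average=False, rngs=rngs)
ln = nnx.LayerNorm(num_features=6, rngs=rngs)
rms = nnx.RMSNorm(num_features=6, rngs=rngs)
gn = nnx.GroupNorm(num_features=6, num_groups=3, rngs=rngs)  # 6 channels in 3 groups

y = gn(rms(ln(bn(x))))                   # each layer preserves the input shape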