# limitations under the License.

# %%
- import os
-
- os.environ['FLAX_MUTABLE_ARRAY'] = 'true'
-
import jax
import jax.numpy as jnp
import matplotlib.pyplot as plt
import numpy as np

from flax import nnx

+ # activate mutable arrays
+ nnx.use_mutable_arrays(True)

# ## Data
# We create a simple dataset of points sampled from a parabola with some noise.
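# (Illustrative sketch, not part of the diff: the dataset code itself sits
# outside this hunk. One plausible way to sample noisy parabola points with
# the NumPy import above; the coefficients and sizes here are assumptions.)
X = np.random.uniform(-2, 2, size=(256, 1))
Y = 0.8 * X**2 + 0.1 + np.random.normal(0.0, 0.1, size=X.shape)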
@@ -151,9 +149,7 @@ def create_block(rngs, /):

      self.blocks = nnx.mutable(create_block(rngs.fork(split=num_blocks)))
    else:
-      self.blocks = nnx.data(
-        [Block(dhidden, dhidden, rngs=rngs) for i in range(num_blocks)]
-      )
+      self.blocks = [Block(dhidden, dhidden, rngs=rngs) for i in range(num_blocks)]

  def __call__(self, x: jax.Array, *, rngs: nnx.Rngs | None = None):
    self.count[...] += 1
@@ -197,13 +193,11 @@ def make_opt_state(x):
      else:
        return OptState(jnp.zeros_like(x))

-    self.momentum = nnx.data(
-      jax.tree.map(
+    self.momentum = jax.tree.map(
      make_opt_state,
      params,
      is_leaf=lambda x: isinstance(x, nnx.Variable),
    )
-    )

  # during the update we simply map over (params, momentum, grads),
  # for each triplet we implement the SGD update rule which updates
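# (Illustrative sketch, not part of the diff: the real `update_fn` below may
# differ. With mutable arrays enabled, an SGD-with-momentum step can update
# each (param, momentum, grad) triplet in place; `lr` and `decay` stand for
# the hyperparameters passed to SGD.)
def sgd_momentum_step(
  param: nnx.Variable, momentum: nnx.Variable, grad: jax.Array,
  *, lr: float, decay: float,
):
  momentum[...] = decay * momentum[...] + grad  # accumulate velocity
  param[...] -= lr * momentum[...]  # in-place parameter update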
@@ -226,11 +220,13 @@ def update_fn(
# Variables are immutable (they only contain Arrays) by default, which can make
# initialization easier; however, this means we have to use 'mutable' to
# create the MutableArrays that will be updated during training.
+
rngs = nnx.Rngs(params=0, dropout=1)
model = Model(
  num_blocks=3, din=1, dhidden=256, dout=1, use_scan=False, rngs=rngs
)
optimizer = SGD(params=nnx.state(model, nnx.Param), lr=3e-3, decay=0.99)
+
# Create a copy of the model structure and set its attributes to eval mode.
# This works because they share the underlying MutableArrays so both models
# will always be in sync.
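# (Illustrative sketch, not part of the diff: the actual copy code sits just
# past this hunk. One plausible way to build such an eval-mode view, assuming
# nnx.split/nnx.merge reuse the same underlying MutableArrays and that the
# blocks expose a `deterministic` flag:)
eval_model = nnx.merge(*nnx.split(model))
eval_model.set_attributes(deterministic=True)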
@@ -260,7 +256,6 @@ def loss_fn(params):
  # so we don't need to return anything 🚀
  optimizer.update(params, grads)

-
# simple test step that computes the loss
@jax.jit
def test_step(model: Model, x, y):
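  # (Hypothetical body, not part of the diff: the real implementation sits
  # past this hunk. A stand-in that just reports the mean squared error of
  # the eval-mode model on the test batch:)
  return jnp.mean((model(x) - y) ** 2)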