Are there any gotchas or Flax NNX mechanics to be aware of in the context of the following two training loop styles?

**Internal**

```python
import jax, jax.numpy as jnp, flax.nnx as nnx, optax, time
from typing import Sequence
from functools import partial
from tqdm import tqdm

jax.devices()

class Module(nnx.Module):
    def __init__(self, x_dim: int, hidden: Sequence[int], rngs: nnx.Rngs):
        layers = [nnx.Linear(x_dim, hidden[0], rngs=rngs), nnx.swish]
        for i in range(len(hidden) - 1):
            layers += [nnx.Linear(hidden[i], hidden[i+1], rngs=rngs)]
            layers += [nnx.swish]
        layers += [nnx.Linear(hidden[-1], 1, rngs=rngs)]
        self.module = nnx.Sequential(*layers)
        self.x_dim = x_dim

    @partial(nnx.jit, static_argnames='n')
    def step(self, key: jax.Array, n: int, optimizer: nnx.Optimizer) -> jax.Array:
        def loss_fn(self):
            x = jax.random.uniform(key, (n, self.x_dim), minval=-1.0, maxval=1.0)  # (n, x_dim)
            y = jnp.exp(-jnp.sum(x**2, axis=1, keepdims=True))                     # (n, 1)
            yhat = self.module(x)
            loss = jnp.mean((y - yhat)**2)
            return loss
        loss, grads = nnx.value_and_grad(loss_fn)(self)
        optimizer.update(grads)
        return loss

    def fit(self, tx: optax.GradientTransformation, n_epochs: int, key: jax.Array, n: int):
        # Usually not a jitable method, e.g. w/ passing in dataloader
        t, optimizer = [], nnx.Optimizer(self, tx)
        for _ in (pb := tqdm(range(n_epochs), desc="Training")):
            t0 = time.time()
            key, trainkey = jax.random.split(key)
            loss = self.step(trainkey, n, optimizer)
            pb.set_postfix({'Loss': loss.item()})
            t1 = time.time()
            t.append(t1 - t0)
        t = sum(t) / n_epochs
        return t

module = Module(1000, [500, 500, 500], nnx.Rngs(0))
module.fit(optax.adam(1e-4), 1000, jax.random.PRNGKey(0), 1000)
```

**External**

```python
@partial(nnx.jit, static_argnames='n')
def step(module: Module, key: jax.Array, n: int, optimizer: nnx.Optimizer) -> jax.Array:
    def loss_fn(module):
        x = jax.random.uniform(key, (n, module.x_dim), minval=-1.0, maxval=1.0)  # (n, x_dim)
        y = jnp.exp(-jnp.sum(x**2, axis=1, keepdims=True))                       # (n, 1)
        yhat = module.module(x)
        loss = jnp.mean((y - yhat)**2)
        return loss
    loss, grads = nnx.value_and_grad(loss_fn)(module)
    optimizer.update(grads)
    return loss

def fit(module, tx: optax.GradientTransformation, n_epochs: int, key: jax.Array, n: int):
    # Usually not a jitable method, e.g. w/ passing in dataloader
    t, optimizer = [], nnx.Optimizer(module, tx)
    for _ in (pb := tqdm(range(n_epochs), desc="Training")):
        t0 = time.time()
        key, trainkey = jax.random.split(key)
        loss = step(module, trainkey, n, optimizer)
        pb.set_postfix({'Loss': loss.item()})
        t1 = time.time()
        t.append(t1 - t0)
    t = sum(t) / n_epochs
    return t

module = Module(1000, [500, 500, 500], nnx.Rngs(0))
fit(module, optax.adam(1e-4), 1000, jax.random.PRNGKey(0), 1000)
```
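
A quick way to convince yourself that the two styles are mechanically equivalent is to seed both identically and compare the first training step. This is a minimal sketch, assuming the class and the external `step` definition above are already in scope:

```python
# Minimal equivalence check (assumes the two snippets above have been run in
# the same session). With identical seeds, both styles should trace the same
# computation under nnx.jit and produce the same first-step loss.
m_a = Module(1000, [500, 500, 500], nnx.Rngs(0))
m_b = Module(1000, [500, 500, 500], nnx.Rngs(0))
opt_a = nnx.Optimizer(m_a, optax.adam(1e-4))
opt_b = nnx.Optimizer(m_b, optax.adam(1e-4))
key = jax.random.PRNGKey(0)
loss_a = m_a.step(key, 1000, opt_a)    # internal (method) style
loss_b = step(m_b, key, 1000, opt_b)   # external (function) style
assert jnp.allclose(loss_a, loss_b)
```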
Answered by cgarciae, Jul 24, 2025
@cisprague there is no preference here.
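
A rough way to sanity-check this, reusing the `fit` helpers defined in the question (illustrative only; exact numbers depend on hardware and are not from the original thread):

```python
# Rough timing comparison using the helpers from the question. Both styles
# trace the same computation under nnx.jit, so mean per-epoch times should
# be comparable.
t_internal = Module(1000, [500, 500, 500], nnx.Rngs(0)).fit(
    optax.adam(1e-4), 100, jax.random.PRNGKey(0), 1000)
t_external = fit(Module(1000, [500, 500, 500], nnx.Rngs(0)),
                 optax.adam(1e-4), 100, jax.random.PRNGKey(0), 1000)
print(f"mean epoch time: internal={t_internal:.4f}s, external={t_external:.4f}s")
```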
Answer selected by cisprague