@@ -84,21 +84,19 @@ def model_fn(
 
 
   def compute_weighted_cross_entropy(
-      self,
-      logits: spec.Tensor,
-      targets: spec.Tensor,
-      weights: Optional[spec.Tensor] = None,
-      label_smoothing: float = 0.1,
-  ) -> Dict[str, spec.Tensor]:  # differentiable
+    self,
+    logits: spec.Tensor,
+    targets: spec.Tensor,
+    weights: Optional[spec.Tensor] = None,
+    label_smoothing: float = 0.1,
+  ) -> Dict[str, spec.Tensor]:  # differentiable
     """Compute weighted cross entropy and entropy for log probs and targets.
-
     Args:
       logits: [batch, length, num_classes] float array.
       targets: categorical targets [batch, length] int array.
       weights: array of shape [batch, length].
       label_smoothing: label smoothing constant, used to determine the on and off
         values.
-
     Returns:
       {'summed': scalar summed loss, 'n_valid_examples': scalar number of
       valid examples in batch, 'per_example': 1-d array of per-example losses}
@@ -108,18 +106,26 @@ def compute_weighted_cross_entropy(
         f'Incorrect shapes. Got shape {logits.shape} logits and '
         f'{targets.shape} targets.'
       )
-    smoothed_targets = optax.smooth_labels(
-      common_utils.onehot(targets, self._vocab_size), label_smoothing
-    )
-
-    per_example_losses = -jnp.sum(
-      smoothed_targets * jax.nn.log_softmax(logits), axis=-1
-    )
-    if weights is None:
-      weights = jnp.ones_like(targets)
-    per_example_losses = jnp.where(weights, per_example_losses, 0.0)
+    # Compute log probabilities.
+    log_probs = jax.nn.log_softmax(logits, axis=-1)
+    # Extract the log probability of the target class.
+    # Shape: [batch, length].
+    target_log_probs = jnp.take_along_axis(
+      log_probs,
+      targets[..., None],
+      axis=-1
+    ).squeeze(-1)
+    # Cross-entropy with smoothing: -(1 - α) * log_p[target] - (α / V) * sum(log_p)
+    # This follows from substituting the smoothed targets (1 - α) * onehot + α / V into the cross-entropy definition.
+    confidence = 1.0 - label_smoothing
+    smoothing_term = label_smoothing / self._vocab_size
+    per_example_losses = -1.0 * (confidence * target_log_probs + smoothing_term * log_probs.sum(axis=-1))
+    if weights is not None:
+      per_example_losses = jnp.where(weights, per_example_losses, 0.0)
+      n_valid_examples = weights.sum()
+    else:
+      n_valid_examples = targets.shape[0] * targets.shape[1]
     summed_loss = per_example_losses.sum()
-    n_valid_examples = weights.sum()
     return {
       'summed': summed_loss,
       'n_valid_examples': n_valid_examples,
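
Not part of the diff: a minimal standalone sketch that checks the new direct label-smoothing formula against the previous `optax.smooth_labels` one-hot path. It assumes a toy vocabulary size and uses `jax.nn.one_hot` in place of flax's `common_utils.onehot`; the function names `smoothed_xent_direct` and `smoothed_xent_onehot` are illustrative only.

```python
import jax
import jax.numpy as jnp
import optax


def smoothed_xent_direct(logits, targets, label_smoothing, vocab_size):
  # New path: -(1 - α) * log_p[target] - (α / V) * sum(log_p).
  log_probs = jax.nn.log_softmax(logits, axis=-1)
  target_log_probs = jnp.take_along_axis(
    log_probs, targets[..., None], axis=-1
  ).squeeze(-1)
  confidence = 1.0 - label_smoothing
  smoothing_term = label_smoothing / vocab_size
  return -(confidence * target_log_probs + smoothing_term * log_probs.sum(axis=-1))


def smoothed_xent_onehot(logits, targets, label_smoothing, vocab_size):
  # Old path: cross-entropy against a smoothed one-hot target distribution.
  smoothed = optax.smooth_labels(jax.nn.one_hot(targets, vocab_size), label_smoothing)
  return -jnp.sum(smoothed * jax.nn.log_softmax(logits), axis=-1)


vocab_size = 7
logits = jax.random.normal(jax.random.PRNGKey(0), (2, 5, vocab_size))
targets = jax.random.randint(jax.random.PRNGKey(1), (2, 5), 0, vocab_size)
direct = smoothed_xent_direct(logits, targets, 0.1, vocab_size)
onehot = smoothed_xent_onehot(logits, targets, 0.1, vocab_size)
print(jnp.max(jnp.abs(direct - onehot)))  # expected to be ~1e-7 (float32 rounding)
```

A side effect of the direct form is that it never materializes a [batch, length, vocab_size] smoothed one-hot array, which matters for large vocabularies.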