Changes from all commits (63 commits)
1d81455
Merge pull request #847 from mlcommons/dev
priyakasimbeg Feb 27, 2025
da5f85a
first LM commit
Niccolo-Ajroldi Mar 11, 2025
a12a364
lm data pipeline
Niccolo-Ajroldi Mar 12, 2025
ca83ab8
testing
Niccolo-Ajroldi Mar 14, 2025
e3e78dc
LM workload tested torch pipeline
Niccolo-Ajroldi Mar 17, 2025
e619495
LM workload - fix torch tests
Niccolo-Ajroldi Mar 17, 2025
d8e9c56
add LM tests, remove dev files
Niccolo-Ajroldi Mar 18, 2025
6b4ff12
add LM tests, remove dev files
Niccolo-Ajroldi Mar 18, 2025
3c5c847
Stop tracking .gitignore
Niccolo-Ajroldi Mar 18, 2025
20d841b
Remove dev/ from repo, keep locally
Niccolo-Ajroldi Mar 18, 2025
f3ba059
fix comments
Niccolo-Ajroldi Mar 18, 2025
381451f
add class specifications
Niccolo-Ajroldi Mar 18, 2025
f111d2e
add workload LM info
Niccolo-Ajroldi Mar 18, 2025
808d398
restore data_utils.py tree map
Niccolo-Ajroldi Mar 18, 2025
35f8f89
fixed NFS bug
Niccolo-Ajroldi Mar 18, 2025
cbb6ee6
train/val split before concat
Niccolo-Ajroldi Mar 18, 2025
868987c
renamed datasets to avoid conflict with HF
Niccolo-Ajroldi Mar 19, 2025
8191f6d
Merge remote-tracking branch 'upstream/lm_workload' into lm_workload
Niccolo-Ajroldi Mar 19, 2025
dd59ded
renamed datasets to dataset
Niccolo-Ajroldi Mar 19, 2025
496b9c3
fix style
Niccolo-Ajroldi Mar 20, 2025
50989eb
fix formatting
Niccolo-Ajroldi Mar 20, 2025
5af0fdc
fix style
Niccolo-Ajroldi Mar 20, 2025
2683099
fix style
Niccolo-Ajroldi Mar 20, 2025
6b7ee29
fix yapf
Niccolo-Ajroldi Mar 20, 2025
46b645b
fix style
Niccolo-Ajroldi Mar 20, 2025
b3ae647
HF datasets pipeline
rka97 Mar 27, 2025
f095d4b
Testing with linear model
rka97 Mar 27, 2025
4189ae0
Merge branch 'jit_switch' into lm_workload
rka97 Mar 27, 2025
0c22f3d
lm workload with linear model
rka97 Apr 3, 2025
99c7b9b
add nanodo model
rka97 Apr 3, 2025
706d9f7
torch model
rka97 Apr 3, 2025
c335e34
lm workload dataset integration in jax
rka97 May 29, 2025
2d54365
lm workload dataset integration in jax
rka97 May 29, 2025
af8cce4
set package versions for transformers and datasets
priyakasimbeg Jun 5, 2025
d68c54e
use train_test_split method to shuffle and split fineweb-edu dataset
priyakasimbeg Jun 5, 2025
9737367
modifications to fwedu datasetup
priyakasimbeg Jun 9, 2025
1bf0750
rename fwedu data dir
priyakasimbeg Jun 9, 2025
a333391
fix
priyakasimbeg Jun 9, 2025
05dc4dd
add back batch mapping in tokenization for fwedu
priyakasimbeg Jun 9, 2025
b374cf8
debugging
priyakasimbeg Jun 10, 2025
c0c1e3c
debugging
priyakasimbeg Jun 10, 2025
f76dc39
debugging
priyakasimbeg Jun 10, 2025
e805fa7
use tfds to shuffle and split dataset
priyakasimbeg Jun 10, 2025
362cbda
Merge remote-tracking branch 'origin/dev' into lm_workload
rka97 Sep 11, 2025
c9e9abc
add command for fineweb-edu
priyakasimbeg Oct 2, 2025
e4323de
fix
priyakasimbeg Oct 2, 2025
f0c6e75
update calls to sharing utils
priyakasimbeg Oct 3, 2025
f4ffbe7
Fix torch sharding issue, update input pipeline and workload classes …
rka97 Oct 6, 2025
5c85c7e
test working, lm workload training not working (debugging)
rka97 Oct 6, 2025
a59dfda
updates to input_pipeline and model spec
priyakasimbeg Oct 6, 2025
1c3cb66
add defaults for lm workload
priyakasimbeg Oct 6, 2025
af91b12
refactor eval pipeline and loss fn for lm
priyakasimbeg Oct 7, 2025
6b55adf
refactor evaluation pipeline for lm
priyakasimbeg Oct 7, 2025
210d671
remove temporary flag for hlo dumps
priyakasimbeg Oct 7, 2025
0ad7788
fix in workload target condition check
priyakasimbeg Oct 7, 2025
01921d5
fix in mlp for glu
priyakasimbeg Oct 8, 2025
e420450
Fix OOM error in weighted cross entropy calculation
rka97 Oct 10, 2025
3b31ad5
fix issue with checkpointing bool
rka97 Oct 10, 2025
bbc114f
increase buffer size
priyakasimbeg Oct 10, 2025
f531b35
Merge branch 'lm_workload_priya' of github.com:mlcommons/algorithmic-…
priyakasimbeg Oct 10, 2025
2b162e8
remove _eval_batch from jax workload
priyakasimbeg Oct 10, 2025
617e1a3
add todo for pytorch _eval_batch cleanup
priyakasimbeg Oct 10, 2025
bebc80a
Merge pull request #891 from mlcommons/lm_workload_priya
rka97 Oct 15, 2025
2 changes: 1 addition & 1 deletion .gitignore
@@ -25,4 +25,4 @@ scoring/plots/
!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_0/eval_measurements.csv
!scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv

algoperf/_version.py
algoperf/_version.py
47 changes: 45 additions & 2 deletions algoperf/checkpoint_utils.py
@@ -5,7 +5,7 @@
"""

import os
from typing import Sequence, Tuple
from typing import Sequence, Tuple, Optional

import numpy as np
import torch
@@ -14,7 +14,8 @@
from flax.training import checkpoints as flax_checkpoints
from flax.training.checkpoints import latest_checkpoint
from tensorflow.io import gfile # pytype: disable=import-error

import orbax.checkpoint as ocp
from orbax.checkpoint.type_handlers import NumpyHandler
from algoperf import spec
from algoperf.pytorch_utils import pytorch_setup

@@ -29,6 +30,48 @@
int,
]

class BoolHandler(NumpyHandler):
"""
An implementation of TypeHandler for np.bool_ that inherits from NumpyHandler.
It works by treating the scalar as a 0-dimensional array.
"""

def typestr(self) -> str:
"""Unique string identifier for this handler."""
return 'np.bool_'

async def serialize(
self,
values: Sequence[np.bool_],
infos: Sequence,
args: Optional[Sequence[ocp.SaveArgs]] = None,
):
"""
Serializes a sequence of np.bool_ scalars by first converting them
to 0-dim numpy arrays and then calling the parent NumpyHandler.
"""
# Convert each scalar np.bool_ to a 0-dimensional np.ndarray
array_values = [np.asarray(v, dtype=np.bool_) for v in values]
# Use the parent class's robust serialization logic
return await super().serialize(array_values, infos, args)

async def deserialize(
self,
infos: Sequence,
args: Optional[Sequence[ocp.RestoreArgs]] = None,
) -> Sequence[np.bool_]:
"""
Deserializes into a sequence of np.bool_ scalars by calling the
parent handler and then converting the resulting 0-dim arrays.
"""
# Parent deserialize will return a sequence of 0-dimensional np.ndarray
results = await super().deserialize(infos, args)

# Convert each 0-d array back to an np.bool_ scalar using .item()
scalar_results = [np.bool_(r.item()) for r in results]
return scalar_results

ocp.type_handlers.register_type_handler(np.bool_, BoolHandler(), override=True)

def maybe_restore_checkpoint(
framework: str,
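Note: the BoolHandler above is registered globally at import time, so any np.bool_ leaf in a checkpointed PyTree round-trips through Orbax. A minimal sketch of that round trip, assuming a scratch directory and a toy state dict rather than the repo's actual checkpointing call path:

import numpy as np
import orbax.checkpoint as ocp

from algoperf import checkpoint_utils  # noqa: F401 -- importing registers BoolHandler

# Toy state for illustration; the key names here are made up.
state = {'global_step': np.int64(42), 'training_complete': np.bool_(True)}

checkpointer = ocp.PyTreeCheckpointer()
checkpointer.save('/tmp/algoperf_bool_ckpt', state)
restored = checkpointer.restore('/tmp/algoperf_bool_ckpt')
assert bool(restored['training_complete']) is True  # np.bool_ survives the round trip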
2 changes: 2 additions & 0 deletions algoperf/param_utils.py
@@ -44,6 +44,8 @@ def pytorch_param_types(
param_types[name] = spec.ParameterType.ATTENTION_BIAS
elif 'in_proj' in name:
param_types[name] = spec.ParameterType.ATTENTION_QKV
elif 'qkv' in name:
param_types[name] = spec.ParameterType.ATTENTION_QKV
elif 'kv_proj' in name:
param_types[name] = spec.ParameterType.ATTENTION_KV
elif 'k_proj' in name or 'key' in name:
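For context, pytorch_param_types assigns a spec.ParameterType to each parameter by substring-matching its name; the new 'qkv' branch lets fused query/key/value projection weights (e.g. a parameter named '...attn.qkv.weight') be classified as ATTENTION_QKV. A condensed sketch of that matching logic, not the full function:

from algoperf import spec

def classify_attention_param(name: str):
  """Toy version of the substring matching in pytorch_param_types."""
  if 'in_proj' in name or 'qkv' in name:  # fused query/key/value projection
    return spec.ParameterType.ATTENTION_QKV
  if 'kv_proj' in name:
    return spec.ParameterType.ATTENTION_KV
  return None  # the real function continues with further branches

assert classify_attention_param('blocks.0.attn.qkv.weight') == spec.ParameterType.ATTENTION_QKV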
Empty file.
138 changes: 138 additions & 0 deletions algoperf/workloads/lm/input_pipeline.py
@@ -0,0 +1,138 @@
"""Input pipeline for a LM dataset."""

import functools
import os
from typing import Optional

import jax
import numpy as np
import tensorflow as tf

from algoperf import data_utils

AUTOTUNE = tf.data.experimental.AUTOTUNE
PAD_ID = tf.constant(-1, dtype=tf.int64)

TFDS_SPLIT_NAME = {'train': 'train', 'eval_train': 'train', 'validation': 'val'}

SEQUENCE_LENGTH = 1024
MAX_CORPUS_CHARS = 1_000_000_000
SHUFFLE_BUFFER_SIZE = 100_000
VOCAB_SIZE = 50_257


def batch_with_padding(
dataset: tf.data.Dataset,
batch_size,
padded_shapes=None,
padding_id=PAD_ID,
):
"""Batches a tf.data.Dataset and adds padding if len(dataset) is not divisible by the batch size.

Args:
dataset: tf.data.Dataset
batch_size: batch size of resulting batched dataset
padded_shapes: shapes of the padded batches
padding_id: value for padding, for elements in new batch

Returns:
  tf.data.Dataset with the final partial batch padded up to batch_size.
"""
batched_dataset = dataset.batch(batch_size, drop_remainder=False)

# tf.data.Dataset.padded.batch pads elements in the batch so we call it
# again with batch_size=1 to pad each element in original batch.
padded_batched_dataset = batched_dataset.padded_batch(
1, padded_shapes=padded_shapes, padding_values=padding_id
)

# Remove extra dimension resulting from the batch_size=1.
padded_batched_dataset = padded_batched_dataset.unbatch()

return padded_batched_dataset


def get_data_iter(data_rng: jax.random.PRNGKey,
split: str,
data_dir: str,
global_batch_size: int,
num_batches: Optional[int] = None,):

ds = get_lm_dataset(data_rng, split, data_dir, global_batch_size, num_batches)

it = map(
functools.partial(
data_utils.shard_and_maybe_pad_np, global_batch_size=global_batch_size
),
ds,
)

return iter(it)

def get_lm_dataset(
data_rng: jax.random.PRNGKey,
split: str,
data_dir: str,
global_batch_size: int,
num_batches: Optional[int] = None,
):
"""Load preprocessed TF dataset."""
if split not in TFDS_SPLIT_NAME:
raise NotImplementedError

shuffle_seed = jax.random.randint(data_rng, (), -2**31, 2**31-1)

data_dir = os.path.join(data_dir, TFDS_SPLIT_NAME[split])
tokens_ds = tf.data.Dataset.load(data_dir)

# tokens
tokens_ds = tokens_ds.flat_map(tf.data.Dataset.from_tensor_slices)

# sequences
sequences_ds = tokens_ds.batch(SEQUENCE_LENGTH + 1, drop_remainder=True)

# get inputs and outputs
sequences_ds = sequences_ds.map(
lambda x: {
'inputs': x['input_ids'][:SEQUENCE_LENGTH],
'targets': x['input_ids'][1:],
},
num_parallel_calls=AUTOTUNE,
)

# batch
if split == 'train':
shuffled_sequences_ds = sequences_ds.shuffle(
SHUFFLE_BUFFER_SIZE, seed=shuffle_seed
)
repeated_sequences_dataset = shuffled_sequences_ds.repeat()
ds = repeated_sequences_dataset.batch(
global_batch_size, drop_remainder=False
).prefetch(tf.data.experimental.AUTOTUNE)
elif split == 'eval_train':
ds = batch_with_padding(
sequences_ds,
global_batch_size,
padded_shapes={
'inputs': (global_batch_size, None),
'targets': (global_batch_size, None),
},
)
ds = ds.map(lambda x: {'inputs': x['inputs'],
'targets': x['targets'],
'weights': tf.where(tf.equal(x['inputs'], PAD_ID), 0.0, 1.0)})
ds = ds.take(1000).prefetch(tf.data.experimental.AUTOTUNE) # todo(kasimbeg): set final size of validation
elif split == 'validation':
ds = batch_with_padding(
sequences_ds,
global_batch_size,
padded_shapes={
'inputs': (global_batch_size, None),
'targets': (global_batch_size, None),
},
)
ds = ds.map(lambda x: {'inputs': x['inputs'],
'targets': x['targets'],
'weights': tf.where(tf.equal(x['inputs'], PAD_ID), 0.0, 1.0)})
ds = ds.take(1000).prefetch(tf.data.experimental.AUTOTUNE) # todo(kasimbeg): set final size

return ds
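A usage sketch for the pipeline above: get_data_iter wraps get_lm_dataset with data_utils.shard_and_maybe_pad_np, so it yields numpy batches already reshaped for the local devices. The data directory below is a placeholder for wherever the preprocessed FineWeb-Edu dataset was saved, and the exact leading dimensions depend on the number of visible accelerators:

import jax

from algoperf.workloads.lm import input_pipeline

rng = jax.random.PRNGKey(0)
train_iter = input_pipeline.get_data_iter(
    data_rng=rng,
    split='train',
    data_dir='/data/finewebedu_tokenized',  # placeholder path
    global_batch_size=8,
)

batch = next(train_iter)
# Each entry is roughly shaped
# (num_local_devices, global_batch_size // num_local_devices, SEQUENCE_LENGTH).
print(batch['inputs'].shape, batch['targets'].shape)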
Empty file.
19 changes: 19 additions & 0 deletions algoperf/workloads/lm/lm_jax/models.py
@@ -0,0 +1,19 @@
from flax import linen as nn
import jax.numpy as jnp

class LinearModel(nn.Module):
vocab_size: int

@nn.compact
def __call__(self, inputs: jnp.ndarray) -> jnp.ndarray:
x = nn.Dense(
10,
kernel_init=nn.initializers.normal(0.02),
bias_init=nn.initializers.zeros
)(inputs)
return nn.Dense(
self.vocab_size,
kernel_init=nn.initializers.normal(0.02),
bias_init=nn.initializers.zeros,
name="output"
)(x)
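A quick sanity-check sketch for the Flax model above. LinearModel stacks two Dense layers on the trailing feature dimension, so this sketch assumes the inputs are already one-hot (float) encoded over the vocabulary; the shapes are illustrative only:

import jax
import jax.numpy as jnp

from algoperf.workloads.lm.lm_jax.models import LinearModel

vocab_size, seq_len = 50_257, 8  # tiny seq_len, just for the shape check
model = LinearModel(vocab_size=vocab_size)

rng = jax.random.PRNGKey(0)
dummy_inputs = jnp.zeros((2, seq_len, vocab_size))  # assumed one-hot style inputs
params = model.init(rng, dummy_inputs)

logits = model.apply(params, dummy_inputs)
print(logits.shape)  # (2, 8, 50257)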