Commit ed0771e

marcenacp authored and copybara-github committed
Internal change.
PiperOrigin-RevId: 784102873
1 parent d81f1e0 commit ed0771e

3 files changed: +88 -85 lines changed

grain/_src/python/dataset/transformations/BUILD

Lines changed: 0 additions & 1 deletion
@@ -285,7 +285,6 @@ py_test(
     srcs_version = "PY3",
     deps = [
         ":packing_concat_then_split",
-        "//grain/_src/core:exceptions",
         "//grain/_src/python/dataset",
         "//grain/_src/python/testing:experimental",
         "@abseil-py//absl/testing:absltest",

grain/_src/python/dataset/transformations/packing_concat_then_split.py

Lines changed: 17 additions & 11 deletions
@@ -140,8 +140,10 @@ class _CtsElement:
     parent_state: The state of the parent iterator *before* __next__() was
       called.
     features: Features as returned by calling __next__() on the parent iterator.
-    slices: If set then maps the feature name to the `slice` object for the
-      split features.
+    slices: Maps the feature name to a tuple (start, stop) representing the
+      slice of the feature to use (in case this element represents a partial
+      element resulting from a split). A slice of (-1, -1) represents the whole
+      feature.
   """
 
   parent_state: dict[str, Any]
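
For illustration, a minimal sketch of the slice convention this hunk documents. `_EMPTY_SLICE` is the sentinel used later in this change; `resolve_slice` is a hypothetical stand-in for the real `get_sliced_features` logic:

_EMPTY_SLICE = (-1, -1)  # sentinel: the element was never split

def resolve_slice(feature, feature_slice):
  # (-1, -1) selects the whole feature; any other (start, stop)
  # selects feature[start:stop] in the original feature's coordinates.
  if feature_slice == _EMPTY_SLICE:
    return feature
  start, stop = feature_slice
  return feature[start:stop]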
@@ -151,7 +153,15 @@ class _CtsElement:
   def split(
       self, split_points: Mapping[str, int]
   ) -> tuple[_CtsElement | None, _CtsElement]:
-    """Splits the element into two elements."""
+    """Splits the element into two elements.
+
+    Args:
+      split_points: A mapping from feature name to the desired split index.
+
+    Returns:
+      The left and right elements. If the element is not split, returns None
+      for the left element and the original element for the right element.
+    """
     # We split at the very beginning.
     if all(x == 0 for x in split_points.values()):
       return None, self
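
A self-contained analogue of the documented contract, reduced to one list-valued feature (the real method operates on a whole _CtsElement and its parent state):

def split_feature(tokens, split_point):
  # Splitting at 0 is a no-op: no left element, original element on the right.
  if split_point == 0:
    return None, tokens
  return tokens[:split_point], tokens[split_point:]

left, right = split_feature([7] * 7, 6)
assert left == [7] * 6 and right == [7]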
@@ -256,15 +266,8 @@ def _has_full_length_feature(self, element: _CtsElement) -> bool:
     for key, target_sequence_length in self._config.length_struct.items():
       feature = element.get_sliced_features(key)
       sequence_length = 1 if np.ndim(feature) == 0 else len(feature)
-      if sequence_length < target_sequence_length:
-        continue
       if sequence_length == target_sequence_length:
         return True
-      if sequence_length > target_sequence_length:
-        raise exceptions.PyGrainInternalError(
-            f"Feature '{key}' has {sequence_length} tokens but target length is"
-            f" only {target_sequence_length}. The element should be split."
-        )
     return False
 
   def _pack_elements(
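
The error branch is deleted because over-long features are now split before this check runs, so the method reduces to an exact-length test. A minimal sketch, assuming list-valued features:

def has_full_length_feature(features, length_struct):
  # True iff some feature exactly fills its target length. Longer
  # features can no longer reach this point: they are split upstream.
  return any(
      len(features[key]) == target_length
      for key, target_length in length_struct.items()
  )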
@@ -371,7 +374,10 @@ def _maybe_add_to_buffer(
       else:
         if sequence_length > available_tokens:
           needs_splitting = True
-          split_points[key] = available_tokens
+          start_index = 0
+          if element.slices[key] != _EMPTY_SLICE:
+            start_index = element.slices[key][0]
+          split_points[key] = start_index + available_tokens
           new_tokens_in_buffer[key] = available_tokens
         else:
           # No splitting.
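
Split points are now expressed in the coordinates of the original, unsliced feature, so an element that is itself the remainder of an earlier split must offset by its slice start. A small worked example with illustrative values:

_EMPTY_SLICE = (-1, -1)

# Remainder of an earlier split: tokens [4:10) of a 10-token feature.
element_slice = (4, 10)
available_tokens = 3  # room left in the packing buffer

start_index = 0 if element_slice == _EMPTY_SLICE else element_slice[0]
split_point = start_index + available_tokens
assert split_point == 7  # left piece covers [4:7), remainder covers [7:10)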

grain/_src/python/dataset/transformations/packing_concat_then_split_test.py

Lines changed: 71 additions & 73 deletions
@@ -17,7 +17,6 @@
 
 from absl.testing import absltest
 from absl.testing import parameterized
-from grain._src.core import exceptions
 from grain._src.python.dataset import dataset
 from grain._src.python.dataset.transformations import packing_concat_then_split
 from grain._src.python.dataset.transformations import source
@@ -43,11 +42,14 @@ class ConcatThenSplitIterDatasetTest(parameterized.TestCase):
 
   # observations will be [
   #   [1], [2, 2], [3, 3, 3], [4, 4, 4, 4], [5, 5, 5, 5, 5],
-  #   [6, 6, 6, 6, 6, 6], [1], [2, 2], [3, 3, 3], ...
+  #   [6, 6, 6, 6, 6, 6], [7, 7, 7, 7, 7, 7, 7], [1], [2, 2], [3, 3, 3], ...
   # ].
   def dummy_iter_dataset(self, *, num_observations: int) -> dataset.IterDataset:
     return (
-        source.RangeMapDataset(1, 7)
+        # On purpose, we have observations longer (length=7) than the packing
+        # sequence length of most test cases (6), so we can test splitting long
+        # features.
+        source.RangeMapDataset(1, 8)
         .repeat()
         .map_with_index(
             lambda index, value: {
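
A plain-Python analogue of this dummy dataset; the exact values of the `index` feature come from the elided lambda body, so the numbering below is illustrative:

import numpy as np

def dummy_observations(num_observations):
  # Values 1..7 repeat forever; observation k holds value k repeated k
  # times, so the length-7 observation exceeds the packing sequence
  # length of 6 used by most test cases and must be split.
  for index in range(num_observations):
    value = index % 7 + 1
    yield {"observation": np.repeat(value, value), "index": index}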
@@ -92,16 +94,22 @@ def test_meta_features_not_restricting_when_splitting_full_length_features(
             "index": np.asarray([5, 6, 0, 0, 0, 0]),
         },
         {
-            "observation": np.asarray([6, 6, 6, 1, 2, 2]),
-            "observation_segment_ids": np.asarray([1, 1, 1, 2, 3, 3]),
-            "observation_positions": np.asarray([0, 1, 2, 0, 0, 1]),
-            "index": np.asarray([6, 7, 8, 0, 0, 0]),
+            "observation": np.asarray([6, 6, 6, 7, 7, 7]),
+            "observation_segment_ids": np.asarray([1, 1, 1, 2, 2, 2]),
+            "observation_positions": np.asarray([0, 1, 2, 0, 1, 2]),
+            "index": np.asarray([6, 7, 0, 0, 0, 0]),
+        },
+        {
+            "observation": np.asarray([7, 7, 7, 7, 1, 2]),
+            "observation_segment_ids": np.asarray([1, 1, 1, 1, 2, 3]),
+            "observation_positions": np.asarray([0, 1, 2, 3, 0, 0]),
+            "index": np.asarray([7, 8, 9, 0, 0, 0]),
         },
         # Reached end.
         {
-            "observation": np.asarray([3, 3, 3, 0, 0, 0]),
-            "observation_segment_ids": np.asarray([1, 1, 1, 0, 0, 0]),
-            "observation_positions": np.asarray([0, 1, 2, 0, 0, 0]),
+            "observation": np.asarray([2, 0, 0, 0, 0, 0]),
+            "observation_segment_ids": np.asarray([1, 0, 0, 0, 0, 0]),
+            "observation_positions": np.asarray([0, 0, 0, 0, 0, 0]),
             "index": np.asarray([9, 0, 0, 0, 0, 0]),
         },
     ],
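
For reference, the transformation under test is constructed roughly as in the removed test at the bottom of this diff; arguments beyond `length_struct` (BOS handling, meta features) vary per test case:

from grain._src.python.dataset.transformations import packing_concat_then_split

ds = packing_concat_then_split.ConcatThenSplitIterDataset(
    dummy_ds,  # the dataset built by dummy_iter_dataset() above
    length_struct={"observation": 6},
)
packed = list(ds)  # dicts shaped like the expected elements above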
@@ -132,10 +140,10 @@ def test_meta_features_not_restricting(self):
             "index": np.asarray([4, 5, 0, 0, 0, 0]),
         },
         {
-            "observation": np.asarray([5, 5, 5, 1, 2, 2]),
-            "observation_segment_ids": np.asarray([1, 1, 1, 2, 3, 3]),
-            "observation_positions": np.asarray([0, 1, 2, 0, 0, 1]),
-            "index": np.asarray([5, 7, 8, 0, 0, 0]),
+            "observation": np.asarray([5, 5, 5, 7, 7, 7]),
+            "observation_segment_ids": np.asarray([1, 1, 1, 2, 2, 2]),
+            "observation_positions": np.asarray([0, 1, 2, 0, 1, 2]),
+            "index": np.asarray([5, 7, 0, 0, 0, 0]),
         },
         # Fully packed example comes without being split.
         {
@@ -144,11 +152,17 @@ def test_meta_features_not_restricting(self):
             "observation_positions": np.asarray([0, 1, 2, 3, 4, 5]),
             "index": np.asarray([6, 0, 0, 0, 0, 0]),
         },
+        {
+            "observation": np.asarray([7, 7, 7, 7, 1, 2]),
+            "observation_segment_ids": np.asarray([1, 1, 1, 1, 2, 3]),
+            "observation_positions": np.asarray([0, 1, 2, 3, 0, 0]),
+            "index": np.asarray([7, 8, 9, 0, 0, 0]),
+        },
         # Reached end.
         {
-            "observation": np.asarray([3, 3, 3, 0, 0, 0]),
-            "observation_segment_ids": np.asarray([1, 1, 1, 0, 0, 0]),
-            "observation_positions": np.asarray([0, 1, 2, 0, 0, 0]),
+            "observation": np.asarray([2, 0, 0, 0, 0, 0]),
+            "observation_segment_ids": np.asarray([1, 0, 0, 0, 0, 0]),
+            "observation_positions": np.asarray([0, 0, 0, 0, 0, 0]),
             "index": np.asarray([9, 0, 0, 0, 0, 0]),
         },
     ],
@@ -191,15 +205,21 @@ def test_meta_features_restricting(self):
             "index": np.asarray([6, 0]),
         },
         {
-            "observation": np.asarray([1, 2, 2, 0, 0, 0]),
-            "observation_segment_ids": np.asarray([1, 2, 2, 0, 0, 0]),
-            "observation_positions": np.asarray([0, 0, 1, 0, 0, 0]),
+            "observation": np.asarray([7, 7, 7, 7, 7, 7]),
+            "observation_segment_ids": np.asarray([1, 1, 1, 1, 1, 1]),
+            "observation_positions": np.asarray([0, 1, 2, 3, 4, 5]),
+            "index": np.asarray([7, 0]),
+        },
+        {
+            "observation": np.asarray([7, 1, 0, 0, 0, 0]),
+            "observation_segment_ids": np.asarray([1, 2, 0, 0, 0, 0]),
+            "observation_positions": np.asarray([0, 0, 0, 0, 0, 0]),
             "index": np.asarray([7, 8]),
         },
         {
-            "observation": np.asarray([3, 3, 3, 0, 0, 0]),
-            "observation_segment_ids": np.asarray([1, 1, 1, 0, 0, 0]),
-            "observation_positions": np.asarray([0, 1, 2, 0, 0, 0]),
+            "observation": np.asarray([2, 2, 0, 0, 0, 0]),
+            "observation_segment_ids": np.asarray([1, 1, 0, 0, 0, 0]),
+            "observation_positions": np.asarray([0, 1, 0, 0, 0, 0]),
             "index": np.asarray([9, 0]),
         },
     ],
@@ -233,10 +253,10 @@ def test_replace_first_token_with_bos(self):
             "index": np.asarray([4, 5, 0, 0, 0, 0]),
         },
         {
-            "observation": np.asarray([1000, 5, 5, 1000, 1000, 2]),
-            "observation_segment_ids": np.asarray([1, 1, 1, 2, 3, 3]),
-            "observation_positions": np.asarray([0, 1, 2, 0, 0, 1]),
-            "index": np.asarray([5, 7, 8, 0, 0, 0]),
+            "observation": np.asarray([1000, 5, 5, 1000, 7, 7]),
+            "observation_segment_ids": np.asarray([1, 1, 1, 2, 2, 2]),
+            "observation_positions": np.asarray([0, 1, 2, 0, 1, 2]),
+            "index": np.asarray([5, 7, 0, 0, 0, 0]),
         },
         # Fully packed example comes without being split.
         {
@@ -245,11 +265,17 @@ def test_replace_first_token_with_bos(self):
             "observation_positions": np.asarray([0, 1, 2, 3, 4, 5]),
             "index": np.asarray([6, 0, 0, 0, 0, 0]),
         },
+        {
+            "observation": np.asarray([1000, 7, 7, 7, 1000, 1000]),
+            "observation_segment_ids": np.asarray([1, 1, 1, 1, 2, 3]),
+            "observation_positions": np.asarray([0, 1, 2, 3, 0, 0]),
+            "index": np.asarray([7, 8, 9, 0, 0, 0]),
+        },
         # Reached end.
         {
-            "observation": np.asarray([1000, 3, 3, 0, 0, 0]),
-            "observation_segment_ids": np.asarray([1, 1, 1, 0, 0, 0]),
-            "observation_positions": np.asarray([0, 1, 2, 0, 0, 0]),
+            "observation": np.asarray([1000, 0, 0, 0, 0, 0]),
+            "observation_segment_ids": np.asarray([1, 0, 0, 0, 0, 0]),
+            "observation_positions": np.asarray([0, 0, 0, 0, 0, 0]),
             "index": np.asarray([9, 0, 0, 0, 0, 0]),
         },
     ],
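
All the BOS expectations follow one rule: with REPLACE_FIRST_TOKEN_WITH_BOS, the first token of every packed segment is overwritten with bos_token_id (1000 in this test). A self-contained sketch of that rule:

import numpy as np

def replace_first_token_with_bos(observation, segment_ids, bos_token_id=1000):
  out = observation.copy()
  for seg in np.unique(segment_ids):
    if seg == 0:  # 0 marks padding, which gets no BOS token
      continue
    # The first position belonging to this segment gets the BOS token.
    out[np.argmax(segment_ids == seg)] = bos_token_id
  return out

obs = np.asarray([5, 5, 5, 7, 7, 7])
segs = np.asarray([1, 1, 1, 2, 2, 2])
print(replace_first_token_with_bos(obs, segs))  # [1000 5 5 1000 7 7]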
@@ -320,27 +346,27 @@ def _create_iter(state: dict[str, Any] | None):
             "index": np.asarray([4, 5, 6, 0, 0, 0]),
         },
         {
-            "observation": np.asarray([6, 6, 6, 6, 6, 1, 2, 2]),
-            "observation_segment_ids": np.asarray([1, 1, 1, 1, 1, 2, 3, 3]),
-            "observation_positions": np.asarray([0, 1, 2, 3, 4, 0, 0, 1]),
-            "index": np.asarray([6, 7, 8, 0, 0, 0]),
+            "observation": np.asarray([6, 6, 6, 6, 6, 7, 7, 7]),
+            "observation_segment_ids": np.asarray([1, 1, 1, 1, 1, 2, 2, 2]),
+            "observation_positions": np.asarray([0, 1, 2, 3, 4, 0, 1, 2]),
+            "index": np.asarray([6, 7, 0, 0, 0, 0]),
         },
         {
-            "observation": np.asarray([3, 3, 3, 4, 4, 4, 4, 5]),
-            "observation_segment_ids": np.asarray([1, 1, 1, 2, 2, 2, 2, 3]),
-            "observation_positions": np.asarray([0, 1, 2, 0, 1, 2, 3, 0]),
-            "index": np.asarray([9, 10, 11, 0, 0, 0]),
+            "observation": np.asarray([7, 7, 7, 7, 1, 2, 2, 3]),
+            "observation_segment_ids": np.asarray([1, 1, 1, 1, 2, 3, 3, 4]),
+            "observation_positions": np.asarray([0, 1, 2, 3, 0, 0, 1, 0]),
+            "index": np.asarray([7, 8, 9, 10, 0, 0]),
         },
         {
-            "observation": np.asarray([5, 5, 5, 5, 6, 6, 6, 6]),
-            "observation_segment_ids": np.asarray([1, 1, 1, 1, 2, 2, 2, 2]),
-            "observation_positions": np.asarray([0, 1, 2, 3, 0, 1, 2, 3]),
-            "index": np.asarray([11, 12, 0, 0, 0, 0]),
+            "observation": np.asarray([3, 3, 4, 4, 4, 4, 5, 5]),
+            "observation_segment_ids": np.asarray([1, 1, 2, 2, 2, 2, 3, 3]),
+            "observation_positions": np.asarray([0, 1, 0, 1, 2, 3, 0, 1]),
+            "index": np.asarray([10, 11, 12, 0, 0, 0]),
         },
         {
-            "observation": np.asarray([6, 6, 0, 0, 0, 0, 0, 0]),
-            "observation_segment_ids": np.asarray([1, 1, 0, 0, 0, 0, 0, 0]),
-            "observation_positions": np.asarray([0, 1, 0, 0, 0, 0, 0, 0]),
+            "observation": np.asarray([5, 5, 5, 0, 0, 0, 0, 0]),
+            "observation_segment_ids": np.asarray([1, 1, 1, 0, 0, 0, 0, 0]),
+            "observation_positions": np.asarray([0, 1, 2, 0, 0, 0, 0, 0]),
             "index": np.asarray([12, 0, 0, 0, 0, 0]),
         },
     ],
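
The checkpointing test exercises the usual Grain iterator contract: capture the state mid-stream, then resume from it. A hedged sketch, assuming the standard DatasetIterator get_state()/set_state() API:

it = iter(ds)  # ds: the ConcatThenSplitIterDataset under test
first = next(it)
state = it.get_state()  # state captured *after* producing `first`
second = next(it)

restored = iter(ds)
restored.set_state(state)
# The restored iterator resumes exactly where the state was taken.
np.testing.assert_array_equal(
    next(restored)["observation"], second["observation"]
)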
@@ -406,34 +432,6 @@ def test_checkpointing_using_grain_built_in_tools(
         )
     )
 
-  @parameterized.product(
-      bos_handling=list(BOSHandling),
-  )
-  def test_pack_sequence_longer_than_sequence_length(self, bos_handling):
-    sequence_length = 10
-    if bos_handling == BOSHandling.REPLACE_FIRST_TOKEN_WITH_BOS:
-      bos_token_id = 1000
-      bos_features = {"observation"}
-    else:
-      bos_token_id = None
-      bos_features = {}
-    ds = dataset.MapDataset.source([
-        {"observation": np.repeat(1, 100)},  # 100 > sequence_length
-    ]).to_iter_dataset()
-    ds = packing_concat_then_split.ConcatThenSplitIterDataset(
-        ds,
-        length_struct={"observation": sequence_length},
-        split_full_length_features=False,
-        bos_handling=bos_handling,
-        bos_token_id=bos_token_id,
-        bos_features=bos_features,
-    )
-    with self.assertRaisesWithPredicateMatch(
-        exceptions.PyGrainInternalError,
-        lambda _: "Feature 'observation' has 100 tokens",
-    ):
-      next(iter(ds))
-
   def assert_equal_elements(
       self,
       actual_elements: list[dict[str, np.ndarray]],
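
The removed test pinned the old behavior: a feature longer than sequence_length raised PyGrainInternalError. With the slice-based splitting introduced in this change that error path no longer exists; a hedged sketch of what the same input is now expected to do:

ds = dataset.MapDataset.source([
    {"observation": np.repeat(1, 100)},  # 100 > sequence_length
]).to_iter_dataset()
ds = packing_concat_then_split.ConcatThenSplitIterDataset(
    ds, length_struct={"observation": 10}
)
# Instead of raising, the 100-token feature should now be split into
# ten fully packed rows of length 10.
rows = list(ds)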
