[Executorch] make slice_copy parallel

kimishpatel · kimishpatel · commit beaab8c24a4f · 2025-11-20T12:49:55.000-08:00
Pull Request resolved: #15830. When doing large prefills in LLMs, slice_copy takes about 5-10% of the time, mainly due to slicing in the RoPE implementation. Differential Revision: [D85532081](https://our.internmc.facebook.com/intern/diff/D85532081/) ghstack-source-id: 324784683
diff --git a/kernels/portable/cpu/util/slice_util.cpp b/kernels/portable/cpu/util/slice_util.cpp
@@ -9,6 +9,7 @@
 #include <c10/util/irange.h>
 #include <executorch/kernels/portable/cpu/util/slice_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
 #include <cstring>
 
 namespace torch {
@@ -202,12 +203,44 @@ void compute_slice(
       InvalidArgument,
       /* void */,
       "out.nbytes() is smaller than the expected slice size.");
-  for (const auto i : c10::irange(leading_dims)) {
-    const char* src = input_data + (i * dim_length + start) * length_per_step;
-    for ([[maybe_unused]] const auto j : c10::irange(length)) {
-      memcpy(dest, src, length_per_step);
-      src += step * length_per_step;
-      dest += length_per_step;
+  // Thresholds for enabling multithreading:
+  // - Minimum number of leading dimensions: 8
+  // - Minimum total elements to copy: 32768 (GRAIN_SIZE)
+  constexpr int64_t MIN_LEADING_DIMS_FOR_MT = 8;
+  constexpr int64_t MIN_ELEMENTS_FOR_MT =
+      executorch::extension::internal::GRAIN_SIZE;
+
+  const int64_t total_elements = leading_dims * length * trailing_dims;
+  const bool use_multithreading = leading_dims >= MIN_LEADING_DIMS_FOR_MT &&
+      total_elements >= MIN_ELEMENTS_FOR_MT;
+
+  if (use_multithreading) {
+    // Use parallel_for to distribute work across leading dimensions
+    // Grain size is a fixed count of leading-dim indices
+    // (MIN_LEADING_DIMS_FOR_MT), not derived from per-dimension element counts.
+    const int64_t grain_size = MIN_LEADING_DIMS_FOR_MT;
+
+    executorch::extension::parallel_for(
+        0, leading_dims, grain_size, [&](const auto begin, const auto end) {
+          for (const auto i : c10::irange(begin, end)) {
+            const char* src =
+                input_data + (i * dim_length + start) * length_per_step;
+            char* local_dest = dest + i * length * length_per_step;
+            for ([[maybe_unused]] const auto j : c10::irange(length)) {
+              memcpy(local_dest, src, length_per_step);
+              src += step * length_per_step;
+              local_dest += length_per_step;
+            }
+          }
+        });
+  } else {
+    // Single-threaded path for small workloads
+    for (const auto i : c10::irange(leading_dims)) {
+      const char* src = input_data + (i * dim_length + start) * length_per_step;
+      for ([[maybe_unused]] const auto j : c10::irange(length)) {
+        memcpy(dest, src, length_per_step);
+        src += step * length_per_step;
+        dest += length_per_step;
+      }
     }
   }
 }
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
@@ -292,6 +292,7 @@ def define_common_targets():
         exported_headers = ["slice_util.h"],
         deps = [
             "//executorch/runtime/kernel:kernel_includes",
+            "//executorch/extension/threadpool:threadpool",
         ],
         visibility = ["//executorch/kernels/portable/cpu/..."],
     )

Original file line number	Diff line number	Diff line change
`@@ -292,6 +292,7 @@ def define_common_targets():`
`292`	`292`	`exported_headers = ["slice_util.h"],`
`293`	`293`	`deps = [`
`294`	`294`	`"//executorch/runtime/kernel:kernel_includes",`
	`295`	`+ "//executorch/extension/threadpool:threadpool",`
`295`	`296`	`],`
`296`	`297`	`visibility = ["//executorch/kernels/portable/cpu/..."],`
`297`	`298`	`)`