Use ordered reduction intrinsics for integer reductions

sayantn · sayantn · commit 7134a49031c2 · 2025-11-23T11:18:18.000+05:30
only ordered intrinsics have implementation in rustc-const-eval
diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs
@@ -4810,7 +4810,7 @@ pub fn _mm_mask_cmp_epi8_mask<const IMM8: i32>(k1: __mmask16, a: __m128i, b: __m
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_reduce_add_epi16(a: __m256i) -> i16 {
-    unsafe { simd_reduce_add_unordered(a.as_i16x16()) }
+    unsafe { simd_reduce_add_ordered(a.as_i16x16(), 0) }
 }
 
 /// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
@@ -4820,7 +4820,7 @@ pub fn _mm256_reduce_add_epi16(a: __m256i) -> i16 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_mask_reduce_add_epi16(k: __mmask16, a: __m256i) -> i16 {
-    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::ZERO)) }
+    unsafe { simd_reduce_add_ordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::ZERO), 0) }
 }
 
 /// Reduce the packed 16-bit integers in a by addition. Returns the sum of all elements in a.
@@ -4830,7 +4830,7 @@ pub fn _mm256_mask_reduce_add_epi16(k: __mmask16, a: __m256i) -> i16 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_reduce_add_epi16(a: __m128i) -> i16 {
-    unsafe { simd_reduce_add_unordered(a.as_i16x8()) }
+    unsafe { simd_reduce_add_ordered(a.as_i16x8(), 0) }
 }
 
 /// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
@@ -4840,7 +4840,7 @@ pub fn _mm_reduce_add_epi16(a: __m128i) -> i16 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_mask_reduce_add_epi16(k: __mmask8, a: __m128i) -> i16 {
-    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::ZERO)) }
+    unsafe { simd_reduce_add_ordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::ZERO), 0) }
 }
 
 /// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a.
@@ -4850,7 +4850,7 @@ pub fn _mm_mask_reduce_add_epi16(k: __mmask8, a: __m128i) -> i16 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_reduce_add_epi8(a: __m256i) -> i8 {
-    unsafe { simd_reduce_add_unordered(a.as_i8x32()) }
+    unsafe { simd_reduce_add_ordered(a.as_i8x32(), 0) }
 }
 
 /// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
@@ -4860,7 +4860,7 @@ pub fn _mm256_reduce_add_epi8(a: __m256i) -> i8 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_mask_reduce_add_epi8(k: __mmask32, a: __m256i) -> i8 {
-    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::ZERO)) }
+    unsafe { simd_reduce_add_ordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::ZERO), 0) }
 }
 
 /// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a.
@@ -4870,7 +4870,7 @@ pub fn _mm256_mask_reduce_add_epi8(k: __mmask32, a: __m256i) -> i8 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_reduce_add_epi8(a: __m128i) -> i8 {
-    unsafe { simd_reduce_add_unordered(a.as_i8x16()) }
+    unsafe { simd_reduce_add_ordered(a.as_i8x16(), 0) }
 }
 
 /// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
@@ -4880,7 +4880,7 @@ pub fn _mm_reduce_add_epi8(a: __m128i) -> i8 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_mask_reduce_add_epi8(k: __mmask16, a: __m128i) -> i8 {
-    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::ZERO)) }
+    unsafe { simd_reduce_add_ordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::ZERO), 0) }
 }
 
 /// Reduce the packed 16-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
@@ -5314,7 +5314,7 @@ pub fn _mm_mask_reduce_min_epu8(k: __mmask16, a: __m128i) -> u8 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_reduce_mul_epi16(a: __m256i) -> i16 {
-    unsafe { simd_reduce_mul_unordered(a.as_i16x16()) }
+    unsafe { simd_reduce_mul_ordered(a.as_i16x16(), 1) }
 }
 
 /// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
@@ -5324,7 +5324,7 @@ pub fn _mm256_reduce_mul_epi16(a: __m256i) -> i16 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_mask_reduce_mul_epi16(k: __mmask16, a: __m256i) -> i16 {
-    unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(1))) }
+    unsafe { simd_reduce_mul_ordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(1)), 1) }
 }
 
 /// Reduce the packed 16-bit integers in a by multiplication. Returns the product of all elements in a.
@@ -5334,7 +5334,7 @@ pub fn _mm256_mask_reduce_mul_epi16(k: __mmask16, a: __m256i) -> i16 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_reduce_mul_epi16(a: __m128i) -> i16 {
-    unsafe { simd_reduce_mul_unordered(a.as_i16x8()) }
+    unsafe { simd_reduce_mul_ordered(a.as_i16x8(), 1) }
 }
 
 /// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
@@ -5344,7 +5344,7 @@ pub fn _mm_reduce_mul_epi16(a: __m128i) -> i16 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_mask_reduce_mul_epi16(k: __mmask8, a: __m128i) -> i16 {
-    unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(1))) }
+    unsafe { simd_reduce_mul_ordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(1)), 1) }
 }
 
 /// Reduce the packed 8-bit integers in a by multiplication. Returns the product of all elements in a.
@@ -5354,7 +5354,7 @@ pub fn _mm_mask_reduce_mul_epi16(k: __mmask8, a: __m128i) -> i16 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_reduce_mul_epi8(a: __m256i) -> i8 {
-    unsafe { simd_reduce_mul_unordered(a.as_i8x32()) }
+    unsafe { simd_reduce_mul_ordered(a.as_i8x32(), 1) }
 }
 
 /// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
@@ -5364,7 +5364,7 @@ pub fn _mm256_reduce_mul_epi8(a: __m256i) -> i8 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm256_mask_reduce_mul_epi8(k: __mmask32, a: __m256i) -> i8 {
-    unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(1))) }
+    unsafe { simd_reduce_mul_ordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(1)), 1) }
 }
 
 /// Reduce the packed 8-bit integers in a by multiplication. Returns the product of all elements in a.
@@ -5374,7 +5374,7 @@ pub fn _mm256_mask_reduce_mul_epi8(k: __mmask32, a: __m256i) -> i8 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_reduce_mul_epi8(a: __m128i) -> i8 {
-    unsafe { simd_reduce_mul_unordered(a.as_i8x16()) }
+    unsafe { simd_reduce_mul_ordered(a.as_i8x16(), 1) }
 }
 
 /// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
@@ -5384,7 +5384,7 @@ pub fn _mm_reduce_mul_epi8(a: __m128i) -> i8 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm_mask_reduce_mul_epi8(k: __mmask16, a: __m128i) -> i8 {
-    unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(1))) }
+    unsafe { simd_reduce_mul_ordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(1)), 1) }
 }
 
 /// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
@@ -33171,7 +33171,7 @@ pub fn _mm_mask_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(
 #[target_feature(enable = "avx512f")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm512_reduce_add_epi32(a: __m512i) -> i32 {
-    unsafe { simd_reduce_add_unordered(a.as_i32x16()) }
+    unsafe { simd_reduce_add_ordered(a.as_i32x16(), 0) }
 }
 
 /// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
@@ -33181,7 +33181,7 @@ pub fn _mm512_reduce_add_epi32(a: __m512i) -> i32 {
 #[target_feature(enable = "avx512f")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 {
-    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i32x16(), i32x16::ZERO)) }
+    unsafe { simd_reduce_add_ordered(simd_select_bitmask(k, a.as_i32x16(), i32x16::ZERO), 0) }
 }
 
 /// Reduce the packed 64-bit integers in a by addition. Returns the sum of all elements in a.
@@ -33191,7 +33191,7 @@ pub fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 {
 #[target_feature(enable = "avx512f")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm512_reduce_add_epi64(a: __m512i) -> i64 {
-    unsafe { simd_reduce_add_unordered(a.as_i64x8()) }
+    unsafe { simd_reduce_add_ordered(a.as_i64x8(), 0) }
 }
 
 /// Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
@@ -33201,7 +33201,7 @@ pub fn _mm512_reduce_add_epi64(a: __m512i) -> i64 {
 #[target_feature(enable = "avx512f")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 {
-    unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i64x8(), i64x8::ZERO)) }
+    unsafe { simd_reduce_add_ordered(simd_select_bitmask(k, a.as_i64x8(), i64x8::ZERO), 0) }
 }
 
 /// Reduce the packed single-precision (32-bit) floating-point elements in a by addition. Returns the sum of all elements in a.
@@ -33267,7 +33267,7 @@ pub fn _mm512_mask_reduce_add_pd(k: __mmask8, a: __m512d) -> f64 {
 #[target_feature(enable = "avx512f")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm512_reduce_mul_epi32(a: __m512i) -> i32 {
-    unsafe { simd_reduce_mul_unordered(a.as_i32x16()) }
+    unsafe { simd_reduce_mul_ordered(a.as_i32x16(), 1) }
 }
 
 /// Reduce the packed 32-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
@@ -33278,11 +33278,10 @@ pub fn _mm512_reduce_mul_epi32(a: __m512i) -> i32 {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 {
     unsafe {
-        simd_reduce_mul_unordered(simd_select_bitmask(
-            k,
-            a.as_i32x16(),
-            _mm512_set1_epi32(1).as_i32x16(),
-        ))
+        simd_reduce_mul_ordered(
+            simd_select_bitmask(k, a.as_i32x16(), _mm512_set1_epi32(1).as_i32x16()),
+            1,
+        )
     }
 }
 
@@ -33293,7 +33292,7 @@ pub fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 {
 #[target_feature(enable = "avx512f")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 {
-    unsafe { simd_reduce_mul_unordered(a.as_i64x8()) }
+    unsafe { simd_reduce_mul_ordered(a.as_i64x8(), 1) }
 }
 
 /// Reduce the packed 64-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
@@ -33304,11 +33303,10 @@ pub fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 pub fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 {
     unsafe {
-        simd_reduce_mul_unordered(simd_select_bitmask(
-            k,
-            a.as_i64x8(),
-            _mm512_set1_epi64(1).as_i64x8(),
-        ))
+        simd_reduce_mul_ordered(
+            simd_select_bitmask(k, a.as_i64x8(), _mm512_set1_epi64(1).as_i64x8()),
+            1,
+        )
     }
 }