From 3308759445700c3bc9642f49f67fc2a25fe901c5 Mon Sep 17 00:00:00 2001 From: Aaron Date: Sat, 11 Oct 2025 11:27:04 -0500 Subject: [PATCH 1/3] Leverage the existing GGML_F32_VEC helpers to broadcast the fill value across SIMD registers and store in vector-sized chunks, while retaining the scalar tail for leftover elements and non-SIMD builds. --- ggml/src/ggml-cpu/vec.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 2751359ce49f4..6a7b852b0dd38 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -86,7 +86,27 @@ inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i])); } } -inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; ++j) { + GGML_F32_VEC_STORE(x + i + j*GGML_F32_EPR, vx); + } + } + + for (int i = np; i < n; ++i) { + x[i] = v; + } +#else + for (int i = 0; i < n; ++i) { + x[i] = v; + } +#endif +} inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { From dff11736e3bc8dbbe7709f9185a33aeaeac87c4c Mon Sep 17 00:00:00 2001 From: Aaron Date: Sat, 11 Oct 2025 12:22:48 -0500 Subject: [PATCH 2/3] Vectorize additional f32 helper loops --- ggml/src/ggml-cpu/vec.h | 94 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 6a7b852b0dd38..47dae074a4d0c 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -77,9 +77,74 @@ inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i])); } } -inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } -inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } -inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } +inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vv = GGML_F32_VEC_SET1(v); + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; ++j) { + GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + GGML_F32_VEC az = GGML_F32_VEC_ADD(ax, vv); + GGML_F32_VEC_STORE(z + i + j*GGML_F32_EPR, az); + } + } + + for (int i = np; i < n; ++i) { + z[i] = x[i] + v; + } +#else + for (int i = 0; i < n; ++i) { + z[i] = x[i] + v; + } +#endif +} +inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; ++j) { + GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + ay = GGML_F32_VEC_ADD(ay, ax); + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay); + } + } + + for (int i = np; i < n; ++i) { + y[i] += x[i]; + } +#else + for (int i = 0; i < n; ++i) { + y[i] += x[i]; + } +#endif +} +inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vv = GGML_F32_VEC_SET1(v); + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; ++j) { + GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + ay = GGML_F32_VEC_ADD(ay, vv); + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay); + } + } + + for (int i = np; i < n; ++i) { + y[i] += v; + } +#else + for (int i = 0; i < n; ++i) { + y[i] += v; + } +#endif +} inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { @@ -115,7 +180,28 @@ inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp } } -inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } +inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; ++j) { + GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + GGML_F32_VEC az = GGML_F32_VEC_MUL(ax, ay); + GGML_F32_VEC_STORE(z + i + j*GGML_F32_EPR, az); + } + } + + for (int i = np; i < n; ++i) { + z[i] = x[i]*y[i]; + } +#else + for (int i = 0; i < n; ++i) { + z[i] = x[i]*y[i]; + } +#endif +} inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i])); From e4189f7556ae60a4494c1656939c75461178673f Mon Sep 17 00:00:00 2001 From: Aaron Date: Mon, 13 Oct 2025 10:12:14 -0500 Subject: [PATCH 3/3] Normalize f32 helper tails for ggml vec ops --- ggml/src/ggml-cpu/vec.h | 60 ++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 40 deletions(-) diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 47dae074a4d0c..e08a9fe40e513 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -78,33 +78,30 @@ inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp } } inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { + int i = 0; #if defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); GGML_F32_VEC vv = GGML_F32_VEC_SET1(v); - for (int i = 0; i < np; i += GGML_F32_STEP) { + for (; i < np; i += GGML_F32_STEP) { for (int j = 0; j < GGML_F32_ARR; ++j) { GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); GGML_F32_VEC az = GGML_F32_VEC_ADD(ax, vv); GGML_F32_VEC_STORE(z + i + j*GGML_F32_EPR, az); } } - - for (int i = np; i < n; ++i) { - z[i] = x[i] + v; - } -#else - for (int i = 0; i < n; ++i) { +#endif + for (; i < n; ++i) { z[i] = x[i] + v; } -#endif } inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { + int i = 0; #if defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); - for (int i = 0; i < np; i += GGML_F32_STEP) { + for (; i < np; i += GGML_F32_STEP) { for (int j = 0; j < GGML_F32_ARR; ++j) { GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); @@ -112,38 +109,29 @@ inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay); } } - - for (int i = np; i < n; ++i) { - y[i] += x[i]; - } -#else - for (int i = 0; i < n; ++i) { +#endif + for (; i < n; ++i) { y[i] += x[i]; } -#endif } inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { + int i = 0; #if defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); GGML_F32_VEC vv = GGML_F32_VEC_SET1(v); - for (int i = 0; i < np; i += GGML_F32_STEP) { + for (; i < np; i += GGML_F32_STEP) { for (int j = 0; j < GGML_F32_ARR; ++j) { GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); ay = GGML_F32_VEC_ADD(ay, vv); GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay); } } - - for (int i = np; i < n; ++i) { - y[i] += v; - } -#else - for (int i = 0; i < n; ++i) { +#endif + for (; i < n; ++i) { y[i] += v; } -#endif } inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { @@ -152,25 +140,21 @@ inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp } } inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { + int i = 0; #if defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); - for (int i = 0; i < np; i += GGML_F32_STEP) { + for (; i < np; i += GGML_F32_STEP) { for (int j = 0; j < GGML_F32_ARR; ++j) { GGML_F32_VEC_STORE(x + i + j*GGML_F32_EPR, vx); } } - - for (int i = np; i < n; ++i) { - x[i] = v; - } -#else - for (int i = 0; i < n; ++i) { +#endif + for (; i < n; ++i) { x[i] = v; } -#endif } inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } @@ -181,10 +165,11 @@ inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp } inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { + int i = 0; #if defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); - for (int i = 0; i < np; i += GGML_F32_STEP) { + for (; i < np; i += GGML_F32_STEP) { for (int j = 0; j < GGML_F32_ARR; ++j) { GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); @@ -192,15 +177,10 @@ inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, co GGML_F32_VEC_STORE(z + i + j*GGML_F32_EPR, az); } } - - for (int i = np; i < n; ++i) { - z[i] = x[i]*y[i]; - } -#else - for (int i = 0; i < n; ++i) { +#endif + for (; i < n; ++i) { z[i] = x[i]*y[i]; } -#endif } inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) {