@@ -1276,14 +1276,83 @@ void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
12761276#endif
12771277}
12781278
1279+ /* Helper routine for calculating exp(x) - 1.
1280+ Copied from expm1f_1u6.c, with several simplifications:
1281+ - No special-case handling for tiny or special values, instead return early
1282+ from the main routine.
1283+ - No special handling for large values:
1284+ - No early return for infinity.
1285+ - Simpler combination of p and t in final stage of algorithm.
1286+ - |i| < 27, so can calculate t by simpler shift-and-add, instead of ldexpf.
1287+ From Optimized Routines by Arm Limited. */
1288+ static inline float
1289+ Expm1f (float x )
1290+ {
1291+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
1292+ float Shift = 0x1.8p23f ;
1293+ float j = fmaf (0x1.715476p+0f , x , Shift ) - Shift ;
1294+ int i = j ;
1295+ float f = fmaf (j , -0x1.62e4p-1f , x );
1296+ f = fmaf (j , -0x1.7f7d1cp-20f , f );
1297+
1298+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
1299+ Uses Estrin scheme, where the main expm1f routine uses Horner. */
1300+ float f2 = f * f ;
1301+ float p_01 = fmaf (f , 0x1.5554aep-3 , 0x1.fffffep-2 );
1302+ float p_23 = fmaf (f , 0x1.12287cp-7 , 0x1.555736p-5 );
1303+ float p = fmaf (f2 , p_23 , p_01 );
1304+ p = fmaf (f2 * f2 , 0x1.6b55a2p-10 , p );
1305+ p = fmaf (f2 , p , f );
1306+
1307+ /* t = 2^i. */
1308+ union
1309+ {
1310+ unsigned i ;
1311+ float f ;
1312+ } u = { (i + 127 ) << 23 };
1313+ float t = u .f ;
1314+
1315+ /* expm1(x) ~= p * t + (t - 1). */
1316+ return fmaf (p , t , t - 1 );
1317+ }
1318+
1319+ /* Single-precision tanh(x) approximation.
1320+ The maximum error is 2.58 ULP.
1321+ Designed by Arm Limited. */
1322+ static inline float
1323+ Tanhf (float x )
1324+ {
1325+ union
1326+ {
1327+ float f ;
1328+ unsigned i ;
1329+ } u = { x };
1330+ unsigned iax = u .i & 0x7fffffff ;
1331+ unsigned sign = u .i & ~0x7fffffff ;
1332+
1333+ /* Above 0x1.205966p+3 tanhf rounds to 1 (or -1 for negative). */
1334+ if (iax > 0x41102cb3 ) {
1335+ if (iax > 0x7f800000 )
1336+ return (x - x ) / (x - x );
1337+ u .i = 0x3f800000 | sign ;
1338+ return u .f ;
1339+ }
1340+ if (iax < 0x34000000 )
1341+ return x ;
1342+
1343+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
1344+ float q = Expm1f (2 * x );
1345+ return q / (q + 2 );
1346+ }
1347+
12791348void ggml_vec_norm_f32 (const int n , float * s , const float * x ) { ggml_vec_dot_f32 (n , s , 0 , x , 0 , x , 0 , 1 ); * s = sqrtf (* s ); }
12801349void ggml_vec_sqr_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = x [i ]* x [i ]; }
12811350void ggml_vec_sqrt_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = sqrtf (x [i ]); }
12821351void ggml_vec_log_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = logf (x [i ]); }
12831352void ggml_vec_abs_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = fabsf (x [i ]); }
12841353void ggml_vec_sgn_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = (x [i ] > 0.f ) ? 1.f : ((x [i ] < 0.f ) ? -1.f : 0.f ); }
12851354void ggml_vec_step_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = (x [i ] > 0.f ) ? 1.f : 0.f ; }
1286- void ggml_vec_tanh_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = tanhf (x [i ]); }
1355+ void ggml_vec_tanh_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = Tanhf (x [i ]); }
12871356void ggml_vec_elu_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = (x [i ] > 0.f ) ? x [i ] : expf (x [i ])- 1 ; }
12881357void ggml_vec_relu_f32 (const int n , float * y , const float * x ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = (x [i ] > 0.f ) ? x [i ] : 0.f ; }
12891358void ggml_vec_leaky_relu_f32 (const int n , float * y , const float * x , const float ns ) { for (int i = 0 ; i < n ; ++ i ) y [i ] = ((x [i ] > 0.f ) ? x [i ] : 0.f ) + ns * ((x [i ] < 0.0f ) ? x [i ] : 0.f ); }
@@ -1331,7 +1400,7 @@ void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (i
13311400
13321401static inline float ggml_gelu_f32 (float x ) {
13331402 // GeLU approximation that goes slower and we seem to be stuck with.
1334- return .5f * x * (1.f + tanhf (sqrtf (M_2_PI ) * (x + .044715f * x * x * x )));
1403+ return .5f * x * (1.f + Tanhf (sqrtf (M_2_PI ) * (x + .044715f * x * x * x )));
13351404}
13361405
13371406void ggml_vec_gelu_f32 (const int n , float * y , const float * x ) {
0 commit comments