
Vector Wrapper Functions For Kernel Evaluations


In the examples of using the KME function in Using and Writing Kernel Functions and the BKM function in Bi-Kernel Matvec (BKM) Functions, the pragma #pragma omp simd asks the compiler to automatically vectorize the innermost loop. However, developers familiar with x86 intrinsic functions may want to vectorize their KME and BKM functions manually for better performance.
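For reference, this is what the pragma looks like on the innermost loop of a kernel evaluation (a minimal sketch that mirrors the scalar remainder loop of the KME example later on this page):

// Sketch: the compiler is asked to vectorize this innermost loop on its own
#pragma omp simd
for (int j = 0; j < n1; j++)
{
    DTYPE dx = x0_i - x1[j];
    DTYPE dy = y0_i - y1[j];
    DTYPE dz = z0_i - z1[j];
    DTYPE r2 = dx * dx + dy * dy + dz * dz;
    mat_irow[j] = exp(neg_l * r2);
}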

H2Pack provides the file src/include/x86_intrin_wrapper.h, which contains a set of vector wrapper functions (VWFs) to help you manually vectorize your KME and BKM functions. x86_intrin_wrapper.h will:

  • Detect and select the newest AVX series instruction set on your machine;
  • Set two macros, SIMD_LEN_D and SIMD_LEN_S, to the number of doubles / floats in a SIMD vector under the selected instruction set;
  • Map instruction set independent functions to intrinsic functions on the selected instruction set.

For example, vectorized multiplication of double-precision vectors requires _mm256_mul_pd() on AVX and AVX2 but _mm512_mul_pd() on AVX-512. With VWFs, you only need vec_mul_d().
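As a small illustration (a sketch, not code from the header; the arrays a, b, c and the length n are hypothetical), multiplying two double-precision arrays with VWFs is instruction-set independent. Under AVX and AVX2, SIMD_LEN_D is 4 (256-bit vectors); under AVX-512 it is 8 (512-bit vectors):

// Sketch: c[0:n] = a[0:n] * b[0:n] elementwise, n a multiple of SIMD_LEN_D.
// The same source maps to _mm256_mul_pd under AVX/AVX2 and to
// _mm512_mul_pd under AVX-512.
for (int i = 0; i < n; i += SIMD_LEN_D)
{
    vec_d av = vec_loadu_d(a + i);   // unaligned load of SIMD_LEN_D doubles
    vec_d bv = vec_loadu_d(b + i);
    vec_storeu_d(c + i, vec_mul_d(av, bv));
}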

The beginning of x86_intrin_wrapper.h lists all supported functions and their usage; please read it before using VWFs.

Specifically for BKM functions, H2Pack guarantees that:

  • n{0,1} are multiples of SIMD_LEN_{S,D};
  • The lengths of x_{in,out}_{0,1} are multiples of (SIMD_LEN_{S,D} * krnl_dim);
  • The addresses of coord{0,1} and x_{in,out}_{0,1} are aligned to (SIMD_LEN_{S,D} * sizeof(DTYPE)) bytes.

Therefore, you don't need to handle remainder loops in BKM functions.
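The sketch below spells out what these guarantees mean for a double-precision BKM function; the helper names are hypothetical and not part of the H2Pack API:

#include <stdint.h>
#include <assert.h>

// Hypothetical: round n up to the next multiple of simd_len; the padded
// sizes H2Pack passes to BKM functions satisfy this relation
static int round_up_simd(const int n, const int simd_len)
{
    return (n + simd_len - 1) / simd_len * simd_len;
}

// Hypothetical: entry conditions a double-precision BKM function can rely on
static void check_bkm_guarantees_d(const double *coord0, const int n0)
{
    assert(n0 % SIMD_LEN_D == 0);                                       // padded length
    assert(((uintptr_t) coord0) % (SIMD_LEN_D * sizeof(double)) == 0);  // aligned address
}

These guarantees are why the BKM example below can use the aligned vec_load_d / vec_store_d throughout, while the KME example further down has no such guarantees and must use the unaligned vec_loadu_d / vec_storeu_d plus a scalar remainder loop.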

Examples

A BKM function using VWFs for the 3D Gaussian kernel:

void Gaussian_3D_bimv_intrin_d(
    const DTYPE *coord0, const int ld0, const int n0,
    const DTYPE *coord1, const int ld1, const int n1,
    const void *krnl_param, 
    const DTYPE *x_in_0, const DTYPE *x_in_1, 
    DTYPE * restrict x_out_0, DTYPE * restrict x_out_1
)
{
    const DTYPE *x0 = coord0 + ld0 * 0, *x1 = coord1 + ld1 * 0;
    const DTYPE *y0 = coord0 + ld0 * 1, *y1 = coord1 + ld1 * 1;
    const DTYPE *z0 = coord0 + ld0 * 2, *z1 = coord1 + ld1 * 2;
    const DTYPE *krnl_param_ = (const DTYPE *) krnl_param;
    const DTYPE l = krnl_param_[0];
    const vec_d l_v = vec_set1_d(l);
    const vec_d zero_v = vec_zero_d();
    for (int i = 0; i < n0; i++)
    {
        vec_d sum_iv = zero_v;
        const vec_d x0_iv = vec_bcast_d(x0 + i);
        const vec_d y0_iv = vec_bcast_d(y0 + i);
        const vec_d z0_iv = vec_bcast_d(z0 + i);
        const vec_d x_in_1_iv = vec_bcast_d(x_in_1 + i);
        for (int j = 0; j < n1; j += SIMD_LEN_D)
        {
            // Calculate dx, dy, dz, and r2 = dx^2 + dy^2 + dz^2
            vec_d dx_v = vec_sub_d(x0_iv, vec_load_d(x1 + j));
            vec_d dy_v = vec_sub_d(y0_iv, vec_load_d(y1 + j));
            vec_d dz_v = vec_sub_d(z0_iv, vec_load_d(z1 + j));
            vec_d r2_v = vec_mul_d(dx_v, dx_v);
            r2_v = vec_fmadd_d(dy_v, dy_v, r2_v);
            r2_v = vec_fmadd_d(dz_v, dz_v, r2_v);
            
            // Calculate kval = exp(-l * r2)
            vec_d tmp_v  = vec_fnmadd_d(l_v, r2_v, zero_v);
            vec_d kval_v = vec_exp_d(tmp_v);
            
            // Multiply kval with x_in_0[j] and accumulate the result
            sum_iv = vec_fmadd_d(kval_v, vec_load_d(x_in_0 + j), sum_iv);
            
            // Multiply kval with x_in_1[i] and update x_out_1[j]
            vec_d x_out_1_jv = vec_load_d(x_out_1 + j);
            x_out_1_jv = vec_fmadd_d(x_in_1_iv, kval_v, x_out_1_jv);
            vec_store_d(x_out_1 + j, x_out_1_jv);
        }
        x_out_0[i] += vec_reduce_add_d(sum_iv);
    }
}
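For testing, a scalar reference version of the same operation makes explicit what the intrinsic code computes: x_out_0 += K * x_in_0 and x_out_1 += K^T * x_in_1, where K(i,j) = exp(-l * |p_i - q_j|^2). Below is a sketch under the same calling convention (Gaussian_3D_bimv_ref_d is a hypothetical name; exp comes from <math.h>):

void Gaussian_3D_bimv_ref_d(
    const DTYPE *coord0, const int ld0, const int n0,
    const DTYPE *coord1, const int ld1, const int n1,
    const void *krnl_param,
    const DTYPE *x_in_0, const DTYPE *x_in_1,
    DTYPE * restrict x_out_0, DTYPE * restrict x_out_1
)
{
    const DTYPE *x0 = coord0 + ld0 * 0, *x1 = coord1 + ld1 * 0;
    const DTYPE *y0 = coord0 + ld0 * 1, *y1 = coord1 + ld1 * 1;
    const DTYPE *z0 = coord0 + ld0 * 2, *z1 = coord1 + ld1 * 2;
    const DTYPE l = ((const DTYPE *) krnl_param)[0];
    for (int i = 0; i < n0; i++)
    {
        DTYPE sum_i = 0;
        for (int j = 0; j < n1; j++)
        {
            DTYPE dx = x0[i] - x1[j];
            DTYPE dy = y0[i] - y1[j];
            DTYPE dz = z0[i] - z1[j];
            DTYPE kval = exp(-l * (dx * dx + dy * dy + dz * dz));
            sum_i      += kval * x_in_0[j];  // x_out_0[i] accumulates K(i,j) * x_in_0[j]
            x_out_1[j] += kval * x_in_1[i];  // x_out_1[j] accumulates K(i,j) * x_in_1[i]
        }
        x_out_0[i] += sum_i;
    }
}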

A KME function using VWFs for the 3D Gaussian kernel:

void Gaussian_3D_eval_intrin_d(
    const DTYPE *coord0, const int ld0, const int n0, 
    const DTYPE *coord1, const int ld1, const int n1, 
    const void *param, DTYPE *restrict mat, const int ldm
)
{
    const DTYPE *x0 = coord0 + ld0 * 0, *x1 = coord1 + ld1 * 0;
    const DTYPE *y0 = coord0 + ld0 * 1, *y1 = coord1 + ld1 * 1;
    const DTYPE *z0 = coord0 + ld0 * 2, *z1 = coord1 + ld1 * 2;
    const int n1_vec = (n1 / SIMD_LEN_D) * SIMD_LEN_D;  // largest multiple of SIMD_LEN_D <= n1
    const DTYPE *param_ = (const DTYPE *) param;
    const DTYPE neg_l = -param_[0];
    const vec_d neg_l_v = vec_set1_d(neg_l);
    for (int i = 0; i < n0; i++)
    {
        DTYPE *mat_irow = mat + i * ldm;
        
        vec_d x0_iv = vec_bcast_d(x0 + i);
        vec_d y0_iv = vec_bcast_d(y0 + i);
        vec_d z0_iv = vec_bcast_d(z0 + i);
        for (int j = 0; j < n1_vec; j += SIMD_LEN_D)
        {
            vec_d dx = vec_sub_d(x0_iv, vec_loadu_d(x1 + j));
            vec_d dy = vec_sub_d(y0_iv, vec_loadu_d(y1 + j));
            vec_d dz = vec_sub_d(z0_iv, vec_loadu_d(z1 + j));
            
            vec_d r2 = vec_mul_d(dx, dx);
            r2 = vec_fmadd_d(dy, dy, r2);
            r2 = vec_fmadd_d(dz, dz, r2);
            
            r2 = vec_exp_d(vec_mul_d(neg_l_v, r2));
            
            vec_storeu_d(mat_irow + j, r2);
        }
        
        const DTYPE x0_i = x0[i];
        const DTYPE y0_i = y0[i];
        const DTYPE z0_i = z0[i];
        // Scalar remainder loop for the last (n1 - n1_vec) columns
        for (int j = n1_vec; j < n1; j++)
        {
            DTYPE dx = x0_i - x1[j];
            DTYPE dy = y0_i - y1[j];
            DTYPE dz = z0_i - z1[j];
            DTYPE r2 = dx * dx + dy * dy + dz * dz;
            mat_irow[j] = exp(neg_l * r2);
        }
    }
}
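To sanity-check a hand-vectorized KME function, compare its output against a plain scalar evaluation. A minimal sketch, assuming DTYPE is double and Gaussian_3D_eval_intrin_d above is in scope (the KME uses unaligned loads and a remainder loop, so ordinary malloc and an arbitrary n1 are fine; expect a tiny nonzero difference if vec_exp_d uses a vectorized exp approximation rather than libm's exp):

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

int main(void)
{
    const int n0 = 100, n1 = 103;  // n1 deliberately not a multiple of SIMD_LEN_D
    double param[1] = { 0.5 };     // Gaussian parameter l
    double *coord0 = malloc(sizeof(double) * 3 * n0);
    double *coord1 = malloc(sizeof(double) * 3 * n1);
    double *mat    = malloc(sizeof(double) * n0 * n1);
    for (int i = 0; i < 3 * n0; i++) coord0[i] = (double) rand() / RAND_MAX;
    for (int i = 0; i < 3 * n1; i++) coord1[i] = (double) rand() / RAND_MAX;

    Gaussian_3D_eval_intrin_d(coord0, n0, n0, coord1, n1, n1, param, mat, n1);

    // Recompute every entry with scalar arithmetic and track the largest difference
    double max_err = 0.0;
    for (int i = 0; i < n0; i++)
    {
        for (int j = 0; j < n1; j++)
        {
            double dx = coord0[0 * n0 + i] - coord1[0 * n1 + j];
            double dy = coord0[1 * n0 + i] - coord1[1 * n1 + j];
            double dz = coord0[2 * n0 + i] - coord1[2 * n1 + j];
            double ref = exp(-param[0] * (dx * dx + dy * dy + dz * dz));
            double err = fabs(mat[i * n1 + j] - ref);
            if (err > max_err) max_err = err;
        }
    }
    printf("max abs error = %e\n", max_err);

    free(coord0); free(coord1); free(mat);
    return 0;
}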