|
36 | 36 | #include "ggml.h" |
37 | 37 | #include "ggml-backend-impl.h" |
38 | 38 |
|
39 | | -#define GGML_COMMON_IMPL_SYCL |
40 | | -#include "ggml-common.h" |
41 | | - |
42 | 39 | /* |
43 | 40 | Following definition copied from DPCT head files, which are used by ggml-sycl.cpp |
44 | 41 | */ |
@@ -3147,6 +3144,7 @@ namespace dpct |
3147 | 3144 |
|
3148 | 3145 | } // COPY from DPCT head files |
3149 | 3146 |
|
| 3147 | +#define GGML_COMMON_DECL_SYCL |
3150 | 3148 | #define GGML_COMMON_IMPL_SYCL |
3151 | 3149 | #include "ggml-common.h" |
3152 | 3150 |
|
@@ -3315,66 +3313,6 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0, |
3315 | 3313 | const float *src1_dd, float *dst_dd, |
3316 | 3314 | const dpct::queue_ptr &main_stream); |
3317 | 3315 |
|
3318 | | -// QK = number of values after dequantization |
3319 | | -// QR = QK / number of values before dequantization |
3320 | | -// QI = number of 32 bit integers before dequantization |
3321 | | - |
3322 | | -#define QK4_0 32 |
3323 | | -#define QR4_0 2 |
3324 | | -#define QI4_0 (QK4_0 / (4 * QR4_0)) |
3325 | | -typedef struct dpct_type_block_q4_0 { |
3326 | | - sycl::half d; // delta |
3327 | | - uint8_t qs[QK4_0 / 2]; // nibbles / quants |
3328 | | -} block_q4_0; |
3329 | | -static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); |
3330 | | - |
3331 | | -#define QK4_1 32 |
3332 | | -#define QR4_1 2 |
3333 | | -#define QI4_1 (QK4_1 / (4 * QR4_1)) |
3334 | | -typedef struct dpct_type_block_q4_1 { |
3335 | | - sycl::half2 dm; // dm.x = delta, dm.y = min |
3336 | | - uint8_t qs[QK4_1 / 2]; // nibbles / quants |
3337 | | -} block_q4_1; |
3338 | | -static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); |
3339 | | - |
3340 | | -#define QK5_0 32 |
3341 | | -#define QR5_0 2 |
3342 | | -#define QI5_0 (QK5_0 / (4 * QR5_0)) |
3343 | | -typedef struct dpct_type_block_q5_0 { |
3344 | | - sycl::half d; // delta |
3345 | | - uint8_t qh[4]; // 5-th bit of quants |
3346 | | - uint8_t qs[QK5_0 / 2]; // nibbles / quants |
3347 | | -} block_q5_0; |
3348 | | -static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); |
3349 | | - |
3350 | | -#define QK5_1 32 |
3351 | | -#define QR5_1 2 |
3352 | | -#define QI5_1 (QK5_1 / (4 * QR5_1)) |
3353 | | -typedef struct dpct_type_block_q5_1 { |
3354 | | - sycl::half2 dm; // dm.x = delta, dm.y = min |
3355 | | - uint8_t qh[4]; // 5-th bit of quants |
3356 | | - uint8_t qs[QK5_1 / 2]; // nibbles / quants |
3357 | | -} block_q5_1; |
3358 | | -static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); |
3359 | | - |
3360 | | -#define QK8_0 32 |
3361 | | -#define QR8_0 1 |
3362 | | -#define QI8_0 (QK8_0 / (4 * QR8_0)) |
3363 | | -typedef struct dpct_type_block_q8_0 { |
3364 | | - sycl::half d; // delta |
3365 | | - int8_t qs[QK8_0]; // quants |
3366 | | -} block_q8_0; |
3367 | | -static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); |
3368 | | - |
3369 | | -#define QK8_1 32 |
3370 | | -#define QR8_1 1 |
3371 | | -#define QI8_1 (QK8_1 / (4 * QR8_1)) |
3372 | | -typedef struct dpct_type_block_q8_1 { |
3373 | | - sycl::half2 ds; // ds.x = delta, ds.y = sum |
3374 | | - int8_t qs[QK8_0]; // quants |
3375 | | -} block_q8_1; |
3376 | | -static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding"); |
3377 | | - |
3378 | 3316 | typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); |
3379 | 3317 | typedef void (*allocate_tiles_sycl_t)(int **x_ql, sycl::half2 **x_dm, |
3380 | 3318 | int **x_qh, int **x_sc); |
@@ -3410,93 +3348,6 @@ typedef struct dpct_type_block_q2_K { |
3410 | 3348 | } block_q2_K; |
3411 | 3349 | static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); |
3412 | 3350 |
|
3413 | | -#define QR3_K 4 |
3414 | | -#define QI3_K (QK_K / (4*QR3_K)) |
3415 | | -typedef struct dpct_type_block_q3_K { |
3416 | | - uint8_t hmask[QK_K/8]; // quants - high bit |
3417 | | - uint8_t qs[QK_K/4]; // quants - low 2 bits |
3418 | | -#ifdef GGML_QKK_64 |
3419 | | - uint8_t scales[2]; // scales, quantized with 8 bits |
3420 | | -#else |
3421 | | - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits |
3422 | | -#endif |
3423 | | - sycl::half d; // super-block scale |
3424 | | -} block_q3_K; |
3425 | | -//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding"); |
3426 | | - |
3427 | | -#define QR4_K 2 |
3428 | | -#define QI4_K (QK_K / (4*QR4_K)) |
3429 | | -#ifdef GGML_QKK_64 |
3430 | | -typedef struct { |
3431 | | - sycl::half dm[2]; // super-block scales/mins |
3432 | | - uint8_t scales[2]; // 4-bit block scales/mins |
3433 | | - uint8_t qs[QK_K/2]; // 4--bit quants |
3434 | | -} block_q4_K; |
3435 | | -static_assert(sizeof(block_q4_K) == sizeof(sycl::half2) + QK_K/2 + 2, "wrong q4_K block size/padding"); |
3436 | | -#else |
3437 | | -typedef struct dpct_type_block_q4_K { |
3438 | | - sycl::half2 dm; // super-block scale for quantized scales/mins |
3439 | | - uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits |
3440 | | - uint8_t qs[QK_K/2]; // 4--bit quants |
3441 | | -} block_q4_K; |
3442 | | -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); |
3443 | | -#endif |
3444 | | - |
3445 | | -#define QR5_K 2 |
3446 | | -#define QI5_K (QK_K / (4*QR5_K)) |
3447 | | -#ifdef GGML_QKK_64 |
3448 | | -typedef struct { |
3449 | | - sycl::half d; // super-block scale |
3450 | | - int8_t scales[QK_K/16]; // block scales |
3451 | | - uint8_t qh[QK_K/8]; // quants, high bit |
3452 | | - uint8_t qs[QK_K/2]; // quants, low 4 bits |
3453 | | -} block_q5_K; |
3454 | | -static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); |
3455 | | -#else |
3456 | | -typedef struct dpct_type_block_q5_K { |
3457 | | - sycl::half2 dm; // super-block scale for quantized scales/mins |
3458 | | - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits |
3459 | | - uint8_t qh[QK_K/8]; // quants, high bit |
3460 | | - uint8_t qs[QK_K/2]; // quants, low 4 bits |
3461 | | -} block_q5_K; |
3462 | | -static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); |
3463 | | -#endif |
3464 | | - |
3465 | | -#define QR6_K 2 |
3466 | | -#define QI6_K (QK_K / (4*QR6_K)) |
3467 | | -typedef struct dpct_type_block_q6_K { |
3468 | | - uint8_t ql[QK_K/2]; // quants, lower 4 bits |
3469 | | - uint8_t qh[QK_K/4]; // quants, upper 2 bits |
3470 | | - int8_t scales[QK_K/16]; // scales |
3471 | | - sycl::half d; // delta |
3472 | | -} block_q6_K; |
3473 | | -static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); |
3474 | | - |
3475 | | -#define QR2_XXS 8 |
3476 | | -#define QI2_XXS (QK_K / (4*QR2_XXS)) |
3477 | | -typedef struct dpct_type_block_iq2_xxs { |
3478 | | - sycl::half d; |
3479 | | - uint16_t qs[QK_K/8]; |
3480 | | -} block_iq2_xxs; |
3481 | | -static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); |
3482 | | - |
3483 | | -#define QR2_XS 8 |
3484 | | -#define QI2_XS (QK_K / (4*QR2_XS)) |
3485 | | -typedef struct dpct_type_block_iq2_xs { |
3486 | | - sycl::half d; |
3487 | | - uint16_t qs[QK_K/8]; |
3488 | | - uint8_t scales[QK_K/32]; |
3489 | | -} block_iq2_xs; |
3490 | | -static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); |
3491 | | - |
3492 | | -#define QR3_XXS 8 |
3493 | | -#define QI3_XXS (QK_K / (4*QR3_XXS)) |
3494 | | -typedef struct dpct_type_block_iq3_xxs { |
3495 | | - sycl::half d; |
3496 | | - uint8_t qs[3*(QK_K/8)]; |
3497 | | -} block_iq3_xxs; |
3498 | | -static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); |
3499 | | - |
3500 | 3351 | #define WARP_SIZE 32 |
3501 | 3352 | #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses |
3502 | 3353 |
|
|
0 commit comments