@@ -32,14 +32,22 @@ typedef half ggml_fp16_t;
3232#endif
3333#endif
3434
35+ // QK = number of values after dequantization
36+ // QR = QK / number of values before dequantization
37+ // QI = number of 32-bit integers before dequantization, i.e. QK / (4 * QR)
38+
3539#define QK4_0 32
40+ #define QI4_0 (QK4_0 / (4 * QR4_0)) // = 32 / (4 * 2) = 4; QR4_0 is defined on the next line, which is fine because macro bodies are expanded only at the point of use
41+ #define QR4_0 2
3642typedef struct {
3743 ggml_fp16_t d; // delta
3844 uint8_t qs[QK4_0 / 2 ]; // nibbles / quants: QK4_0 / 2 bytes, i.e. two 4-bit quants per byte
3945} block_q4_0;
4046static_assert (sizeof (block_q4_0) == sizeof (ggml_fp16_t ) + QK4_0 / 2 , " wrong q4_0 block size/padding" ); // compile-time guard: the struct must be exactly the size of its two fields, with no padding
4147
4248#define QK4_1 32
49+ #define QI4_1 (QK4_1 / (4 * QR4_1)) // = 32 / (4 * 2) = 4
50+ #define QR4_1 2 // referenced by QI4_1 above; the later definition is fine because macros are expanded at the point of use
4351typedef struct {
4452 ggml_fp16_t d; // delta
4553 ggml_fp16_t m; // min
@@ -48,6 +56,8 @@ typedef struct {
4856static_assert (sizeof (block_q4_1) == 2 * sizeof (ggml_fp16_t ) + QK4_1 / 2 , " wrong q4_1 block size/padding" ); // compile-time guard: block_q4_1 must be exactly two fp16 values plus QK4_1 / 2 bytes
4957
5058#define QK5_0 32
59+ #define QI5_0 (QK5_0 / (4 * QR5_0)) // = 32 / (4 * 2) = 4
60+ #define QR5_0 2 // referenced by QI5_0 above; the later definition is fine because macros are expanded at the point of use
5161typedef struct {
5262 ggml_fp16_t d; // delta
5363 uint8_t qh[4 ]; // 5-th bit of quants
@@ -56,6 +66,8 @@ typedef struct {
5666static_assert (sizeof (block_q5_0) == sizeof (ggml_fp16_t ) + sizeof (uint32_t ) + QK5_0 / 2 , " wrong q5_0 block size/padding" ); // compile-time guard: one fp16, 4 bytes (sizeof(uint32_t)) and QK5_0 / 2 bytes, no padding
5767
5868#define QK5_1 32
69+ #define QI5_1 (QK5_1 / (4 * QR5_1)) // = 32 / (4 * 2) = 4
70+ #define QR5_1 2 // referenced by QI5_1 above; the later definition is fine because macros are expanded at the point of use
5971typedef struct {
6072 ggml_fp16_t d; // delta
6173 ggml_fp16_t m; // min
@@ -65,13 +77,17 @@ typedef struct {
6577static_assert (sizeof (block_q5_1) == 2 * sizeof (ggml_fp16_t ) + sizeof (uint32_t ) + QK5_1 / 2 , " wrong q5_1 block size/padding" ); // compile-time guard: two fp16 values, 4 bytes (sizeof(uint32_t)) and QK5_1 / 2 bytes, no padding
6678
6779#define QK8_0 32
80+ #define QI8_0 (QK8_0 / (4 * QR8_0)) // = 32 / (4 * 1) = 8; QR8_0 is defined on the next line, which is fine because macro bodies are expanded only at the point of use
81+ #define QR8_0 1
6882typedef struct {
6983 ggml_fp16_t d; // delta
7084 int8_t qs[QK8_0]; // quants: one signed byte per value
7185} block_q8_0;
7286static_assert (sizeof (block_q8_0) == sizeof (ggml_fp16_t ) + QK8_0, " wrong q8_0 block size/padding" ); // compile-time guard: the struct must be exactly the size of its two fields, with no padding
7387
7488#define QK8_1 32
89+ #define QI8_1 (QK8_1 / (4 * QR8_1)) // = 32 / (4 * 1) = 8
90+ #define QR8_1 1 // referenced by QI8_1 above; the later definition is fine because macros are expanded at the point of use
7591typedef struct {
7692 float d; // delta
7793 float s; // d * sum(qs[i])
@@ -96,6 +112,8 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
96112// weight is represented as x = a * q + b
97113// 16 blocks of 16 elements each
98114// Effectively 2.625 bits per weight
115+ #define QI2_K (QK_K / (4 *QR2_K)) // = QK_K / 16 with QR2_K == 4
116+ #define QR2_K 4 // referenced by QI2_K above; the later definition is fine because macros are expanded at the point of use
99117typedef struct {
100118 uint8_t scales[QK_K/16 ]; // scales and mins, quantized with 4 bits
101119 uint8_t qs[QK_K/4 ]; // quants
@@ -108,6 +126,8 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
108126// weight is represented as x = a * q
109127// 16 blocks of 16 elements each
110128// Effectively 3.4375 bits per weight
129+ #define QI3_K (QK_K / (4 *QR3_K)) // = QK_K / 16 with QR3_K == 4
130+ #define QR3_K 4 // referenced by QI3_K above; the later definition is fine because macros are expanded at the point of use
111131#ifdef GGML_QKK_64
112132typedef struct {
113133 uint8_t hmask[QK_K/8 ]; // quants - high bit
@@ -130,6 +150,8 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
130150// 8 blocks of 32 elements each
131151// weight is represented as x = a * q + b
132152// Effectively 4.5 bits per weight
153+ #define QI4_K (QK_K / (4 *QR4_K)) // = QK_K / 8 with QR4_K == 2
154+ #define QR4_K 2 // referenced by QI4_K above; the later definition is fine because macros are expanded at the point of use
133155#ifdef GGML_QKK_64
134156typedef struct {
135157 ggml_fp16_t d[2 ]; // super-block scales/mins
@@ -151,6 +173,8 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
151173// 8 blocks of 32 elements each
152174// weight is represented as x = a * q + b
153175// Effectively 5.5 bits per weight
176+ #define QI5_K (QK_K / (4 *QR5_K)) // = QK_K / 8 with QR5_K == 2
177+ #define QR5_K 2 // referenced by QI5_K above; the later definition is fine because macros are expanded at the point of use
154178#ifdef GGML_QKK_64
155179typedef struct {
156180 ggml_fp16_t d; // super-block scale
@@ -174,6 +198,8 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
174198// weight is represented as x = a * q
175199// 16 blocks of 16 elements each
176200// Effectively 6.5625 bits per weight
201+ #define QI6_K (QK_K / (4 *QR6_K)) // = QK_K / 8 with QR6_K == 2
202+ #define QR6_K 2 // referenced by QI6_K above; the later definition is fine because macros are expanded at the point of use
177203typedef struct {
178204 uint8_t ql[QK_K/2 ]; // quants, lower 4 bits
179205 uint8_t qh[QK_K/4 ]; // quants, upper 2 bits
@@ -193,13 +219,17 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
193219// (Almost) "true" 2-bit quantization.
194220// Due to the need to use blocks as per ggml design, it ends up using
195221// 2.0625 bpw because of the 16-bit scale for each block of 256.
222+ #define QI2_XXS (QK_K / (4 *QR2_XXS)) // = QK_K / 32 with QR2_XXS == 8; QR2_XXS is defined on the next line, which is fine because macro bodies are expanded only at the point of use
223+ #define QR2_XXS 8
196224typedef struct {
197225 ggml_fp16_t d; // presumably the per-block scale (other blocks in this file label d as "delta") - confirm
198226 uint16_t qs[QK_K/8 ]; // QK_K / 8 16-bit words
199227} block_iq2_xxs;
200228static_assert (sizeof (block_iq2_xxs) == sizeof (ggml_fp16_t ) + QK_K/8 *sizeof (uint16_t ), " wrong iq2_xxs block size/padding" ); // compile-time guard: the struct must be exactly the size of its two fields, with no padding
201229
202230// 2.3125 bpw quants
231+ #define QI2_XS (QK_K / (4 *QR2_XS)) // = QK_K / 32 with QR2_XS == 8
232+ #define QR2_XS 8 // referenced by QI2_XS above; the later definition is fine because macros are expanded at the point of use
203233typedef struct {
204234 ggml_fp16_t d;
205235 uint16_t qs[QK_K/8 ];
@@ -208,6 +238,8 @@ typedef struct {
208238static_assert (sizeof (block_iq2_xs) == sizeof (ggml_fp16_t ) + QK_K/8 *sizeof (uint16_t ) + QK_K/32 , " wrong iq2_xs block size/padding" ); // compile-time guard: one fp16, QK_K / 8 16-bit words and QK_K / 32 further bytes, no padding
209239
210240// 2.5625 bpw quants
241+ #define QI2_S (QK_K / (4 *QR2_S)) // = QK_K / 32 with QR2_S == 8
242+ #define QR2_S 8 // referenced by QI2_S above; the later definition is fine because macros are expanded at the point of use
211243typedef struct {
212244 ggml_fp16_t d;
213245 uint8_t qs[QK_K/4 ];
@@ -219,6 +251,8 @@ static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wr
219251// (Almost) "true" 3-bit quantization.
220252// Due to the need to use blocks as per ggml design, it ends up using
221253// 3.0625 bpw because of the 16-bit scale for each block of 256.
254+ #define QI3_XXS (QK_K / (4 *QR3_XXS)) // = QK_K / 32 with QR3_XXS == 8
255+ #define QR3_XXS 8 // referenced by QI3_XXS above; the later definition is fine because macros are expanded at the point of use
222256typedef struct {
223257 ggml_fp16_t d;
224258 uint8_t qs[3 *QK_K/8 ];
@@ -231,6 +265,8 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
231265#else
232266#define IQ3S_N_SCALE QK_K/64 // NOTE(review): the expansion is not parenthesized; wrap it as (QK_K/64) if it is ever used next to an operator of equal or higher precedence
233267#endif
268+ #define QI3_XS (QK_K / (4 *QR3_XS)) // = QK_K / 32 with QR3_XS == 8; NOTE(review): named _XS although the struct that follows is block_iq3_s - confirm the naming is intended
269+ #define QR3_XS 8 // referenced by QI3_XS above; the later definition is fine because macros are expanded at the point of use
234270typedef struct {
235271 ggml_fp16_t d;
236272 uint8_t qs[QK_K/4 ];
@@ -240,6 +276,8 @@ typedef struct {
240276} block_iq3_s;
241277static_assert (sizeof (block_iq3_s) == sizeof (ggml_fp16_t ) + 13 *(QK_K/32 ) + IQ3S_N_SCALE, " wrong iq3_s block size/padding" ); // compile-time guard on the total byte size of block_iq3_s; IQ3S_N_SCALE is selected by the preprocessor conditional above
242278
279+ #define QI1_S (QK_K / (4 *QR1_S)) // = QK_K / 32 with QR1_S == 8
280+ #define QR1_S 8 // referenced by QI1_S above; the later definition is fine because macros are expanded at the point of use
243281typedef struct {
244282 ggml_fp16_t d;
245283 uint8_t qs[QK_K/8 ];
@@ -249,6 +287,8 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wr
249287
250288// Non-linear quants
251289#define QK4_NL 32
290+ #define QI4_NL (QK4_NL / (4 *QR4_NL)) // = 32 / (4 * 2) = 4
291+ #define QR4_NL 2 // referenced by QI4_NL above; the later definition is fine because macros are expanded at the point of use
252292typedef struct {
253293 ggml_fp16_t d;
254294 uint8_t qs[QK4_NL/2 ];
@@ -257,8 +297,12 @@ static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4
257297
258298#if QK_K == 64
259299#define block_iq4_xs block_iq4_nl // when QK_K == 64 the iq4_xs block type is only an alias for block_iq4_nl
300+ #define QI4_XS QI4_NL // the QI constant follows the aliased type
301+ #define QR4_XS QR4_NL // the QR constant follows the aliased type
260302// typedef struct block_iq4_nl block_iq4_xs;
261303#else
304+ #define QI4_XS (QK_K / (4 *QR4_XS)) // = QK_K / 32 with QR4_XS == 8
305+ #define QR4_XS 8 // NOTE(review): the QK_K == 64 branch resolves QR4_XS to QR4_NL (2) while this branch uses 8 - confirm the difference is intended
262306typedef struct {
263307 ggml_fp16_t d;
264308 uint16_t scales_h;
0 commit comments