@@ -18241,14 +18241,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1824118241 // new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
1824218242 // }
1824318243 else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
18244- new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
18244+ new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
1824518245 }
1824618246 else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
1824718247 new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
1824818248 }
1824918249 else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q4_K;
1825018250 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
18251- new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
18251+ new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1825218252 }
1825318253 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
1825418254 new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
@@ -18270,47 +18270,61 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1827018270 else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1827118271 }
1827218272 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
18273- if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
18273+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
1827418274 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
1827518275 else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1827618276 }
1827718277 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18278- if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
18278+ if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
1827918279 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
1828018280 else new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1828118281 }
1828218282 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
18283- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18283+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
18284+ new_type = GGML_TYPE_Q6_K;
18285+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
1828418286 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1828518287 else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1828618288 }
1828718289 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
18288- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18290+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
18291+ new_type = GGML_TYPE_Q6_K;
18292+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
1828918293 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1829018294 else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1829118295 }
1829218296 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML) {
18293- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18297+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
18298+ new_type = GGML_TYPE_Q6_K;
18299+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
1829418300 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1829518301 else new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1829618302 }
1829718303 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
18298- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18304+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
18305+ new_type = GGML_TYPE_Q6_K;
18306+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
1829918307 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1830018308 else new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1830118309 }
1830218310 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
18303- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18311+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
18312+ new_type = GGML_TYPE_Q6_K;
18313+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
1830418314 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1830518315 else new_type = difquant_six_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1830618316 }
1830718317 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
18308- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18318+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
18319+ new_type = GGML_TYPE_Q6_K;
18320+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
1830918321 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1831018322 else new_type = difquant_seven_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
1831118323 }
1831218324 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_UXL) {
18313- if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18325+ if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
18326+ new_type = GGML_TYPE_Q6_K;
18327+ else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
1831418328 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
1831518329 else new_type = GGML_TYPE_Q5_K;
1831618330 }
0 commit comments