@@ -402,9 +402,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
-    // get hparams kv
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
-
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
         return;
@@ -500,6 +497,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.n_embd_head_v = 0;
     }
 
+    // for differentiating model types
+    uint32_t n_vocab = 0;
+    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
+
     // arch-specific KVs
     switch (arch) {
         case LLM_ARCH_LLAMA:
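The added read relies on short-circuit evaluation: `ml.get_key(...)` returns true only when `LLM_KV_VOCAB_SIZE` is present, so `ml.get_arr_n(...)` runs only as a fallback that takes the length of the tokenizer token list. A minimal standalone sketch of that fallback idiom, using a hypothetical `Loader` stand-in (not the real `llama_model_loader` API; the key strings are illustrative):

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in for the loader: both getters return false (and leave
// the output untouched) when the requested metadata is absent.
struct Loader {
    std::map<std::string, uint32_t> keys;                     // scalar KV metadata
    std::map<std::string, std::vector<std::string>> arrays;   // array KV metadata

    bool get_key(const std::string & name, uint32_t & out) const {
        auto it = keys.find(name);
        if (it == keys.end()) {
            return false;
        }
        out = it->second;
        return true;
    }

    bool get_arr_n(const std::string & name, uint32_t & out) const {
        auto it = arrays.find(name);
        if (it == arrays.end()) {
            return false;
        }
        out = (uint32_t) it->second.size();
        return true;
    }
};

int main() {
    Loader ml;
    // no explicit vocab_size key in this file, only the token list
    ml.arrays["tokenizer.ggml.tokens"] = {"<s>", "</s>", "hello"};

    uint32_t n_vocab = 0;
    // prefer the explicit key; otherwise fall back to the token-list length
    ml.get_key("llama.vocab_size", n_vocab) || ml.get_arr_n("tokenizer.ggml.tokens", n_vocab);

    std::printf("n_vocab = %u\n", n_vocab); // prints: n_vocab = 3
    return 0;
}
```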
@@ -519,7 +520,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 26: type = LLM_TYPE_3B; break;
                     case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
                     // granite uses a vocab with len 49152
-                    case 32: type = hparams.n_vocab == 49152 ? LLM_TYPE_3B : (hparams.n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
+                    case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
                     case 36: type = LLM_TYPE_8B; break; // granite
                     case 40: type = LLM_TYPE_13B; break;
                     case 48: type = LLM_TYPE_34B; break;
@@ -621,7 +622,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
 
                 switch (hparams.n_layer) {
@@ -644,7 +644,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 hparams.f_max_alibi_bias = 8.0f;
 
@@ -658,7 +657,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
 
                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
@@ -1365,8 +1363,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_ff = hparams.n_ff();
     const int64_t n_embd_gqa = n_embd_v_gqa;
-    const int64_t n_vocab = hparams.n_vocab;
-    const int64_t n_vocab_type = hparams.n_vocab_type;
+    const int64_t n_vocab = vocab.n_vocab();
+    const int64_t n_token_types = vocab.n_token_types();
     const int64_t n_rot = hparams.n_rot;
     const int64_t n_expert = hparams.n_expert;
     const int64_t n_expert_used = hparams.n_expert_used;
@@ -1811,7 +1809,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_NOMIC_BERT:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0);
+                type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
 
                 if (arch == LLM_ARCH_BERT) {
                     pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
@@ -1865,7 +1863,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_JINA_BERT_V2:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
-                type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0); // token_type_embeddings
+                type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
 
                 tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
                 tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
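In the two BERT-family cases above, `tok_embd` stores one `n_embd`-sized embedding per vocabulary entry and `type_embd` one per token type (BERT-style segment ID), which is why its second dimension now comes from `vocab.n_token_types()`. A schematic, plain-C++ illustration of how such tables combine at the input layer, assuming a classic BERT setup with learned position embeddings (this is not the ggml graph code, and JINA_BERT_V2 uses ALiBi rather than a position table):

```cpp
#include <cstddef>
#include <vector>

// one table: table[i] is the n_embd-sized embedding for index i
using EmbdTable = std::vector<std::vector<float>>;

// BERT-style input embedding: token + token-type (segment) + position, summed per position
std::vector<std::vector<float>> embed_input(
        const EmbdTable & tok_embd,    // n_vocab       rows
        const EmbdTable & type_embd,   // n_token_types rows
        const EmbdTable & pos_embd,    // n_ctx_train   rows
        const std::vector<int> & token_ids,
        const std::vector<int> & token_types) {
    const size_t n_embd = tok_embd[0].size();
    std::vector<std::vector<float>> out(token_ids.size(), std::vector<float>(n_embd, 0.0f));

    for (size_t i = 0; i < token_ids.size(); ++i) {
        for (size_t d = 0; d < n_embd; ++d) {
            out[i][d] = tok_embd[token_ids[i]][d]
                      + type_embd[token_types[i]][d]
                      + pos_embd[i][d];
        }
    }
    return out; // would then pass through the token-embedding LayerNorm (tok_norm / tok_norm_b)
}
```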
@@ -3494,7 +3492,6 @@ void llama_model::print_info() const {
 
     // hparams
     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
-    LLAMA_LOG_INFO("%s: n_vocab (hp) = %u\n", __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
 
     if (!hparams.vocab_only) {