@@ -402,9 +402,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     // get general kv
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);

-    // get hparams kv
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
-
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
         return;
@@ -500,6 +497,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.n_embd_head_v = 0;
     }

+    // for differentiating model types
+    uint32_t n_vocab = 0;
+    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
+
     // arch-specific KVs
     switch (arch) {
         case LLM_ARCH_LLAMA:
@@ -519,7 +520,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 26: type = LLM_TYPE_3B; break;
                     case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
                     // granite uses a vocab with len 49152
-                    case 32: type = hparams.n_vocab == 49152 ? LLM_TYPE_3B : (hparams.n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
+                    case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
                     case 36: type = LLM_TYPE_8B; break; // granite
                     case 40: type = LLM_TYPE_13B; break;
                     case 48: type = LLM_TYPE_34B; break;
@@ -621,7 +622,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

                 switch (hparams.n_layer) {
@@ -644,7 +644,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 hparams.f_max_alibi_bias = 8.0f;

@@ -658,7 +657,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);

                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
@@ -1369,8 +1367,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_ff          = hparams.n_ff();
     const int64_t n_embd_gqa    = n_embd_v_gqa;
-    const int64_t n_vocab       = hparams.n_vocab;
-    const int64_t n_vocab_type  = hparams.n_vocab_type;
+    const int64_t n_vocab       = vocab.n_vocab();
+    const int64_t n_token_types = vocab.n_token_types();
     const int64_t n_rot         = hparams.n_rot;
     const int64_t n_expert      = hparams.n_expert;
     const int64_t n_expert_used = hparams.n_expert_used;
@@ -1815,7 +1813,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_NOMIC_BERT:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0);
+                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);

                     if (arch == LLM_ARCH_BERT) {
                         pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
@@ -1869,7 +1867,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_JINA_BERT_V2:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
-                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0); // token_type_embeddings
+                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings

                     tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
                     tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
@@ -3553,7 +3551,6 @@ void llama_model::print_info() const {

     // hparams
     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
-    LLAMA_LOG_INFO("%s: n_vocab (hp) = %u\n", __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);

     if (!hparams.vocab_only) {