@@ -857,21 +857,23 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.reranking) {
         bool ok = true;
 
-        if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
             ok = false;
         }
 
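A minimal usage sketch of the vocab-centric checks above; the helper name has_rerank_tokens is hypothetical, while llama_model_get_vocab() and the llama_vocab_* calls are the ones this hunk switches to:

    #include "llama.h"

    // true only if the vocab carries the BOS, EOS and SEP tokens that
    // reranking relies on (each of them may be LLAMA_TOKEN_NULL)
    static bool has_rerank_tokens(const llama_model * model) {
        const llama_vocab * vocab = llama_model_get_vocab(model);

        return llama_vocab_bos(vocab) != LLAMA_TOKEN_NULL &&
               llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL &&
               llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
    }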
@@ -884,7 +886,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     auto cparams = common_context_params_to_llama(params);
 
-    llama_context * lctx = llama_new_context_with_model(model, cparams);
+    llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_model_free(model);
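For context, a hedged end-to-end sketch of the renamed entry points; llama_model_load_from_file() is assumed to be the matching load call from the same API rename and is not shown in this hunk:

    #include "llama.h"

    int main() {
        llama_model_params   mparams = llama_model_default_params();
        llama_context_params cparams = llama_context_default_params();

        // "model.gguf" is a placeholder path
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }

        // llama_init_from_model() replaces llama_new_context_with_model()
        llama_context * lctx = llama_init_from_model(model, cparams);
        if (lctx == NULL) {
            llama_model_free(model);
            return 1;
        }

        llama_free(lctx);
        llama_model_free(model);
        return 0;
    }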
@@ -898,7 +900,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
 
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
@@ -908,12 +910,13 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
-        int err = llama_control_vector_apply(lctx,
-                                             cvec.data.data(),
-                                             cvec.data.size(),
-                                             cvec.n_embd,
-                                             params.control_vector_layer_start,
-                                             params.control_vector_layer_end);
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
             llama_model_free(model);
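A sketch combining llama_model_n_layer() from the previous hunk with the renamed llama_apply_adapter_cvec(); the helper name and the apply-to-all-layers default are illustrative assumptions:

    #include <vector>
    #include "llama.h"

    // apply a flattened control vector (per-layer blocks of n_embd floats) to every layer
    static bool apply_cvec_all_layers(llama_context * lctx, const llama_model * model,
                                      const std::vector<float> & data, int32_t n_embd) {
        const int32_t il_start = 1;
        const int32_t il_end   = llama_model_n_layer(model);

        const int err = llama_apply_adapter_cvec(
                lctx, data.data(), data.size(), n_embd, il_start, il_end);
        return err == 0;
    }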
@@ -924,8 +927,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        llama_lora_adapter_ptr lora;
-        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
@@ -938,17 +941,17 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, params.lora_adapters);
+        common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;
     }
 
     if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
-            if (llama_token_is_eog(model, i)) {
+        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+            if (llama_vocab_is_eog(vocab, i)) {
                 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
                 params.sampling.logit_bias.push_back({i, -INFINITY});
             }
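The end-of-generation scan above also works as a standalone helper; a small sketch against the new vocab API (needs <cmath> for INFINITY and assumes the llama_logit_bias aggregate {token, bias}):

    #include <cmath>
    #include <vector>
    #include "llama.h"

    // collect a -INFINITY logit bias for every end-of-generation token in the vocab
    static std::vector<llama_logit_bias> eog_logit_biases(const llama_model * model) {
        const llama_vocab * vocab = llama_model_get_vocab(model);

        std::vector<llama_logit_bias> biases;
        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
            if (llama_vocab_is_eog(vocab, i)) {
                biases.push_back({i, -INFINITY});
            }
        }
        return biases;
    }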
@@ -969,8 +972,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         std::vector<llama_token> tmp;
-        llama_token bos = llama_token_bos(model);
-        llama_token eos = llama_token_eos(model);
+        llama_token bos = llama_vocab_bos(vocab);
+        llama_token eos = llama_vocab_eos(vocab);
+
         // some models (e.g. T5) don't have a BOS token
         if (bos != LLAMA_TOKEN_NULL) {
             tmp.push_back(bos);
@@ -1005,11 +1009,11 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
-    llama_lora_adapter_clear(ctx);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.ptr, la.scale);
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
         }
     }
 }
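One possible caller-side use of the renamed helper: change a single adapter's scale and re-apply the whole set. The function name is hypothetical; it only relies on the ptr/scale fields of common_adapter_lora_info used in the loop above:

    #include <vector>
    #include "common.h" // assumed to declare common_set_adapter_lora() and common_adapter_lora_info

    // rescale one LoRA adapter at runtime, then re-apply all of them;
    // common_set_adapter_lora() clears the context and re-adds every adapter with scale != 0
    static void set_lora_scale(llama_context * ctx,
                               std::vector<common_adapter_lora_info> & adapters,
                               size_t idx, float scale) {
        if (idx >= adapters.size()) {
            return;
        }
        adapters[idx].scale = scale;
        common_set_adapter_lora(ctx, adapters);
    }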
@@ -1559,21 +1563,23 @@ std::vector<llama_token> common_tokenize(
         const std::string & text,
         bool add_special,
         bool parse_special) {
-    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
 }
 
 std::vector<llama_token> common_tokenize(
-    const struct llama_model * model,
+    const struct llama_vocab * vocab,
     const std::string & text,
     bool add_special,
     bool parse_special) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
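Usage sketch for the new vocab-based overload; it only assumes llama_model_get_vocab() and the overload defined above:

    const llama_vocab * vocab = llama_model_get_vocab(model);

    // add_special = true lets the vocab decide on BOS/EOS insertion,
    // parse_special = false keeps special-token text as plain text
    std::vector<llama_token> toks = common_tokenize(vocab, "Hello world", /*add_special=*/true, /*parse_special=*/false);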
@@ -1582,12 +1588,18 @@ std::vector<llama_token> common_tokenize(
 }
 
 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
     if (n_chars < 0) {
         piece.resize(-n_chars);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
         GGML_ASSERT(check == -n_chars);
     }
     else {
@@ -1597,13 +1609,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
     return piece;
 }
 
-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
 
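A quick round-trip sketch with the two vocab-based overloads above (exact whitespace handling is tokenizer dependent):

    const llama_vocab * vocab = llama_model_get_vocab(model);

    const std::vector<llama_token> toks = common_tokenize(vocab, "Hello world", /*add_special=*/false, /*parse_special=*/false);
    const std::string text = common_detokenize(vocab, toks, /*special=*/false);
    // text should reproduce the input, up to tokenizer-specific whitespace trimming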
@@ -1631,7 +1649,7 @@ std::string common_get_builtin_chat_template(const struct llama_model * model) {
 
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
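Usage note: verification now needs only the template string, no model handle; the built-in name "chatml" is assumed to be among the templates llama_chat_apply_template() recognizes:

    const bool ok = common_chat_verify_template("chatml"); // true for any template llama.cpp recognizes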
16371655
@@ -1642,35 +1660,34 @@ std::string common_chat_apply_template(const struct llama_model * model,
16421660 int alloc_size = 0 ;
16431661 bool fallback = false ; // indicate if we must fallback to default chatml
16441662 std::vector<llama_chat_message> chat;
1645- for (auto & msg : msgs) {
1663+ for (const auto & msg : msgs) {
16461664 chat.push_back ({msg.role .c_str (), msg.content .c_str ()});
16471665 alloc_size += (msg.role .size () + msg.content .size ()) * 1.25 ;
16481666 }
16491667
1650- const char * ptr_tmpl = tmpl.empty () ? nullptr : tmpl.c_str ();
1668+ const char * ptr_tmpl = tmpl.empty () ? llama_model_chat_template (model) : tmpl.c_str ();
16511669 std::vector<char > buf (alloc_size);
16521670
16531671 // run the first time to get the total output length
1654- int32_t res = llama_chat_apply_template (model, ptr_tmpl, chat.data (), chat.size (), add_ass, buf.data (), buf.size ());
1672+ int32_t res = llama_chat_apply_template (ptr_tmpl, chat.data (), chat.size (), add_ass, buf.data (), buf.size ());
16551673
16561674 // error: chat template is not supported
16571675 if (res < 0 ) {
16581676 if (ptr_tmpl != nullptr ) {
16591677 // if the custom "tmpl" is not supported, we throw an error
16601678 // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
16611679 throw std::runtime_error (" this custom template is not supported" );
1662- } else {
1663- // If the built-in template is not supported, we default to chatml
1664- res = llama_chat_apply_template (nullptr , " chatml" , chat.data (), chat.size (), add_ass, buf.data (), buf.size ());
1665- fallback = true ;
16661680 }
1681+
1682+ // If the built-in template is not supported, we default to chatml
1683+ res = llama_chat_apply_template (" chatml" , chat.data (), chat.size (), add_ass, buf.data (), buf.size ());
1684+ fallback = true ;
16671685 }
16681686
16691687 // if it turns out that our buffer is too small, we resize it
16701688 if ((size_t ) res > buf.size ()) {
16711689 buf.resize (res);
16721690 res = llama_chat_apply_template (
1673- fallback ? nullptr : model,
16741691 fallback ? " chatml" : ptr_tmpl,
16751692 chat.data (), chat.size (), add_ass, buf.data (), buf.size ());
16761693 }
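A hedged sketch of the resulting behaviour: an empty tmpl now resolves to the model's embedded template via llama_model_chat_template(), the chatml fallback only triggers when no template string is available at all, and any unsupported template that was actually resolved still throws. The message type with role/content fields is assumed from the loop above:

    std::vector<common_chat_msg> msgs = {
        {"user", "hello"},
    };

    std::string prompt;
    try {
        // empty template -> use the model's embedded template, or chatml if the model has none
        prompt = common_chat_apply_template(model, /*tmpl=*/"", msgs, /*add_ass=*/true);
    } catch (const std::runtime_error &) {
        // reached when the resolved template (explicit or embedded) is not supported
    }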