@@ -1005,32 +1005,6 @@ struct llama_server_context
         slot.generated_text += token_str;
         slot.has_next_token = true;
 
-        size_t pos = std::min(slot.sent_count, slot.generated_text.size());
-        const std::string str_test = slot.generated_text.substr(pos);
-        bool is_stop_full = false;
-        size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-        if (stop_pos != std::string::npos) {
-            is_stop_full = true;
-            slot.generated_text.erase(
-                slot.generated_text.begin() + pos + stop_pos,
-                slot.generated_text.end());
-            pos = std::min(slot.sent_count, slot.generated_text.size());
-        } else {
-            is_stop_full = false;
-            stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-        }
-
-        // check if there is any token to predict
-        if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
-            // no send the stop word in the response
-            result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-            slot.sent_count += result.text_to_send.size();
-            // add the token to slot queue and cache
-        }
-        slot.add_token_string(result);
-        if (slot.params.stream) {
-            send_partial_response(slot, result);
-        }
         if (slot.multibyte_pending > 0)
         {
             slot.multibyte_pending -= token_str.size();
@@ -1059,6 +1033,36 @@ struct llama_server_context
             }
         }
 
+        if (slot.multibyte_pending == 0)
+        {
+            size_t pos = std::min(slot.sent_count, slot.generated_text.size());
+            const std::string str_test = slot.generated_text.substr(pos);
+            bool is_stop_full = false;
+            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+            if (stop_pos != std::string::npos) {
+                is_stop_full = true;
+                slot.generated_text.erase(
+                    slot.generated_text.begin() + pos + stop_pos,
+                    slot.generated_text.end());
+                pos = std::min(slot.sent_count, slot.generated_text.size());
+            } else {
+                is_stop_full = false;
+                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+            }
+
+            // check if there is any token to predict
+            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
+                // no send the stop word in the response
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.sent_count += result.text_to_send.size();
+                // add the token to slot queue and cache
+            }
+            slot.add_token_string(result);
+            if (slot.params.stream) {
+                send_partial_response(slot, result);
+            }
+        }
+
         if (slot.multibyte_pending > 0 && !slot.has_next_token)
         {
             slot.has_next_token = true;
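The net effect of the move: the stop-string scan and partial streaming now run only when `slot.multibyte_pending == 0`, i.e. once the accumulated `generated_text` ends on a complete UTF-8 character, so a stop word is never matched (or text streamed out) against a half-emitted multibyte sequence. Below is a minimal standalone sketch of that idea, assuming a tokenizer that can split one UTF-8 character across tokens; `utf8_pending_bytes` is a hypothetical helper written for this illustration, not a function from the server code.

```cpp
// Sketch (not the server's code): only flush text and run stop-string
// checks once the byte stream ends on a complete UTF-8 character.
#include <cstddef>
#include <cstdio>
#include <string>

// Continuation bytes still expected after the last byte of `s`;
// non-zero means `s` ends mid-way through a multibyte sequence.
static size_t utf8_pending_bytes(const std::string & s) {
    size_t pending = 0;
    for (unsigned char c : s) {
        if (pending > 0 && (c & 0xC0) == 0x80) { pending--; continue; } // continuation byte
        if      ((c & 0x80) == 0x00) { pending = 0; } // 1-byte (ASCII)
        else if ((c & 0xE0) == 0xC0) { pending = 1; } // 2-byte lead
        else if ((c & 0xF0) == 0xE0) { pending = 2; } // 3-byte lead
        else if ((c & 0xF8) == 0xF0) { pending = 3; } // 4-byte lead
        else                         { pending = 0; } // invalid byte: reset
    }
    return pending;
}

int main() {
    std::string generated;
    // "é" (U+00E9) is 0xC3 0xA9 in UTF-8; pretend the model emits it one
    // byte per token, which byte-level tokenizers can do.
    const std::string tokens[] = { "hi ", "\xC3", "\xA9", "!" };
    for (const std::string & tok : tokens) {
        generated += tok;
        if (utf8_pending_bytes(generated) == 0) {
            // safe point: this is where the server would scan for stop
            // strings and stream the newly completed text to the client
            std::printf("flush: \"%s\"\n", generated.c_str());
        } else {
            std::printf("hold: mid-sequence, defer stop-string scan\n");
        }
    }
    return 0;
}
```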