@@ -2345,19 +2345,21 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
23452345
23462346 for (const auto & fragment : fragment_buffer) {
23472347 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2348- auto raw_text = fragment. raw_text . substr (fragment. offset , fragment. length ) ;
2348+ std::string text ;
23492349
23502350 // prefix with space if previous is special
23512351 if (tokenizer_add_space_prefix && is_prev_special) {
2352- raw_text = " " + raw_text ;
2352+ text = ' ' ;
23532353 }
23542354
2355+ text += fragment.raw_text .substr (fragment.offset , fragment.length );
2356+
23552357#ifdef PRETOKENIZERDEBUG
2356- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2358+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
23572359#endif
2358- llama_escape_whitespace (raw_text );
2360+ llama_escape_whitespace (text );
23592361 llm_tokenizer_spm_session session (vocab);
2360- session.tokenize (raw_text , output);
2362+ session.tokenize (text , output);
23612363 is_prev_special = false ;
23622364 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
23632365 output.push_back (fragment.token );
@@ -2387,12 +2389,12 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
23872389 }
23882390 for (const auto & fragment : fragment_buffer) {
23892391 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2390- auto raw_text = fragment.raw_text .substr (fragment.offset , fragment.length );
2392+ std::string text = fragment.raw_text .substr (fragment.offset , fragment.length );
23912393
23922394#ifdef PRETOKENIZERDEBUG
2393- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2395+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
23942396#endif
2395- session.tokenize (raw_text , output);
2397+ session.tokenize (text , output);
23962398 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
23972399 session.append (fragment.token , output);
23982400 }
@@ -2414,12 +2416,12 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
24142416
24152417 for (const auto & fragment : fragment_buffer) {
24162418 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2417- auto raw_text = fragment.raw_text .substr (fragment.offset , fragment.length );
2419+ std::string text = fragment.raw_text .substr (fragment.offset , fragment.length );
24182420
24192421#ifdef PRETOKENIZERDEBUG
2420- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2422+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
24212423#endif
2422- session.tokenize (raw_text , output);
2424+ session.tokenize (text , output);
24232425 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
24242426 output.push_back (fragment.token );
24252427 }
@@ -2440,11 +2442,11 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
24402442
24412443 for (const auto & fragment : fragment_buffer) {
24422444 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2443- auto raw_text = fragment.raw_text .substr (fragment.offset , fragment.length );
2445+ std::string text = fragment.raw_text .substr (fragment.offset , fragment.length );
24442446#ifdef PRETOKENIZERDEBUG
2445- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2447+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
24462448#endif
2447- session.tokenize (raw_text , output);
2449+ session.tokenize (text , output);
24482450 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
24492451 output.push_back (fragment.token );
24502452 }
@@ -2467,13 +2469,13 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
24672469 llm_tokenizer_rwkv_session session (vocab, *static_cast <const llm_tokenizer_rwkv *>(tokenizer.get ()));
24682470 for (const auto & fragment : fragment_buffer) {
24692471 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2470- auto raw_text = fragment.raw_text .substr (fragment.offset , fragment.length );
2472+ std::string text = fragment.raw_text .substr (fragment.offset , fragment.length );
24712473
24722474#ifdef PRETOKENIZERDEBUG
2473- LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , raw_text .length (), fragment.offset , fragment.length , raw_text .c_str ());
2475+ LLAMA_LOG_WARN (" TT: (%ld %ld %ld) '%s'\n " , text .length (), fragment.offset , fragment.length , text .c_str ());
24742476#endif
24752477
2476- session.tokenize (raw_text , output);
2478+ session.tokenize (text , output);
24772479 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
24782480 output.push_back (fragment.token );
24792481 }