@@ -1419,7 +1419,7 @@ struct server_context {
         queue_results.send(res);
     }

-    void send_rank(const server_slot & slot, const llama_batch & batch) {
+    void send_rerank(const server_slot & slot, const llama_batch & batch) {
         server_task_result res;
         res.id    = slot.id_task;
         res.error = false;
@@ -1440,19 +1440,19 @@ struct server_context {

                 res.data = json {
                     {"index", slot.index},
-                    {"rank",  -1e6},
+                    {"score", -1e6},
                 };

                 continue;
             }

             res.data = json {
                 {"index", slot.index},
-                {"rank",  embd[0]},
+                {"score", embd[0]},
             };
         }

-        SLT_DBG(slot, "sending rank, res = '%s'\n", res.data.dump().c_str());
+        SLT_DBG(slot, "sending rerank result, res = '%s'\n", res.data.dump().c_str());

         queue_results.send(res);
     }
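
For reference, a minimal standalone sketch (not part of the diff) of the per-document payload that send_rerank() emits. The field names match the code above; -1e6 is the sentinel score used when embeddings could not be retrieved:

    #include <nlohmann/json.hpp>
    #include <iostream>

    using json = nlohmann::json;

    int main() {
        // successful case: embd[0] holds the raw relevance score for the document
        json ok = {
            {"index", 0},     // position of the document in the request
            {"score", 12.25}, // higher score = more relevant to the query
        };

        // failure case: embeddings unavailable, the sentinel ranks the document last
        json failed = {
            {"index", 1},
            {"score", -1e6},
        };

        std::cout << ok.dump() << "\n" << failed.dump() << "\n";
        return 0;
    }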
@@ -1493,6 +1493,9 @@ struct server_context {
         else if (prompt.is_array()) {
             std::vector<json> prompts = prompt;
             if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                // prompts[0] is the question
+                // the rest are the answers/documents
+                SRV_DBG("creating rerank tasks, n_prompts = %d\n", (int) prompts.size() - 1);
                 for (size_t i = 1; i < prompts.size(); i++) {
                     json qd;
                     qd.push_back(prompts[0]);
@@ -1501,6 +1504,7 @@ struct server_context {
                     create_task(data, true, qd);
                 }
             } else {
+                SRV_DBG("creating multi-prompt tasks, n_prompts = %d\n", (int) prompts.size());
                 for (size_t i = 0; i < prompts.size(); i++) {
                     const auto & e = prompts[i];
                     if (e.is_string() || json_is_array_of_numbers(e)) {
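
To make the pairing concrete, here is a small standalone sketch (not part of the diff; the sample strings are invented) of how one rerank prompt array is fanned out into per-document (question, document) tasks, mirroring the loop above:

    #include <nlohmann/json.hpp>
    #include <iostream>
    #include <vector>

    using json = nlohmann::json;

    int main() {
        // prompts[0] is the question, the rest are the answers/documents
        std::vector<json> prompts = { "what is panda?", "hi", "it's a bear" };

        // one task per document: qd = [question, document]
        for (size_t i = 1; i < prompts.size(); i++) {
            json qd;
            qd.push_back(prompts[0]);
            qd.push_back(prompts[i]);
            std::cout << qd.dump() << "\n";
        }
        // prints:
        //   ["what is panda?","hi"]
        //   ["what is panda?","it's a bear"]
        return 0;
    }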
@@ -1965,6 +1969,7 @@ struct server_context {
         // track if this is an embedding or non-embedding batch
         // if we've added sampled tokens above, we are in non-embedding mode
         // -1: none, 0: non-embedding, 1: embedding
+        // TODO: make enum
         int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;

         // next, batch any pending prompts without exceeding n_batch
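
A hypothetical sketch of what the `TODO: make enum` could resolve to; the names are invented here, not from the codebase:

    #include <cstdint>

    // hypothetical replacement for the raw int32_t batch_type (names invented)
    enum server_batch_type : int32_t {
        SERVER_BATCH_TYPE_NONE          = -1, // nothing batched yet
        SERVER_BATCH_TYPE_NON_EMBEDDING =  0, // regular causal decoding
        SERVER_BATCH_TYPE_EMBEDDING     =  1, // embedding / rerank pooling
    };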
@@ -2133,6 +2138,7 @@ struct server_context {
                     slot.n_prompt_tokens_processed = 0;
                 }

+                // non-causal tasks require the entire prompt to fit in the physical batch
                 if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
                     // cannot fit the prompt in the current batch - will try next iter
                     if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
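
The reason for the new comment: pooled (non-causal) outputs such as embeddings and rerank scores are computed over the whole sequence in a single decode call, so the prompt cannot be split across batches. A condensed, self-contained sketch of the guard (stand-in types invented for illustration):

    #include <cstdint>

    // minimal stand-ins for the server types (invented for this sketch)
    enum cmpl_type_t { CMPL_NORMAL, CMPL_EMBEDDING, CMPL_RERANK };

    // pooled (non-causal) prompts must go through decoding in one piece,
    // so the slot is deferred when the prompt does not fit the remaining batch
    static bool must_defer(cmpl_type_t cmpl_type, int32_t batch_n_tokens,
                           int32_t n_prompt_tokens, int32_t n_batch) {
        const bool non_causal = cmpl_type == CMPL_EMBEDDING || cmpl_type == CMPL_RERANK;
        return non_causal && batch_n_tokens + n_prompt_tokens > n_batch;
    }

    int main() {
        // a 512-token rerank prompt does not fit next to 100 already-batched
        // tokens when n_batch = 512 -> deferred to the next iteration
        return must_defer(CMPL_RERANK, 100, 512, 512) ? 0 : 1;
    }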
@@ -2318,7 +2324,7 @@ struct server_context {
                     }

                     if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
-                        send_rank(slot, batch_view);
+                        send_rerank(slot, batch_view);
                         slot.release();
                         slot.i_batch = -1;
                         continue; // continue loop of slots