@@ -279,24 +279,24 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     }
 }
 
-void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
 
     if (src == dst) {
         return;
     }
 
-    if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
-        if (backend->iface.cpy_tensor_async != NULL) {
-            if (backend->iface.cpy_tensor_async(backend, src, dst)) {
-                return;
-            }
+    if (backend_dst->iface.cpy_tensor_async != NULL) {
+        if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
+            return;
         }
     }
 
     size_t nbytes = ggml_nbytes(src);
     if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
+        // wait for src to be ready before copy
+        ggml_backend_synchronize(backend_src);
+        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, nbytes);
     }
     else {
         ggml_backend_tensor_copy(src, dst);
@@ -1304,6 +1304,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         // copy the input tensors to the split backend
         uint64_t copy_start_us = ggml_time_us();
         for (int j = 0; j < split->n_inputs; j++) {
+            ggml_backend_t input_backend = get_allocr_backend(sched, node_allocr(split->inputs[j]));
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
 
@@ -1312,7 +1313,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
 
             // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
             // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
-            ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
+            ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
         }
         //ggml_backend_synchronize(split_backend); // necessary to measure copy time
         int64_t copy_end_us = ggml_time_us();
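
For context, a minimal caller-side sketch of the updated two-backend API (an illustration only, not part of this commit; the wrapper function `copy_tensor_between_backends` and its parameters are assumed for the example):

```c
// Illustrative sketch: using the new two-backend signature of
// ggml_backend_tensor_copy_async introduced in this diff. Only the two
// ggml_backend_* calls come from the patched API; the wrapper is hypothetical.
#include "ggml.h"
#include "ggml-backend.h"

static void copy_tensor_between_backends(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                                         struct ggml_tensor * src, struct ggml_tensor * dst) {
    // enqueue the copy; this uses backend_dst->iface.cpy_tensor_async when available,
    // otherwise it falls back to the synchronized host copy shown in the diff
    ggml_backend_tensor_copy_async(backend_src, backend_dst, src, dst);

    // the copy may still be pending on the destination backend's queue;
    // synchronize it before reading dst from the host
    ggml_backend_synchronize(backend_dst);
}
```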