@@ -302,12 +302,26 @@ static void ggml_backend_copy_cache_ptrs(char **& backend_cache_ptrs, const char
     cudaMemcpy(backend_cache_ptrs, host_cache_ptrs, size*sizeof(char *), cudaMemcpyHostToDevice);
 }
 
-void ggml_backend_copy_k_cache_ptrs(const char ** host_cache_ptrs, size_t size) {
-    ggml_backend_copy_cache_ptrs(k_cache_ptrs, host_cache_ptrs, size);
-}
-
-void ggml_backend_copy_v_cache_ptrs(const char ** host_cache_ptrs, size_t size) {
-    ggml_backend_copy_cache_ptrs(v_cache_ptrs, host_cache_ptrs, size);
+void ggml_backend_copy_kv_cache_ptrs(const int64_t n_layer, const int64_t kv_head, struct ggml_tensor ** kv_kl, struct ggml_tensor ** kv_vl, const int64_t n_embd_k_gqa, const int64_t n_embd_v_gqa, const bool flash_attn) {
+
+    std::vector<const char *> host_k_cache_ptrs;
+    std::vector<const char *> host_v_cache_ptrs;
+    for (int il = 0; il < n_layer; ++il) {
+        // K cache pointer for this layer
+        ggml_tensor * tmp_tensor = kv_kl[il];
+        size_t tmp_offset = (ggml_row_size(kv_kl[il]->type, n_embd_k_gqa))*kv_head;
+        host_k_cache_ptrs.push_back(static_cast<char *>(tmp_tensor->data) + tmp_offset);
+        // V cache pointer for this layer
+        tmp_tensor = kv_vl[il];
+        if (flash_attn) {
+            tmp_offset = (kv_head)*ggml_row_size(kv_vl[il]->type, n_embd_v_gqa);
+        } else {
+            tmp_offset = (kv_head)*ggml_element_size(kv_vl[il]);
+        }
+        host_v_cache_ptrs.push_back(static_cast<char *>(tmp_tensor->data) + tmp_offset);
+    }
+    ggml_backend_copy_cache_ptrs(k_cache_ptrs, host_k_cache_ptrs.data(), host_k_cache_ptrs.size());
+    ggml_backend_copy_cache_ptrs(v_cache_ptrs, host_v_cache_ptrs.data(), host_v_cache_ptrs.size());
 }
 
 static void ggml_cpy_f16_f32_cuda(
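
For context, the new ggml_backend_copy_kv_cache_ptrs gathers one K and one V cache pointer per layer on the host (each offset by kv_head, with the V offset computed row-wise when flash_attn is set and element-wise otherwise) and uploads both pointer arrays to the device-side k_cache_ptrs/v_cache_ptrs through ggml_backend_copy_cache_ptrs, replacing the two separate K/V helpers. Below is a minimal caller sketch, assuming llama.cpp-style per-layer cache vectors and that the new function is declared in a header visible to the caller; the names k_l, v_l, kv_head, and sync_kv_cache_ptrs_to_device are placeholders, not taken from this hunk.

    #include <vector>
    #include "ggml.h"

    // Hypothetical host-side driver: collect the per-layer K/V cache tensors and
    // hand them to the CUDA backend in a single call.
    static void sync_kv_cache_ptrs_to_device(
            std::vector<ggml_tensor *> & k_l,   // per-layer K cache tensors
            std::vector<ggml_tensor *> & v_l,   // per-layer V cache tensors
            int64_t kv_head,                    // current write position in the cache
            int64_t n_embd_k_gqa,
            int64_t n_embd_v_gqa,
            bool    flash_attn) {               // selects row-wise vs element-wise V offset
        ggml_backend_copy_kv_cache_ptrs((int64_t) k_l.size(), kv_head,
                                        k_l.data(), v_l.data(),
                                        n_embd_k_gqa, n_embd_v_gqa, flash_attn);
    }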