@@ -409,6 +409,21 @@ struct ggml_tensor_extra_gpu {
409409 cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
410410};
411411
// Set the current CUDA device, skipping the runtime call if the requested
// device is already active. Returns cudaSuccess or the error from cudaSetDevice.
// this is faster on Windows
// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
inline cudaError_t ggml_cuda_set_device(int device) {
    int current_device;
    CUDA_CHECK(cudaGetDevice(&current_device)); // fixed: '&' was garbled to '¤' (HTML-entity corruption)

    // already on the requested device -> avoid the comparatively expensive driver call
    if (device == current_device) {
        return cudaSuccess;
    }

    // note: removed dead store "current_device = device;" — the local is never read afterwards

    return cudaSetDevice(device);
}
426+
412427static int g_device_count = -1 ;
413428static int g_main_device = 0 ;
414429static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
@@ -5151,7 +5166,7 @@ void ggml_init_cublas() {
51515166 }
51525167
51535168 for (int64_t id = 0 ; id < g_device_count; ++id) {
5154- CUDA_CHECK (cudaSetDevice (id));
5169+ CUDA_CHECK (ggml_cuda_set_device (id));
51555170
51565171 // create cuda streams
51575172 for (int64_t is = 0 ; is < MAX_STREAMS; ++is) {
@@ -5795,7 +5810,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
57955810 size_t src1_asf = 0 ;
57965811 size_t dst_asf = 0 ;
57975812
5798- cudaSetDevice (g_main_device);
5813+ ggml_cuda_set_device (g_main_device);
57995814 const cudaStream_t main_stream = g_cudaStreams[g_main_device][0 ];
58005815
58015816 if (src0_on_device) {
@@ -5940,7 +5955,7 @@ static void ggml_cuda_op_mul_mat(
59405955 const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
59415956 const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
59425957
5943- cudaSetDevice (id);
5958+ ggml_cuda_set_device (id);
59445959 const cudaStream_t stream = g_cudaStreams[id][0 ];
59455960
59465961 if (src0_on_device && src0_is_contiguous) {
@@ -5976,7 +5991,7 @@ static void ggml_cuda_op_mul_mat(
59765991 // if multiple devices are used they need to wait for the main device
59775992 // here an event is recorded that signals that the main device has finished calculating the input data
59785993 if (split && g_device_count > 1 ) {
5979- CUDA_CHECK (cudaSetDevice (g_main_device));
5994+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
59805995 CUDA_CHECK (cudaEventRecord (src0_extra->events [g_main_device][0 ], g_cudaStreams[g_main_device][0 ]));
59815996 }
59825997
@@ -5994,7 +6009,7 @@ static void ggml_cuda_op_mul_mat(
59946009 const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
59956010 const int64_t row_diff = row_high[id] - row_low[id];
59966011
5997- cudaSetDevice (id);
6012+ ggml_cuda_set_device (id);
59986013 const cudaStream_t stream = g_cudaStreams[id][is];
59996014
60006015 // wait for main GPU data if necessary
@@ -6096,7 +6111,7 @@ static void ggml_cuda_op_mul_mat(
60966111 }
60976112
60986113 for (int64_t id = 0 ; id < g_device_count; ++id) {
6099- CUDA_CHECK (cudaSetDevice (id));
6114+ CUDA_CHECK (ggml_cuda_set_device (id));
61006115
61016116 // free buffers again when done
61026117 if (src0_as[id] > 0 ) {
@@ -6118,7 +6133,7 @@ static void ggml_cuda_op_mul_mat(
61186133 int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1 ) / MUL_MAT_SRC1_COL_STRIDE;
61196134 is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
61206135
6121- CUDA_CHECK (cudaSetDevice (g_main_device));
6136+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
61226137 for (int64_t id = 0 ; id < g_device_count; ++id) {
61236138 for (int64_t is = 0 ; is < is_max; ++is) {
61246139 CUDA_CHECK (cudaStreamWaitEvent (g_cudaStreams[g_main_device][0 ], src0_extra->events [id][is]));
@@ -6127,7 +6142,7 @@ static void ggml_cuda_op_mul_mat(
61276142 }
61286143
61296144 if (dst->backend == GGML_BACKEND_CPU) {
6130- CUDA_CHECK (cudaSetDevice (g_main_device));
6145+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
61316146 CUDA_CHECK (cudaDeviceSynchronize ());
61326147 }
61336148}
@@ -6187,7 +6202,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
61876202
61886203 const int64_t ne12 = src1->ne [2 ];
61896204
6190- CUDA_CHECK (cudaSetDevice (g_main_device));
6205+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
61916206 cudaStream_t main_stream = g_cudaStreams[g_main_device][0 ];
61926207
61936208 struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra ;
@@ -6218,7 +6233,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
62186233 const int64_t nb01 = src0->nb [1 ];
62196234 const int64_t nb02 = src0->nb [2 ];
62206235
6221- CUDA_CHECK (cudaSetDevice (g_main_device));
6236+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
62226237 cudaStream_t main_stream = g_cudaStreams[g_main_device][0 ];
62236238
62246239 struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra ;
@@ -6310,7 +6325,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
63106325 const int64_t nb11 = src1->nb [1 ];
63116326 const int64_t nb12 = src1->nb [2 ];
63126327
6313- CUDA_CHECK (cudaSetDevice (g_main_device));
6328+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
63146329 cudaStream_t main_stream = g_cudaStreams[g_main_device][0 ];
63156330
63166331 const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra ;
@@ -6376,7 +6391,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
63766391 continue ;
63776392 }
63786393
6379- cudaSetDevice (id);
6394+ ggml_cuda_set_device (id);
63806395
63816396 int64_t row_low, row_high;
63826397 if (backend == GGML_BACKEND_GPU) {
@@ -6446,13 +6461,13 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
64466461
64476462 for (int64_t id = 0 ; id < g_device_count; ++id) {
64486463 if (extra->data_device [id] != nullptr ) {
6449- CUDA_CHECK (cudaSetDevice (id));
6464+ CUDA_CHECK (ggml_cuda_set_device (id));
64506465 CUDA_CHECK (cudaFree (extra->data_device [id]));
64516466 }
64526467
64536468 for (int64_t is = 0 ; is < MAX_STREAMS; ++is) {
64546469 if (extra->events [id][is] != nullptr ) {
6455- CUDA_CHECK (cudaSetDevice (id));
6470+ CUDA_CHECK (ggml_cuda_set_device (id));
64566471 CUDA_CHECK (cudaEventDestroy (extra->events [id][is]));
64576472 }
64586473 }
@@ -6506,7 +6521,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
65066521 force_inplace;
65076522 const size_t size = ggml_nbytes (tensor);
65086523
6509- CUDA_CHECK (cudaSetDevice (g_main_device));
6524+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
65106525 if (inplace && (tensor->src [0 ]->backend == GGML_BACKEND_GPU || tensor->src [0 ]->backend == GGML_BACKEND_GPU_SPLIT)) {
65116526 struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src [0 ]->extra ;
65126527 char * src0_ddc = (char *) src0_extra->data_device [g_main_device];
0 commit comments