@@ -409,6 +409,19 @@ struct ggml_tensor_extra_gpu {
409409 cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
410410};
411411
// Set the active CUDA device, skipping the runtime call when `device` is
// already current. The query-then-set pattern is faster on Windows —
// probably because the Windows CUDA libraries forget to make this check
// before invoking the drivers.
//
// Returns cudaSuccess when no change was needed, otherwise the result of
// cudaSetDevice(). NOTE(review): a failure from cudaGetDevice() aborts the
// process via CUDA_CHECK instead of being returned to the caller — confirm
// this matches the file's error-handling convention for fatal driver errors.
inline cudaError_t ggml_cuda_set_device(int device) {
    int current_device;
    // fixed: original line was garbled as "¤t_device" (HTML-entity
    // mangling of "&current_device" — "&curren" decoded as U+00A4)
    CUDA_CHECK(cudaGetDevice(&current_device));

    if (device == current_device) {
        return cudaSuccess;
    }

    return cudaSetDevice(device);
}
424+
412425static int g_device_count = -1 ;
413426static int g_main_device = 0 ;
414427static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
@@ -5151,7 +5164,7 @@ void ggml_init_cublas() {
51515164 }
51525165
51535166 for (int64_t id = 0 ; id < g_device_count; ++id) {
5154- CUDA_CHECK (cudaSetDevice (id));
5167+ CUDA_CHECK (ggml_cuda_set_device (id));
51555168
51565169 // create cuda streams
51575170 for (int64_t is = 0 ; is < MAX_STREAMS; ++is) {
@@ -5795,7 +5808,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
57955808 size_t src1_asf = 0 ;
57965809 size_t dst_asf = 0 ;
57975810
5798- cudaSetDevice (g_main_device);
5811+ ggml_cuda_set_device (g_main_device);
57995812 const cudaStream_t main_stream = g_cudaStreams[g_main_device][0 ];
58005813
58015814 if (src0_on_device) {
@@ -5940,7 +5953,7 @@ static void ggml_cuda_op_mul_mat(
59405953 const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
59415954 const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
59425955
5943- cudaSetDevice (id);
5956+ ggml_cuda_set_device (id);
59445957 const cudaStream_t stream = g_cudaStreams[id][0 ];
59455958
59465959 if (src0_on_device && src0_is_contiguous) {
@@ -5976,7 +5989,7 @@ static void ggml_cuda_op_mul_mat(
59765989 // if multiple devices are used they need to wait for the main device
59775990 // here an event is recorded that signals that the main device has finished calculating the input data
59785991 if (split && g_device_count > 1 ) {
5979- CUDA_CHECK (cudaSetDevice (g_main_device));
5992+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
59805993 CUDA_CHECK (cudaEventRecord (src0_extra->events [g_main_device][0 ], g_cudaStreams[g_main_device][0 ]));
59815994 }
59825995
@@ -5994,7 +6007,7 @@ static void ggml_cuda_op_mul_mat(
59946007 const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
59956008 const int64_t row_diff = row_high[id] - row_low[id];
59966009
5997- cudaSetDevice (id);
6010+ ggml_cuda_set_device (id);
59986011 const cudaStream_t stream = g_cudaStreams[id][is];
59996012
60006013 // wait for main GPU data if necessary
@@ -6096,7 +6109,7 @@ static void ggml_cuda_op_mul_mat(
60966109 }
60976110
60986111 for (int64_t id = 0 ; id < g_device_count; ++id) {
6099- CUDA_CHECK (cudaSetDevice (id));
6112+ CUDA_CHECK (ggml_cuda_set_device (id));
61006113
61016114 // free buffers again when done
61026115 if (src0_as[id] > 0 ) {
@@ -6118,7 +6131,7 @@ static void ggml_cuda_op_mul_mat(
61186131 int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1 ) / MUL_MAT_SRC1_COL_STRIDE;
61196132 is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
61206133
6121- CUDA_CHECK (cudaSetDevice (g_main_device));
6134+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
61226135 for (int64_t id = 0 ; id < g_device_count; ++id) {
61236136 for (int64_t is = 0 ; is < is_max; ++is) {
61246137 CUDA_CHECK (cudaStreamWaitEvent (g_cudaStreams[g_main_device][0 ], src0_extra->events [id][is]));
@@ -6127,7 +6140,7 @@ static void ggml_cuda_op_mul_mat(
61276140 }
61286141
61296142 if (dst->backend == GGML_BACKEND_CPU) {
6130- CUDA_CHECK (cudaSetDevice (g_main_device));
6143+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
61316144 CUDA_CHECK (cudaDeviceSynchronize ());
61326145 }
61336146}
@@ -6187,7 +6200,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
61876200
61886201 const int64_t ne12 = src1->ne [2 ];
61896202
6190- CUDA_CHECK (cudaSetDevice (g_main_device));
6203+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
61916204 cudaStream_t main_stream = g_cudaStreams[g_main_device][0 ];
61926205
61936206 struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra ;
@@ -6218,7 +6231,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
62186231 const int64_t nb01 = src0->nb [1 ];
62196232 const int64_t nb02 = src0->nb [2 ];
62206233
6221- CUDA_CHECK (cudaSetDevice (g_main_device));
6234+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
62226235 cudaStream_t main_stream = g_cudaStreams[g_main_device][0 ];
62236236
62246237 struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra ;
@@ -6310,7 +6323,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
63106323 const int64_t nb11 = src1->nb [1 ];
63116324 const int64_t nb12 = src1->nb [2 ];
63126325
6313- CUDA_CHECK (cudaSetDevice (g_main_device));
6326+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
63146327 cudaStream_t main_stream = g_cudaStreams[g_main_device][0 ];
63156328
63166329 const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra ;
@@ -6376,7 +6389,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
63766389 continue ;
63776390 }
63786391
6379- cudaSetDevice (id);
6392+ ggml_cuda_set_device (id);
63806393
63816394 int64_t row_low, row_high;
63826395 if (backend == GGML_BACKEND_GPU) {
@@ -6446,13 +6459,13 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
64466459
64476460 for (int64_t id = 0 ; id < g_device_count; ++id) {
64486461 if (extra->data_device [id] != nullptr ) {
6449- CUDA_CHECK (cudaSetDevice (id));
6462+ CUDA_CHECK (ggml_cuda_set_device (id));
64506463 CUDA_CHECK (cudaFree (extra->data_device [id]));
64516464 }
64526465
64536466 for (int64_t is = 0 ; is < MAX_STREAMS; ++is) {
64546467 if (extra->events [id][is] != nullptr ) {
6455- CUDA_CHECK (cudaSetDevice (id));
6468+ CUDA_CHECK (ggml_cuda_set_device (id));
64566469 CUDA_CHECK (cudaEventDestroy (extra->events [id][is]));
64576470 }
64586471 }
@@ -6506,7 +6519,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
65066519 force_inplace;
65076520 const size_t size = ggml_nbytes (tensor);
65086521
6509- CUDA_CHECK (cudaSetDevice (g_main_device));
6522+ CUDA_CHECK (ggml_cuda_set_device (g_main_device));
65106523 if (inplace && (tensor->src [0 ]->backend == GGML_BACKEND_GPU || tensor->src [0 ]->backend == GGML_BACKEND_GPU_SPLIT)) {
65116524 struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src [0 ]->extra ;
65126525 char * src0_ddc = (char *) src0_extra->data_device [g_main_device];
0 commit comments