@@ -274,16 +274,18 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
274274 GGML_METAL_ADD_KERNEL (mul_mv_q4_K_f32);
275275 GGML_METAL_ADD_KERNEL (mul_mv_q5_K_f32);
276276 GGML_METAL_ADD_KERNEL (mul_mv_q6_K_f32);
277- GGML_METAL_ADD_KERNEL (mul_mm_f32_f32);
278- GGML_METAL_ADD_KERNEL (mul_mm_f16_f32);
279- GGML_METAL_ADD_KERNEL (mul_mm_q4_0_f32);
280- GGML_METAL_ADD_KERNEL (mul_mm_q8_0_f32);
281- GGML_METAL_ADD_KERNEL (mul_mm_q4_1_f32);
282- GGML_METAL_ADD_KERNEL (mul_mm_q2_K_f32);
283- GGML_METAL_ADD_KERNEL (mul_mm_q3_K_f32);
284- GGML_METAL_ADD_KERNEL (mul_mm_q4_K_f32);
285- GGML_METAL_ADD_KERNEL (mul_mm_q5_K_f32);
286- GGML_METAL_ADD_KERNEL (mul_mm_q6_K_f32);
277+ if ([ctx->device supportsFamily: MTLGPUFamilyApple7]) {
278+ GGML_METAL_ADD_KERNEL (mul_mm_f32_f32);
279+ GGML_METAL_ADD_KERNEL (mul_mm_f16_f32);
280+ GGML_METAL_ADD_KERNEL (mul_mm_q4_0_f32);
281+ GGML_METAL_ADD_KERNEL (mul_mm_q8_0_f32);
282+ GGML_METAL_ADD_KERNEL (mul_mm_q4_1_f32);
283+ GGML_METAL_ADD_KERNEL (mul_mm_q2_K_f32);
284+ GGML_METAL_ADD_KERNEL (mul_mm_q3_K_f32);
285+ GGML_METAL_ADD_KERNEL (mul_mm_q4_K_f32);
286+ GGML_METAL_ADD_KERNEL (mul_mm_q5_K_f32);
287+ GGML_METAL_ADD_KERNEL (mul_mm_q6_K_f32);
288+ }
287289 GGML_METAL_ADD_KERNEL (rope_f32);
288290 GGML_METAL_ADD_KERNEL (rope_f16);
289291 GGML_METAL_ADD_KERNEL (alibi_f32);
@@ -296,8 +298,22 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
296298#undef GGML_METAL_ADD_KERNEL
297299 }
298300
299- GGML_METAL_LOG_INFO (" %s : hasUnifiedMemory = %s \n " , __func__, ctx->device .hasUnifiedMemory ? " true" : " false" );
300301#if TARGET_OS_OSX
302+ // print MTL GPU family:
303+ GGML_METAL_LOG_INFO (" %s : GPU name: %s \n " , __func__, [[ctx->device name ] UTF8String ]);
304+ GGML_METAL_LOG_INFO (" %s : GPU arch: %s \n " , __func__, [[ctx->device architecture ].name UTF8String ]);
305+
306+ // determine max supported GPU family
307+ // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
308+ // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
309+ for (int i = MTLGPUFamilyApple9 + 10 ; i >= MTLGPUFamilyApple1 ; --i) {
310+ if ([ctx->device supportsFamily: i]) {
311+ GGML_METAL_LOG_INFO (" %s : GPU family: MTLGPUFamilyApple%d (%d )\n " , __func__, i - MTLGPUFamilyApple1 + 1 , i);
312+ break ;
313+ }
314+ }
315+
316+ GGML_METAL_LOG_INFO (" %s : hasUnifiedMemory = %s \n " , __func__, ctx->device .hasUnifiedMemory ? " true" : " false" );
301317 GGML_METAL_LOG_INFO (" %s : recommendedMaxWorkingSetSize = %8.2f MB\n " , __func__, ctx->device .recommendedMaxWorkingSetSize / 1024.0 / 1024.0 );
302318 if (ctx->device .maxTransferRate != 0 ) {
303319 GGML_METAL_LOG_INFO (" %s : maxTransferRate = %8.2f MB/s\n " , __func__, ctx->device .maxTransferRate / 1024.0 / 1024.0 );
@@ -351,16 +367,18 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
351367 GGML_METAL_DEL_KERNEL (mul_mv_q4_K_f32);
352368 GGML_METAL_DEL_KERNEL (mul_mv_q5_K_f32);
353369 GGML_METAL_DEL_KERNEL (mul_mv_q6_K_f32);
354- GGML_METAL_DEL_KERNEL (mul_mm_f32_f32);
355- GGML_METAL_DEL_KERNEL (mul_mm_f16_f32);
356- GGML_METAL_DEL_KERNEL (mul_mm_q4_0_f32);
357- GGML_METAL_DEL_KERNEL (mul_mm_q8_0_f32);
358- GGML_METAL_DEL_KERNEL (mul_mm_q4_1_f32);
359- GGML_METAL_DEL_KERNEL (mul_mm_q2_K_f32);
360- GGML_METAL_DEL_KERNEL (mul_mm_q3_K_f32);
361- GGML_METAL_DEL_KERNEL (mul_mm_q4_K_f32);
362- GGML_METAL_DEL_KERNEL (mul_mm_q5_K_f32);
363- GGML_METAL_DEL_KERNEL (mul_mm_q6_K_f32);
370+ if ([ctx->device supportsFamily: MTLGPUFamilyApple7]) {
371+ GGML_METAL_DEL_KERNEL (mul_mm_f32_f32);
372+ GGML_METAL_DEL_KERNEL (mul_mm_f16_f32);
373+ GGML_METAL_DEL_KERNEL (mul_mm_q4_0_f32);
374+ GGML_METAL_DEL_KERNEL (mul_mm_q8_0_f32);
375+ GGML_METAL_DEL_KERNEL (mul_mm_q4_1_f32);
376+ GGML_METAL_DEL_KERNEL (mul_mm_q2_K_f32);
377+ GGML_METAL_DEL_KERNEL (mul_mm_q3_K_f32);
378+ GGML_METAL_DEL_KERNEL (mul_mm_q4_K_f32);
379+ GGML_METAL_DEL_KERNEL (mul_mm_q5_K_f32);
380+ GGML_METAL_DEL_KERNEL (mul_mm_q6_K_f32);
381+ }
364382 GGML_METAL_DEL_KERNEL (rope_f32);
365383 GGML_METAL_DEL_KERNEL (rope_f16);
366384 GGML_METAL_DEL_KERNEL (alibi_f32);
@@ -986,32 +1004,36 @@ void ggml_metal_graph_compute(
9861004 } break ;
9871005 case GGML_OP_MUL_MAT:
9881006 {
989- // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
990-
9911007 GGML_ASSERT (ne00 == ne10);
992- // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
993- uint gqa = ne12/ne02;
9941008 GGML_ASSERT (ne03 == ne13);
9951009
1010+ const uint gqa = ne12/ne02;
1011+
9961012 // find the break-even point where the matrix-matrix kernel becomes more efficient compared
997- // to the matrix-vector kernel. the numbers below are measured on M2 Ultra
998- // not sure if this translates across all chips
1013+ // to the matrix-vector kernel
9991014 int ne11_mm_min = 1 ;
10001015
1001- switch (src0t) {
1002- case GGML_TYPE_F16: ne11_mm_min = 2 ; break ;
1003- case GGML_TYPE_Q8_0: ne11_mm_min = 7 ; break ;
1004- case GGML_TYPE_Q2_K: ne11_mm_min = 15 ; break ;
1005- case GGML_TYPE_Q3_K: ne11_mm_min = 7 ; break ;
1006- case GGML_TYPE_Q4_0:
1007- case GGML_TYPE_Q4_1: ne11_mm_min = 15 ; break ;
1008- case GGML_TYPE_Q4_K: ne11_mm_min = 11 ; break ;
1009- case GGML_TYPE_Q5_0: // not tested yet
1010- case GGML_TYPE_Q5_1: ne11_mm_min = 13 ; break ; // not tested yet
1011- case GGML_TYPE_Q5_K: ne11_mm_min = 7 ; break ;
1012- case GGML_TYPE_Q6_K: ne11_mm_min = 7 ; break ;
1013- default : ne11_mm_min = 1 ; break ;
1016+ #if 0
1017+ // the numbers below are measured on M2 Ultra for 7B and 13B models
1018+ // these numbers do not translate to other devices or model sizes
1019+ // TODO: need to find a better approach
1020+ if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
1021+ switch (src0t) {
1022+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
1023+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
1024+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
1025+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
1026+ case GGML_TYPE_Q4_0:
1027+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
1028+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
1029+ case GGML_TYPE_Q5_0: // not tested yet
1030+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
1031+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
1032+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
1033+ default: ne11_mm_min = 1; break;
1034+ }
10141035 }
1036+ #endif
10151037
10161038 // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
10171039 // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
0 commit comments