@@ -1476,10 +1476,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1498,13 +1503,22 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy data to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
             CL_CHECK(clFinish(queue));
 
@@ -1525,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
         }
     }
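
Note (not part of the diff): the FP32 path now loops over src1's batch dimensions (ne12, ne13) and maps each src1 slice back to a src0 slice through the integer broadcast ratios r2 = ne12 / ne02 and r3 = ne13 / ne03, while pi02/pi03 remember the last src0 slice copied to the device so a broadcast matrix is uploaded once per group rather than once per iteration. A minimal standalone sketch of that indexing, with made-up shapes (it is not ggml code), is:

// Standalone sketch of the broadcast indexing used above, with hypothetical shapes.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 2, ne03 = 1;   // src0 batch dims (hypothetical)
    const int64_t ne12 = 6, ne13 = 1;   // src1 batch dims (hypothetical)

    const int64_t r2 = ne12 / ne02;     // broadcast ratio in dim 2
    const int64_t r3 = ne13 / ne03;     // broadcast ratio in dim 3

    int64_t pi02 = -1, pi03 = -1;       // last src0 slice "copied to the device"
    int uploads = 0;

    for (int64_t i13 = 0; i13 < ne13; i13++) {
        const int64_t i03 = i13 / r3;   // src0 slice matched to this src1 slice
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i02 = i12 / r2;
            if (i02 != pi02 || i03 != pi03) {
                uploads++;              // stands in for the host->device copy of src0
                pi02 = i02;
                pi03 = i03;
            }
            printf("src1 slice (%lld,%lld) -> src0 slice (%lld,%lld)\n",
                   (long long) i13, (long long) i12, (long long) i03, (long long) i02);
        }
    }
    printf("src0 uploads: %d (instead of %lld)\n", uploads, (long long) (ne12 * ne13));
    return 0;
}

With ne12 = 6 and ne02 = 2 the sketch reports 2 uploads instead of 6, which is the saving the pi02/pi03 check is after.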
@@ -1547,6 +1561,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb10 = src1->nb[0];
     const int nb11 = src1->nb[1];
@@ -1556,6 +1572,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
     const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
     const int x_ne = ne01 * ne00;
@@ -1577,32 +1596,41 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
 
             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
-            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
                     ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                 }
                 else {
-                    for (int64_t i01 = 0; i01 < ne11; i01++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                     }
                 }
             }
             else {
-                for (int64_t i01 = 0; i01 < ne11; i01++) {
-                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
                         // very slow due to no inlining
-                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                     }
                 }
             }
@@ -1631,7 +1659,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             // copy dst to host, then convert to float
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
             ggml_fp16_to_fp32_row(tmp, d, d_ne);
         }
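
Note (not part of the diff): in the FP16 path each (i13, i12) slice of src1 is converted to fp16 on the host into wdata, and the staging offset (ne11 * ne10) * (i13 * ne12 + i12) is now derived from src1's own batch indices; the renamed loop variables i11/i10 walk the rows and columns of one slice, taking a row-wise fast path when the rows are contiguous and a per-element fallback otherwise. A simplified standalone analogue of that packing, with a hypothetical convert() standing in for ggml_fp32_to_fp16 and a plain ne10-based contiguity test, is:

// Standalone sketch (not the ggml implementation) of packing one strided
// ne11 x ne10 slice of floats into a dense scratch buffer of converted values.
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for ggml_fp32_to_fp16: just truncates, to show that
// every element passes through a conversion step.
static uint16_t convert(float x) { return (uint16_t) x; }

int main() {
    const int64_t ne10 = 4, ne11 = 3;            // slice shape (hypothetical)
    const size_t  nb10 = sizeof(float);          // element stride: rows are contiguous
    const size_t  nb11 = 6 * sizeof(float);      // row stride: padded, so the slice is not dense

    std::vector<float>    src1(ne11 * 6, 1.0f);  // padded source slice
    std::vector<uint16_t> tmp(ne11 * ne10);      // dense staging area (like wdata)

    const char * src1i = (const char *) src1.data();
    const bool cont_rows = nb10 == sizeof(float);
    const bool cont_cols = nb11 == ne10 * sizeof(float);

    if (cont_rows && cont_cols) {
        for (int64_t i = 0; i < ne10 * ne11; i++) {          // whole slice in one pass
            tmp[i] = convert(src1.data()[i]);
        }
    } else if (cont_rows) {
        for (int64_t i11 = 0; i11 < ne11; i11++) {           // row by row
            const float * row = (const float *) (src1i + i11 * nb11);
            for (int64_t i10 = 0; i10 < ne10; i10++) {
                tmp[i11 * ne10 + i10] = convert(row[i10]);
            }
        }
    } else {
        for (int64_t i11 = 0; i11 < ne11; i11++) {           // fully strided fallback
            for (int64_t i10 = 0; i10 < ne10; i10++) {
                tmp[i11 * ne10 + i10] = convert(*(const float *) (src1i + i11 * nb11 + i10 * nb10));
            }
        }
    }
    printf("staged %zu values\n", tmp.size());
    return 0;
}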
@@ -1652,12 +1680,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
     const bool mul_mat_vec = ne11 == 1;
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1690,12 +1723,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->extra;
             } else {
@@ -1704,7 +1748,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
@@ -1725,7 +1769,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
                 events.emplace_back();
 
@@ -1749,7 +1793,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
             for (auto *event : events) {
                 clReleaseEvent(event);
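
Note (not part of the diff): the quantized path applies the same (i02, i03) caching to the d_Q upload and addresses the src1 copies and the dst offset by i13/i12 in both the dequantize_mul_mat_vec branch (taken when ne11 == 1) and the sgemm branch. All three functions divide by r2 and r3, so the broadcast only makes sense when src1's batch dimensions are exact multiples of src0's; a hypothetical caller-side check of that precondition (not present in the patch) could look like:

// Hypothetical sanity check for the broadcast ratios used in the loops above.
#include <cassert>
#include <cstdint>

static void check_broadcast(int64_t ne02, int64_t ne03, int64_t ne12, int64_t ne13) {
    assert(ne02 > 0 && ne03 > 0);
    assert(ne12 % ne02 == 0);   // r2 = ne12 / ne02 must be exact
    assert(ne13 % ne03 == 0);   // r3 = ne13 / ne03 must be exact
}

int main() {
    check_broadcast(2, 1, 6, 1);   // e.g. r2 = 3, r3 = 1
    return 0;
}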