@@ -38,8 +38,10 @@ struct ibv_ravh {
3838
/* Maximal number of user DCIs in a single DCI pool */
#define UCT_DC_MLX5_IFACE_MAX_USER_DCIS 15

/* Number of DCIs added on top of the pools for keepalive */
#define UCT_DC_MLX5_KEEPALIVE_NUM_DCIS  1

/* Maximal number of DCI pools per interface */
#define UCT_DC_MLX5_IFACE_MAX_DCI_POOLS 8

/* Size of the per-interface DCI array: every pool may hold the maximal number
 * of user DCIs, plus the keepalive DCI(s). Evaluates to 121, so a DCI index
 * fits in uint8_t. */
#define UCT_DC_MLX5_IFACE_MAX_DCIS      ((UCT_DC_MLX5_IFACE_MAX_USER_DCIS * \
                                          UCT_DC_MLX5_IFACE_MAX_DCI_POOLS) + \
                                         UCT_DC_MLX5_KEEPALIVE_NUM_DCIS)
4345
/* Evaluates to 1 when the UCT_DC_MLX5_IFACE_ADDR_HW_TM bit is set in the
 * flags field of the address pointed to by _addr, and to 0 otherwise.
 * There must be no whitespace between the macro name and its parameter list,
 * otherwise the preprocessor treats it as an object-like macro. */
#define UCT_DC_MLX5_IFACE_ADDR_TM_ENABLED(_addr) \
    (!!((_addr)->flags & UCT_DC_MLX5_IFACE_ADDR_HW_TM))
@@ -130,7 +132,7 @@ typedef struct uct_dc_mlx5_iface_config {
130132
131133typedef void (* uct_dc_dci_handle_failure_func_t )(uct_dc_mlx5_iface_t * iface ,
132134 struct mlx5_cqe64 * cqe ,
133- uint8_t dci ,
135+ uint8_t dci_index ,
134136 ucs_status_t status );
135137
136138
@@ -147,6 +149,7 @@ typedef struct uct_dc_dci {
147149 processed. Better have dci num
148150 groups scheduled than ep num. */
149151 };
152+ uint8_t pool_index ; /* DCI pool index. */
150153#if UCS_ENABLE_ASSERT
151154 uint8_t flags ; /* debug state, @ref uct_dc_dci_state_t */
152155#endif
@@ -175,22 +178,32 @@ typedef struct uct_dc_fc_request {
175178
176179KHASH_MAP_INIT_INT64 (uct_dc_mlx5_fc_hash , uint64_t );
177180
181+
182+ typedef struct {
183+ uint8_t stack_top ; /* dci stack top */
184+ uint8_t stack [UCT_DC_MLX5_IFACE_MAX_USER_DCIS ]; /* LIFO of indexes of available dcis */
185+ ucs_arbiter_t arbiter ; /* queue of requests
186+ waiting for DCI */
187+ } uct_dc_mlx5_dci_pool_t ;
188+
189+
178190struct uct_dc_mlx5_iface {
179191 uct_rc_mlx5_iface_common_t super ;
180192 struct {
181193 /* Array of dcis */
182194 uct_dc_dci_t dcis [UCT_DC_MLX5_IFACE_MAX_DCIS ];
183195
184196 uint8_t ndci ; /* Number of DCIs */
185- uct_dc_tx_policy_t policy ; /* dci selection algorithm */
186- int16_t available_quota ; /* if available tx is lower, let
187- another endpoint use the dci */
188197
189198 /* LIFO is only relevant for dcs allocation policy */
190- uint8_t stack_top ; /* dci stack top */
191- uint8_t dcis_stack [ UCT_DC_MLX5_IFACE_MAX_DCIS ]; /* LIFO of indexes of available dcis */
199+ uct_dc_mlx5_dci_pool_t dci_pool [ UCT_DC_MLX5_IFACE_MAX_DCI_POOLS ];
200+ uint8_t num_dci_pools ;
192201
193- ucs_arbiter_t dci_arbiter ;
202+ uint8_t policy ; /* dci selection algorithm */
203+ int16_t available_quota ; /* if available tx is lower, let
204+ another endpoint use the dci */
205+ /* DCI max elements */
206+ unsigned bb_max ;
194207
195208 /* Used to send grant messages for all peers */
196209 uct_dc_mlx5_ep_t * fc_ep ;
@@ -216,6 +229,8 @@ struct uct_dc_mlx5_iface {
216229 /* iface flags, see uct_dc_mlx5_iface_flags_t */
217230 uint8_t flags ;
218231
232+ uint8_t keepalive_dci ;
233+
219234 uct_ud_mlx5_iface_common_t ud_common ;
220235};
221236
@@ -261,7 +276,9 @@ void uct_dc_mlx5_iface_set_ep_failed(uct_dc_mlx5_iface_t *iface,
261276 uct_ib_mlx5_txwq_t * txwq ,
262277 ucs_status_t ep_status );
263278
264- void uct_dc_mlx5_iface_reset_dci (uct_dc_mlx5_iface_t * iface , uint8_t dci );
279+ ucs_status_t uct_dc_mlx5_iface_create_dcis (uct_dc_mlx5_iface_t * iface );
280+
281+ void uct_dc_mlx5_iface_reset_dci (uct_dc_mlx5_iface_t * iface , uint8_t dci_index );
265282
266283#if HAVE_DEVX
267284
@@ -270,7 +287,8 @@ ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface);
270287ucs_status_t uct_dc_mlx5_iface_devx_set_srq_dc_params (uct_dc_mlx5_iface_t * iface );
271288
272289ucs_status_t uct_dc_mlx5_iface_devx_dci_connect (uct_dc_mlx5_iface_t * iface ,
273- uct_ib_mlx5_qp_t * qp );
290+ uct_ib_mlx5_qp_t * qp ,
291+ uint8_t pool_index );
274292
275293#else
276294
@@ -305,11 +323,12 @@ uct_dc_mlx5_iface_fill_ravh(struct ibv_ravh *ravh, uint32_t dct_num)
305323}
306324#endif
307325
308- static inline uint8_t
326+ static UCS_F_ALWAYS_INLINE uint8_t
309327uct_dc_mlx5_iface_total_ndci (uct_dc_mlx5_iface_t * iface )
310328{
311- return iface -> tx .ndci + ((iface -> flags & UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE ) ?
312- UCT_DC_MLX5_KEEPALIVE_NUM_DCIS : 0 );
329+ return (iface -> tx .ndci * iface -> tx .num_dci_pools ) +
330+ ((iface -> flags & UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE ) ?
331+ UCT_DC_MLX5_KEEPALIVE_NUM_DCIS : 0 );
313332}
314333
315334/* TODO:
@@ -318,13 +337,20 @@ uct_dc_mlx5_iface_total_ndci(uct_dc_mlx5_iface_t *iface)
318337 * linear search is most probably the best way to go
319338 * because the number of dcis is usually small
320339 */
321- static inline uint8_t uct_dc_mlx5_iface_dci_find (uct_dc_mlx5_iface_t * iface , uint32_t qp_num )
340+ static UCS_F_ALWAYS_INLINE uint8_t
341+ uct_dc_mlx5_iface_dci_find (uct_dc_mlx5_iface_t * iface , struct mlx5_cqe64 * cqe )
322342{
323- uct_dc_dci_t * dcis = iface -> tx . dcis ;
324- int i , ndci = uct_dc_mlx5_iface_total_ndci ( iface ) ;
343+ uint32_t qp_num ;
344+ int i , ndci ;
325345
346+ if (ucs_likely (iface -> flags & UCT_DC_MLX5_IFACE_FLAG_UIDX )) {
347+ return cqe -> srqn_uidx >> UCT_IB_UIDX_SHIFT ;
348+ }
349+
350+ qp_num = ntohl (cqe -> sop_drop_qpn ) & UCS_MASK (UCT_IB_QPN_ORDER );
351+ ndci = uct_dc_mlx5_iface_total_ndci (iface );
326352 for (i = 0 ; i < ndci ; i ++ ) {
327- if (dcis [i ].txwq .super .qp_num == qp_num ) {
353+ if (iface -> tx . dcis [i ].txwq .super .qp_num == qp_num ) {
328354 return i ;
329355 }
330356 }
@@ -339,52 +365,55 @@ uct_dc_mlx5_iface_has_tx_resources(uct_dc_mlx5_iface_t *iface)
339365 (iface -> super .super .tx .reads_available > 0 );
340366}
341367
342- static inline int uct_dc_mlx5_iface_dci_has_tx_resources (uct_dc_mlx5_iface_t * iface , uint8_t dci )
368+ static UCS_F_ALWAYS_INLINE int
369+ uct_dc_mlx5_iface_dci_has_tx_resources (uct_dc_mlx5_iface_t * iface ,
370+ uint8_t dci_index )
343371{
344- return uct_rc_txqp_available (& iface -> tx .dcis [dci ].txqp ) > 0 ;
372+ return uct_rc_txqp_available (& iface -> tx .dcis [dci_index ].txqp ) > 0 ;
345373}
346374
347375/* returns pending queue of eps waiting for tx resources */
348- static inline ucs_arbiter_t * uct_dc_mlx5_iface_tx_waitq (uct_dc_mlx5_iface_t * iface )
376+ static UCS_F_ALWAYS_INLINE ucs_arbiter_t *
377+ uct_dc_mlx5_iface_tx_waitq (uct_dc_mlx5_iface_t * iface )
349378{
350- return & iface -> tx .dci_arbiter ;
379+ return & iface -> super . super . tx .arbiter ;
351380}
352381
353382/* returns pending queue of eps waiting for the dci allocation */
354- static inline ucs_arbiter_t * uct_dc_mlx5_iface_dci_waitq (uct_dc_mlx5_iface_t * iface )
383+ static UCS_F_ALWAYS_INLINE ucs_arbiter_t *
384+ uct_dc_mlx5_iface_dci_waitq (uct_dc_mlx5_iface_t * iface , uint8_t pool_index )
355385{
356- return & iface -> super . super . tx .arbiter ;
386+ return & iface -> tx . dci_pool [ pool_index ] .arbiter ;
357387}
358388
359- static inline int
360- uct_dc_mlx5_iface_dci_has_outstanding (uct_dc_mlx5_iface_t * iface , int dci )
389+ static UCS_F_ALWAYS_INLINE int
390+ uct_dc_mlx5_iface_dci_has_outstanding (uct_dc_mlx5_iface_t * iface , int dci_index )
361391{
362392 uct_rc_txqp_t * txqp ;
363393
364- txqp = & iface -> tx .dcis [dci ].txqp ;
365- return uct_rc_txqp_available (txqp ) < (int16_t )iface -> super . super . config . tx_qp_len ;
394+ txqp = & iface -> tx .dcis [dci_index ].txqp ;
395+ return uct_rc_txqp_available (txqp ) < (int16_t )iface -> tx . bb_max ;
366396}
367397
368- static inline ucs_status_t uct_dc_mlx5_iface_flush_dci (uct_dc_mlx5_iface_t * iface , int dci )
398+ static UCS_F_ALWAYS_INLINE ucs_status_t
399+ uct_dc_mlx5_iface_flush_dci (uct_dc_mlx5_iface_t * iface , int dci_index )
369400{
370401
371- if (!uct_dc_mlx5_iface_dci_has_outstanding (iface , dci )) {
402+ if (!uct_dc_mlx5_iface_dci_has_outstanding (iface , dci_index )) {
372403 return UCS_OK ;
373404 }
374- ucs_trace_poll ( "dci %d is not flushed %d/%d" , dci ,
375- iface -> tx . dcis [ dci ]. txqp . available ,
376- iface -> super . super . config . tx_qp_len );
377- ucs_assertv (uct_rc_txqp_unsignaled (& iface -> tx .dcis [dci ].txqp ) == 0 ,
405+
406+ ucs_trace_poll ( "dci %d is not flushed %d/%d" , dci_index ,
407+ iface -> tx . dcis [ dci_index ]. txqp . available , iface -> tx . bb_max );
408+ ucs_assertv (uct_rc_txqp_unsignaled (& iface -> tx .dcis [dci_index ].txqp ) == 0 ,
378409 "unsignalled send is not supported!!!" );
379410 return UCS_INPROGRESS ;
380411}
381412
382- static inline int
383- uct_dc_mlx5_iface_is_dci_keepalive (uct_dc_mlx5_iface_t * iface , int dci )
413+ static UCS_F_ALWAYS_INLINE int
414+ uct_dc_mlx5_iface_is_dci_keepalive (uct_dc_mlx5_iface_t * iface , int dci_index )
384415{
385- ucs_assert (dci < uct_dc_mlx5_iface_total_ndci (iface ));
386-
387- return dci == iface -> tx .ndci ;
416+ return dci_index == iface -> keepalive_dci ;
388417}
389418
390419#endif
0 commit comments