Skip to content

Commit 2c81364

Browse files
authored
Merge pull request #6376 from Artemy-Mellanox/topic/lag-6
UCT/DC: DCI pool per LAG port
2 parents 210bb5a + cce83f8 commit 2c81364

File tree

10 files changed

+320
-221
lines changed

10 files changed

+320
-221
lines changed

buildlib/io_demo/az-stage-io-demo.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ parameters:
33
default: 'test'
44
- name: iodemo_args
55
default: ''
6+
- name: iodemo_tls
7+
default: 'rc_x'
68

79
steps:
810
- bash: |
@@ -31,6 +33,7 @@ steps:
3133
mkdir -p $(workspace)/${{ parameters.name }}
3234
# set UCX environment variables
3335
export UCX_NET_DEVICES=$(ibdev2netdev | sed -ne 's/\(\w*\) port \([0-9]\) ==> '${roce_iface}' .*/\1:\2/p')
36+
export UCX_TLS=${{ parameters.iodemo_tls }}
3437
export LD_LIBRARY_PATH=$(workspace)/install/lib:$LD_LIBRARY_PATH
3538
$(workspace)/test/apps/iodemo/run_io_demo.sh \
3639
-H $(agent_hosts) \

src/uct/base/uct_iface.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,7 @@ uct_pending_req_priv_arb_elem(uct_pending_req_t *req)
439439
/**
440440
* Add a pending request to the head of a group in the arbiter.
441441
*/
442-
#define uct_pending_req_arb_group_push_head(_arbiter, _arbiter_group, _req) \
442+
#define uct_pending_req_arb_group_push_head(_arbiter_group, _req) \
443443
do { \
444444
ucs_arbiter_elem_init(uct_pending_req_priv_arb_elem(_req)); \
445445
ucs_arbiter_group_push_head_elem_always(_arbiter_group, \

src/uct/ib/base/ib_iface.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -662,9 +662,8 @@ void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid,
662662
if (uct_ib_iface_is_roce(iface)) {
663663
ah_attr->dlid = UCT_IB_ROCE_UDP_SRC_PORT_BASE |
664664
(iface->config.roce_path_factor * path_index);
665-
/* Workaround rdma-core issue of calling rand() which affects global
666-
* random state in glibc */
667-
ah_attr->grh.flow_label = 1;
665+
/* Work around rdma-core's conversion of the flow label to the UDP source port */
666+
ah_attr->grh.flow_label = ~(iface->config.roce_path_factor * path_index);
668667
} else {
669668
/* TODO iface->path_bits should be removed and replaced by path_index */
670669
path_bits = iface->path_bits[path_index %

src/uct/ib/dc/dc_mlx5.c

Lines changed: 94 additions & 78 deletions
Large diffs are not rendered by default.

src/uct/ib/dc/dc_mlx5.h

Lines changed: 68 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,10 @@ struct ibv_ravh {
3838

3939
#define UCT_DC_MLX5_IFACE_MAX_USER_DCIS 15
4040
#define UCT_DC_MLX5_KEEPALIVE_NUM_DCIS 1
41-
#define UCT_DC_MLX5_IFACE_MAX_DCIS (UCT_DC_MLX5_IFACE_MAX_USER_DCIS + \
42-
UCT_DC_MLX5_KEEPALIVE_NUM_DCIS)
41+
#define UCT_DC_MLX5_IFACE_MAX_DCI_POOLS 8
42+
#define UCT_DC_MLX5_IFACE_MAX_DCIS ((UCT_DC_MLX5_IFACE_MAX_USER_DCIS * \
43+
UCT_DC_MLX5_IFACE_MAX_DCI_POOLS) + \
44+
UCT_DC_MLX5_KEEPALIVE_NUM_DCIS)
4345

4446
#define UCT_DC_MLX5_IFACE_ADDR_TM_ENABLED(_addr) \
4547
(!!((_addr)->flags & UCT_DC_MLX5_IFACE_ADDR_HW_TM))
@@ -130,7 +132,7 @@ typedef struct uct_dc_mlx5_iface_config {
130132

131133
typedef void (*uct_dc_dci_handle_failure_func_t)(uct_dc_mlx5_iface_t *iface,
132134
struct mlx5_cqe64 *cqe,
133-
uint8_t dci,
135+
uint8_t dci_index,
134136
ucs_status_t status);
135137

136138

@@ -147,6 +149,7 @@ typedef struct uct_dc_dci {
147149
processed. Better have dci num
148150
groups scheduled than ep num. */
149151
};
152+
uint8_t pool_index; /* DCI pool index. */
150153
#if UCS_ENABLE_ASSERT
151154
uint8_t flags; /* debug state, @ref uct_dc_dci_state_t */
152155
#endif
@@ -175,22 +178,32 @@ typedef struct uct_dc_fc_request {
175178

176179
KHASH_MAP_INIT_INT64(uct_dc_mlx5_fc_hash, uint64_t);
177180

181+
182+
typedef struct {
183+
uint8_t stack_top; /* dci stack top */
184+
uint8_t stack[UCT_DC_MLX5_IFACE_MAX_USER_DCIS]; /* LIFO of indexes of available dcis */
185+
ucs_arbiter_t arbiter; /* queue of requests
186+
waiting for DCI */
187+
} uct_dc_mlx5_dci_pool_t;
188+
189+
178190
struct uct_dc_mlx5_iface {
179191
uct_rc_mlx5_iface_common_t super;
180192
struct {
181193
/* Array of dcis */
182194
uct_dc_dci_t dcis[UCT_DC_MLX5_IFACE_MAX_DCIS];
183195

184196
uint8_t ndci; /* Number of DCIs */
185-
uct_dc_tx_policy_t policy; /* dci selection algorithm */
186-
int16_t available_quota; /* if available tx is lower, let
187-
another endpoint use the dci */
188197

189198
/* LIFO is only relevant for dcs allocation policy */
190-
uint8_t stack_top; /* dci stack top */
191-
uint8_t dcis_stack[UCT_DC_MLX5_IFACE_MAX_DCIS]; /* LIFO of indexes of available dcis */
199+
uct_dc_mlx5_dci_pool_t dci_pool[UCT_DC_MLX5_IFACE_MAX_DCI_POOLS];
200+
uint8_t num_dci_pools;
192201

193-
ucs_arbiter_t dci_arbiter;
202+
uint8_t policy; /* dci selection algorithm */
203+
int16_t available_quota; /* if available tx is lower, let
204+
another endpoint use the dci */
205+
/* DCI max elements */
206+
unsigned bb_max;
194207

195208
/* Used to send grant messages for all peers */
196209
uct_dc_mlx5_ep_t *fc_ep;
@@ -216,6 +229,8 @@ struct uct_dc_mlx5_iface {
216229
/* iface flags, see uct_dc_mlx5_iface_flags_t */
217230
uint8_t flags;
218231

232+
uint8_t keepalive_dci;
233+
219234
uct_ud_mlx5_iface_common_t ud_common;
220235
};
221236

@@ -261,7 +276,9 @@ void uct_dc_mlx5_iface_set_ep_failed(uct_dc_mlx5_iface_t *iface,
261276
uct_ib_mlx5_txwq_t *txwq,
262277
ucs_status_t ep_status);
263278

264-
void uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface, uint8_t dci);
279+
ucs_status_t uct_dc_mlx5_iface_create_dcis(uct_dc_mlx5_iface_t *iface);
280+
281+
void uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface, uint8_t dci_index);
265282

266283
#if HAVE_DEVX
267284

@@ -270,7 +287,8 @@ ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface);
270287
ucs_status_t uct_dc_mlx5_iface_devx_set_srq_dc_params(uct_dc_mlx5_iface_t *iface);
271288

272289
ucs_status_t uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface,
273-
uct_ib_mlx5_qp_t *qp);
290+
uct_ib_mlx5_qp_t *qp,
291+
uint8_t pool_index);
274292

275293
#else
276294

@@ -305,11 +323,12 @@ uct_dc_mlx5_iface_fill_ravh(struct ibv_ravh *ravh, uint32_t dct_num)
305323
}
306324
#endif
307325

308-
static inline uint8_t
326+
static UCS_F_ALWAYS_INLINE uint8_t
309327
uct_dc_mlx5_iface_total_ndci(uct_dc_mlx5_iface_t *iface)
310328
{
311-
return iface->tx.ndci + ((iface->flags & UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE) ?
312-
UCT_DC_MLX5_KEEPALIVE_NUM_DCIS : 0);
329+
return (iface->tx.ndci * iface->tx.num_dci_pools) +
330+
((iface->flags & UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE) ?
331+
UCT_DC_MLX5_KEEPALIVE_NUM_DCIS : 0);
313332
}
314333

315334
/* TODO:
@@ -318,13 +337,20 @@ uct_dc_mlx5_iface_total_ndci(uct_dc_mlx5_iface_t *iface)
318337
* linear search is most probably the best way to go
319338
* because the number of dcis is usually small
320339
*/
321-
static inline uint8_t uct_dc_mlx5_iface_dci_find(uct_dc_mlx5_iface_t *iface, uint32_t qp_num)
340+
static UCS_F_ALWAYS_INLINE uint8_t
341+
uct_dc_mlx5_iface_dci_find(uct_dc_mlx5_iface_t *iface, struct mlx5_cqe64 *cqe)
322342
{
323-
uct_dc_dci_t *dcis = iface->tx.dcis;
324-
int i, ndci = uct_dc_mlx5_iface_total_ndci(iface);
343+
uint32_t qp_num;
344+
int i, ndci;
325345

346+
if (ucs_likely(iface->flags & UCT_DC_MLX5_IFACE_FLAG_UIDX)) {
347+
return cqe->srqn_uidx >> UCT_IB_UIDX_SHIFT;
348+
}
349+
350+
qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
351+
ndci = uct_dc_mlx5_iface_total_ndci(iface);
326352
for (i = 0; i < ndci; i++) {
327-
if (dcis[i].txwq.super.qp_num == qp_num) {
353+
if (iface->tx.dcis[i].txwq.super.qp_num == qp_num) {
328354
return i;
329355
}
330356
}
@@ -339,52 +365,55 @@ uct_dc_mlx5_iface_has_tx_resources(uct_dc_mlx5_iface_t *iface)
339365
(iface->super.super.tx.reads_available > 0);
340366
}
341367

342-
static inline int uct_dc_mlx5_iface_dci_has_tx_resources(uct_dc_mlx5_iface_t *iface, uint8_t dci)
368+
static UCS_F_ALWAYS_INLINE int
369+
uct_dc_mlx5_iface_dci_has_tx_resources(uct_dc_mlx5_iface_t *iface,
370+
uint8_t dci_index)
343371
{
344-
return uct_rc_txqp_available(&iface->tx.dcis[dci].txqp) > 0;
372+
return uct_rc_txqp_available(&iface->tx.dcis[dci_index].txqp) > 0;
345373
}
346374

347375
/* returns pending queue of eps waiting for tx resources */
348-
static inline ucs_arbiter_t *uct_dc_mlx5_iface_tx_waitq(uct_dc_mlx5_iface_t *iface)
376+
static UCS_F_ALWAYS_INLINE ucs_arbiter_t *
377+
uct_dc_mlx5_iface_tx_waitq(uct_dc_mlx5_iface_t *iface)
349378
{
350-
return &iface->tx.dci_arbiter;
379+
return &iface->super.super.tx.arbiter;
351380
}
352381

353382
/* returns pending queue of eps waiting for the dci allocation */
354-
static inline ucs_arbiter_t *uct_dc_mlx5_iface_dci_waitq(uct_dc_mlx5_iface_t *iface)
383+
static UCS_F_ALWAYS_INLINE ucs_arbiter_t *
384+
uct_dc_mlx5_iface_dci_waitq(uct_dc_mlx5_iface_t *iface, uint8_t pool_index)
355385
{
356-
return &iface->super.super.tx.arbiter;
386+
return &iface->tx.dci_pool[pool_index].arbiter;
357387
}
358388

359-
static inline int
360-
uct_dc_mlx5_iface_dci_has_outstanding(uct_dc_mlx5_iface_t *iface, int dci)
389+
static UCS_F_ALWAYS_INLINE int
390+
uct_dc_mlx5_iface_dci_has_outstanding(uct_dc_mlx5_iface_t *iface, int dci_index)
361391
{
362392
uct_rc_txqp_t *txqp;
363393

364-
txqp = &iface->tx.dcis[dci].txqp;
365-
return uct_rc_txqp_available(txqp) < (int16_t)iface->super.super.config.tx_qp_len;
394+
txqp = &iface->tx.dcis[dci_index].txqp;
395+
return uct_rc_txqp_available(txqp) < (int16_t)iface->tx.bb_max;
366396
}
367397

368-
static inline ucs_status_t uct_dc_mlx5_iface_flush_dci(uct_dc_mlx5_iface_t *iface, int dci)
398+
static UCS_F_ALWAYS_INLINE ucs_status_t
399+
uct_dc_mlx5_iface_flush_dci(uct_dc_mlx5_iface_t *iface, int dci_index)
369400
{
370401

371-
if (!uct_dc_mlx5_iface_dci_has_outstanding(iface, dci)) {
402+
if (!uct_dc_mlx5_iface_dci_has_outstanding(iface, dci_index)) {
372403
return UCS_OK;
373404
}
374-
ucs_trace_poll("dci %d is not flushed %d/%d", dci,
375-
iface->tx.dcis[dci].txqp.available,
376-
iface->super.super.config.tx_qp_len);
377-
ucs_assertv(uct_rc_txqp_unsignaled(&iface->tx.dcis[dci].txqp) == 0,
405+
406+
ucs_trace_poll("dci %d is not flushed %d/%d", dci_index,
407+
iface->tx.dcis[dci_index].txqp.available, iface->tx.bb_max);
408+
ucs_assertv(uct_rc_txqp_unsignaled(&iface->tx.dcis[dci_index].txqp) == 0,
378409
"unsignalled send is not supported!!!");
379410
return UCS_INPROGRESS;
380411
}
381412

382-
static inline int
383-
uct_dc_mlx5_iface_is_dci_keepalive(uct_dc_mlx5_iface_t *iface, int dci)
413+
static UCS_F_ALWAYS_INLINE int
414+
uct_dc_mlx5_iface_is_dci_keepalive(uct_dc_mlx5_iface_t *iface, int dci_index)
384415
{
385-
ucs_assert(dci < uct_dc_mlx5_iface_total_ndci(iface));
386-
387-
return dci == iface->tx.ndci;
416+
return dci_index == iface->keepalive_dci;
388417
}
389418

390419
#endif

src/uct/ib/dc/dc_mlx5_devx.c

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,14 +74,19 @@ ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface)
7474

7575
ucs_status_t
7676
uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface,
77-
uct_ib_mlx5_qp_t *qp)
77+
uct_ib_mlx5_qp_t *qp,
78+
uint8_t lag_port)
7879
{
80+
uct_ib_device_t *dev = uct_ib_iface_device(&iface->super.super.super);
81+
uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md,
82+
uct_ib_mlx5_md_t);
7983
char in_2init[UCT_IB_MLX5DV_ST_SZ_BYTES(rst2init_qp_in)] = {};
8084
char out_2init[UCT_IB_MLX5DV_ST_SZ_BYTES(rst2init_qp_out)] = {};
8185
char in_2rtr[UCT_IB_MLX5DV_ST_SZ_BYTES(init2rtr_qp_in)] = {};
8286
char out_2rtr[UCT_IB_MLX5DV_ST_SZ_BYTES(init2rtr_qp_out)] = {};
8387
char in_2rts[UCT_IB_MLX5DV_ST_SZ_BYTES(rtr2rts_qp_in)] = {};
8488
char out_2rts[UCT_IB_MLX5DV_ST_SZ_BYTES(rtr2rts_qp_out)] = {};
89+
uint32_t opt_param_mask = UCT_IB_MLX5_QP_OPTPAR_RAE;
8590
ucs_status_t status;
8691
void *qpc;
8792

@@ -101,7 +106,6 @@ uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface,
101106

102107
UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opcode, UCT_IB_MLX5_CMD_OP_INIT2RTR_QP);
103108
UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, qpn, qp->qp_num);
104-
UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opt_param_mask, 4);
105109

106110
qpc = UCT_IB_MLX5DV_ADDR_OF(init2rtr_qp_in, in_2rtr, qpc);
107111
UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED);
@@ -112,11 +116,18 @@ uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface,
112116
if (uct_ib_iface_is_roce(&iface->super.super.super)) {
113117
UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.eth_prio,
114118
iface->super.super.super.config.sl);
119+
120+
if (md->flags & UCT_IB_MLX5_MD_FLAG_LAG) {
121+
opt_param_mask |= UCT_IB_MLX5_QP_OPTPAR_LAG_TX_AFF;
122+
UCT_IB_MLX5DV_SET(qpc, qpc, lag_tx_port_affinity,
123+
dev->first_port + lag_port);
124+
}
115125
} else {
116126
UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.sl,
117127
iface->super.super.super.config.sl);
118128
}
119129

130+
UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opt_param_mask, opt_param_mask);
120131
status = uct_ib_mlx5_devx_modify_qp(qp, in_2rtr, sizeof(in_2rtr),
121132
out_2rtr, sizeof(out_2rtr));
122133
if (status != UCS_OK) {

0 commit comments

Comments
 (0)