Skip to content

Commit 506f7e9

Browse files
Hakon-Buggealoktiwa
authored andcommitted
rds: ib: Add cm_id generation scheme in order to detect new ones
In rds_rdma_cm_event_handler_cmn() we have a simple address compare of the supplied cm_id and the one stored in the IB Connection struct, ic: if (ic->i_cm_id != cm_id) { But this address compare is flaky. Although the address is the same, the cm_id per se, does not need to be the same. The cm_id may be destroyed and re-created. Since we here talk about free() + kzalloc(), there is a fair probability that the new cm_id ends up at the very same address as the old one. If that happens, there is a dissonance between the RDS IB Connection state and the state of the cm_id. This bug used to manifest itself as a NULL ptr deref in rds_rdma_cm_event_handler_cmn(), but that was before commit a71a60c ("rds: ib: Remove incorrect update of the path record sl and qos_class fields"). Kernels with a71a60c may observe the following crash: BUG: kernel NULL pointer dereference, address: 0000000000000034 RIP: 0010:dma_direct_unmap_sg+0x5a/0x220 Call Trace: rds_ib_recv_cqe_handler+0xed/0x395 [rds_rdma] poll_rcq+0x72/0xa0 [rds_rdma] rds_ib_rx+0xb9/0x2b0 [rds_rdma] rds_ib_tasklet_fn_recv+0x2f/0x40 [rds_rdma] tasklet_action_common+0x153/0x240 handle_softirqs+0xe4/0x2ac __irq_exit_rcu+0xab/0xd0 common_interrupt+0x85/0xa0 Drgn analyses of the crash: >>> trace = prog.crashed_thread().stack_trace() >>> trace #0 sg_dma_is_bus_address (./include/linux/scatterlist.h:297:11) #1 dma_direct_unmap_sg (kernel/dma/direct.c:452:7) #2 ib_dma_unmap_sg_attrs (./include/rdma/ib_verbs.h:4232:3) #3 ib_dma_unmap_sg (./include/rdma/ib_verbs.h:4294:2) #4 rds_ib_recv_cqe_handler (net/rds/ib_recv.c:1397:2) #5 poll_rcq (net/rds/ib_cm.c:777:4) #6 rds_ib_rx (net/rds/ib_cm.c:870:2) #7 rds_ib_recv_cb (net/rds/ib_cm.c:916:2) #8 rds_ib_tasklet_fn_recv (net/rds/ib_cm.c:925:2) #9 tasklet_action_common (kernel/softirq.c:795:7) #10 handle_softirqs (kernel/softirq.c:561:3) #11 __do_softirq (kernel/softirq.c:595:2) #12 invoke_softirq (kernel/softirq.c:435:3) #13 __irq_exit_rcu (kernel/softirq.c:644:3) #14 common_interrupt (arch/x86/kernel/irq.c:280:1) #15 asm_common_interrupt+0x26/0x2b (./arch/x86/include/asm/idtentry.h:693) #16 cpuidle_enter_state (drivers/cpuidle/cpuidle.c:288:5) #17 cpuidle_enter (drivers/cpuidle/cpuidle.c:385:9) #18 cpuidle_idle_call (kernel/sched/idle.c:230:19) #19 do_idle (kernel/sched/idle.c:326:4) #20 cpu_startup_entry (kernel/sched/idle.c:424:3) #21 rest_init (init/main.c:747:2) #22 start_kernel (init/main.c:1105:2) #23 x86_64_start_reservations (arch/x86/kernel/head64.c:507:2) #24 x86_64_start_kernel (arch/x86/kernel/head64.c:488:2) #25 secondary_startup_64+0x15d/0x15f (arch/x86/kernel/head_64.S:413) >>> trace[4]["ic"].i_recvs[0] (struct rds_ib_recv_work){ .r_ibinc = (struct rds_ib_incoming *)0x0, .r_frag = (struct rds_page_frag *)0x0, .r_wr = (struct ib_recv_wr){ .next = (struct ib_recv_wr *)0x0, .wr_id = (u64)0, .wr_cqe = (struct ib_cqe *)0x0, .sg_list = (struct ib_sge *)0xffffb91c1a5b5030, .num_sge = (int)5, }, .r_sge = (struct ib_sge [8]){ { .addr = (u64)33819643904, .length = (u32)48, .lkey = (u32)18181, }, { .addr = (u64)0, <=== Note .length = (u32)4096, .lkey = (u32)18181, }, { .addr = (u64)0, <=== Note .length = (u32)4096, .lkey = (u32)18181, }, { .addr = (u64)0, <=== Note .length = (u32)4096, .lkey = (u32)18181, }, { .addr = (u64)0, <=== Note .length = (u32)4096, .lkey = (u32)18181, }, }, .r_ic = (struct rds_ib_connection *)0x0, .r_posted = (int)0, } The prosaic version: We do get an event and processes a recv CQE, but when performing DMA unmap, the addresses for SGE entry 1-4 are all zero. That explains the crash. If you look at rds_ib_recv_init_ring(), the state will be exactly what we see above when this function has been called. So, how can this happen? We do get an RDMA_CM_EVENT_ROUTE_RESOLVED event with a new cm_id which happens to have the same address as the one stored in the RDS IB Connection (ic). Now, without a71a60c, we do not crash in rds_rdma_cm_event_handler_cmn() but calls: trans->cm_initiate_connect() rds_ib_cm_initiate_connect() rds_ib_recv_init_ring() and the latter does: sge = recv->r_sge; sge->addr = ic->i_recv_hdrs_dma[i]; sge->length = sizeof(struct rds_header); sge->lkey = ic->i_mr->lkey; for (j = 1; j < num_send_sge; j++) { sge = recv->r_sge + j; sge->addr = 0; <==== Note sge->length = PAGE_SIZE; sge->lkey = ic->i_mr->lkey; } This fits 100% with the r_sge pasted above. We fix this bug by introducing a generation number and embed that in the last three bits of cm_id->context, since these bits are zero due to alignment requirement. When we use cm_id->context to find the RDS Connection (conn), we mask away the three lower bits. When we create the cm_id, we insert the next generation and make a copy of it in the ic. In rds_rdma_cm_event_handler_cmn(), when we have established that the two addresses are equal, we add an additional check to see if the cm_id generation is the same between the rdma_cm supplied cm_id and the generation stored in ic. If the generations differ, we increment the s_ib_cm_id_resurrected statistics counter and return. When handling an incoming connect, we set the cm_id generation count stored in ic to zero. Orabug: 37799171 Fixes: 3ae09d7 ("net/rds: Don't block workqueues "cma_wq" and "cm.wq"") Signed-off-by: Håkon Bugge <[email protected]> Reviewed-by: Sharath Srinivasan <[email protected]> Signed-off-by: Alok Tiwari <[email protected]>
1 parent d24ea44 commit 506f7e9

File tree

4 files changed

+31
-8
lines changed

4 files changed

+31
-8
lines changed

net/rds/ib.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,11 @@ enum rds_ib_conn_flags {
9292

9393
#define RDS_IB_CQ_FOLLOW_AFFINITY_THROTTLE 100 /* msec */
9494

95+
#define RDS_IB_CM_IB_GEN_MASK 7
96+
#define RDS_IB_CM_ID_ADD_GEN(conn, gen) ((void *)((uintptr_t)(conn) + (gen)))
97+
#define RDS_IB_CM_ID_EXTRACT_GEN(id) ((uintptr_t)(id)->context & RDS_IB_CM_IB_GEN_MASK)
98+
#define RDS_IB_CM_ID_EXTRACT_CONN(id) ((void *)((uintptr_t)(id)->context & ~RDS_IB_CM_IB_GEN_MASK))
99+
95100
enum rds_ib_preferred_cpu_options {
96101
RDS_IB_PREFER_CPU_CQ = 1 << 0,
97102
RDS_IB_PREFER_CPU_NUMA = 1 << 1,
@@ -360,6 +365,7 @@ struct rds_ib_connection {
360365
u16 i_frag_sz; /* IB fragment size */
361366
u16 i_frag_cache_sz;
362367
u8 i_frag_pages;
368+
u8 i_cm_id_gen:3;
363369
unsigned long i_flags;
364370
u16 i_frag_cache_inx;
365371
u16 i_hca_sge;
@@ -648,6 +654,7 @@ struct rds_ib_statistics {
648654
uint64_t s_ib_frag_pages_allocated;
649655
uint64_t s_ib_frag_pages_in_ib_recv_queue;
650656
uint64_t s_ib_frag_pages_in_caches;
657+
uint64_t s_ib_cm_id_resurrected;
651658
};
652659

653660
extern struct workqueue_struct *rds_ib_wq;

net/rds/ib_cm.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1828,6 +1828,7 @@ static int rds_ib_cm_accept(struct rds_connection *conn,
18281828
BUG_ON(ic->i_cm_id);
18291829

18301830
ic->i_cm_id = cm_id;
1831+
ic->i_cm_id_gen = 0;
18311832
cm_id->context = conn;
18321833

18331834
if (isv6) {
@@ -2153,7 +2154,7 @@ void rds_ib_conn_destroy_init(struct rds_connection *conn)
21532154

21542155
int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
21552156
{
2156-
struct rds_connection *conn = cm_id->context;
2157+
struct rds_connection *conn = RDS_IB_CM_ID_EXTRACT_CONN(cm_id);
21572158
struct rds_ib_connection *ic = conn->c_transport_data;
21582159
struct rdma_conn_param conn_param;
21592160
union rds_ib_conn_priv dp;
@@ -2358,8 +2359,9 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
23582359
handler = rds_rdma_cm_event_handler;
23592360

23602361
WARN_ON(ic->i_cm_id);
2361-
ic->i_cm_id = rdma_create_id(rds_conn_net(conn),
2362-
handler, conn, RDMA_PS_TCP, IB_QPT_RC);
2362+
ic->i_cm_id = rdma_create_id(rds_conn_net(conn), handler,
2363+
RDS_IB_CM_ID_ADD_GEN(conn, ++ic->i_cm_id_gen),
2364+
RDMA_PS_TCP, IB_QPT_RC);
23632365

23642366
if (IS_ERR(ic->i_cm_id)) {
23652367
ret = PTR_ERR(ic->i_cm_id);

net/rds/ib_stats.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2006 Oracle. All rights reserved.
2+
* Copyright (c) 2006, 2025, Oracle and/or its affiliates.
33
*
44
* This software is available to you under a choice of one of two
55
* licenses. You may choose to be licensed under the terms of the GNU
@@ -112,6 +112,7 @@ static char *rds_ib_stat_names[] = {
112112
"ib_frag_pages_allocated",
113113
"ib_frag_pages_in_ib_recv_queue",
114114
"ib_frag_pages_in_caches",
115+
"ib_cm_id_resurrected",
115116
};
116117

117118
unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,

net/rds/rdma_transport.c

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ struct rds_rdma_cm_event_handler_info {
5353
struct rdma_cm_event event;
5454
struct rds_connection *conn;
5555
bool isv6;
56+
int cm_id_gen;
5657
char private_data[];
5758
};
5859

@@ -111,6 +112,7 @@ static inline bool __rds_rdma_chk_dev(struct rds_connection *conn)
111112
}
112113

113114
static void rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
115+
int cm_id_gen,
114116
struct rdma_cm_event *event,
115117
struct rds_connection *conn,
116118
bool isv6)
@@ -199,6 +201,14 @@ static void rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
199201
return;
200202
}
201203

204+
/* Although same address, could be different generation */
205+
if (cm_id_gen != ic->i_cm_id_gen) {
206+
rds_ib_stats_inc(s_ib_cm_id_resurrected);
207+
up_read(&ic->i_cm_id_free_lock);
208+
mutex_unlock(&conn->c_cm_lock);
209+
return;
210+
}
211+
202212
/* Even though this function no longer accesses "ic->i_cm_id" past this point
203213
* and "cma.c" always blocks "rdma_destroy_id" until "event_callback" is done,
204214
* we still need to hang on to the "i_cm_id_free_lock" until return,
@@ -389,7 +399,8 @@ static void rds_rdma_cm_event_handler_worker(struct work_struct *work)
389399
struct rds_rdma_cm_event_handler_info,
390400
work);
391401

392-
rds_rdma_cm_event_handler_cmn(info->cm_id, &info->event, info->conn, info->isv6);
402+
rds_rdma_cm_event_handler_cmn(info->cm_id, info->cm_id_gen, &info->event,
403+
info->conn, info->isv6);
393404

394405
kfree(info);
395406
}
@@ -398,7 +409,8 @@ static void rds_spawn_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
398409
struct rdma_cm_event *event,
399410
bool isv6)
400411
{
401-
struct rds_connection *conn = cm_id->context;
412+
int cm_id_gen = RDS_IB_CM_ID_EXTRACT_GEN(cm_id);
413+
struct rds_connection *conn = RDS_IB_CM_ID_EXTRACT_CONN(cm_id);
402414
struct rds_ib_connection *ic = conn ? conn->c_transport_data : NULL;
403415
struct workqueue_struct *wq;
404416
struct rds_rdma_cm_event_handler_info *info;
@@ -409,13 +421,13 @@ static void rds_spawn_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
409421
wq = rds_aux_wq;
410422

411423
if (!wq) {
412-
rds_rdma_cm_event_handler_cmn(cm_id, event, conn, isv6);
424+
rds_rdma_cm_event_handler_cmn(cm_id, cm_id_gen, event, conn, isv6);
413425
return;
414426
}
415427

416428
info = kmalloc(sizeof(*info) + event->param.conn.private_data_len, GFP_KERNEL);
417429
if (!info) {
418-
rds_rdma_cm_event_handler_cmn(cm_id, event, conn, isv6);
430+
rds_rdma_cm_event_handler_cmn(cm_id, cm_id_gen, event, conn, isv6);
419431
return;
420432
}
421433

@@ -425,6 +437,7 @@ static void rds_spawn_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
425437
memcpy(&info->event, event, sizeof(*event));
426438
info->conn = conn;
427439
info->isv6 = isv6;
440+
info->cm_id_gen = cm_id_gen;
428441

429442
if (event->param.conn.private_data &&
430443
event->param.conn.private_data_len) {

0 commit comments

Comments
 (0)