Skip to content

Commit 8b83250

Browse files
UCT/CUDA_IPC: move device id caching to ep creation
1 parent 5f205b2 commit 8b83250

File tree

2 files changed

+33
-34
lines changed

2 files changed

+33
-34
lines changed

src/uct/cuda/cuda_ipc/cuda_ipc_ep.c

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,41 @@
2525

2626
static UCS_CLASS_INIT_FUNC(uct_cuda_ipc_ep_t, const uct_ep_params_t *params)
2727
{
28-
uct_cuda_ipc_iface_t *iface = ucs_derived_of(params->iface,
29-
uct_cuda_ipc_iface_t);
28+
uct_cuda_ipc_iface_t *iface = ucs_derived_of(params->iface,
29+
uct_cuda_ipc_iface_t);
30+
uct_cuda_base_sys_dev_map_t *remote = (uct_cuda_base_sys_dev_map_t*)
31+
params->iface_addr;
32+
uct_cuda_base_sys_dev_map_t *hash;
33+
khiter_t khiter;
34+
int khret;
35+
int i;
36+
37+
ucs_recursive_spin_lock(&iface->rem_iface_addr_lock);
38+
39+
khiter = kh_put(cuda_ipc_rem_iface_addr, &iface->rem_iface_addr_hash,
40+
remote->pid, &khret);
41+
if ((khret == UCS_KH_PUT_BUCKET_EMPTY) ||
42+
(khret == UCS_KH_PUT_BUCKET_CLEAR)) {
43+
hash = &kh_val(&iface->rem_iface_addr_hash, khiter);
44+
hash->count = remote->count;
45+
hash->pid = remote->pid;
46+
47+
for (i = 0; i < remote->count; i++) {
48+
hash->sys_dev[i] = remote->sys_dev[i];
49+
hash->bus_id[i] = remote->bus_id[i];
50+
ucs_trace("peer pid %ld sys_dev %u bus_id %u",
51+
(long)hash->pid, (unsigned)hash->sys_dev[i],
52+
(unsigned)hash->bus_id[i]);
53+
}
54+
} else if (khret != UCS_KH_PUT_KEY_PRESENT) {
55+
ucs_error("unable to use cuda_ipc remote_iface_addr hash");
56+
}
57+
ucs_recursive_spin_unlock(&iface->rem_iface_addr_lock);
3058

3159
UCT_EP_PARAMS_CHECK_DEV_IFACE_ADDRS(params);
3260
UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super);
3361

34-
self->remote_pid = *(const pid_t*)params->iface_addr;
62+
self->remote_pid = remote->pid;
3563

3664
return uct_ep_keepalive_init(&self->keepalive, self->remote_pid);
3765
}

src/uct/cuda/cuda_ipc/cuda_ipc_iface.c

Lines changed: 2 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -93,40 +93,11 @@ static int uct_cuda_ipc_iface_is_reachable(const uct_iface_h tl_iface,
9393
const uct_device_addr_t *dev_addr,
9494
const uct_iface_addr_t *iface_addr)
9595
{
96-
uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_ipc_iface_t);
97-
uct_cuda_base_sys_dev_map_t *remote = (uct_cuda_base_sys_dev_map_t*)iface_addr;
98-
uct_cuda_base_sys_dev_map_t *hash;
99-
khiter_t khiter;
100-
int khret;
101-
int i;
102-
103-
ucs_recursive_spin_lock(&iface->rem_iface_addr_lock);
104-
105-
khiter = kh_put(cuda_ipc_rem_iface_addr, &iface->rem_iface_addr_hash,
106-
remote->pid, &khret);
107-
if ((khret == UCS_KH_PUT_BUCKET_EMPTY) ||
108-
(khret == UCS_KH_PUT_BUCKET_CLEAR)) {
109-
hash = &kh_val(&iface->rem_iface_addr_hash, khiter);
110-
hash->count = remote->count;
111-
hash->pid = remote->pid;
112-
113-
for (i = 0; i < remote->count; i++) {
114-
hash->sys_dev[i] = remote->sys_dev[i];
115-
hash->bus_id[i] = remote->bus_id[i];
116-
ucs_trace("peer pid %ld sys_dev %u bus_id %u",
117-
(long)hash->pid, (unsigned)hash->sys_dev[i],
118-
(unsigned)hash->bus_id[i]);
119-
}
120-
} else if (khret == UCS_KH_PUT_KEY_PRESENT) {
121-
/* do nothing */
122-
} else {
123-
ucs_error("unable to use cuda_ipc remote_iface_addr hash");
124-
}
125-
ucs_recursive_spin_unlock(&iface->rem_iface_addr_lock);
96+
uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_ipc_iface_t);
12697

12798
return ((uct_cuda_ipc_iface_node_guid(&iface->super) ==
12899
*((const uint64_t *)dev_addr)) &&
129-
((getpid() != remote->pid)));
100+
((getpid() != ((uct_cuda_base_sys_dev_map_t*)iface_addr)->pid)));
130101
}
131102

132103
static double uct_cuda_ipc_iface_get_bw()

0 commit comments

Comments
 (0)