UCT/CUDA_IPC: add remote iface addr sys_dev map #7896
base: master
@@ -13,6 +13,7 @@
#include "cuda_ipc_ep.h"

#include <uct/cuda/base/cuda_iface.h>
#include <uct/cuda/base/cuda_md.h>
#include <ucs/type/class.h>
#include <ucs/sys/string.h>
#include <ucs/debug/assert.h>

@@ -74,18 +75,29 @@ ucs_status_t uct_cuda_ipc_iface_get_device_address(uct_iface_t *tl_iface,
 static ucs_status_t uct_cuda_ipc_iface_get_address(uct_iface_h tl_iface,
                                                    uct_iface_addr_t *iface_addr)
 {
-    *(pid_t*)iface_addr = getpid();
+    uct_cuda_base_sys_dev_map_t *local = (uct_cuda_base_sys_dev_map_t*)iface_addr;
+    int i;
+
+    local->pid   = getpid();
+    local->count = uct_cuda_sys_dev_bus_id_map.count;
+
+    for (i = 0; i < local->count; i++) {
+        local->sys_dev[i] = uct_cuda_sys_dev_bus_id_map.sys_dev[i];
+        local->bus_id[i]  = uct_cuda_sys_dev_bus_id_map.bus_id[i];
+    }
+
     return UCS_OK;
 }

 static int uct_cuda_ipc_iface_is_reachable(const uct_iface_h tl_iface,
                                            const uct_device_addr_t *dev_addr,
                                            const uct_iface_addr_t *iface_addr)
 {
     uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_ipc_iface_t);

     return ((uct_cuda_ipc_iface_node_guid(&iface->super) ==
-             *((const uint64_t *)dev_addr)) && ((getpid() != *(pid_t *)iface_addr)));
+             *((const uint64_t *)dev_addr)) &&
+            ((getpid() != ((uct_cuda_base_sys_dev_map_t*)iface_addr)->pid)));
 }

 static double uct_cuda_ipc_iface_get_bw()

Review discussion:

Reviewer: Will extra checks be added later?
Author: No further checks here.

Reviewer: What is the reason to update the hash in this function? How is it going to be used?
Author: @brminich updated the PR description as well. Let me know if it makes sense. The UCT perf_estimate API takes two system devices (local, remote) to provide bandwidth/latency estimates; for cuda_ipc these are the local GPU's sys_dev and the remote GPU's sys_dev. However, the remote GPU's sys_dev may not correspond to the same bus_id on the local process, because of the order in which sys_devices are populated. For this reason we need a way to interpret the remote sys_dev, translate it to a local sys_dev, and use that in the cuda_ipc perf estimate to check whether there are NVLinks between the devices. The iface address exchange phase seemed like the most convenient place to add this logic.

Reviewer: you mean
Author: @brminich addressed this

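To make the translation idea above concrete, here is a minimal sketch of how a remote sys_dev could be mapped to the local numbering by matching bus IDs from the exchanged iface address. The struct layout, the UCT_CUDA_MAX_DEVICES_EXAMPLE bound, and the helper remote_to_local_sys_dev are assumptions for illustration; the PR's actual definition of uct_cuda_base_sys_dev_map_t and its lookup code are not part of this hunk.

```c
/* Sketch only (not the PR's code): assumed layout of the exchanged map and a
 * hypothetical helper that translates a remote sys_dev to the local sys_dev
 * referring to the same physical GPU, by matching PCI bus ids. */
#include <sys/types.h>
#include <stdint.h>

#define UCT_CUDA_MAX_DEVICES_EXAMPLE 16   /* assumed upper bound */

typedef struct {
    pid_t    pid;                                    /* owner process id   */
    unsigned count;                                  /* number of GPUs     */
    uint8_t  sys_dev[UCT_CUDA_MAX_DEVICES_EXAMPLE];  /* sys_dev ids        */
    uint16_t bus_id[UCT_CUDA_MAX_DEVICES_EXAMPLE];   /* matching PCI bus ids */
} uct_cuda_base_sys_dev_map_t;

/* Return the local sys_dev matching remote_sys_dev, or -1 if the remote GPU's
 * bus id is not visible in the local map. */
static int remote_to_local_sys_dev(const uct_cuda_base_sys_dev_map_t *remote,
                                   const uct_cuda_base_sys_dev_map_t *local,
                                   uint8_t remote_sys_dev)
{
    unsigned i, j;

    for (i = 0; i < remote->count; i++) {
        if (remote->sys_dev[i] != remote_sys_dev) {
            continue;
        }
        /* found the remote entry; look up the same bus id locally */
        for (j = 0; j < local->count; j++) {
            if (local->bus_id[j] == remote->bus_id[i]) {
                return local->sys_dev[j];
            }
        }
    }
    return -1;
}
```

With such a translation, the cuda_ipc perf estimate can reason about NVLink connectivity between the local GPU and the peer GPU in local sys_dev terms, which is the use case the author describes.
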
@@ -198,7 +210,7 @@ static ucs_status_t uct_cuda_ipc_iface_query(uct_iface_h tl_iface,
     uct_base_iface_query(&iface->super, iface_attr);

-    iface_attr->iface_addr_len = sizeof(pid_t);
+    iface_attr->iface_addr_len = sizeof(uct_cuda_base_sys_dev_map_t);
     iface_attr->device_addr_len = sizeof(uint64_t);
     iface_attr->ep_addr_len = 0;
     iface_attr->max_conn_priv = 0;

Review discussion:

Reviewer: Can we define the length according to the real number of GPUs (to avoid an unnecessary address increase)?
Author: @brminich Kept it this way to handle the case where different processes on the same node use a different number of devices in CUDA_VISIBLE_DEVICES. For example, if process 0 uses CUDA_VISIBLE_DEVICES=0 and process 1 uses CUDA_VISIBLE_DEVICES=1,2, then the cuda_ipc iface instances of the two processes would have different iface_addr_len. I wasn't sure whether iface address lengths have to be consistent across processes during the iface_addr exchange.
Reviewer: I think it is ok to have different address lengths, but it will not work with unified mode (which is disabled by default).

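For context on the reviewer's suggestion, a variable-length address could in principle derive its size from the number of visible devices. The sketch below is only an illustration under the assumed field layout from the earlier example, not what the PR implements (the PR advertises a fixed sizeof(uct_cuda_base_sys_dev_map_t)).

```c
/* Sketch only: size of a packed, variable-length iface address under the
 * assumed layout (pid, count, then count sys_dev/bus_id pairs). */
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>

static size_t packed_iface_addr_len_example(unsigned num_visible_devices)
{
    return sizeof(pid_t) + sizeof(unsigned) +
           num_visible_devices * (sizeof(uint8_t) + sizeof(uint16_t));
}
```

The trade-off raised in the thread still applies: processes with different CUDA_VISIBLE_DEVICES settings would then advertise different address lengths, which the reviewer notes is acceptable except in unified mode.
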
@@ -530,6 +542,9 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_ipc_iface_t, uct_md_h md, uct_worker_h worker,
     self->cuda_context = 0;
     ucs_queue_head_init(&self->outstanding_d2d_event_q);

+    ucs_recursive_spinlock_init(&self->rem_iface_addr_lock, 0);
+    kh_init_inplace(cuda_ipc_rem_iface_addr, &self->rem_iface_addr_hash);
+
     return UCS_OK;
 }

@@ -560,6 +575,9 @@ static UCS_CLASS_CLEANUP_FUNC(uct_cuda_ipc_iface_t)
     if (self->eventfd != -1) {
         close(self->eventfd);
     }
+
+    kh_destroy_inplace(cuda_ipc_rem_iface_addr, &self->rem_iface_addr_hash);
+    ucs_recursive_spinlock_destroy(&self->rem_iface_addr_lock);
 }

 ucs_status_t

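The init and cleanup hunks above only create and destroy the hash. As a rough, hypothetical illustration of how a pid-keyed cache of remote maps could be filled and queried with UCX's bundled khash (reusing the assumed uct_cuda_base_sys_dev_map_t layout from the earlier sketch), see below; the hash name, helper names, and locking pattern are assumptions, and the PR's real insertion/lookup code is not shown in this diff.

```c
/* Sketch only: caching the remote sys_dev map keyed by the peer pid,
 * mirroring the kh_init_inplace/kh_destroy_inplace calls in the diff. */
#include <ucs/datastruct/khash.h>   /* UCX-bundled khash */
#include <ucs/type/spinlock.h>
#include <sys/types.h>

KHASH_MAP_INIT_INT64(rem_iface_addr_example, uct_cuda_base_sys_dev_map_t)

typedef struct {
    ucs_recursive_spinlock_t        lock;
    khash_t(rem_iface_addr_example) hash;
} example_rem_addr_cache_t;

static void example_cache_init(example_rem_addr_cache_t *cache)
{
    ucs_recursive_spinlock_init(&cache->lock, 0);
    kh_init_inplace(rem_iface_addr_example, &cache->hash);
}

static void example_cache_cleanup(example_rem_addr_cache_t *cache)
{
    kh_destroy_inplace(rem_iface_addr_example, &cache->hash);
    ucs_recursive_spinlock_destroy(&cache->lock);
}

/* Remember (or refresh) the map advertised by a peer. */
static void example_cache_put(example_rem_addr_cache_t *cache,
                              const uct_cuda_base_sys_dev_map_t *remote)
{
    khiter_t iter;
    int ret;

    ucs_recursive_spin_lock(&cache->lock);
    iter = kh_put(rem_iface_addr_example, &cache->hash,
                  (khint64_t)remote->pid, &ret);
    if (ret != -1) {            /* -1 means the insertion failed */
        kh_value(&cache->hash, iter) = *remote;
    }
    ucs_recursive_spin_unlock(&cache->lock);
}

/* Copy out a previously cached map; returns 0 if the peer is unknown. */
static int example_cache_get(example_rem_addr_cache_t *cache, pid_t pid,
                             uct_cuda_base_sys_dev_map_t *out)
{
    khiter_t iter;
    int found;

    ucs_recursive_spin_lock(&cache->lock);
    iter  = kh_get(rem_iface_addr_example, &cache->hash, (khint64_t)pid);
    found = (iter != kh_end(&cache->hash));
    if (found) {
        *out = kh_value(&cache->hash, iter);
    }
    ucs_recursive_spin_unlock(&cache->lock);
    return found;
}
```
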
Review discussion:

Reviewer: Is it needed to protect uct_cuda_sys_dev_bus_id_map? If yes, when can it be accessed concurrently?
Author: As this logic is called as part of the md_query functionality, and because we access the global structure uct_cuda_sys_dev_bus_id_map, I used a spinlock around the access. Is this unnecessary? Is it guaranteed that query_md_resources will be called by only one thread at any given time?
Reviewer: Currently we support multi-threading for UCP only, and all UCP API calls (including progress) are protected by a global lock. Imo, there is no need for a lock here then.