Skip to content

Commit 1f0fe81

Browse files
authored
Merge pull request #34 from GuangguanWang/main
Two fixes for StepMesh
2 parents 03deae2 + 6325888 commit 1f0fe81

File tree

4 files changed

+22
-13
lines changed

4 files changed

+22
-13
lines changed

fserver/csrc/public.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,6 @@ void init() {
178178
q_signal_.store(0);;
179179

180180
ps::StartPS(0, role_, group_size_ * node_rank_ + gpu_, true);
181-
Backend::Get()->SetDevice(gpu_);
182181
if (role_ == Node::WORKER) {
183182
fworker_ = new AFTensorWorker(instance_id_);
184183
barrier(true, true);

include/ps/ps.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,13 @@ inline void StartPS(int customer_id, Node::Role role, int rank, bool do_barrier,
143143
Backend::Register("GPU", new GpuBackend());
144144
#endif
145145

146+
// the scheduler does not need to attach to a GPU
147+
if (role != Node::SCHEDULER) {
148+
int gpu = 0;
149+
Environment::Get()->find("STEPMESH_GPU", &gpu, gpu);
150+
Backend::Get()->SetDevice(gpu);
151+
}
152+
146153
int group_size = 1;
147154

148155
Environment::Get()->find("DMLC_GROUP_SIZE", &group_size, group_size);

src/network_utils.h

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -335,20 +335,22 @@ static inline int GetAvailablePort(int num_ports, std::array<int, 32>* ports) {
335335
* \return 0 on failure or when CUDA is unavailable, 1 when the interface is obtained successfully
336336
*/
337337
static inline int GetInterfaceAndIPByCurrentGpu(std::string* interface,
338-
std::string* ip) {
338+
std::string* ip, int* gpu) {
339339
interface->clear();
340340
ip->clear();
341341

342342
#ifdef DMLC_USE_CUDA
343-
int gpu = -1;
344-
cudaGetDevice(&gpu);
345-
if (gpu == -1) return 0;
343+
cudaGetDevice(gpu);
344+
if (*gpu == -1) return 0;
346345
char pciPath[512];
347-
cudaDeviceProp deviceProp = {};
348-
cudaGetDeviceProperties(&deviceProp, gpu);
349-
snprintf(pciPath, sizeof(pciPath),
350-
"/sys/class/pci_bus/0000:%02x/device/0000:%02x:%02x.0/device",
351-
deviceProp.pciBusID, deviceProp.pciBusID, deviceProp.pciDeviceID);
346+
char busId[16];
347+
cudaError_t status;
348+
status = cudaDeviceGetPCIBusId(busId, 16, *gpu);
349+
PS_CHECK_EQ(status, cudaSuccess) << "cudaDeviceGetPCIBusId failed"
350+
<< " (" << cudaGetErrorString(status) << ")";
351+
for (int i = 0; i < 16; i++) busId[i] = std::tolower(busId[i]);
352+
snprintf(pciPath, sizeof(pciPath), "/sys/class/pci_bus/%.7s/device/%s/device",
353+
busId, busId);
352354
char* path = realpath(pciPath, nullptr);
353355

354356
if (path == nullptr) return 0;

src/van.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -552,9 +552,10 @@ void Van::Start(int customer_id, bool standalone) {
552552
std::string interface;
553553
if (itf) interface = std::string(itf);
554554
if (interface == "auto" || interface == "AUTO") {
555-
GetInterfaceAndIPByCurrentGpu(&interface, &ip);
556-
PS_LOG(INFO) << "automatic detect interface and ip from gpu: "
557-
<< interface << " (" << ip << ")";
555+
int gpu = -1;
556+
GetInterfaceAndIPByCurrentGpu(&interface, &ip, &gpu);
557+
PS_LOG(INFO) << "automatic detect interface and ip from gpu(" << gpu
558+
<< "): " << interface << " (" << ip << ")";
558559
Environment::Get()->set("DMLC_NODE_HOST", ip);
559560
Environment::Get()->set("DMLC_INTERFACE", interface);
560561
} else {

0 commit comments

Comments
 (0)