HSDP example
- Ported from fsdp_tp_example.py in the PyTorch examples; see the TP tutorial for background. A sketch of its FSDP + TP setup appears after the run commands below.
PP example
- Based on torch.distributed.pipelining.
Setup
- PyTorch 2.4+ is required for the HSDP example; PyTorch 2.6+ for the PP example
- install oneAPI 2025.1
- build PyTorch from source:
# clone torch source code
git clone https://github.com/pytorch/pytorch.git
cd pytorch
git checkout v2.8.0-rc4
git submodule sync
git submodule update --init --recursive
# install build dependencies
pip install -r requirements.txt
# build torch
USE_XPU=1 USE_CUDA=0 python setup.py develop
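Once the build completes (see the bug fixes below if it does not), a quick sanity check can confirm that the installed package sees the XPU devices. This is only a sketch; the file name is illustrative, and it should be run from outside the pytorch source tree so Python imports the built package.
# check_xpu.py (illustrative name); run with: python check_xpu.py
import torch

print(torch.__version__)               # should report the version you just built
print(torch.xpu.is_available())        # True when the driver and oneAPI runtime are visible
print(torch.xpu.device_count())        # number of GPU cards detected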
Bug fixes
- cannot find xpupti: disable XPUPTI in cmake/Dependencies.cmake
@@ -1627,7 +1627,7 @@ if(USE_KINETO)
   if((NOT USE_XPU) OR (NOT XPU_ENABLE_KINETO))
     set(LIBKINETO_NOXPUPTI ON CACHE STRING "" FORCE)
   else()
-    set(LIBKINETO_NOXPUPTI OFF CACHE STRING "")
+    set(LIBKINETO_NOXPUPTI ON CACHE STRING "")
     message(STATUS "Using Kineto with XPUPTI support")
   endif()
- collective API issue: upgrade the oneCCL version and update LD_LIBRARY_PATH before running
export LD_LIBRARY_PATH=<oneapi...>:<built-ccl-package-path>
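To confirm the collective path works after the oneCCL fix, a minimal all_reduce smoke test can be launched with torchrun. This is a sketch: the file name is illustrative, and it assumes the native XPU distributed backend ("xccl") is available in your build; older stacks may instead need oneccl_bindings_for_pytorch and backend="ccl".
# ccl_check.py (illustrative name); run with: torchrun --nnodes=1 --nproc_per_node=2 ccl_check.py
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ["LOCAL_RANK"])
torch.xpu.set_device(local_rank)
# assumption: xccl backend; use backend="ccl" with oneccl_bindings_for_pytorch instead
dist.init_process_group(backend="xccl")

t = torch.ones(4, device="xpu") * (dist.get_rank() + 1)
dist.all_reduce(t)                           # default op: SUM across all ranks
print(f"rank {dist.get_rank()}: {t.cpu().tolist()}")
dist.destroy_process_group()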
Run
- single machine with multiple cards
# 4 GPU cards
torchrun --nnodes=1 --nproc_per_node=4 fsdp_tp_example.py
- multi-node
# master node, 4 GPU cards per node
torchrun --nnodes=2 --nproc_per_node=4 --node_rank=0 --master_addr=<master_ip> --master_port=29500 fsdp_tp_example.py
# other node
torchrun --nnodes=2 --nproc_per_node=4 --node_rank=1 --master_addr=<master_ip> --master_port=29500 fsdp_tp_example.py
# if the connection fails, pin the network interface
export NCCL_SOCKET_IFNAME=<eth-port-name>
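For reference, fsdp_tp_example.py combines tensor parallelism within each group of cards and FSDP across the remaining data-parallel dimension through a 2D device mesh. The sketch below shows that pattern on XPU with a toy model; ToyModel, the layer sizes, and tp_size=2 are illustrative rather than the exact upstream code.
# fsdp_tp_sketch.py (illustrative name); launch like the single-machine command above:
#   torchrun --nnodes=1 --nproc_per_node=4 fsdp_tp_sketch.py
import os
import torch
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.in_proj = nn.Linear(16, 32)
        self.relu = nn.ReLU()
        self.out_proj = nn.Linear(32, 16)

    def forward(self, x):
        return self.out_proj(self.relu(self.in_proj(x)))

local_rank = int(os.environ["LOCAL_RANK"])
torch.xpu.set_device(local_rank)

# 2D mesh: FSDP over the "dp" dimension, tensor parallelism over the "tp" dimension.
# init_device_mesh also initializes the default process group if needed.
world_size = int(os.environ["WORLD_SIZE"])
tp_size = 2                                   # cards per TP group (illustrative)
mesh_2d = init_device_mesh("xpu", (world_size // tp_size, tp_size),
                           mesh_dim_names=("dp", "tp"))

model = ToyModel().to("xpu")
# shard in_proj column-wise and out_proj row-wise within each TP group
model = parallelize_module(model, mesh_2d["tp"],
                           {"in_proj": ColwiseParallel(), "out_proj": RowwiseParallel()})
# then shard the TP-parallelized model with FSDP over the "dp" dimension
model = FSDP(model, device_mesh=mesh_2d["dp"], use_orig_params=True)

out = model(torch.rand(8, 16, device="xpu"))
out.sum().backward()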
- PP example
# schedule options: GPipe, 1F1B, ZBVZeroBubble
torchrun --nnodes=1 --nproc_per_node=4 pp_example.py -s <schedule> [--skip-profile] [--capture]
- TP example
torchrun --nnodes=1 --nproc_per_node=4 tp_example.py
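pp_example.py builds on the torch.distributed.pipelining API. The sketch below shows the basic single-stage-per-rank pattern with a GPipe schedule on XPU; the model, shapes, micro-batch count, and file name are illustrative, and the real example adds the schedule selection, profiling, and capture options. Schedule1F1B is a drop-in replacement for ScheduleGPipe here, while zero-bubble V schedules arrange more than one stage per rank.
# pp_sketch.py (illustrative name); launch with one process per stage, e.g.:
#   torchrun --nnodes=1 --nproc_per_node=4 pp_sketch.py
import os
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.distributed.pipelining import PipelineStage, ScheduleGPipe

rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
local_rank = int(os.environ["LOCAL_RANK"])
torch.xpu.set_device(local_rank)
device = torch.device("xpu", local_rank)
dist.init_process_group(backend="xccl")     # assumption: xccl backend; adjust for your stack

# each rank owns one stage of a toy pipeline (one Linear layer per stage)
stage_mod = nn.Linear(64, 64).to(device)
stage = PipelineStage(stage_mod, stage_index=rank, num_stages=world_size, device=device)

# forward-only GPipe schedule with 8 micro-batches; pass loss_fn to also run backward
schedule = ScheduleGPipe(stage, n_microbatches=8)

x = torch.rand(32, 64, device=device)       # global batch, split into 8 micro-batches
if rank == 0:
    schedule.step(x)                        # first stage feeds the input
else:
    out = schedule.step()                   # intermediate stages return None, the last stage returns the output

dist.destroy_process_group()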