init
This commit is contained in:
70
hy3dshape/scripts/train_deepspeed.sh
Normal file
70
hy3dshape/scripts/train_deepspeed.sh
Normal file
@@ -0,0 +1,70 @@
# NCCL / network environment for multi-node DeepSpeed training.
# Run this script with bash (it uses the [[ ]] test syntax).
#
# Troubleshooting:
# If: ImportError: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.20' not found
# Do: ln /usr/local/gcc-8.3/lib64/libstdc++.so.6 -sf /usr/lib64/libstdc++.so.6

export NCCL_IB_TIMEOUT=24
export NCCL_NVLS_ENABLE=0

# Network profile selector: "low" picks the eth1 profile, anything else the
# bonded-interface profile. Defaults to "high"; can be overridden from the
# caller's environment, e.g. `NET_TYPE=low bash train_deepspeed.sh ...`.
NET_TYPE="${NET_TYPE:-high}"

# Settings that are identical in both network profiles.
export NCCL_IB_GID_INDEX=3
export NCCL_IB_SL=3
export NCCL_CHECK_DISABLE=1
export NCCL_P2P_DISABLE=0
export NCCL_LL_THRESHOLD=16384
export NCCL_IB_CUDA_SUPPORT=1

if [[ "${NET_TYPE}" == "low" ]]; then
  export NCCL_SOCKET_IFNAME=eth1
  # NOTE(review): the same HCA/port is listed twice; kept as-is because the
  # intent cannot be determined from this script -- confirm whether the second
  # entry should be a different device.
  export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1
else
  export NCCL_IB_DISABLE=0
  export NCCL_SOCKET_IFNAME=bond1
  export UCX_NET_DEVICES=bond1
  export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6
  export NCCL_COLLNET_ENABLE=0
  export SHARP_COLL_ENABLE_SAT=0
  export NCCL_NET_GDR_LEVEL=2
  export NCCL_IB_QPS_PER_CONNECTION=4
  export NCCL_IB_TC=160
  export NCCL_PXN_DISABLE=0
fi

export NCCL_DEBUG=WARN

# Positional arguments:
#   $1  node_num    forwarded to main.py as --num_nodes
#   $2  node_rank   passed to main.py via the NODE_RANK environment variable
#   $3  master_ip   passed to main.py via the MASTER_ADDR environment variable
#   $4  config      config file; copied into output_dir and forwarded as --config
#   $5  output_dir  forwarded to main.py as --output_dir
if (( $# < 5 )); then
  # Non-fatal hint only: the steps that consume these values decide whether a
  # missing one is an error.
  printf 'warning: expected 5 arguments, got %d\n' "$#" >&2
  printf 'usage: %s <node_num> <node_rank> <master_ip> <config> <output_dir>\n' "${0##*/}" >&2
fi

node_num=${1:-}
node_rank=${2:-}
master_ip=${3:-}
config=${4:-}
output_dir=${5:-}

# config='configs/dit-from-scratch-overfitting-flowmatching-dinog518-bf16-lr1e4-1024.yaml'
# output_dir='output_folder/dit/overfitting_10'

# Log the effective settings. Values are quoted so that whitespace or glob
# characters in an argument are printed verbatim.
printf '%s %s\n' node_num "$node_num"
printf '%s %s\n' node_rank "$node_rank"
printf '%s %s\n' master_ip "$master_ip"
printf '%s %s\n' config "$config"
printf '%s %s\n' output_dir "$output_dir"

# Copy the config file into the output directory, creating the directory first
# if needed.
#
# Empty values are rejected up front: with them, mkdir/cp would only emit
# confusing "missing operand" errors and the launch would continue with no
# usable config or output location.
if [[ -z "${config:-}" || -z "${output_dir:-}" ]]; then
  printf 'error: config (arg 4) and output_dir (arg 5) must be non-empty\n' >&2
  exit 2
fi

# `mkdir -p` is a no-op when the directory already exists, so no separate
# existence test is needed. As before, a failing mkdir/cp reports its own
# error on stderr and does not stop the script.
mkdir -p -- "$output_dir"
cp -- "$config" "$output_dir"

# Launch main.py on this node with DeepSpeed enabled and 8 GPUs per node.
#
# - ${var:?msg} aborts the script with `msg` when a required value is unset or
#   empty, instead of starting main.py with blank arguments.
# - NCCL_SOCKET_IFNAME keeps whatever interface is already exported (e.g. eth1
#   from the "low" network profile) and only falls back to bond1 when nothing
#   is set; hard-coding bond1 here would silently override that selection.
# - All expansions are quoted so paths containing spaces or glob characters
#   reach main.py as single arguments.
NODE_RANK="${node_rank:?node_rank (arg 2) is required}" \
HF_HUB_OFFLINE=1 \
MASTER_PORT=12348 \
MASTER_ADDR="${master_ip:?master_ip (arg 3) is required}" \
NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-bond1}" \
NCCL_IB_GID_INDEX=3 \
NCCL_NVLS_ENABLE=0 \
python3 main.py \
  --num_nodes "${node_num:?node_num (arg 1) is required}" \
  --num_gpus 8 \
  --config "${config:?config (arg 4) is required}" \
  --output_dir "${output_dir:?output_dir (arg 5) is required}" \
  --deepspeed
Reference in New Issue
Block a user