From af935af6881edeb2461a0e98fc3abf549eb88cd6 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:36:46 +0800 Subject: [PATCH] Update train_deepspeed.sh --- hy3dshape/scripts/train_deepspeed.sh | 35 +++++++++++----------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/hy3dshape/scripts/train_deepspeed.sh b/hy3dshape/scripts/train_deepspeed.sh index 278cf73..ef9ff9d 100644 --- a/hy3dshape/scripts/train_deepspeed.sh +++ b/hy3dshape/scripts/train_deepspeed.sh @@ -1,22 +1,11 @@ # If: ImportError: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.20' not found # Do: ln /usr/local/gcc-8.3/lib64/libstdc++.so.6 -sf /usr/lib64/libstdc++.so.6 -# Try to auto-detect a suitable network interface if NCCL_SOCKET_IFNAME is not already set. -if [ -z "$NCCL_SOCKET_IFNAME" ]; then - # Find the first physical-like interface by excluding common virtual/loopback names. - DETECTED_IFACE=$(ls /sys/class/net | grep -vE '^(lo|docker|veth|cali|tunl|kube|ib|usb)' | head -n 1) - if [ -n "$DETECTED_IFACE" ]; then - echo "NCCL_SOCKET_IFNAME is not set. Auto-detected and exporting: $DETECTED_IFACE" - export NCCL_SOCKET_IFNAME=$DETECTED_IFACE - else - echo "Warning: Could not auto-detect a network interface. You may need to set NCCL_SOCKET_IFNAME manually if NCCL fails." - fi -fi - export NCCL_IB_TIMEOUT=24 export NCCL_NVLS_ENABLE=0 -NET_TYPE="high" +NET_TYPE="high" if [[ "${NET_TYPE}" = "low" ]]; then + export NCCL_SOCKET_IFNAME=eth1 export NCCL_IB_GID_INDEX=3 export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1 export NCCL_IB_SL=3 @@ -29,10 +18,12 @@ else export NCCL_IB_SL=3 export NCCL_CHECK_DISABLE=1 export NCCL_P2P_DISABLE=0 - export NCCL_IB_DISABLE=1 + export NCCL_IB_DISABLE=0 export NCCL_LL_THRESHOLD=16384 export NCCL_IB_CUDA_SUPPORT=1 - # DELETED: The hardcoded export NCCL_SOCKET_IFNAME and UCX_NET_DEVICES lines + export NCCL_SOCKET_IFNAME=bond1 + export UCX_NET_DEVICES=bond1 + export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6 export NCCL_COLLNET_ENABLE=0 export SHARP_COLL_ENABLE_SAT=0 export NCCL_NET_GDR_LEVEL=2 @@ -49,13 +40,12 @@ master_ip=$4 config=$5 output_dir=$6 -echo "--- Script Arguments ---" -echo "node_num: $node_num" -echo "node_rank: $node_rank" -echo "master_ip: $master_ip" -echo "config: $config" -echo "output_dir: $output_dir" -echo "----------------------" + +echo node_num $node_num +echo node_rank $node_rank +echo master_ip $master_ip +echo config $config +echo output_dir $output_dir if test -d "$output_dir"; then cp $config $output_dir @@ -68,6 +58,7 @@ NODE_RANK=$node_rank \ HF_HUB_OFFLINE=0 \ MASTER_PORT=12348 \ MASTER_ADDR=$master_ip \ +NCCL_SOCKET_IFNAME=bond1 \ NCCL_IB_GID_INDEX=3 \ NCCL_NVLS_ENABLE=0 \ python3 main.py \