From 8cd92830fbac44a39490ee0d98214aad07ae749f Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 15:51:55 +0800 Subject: [PATCH] Update train_deepspeed.sh auto detect --- hy3dshape/scripts/train_deepspeed.sh | 36 +++++++++++++++++----------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/hy3dshape/scripts/train_deepspeed.sh b/hy3dshape/scripts/train_deepspeed.sh index de8c61e..278cf73 100644 --- a/hy3dshape/scripts/train_deepspeed.sh +++ b/hy3dshape/scripts/train_deepspeed.sh @@ -1,11 +1,22 @@ # If: ImportError: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.20' not found # Do: ln /usr/local/gcc-8.3/lib64/libstdc++.so.6 -sf /usr/lib64/libstdc++.so.6 +# Try to auto-detect a suitable network interface if NCCL_SOCKET_IFNAME is not already set. +if [ -z "$NCCL_SOCKET_IFNAME" ]; then + # Find the first physical-like interface by excluding common virtual/loopback names. + DETECTED_IFACE=$(ls /sys/class/net | grep -vE '^(lo|docker|veth|cali|tunl|kube|ib|usb)' | head -n 1) + if [ -n "$DETECTED_IFACE" ]; then + echo "NCCL_SOCKET_IFNAME is not set. Auto-detected and exporting: $DETECTED_IFACE" + export NCCL_SOCKET_IFNAME=$DETECTED_IFACE + else + echo "Warning: Could not auto-detect a network interface. You may need to set NCCL_SOCKET_IFNAME manually if NCCL fails." + fi +fi + export NCCL_IB_TIMEOUT=24 export NCCL_NVLS_ENABLE=0 -NET_TYPE="high" +NET_TYPE="high" if [[ "${NET_TYPE}" = "low" ]]; then - export NCCL_SOCKET_IFNAME=eth1 export NCCL_IB_GID_INDEX=3 export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1 export NCCL_IB_SL=3 @@ -18,12 +29,10 @@ else export NCCL_IB_SL=3 export NCCL_CHECK_DISABLE=1 export NCCL_P2P_DISABLE=0 - export NCCL_IB_DISABLE=0 + export NCCL_IB_DISABLE=1 export NCCL_LL_THRESHOLD=16384 export NCCL_IB_CUDA_SUPPORT=1 - export NCCL_SOCKET_IFNAME=bond1 - export UCX_NET_DEVICES=bond1 - export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6 + # DELETED: The hardcoded export NCCL_SOCKET_IFNAME and UCX_NET_DEVICES lines export NCCL_COLLNET_ENABLE=0 export SHARP_COLL_ENABLE_SAT=0 export NCCL_NET_GDR_LEVEL=2 @@ -40,12 +49,13 @@ master_ip=$4 config=$5 output_dir=$6 - -echo node_num $node_num -echo node_rank $node_rank -echo master_ip $master_ip -echo config $config -echo output_dir $output_dir +echo "--- Script Arguments ---" +echo "node_num: $node_num" +echo "node_rank: $node_rank" +echo "master_ip: $master_ip" +echo "config: $config" +echo "output_dir: $output_dir" +echo "----------------------" if test -d "$output_dir"; then cp $config $output_dir @@ -58,7 +68,6 @@ NODE_RANK=$node_rank \ HF_HUB_OFFLINE=0 \ MASTER_PORT=12348 \ MASTER_ADDR=$master_ip \ -NCCL_SOCKET_IFNAME=bond1 \ NCCL_IB_GID_INDEX=3 \ NCCL_NVLS_ENABLE=0 \ python3 main.py \ @@ -67,4 +76,3 @@ python3 main.py \ --config $config \ --output_dir $output_dir \ --deepspeed -