From b06e6ddf37a859b44bf9380dc9b6543393d5b8b3 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 02:29:25 +0800 Subject: [PATCH 01/16] Update pipelines.py --- hy3dshape/hy3dshape/pipelines.py | 108 +++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/hy3dshape/hy3dshape/pipelines.py b/hy3dshape/hy3dshape/pipelines.py index 71de472..0bb7c8f 100644 --- a/hy3dshape/hy3dshape/pipelines.py +++ b/hy3dshape/hy3dshape/pipelines.py @@ -781,3 +781,111 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline): box_v, mc_level, num_chunks, octree_resolution, mc_algo, enable_pbar=enable_pbar, ) + + @classmethod + @synchronize_timer('Hunyuan3DDiTFlowMatchingPipeline from Lightning Checkpoint') + def from_lightning_checkpoint( + cls, + ckpt_path: str, + config_path: str, + device: str = 'cuda', + dtype: torch.dtype = torch.float16, + **kwargs, + ): + """ + Loads a model from a checkpoint created by the project's PyTorch Lightning training script. + + This method correctly handles the nested configuration structure and state_dict prefixes + produced during training, and can intelligently load sharded checkpoints saved by Deepspeed. + + Args: + ckpt_path (str): Path to the .ckpt checkpoint file or directory. + config_path (str): Path to the .yaml configuration file used for training. + device (str, optional): The device to load the model on. Defaults to 'cuda'. + dtype (torch.dtype, optional): The data type for the model. Defaults to torch.float16. + + Returns: + Hunyuan3DDiTFlowMatchingPipeline: An instantiated pipeline ready for inference. 
+ """ + from omegaconf import OmegaConf + from hy3dshape.utils.misc import instantiate_from_config + from hy3dshape.schedulers import FlowMatchEulerDiscreteScheduler + + logger.info(f"Loading model from Lightning checkpoint: {ckpt_path}") + logger.info(f"Using training config: {config_path}") + + config = OmegaConf.load(config_path) + + if os.path.isdir(ckpt_path): + # Assumes a Deepspeed-saved checkpoint directory + model_state_file = os.path.join(ckpt_path, 'checkpoint', 'mp_rank_00_model_states.pt') + if not os.path.exists(model_state_file): + raise FileNotFoundError( + f"Could not find model weights file 'mp_rank_00_model_states.pt' in Deepspeed checkpoint directory: {os.path.join(ckpt_path, 'checkpoint')}" + ) + + logger.info(f"Detected Deepspeed checkpoint directory, loading weights from: '{model_state_file}'") + ckpt = torch.load(model_state_file, map_location='cpu', weights_only=False) + # Deepspeed weights are often nested under the 'module' key + state_dict = ckpt.get('module', ckpt) + else: + # Standard .ckpt file + logger.info("Detected standard .ckpt file.") + ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False) + state_dict = ckpt.get('state_dict', ckpt) + + # 1. Instantiate components that were frozen during training. + # They will load their own pretrained weights upon instantiation. + logger.info("Instantiating VAE, Conditioner, and ImageProcessor...") + vae = instantiate_from_config(config.model.params.first_stage_config) + conditioner = instantiate_from_config(config.model.params.cond_stage_config) + image_processor = instantiate_from_config(config.model.params.image_processor_cfg) + + # 2. Instantiate the component that was trained (the Denoiser). + logger.info("Instantiating Denoiser...") + denoiser = instantiate_from_config(config.model.params.denoiser_cfg) + + # 3. Load weights only for the Denoiser from our training checkpoint. 
+ possible_prefixes = ["model.model.", "_forward_module.model.", "model."] + denoiser_dict = {} + matched_prefix = None + for prefix in possible_prefixes: + sub_dict = {k.replace(prefix, ''): v for k, v in state_dict.items() if k.startswith(prefix)} + if sub_dict: + denoiser_dict = sub_dict + matched_prefix = prefix + break + + if denoiser_dict: + logger.info(f"Successfully matched Denoiser weight prefix: '{matched_prefix}'") + missing_keys, unexpected_keys = denoiser.load_state_dict(denoiser_dict, strict=False) + logger.info(" Successfully loaded weights for 'denoiser'.") + if missing_keys: + logger.warning(f" - Missing keys: {missing_keys}") + if unexpected_keys: + logger.warning(f" - Unexpected keys: {unexpected_keys}") + else: + logger.warning("Could not find weights for 'denoiser' in checkpoint. It will be randomly initialized.") + + # 4. Instantiate a new, inference-compatible scheduler. + logger.info("Creating a new scheduler for inference...") + scheduler = FlowMatchEulerDiscreteScheduler() + + # 5. Assemble the final, healthy pipeline. + pipeline = cls( + model=denoiser, + vae=vae, + scheduler=scheduler, + conditioner=conditioner, + image_processor=image_processor, + **kwargs, + ) + + # 6. Move all model components to the correct device and set to evaluation mode. 
+ pipeline.to(torch.device(device), dtype=dtype) + pipeline.model.eval() + pipeline.vae.eval() + pipeline.conditioner.eval() + + logger.info("\n Pipeline successfully assembled from Lightning checkpoint!") + return pipeline From e34a3ba752a3a2c2f77c5d8270aa4677392839aa Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 02:33:30 +0800 Subject: [PATCH 02/16] Create run_inference_with_fix.py --- run_inference_with_fix.py | 56 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 run_inference_with_fix.py diff --git a/run_inference_with_fix.py b/run_inference_with_fix.py new file mode 100644 index 0000000..a69f9a2 --- /dev/null +++ b/run_inference_with_fix.py @@ -0,0 +1,56 @@ +import os +import sys +import torch +from PIL import Image + +# Add the project's sub-directory to the path to allow direct imports +sys.path.insert(0, os.path.join(os.path.abspath('.'), 'hy3dshape')) +from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline + +# --- 1. Set up the paths for your trained model --- + +# The training configuration file you used +CONFIG_PATH = "./hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml" + +# !!! IMPORTANT: Change this to the actual checkpoint directory you generated !!! +# This should be the path to the directory, e.g., 'ckpt-step=00002000.ckpt' +CKPT_PATH = "./hy3dshape/output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00002000.ckpt" + +# The input image for inference +IMAGE_PATH = "./assets/demo.png" + +# The path where the final 3D model will be saved +OUTPUT_PATH = "./my_model_output.glb" + + +if __name__ == '__main__': + # Setup device and data type + if not torch.cuda.is_available(): + print("Warning: CUDA not available, running on CPU. 
This will be very slow.") + device = torch.device('cpu') + # Use float32 on CPU as it does not support bfloat16 + dtype = torch.float32 + else: + device = torch.device('cuda') + # Use the same precision as in training for best results + dtype = torch.bfloat16 + + print("\n--- Attempting to load the model using the new from_lightning_checkpoint method ---") + + # Load the pipeline using the new, elegant class method + pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_lightning_checkpoint( + ckpt_path=CKPT_PATH, + config_path=CONFIG_PATH, + device=str(device), + dtype=dtype, + ) + + print("\n Model loaded successfully! Starting inference...") + input_image = Image.open(IMAGE_PATH) + + # Run inference + mesh_output = pipeline(image=input_image)[0] + + # Save the result + mesh_output.export(OUTPUT_PATH) + print(f"\n Inference complete! The 3D model has been saved to: {OUTPUT_PATH}") From 8cd92830fbac44a39490ee0d98214aad07ae749f Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 15:51:55 +0800 Subject: [PATCH 03/16] Update train_deepspeed.sh auto detect --- hy3dshape/scripts/train_deepspeed.sh | 36 +++++++++++++++++----------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/hy3dshape/scripts/train_deepspeed.sh b/hy3dshape/scripts/train_deepspeed.sh index de8c61e..278cf73 100644 --- a/hy3dshape/scripts/train_deepspeed.sh +++ b/hy3dshape/scripts/train_deepspeed.sh @@ -1,11 +1,22 @@ # If: ImportError: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.20' not found # Do: ln /usr/local/gcc-8.3/lib64/libstdc++.so.6 -sf /usr/lib64/libstdc++.so.6 +# Try to auto-detect a suitable network interface if NCCL_SOCKET_IFNAME is not already set. +if [ -z "$NCCL_SOCKET_IFNAME" ]; then + # Find the first physical-like interface by excluding common virtual/loopback names. 
+ DETECTED_IFACE=$(ls /sys/class/net | grep -vE '^(lo|docker|veth|cali|tunl|kube|ib|usb)' | head -n 1) + if [ -n "$DETECTED_IFACE" ]; then + echo "NCCL_SOCKET_IFNAME is not set. Auto-detected and exporting: $DETECTED_IFACE" + export NCCL_SOCKET_IFNAME=$DETECTED_IFACE + else + echo "Warning: Could not auto-detect a network interface. You may need to set NCCL_SOCKET_IFNAME manually if NCCL fails." + fi +fi + export NCCL_IB_TIMEOUT=24 export NCCL_NVLS_ENABLE=0 -NET_TYPE="high" +NET_TYPE="high" if [[ "${NET_TYPE}" = "low" ]]; then - export NCCL_SOCKET_IFNAME=eth1 export NCCL_IB_GID_INDEX=3 export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1 export NCCL_IB_SL=3 @@ -18,12 +29,10 @@ else export NCCL_IB_SL=3 export NCCL_CHECK_DISABLE=1 export NCCL_P2P_DISABLE=0 - export NCCL_IB_DISABLE=0 + export NCCL_IB_DISABLE=1 export NCCL_LL_THRESHOLD=16384 export NCCL_IB_CUDA_SUPPORT=1 - export NCCL_SOCKET_IFNAME=bond1 - export UCX_NET_DEVICES=bond1 - export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6 + # DELETED: The hardcoded export NCCL_SOCKET_IFNAME and UCX_NET_DEVICES lines export NCCL_COLLNET_ENABLE=0 export SHARP_COLL_ENABLE_SAT=0 export NCCL_NET_GDR_LEVEL=2 @@ -40,12 +49,13 @@ master_ip=$4 config=$5 output_dir=$6 - -echo node_num $node_num -echo node_rank $node_rank -echo master_ip $master_ip -echo config $config -echo output_dir $output_dir +echo "--- Script Arguments ---" +echo "node_num: $node_num" +echo "node_rank: $node_rank" +echo "master_ip: $master_ip" +echo "config: $config" +echo "output_dir: $output_dir" +echo "----------------------" if test -d "$output_dir"; then cp $config $output_dir @@ -58,7 +68,6 @@ NODE_RANK=$node_rank \ HF_HUB_OFFLINE=0 \ MASTER_PORT=12348 \ MASTER_ADDR=$master_ip \ -NCCL_SOCKET_IFNAME=bond1 \ NCCL_IB_GID_INDEX=3 \ NCCL_NVLS_ENABLE=0 \ python3 main.py \ @@ -67,4 +76,3 @@ python3 main.py \ --config $config \ --output_dir $output_dir \ --deepspeed - From f2f19d74a8ca9e76d9dc15b567d4bd5b5ab3e17b 
Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 15:53:01 +0800 Subject: [PATCH 04/16] Update hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml add explain --- ...ng-flowmatching-dinol518-bf16-lr1e4-4096.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml index bc3ec2a..193c5ae 100644 --- a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml +++ b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml @@ -33,6 +33,22 @@ dataset: std: &std [0.5, 0.5, 0.5] #! Point cloud sampling + # Important: The total number of points (pc_size + pc_sharpedge_size) is coupled with the + # architecture of the pretrained ShapeVAE in `first_stage_config`. You must ensure that + # the total number of points provided by the dataset matches the model's expectation. + # + # For the default pretrained ShapeVAE (`tencent/Hunyuan3D-2.1` VAE), the model + # implicitly expects a total of 81920 points. + # + # Correct configuration (for the default model): + # pc_size: 81920 + # pc_sharpedge_size: 0 + # + # Incorrect configuration that will cause a `split_with_sizes` RuntimeError: + # pc_size: 10240 + # pc_sharpedge_size: 10240 + # + # If you need to use a different number of points, you must retrain the ShapeVAE model first. 
pc_size: &pc_size 81920 pc_sharpedge_size: &pc_sharpedge_size 0 sharpedge_label: &sharpedge_label true From af935af6881edeb2461a0e98fc3abf549eb88cd6 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:36:46 +0800 Subject: [PATCH 05/16] Update train_deepspeed.sh --- hy3dshape/scripts/train_deepspeed.sh | 35 +++++++++++----------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/hy3dshape/scripts/train_deepspeed.sh b/hy3dshape/scripts/train_deepspeed.sh index 278cf73..ef9ff9d 100644 --- a/hy3dshape/scripts/train_deepspeed.sh +++ b/hy3dshape/scripts/train_deepspeed.sh @@ -1,22 +1,11 @@ # If: ImportError: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.20' not found # Do: ln /usr/local/gcc-8.3/lib64/libstdc++.so.6 -sf /usr/lib64/libstdc++.so.6 -# Try to auto-detect a suitable network interface if NCCL_SOCKET_IFNAME is not already set. -if [ -z "$NCCL_SOCKET_IFNAME" ]; then - # Find the first physical-like interface by excluding common virtual/loopback names. - DETECTED_IFACE=$(ls /sys/class/net | grep -vE '^(lo|docker|veth|cali|tunl|kube|ib|usb)' | head -n 1) - if [ -n "$DETECTED_IFACE" ]; then - echo "NCCL_SOCKET_IFNAME is not set. Auto-detected and exporting: $DETECTED_IFACE" - export NCCL_SOCKET_IFNAME=$DETECTED_IFACE - else - echo "Warning: Could not auto-detect a network interface. You may need to set NCCL_SOCKET_IFNAME manually if NCCL fails." 
- fi -fi - export NCCL_IB_TIMEOUT=24 export NCCL_NVLS_ENABLE=0 -NET_TYPE="high" +NET_TYPE="high" if [[ "${NET_TYPE}" = "low" ]]; then + export NCCL_SOCKET_IFNAME=eth1 export NCCL_IB_GID_INDEX=3 export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1 export NCCL_IB_SL=3 @@ -29,10 +18,12 @@ else export NCCL_IB_SL=3 export NCCL_CHECK_DISABLE=1 export NCCL_P2P_DISABLE=0 - export NCCL_IB_DISABLE=1 + export NCCL_IB_DISABLE=0 export NCCL_LL_THRESHOLD=16384 export NCCL_IB_CUDA_SUPPORT=1 - # DELETED: The hardcoded export NCCL_SOCKET_IFNAME and UCX_NET_DEVICES lines + export NCCL_SOCKET_IFNAME=bond1 + export UCX_NET_DEVICES=bond1 + export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6 export NCCL_COLLNET_ENABLE=0 export SHARP_COLL_ENABLE_SAT=0 export NCCL_NET_GDR_LEVEL=2 @@ -49,13 +40,12 @@ master_ip=$4 config=$5 output_dir=$6 -echo "--- Script Arguments ---" -echo "node_num: $node_num" -echo "node_rank: $node_rank" -echo "master_ip: $master_ip" -echo "config: $config" -echo "output_dir: $output_dir" -echo "----------------------" + +echo node_num $node_num +echo node_rank $node_rank +echo master_ip $master_ip +echo config $config +echo output_dir $output_dir if test -d "$output_dir"; then cp $config $output_dir @@ -68,6 +58,7 @@ NODE_RANK=$node_rank \ HF_HUB_OFFLINE=0 \ MASTER_PORT=12348 \ MASTER_ADDR=$master_ip \ +NCCL_SOCKET_IFNAME=bond1 \ NCCL_IB_GID_INDEX=3 \ NCCL_NVLS_ENABLE=0 \ python3 main.py \ From de7996251db38eded27eb999ebec5cf377a53138 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:37:32 +0800 Subject: [PATCH 06/16] Update hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml --- ...ng-flowmatching-dinol518-bf16-lr1e4-4096.yaml | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml 
b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml index 193c5ae..bc3ec2a 100644 --- a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml +++ b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml @@ -33,22 +33,6 @@ dataset: std: &std [0.5, 0.5, 0.5] #! Point cloud sampling - # Important: The total number of points (pc_size + pc_sharpedge_size) is coupled with the - # architecture of the pretrained ShapeVAE in `first_stage_config`. You must ensure that - # the total number of points provided by the dataset matches the model's expectation. - # - # For the default pretrained ShapeVAE (`tencent/Hunyuan3D-2.1` VAE), the model - # implicitly expects a total of 81920 points. - # - # Correct configuration (for the default model): - # pc_size: 81920 - # pc_sharpedge_size: 0 - # - # Incorrect configuration that will cause a `split_with_sizes` RuntimeError: - # pc_size: 10240 - # pc_sharpedge_size: 10240 - # - # If you need to use a different number of points, you must retrain the ShapeVAE model first. 
pc_size: &pc_size 81920 pc_sharpedge_size: &pc_sharpedge_size 0 sharpedge_label: &sharpedge_label true From c6d4cb89e21ddf47fc7c125bef6e32e4b50d6180 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:39:10 +0800 Subject: [PATCH 07/16] Update train_deepspeed.sh From 6726877bbb6790e7936ab840c94de72e7f3a0b08 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:40:01 +0800 Subject: [PATCH 08/16] Update train_deepspeed.sh From 96349ad5d0e3a6db3c4a37bbe26239f3136a0614 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:43:40 +0800 Subject: [PATCH 09/16] Update train_deepspeed.sh --- hy3dshape/scripts/train_deepspeed.sh | 35 +++++++++++++++++----------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/hy3dshape/scripts/train_deepspeed.sh b/hy3dshape/scripts/train_deepspeed.sh index ef9ff9d..278cf73 100644 --- a/hy3dshape/scripts/train_deepspeed.sh +++ b/hy3dshape/scripts/train_deepspeed.sh @@ -1,11 +1,22 @@ # If: ImportError: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.20' not found # Do: ln /usr/local/gcc-8.3/lib64/libstdc++.so.6 -sf /usr/lib64/libstdc++.so.6 +# Try to auto-detect a suitable network interface if NCCL_SOCKET_IFNAME is not already set. +if [ -z "$NCCL_SOCKET_IFNAME" ]; then + # Find the first physical-like interface by excluding common virtual/loopback names. + DETECTED_IFACE=$(ls /sys/class/net | grep -vE '^(lo|docker|veth|cali|tunl|kube|ib|usb)' | head -n 1) + if [ -n "$DETECTED_IFACE" ]; then + echo "NCCL_SOCKET_IFNAME is not set. Auto-detected and exporting: $DETECTED_IFACE" + export NCCL_SOCKET_IFNAME=$DETECTED_IFACE + else + echo "Warning: Could not auto-detect a network interface. You may need to set NCCL_SOCKET_IFNAME manually if NCCL fails." 
+ fi +fi + export NCCL_IB_TIMEOUT=24 export NCCL_NVLS_ENABLE=0 -NET_TYPE="high" +NET_TYPE="high" if [[ "${NET_TYPE}" = "low" ]]; then - export NCCL_SOCKET_IFNAME=eth1 export NCCL_IB_GID_INDEX=3 export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1 export NCCL_IB_SL=3 @@ -18,12 +29,10 @@ else export NCCL_IB_SL=3 export NCCL_CHECK_DISABLE=1 export NCCL_P2P_DISABLE=0 - export NCCL_IB_DISABLE=0 + export NCCL_IB_DISABLE=1 export NCCL_LL_THRESHOLD=16384 export NCCL_IB_CUDA_SUPPORT=1 - export NCCL_SOCKET_IFNAME=bond1 - export UCX_NET_DEVICES=bond1 - export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6 + # DELETED: The hardcoded export NCCL_SOCKET_IFNAME and UCX_NET_DEVICES lines export NCCL_COLLNET_ENABLE=0 export SHARP_COLL_ENABLE_SAT=0 export NCCL_NET_GDR_LEVEL=2 @@ -40,12 +49,13 @@ master_ip=$4 config=$5 output_dir=$6 - -echo node_num $node_num -echo node_rank $node_rank -echo master_ip $master_ip -echo config $config -echo output_dir $output_dir +echo "--- Script Arguments ---" +echo "node_num: $node_num" +echo "node_rank: $node_rank" +echo "master_ip: $master_ip" +echo "config: $config" +echo "output_dir: $output_dir" +echo "----------------------" if test -d "$output_dir"; then cp $config $output_dir @@ -58,7 +68,6 @@ NODE_RANK=$node_rank \ HF_HUB_OFFLINE=0 \ MASTER_PORT=12348 \ MASTER_ADDR=$master_ip \ -NCCL_SOCKET_IFNAME=bond1 \ NCCL_IB_GID_INDEX=3 \ NCCL_NVLS_ENABLE=0 \ python3 main.py \ From dc2ea32d76c8645e0d4453a83ff402511f8fe353 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:47:40 +0800 Subject: [PATCH 10/16] Update hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml --- ...ng-flowmatching-dinol518-bf16-lr1e4-4096.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml 
b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml index bc3ec2a..193c5ae 100644 --- a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml +++ b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml @@ -33,6 +33,22 @@ dataset: std: &std [0.5, 0.5, 0.5] #! Point cloud sampling + # Important: The total number of points (pc_size + pc_sharpedge_size) is coupled with the + # architecture of the pretrained ShapeVAE in `first_stage_config`. You must ensure that + # the total number of points provided by the dataset matches the model's expectation. + # + # For the default pretrained ShapeVAE (`tencent/Hunyuan3D-2.1` VAE), the model + # implicitly expects a total of 81920 points. + # + # Correct configuration (for the default model): + # pc_size: 81920 + # pc_sharpedge_size: 0 + # + # Incorrect configuration that will cause a `split_with_sizes` RuntimeError: + # pc_size: 10240 + # pc_sharpedge_size: 10240 + # + # If you need to use a different number of points, you must retrain the ShapeVAE model first. 
pc_size: &pc_size 81920 pc_sharpedge_size: &pc_sharpedge_size 0 sharpedge_label: &sharpedge_label true From f0a008279e856e36cd79c1c019248f52528144b6 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:51:33 +0800 Subject: [PATCH 11/16] Update pipelines.py --- hy3dshape/hy3dshape/pipelines.py | 108 ------------------------------- 1 file changed, 108 deletions(-) diff --git a/hy3dshape/hy3dshape/pipelines.py b/hy3dshape/hy3dshape/pipelines.py index 0bb7c8f..71de472 100644 --- a/hy3dshape/hy3dshape/pipelines.py +++ b/hy3dshape/hy3dshape/pipelines.py @@ -781,111 +781,3 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline): box_v, mc_level, num_chunks, octree_resolution, mc_algo, enable_pbar=enable_pbar, ) - - @classmethod - @synchronize_timer('Hunyuan3DDiTFlowMatchingPipeline from Lightning Checkpoint') - def from_lightning_checkpoint( - cls, - ckpt_path: str, - config_path: str, - device: str = 'cuda', - dtype: torch.dtype = torch.float16, - **kwargs, - ): - """ - Loads a model from a checkpoint created by the project's PyTorch Lightning training script. - - This method correctly handles the nested configuration structure and state_dict prefixes - produced during training, and can intelligently load sharded checkpoints saved by Deepspeed. - - Args: - ckpt_path (str): Path to the .ckpt checkpoint file or directory. - config_path (str): Path to the .yaml configuration file used for training. - device (str, optional): The device to load the model on. Defaults to 'cuda'. - dtype (torch.dtype, optional): The data type for the model. Defaults to torch.float16. - - Returns: - Hunyuan3DDiTFlowMatchingPipeline: An instantiated pipeline ready for inference. 
- """ - from omegaconf import OmegaConf - from hy3dshape.utils.misc import instantiate_from_config - from hy3dshape.schedulers import FlowMatchEulerDiscreteScheduler - - logger.info(f"Loading model from Lightning checkpoint: {ckpt_path}") - logger.info(f"Using training config: {config_path}") - - config = OmegaConf.load(config_path) - - if os.path.isdir(ckpt_path): - # Assumes a Deepspeed-saved checkpoint directory - model_state_file = os.path.join(ckpt_path, 'checkpoint', 'mp_rank_00_model_states.pt') - if not os.path.exists(model_state_file): - raise FileNotFoundError( - f"Could not find model weights file 'mp_rank_00_model_states.pt' in Deepspeed checkpoint directory: {os.path.join(ckpt_path, 'checkpoint')}" - ) - - logger.info(f"Detected Deepspeed checkpoint directory, loading weights from: '{model_state_file}'") - ckpt = torch.load(model_state_file, map_location='cpu', weights_only=False) - # Deepspeed weights are often nested under the 'module' key - state_dict = ckpt.get('module', ckpt) - else: - # Standard .ckpt file - logger.info("Detected standard .ckpt file.") - ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False) - state_dict = ckpt.get('state_dict', ckpt) - - # 1. Instantiate components that were frozen during training. - # They will load their own pretrained weights upon instantiation. - logger.info("Instantiating VAE, Conditioner, and ImageProcessor...") - vae = instantiate_from_config(config.model.params.first_stage_config) - conditioner = instantiate_from_config(config.model.params.cond_stage_config) - image_processor = instantiate_from_config(config.model.params.image_processor_cfg) - - # 2. Instantiate the component that was trained (the Denoiser). - logger.info("Instantiating Denoiser...") - denoiser = instantiate_from_config(config.model.params.denoiser_cfg) - - # 3. Load weights only for the Denoiser from our training checkpoint. 
- possible_prefixes = ["model.model.", "_forward_module.model.", "model."] - denoiser_dict = {} - matched_prefix = None - for prefix in possible_prefixes: - sub_dict = {k.replace(prefix, ''): v for k, v in state_dict.items() if k.startswith(prefix)} - if sub_dict: - denoiser_dict = sub_dict - matched_prefix = prefix - break - - if denoiser_dict: - logger.info(f"Successfully matched Denoiser weight prefix: '{matched_prefix}'") - missing_keys, unexpected_keys = denoiser.load_state_dict(denoiser_dict, strict=False) - logger.info(" Successfully loaded weights for 'denoiser'.") - if missing_keys: - logger.warning(f" - Missing keys: {missing_keys}") - if unexpected_keys: - logger.warning(f" - Unexpected keys: {unexpected_keys}") - else: - logger.warning("Could not find weights for 'denoiser' in checkpoint. It will be randomly initialized.") - - # 4. Instantiate a new, inference-compatible scheduler. - logger.info("Creating a new scheduler for inference...") - scheduler = FlowMatchEulerDiscreteScheduler() - - # 5. Assemble the final, healthy pipeline. - pipeline = cls( - model=denoiser, - vae=vae, - scheduler=scheduler, - conditioner=conditioner, - image_processor=image_processor, - **kwargs, - ) - - # 6. Move all model components to the correct device and set to evaluation mode. 
- pipeline.to(torch.device(device), dtype=dtype) - pipeline.model.eval() - pipeline.vae.eval() - pipeline.conditioner.eval() - - logger.info("\n Pipeline successfully assembled from Lightning checkpoint!") - return pipeline From 7a9d765627b3f3131bfcb089ce54a6582256e33d Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:53:19 +0800 Subject: [PATCH 12/16] Update run_inference_with_fix.py --- run_inference_with_fix.py | 55 --------------------------------------- 1 file changed, 55 deletions(-) diff --git a/run_inference_with_fix.py b/run_inference_with_fix.py index a69f9a2..8b13789 100644 --- a/run_inference_with_fix.py +++ b/run_inference_with_fix.py @@ -1,56 +1 @@ -import os -import sys -import torch -from PIL import Image -# Add the project's sub-directory to the path to allow direct imports -sys.path.insert(0, os.path.join(os.path.abspath('.'), 'hy3dshape')) -from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline - -# --- 1. Set up the paths for your trained model --- - -# The training configuration file you used -CONFIG_PATH = "./hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml" - -# !!! IMPORTANT: Change this to the actual checkpoint directory you generated !!! -# This should be the path to the directory, e.g., 'ckpt-step=00002000.ckpt' -CKPT_PATH = "./hy3dshape/output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00002000.ckpt" - -# The input image for inference -IMAGE_PATH = "./assets/demo.png" - -# The path where the final 3D model will be saved -OUTPUT_PATH = "./my_model_output.glb" - - -if __name__ == '__main__': - # Setup device and data type - if not torch.cuda.is_available(): - print("Warning: CUDA not available, running on CPU. 
This will be very slow.") - device = torch.device('cpu') - # Use float32 on CPU as it does not support bfloat16 - dtype = torch.float32 - else: - device = torch.device('cuda') - # Use the same precision as in training for best results - dtype = torch.bfloat16 - - print("\n--- Attempting to load the model using the new from_lightning_checkpoint method ---") - - # Load the pipeline using the new, elegant class method - pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_lightning_checkpoint( - ckpt_path=CKPT_PATH, - config_path=CONFIG_PATH, - device=str(device), - dtype=dtype, - ) - - print("\n Model loaded successfully! Starting inference...") - input_image = Image.open(IMAGE_PATH) - - # Run inference - mesh_output = pipeline(image=input_image)[0] - - # Save the result - mesh_output.export(OUTPUT_PATH) - print(f"\n Inference complete! The 3D model has been saved to: {OUTPUT_PATH}") From 8eff6d82337b4d5f8a90673682f28c450d30fc25 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 16:54:43 +0800 Subject: [PATCH 13/16] Delete run_inference_with_fix.py --- run_inference_with_fix.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 run_inference_with_fix.py diff --git a/run_inference_with_fix.py b/run_inference_with_fix.py deleted file mode 100644 index 8b13789..0000000 --- a/run_inference_with_fix.py +++ /dev/null @@ -1 +0,0 @@ - From f4e03076654ea0264755dbc0bd8d727b2d90efc4 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Fri, 11 Jul 2025 18:32:16 +0800 Subject: [PATCH 14/16] Update train_deepspeed.sh --- hy3dshape/scripts/train_deepspeed.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hy3dshape/scripts/train_deepspeed.sh b/hy3dshape/scripts/train_deepspeed.sh index 278cf73..444e5cc 100644 --- a/hy3dshape/scripts/train_deepspeed.sh +++ b/hy3dshape/scripts/train_deepspeed.sh @@ -29,10 +29,9 @@ else export NCCL_IB_SL=3 export 
NCCL_CHECK_DISABLE=1 export NCCL_P2P_DISABLE=0 - export NCCL_IB_DISABLE=1 + export NCCL_IB_DISABLE=0 export NCCL_LL_THRESHOLD=16384 export NCCL_IB_CUDA_SUPPORT=1 - # DELETED: The hardcoded export NCCL_SOCKET_IFNAME and UCX_NET_DEVICES lines export NCCL_COLLNET_ENABLE=0 export SHARP_COLL_ENABLE_SAT=0 export NCCL_NET_GDR_LEVEL=2 From d9fc4d31bfcf9fe89d8dc4ab41d22231efd356e1 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Wed, 6 Aug 2025 01:12:13 +0800 Subject: [PATCH 15/16] Update hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml repair --- ...flowmatching-dinol518-bf16-lr1e4-4096.yaml | 21 +++++-------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml index 193c5ae..b32a87e 100644 --- a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml +++ b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml @@ -33,22 +33,6 @@ dataset: std: &std [0.5, 0.5, 0.5] #! Point cloud sampling - # Important: The total number of points (pc_size + pc_sharpedge_size) is coupled with the - # architecture of the pretrained ShapeVAE in `first_stage_config`. You must ensure that - # the total number of points provided by the dataset matches the model's expectation. - # - # For the default pretrained ShapeVAE (`tencent/Hunyuan3D-2.1` VAE), the model - # implicitly expects a total of 81920 points. - # - # Correct configuration (for the default model): - # pc_size: 81920 - # pc_sharpedge_size: 0 - # - # Incorrect configuration that will cause a `split_with_sizes` RuntimeError: - # pc_size: 10240 - # pc_sharpedge_size: 10240 - # - # If you need to use a different number of points, you must retrain the ShapeVAE model first. 
pc_size: &pc_size 81920 pc_sharpedge_size: &pc_sharpedge_size 0 sharpedge_label: &sharpedge_label true @@ -74,6 +58,11 @@ model: first_stage_config: target: hy3dshape.models.autoencoders.ShapeVAE from_pretrained: tencent/Hunyuan3D-2.1 + # Keep these in sync with the dataset's pc_size / pc_sharpedge_size settings. + params: + pc_size: 81920 + pc_sharpedge_size: 0 + cond_stage_config: target: hy3dshape.models.conditioner.SingleImageEncoder From b3dd50ba3736ced7e733aa9b34054a4bf0b42829 Mon Sep 17 00:00:00 2001 From: s572915912 <54531516+s572915912@users.noreply.github.com> Date: Wed, 6 Aug 2025 01:14:49 +0800 Subject: [PATCH 16/16] Update misc.py repair --- hy3dshape/hy3dshape/utils/misc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hy3dshape/hy3dshape/utils/misc.py b/hy3dshape/hy3dshape/utils/misc.py index 55e1136..f875e6b 100644 --- a/hy3dshape/hy3dshape/utils/misc.py +++ b/hy3dshape/hy3dshape/utils/misc.py @@ -49,10 +49,12 @@ def instantiate_from_config(config, **kwargs): cls = get_obj_from_str(config["target"]) if config.get("from_pretrained", None): + params_kwargs = config.get("params", {}) return cls.from_pretrained( config["from_pretrained"], use_safetensors=config.get('use_safetensors', False), - variant=config.get('variant', 'fp16')) + variant=config.get('variant', 'fp16'), + **params_kwargs) params = config.get("params", dict()) # params.update(kwargs)