fix shape training

This commit is contained in:
Huiwenshi
2025-06-26 16:03:44 +08:00
parent d48c432b58
commit 7c92655a0d
15 changed files with 199 additions and 657 deletions

View File

@@ -1,174 +0,0 @@
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
training:
steps: 10_0000_0000
use_amp: true
amp_type: "bf16"
base_lr: 1.e-5
gradient_clip_val: 1.0
gradient_clip_algorithm: "norm"
every_n_train_steps: 2000 # 5000
val_check_interval: 50 # 4096
limit_val_batches: 16
dataset:
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
params:
#! Base setting
batch_size: 4
num_workers: 8
val_num_workers: 4
# Data
train_data_list: tools/mini_trainset/preprocessed
val_data_list: tools/mini_trainset/preprocessed
#! Image loading
cond_stage_key: "image" # image / text / image_text
image_size: 518
mean: &mean [0.5, 0.5, 0.5]
std: &std [0.5, 0.5, 0.5]
#! Point cloud sampling
pc_size: &pc_size 30720
pc_sharpedge_size: &pc_sharpedge_size 30720
sharpedge_label: &sharpedge_label true
return_normal: true
#! Augmentation
padding: true
model:
target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
params:
first_stage_key: "surface"
cond_stage_key: "image"
scale_by_std: false
z_scale_factor: &z_scale_factor 0.9990943042622529 # 1 / 1.0009065167661184
torch_compile: false
# ema_config:
# ema_model: LitEma
# ema_decay: 0.999
# ema_inference: false
first_stage_config:
target: hy3dshape.models.autoencoders.ShapeVAE
from_pretrained: tencent/Hunyuan3D-2.1
params:
num_latents: &num_latents 512
embed_dim: 64
num_freqs: 8
include_pi: false
heads: 16
width: 1024
point_feats: 4
num_decoder_layers: 16
pc_size: *pc_size
pc_sharpedge_size: *pc_sharpedge_size
qkv_bias: false
qk_norm: true
scale_factor: *z_scale_factor
geo_decoder_mlp_expand_ratio: 4
geo_decoder_downsample_ratio: 1
geo_decoder_ln_post: true
cond_stage_config:
target: hy3dshape.models.conditioner.SingleImageEncoder
params:
main_image_encoder:
type: DinoImageEncoder # dino giant
kwargs:
config:
attention_probs_dropout_prob: 0.0
drop_path_rate: 0.0
hidden_act: gelu
hidden_dropout_prob: 0.0
hidden_size: 1536
image_size: 518
initializer_range: 0.02
layer_norm_eps: 1.e-6
layerscale_value: 1.0
mlp_ratio: 4
model_type: dinov2
num_attention_heads: 24
num_channels: 3
num_hidden_layers: 40
patch_size: 14
qkv_bias: true
torch_dtype: float32
use_swiglu_ffn: true
image_size: 518
denoiser_cfg:
target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT
params:
ckpt_path: ~/.cache/hy3dgen/tencent/Hunyuan3D-2-1-Shape/dit/model.fp16.ckpt
input_size: *num_latents
context_in_dim: 1536
hidden_size: 1024
mlp_ratio: 4.0
num_heads: 16
depth: 16
depth_single_blocks: 32
axes_dim: [64]
theta: 10000
qkv_bias: true
use_pe: false
force_norm_fp32: true
scheduler_cfg:
transport:
target: hy3dshape.models.diffusion.transport.create_transport
params:
path_type: Linear
prediction: velocity
sampler:
target: hy3dshape.models.diffusion.transport.Sampler
params: {}
ode_params:
sampling_method: euler # dopri5 ...
num_steps: &num_steps 50
optimizer_cfg:
optimizer:
target: torch.optim.AdamW
params:
betas: [0.9, 0.99]
eps: 1.e-6
weight_decay: 1.e-2
scheduler:
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
params:
warm_up_steps: 50 # 5000
f_start: 1.e-6
f_min: 1.e-3
f_max: 1.0
pipeline_cfg:
target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
image_processor_cfg:
target: hy3dshape.preprocessors.ImageProcessorV2
params: {}
callbacks:
logger:
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
params:
step_frequency: 100 # 10000
num_samples: 1
sample_times: 1
mean: *mean
std: *std
bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
octree_depth: 8
num_chunks: 50000
mc_level: 0.0
file_loggers:
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
params:
step_frequency: 50 # 5000
test_data_path: "tools/mini_testset/images.json"

View File

@@ -1,173 +0,0 @@
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
training:
steps: 10_0000_0000
use_amp: true
amp_type: "bf16"
base_lr: 1e-4
gradient_clip_val: 1.0
gradient_clip_algorithm: "norm"
every_n_train_steps: 2000 # 5000
val_check_interval: 50 # 4096
limit_val_batches: 16
dataset:
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
params:
#! Base setting
batch_size: 2
num_workers: 8
val_num_workers: 4
# Data
train_data_list: tools/mini_trainset/preprocessed
val_data_list: tools/mini_trainset/preprocessed
#! Image loading
cond_stage_key: "image" # image / text / image_text
image_size: 518
mean: &mean [0.5, 0.5, 0.5]
std: &std [0.5, 0.5, 0.5]
#! Point cloud sampling
pc_size: &pc_size 10240
pc_sharpedge_size: &pc_sharpedge_size 10240
sharpedge_label: &sharpedge_label true
return_normal: true
#! Augmentation
padding: true
model:
target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
params:
first_stage_key: "surface"
cond_stage_key: "image"
scale_by_std: false
z_scale_factor: &z_scale_factor 0.9990943042622529 # 1 / 1.0009065167661184
torch_compile: false
# ema_config:
# ema_model: LitEma
# ema_decay: 0.999
# ema_inference: false
first_stage_config:
target: hy3dshape.models.autoencoders.ShapeVAE
from_pretrained: tencent/Hunyuan3D-2.1
params:
num_latents: &num_latents 512
embed_dim: 64
num_freqs: 8
include_pi: false
heads: 16
width: 1024
point_feats: 4
num_decoder_layers: 16
pc_size: *pc_size
pc_sharpedge_size: *pc_sharpedge_size
qkv_bias: false
qk_norm: true
scale_factor: *z_scale_factor
geo_decoder_mlp_expand_ratio: 4
geo_decoder_downsample_ratio: 1
geo_decoder_ln_post: true
cond_stage_config:
target: hy3dshape.models.conditioner.SingleImageEncoder
params:
main_image_encoder:
type: DinoImageEncoder # dino giant
kwargs:
config:
attention_probs_dropout_prob: 0.0
drop_path_rate: 0.0
hidden_act: gelu
hidden_dropout_prob: 0.0
hidden_size: 1536
image_size: 518
initializer_range: 0.02
layer_norm_eps: 1.e-6
layerscale_value: 1.0
mlp_ratio: 4
model_type: dinov2
num_attention_heads: 24
num_channels: 3
num_hidden_layers: 40
patch_size: 14
qkv_bias: true
torch_dtype: float32
use_swiglu_ffn: true
image_size: 518
denoiser_cfg:
target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT
params:
input_size: *num_latents
context_in_dim: 1536
hidden_size: 1024
mlp_ratio: 4.0
num_heads: 16
depth: 8
depth_single_blocks: 16
axes_dim: [64]
theta: 10000
qkv_bias: true
use_pe: false
force_norm_fp32: true
scheduler_cfg:
transport:
target: hy3dshape.models.diffusion.transport.create_transport
params:
path_type: Linear
prediction: velocity
sampler:
target: hy3dshape.models.diffusion.transport.Sampler
params: {}
ode_params:
sampling_method: euler # dopri5 ...
num_steps: &num_steps 50
optimizer_cfg:
optimizer:
target: torch.optim.AdamW
params:
betas: [0.9, 0.99]
eps: 1.e-6
weight_decay: 1.e-2
scheduler:
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
params:
warm_up_steps: 50 # 5000
f_start: 1.e-6
f_min: 1.e-3
f_max: 1.0
pipeline_cfg:
target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
image_processor_cfg:
target: hy3dshape.preprocessors.ImageProcessorV2
params: {}
callbacks:
logger:
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
params:
step_frequency: 100 # 10000
num_samples: 1
sample_times: 1
mean: *mean
std: *std
bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
octree_depth: 8
num_chunks: 50000
mc_level: 0.0
file_loggers:
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
params:
step_frequency: 50 # 5000
test_data_path: "tools/mini_testset/images.json"

View File

@@ -1,4 +1,5 @@
-name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
+name: "HunyuanDiT flowmatching; VAE: 4096 token length; ImageEncoder: DINO-v2 Large; ImageSize: 518"
+# training successfully on 8 x H20 with 98G Memory
 
 training:
   steps: 10_0000_0000
@@ -8,7 +9,8 @@ training:
   gradient_clip_val: 1.0
   gradient_clip_algorithm: "norm"
   every_n_train_steps: 2000 # 5000
-  val_check_interval: 50 # 4096
+  val_check_interval: 200 # 4096
+  # val_check_interval must be smaller than every_n_train_steps!!!
   limit_val_batches: 16
 
 dataset:
@@ -24,7 +26,7 @@ dataset:
     val_data_list: tools/mini_trainset/preprocessed
     #! Image loading
-    cond_stage_key: "image" # image / text / image_text
+    cond_stage_key: "image"
     image_size: 518
     mean: &mean [0.5, 0.5, 0.5]
     std: &std [0.5, 0.5, 0.5]
@@ -55,73 +57,21 @@ model:
     first_stage_config:
       target: hy3dshape.models.autoencoders.ShapeVAE
       from_pretrained: tencent/Hunyuan3D-2.1
-      params:
-        num_latents: &num_latents 4096
-        embed_dim: 64
-        num_freqs: 8
-        include_pi: false
-        heads: 16
-        width: 1024
-        num_encoder_layers: 8
-        num_decoder_layers: 16
-        qkv_bias: false
-        qk_norm: true
-        scale_factor: *z_scale_factor
-        geo_decoder_mlp_expand_ratio: 4
-        geo_decoder_downsample_ratio: 1
-        geo_decoder_ln_post: true
-        point_feats: 4
-        pc_size: *pc_size
-        pc_sharpedge_size: *pc_sharpedge_size
 
     cond_stage_config:
       target: hy3dshape.models.conditioner.SingleImageEncoder
       params:
+        drop_ratio: 0.1
         main_image_encoder:
-          type: DinoImageEncoder # dino large
+          type: DinoImageEncoder
           kwargs:
-            config:
-              attention_probs_dropout_prob: 0.0
-              drop_path_rate: 0.0
-              hidden_act: gelu
-              hidden_dropout_prob: 0.0
-              hidden_size: 1024
-              image_size: 518
-              initializer_range: 0.02
-              layer_norm_eps: 1.e-6
-              layerscale_value: 1.0
-              mlp_ratio: 4
-              model_type: dinov2
-              num_attention_heads: 16
-              num_channels: 3
-              num_hidden_layers: 24
-              patch_size: 14
-              qkv_bias: true
-              torch_dtype: float32
-              use_swiglu_ffn: false
+            version: 'facebook/dinov2-large'
             image_size: 518
         use_cls_token: true
 
     denoiser_cfg:
       target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
-      params:
-        input_size: *num_latents
-        in_channels: 64
-        hidden_size: 2048
-        context_dim: 1024
-        depth: 21
-        num_heads: 16
-        qk_norm: true
-        text_len: 1370
-        with_decoupled_ca: false
-        use_attention_pooling: false
-        qk_norm_type: 'rms'
-        qkv_bias: false
-        use_pos_emb: false
-        num_moe_layers: 6
-        num_experts: 8
-        moe_top_k: 2
+      from_pretrained: tencent/Hunyuan3D-2.1
 
     scheduler_cfg:
       transport:
@@ -163,7 +113,7 @@ callbacks:
   logger:
     target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
     params:
-      step_frequency: 100 # 10000
+      step_frequency: 1000 # 10000
      num_samples: 1
       sample_times: 1
       mean: *mean
@@ -176,5 +126,5 @@ callbacks:
   file_loggers:
     target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
     params:
-      step_frequency: 50 # 5000
+      step_frequency: 500 # 5000
       test_data_path: "tools/mini_testset/images.json"

View File

@@ -1,180 +0,0 @@
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
training:
steps: 10_0000_0000
use_amp: true
amp_type: "bf16"
base_lr: 1e-4
gradient_clip_val: 1.0
gradient_clip_algorithm: "norm"
every_n_train_steps: 2000 # 5000
val_check_interval: 50 # 4096
limit_val_batches: 16
dataset:
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
params:
#! Base setting
batch_size: 2
num_workers: 8
val_num_workers: 4
# Data
train_data_list: tools/mini_trainset/preprocessed
val_data_list: tools/mini_trainset/preprocessed
#! Image loading
cond_stage_key: "image" # image / text / image_text
image_size: 518
mean: &mean [0.5, 0.5, 0.5]
std: &std [0.5, 0.5, 0.5]
#! Point cloud sampling
pc_size: &pc_size 81920
pc_sharpedge_size: &pc_sharpedge_size 0
sharpedge_label: &sharpedge_label true
return_normal: true
#! Augmentation
padding: true
model:
target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
params:
first_stage_key: "surface"
cond_stage_key: "image"
scale_by_std: false
z_scale_factor: &z_scale_factor 1.0039506158752403
torch_compile: false
# ema_config:
# ema_model: LitEma
# ema_decay: 0.999
# ema_inference: false
first_stage_config:
target: hy3dshape.models.autoencoders.ShapeVAE
from_pretrained: tencent/Hunyuan3D-2.1
params:
num_latents: &num_latents 4096
embed_dim: 64
num_freqs: 8
include_pi: false
heads: 16
width: 1024
num_encoder_layers: 8
num_decoder_layers: 16
qkv_bias: false
qk_norm: true
scale_factor: *z_scale_factor
geo_decoder_mlp_expand_ratio: 4
geo_decoder_downsample_ratio: 1
geo_decoder_ln_post: true
point_feats: 4
pc_size: *pc_size
pc_sharpedge_size: *pc_sharpedge_size
cond_stage_config:
target: hy3dshape.models.conditioner.SingleImageEncoder
params:
main_image_encoder:
type: DinoImageEncoder # dino large
kwargs:
config:
attention_probs_dropout_prob: 0.0
drop_path_rate: 0.0
hidden_act: gelu
hidden_dropout_prob: 0.0
hidden_size: 1024
image_size: 518
initializer_range: 0.02
layer_norm_eps: 1.e-6
layerscale_value: 1.0
mlp_ratio: 4
model_type: dinov2
num_attention_heads: 16
num_channels: 3
num_hidden_layers: 24
patch_size: 14
qkv_bias: true
torch_dtype: float32
use_swiglu_ffn: false
image_size: 518
use_cls_token: true
denoiser_cfg:
target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
params:
input_size: *num_latents
in_channels: 64
hidden_size: 2048
context_dim: 1024
depth: 11
num_heads: 16
qk_norm: true
text_len: 1370
with_decoupled_ca: false
use_attention_pooling: false
qk_norm_type: 'rms'
qkv_bias: false
use_pos_emb: false
num_moe_layers: 6
num_experts: 8
moe_top_k: 2
scheduler_cfg:
transport:
target: hy3dshape.models.diffusion.transport.create_transport
params:
path_type: Linear
prediction: velocity
sampler:
target: hy3dshape.models.diffusion.transport.Sampler
params: {}
ode_params:
sampling_method: euler # dopri5 ...
num_steps: &num_steps 50
optimizer_cfg:
optimizer:
target: torch.optim.AdamW
params:
betas: [0.9, 0.99]
eps: 1.e-6
weight_decay: 1.e-2
scheduler:
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
params:
warm_up_steps: 50 # 5000
f_start: 1.e-6
f_min: 1.e-3
f_max: 1.0
pipeline_cfg:
target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
image_processor_cfg:
target: hy3dshape.preprocessors.ImageProcessorV2
params: {}
callbacks:
logger:
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
params:
step_frequency: 100 # 10000
num_samples: 1
sample_times: 1
mean: *mean
std: *std
bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
octree_depth: 8
num_chunks: 50000
mc_level: 0.0
file_loggers:
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
params:
step_frequency: 50 # 5000
test_data_path: "tools/mini_testset/images.json"

View File

@@ -1,4 +1,6 @@
-name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
+name: "HunyuanDiT flowmatching; VAE: 4096 token length; ImageEncoder: DINO-v2 Large; ImageSize: 518"
+# overfitting runs successfully and costs 68G memory under the current settings
+# you can adjust the model arch or batch_size according to your GPU memory
 
 training:
   steps: 10_0000_0000
@@ -8,14 +10,15 @@ training:
   gradient_clip_val: 1.0
   gradient_clip_algorithm: "norm"
   every_n_train_steps: 2000 # 5000
-  val_check_interval: 50 # 4096
+  val_check_interval: 200 # 4096
+  # val_check_interval must be smaller than every_n_train_steps!!!
   limit_val_batches: 16
 
 dataset:
   target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
   params:
     #! Base setting
-    batch_size: 2
+    batch_size: 4
     num_workers: 8
     val_num_workers: 4
@@ -24,7 +27,7 @@ dataset:
     val_data_list: tools/mini_trainset/preprocessed
     #! Image loading
-    cond_stage_key: "image" # image / text / image_text
+    cond_stage_key: "image"
     image_size: 518
     mean: &mean [0.5, 0.5, 0.5]
     std: &std [0.5, 0.5, 0.5]
@@ -55,63 +58,27 @@ model:
     first_stage_config:
       target: hy3dshape.models.autoencoders.ShapeVAE
      from_pretrained: tencent/Hunyuan3D-2.1
-      params:
-        num_latents: &num_latents 512
-        embed_dim: 64
-        num_freqs: 8
-        include_pi: false
-        heads: 16
-        width: 1024
-        num_encoder_layers: 8
-        num_decoder_layers: 16
-        qkv_bias: false
-        qk_norm: true
-        scale_factor: *z_scale_factor
-        geo_decoder_mlp_expand_ratio: 4
-        geo_decoder_downsample_ratio: 1
-        geo_decoder_ln_post: true
-        point_feats: 4
-        pc_size: *pc_size
-        pc_sharpedge_size: *pc_sharpedge_size
 
     cond_stage_config:
       target: hy3dshape.models.conditioner.SingleImageEncoder
       params:
+        drop_ratio: 0.1
         main_image_encoder:
-          type: DinoImageEncoder # dino large
+          type: DinoImageEncoder
           kwargs:
-            config:
-              attention_probs_dropout_prob: 0.0
-              drop_path_rate: 0.0
-              hidden_act: gelu
-              hidden_dropout_prob: 0.0
-              hidden_size: 1024
-              image_size: 518
-              initializer_range: 0.02
-              layer_norm_eps: 1.e-6
-              layerscale_value: 1.0
-              mlp_ratio: 4
-              model_type: dinov2
-              num_attention_heads: 16
-              num_channels: 3
-              num_hidden_layers: 24
-              patch_size: 14
-              qkv_bias: true
-              torch_dtype: float32
-              use_swiglu_ffn: false
+            version: 'facebook/dinov2-large'
             image_size: 518
         use_cls_token: true
 
     denoiser_cfg:
       target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
       params:
-        input_size: *num_latents
+        input_size: 4096
         in_channels: 64
-        hidden_size: 768
+        hidden_size: 2048
         context_dim: 1024
-        depth: 6
-        num_heads: 12
+        depth: 16
+        num_heads: 16
         qk_norm: true
         text_len: 1370
         with_decoupled_ca: false
@@ -147,7 +114,7 @@ model:
       scheduler:
         target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
         params:
-          warm_up_steps: 50 # 5000
+          warm_up_steps: 500 # 5000
           f_start: 1.e-6
           f_min: 1.e-3
           f_max: 1.0
@@ -163,7 +130,7 @@ callbacks:
   logger:
     target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
     params:
-      step_frequency: 100 # 10000
+      step_frequency: 1000 # 10000
       num_samples: 1
       sample_times: 1
       mean: *mean
@@ -176,5 +143,5 @@ callbacks:
   file_loggers:
     target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
     params:
-      step_frequency: 50 # 5000
+      step_frequency: 500 # 5000
       test_data_path: "tools/mini_testset/images.json"

View File

@@ -548,7 +548,7 @@ class PointCrossAttentionEncoder(nn.Module):
         if pc_sharpedge_size == 0:
             print(
-                f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is not given, using pc_size as pc_sharpedge_size')
+                f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is zero')
         else:
             print(
                 f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is given, using pc_size={pc_size}, pc_sharpedge_size={pc_sharpedge_size}')

View File

@@ -32,6 +32,7 @@ from transformers import (
     Dinov2Model,
     Dinov2Config,
 )
+from transformers import AutoImageProcessor, AutoModel
 
 
 def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
@@ -66,9 +67,10 @@ class ImageEncoder(nn.Module):
         super().__init__()
 
         if config is None:
-            self.model = self.MODEL_CLASS.from_pretrained(version)
+            self.model = AutoModel.from_pretrained(version)
         else:
             self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config))
+
         self.model.eval()
         self.model.requires_grad_(False)
         self.use_cls_token = use_cls_token
@@ -240,11 +242,26 @@ class SingleImageEncoder(nn.Module):
     def __init__(
         self,
         main_image_encoder,
+        drop_ratio=0.0
     ):
         super().__init__()
         self.main_image_encoder = build_image_encoder(main_image_encoder)
+        self.drop_ratio = drop_ratio
+        self.disable_drop = True
 
     def forward(self, image, mask=None, **kwargs):
+        outputs = {
+            'main': self.main_image_encoder(image, mask=mask, **kwargs),
+        }
+        if self.disable_drop:
+            return outputs
+        else:
+            random_p = torch.rand(len(image), device='cuda')
+            remain_bool_tensor = random_p > self.drop_ratio
+            outputs['main'] *= remain_bool_tensor.view(-1, 1, 1)
+            return outputs
         outputs = {
             'main': self.main_image_encoder(image, mask=mask, **kwargs),
         }
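
Note: the drop_ratio path added above is classifier-free-guidance-style condition dropout. When disable_drop is off, each sample's image embedding is zeroed with probability drop_ratio, so the model also sees an unconditional (all-zero) context during training. A minimal standalone sketch of the same masking logic (function name and shapes are illustrative, not the repo API):

import torch

def drop_conditions(cond: torch.Tensor, drop_ratio: float = 0.1) -> torch.Tensor:
    # cond: (batch, num_tokens, dim) conditioning embeddings.
    # Zero an entire sample's conditioning with probability drop_ratio.
    random_p = torch.rand(cond.shape[0], device=cond.device)
    keep = (random_p > drop_ratio).view(-1, 1, 1)
    return cond * keep  # dropped rows become the all-zero unconditional embedding

cond = torch.randn(4, 257, 1024)
dropped = drop_conditions(cond, drop_ratio=0.1)  # on average ~10% of rows zeroed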

View File

@@ -22,6 +22,8 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
+import os
+import yaml
 import math
 import numpy as np
@@ -31,6 +33,7 @@ import torch.nn.functional as F
 from einops import rearrange
 
 from .moe_layers import MoEBlock
+from ...utils import logger, synchronize_timer, smart_load_model
 
 
 def modulate(x, shift, scale):
@@ -464,6 +467,74 @@ class FinalLayer(nn.Module):
 
 class HunYuanDiTPlain(nn.Module):
+    @classmethod
+    @synchronize_timer('HunYuanDiTPlain Model Loading')
+    def from_single_file(
+        cls,
+        ckpt_path,
+        config_path,
+        device='cuda',
+        dtype=torch.float16,
+        use_safetensors=None,
+        **kwargs,
+    ):
+        # load config
+        with open(config_path, 'r') as f:
+            config = yaml.safe_load(f)
+
+        # load ckpt
+        if use_safetensors:
+            ckpt_path = ckpt_path.replace('.ckpt', '.safetensors')
+        if not os.path.exists(ckpt_path):
+            raise FileNotFoundError(f"Model file {ckpt_path} not found")
+        logger.info(f"Loading model from {ckpt_path}")
+
+        if use_safetensors:
+            import safetensors.torch
+            ckpt = safetensors.torch.load_file(ckpt_path, device='cpu')
+        else:
+            ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True)
+
+        if 'model' in ckpt:
+            ckpt = ckpt['model']
+        if 'model' in config:
+            config = config['model']
+        model_kwargs = config['params']
+        model_kwargs.update(kwargs)
+
+        model = cls(**model_kwargs)
+        model.load_state_dict(ckpt)
+        model.to(device=device, dtype=dtype)
+        return model
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_path,
+        device='cuda',
+        dtype=torch.float16,
+        use_safetensors=False,
+        variant='fp16',
+        subfolder='hunyuan3d-dit-v2-1',
+        **kwargs,
+    ):
+        config_path, ckpt_path = smart_load_model(
+            model_path,
+            subfolder=subfolder,
+            use_safetensors=use_safetensors,
+            variant=variant
+        )
+        return cls.from_single_file(
+            ckpt_path,
+            config_path,
+            device=device,
+            dtype=dtype,
+            use_safetensors=use_safetensors,
+            **kwargs
+        )
+
     def __init__(
         self,
         input_size=1024,
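
Note: the loaders added above follow the diffusers-style from_pretrained pattern: smart_load_model resolves a config/checkpoint pair, and from_single_file builds the module from the YAML's model params before loading the state dict. A hedged usage sketch, with defaults taken from the signatures above (variant='fp16', subfolder='hunyuan3d-dit-v2-1'):

import torch
from hy3dshape.models.denoisers.hunyuandit import HunYuanDiTPlain

# Resolve the config + weights for the DiT denoiser and load it standalone.
model = HunYuanDiTPlain.from_pretrained(
    'tencent/Hunyuan3D-2.1',
    device='cuda',
    dtype=torch.float16,
)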

View File

@@ -256,10 +256,7 @@ class Diffuser(pl.LightningModule):
     def forward(self, batch):
         with torch.autocast(device_type="cuda", dtype=torch.bfloat16): #float32 for text
             contexts = self.cond_stage_model(image=batch.get('image'), text=batch.get('text'), mask=batch.get('mask'))
-            # t5_text = contexts['t5_text']['prompt_embeds']
-            # nan_count = torch.isnan(t5_text).sum()
-            # if nan_count > 0:
-            #     print("t5_text has %d NaN values"%(nan_count))
         with torch.autocast(device_type="cuda", dtype=torch.float16):
             with torch.no_grad():
                 latents = self.first_stage_model.encode(batch[self.first_stage_key], sample_posterior=True)
@@ -333,9 +330,6 @@ class Diffuser(pl.LightningModule):
         image = batch.get("image", None)
         mask = batch.get('mask', None)
-        # if not isinstance(image, torch.Tensor): print(image.shape)
-        # if isinstance(mask, torch.Tensor): print(mask.shape)
         outputs = self.pipeline(image=image,
                                 mask=mask,
                                 generator=generator,
@@ -350,5 +344,6 @@ class Diffuser(pl.LightningModule):
                 f.write(traceback.format_exc())
                 f.write("\n")
             outputs = [None]
+        self.cond_stage_model.disable_drop = False
         return [outputs]

View File

@@ -323,7 +323,9 @@ class ImageConditionalFixASLDiffuserLogger(Callback):
         save_path = os.path.join(visual_dir, os.path.basename(image_path))
         save_path = os.path.splitext(save_path)[0] + '.glb'
-        print(image_path)
+        if isinstance(image_path, str):
+            print(image_path)
 
         with torch.no_grad():
             mesh = pl_module.sample(batch={"image": image_path}, **self.kwargs)[0][0]
         if isinstance(mesh, tuple) and len(mesh)==2:

View File

@@ -190,7 +190,7 @@ if __name__ == "__main__":
         precision=amp_type,
         callbacks=callbacks,
         accelerator="gpu",
-        devices=training_cfg.num_gpus,
+        devices=args.num_gpus,
         num_nodes=training_cfg.num_nodes,
         strategy=ddp_strategy,
         gradient_clip_val=training_cfg.get('gradient_clip_val'),

View File

@@ -13,7 +13,6 @@
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 from PIL import Image
-from hy3dshape.rembg import BackgroundRemover
 
 from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline
@@ -21,10 +20,12 @@ model_path = 'tencent/Hunyuan3D-2.1'
 pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path)
 
 image_path = 'demos/demo.png'
 image = Image.open(image_path).convert("RGBA")
 if image.mode == 'RGB':
     rembg = BackgroundRemover()
     image = rembg(image)
+image = image_path
 
 mesh = pipeline_shapegen(image=image)[0]
 mesh.export('demo.glb')
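
Note: after this change the demo hands the pipeline a file path instead of a PIL image, which implies the pipeline's image preprocessing accepts both input types. Both call styles as they appear in this commit (nothing beyond the diff is assumed):

from PIL import Image
from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline

pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained('tencent/Hunyuan3D-2.1')
mesh = pipeline(image='demos/demo.png')[0]  # path string, as in the updated demo
# mesh = pipeline(image=Image.open('demos/demo.png').convert("RGBA"))[0]  # PIL image, as before
mesh.export('demo.glb')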

View File

@@ -0,0 +1,51 @@
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
from PIL import Image
from hy3dshape.rembg import BackgroundRemover
from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline
model_path = 'tencent/Hunyuan3D-2.1'
pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path)
import torch
import yaml
from hy3dshape.utils import instantiate_from_config
# For example, you can convert deepspeed weights to a single file
# cd output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00004000.ckpt
# python3 zero_to_fp32.py ./ ./out --max_shard_size 30GB
# then you can get output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00004000.ckpt/out/pytorch_model.bin
ckpt_cfg_path = 'output_folder/dit/overfitting_depth_16_token_4096_lr1e4_uc/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml'
ckpt_path = 'output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00004000.ckpt/out/pytorch_model.bin'
config = yaml.safe_load(open(ckpt_cfg_path, 'r'))
model = instantiate_from_config(config['model']['params']['denoiser_cfg'])
sd = torch.load(ckpt_path)
sd = {k.replace('_forward_module.model.', ''):v for k,v in sd.items()}
msg = model.load_state_dict(sd)
print(msg)
model = model.cuda().half()
pipeline_shapegen.model = model
image = 'tools/mini_testset/images/015.png'
# image = Image.open(image_path).convert("RGBA")
# if image.mode == 'RGB':
# rembg = BackgroundRemover()
# image = rembg(image)
# mesh = pipeline_shapegen(image=image, guidance_scale=1.0)[0]
mesh = pipeline_shapegen(image=image)[0]
mesh.export('demo.glb')

View File

@@ -35,12 +35,11 @@ export NCCL_DEBUG=WARN
 node_num=$1
 node_rank=$2
-master_ip=$3
-config=$4
-output_dir=$5
+num_gpu_per_node=$3
+master_ip=$4
+config=$5
+output_dir=$6
 
-# config='configs/dit-from-scratch-overfitting-flowmatching-dinog518-bf16-lr1e4-1024.yaml'
-# output_dir='output_folder/dit/overfitting_10'
 
 echo node_num $node_num
 echo node_rank $node_rank
@@ -64,7 +63,8 @@ NCCL_IB_GID_INDEX=3 \
 NCCL_NVLS_ENABLE=0 \
 python3 main.py \
     --num_nodes $node_num \
-    --num_gpus 8 \
+    --num_gpus $num_gpu_per_node \
     --config $config \
     --output_dir $output_dir \
     --deepspeed
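
Note: with the GPU count now the third positional argument, a single-node launch reads as follows (this matches the call made by the new train_demo.sh below):

# args: node_num node_rank num_gpu_per_node master_ip config output_dir
bash scripts/train_deepspeed.sh 1 0 8 0.0.0.0 \
    configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml \
    output_folder/dit/overfitting_depth_16_token_4096_lr1e4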

hy3dshape/train_demo.sh Normal file
View File

@@ -0,0 +1,15 @@
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export num_gpu_per_node=8
# export CUDA_VISIBLE_DEVICES=0
# export num_gpu_per_node=1
export node_num=1
export node_rank=0
export master_ip=0.0.0.0 # set your master_ip
# export config=configs/hunyuandit-finetuning-flowmatching-dinol518-bf16-lr1e5-4096.yaml
# export output_dir=output_folder/dit/fintuning_lr1e5
export config=configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml
export output_dir=output_folder/dit/overfitting_depth_16_token_4096_lr1e4
bash scripts/train_deepspeed.sh $node_num $node_rank $num_gpu_per_node $master_ip $config $output_dir