fix shape training

2025-06-26 16:03:44 +08:00
parent d48c432b58
commit 7c92655a0d
15 changed files with 199 additions and 657 deletions
--- a/hy3dshape/configs/hunyuan3ddit-full-params-finetuning-flowmatching-dinog518-bf16-lr1e5-512.yaml
+++ b/hy3dshape/configs/hunyuan3ddit-full-params-finetuning-flowmatching-dinog518-bf16-lr1e5-512.yaml
@@ -1,174 +0,0 @@
-name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
-
-training:
-  steps: 10_0000_0000
-  use_amp: true
-  amp_type: "bf16"
-  base_lr: 1.e-5
-  gradient_clip_val: 1.0
-  gradient_clip_algorithm: "norm"
-  every_n_train_steps: 2000 # 5000
-  val_check_interval: 50 # 4096
-  limit_val_batches: 16
-
-dataset:
-  target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
-  params:
-    #! Base setting
-    batch_size: 4
-    num_workers: 8
-    val_num_workers: 4
-
-    # Data 
-    train_data_list: tools/mini_trainset/preprocessed
-    val_data_list: tools/mini_trainset/preprocessed
-
-    #! Image loading
-    cond_stage_key: "image" # image / text / image_text
-    image_size: 518
-    mean: &mean [0.5, 0.5, 0.5]
-    std: &std [0.5, 0.5, 0.5]
-
-    #! Point cloud sampling
-    pc_size: &pc_size 30720
-    pc_sharpedge_size: &pc_sharpedge_size 30720
-    sharpedge_label: &sharpedge_label true
-    return_normal: true
-
-    #! Augmentation
-    padding: true
-
-model:
-  target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
-  params:
-    first_stage_key: "surface"
-    cond_stage_key: "image"
-    scale_by_std: false
-    z_scale_factor: &z_scale_factor 0.9990943042622529 # 1 / 1.0009065167661184
-    torch_compile: false
-
-    # ema_config:
-    #   ema_model: LitEma
-    #   ema_decay: 0.999
-    #   ema_inference: false
-
-    first_stage_config:
-      target: hy3dshape.models.autoencoders.ShapeVAE
-      from_pretrained: tencent/Hunyuan3D-2.1
-      params:
-        num_latents: &num_latents 512
-        embed_dim: 64
-        num_freqs: 8
-        include_pi: false
-        heads: 16
-        width: 1024
-        point_feats: 4
-        num_decoder_layers: 16
-        pc_size: *pc_size
-        pc_sharpedge_size: *pc_sharpedge_size
-        qkv_bias: false
-        qk_norm: true
-        scale_factor: *z_scale_factor
-        geo_decoder_mlp_expand_ratio: 4
-        geo_decoder_downsample_ratio: 1
-        geo_decoder_ln_post: true
-
-    cond_stage_config:
-      target: hy3dshape.models.conditioner.SingleImageEncoder
-      params:
-        main_image_encoder:
-            type: DinoImageEncoder # dino giant
-            kwargs:
-                config:
-                  attention_probs_dropout_prob: 0.0
-                  drop_path_rate: 0.0
-                  hidden_act: gelu
-                  hidden_dropout_prob: 0.0
-                  hidden_size: 1536
-                  image_size: 518
-                  initializer_range: 0.02
-                  layer_norm_eps: 1.e-6
-                  layerscale_value: 1.0
-                  mlp_ratio: 4
-                  model_type: dinov2
-                  num_attention_heads: 24
-                  num_channels: 3
-                  num_hidden_layers: 40
-                  patch_size: 14
-                  qkv_bias: true
-                  torch_dtype: float32
-                  use_swiglu_ffn: true
-                image_size: 518
-
-    denoiser_cfg:
-      target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT
-      params:
-        ckpt_path: ~/.cache/hy3dgen/tencent/Hunyuan3D-2-1-Shape/dit/model.fp16.ckpt
-        input_size: *num_latents
-        context_in_dim: 1536
-        hidden_size: 1024
-        mlp_ratio: 4.0
-        num_heads: 16
-        depth: 16
-        depth_single_blocks: 32
-        axes_dim: [64]
-        theta: 10000
-        qkv_bias: true
-        use_pe: false
-        force_norm_fp32: true
-
-    scheduler_cfg:
-      transport:
-        target: hy3dshape.models.diffusion.transport.create_transport
-        params:
-          path_type: Linear
-          prediction: velocity
-      sampler:
-        target: hy3dshape.models.diffusion.transport.Sampler
-        params: {}
-        ode_params:
-          sampling_method: euler # dopri5 ...
-          num_steps: &num_steps 50
-
-    optimizer_cfg:
-      optimizer:
-        target: torch.optim.AdamW
-        params:
-          betas: [0.9, 0.99]
-          eps: 1.e-6
-          weight_decay: 1.e-2
-
-      scheduler:
-        target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
-        params:
-          warm_up_steps: 50 # 5000
-          f_start: 1.e-6
-          f_min: 1.e-3
-          f_max: 1.0
-
-    pipeline_cfg:
-      target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
-
-    image_processor_cfg:
-      target: hy3dshape.preprocessors.ImageProcessorV2
-      params: {}
-
-callbacks:
-    logger:
-      target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
-      params:
-        step_frequency: 100 # 10000
-        num_samples: 1
-        sample_times: 1
-        mean: *mean
-        std: *std
-        bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
-        octree_depth: 8
-        num_chunks: 50000
-        mc_level: 0.0
-    
-    file_loggers:
-        target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
-        params:
-          step_frequency: 50 # 5000
-          test_data_path: "tools/mini_testset/images.json"
--- a/hy3dshape/configs/hunyuan3ddit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-512.yaml
+++ b/hy3dshape/configs/hunyuan3ddit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-512.yaml
@@ -1,173 +0,0 @@
-name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
-
-training:
-  steps: 10_0000_0000
-  use_amp: true
-  amp_type: "bf16"
-  base_lr: 1e-4
-  gradient_clip_val: 1.0
-  gradient_clip_algorithm: "norm"
-  every_n_train_steps: 2000 # 5000
-  val_check_interval: 50 # 4096
-  limit_val_batches: 16
-
-dataset:
-  target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
-  params:
-    #! Base setting
-    batch_size: 2
-    num_workers: 8
-    val_num_workers: 4
-
-    # Data 
-    train_data_list: tools/mini_trainset/preprocessed
-    val_data_list: tools/mini_trainset/preprocessed
-
-    #! Image loading
-    cond_stage_key: "image" # image / text / image_text
-    image_size: 518
-    mean: &mean [0.5, 0.5, 0.5]
-    std: &std [0.5, 0.5, 0.5]
-
-    #! Point cloud sampling
-    pc_size: &pc_size 10240
-    pc_sharpedge_size: &pc_sharpedge_size 10240
-    sharpedge_label: &sharpedge_label true
-    return_normal: true
-
-    #! Augmentation
-    padding: true
-
-model:
-  target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
-  params:
-    first_stage_key: "surface"
-    cond_stage_key: "image"
-    scale_by_std: false
-    z_scale_factor: &z_scale_factor 0.9990943042622529 # 1 / 1.0009065167661184
-    torch_compile: false
-
-    # ema_config:
-    #   ema_model: LitEma
-    #   ema_decay: 0.999
-    #   ema_inference: false
-
-    first_stage_config:
-      target: hy3dshape.models.autoencoders.ShapeVAE
-      from_pretrained: tencent/Hunyuan3D-2.1
-      params:
-        num_latents: &num_latents 512
-        embed_dim: 64
-        num_freqs: 8
-        include_pi: false
-        heads: 16
-        width: 1024
-        point_feats: 4
-        num_decoder_layers: 16
-        pc_size: *pc_size
-        pc_sharpedge_size: *pc_sharpedge_size
-        qkv_bias: false
-        qk_norm: true
-        scale_factor: *z_scale_factor
-        geo_decoder_mlp_expand_ratio: 4
-        geo_decoder_downsample_ratio: 1
-        geo_decoder_ln_post: true
-
-    cond_stage_config:
-      target: hy3dshape.models.conditioner.SingleImageEncoder
-      params:
-        main_image_encoder:
-            type: DinoImageEncoder # dino giant
-            kwargs:
-                config:
-                  attention_probs_dropout_prob: 0.0
-                  drop_path_rate: 0.0
-                  hidden_act: gelu
-                  hidden_dropout_prob: 0.0
-                  hidden_size: 1536
-                  image_size: 518
-                  initializer_range: 0.02
-                  layer_norm_eps: 1.e-6
-                  layerscale_value: 1.0
-                  mlp_ratio: 4
-                  model_type: dinov2
-                  num_attention_heads: 24
-                  num_channels: 3
-                  num_hidden_layers: 40
-                  patch_size: 14
-                  qkv_bias: true
-                  torch_dtype: float32
-                  use_swiglu_ffn: true
-                image_size: 518
-
-    denoiser_cfg:
-      target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT
-      params:
-        input_size: *num_latents
-        context_in_dim: 1536
-        hidden_size: 1024
-        mlp_ratio: 4.0
-        num_heads: 16
-        depth: 8
-        depth_single_blocks: 16
-        axes_dim: [64]
-        theta: 10000
-        qkv_bias: true
-        use_pe: false
-        force_norm_fp32: true
-
-    scheduler_cfg:
-      transport:
-        target: hy3dshape.models.diffusion.transport.create_transport
-        params:
-          path_type: Linear
-          prediction: velocity
-      sampler:
-        target: hy3dshape.models.diffusion.transport.Sampler
-        params: {}
-        ode_params:
-          sampling_method: euler # dopri5 ...
-          num_steps: &num_steps 50
-
-    optimizer_cfg:
-      optimizer:
-        target: torch.optim.AdamW
-        params:
-          betas: [0.9, 0.99]
-          eps: 1.e-6
-          weight_decay: 1.e-2
-
-      scheduler:
-        target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
-        params:
-          warm_up_steps: 50 # 5000
-          f_start: 1.e-6
-          f_min: 1.e-3
-          f_max: 1.0
-
-    pipeline_cfg:
-      target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
-
-    image_processor_cfg:
-      target: hy3dshape.preprocessors.ImageProcessorV2
-      params: {}
-
-callbacks:
-    logger:
-      target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
-      params:
-        step_frequency: 100 # 10000
-        num_samples: 1
-        sample_times: 1
-        mean: *mean
-        std: *std
-        bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
-        octree_depth: 8
-        num_chunks: 50000
-        mc_level: 0.0
-    
-    file_loggers:
-        target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
-        params:
-          step_frequency: 50 # 5000
-          test_data_path: "tools/mini_testset/images.json"
--- a/hy3dshape/configs/hunyuandit-finetuning-flowmatching-dinol518-bf16-lr1e5-4096.yaml
+++ b/hy3dshape/configs/hunyuandit-finetuning-flowmatching-dinol518-bf16-lr1e5-4096.yaml
@@ -1,4 +1,5 @@
-name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
+name: "HunyuanDiT flowmatching; VAE: 4096 token length; ImageEncoder: DINO-v2 Large; ImageSize: 518"
+# Trained successfully on 8 x H20 with 98G memory.

 training:
  steps: 10_0000_0000
@@ -8,7 +9,8 @@ training:
  gradient_clip_val: 1.0
  gradient_clip_algorithm: "norm"
  every_n_train_steps: 2000 # 5000
-  val_check_interval: 50 # 4096
+  val_check_interval: 200 # 4096 
+  # val_check_interval must be smaller than every_n_train_steps!!!
  limit_val_batches: 16

 dataset:
@@ -24,7 +26,7 @@ dataset:
    val_data_list: tools/mini_trainset/preprocessed

    #! Image loading
-    cond_stage_key: "image" # image / text / image_text
+    cond_stage_key: "image"
    image_size: 518
    mean: &mean [0.5, 0.5, 0.5]
    std: &std [0.5, 0.5, 0.5]
@@ -55,73 +57,21 @@ model:
    first_stage_config:
      target: hy3dshape.models.autoencoders.ShapeVAE
      from_pretrained: tencent/Hunyuan3D-2.1
-      params:
-        num_latents: &num_latents 4096
-        embed_dim: 64
-        num_freqs: 8
-        include_pi: false
-        heads: 16
-        width: 1024
-        num_encoder_layers: 8
-        num_decoder_layers: 16
-        qkv_bias: false
-        qk_norm: true
-        scale_factor: *z_scale_factor
-        geo_decoder_mlp_expand_ratio: 4
-        geo_decoder_downsample_ratio: 1
-        geo_decoder_ln_post: true
-        point_feats: 4
-        pc_size: *pc_size
-        pc_sharpedge_size: *pc_sharpedge_size

    cond_stage_config:
      target: hy3dshape.models.conditioner.SingleImageEncoder
      params:
+        drop_ratio: 0.1
        main_image_encoder:
-            type: DinoImageEncoder # dino large
+            type: DinoImageEncoder 
            kwargs:
-                config:
-                  attention_probs_dropout_prob: 0.0
-                  drop_path_rate: 0.0
-                  hidden_act: gelu
-                  hidden_dropout_prob: 0.0
-                  hidden_size: 1024
-                  image_size: 518
-                  initializer_range: 0.02
-                  layer_norm_eps: 1.e-6
-                  layerscale_value: 1.0
-                  mlp_ratio: 4
-                  model_type: dinov2
-                  num_attention_heads: 16
-                  num_channels: 3
-                  num_hidden_layers: 24
-                  patch_size: 14
-                  qkv_bias: true
-                  torch_dtype: float32
-                  use_swiglu_ffn: false
+                version: 'facebook/dinov2-large'
                image_size: 518
                use_cls_token: true

-
    denoiser_cfg:
      target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
-      params:
-        input_size: *num_latents
-        in_channels: 64
-        hidden_size: 2048
-        context_dim: 1024
-        depth: 21
-        num_heads: 16
-        qk_norm: true
-        text_len: 1370
-        with_decoupled_ca: false
-        use_attention_pooling: false
-        qk_norm_type: 'rms'
-        qkv_bias: false
-        use_pos_emb: false
-        num_moe_layers: 6
-        num_experts: 8
-        moe_top_k: 2
+      from_pretrained: tencent/Hunyuan3D-2.1
        
    scheduler_cfg:
      transport:
@@ -163,7 +113,7 @@ callbacks:
    logger:
      target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
      params:
-        step_frequency: 100 # 10000
+        step_frequency: 1000 # 10000
        num_samples: 1
        sample_times: 1
        mean: *mean
@@ -176,5 +126,5 @@ callbacks:
    file_loggers:
        target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
        params:
-          step_frequency: 50 # 5000
+          step_frequency: 500 # 5000
          test_data_path: "tools/mini_testset/images.json"
--- a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-4096.yaml
+++ b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-4096.yaml
@@ -1,180 +0,0 @@
-name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
-
-training:
-  steps: 10_0000_0000
-  use_amp: true
-  amp_type: "bf16"
-  base_lr: 1e-4
-  gradient_clip_val: 1.0
-  gradient_clip_algorithm: "norm"
-  every_n_train_steps: 2000 # 5000
-  val_check_interval: 50 # 4096
-  limit_val_batches: 16
-
-dataset:
-  target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
-  params:
-    #! Base setting
-    batch_size: 2
-    num_workers: 8
-    val_num_workers: 4
-
-    # Data 
-    train_data_list: tools/mini_trainset/preprocessed
-    val_data_list: tools/mini_trainset/preprocessed
-
-    #! Image loading
-    cond_stage_key: "image" # image / text / image_text
-    image_size: 518
-    mean: &mean [0.5, 0.5, 0.5]
-    std: &std [0.5, 0.5, 0.5]
-
-    #! Point cloud sampling
-    pc_size: &pc_size 81920
-    pc_sharpedge_size: &pc_sharpedge_size 0
-    sharpedge_label: &sharpedge_label true
-    return_normal: true
-
-    #! Augmentation
-    padding: true
-
-model:
-  target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
-  params:
-    first_stage_key: "surface"
-    cond_stage_key: "image"
-    scale_by_std: false
-    z_scale_factor: &z_scale_factor 1.0039506158752403
-    torch_compile: false
-
-    # ema_config:
-    #   ema_model: LitEma
-    #   ema_decay: 0.999
-    #   ema_inference: false
-
-    first_stage_config:
-      target: hy3dshape.models.autoencoders.ShapeVAE
-      from_pretrained: tencent/Hunyuan3D-2.1
-      params:
-        num_latents: &num_latents 4096
-        embed_dim: 64
-        num_freqs: 8
-        include_pi: false
-        heads: 16
-        width: 1024
-        num_encoder_layers: 8
-        num_decoder_layers: 16
-        qkv_bias: false
-        qk_norm: true
-        scale_factor: *z_scale_factor
-        geo_decoder_mlp_expand_ratio: 4
-        geo_decoder_downsample_ratio: 1
-        geo_decoder_ln_post: true
-        point_feats: 4
-        pc_size: *pc_size
-        pc_sharpedge_size: *pc_sharpedge_size
-
-    cond_stage_config:
-      target: hy3dshape.models.conditioner.SingleImageEncoder
-      params:
-        main_image_encoder:
-            type: DinoImageEncoder # dino large
-            kwargs:
-                config:
-                  attention_probs_dropout_prob: 0.0
-                  drop_path_rate: 0.0
-                  hidden_act: gelu
-                  hidden_dropout_prob: 0.0
-                  hidden_size: 1024
-                  image_size: 518
-                  initializer_range: 0.02
-                  layer_norm_eps: 1.e-6
-                  layerscale_value: 1.0
-                  mlp_ratio: 4
-                  model_type: dinov2
-                  num_attention_heads: 16
-                  num_channels: 3
-                  num_hidden_layers: 24
-                  patch_size: 14
-                  qkv_bias: true
-                  torch_dtype: float32
-                  use_swiglu_ffn: false
-                image_size: 518
-                use_cls_token: true
-
-
-    denoiser_cfg:
-      target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
-      params:
-        input_size: *num_latents
-        in_channels: 64
-        hidden_size: 2048
-        context_dim: 1024
-        depth: 11
-        num_heads: 16
-        qk_norm: true
-        text_len: 1370
-        with_decoupled_ca: false
-        use_attention_pooling: false
-        qk_norm_type: 'rms'
-        qkv_bias: false
-        use_pos_emb: false
-        num_moe_layers: 6
-        num_experts: 8
-        moe_top_k: 2
-        
-    scheduler_cfg:
-      transport:
-        target: hy3dshape.models.diffusion.transport.create_transport
-        params:
-          path_type: Linear
-          prediction: velocity
-      sampler:
-        target: hy3dshape.models.diffusion.transport.Sampler
-        params: {}
-        ode_params:
-          sampling_method: euler # dopri5 ...
-          num_steps: &num_steps 50
-
-    optimizer_cfg:
-      optimizer:
-        target: torch.optim.AdamW
-        params:
-          betas: [0.9, 0.99]
-          eps: 1.e-6
-          weight_decay: 1.e-2
-
-      scheduler:
-        target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
-        params:
-          warm_up_steps: 50 # 5000
-          f_start: 1.e-6
-          f_min: 1.e-3
-          f_max: 1.0
-
-    pipeline_cfg:
-      target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
-
-    image_processor_cfg:
-      target: hy3dshape.preprocessors.ImageProcessorV2
-      params: {}
-
-callbacks:
-    logger:
-      target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
-      params:
-        step_frequency: 100 # 10000
-        num_samples: 1
-        sample_times: 1
-        mean: *mean
-        std: *std
-        bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
-        octree_depth: 8
-        num_chunks: 50000
-        mc_level: 0.0
-    
-    file_loggers:
-        target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
-        params:
-          step_frequency: 50 # 5000
-          test_data_path: "tools/mini_testset/images.json"
--- a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml
+++ b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml
@@ -1,4 +1,6 @@
-name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
+name: "HunyuanDiT flowmatching; VAE: 4096 token length; ImageEncoder: DINO-v2 Large; ImageSize: 518"
+# Overfitting runs successfully and uses about 68G of memory under the current settings.
+# You can adjust the model architecture or batch_size according to your GPU memory.

 training:
  steps: 10_0000_0000
@@ -8,14 +10,15 @@ training:
  gradient_clip_val: 1.0
  gradient_clip_algorithm: "norm"
  every_n_train_steps: 2000 # 5000
-  val_check_interval: 50 # 4096
+  val_check_interval: 200 # 4096 
+  # val_check_interval must be smaller than every_n_train_steps!!!
  limit_val_batches: 16

 dataset:
  target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
  params:
    #! Base setting
-    batch_size: 2
+    batch_size: 4
    num_workers: 8
    val_num_workers: 4

@@ -24,7 +27,7 @@ dataset:
    val_data_list: tools/mini_trainset/preprocessed

    #! Image loading
-    cond_stage_key: "image" # image / text / image_text
+    cond_stage_key: "image"
    image_size: 518
    mean: &mean [0.5, 0.5, 0.5]
    std: &std [0.5, 0.5, 0.5]
@@ -55,63 +58,27 @@ model:
    first_stage_config:
      target: hy3dshape.models.autoencoders.ShapeVAE
      from_pretrained: tencent/Hunyuan3D-2.1
-      params:
-        num_latents: &num_latents 512
-        embed_dim: 64
-        num_freqs: 8
-        include_pi: false
-        heads: 16
-        width: 1024
-        num_encoder_layers: 8
-        num_decoder_layers: 16
-        qkv_bias: false
-        qk_norm: true
-        scale_factor: *z_scale_factor
-        geo_decoder_mlp_expand_ratio: 4
-        geo_decoder_downsample_ratio: 1
-        geo_decoder_ln_post: true
-        point_feats: 4
-        pc_size: *pc_size
-        pc_sharpedge_size: *pc_sharpedge_size

    cond_stage_config:
      target: hy3dshape.models.conditioner.SingleImageEncoder
      params:
+        drop_ratio: 0.1
        main_image_encoder:
-            type: DinoImageEncoder # dino large
+            type: DinoImageEncoder 
            kwargs:
-                config:
-                  attention_probs_dropout_prob: 0.0
-                  drop_path_rate: 0.0
-                  hidden_act: gelu
-                  hidden_dropout_prob: 0.0
-                  hidden_size: 1024
-                  image_size: 518
-                  initializer_range: 0.02
-                  layer_norm_eps: 1.e-6
-                  layerscale_value: 1.0
-                  mlp_ratio: 4
-                  model_type: dinov2
-                  num_attention_heads: 16
-                  num_channels: 3
-                  num_hidden_layers: 24
-                  patch_size: 14
-                  qkv_bias: true
-                  torch_dtype: float32
-                  use_swiglu_ffn: false
+                version: 'facebook/dinov2-large'
                image_size: 518
                use_cls_token: true

-
    denoiser_cfg:
      target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
      params:
-        input_size: *num_latents
+        input_size: 4096
        in_channels: 64
-        hidden_size: 768
+        hidden_size: 2048
        context_dim: 1024
-        depth: 6
-        num_heads: 12
+        depth: 16
+        num_heads: 16
        qk_norm: true
        text_len: 1370
        with_decoupled_ca: false
@@ -147,7 +114,7 @@ model:
      scheduler:
        target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
        params:
-          warm_up_steps: 50 # 5000
+          warm_up_steps: 500 # 5000
          f_start: 1.e-6
          f_min: 1.e-3
          f_max: 1.0
@@ -163,7 +130,7 @@ callbacks:
    logger:
      target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
      params:
-        step_frequency: 100 # 10000
+        step_frequency: 1000 # 10000
        num_samples: 1
        sample_times: 1
        mean: *mean
@@ -176,5 +143,5 @@ callbacks:
    file_loggers:
        target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
        params:
-          step_frequency: 50 # 5000
+          step_frequency: 500 # 5000
          test_data_path: "tools/mini_testset/images.json"