diff --git a/hy3dshape/configs/hunyuan3ddit-full-params-finetuning-flowmatching-dinog518-bf16-lr1e5-512.yaml b/hy3dshape/configs/hunyuan3ddit-full-params-finetuning-flowmatching-dinog518-bf16-lr1e5-512.yaml deleted file mode 100755 index 3128e44..0000000 --- a/hy3dshape/configs/hunyuan3ddit-full-params-finetuning-flowmatching-dinog518-bf16-lr1e5-512.yaml +++ /dev/null @@ -1,174 +0,0 @@ -name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518" - -training: - steps: 10_0000_0000 - use_amp: true - amp_type: "bf16" - base_lr: 1.e-5 - gradient_clip_val: 1.0 - gradient_clip_algorithm: "norm" - every_n_train_steps: 2000 # 5000 - val_check_interval: 50 # 4096 - limit_val_batches: 16 - -dataset: - target: hy3dshape.data.dit_asl.AlignedShapeLatentModule - params: - #! Base setting - batch_size: 4 - num_workers: 8 - val_num_workers: 4 - - # Data - train_data_list: tools/mini_trainset/preprocessed - val_data_list: tools/mini_trainset/preprocessed - - #! Image loading - cond_stage_key: "image" # image / text / image_text - image_size: 518 - mean: &mean [0.5, 0.5, 0.5] - std: &std [0.5, 0.5, 0.5] - - #! Point cloud sampling - pc_size: &pc_size 30720 - pc_sharpedge_size: &pc_sharpedge_size 30720 - sharpedge_label: &sharpedge_label true - return_normal: true - - #! Augmentation - padding: true - -model: - target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser - params: - first_stage_key: "surface" - cond_stage_key: "image" - scale_by_std: false - z_scale_factor: &z_scale_factor 0.9990943042622529 # 1 / 1.0009065167661184 - torch_compile: false - - # ema_config: - # ema_model: LitEma - # ema_decay: 0.999 - # ema_inference: false - - first_stage_config: - target: hy3dshape.models.autoencoders.ShapeVAE - from_pretrained: tencent/Hunyuan3D-2.1 - params: - num_latents: &num_latents 512 - embed_dim: 64 - num_freqs: 8 - include_pi: false - heads: 16 - width: 1024 - point_feats: 4 - num_decoder_layers: 16 - pc_size: *pc_size - pc_sharpedge_size: *pc_sharpedge_size - qkv_bias: false - qk_norm: true - scale_factor: *z_scale_factor - geo_decoder_mlp_expand_ratio: 4 - geo_decoder_downsample_ratio: 1 - geo_decoder_ln_post: true - - cond_stage_config: - target: hy3dshape.models.conditioner.SingleImageEncoder - params: - main_image_encoder: - type: DinoImageEncoder # dino giant - kwargs: - config: - attention_probs_dropout_prob: 0.0 - drop_path_rate: 0.0 - hidden_act: gelu - hidden_dropout_prob: 0.0 - hidden_size: 1536 - image_size: 518 - initializer_range: 0.02 - layer_norm_eps: 1.e-6 - layerscale_value: 1.0 - mlp_ratio: 4 - model_type: dinov2 - num_attention_heads: 24 - num_channels: 3 - num_hidden_layers: 40 - patch_size: 14 - qkv_bias: true - torch_dtype: float32 - use_swiglu_ffn: true - image_size: 518 - - denoiser_cfg: - target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT - params: - ckpt_path: ~/.cache/hy3dgen/tencent/Hunyuan3D-2-1-Shape/dit/model.fp16.ckpt - input_size: *num_latents - context_in_dim: 1536 - hidden_size: 1024 - mlp_ratio: 4.0 - num_heads: 16 - depth: 16 - depth_single_blocks: 32 - axes_dim: [64] - theta: 10000 - qkv_bias: true - use_pe: false - force_norm_fp32: true - - scheduler_cfg: - transport: - target: hy3dshape.models.diffusion.transport.create_transport - params: - path_type: Linear - prediction: velocity - sampler: - target: hy3dshape.models.diffusion.transport.Sampler - params: {} - ode_params: - sampling_method: euler # dopri5 ... - num_steps: &num_steps 50 - - optimizer_cfg: - optimizer: - target: torch.optim.AdamW - params: - betas: [0.9, 0.99] - eps: 1.e-6 - weight_decay: 1.e-2 - - scheduler: - target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler - params: - warm_up_steps: 50 # 5000 - f_start: 1.e-6 - f_min: 1.e-3 - f_max: 1.0 - - pipeline_cfg: - target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline - - image_processor_cfg: - target: hy3dshape.preprocessors.ImageProcessorV2 - params: {} - -callbacks: - logger: - target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger - params: - step_frequency: 100 # 10000 - num_samples: 1 - sample_times: 1 - mean: *mean - std: *std - bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01] - octree_depth: 8 - num_chunks: 50000 - mc_level: 0.0 - - file_loggers: - target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger - params: - step_frequency: 50 # 5000 - test_data_path: "tools/mini_testset/images.json" diff --git a/hy3dshape/configs/hunyuan3ddit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-512.yaml b/hy3dshape/configs/hunyuan3ddit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-512.yaml deleted file mode 100644 index 33dcbd0..0000000 --- a/hy3dshape/configs/hunyuan3ddit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-512.yaml +++ /dev/null @@ -1,173 +0,0 @@ -name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518" - -training: - steps: 10_0000_0000 - use_amp: true - amp_type: "bf16" - base_lr: 1e-4 - gradient_clip_val: 1.0 - gradient_clip_algorithm: "norm" - every_n_train_steps: 2000 # 5000 - val_check_interval: 50 # 4096 - limit_val_batches: 16 - -dataset: - target: hy3dshape.data.dit_asl.AlignedShapeLatentModule - params: - #! Base setting - batch_size: 2 - num_workers: 8 - val_num_workers: 4 - - # Data - train_data_list: tools/mini_trainset/preprocessed - val_data_list: tools/mini_trainset/preprocessed - - #! Image loading - cond_stage_key: "image" # image / text / image_text - image_size: 518 - mean: &mean [0.5, 0.5, 0.5] - std: &std [0.5, 0.5, 0.5] - - #! Point cloud sampling - pc_size: &pc_size 10240 - pc_sharpedge_size: &pc_sharpedge_size 10240 - sharpedge_label: &sharpedge_label true - return_normal: true - - #! Augmentation - padding: true - -model: - target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser - params: - first_stage_key: "surface" - cond_stage_key: "image" - scale_by_std: false - z_scale_factor: &z_scale_factor 0.9990943042622529 # 1 / 1.0009065167661184 - torch_compile: false - - # ema_config: - # ema_model: LitEma - # ema_decay: 0.999 - # ema_inference: false - - first_stage_config: - target: hy3dshape.models.autoencoders.ShapeVAE - from_pretrained: tencent/Hunyuan3D-2.1 - params: - num_latents: &num_latents 512 - embed_dim: 64 - num_freqs: 8 - include_pi: false - heads: 16 - width: 1024 - point_feats: 4 - num_decoder_layers: 16 - pc_size: *pc_size - pc_sharpedge_size: *pc_sharpedge_size - qkv_bias: false - qk_norm: true - scale_factor: *z_scale_factor - geo_decoder_mlp_expand_ratio: 4 - geo_decoder_downsample_ratio: 1 - geo_decoder_ln_post: true - - cond_stage_config: - target: hy3dshape.models.conditioner.SingleImageEncoder - params: - main_image_encoder: - type: DinoImageEncoder # dino giant - kwargs: - config: - attention_probs_dropout_prob: 0.0 - drop_path_rate: 0.0 - hidden_act: gelu - hidden_dropout_prob: 0.0 - hidden_size: 1536 - image_size: 518 - initializer_range: 0.02 - layer_norm_eps: 1.e-6 - layerscale_value: 1.0 - mlp_ratio: 4 - model_type: dinov2 - num_attention_heads: 24 - num_channels: 3 - num_hidden_layers: 40 - patch_size: 14 - qkv_bias: true - torch_dtype: float32 - use_swiglu_ffn: true - image_size: 518 - - denoiser_cfg: - target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT - params: - input_size: *num_latents - context_in_dim: 1536 - hidden_size: 1024 - mlp_ratio: 4.0 - num_heads: 16 - depth: 8 - depth_single_blocks: 16 - axes_dim: [64] - theta: 10000 - qkv_bias: true - use_pe: false - force_norm_fp32: true - - scheduler_cfg: - transport: - target: hy3dshape.models.diffusion.transport.create_transport - params: - path_type: Linear - prediction: velocity - sampler: - target: hy3dshape.models.diffusion.transport.Sampler - params: {} - ode_params: - sampling_method: euler # dopri5 ... - num_steps: &num_steps 50 - - optimizer_cfg: - optimizer: - target: torch.optim.AdamW - params: - betas: [0.9, 0.99] - eps: 1.e-6 - weight_decay: 1.e-2 - - scheduler: - target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler - params: - warm_up_steps: 50 # 5000 - f_start: 1.e-6 - f_min: 1.e-3 - f_max: 1.0 - - pipeline_cfg: - target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline - - image_processor_cfg: - target: hy3dshape.preprocessors.ImageProcessorV2 - params: {} - -callbacks: - logger: - target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger - params: - step_frequency: 100 # 10000 - num_samples: 1 - sample_times: 1 - mean: *mean - std: *std - bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01] - octree_depth: 8 - num_chunks: 50000 - mc_level: 0.0 - - file_loggers: - target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger - params: - step_frequency: 50 # 5000 - test_data_path: "tools/mini_testset/images.json" diff --git a/hy3dshape/configs/hunyuandit-finetuning-flowmatching-dinog518-bf16-lr1e5-4096.yaml b/hy3dshape/configs/hunyuandit-finetuning-flowmatching-dinol518-bf16-lr1e5-4096.yaml similarity index 60% rename from hy3dshape/configs/hunyuandit-finetuning-flowmatching-dinog518-bf16-lr1e5-4096.yaml rename to hy3dshape/configs/hunyuandit-finetuning-flowmatching-dinol518-bf16-lr1e5-4096.yaml index e9fc841..857ff1e 100644 --- a/hy3dshape/configs/hunyuandit-finetuning-flowmatching-dinog518-bf16-lr1e5-4096.yaml +++ b/hy3dshape/configs/hunyuandit-finetuning-flowmatching-dinol518-bf16-lr1e5-4096.yaml @@ -1,4 +1,5 @@ -name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518" +name: "HunyuanDiT flowmatching; VAE: 4096 token length; ImageEncoder: DINO-v2 Large; ImageSize: 518" +# training successfully on 8 x H20 with 98G Memory training: steps: 10_0000_0000 @@ -8,7 +9,8 @@ training: gradient_clip_val: 1.0 gradient_clip_algorithm: "norm" every_n_train_steps: 2000 # 5000 - val_check_interval: 50 # 4096 + val_check_interval: 200 # 4096 + # val_check_interval must be smaller than every_n_train_steps!!! limit_val_batches: 16 dataset: @@ -24,7 +26,7 @@ dataset: val_data_list: tools/mini_trainset/preprocessed #! Image loading - cond_stage_key: "image" # image / text / image_text + cond_stage_key: "image" image_size: 518 mean: &mean [0.5, 0.5, 0.5] std: &std [0.5, 0.5, 0.5] @@ -55,73 +57,21 @@ model: first_stage_config: target: hy3dshape.models.autoencoders.ShapeVAE from_pretrained: tencent/Hunyuan3D-2.1 - params: - num_latents: &num_latents 4096 - embed_dim: 64 - num_freqs: 8 - include_pi: false - heads: 16 - width: 1024 - num_encoder_layers: 8 - num_decoder_layers: 16 - qkv_bias: false - qk_norm: true - scale_factor: *z_scale_factor - geo_decoder_mlp_expand_ratio: 4 - geo_decoder_downsample_ratio: 1 - geo_decoder_ln_post: true - point_feats: 4 - pc_size: *pc_size - pc_sharpedge_size: *pc_sharpedge_size cond_stage_config: target: hy3dshape.models.conditioner.SingleImageEncoder params: + drop_ratio: 0.1 main_image_encoder: - type: DinoImageEncoder # dino large + type: DinoImageEncoder kwargs: - config: - attention_probs_dropout_prob: 0.0 - drop_path_rate: 0.0 - hidden_act: gelu - hidden_dropout_prob: 0.0 - hidden_size: 1024 - image_size: 518 - initializer_range: 0.02 - layer_norm_eps: 1.e-6 - layerscale_value: 1.0 - mlp_ratio: 4 - model_type: dinov2 - num_attention_heads: 16 - num_channels: 3 - num_hidden_layers: 24 - patch_size: 14 - qkv_bias: true - torch_dtype: float32 - use_swiglu_ffn: false + version: 'facebook/dinov2-large' image_size: 518 use_cls_token: true - denoiser_cfg: target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain - params: - input_size: *num_latents - in_channels: 64 - hidden_size: 2048 - context_dim: 1024 - depth: 21 - num_heads: 16 - qk_norm: true - text_len: 1370 - with_decoupled_ca: false - use_attention_pooling: false - qk_norm_type: 'rms' - qkv_bias: false - use_pos_emb: false - num_moe_layers: 6 - num_experts: 8 - moe_top_k: 2 + from_pretrained: tencent/Hunyuan3D-2.1 scheduler_cfg: transport: @@ -163,7 +113,7 @@ callbacks: logger: target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger params: - step_frequency: 100 # 10000 + step_frequency: 1000 # 10000 num_samples: 1 sample_times: 1 mean: *mean @@ -176,5 +126,5 @@ callbacks: file_loggers: target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger params: - step_frequency: 50 # 5000 + step_frequency: 500 # 5000 test_data_path: "tools/mini_testset/images.json" diff --git a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-4096.yaml b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-4096.yaml deleted file mode 100644 index 82e7ebb..0000000 --- a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-4096.yaml +++ /dev/null @@ -1,180 +0,0 @@ -name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518" - -training: - steps: 10_0000_0000 - use_amp: true - amp_type: "bf16" - base_lr: 1e-4 - gradient_clip_val: 1.0 - gradient_clip_algorithm: "norm" - every_n_train_steps: 2000 # 5000 - val_check_interval: 50 # 4096 - limit_val_batches: 16 - -dataset: - target: hy3dshape.data.dit_asl.AlignedShapeLatentModule - params: - #! Base setting - batch_size: 2 - num_workers: 8 - val_num_workers: 4 - - # Data - train_data_list: tools/mini_trainset/preprocessed - val_data_list: tools/mini_trainset/preprocessed - - #! Image loading - cond_stage_key: "image" # image / text / image_text - image_size: 518 - mean: &mean [0.5, 0.5, 0.5] - std: &std [0.5, 0.5, 0.5] - - #! Point cloud sampling - pc_size: &pc_size 81920 - pc_sharpedge_size: &pc_sharpedge_size 0 - sharpedge_label: &sharpedge_label true - return_normal: true - - #! Augmentation - padding: true - -model: - target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser - params: - first_stage_key: "surface" - cond_stage_key: "image" - scale_by_std: false - z_scale_factor: &z_scale_factor 1.0039506158752403 - torch_compile: false - - # ema_config: - # ema_model: LitEma - # ema_decay: 0.999 - # ema_inference: false - - first_stage_config: - target: hy3dshape.models.autoencoders.ShapeVAE - from_pretrained: tencent/Hunyuan3D-2.1 - params: - num_latents: &num_latents 4096 - embed_dim: 64 - num_freqs: 8 - include_pi: false - heads: 16 - width: 1024 - num_encoder_layers: 8 - num_decoder_layers: 16 - qkv_bias: false - qk_norm: true - scale_factor: *z_scale_factor - geo_decoder_mlp_expand_ratio: 4 - geo_decoder_downsample_ratio: 1 - geo_decoder_ln_post: true - point_feats: 4 - pc_size: *pc_size - pc_sharpedge_size: *pc_sharpedge_size - - cond_stage_config: - target: hy3dshape.models.conditioner.SingleImageEncoder - params: - main_image_encoder: - type: DinoImageEncoder # dino large - kwargs: - config: - attention_probs_dropout_prob: 0.0 - drop_path_rate: 0.0 - hidden_act: gelu - hidden_dropout_prob: 0.0 - hidden_size: 1024 - image_size: 518 - initializer_range: 0.02 - layer_norm_eps: 1.e-6 - layerscale_value: 1.0 - mlp_ratio: 4 - model_type: dinov2 - num_attention_heads: 16 - num_channels: 3 - num_hidden_layers: 24 - patch_size: 14 - qkv_bias: true - torch_dtype: float32 - use_swiglu_ffn: false - image_size: 518 - use_cls_token: true - - - denoiser_cfg: - target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain - params: - input_size: *num_latents - in_channels: 64 - hidden_size: 2048 - context_dim: 1024 - depth: 11 - num_heads: 16 - qk_norm: true - text_len: 1370 - with_decoupled_ca: false - use_attention_pooling: false - qk_norm_type: 'rms' - qkv_bias: false - use_pos_emb: false - num_moe_layers: 6 - num_experts: 8 - moe_top_k: 2 - - scheduler_cfg: - transport: - target: hy3dshape.models.diffusion.transport.create_transport - params: - path_type: Linear - prediction: velocity - sampler: - target: hy3dshape.models.diffusion.transport.Sampler - params: {} - ode_params: - sampling_method: euler # dopri5 ... - num_steps: &num_steps 50 - - optimizer_cfg: - optimizer: - target: torch.optim.AdamW - params: - betas: [0.9, 0.99] - eps: 1.e-6 - weight_decay: 1.e-2 - - scheduler: - target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler - params: - warm_up_steps: 50 # 5000 - f_start: 1.e-6 - f_min: 1.e-3 - f_max: 1.0 - - pipeline_cfg: - target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline - - image_processor_cfg: - target: hy3dshape.preprocessors.ImageProcessorV2 - params: {} - -callbacks: - logger: - target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger - params: - step_frequency: 100 # 10000 - num_samples: 1 - sample_times: 1 - mean: *mean - std: *std - bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01] - octree_depth: 8 - num_chunks: 50000 - mc_level: 0.0 - - file_loggers: - target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger - params: - step_frequency: 50 # 5000 - test_data_path: "tools/mini_testset/images.json" diff --git a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-512.yaml b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml similarity index 66% rename from hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-512.yaml rename to hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml index ffed3aa..62cae73 100644 --- a/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinog518-bf16-lr1e4-512.yaml +++ b/hy3dshape/configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml @@ -1,4 +1,6 @@ -name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518" +name: "HunyuanDiT flowmatching; VAE: 4096 token length; ImageEncoder: DINO-v2 Large; ImageSize: 518" +# oversitting successfully cost 68G memory under current settings +# you can adjust model arch or batch_size according to your GPU memory training: steps: 10_0000_0000 @@ -8,14 +10,15 @@ training: gradient_clip_val: 1.0 gradient_clip_algorithm: "norm" every_n_train_steps: 2000 # 5000 - val_check_interval: 50 # 4096 + val_check_interval: 200 # 4096 + # val_check_interval must be smaller than every_n_train_steps!!! limit_val_batches: 16 dataset: target: hy3dshape.data.dit_asl.AlignedShapeLatentModule params: #! Base setting - batch_size: 2 + batch_size: 4 num_workers: 8 val_num_workers: 4 @@ -24,7 +27,7 @@ dataset: val_data_list: tools/mini_trainset/preprocessed #! Image loading - cond_stage_key: "image" # image / text / image_text + cond_stage_key: "image" image_size: 518 mean: &mean [0.5, 0.5, 0.5] std: &std [0.5, 0.5, 0.5] @@ -55,63 +58,27 @@ model: first_stage_config: target: hy3dshape.models.autoencoders.ShapeVAE from_pretrained: tencent/Hunyuan3D-2.1 - params: - num_latents: &num_latents 512 - embed_dim: 64 - num_freqs: 8 - include_pi: false - heads: 16 - width: 1024 - num_encoder_layers: 8 - num_decoder_layers: 16 - qkv_bias: false - qk_norm: true - scale_factor: *z_scale_factor - geo_decoder_mlp_expand_ratio: 4 - geo_decoder_downsample_ratio: 1 - geo_decoder_ln_post: true - point_feats: 4 - pc_size: *pc_size - pc_sharpedge_size: *pc_sharpedge_size cond_stage_config: target: hy3dshape.models.conditioner.SingleImageEncoder params: + drop_ratio: 0.1 main_image_encoder: - type: DinoImageEncoder # dino large + type: DinoImageEncoder kwargs: - config: - attention_probs_dropout_prob: 0.0 - drop_path_rate: 0.0 - hidden_act: gelu - hidden_dropout_prob: 0.0 - hidden_size: 1024 - image_size: 518 - initializer_range: 0.02 - layer_norm_eps: 1.e-6 - layerscale_value: 1.0 - mlp_ratio: 4 - model_type: dinov2 - num_attention_heads: 16 - num_channels: 3 - num_hidden_layers: 24 - patch_size: 14 - qkv_bias: true - torch_dtype: float32 - use_swiglu_ffn: false + version: 'facebook/dinov2-large' image_size: 518 use_cls_token: true - denoiser_cfg: target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain params: - input_size: *num_latents + input_size: 4096 in_channels: 64 - hidden_size: 768 + hidden_size: 2048 context_dim: 1024 - depth: 6 - num_heads: 12 + depth: 16 + num_heads: 16 qk_norm: true text_len: 1370 with_decoupled_ca: false @@ -147,7 +114,7 @@ model: scheduler: target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler params: - warm_up_steps: 50 # 5000 + warm_up_steps: 500 # 5000 f_start: 1.e-6 f_min: 1.e-3 f_max: 1.0 @@ -163,7 +130,7 @@ callbacks: logger: target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger params: - step_frequency: 100 # 10000 + step_frequency: 1000 # 10000 num_samples: 1 sample_times: 1 mean: *mean @@ -176,5 +143,5 @@ callbacks: file_loggers: target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger params: - step_frequency: 50 # 5000 + step_frequency: 500 # 5000 test_data_path: "tools/mini_testset/images.json" diff --git a/hy3dshape/hy3dshape/models/autoencoders/attention_blocks.py b/hy3dshape/hy3dshape/models/autoencoders/attention_blocks.py index 918a695..71ce880 100644 --- a/hy3dshape/hy3dshape/models/autoencoders/attention_blocks.py +++ b/hy3dshape/hy3dshape/models/autoencoders/attention_blocks.py @@ -548,7 +548,7 @@ class PointCrossAttentionEncoder(nn.Module): if pc_sharpedge_size == 0: print( - f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is not given, using pc_size as pc_sharpedge_size') + f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is zero') else: print( f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is given, using pc_size={pc_size}, pc_sharpedge_size={pc_sharpedge_size}') diff --git a/hy3dshape/hy3dshape/models/conditioner.py b/hy3dshape/hy3dshape/models/conditioner.py index d0d848c..5c03464 100644 --- a/hy3dshape/hy3dshape/models/conditioner.py +++ b/hy3dshape/hy3dshape/models/conditioner.py @@ -32,6 +32,7 @@ from transformers import ( Dinov2Model, Dinov2Config, ) +from transformers import AutoImageProcessor, AutoModel def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): @@ -66,9 +67,10 @@ class ImageEncoder(nn.Module): super().__init__() if config is None: - self.model = self.MODEL_CLASS.from_pretrained(version) + self.model = AutoModel.from_pretrained(version) else: self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config)) + self.model.eval() self.model.requires_grad_(False) self.use_cls_token = use_cls_token @@ -240,11 +242,26 @@ class SingleImageEncoder(nn.Module): def __init__( self, main_image_encoder, + drop_ratio=0.0 ): super().__init__() self.main_image_encoder = build_image_encoder(main_image_encoder) + self.drop_ratio = drop_ratio + self.disable_drop = True def forward(self, image, mask=None, **kwargs): + outputs = { + 'main': self.main_image_encoder(image, mask=mask, **kwargs), + } + if self.disable_drop: + return outputs + else: + random_p = torch.rand(len(image), device='cuda') + remain_bool_tensor = random_p > self.drop_ratio + outputs['main'] *= remain_bool_tensor.view(-1,1,1) + return outputs + + outputs = { 'main': self.main_image_encoder(image, mask=mask, **kwargs), } diff --git a/hy3dshape/hy3dshape/models/denoisers/hunyuandit.py b/hy3dshape/hy3dshape/models/denoisers/hunyuandit.py index b4b3b50..05514c0 100644 --- a/hy3dshape/hy3dshape/models/denoisers/hunyuandit.py +++ b/hy3dshape/hy3dshape/models/denoisers/hunyuandit.py @@ -22,6 +22,8 @@ # fine-tuning enabling code and other elements of the foregoing made publicly available # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. +import os +import yaml import math import numpy as np @@ -31,6 +33,7 @@ import torch.nn.functional as F from einops import rearrange from .moe_layers import MoEBlock +from ...utils import logger, synchronize_timer, smart_load_model def modulate(x, shift, scale): @@ -464,6 +467,74 @@ class FinalLayer(nn.Module): class HunYuanDiTPlain(nn.Module): + @classmethod + @synchronize_timer('HunYuanDiTPlain Model Loading') + def from_single_file( + cls, + ckpt_path, + config_path, + device='cuda', + dtype=torch.float16, + use_safetensors=None, + **kwargs, + ): + # load config + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + # load ckpt + if use_safetensors: + ckpt_path = ckpt_path.replace('.ckpt', '.safetensors') + if not os.path.exists(ckpt_path): + raise FileNotFoundError(f"Model file {ckpt_path} not found") + + logger.info(f"Loading model from {ckpt_path}") + if use_safetensors: + import safetensors.torch + ckpt = safetensors.torch.load_file(ckpt_path, device='cpu') + else: + ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True) + + if 'model' in ckpt: + ckpt = ckpt['model'] + if 'model' in config: + config = config['model'] + + model_kwargs = config['params'] + model_kwargs.update(kwargs) + + model = cls(**model_kwargs) + model.load_state_dict(ckpt) + model.to(device=device, dtype=dtype) + return model + + @classmethod + def from_pretrained( + cls, + model_path, + device='cuda', + dtype=torch.float16, + use_safetensors=False, + variant='fp16', + subfolder='hunyuan3d-dit-v2-1', + **kwargs, + ): + config_path, ckpt_path = smart_load_model( + model_path, + subfolder=subfolder, + use_safetensors=use_safetensors, + variant=variant + ) + + return cls.from_single_file( + ckpt_path, + config_path, + device=device, + dtype=dtype, + use_safetensors=use_safetensors, + **kwargs + ) + def __init__( self, input_size=1024, diff --git a/hy3dshape/hy3dshape/models/diffusion/flow_matching_sit.py b/hy3dshape/hy3dshape/models/diffusion/flow_matching_sit.py index 9813b74..7f94bb8 100644 --- a/hy3dshape/hy3dshape/models/diffusion/flow_matching_sit.py +++ b/hy3dshape/hy3dshape/models/diffusion/flow_matching_sit.py @@ -256,17 +256,14 @@ class Diffuser(pl.LightningModule): def forward(self, batch): with torch.autocast(device_type="cuda", dtype=torch.bfloat16): #float32 for text contexts = self.cond_stage_model(image=batch.get('image'), text=batch.get('text'), mask=batch.get('mask')) - # t5_text = contexts['t5_text']['prompt_embeds'] - # nan_count = torch.isnan(t5_text).sum() - # if nan_count > 0: - # print("t5_text has %d NaN values"%(nan_count)) + with torch.autocast(device_type="cuda", dtype=torch.float16): with torch.no_grad(): latents = self.first_stage_model.encode(batch[self.first_stage_key], sample_posterior=True) latents = self.z_scale_factor * latents # print(latents.shape) - # check vae encode and decode is ok? answer is ok ! + # check vae encode and decode is ok? answer is ok! # import time # from hy3dshape.pipelines import export_to_trimesh # latents = 1. / self.z_scale_factor * latents @@ -333,9 +330,6 @@ class Diffuser(pl.LightningModule): image = batch.get("image", None) mask = batch.get('mask', None) - # if not isinstance(image, torch.Tensor): print(image.shape) - # if isinstance(mask, torch.Tensor): print(mask.shape) - outputs = self.pipeline(image=image, mask=mask, generator=generator, @@ -350,5 +344,6 @@ class Diffuser(pl.LightningModule): f.write(traceback.format_exc()) f.write("\n") outputs = [None] + self.cond_stage_model.disable_drop = False return [outputs] diff --git a/hy3dshape/hy3dshape/utils/trainings/mesh_log_callback.py b/hy3dshape/hy3dshape/utils/trainings/mesh_log_callback.py index f2466d5..d5f4049 100755 --- a/hy3dshape/hy3dshape/utils/trainings/mesh_log_callback.py +++ b/hy3dshape/hy3dshape/utils/trainings/mesh_log_callback.py @@ -323,7 +323,9 @@ class ImageConditionalFixASLDiffuserLogger(Callback): save_path = os.path.join(visual_dir, os.path.basename(image_path)) save_path = os.path.splitext(save_path)[0] + '.glb' - print(image_path) + if isinstance(image_path, str): + print(image_path) + with torch.no_grad(): mesh = pl_module.sample(batch={"image": image_path}, **self.kwargs)[0][0] if isinstance(mesh, tuple) and len(mesh)==2: diff --git a/hy3dshape/main.py b/hy3dshape/main.py index 4c1ae51..1f281ef 100644 --- a/hy3dshape/main.py +++ b/hy3dshape/main.py @@ -190,7 +190,7 @@ if __name__ == "__main__": precision=amp_type, callbacks=callbacks, accelerator="gpu", - devices=training_cfg.num_gpus, + devices=args.num_gpus, num_nodes=training_cfg.num_nodes, strategy=ddp_strategy, gradient_clip_val=training_cfg.get('gradient_clip_val'), diff --git a/hy3dshape/minimal_demo.py b/hy3dshape/minimal_demo.py index a4bfe8f..5a5ce8c 100644 --- a/hy3dshape/minimal_demo.py +++ b/hy3dshape/minimal_demo.py @@ -13,7 +13,6 @@ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. from PIL import Image - from hy3dshape.rembg import BackgroundRemover from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline @@ -21,10 +20,12 @@ model_path = 'tencent/Hunyuan3D-2.1' pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path) image_path = 'demos/demo.png' + image = Image.open(image_path).convert("RGBA") if image.mode == 'RGB': rembg = BackgroundRemover() image = rembg(image) +image = image_path mesh = pipeline_shapegen(image=image)[0] mesh.export('demo.glb') diff --git a/hy3dshape/minimal_demo_with_ckpt.py b/hy3dshape/minimal_demo_with_ckpt.py new file mode 100644 index 0000000..b6fdf39 --- /dev/null +++ b/hy3dshape/minimal_demo_with_ckpt.py @@ -0,0 +1,51 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from PIL import Image +from hy3dshape.rembg import BackgroundRemover +from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline + +model_path = 'tencent/Hunyuan3D-2.1' +pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path) + + +import torch +import yaml +from hy3dshape.utils import instantiate_from_config +# For example, you can convert deepspeed weights to a single file +# cd output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00004000.ckpt +# python3 zero_to_fp32.py ./ ./out --max_shard_size 30GB +# then you can get output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00004000.ckpt/out/pytorch_model.bin +ckpt_cfg_path = 'output_folder/dit/overfitting_depth_16_token_4096_lr1e4_uc/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml' +ckpt_path = 'output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00004000.ckpt/out/pytorch_model.bin' +config = yaml.safe_load(open(ckpt_cfg_path, 'r')) +model = instantiate_from_config(config['model']['params']['denoiser_cfg']) +sd = torch.load(ckpt_path) +sd = {k.replace('_forward_module.model.', ''):v for k,v in sd.items()} +msg = model.load_state_dict(sd) +print(msg) +model = model.cuda().half() +pipeline_shapegen.model = model + + +image = 'tools/mini_testset/images/015.png' + +# image = Image.open(image_path).convert("RGBA") +# if image.mode == 'RGB': +# rembg = BackgroundRemover() +# image = rembg(image) + +# mesh = pipeline_shapegen(image=image, guidance_scale=1.0)[0] +mesh = pipeline_shapegen(image=image)[0] +mesh.export('demo.glb') diff --git a/hy3dshape/scripts/train_deepspeed.sh b/hy3dshape/scripts/train_deepspeed.sh index ed6b7c7..de8c61e 100644 --- a/hy3dshape/scripts/train_deepspeed.sh +++ b/hy3dshape/scripts/train_deepspeed.sh @@ -35,12 +35,11 @@ export NCCL_DEBUG=WARN node_num=$1 node_rank=$2 -master_ip=$3 -config=$4 -output_dir=$5 +num_gpu_per_node=$3 +master_ip=$4 +config=$5 +output_dir=$6 -# config='configs/dit-from-scratch-overfitting-flowmatching-dinog518-bf16-lr1e4-1024.yaml' -# output_dir='output_folder/dit/overfitting_10' echo node_num $node_num echo node_rank $node_rank @@ -64,7 +63,8 @@ NCCL_IB_GID_INDEX=3 \ NCCL_NVLS_ENABLE=0 \ python3 main.py \ --num_nodes $node_num \ - --num_gpus 8 \ + --num_gpus $num_gpu_per_node \ --config $config \ --output_dir $output_dir \ --deepspeed + diff --git a/hy3dshape/train_demo.sh b/hy3dshape/train_demo.sh new file mode 100644 index 0000000..717aff2 --- /dev/null +++ b/hy3dshape/train_demo.sh @@ -0,0 +1,15 @@ +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export num_gpu_per_node=8 +# export CUDA_VISIBLE_DEVICES=0 +# export num_gpu_per_node=1 + +export node_num=1 +export node_rank=0 +export master_ip=0.0.0.0 # set your master_ip + +# export config=configs/hunyuandit-finetuning-flowmatching-dinol518-bf16-lr1e5-4096.yaml +# export output_dir=output_folder/dit/fintuning_lr1e5 +export config=configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml +export output_dir=output_folder/dit/overfitting_depth_16_token_4096_lr1e4 + +bash scripts/train_deepspeed.sh $node_num $node_rank $num_gpu_per_node $master_ip $config $output_dir