fix shape training

This commit is contained in:
Huiwenshi
2025-06-26 16:03:44 +08:00
parent d48c432b58
commit 7c92655a0d
15 changed files with 199 additions and 657 deletions

View File

@@ -1,174 +0,0 @@
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
training:
steps: 10_0000_0000
use_amp: true
amp_type: "bf16"
base_lr: 1.e-5
gradient_clip_val: 1.0
gradient_clip_algorithm: "norm"
every_n_train_steps: 2000 # 5000
val_check_interval: 50 # 4096
limit_val_batches: 16
dataset:
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
params:
#! Base setting
batch_size: 4
num_workers: 8
val_num_workers: 4
# Data
train_data_list: tools/mini_trainset/preprocessed
val_data_list: tools/mini_trainset/preprocessed
#! Image loading
cond_stage_key: "image" # image / text / image_text
image_size: 518
mean: &mean [0.5, 0.5, 0.5]
std: &std [0.5, 0.5, 0.5]
#! Point cloud sampling
pc_size: &pc_size 30720
pc_sharpedge_size: &pc_sharpedge_size 30720
sharpedge_label: &sharpedge_label true
return_normal: true
#! Augmentation
padding: true
model:
target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
params:
first_stage_key: "surface"
cond_stage_key: "image"
scale_by_std: false
z_scale_factor: &z_scale_factor 0.9990943042622529 # 1 / 1.0009065167661184
torch_compile: false
# ema_config:
# ema_model: LitEma
# ema_decay: 0.999
# ema_inference: false
first_stage_config:
target: hy3dshape.models.autoencoders.ShapeVAE
from_pretrained: tencent/Hunyuan3D-2.1
params:
num_latents: &num_latents 512
embed_dim: 64
num_freqs: 8
include_pi: false
heads: 16
width: 1024
point_feats: 4
num_decoder_layers: 16
pc_size: *pc_size
pc_sharpedge_size: *pc_sharpedge_size
qkv_bias: false
qk_norm: true
scale_factor: *z_scale_factor
geo_decoder_mlp_expand_ratio: 4
geo_decoder_downsample_ratio: 1
geo_decoder_ln_post: true
cond_stage_config:
target: hy3dshape.models.conditioner.SingleImageEncoder
params:
main_image_encoder:
type: DinoImageEncoder # dino giant
kwargs:
config:
attention_probs_dropout_prob: 0.0
drop_path_rate: 0.0
hidden_act: gelu
hidden_dropout_prob: 0.0
hidden_size: 1536
image_size: 518
initializer_range: 0.02
layer_norm_eps: 1.e-6
layerscale_value: 1.0
mlp_ratio: 4
model_type: dinov2
num_attention_heads: 24
num_channels: 3
num_hidden_layers: 40
patch_size: 14
qkv_bias: true
torch_dtype: float32
use_swiglu_ffn: true
image_size: 518
denoiser_cfg:
target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT
params:
ckpt_path: ~/.cache/hy3dgen/tencent/Hunyuan3D-2-1-Shape/dit/model.fp16.ckpt
input_size: *num_latents
context_in_dim: 1536
hidden_size: 1024
mlp_ratio: 4.0
num_heads: 16
depth: 16
depth_single_blocks: 32
axes_dim: [64]
theta: 10000
qkv_bias: true
use_pe: false
force_norm_fp32: true
scheduler_cfg:
transport:
target: hy3dshape.models.diffusion.transport.create_transport
params:
path_type: Linear
prediction: velocity
sampler:
target: hy3dshape.models.diffusion.transport.Sampler
params: {}
ode_params:
sampling_method: euler # dopri5 ...
num_steps: &num_steps 50
optimizer_cfg:
optimizer:
target: torch.optim.AdamW
params:
betas: [0.9, 0.99]
eps: 1.e-6
weight_decay: 1.e-2
scheduler:
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
params:
warm_up_steps: 50 # 5000
f_start: 1.e-6
f_min: 1.e-3
f_max: 1.0
pipeline_cfg:
target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
image_processor_cfg:
target: hy3dshape.preprocessors.ImageProcessorV2
params: {}
callbacks:
logger:
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
params:
step_frequency: 100 # 10000
num_samples: 1
sample_times: 1
mean: *mean
std: *std
bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
octree_depth: 8
num_chunks: 50000
mc_level: 0.0
file_loggers:
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
params:
step_frequency: 50 # 5000
test_data_path: "tools/mini_testset/images.json"

View File

@@ -1,173 +0,0 @@
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
training:
steps: 10_0000_0000
use_amp: true
amp_type: "bf16"
base_lr: 1e-4
gradient_clip_val: 1.0
gradient_clip_algorithm: "norm"
every_n_train_steps: 2000 # 5000
val_check_interval: 50 # 4096
limit_val_batches: 16
dataset:
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
params:
#! Base setting
batch_size: 2
num_workers: 8
val_num_workers: 4
# Data
train_data_list: tools/mini_trainset/preprocessed
val_data_list: tools/mini_trainset/preprocessed
#! Image loading
cond_stage_key: "image" # image / text / image_text
image_size: 518
mean: &mean [0.5, 0.5, 0.5]
std: &std [0.5, 0.5, 0.5]
#! Point cloud sampling
pc_size: &pc_size 10240
pc_sharpedge_size: &pc_sharpedge_size 10240
sharpedge_label: &sharpedge_label true
return_normal: true
#! Augmentation
padding: true
model:
target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
params:
first_stage_key: "surface"
cond_stage_key: "image"
scale_by_std: false
z_scale_factor: &z_scale_factor 0.9990943042622529 # 1 / 1.0009065167661184
torch_compile: false
# ema_config:
# ema_model: LitEma
# ema_decay: 0.999
# ema_inference: false
first_stage_config:
target: hy3dshape.models.autoencoders.ShapeVAE
from_pretrained: tencent/Hunyuan3D-2.1
params:
num_latents: &num_latents 512
embed_dim: 64
num_freqs: 8
include_pi: false
heads: 16
width: 1024
point_feats: 4
num_decoder_layers: 16
pc_size: *pc_size
pc_sharpedge_size: *pc_sharpedge_size
qkv_bias: false
qk_norm: true
scale_factor: *z_scale_factor
geo_decoder_mlp_expand_ratio: 4
geo_decoder_downsample_ratio: 1
geo_decoder_ln_post: true
cond_stage_config:
target: hy3dshape.models.conditioner.SingleImageEncoder
params:
main_image_encoder:
type: DinoImageEncoder # dino giant
kwargs:
config:
attention_probs_dropout_prob: 0.0
drop_path_rate: 0.0
hidden_act: gelu
hidden_dropout_prob: 0.0
hidden_size: 1536
image_size: 518
initializer_range: 0.02
layer_norm_eps: 1.e-6
layerscale_value: 1.0
mlp_ratio: 4
model_type: dinov2
num_attention_heads: 24
num_channels: 3
num_hidden_layers: 40
patch_size: 14
qkv_bias: true
torch_dtype: float32
use_swiglu_ffn: true
image_size: 518
denoiser_cfg:
target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT
params:
input_size: *num_latents
context_in_dim: 1536
hidden_size: 1024
mlp_ratio: 4.0
num_heads: 16
depth: 8
depth_single_blocks: 16
axes_dim: [64]
theta: 10000
qkv_bias: true
use_pe: false
force_norm_fp32: true
scheduler_cfg:
transport:
target: hy3dshape.models.diffusion.transport.create_transport
params:
path_type: Linear
prediction: velocity
sampler:
target: hy3dshape.models.diffusion.transport.Sampler
params: {}
ode_params:
sampling_method: euler # dopri5 ...
num_steps: &num_steps 50
optimizer_cfg:
optimizer:
target: torch.optim.AdamW
params:
betas: [0.9, 0.99]
eps: 1.e-6
weight_decay: 1.e-2
scheduler:
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
params:
warm_up_steps: 50 # 5000
f_start: 1.e-6
f_min: 1.e-3
f_max: 1.0
pipeline_cfg:
target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
image_processor_cfg:
target: hy3dshape.preprocessors.ImageProcessorV2
params: {}
callbacks:
logger:
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
params:
step_frequency: 100 # 10000
num_samples: 1
sample_times: 1
mean: *mean
std: *std
bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
octree_depth: 8
num_chunks: 50000
mc_level: 0.0
file_loggers:
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
params:
step_frequency: 50 # 5000
test_data_path: "tools/mini_testset/images.json"

View File

@@ -1,4 +1,5 @@
-name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
+name: "HunyuanDiT flowmatching; VAE: 4096 token length; ImageEncoder: DINO-v2 Large; ImageSize: 518"
+# training successfully on 8 x H20 with 98G Memory
 
 training:
   steps: 10_0000_0000
@@ -8,7 +9,8 @@ training:
   gradient_clip_val: 1.0
   gradient_clip_algorithm: "norm"
   every_n_train_steps: 2000 # 5000
-  val_check_interval: 50 # 4096
+  val_check_interval: 200 # 4096
+  # val_check_interval must be smaller than every_n_train_steps!!!
   limit_val_batches: 16
 
 dataset:
@@ -24,7 +26,7 @@ dataset:
     val_data_list: tools/mini_trainset/preprocessed
     #! Image loading
-    cond_stage_key: "image" # image / text / image_text
+    cond_stage_key: "image"
     image_size: 518
     mean: &mean [0.5, 0.5, 0.5]
     std: &std [0.5, 0.5, 0.5]
@@ -55,73 +57,21 @@ model:
     first_stage_config:
       target: hy3dshape.models.autoencoders.ShapeVAE
       from_pretrained: tencent/Hunyuan3D-2.1
-      params:
-        num_latents: &num_latents 4096
-        embed_dim: 64
-        num_freqs: 8
-        include_pi: false
-        heads: 16
-        width: 1024
-        num_encoder_layers: 8
-        num_decoder_layers: 16
-        qkv_bias: false
-        qk_norm: true
-        scale_factor: *z_scale_factor
-        geo_decoder_mlp_expand_ratio: 4
-        geo_decoder_downsample_ratio: 1
-        geo_decoder_ln_post: true
-        point_feats: 4
-        pc_size: *pc_size
-        pc_sharpedge_size: *pc_sharpedge_size
 
     cond_stage_config:
       target: hy3dshape.models.conditioner.SingleImageEncoder
       params:
+        drop_ratio: 0.1
         main_image_encoder:
-          type: DinoImageEncoder # dino large
+          type: DinoImageEncoder
           kwargs:
-            config:
-              attention_probs_dropout_prob: 0.0
-              drop_path_rate: 0.0
-              hidden_act: gelu
-              hidden_dropout_prob: 0.0
-              hidden_size: 1024
-              image_size: 518
-              initializer_range: 0.02
-              layer_norm_eps: 1.e-6
-              layerscale_value: 1.0
-              mlp_ratio: 4
-              model_type: dinov2
-              num_attention_heads: 16
-              num_channels: 3
-              num_hidden_layers: 24
-              patch_size: 14
-              qkv_bias: true
-              torch_dtype: float32
-              use_swiglu_ffn: false
+            version: 'facebook/dinov2-large'
             image_size: 518
         use_cls_token: true
 
     denoiser_cfg:
       target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
-      params:
-        input_size: *num_latents
-        in_channels: 64
-        hidden_size: 2048
-        context_dim: 1024
-        depth: 21
-        num_heads: 16
-        qk_norm: true
-        text_len: 1370
-        with_decoupled_ca: false
-        use_attention_pooling: false
-        qk_norm_type: 'rms'
-        qkv_bias: false
-        use_pos_emb: false
-        num_moe_layers: 6
-        num_experts: 8
-        moe_top_k: 2
+      from_pretrained: tencent/Hunyuan3D-2.1
 
     scheduler_cfg:
       transport:
@@ -163,7 +113,7 @@ callbacks:
   logger:
     target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
     params:
-      step_frequency: 100 # 10000
+      step_frequency: 1000 # 10000
      num_samples: 1
       sample_times: 1
       mean: *mean
@@ -176,5 +126,5 @@ callbacks:
   file_loggers:
     target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
     params:
-      step_frequency: 50 # 5000
+      step_frequency: 500 # 5000
       test_data_path: "tools/mini_testset/images.json"

View File

@@ -1,180 +0,0 @@
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
training:
steps: 10_0000_0000
use_amp: true
amp_type: "bf16"
base_lr: 1e-4
gradient_clip_val: 1.0
gradient_clip_algorithm: "norm"
every_n_train_steps: 2000 # 5000
val_check_interval: 50 # 4096
limit_val_batches: 16
dataset:
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
params:
#! Base setting
batch_size: 2
num_workers: 8
val_num_workers: 4
# Data
train_data_list: tools/mini_trainset/preprocessed
val_data_list: tools/mini_trainset/preprocessed
#! Image loading
cond_stage_key: "image" # image / text / image_text
image_size: 518
mean: &mean [0.5, 0.5, 0.5]
std: &std [0.5, 0.5, 0.5]
#! Point cloud sampling
pc_size: &pc_size 81920
pc_sharpedge_size: &pc_sharpedge_size 0
sharpedge_label: &sharpedge_label true
return_normal: true
#! Augmentation
padding: true
model:
target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
params:
first_stage_key: "surface"
cond_stage_key: "image"
scale_by_std: false
z_scale_factor: &z_scale_factor 1.0039506158752403
torch_compile: false
# ema_config:
# ema_model: LitEma
# ema_decay: 0.999
# ema_inference: false
first_stage_config:
target: hy3dshape.models.autoencoders.ShapeVAE
from_pretrained: tencent/Hunyuan3D-2.1
params:
num_latents: &num_latents 4096
embed_dim: 64
num_freqs: 8
include_pi: false
heads: 16
width: 1024
num_encoder_layers: 8
num_decoder_layers: 16
qkv_bias: false
qk_norm: true
scale_factor: *z_scale_factor
geo_decoder_mlp_expand_ratio: 4
geo_decoder_downsample_ratio: 1
geo_decoder_ln_post: true
point_feats: 4
pc_size: *pc_size
pc_sharpedge_size: *pc_sharpedge_size
cond_stage_config:
target: hy3dshape.models.conditioner.SingleImageEncoder
params:
main_image_encoder:
type: DinoImageEncoder # dino large
kwargs:
config:
attention_probs_dropout_prob: 0.0
drop_path_rate: 0.0
hidden_act: gelu
hidden_dropout_prob: 0.0
hidden_size: 1024
image_size: 518
initializer_range: 0.02
layer_norm_eps: 1.e-6
layerscale_value: 1.0
mlp_ratio: 4
model_type: dinov2
num_attention_heads: 16
num_channels: 3
num_hidden_layers: 24
patch_size: 14
qkv_bias: true
torch_dtype: float32
use_swiglu_ffn: false
image_size: 518
use_cls_token: true
denoiser_cfg:
target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
params:
input_size: *num_latents
in_channels: 64
hidden_size: 2048
context_dim: 1024
depth: 11
num_heads: 16
qk_norm: true
text_len: 1370
with_decoupled_ca: false
use_attention_pooling: false
qk_norm_type: 'rms'
qkv_bias: false
use_pos_emb: false
num_moe_layers: 6
num_experts: 8
moe_top_k: 2
scheduler_cfg:
transport:
target: hy3dshape.models.diffusion.transport.create_transport
params:
path_type: Linear
prediction: velocity
sampler:
target: hy3dshape.models.diffusion.transport.Sampler
params: {}
ode_params:
sampling_method: euler # dopri5 ...
num_steps: &num_steps 50
optimizer_cfg:
optimizer:
target: torch.optim.AdamW
params:
betas: [0.9, 0.99]
eps: 1.e-6
weight_decay: 1.e-2
scheduler:
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
params:
warm_up_steps: 50 # 5000
f_start: 1.e-6
f_min: 1.e-3
f_max: 1.0
pipeline_cfg:
target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
image_processor_cfg:
target: hy3dshape.preprocessors.ImageProcessorV2
params: {}
callbacks:
logger:
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
params:
step_frequency: 100 # 10000
num_samples: 1
sample_times: 1
mean: *mean
std: *std
bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
octree_depth: 8
num_chunks: 50000
mc_level: 0.0
file_loggers:
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
params:
step_frequency: 50 # 5000
test_data_path: "tools/mini_testset/images.json"

View File

@@ -1,4 +1,6 @@
-name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
+name: "HunyuanDiT flowmatching; VAE: 4096 token length; ImageEncoder: DINO-v2 Large; ImageSize: 518"
+# overfitting runs successfully and costs 68G memory under the current settings
+# you can adjust the model arch or batch_size according to your GPU memory
 
 training:
   steps: 10_0000_0000
@@ -8,14 +10,15 @@ training:
   gradient_clip_val: 1.0
   gradient_clip_algorithm: "norm"
   every_n_train_steps: 2000 # 5000
-  val_check_interval: 50 # 4096
+  val_check_interval: 200 # 4096
+  # val_check_interval must be smaller than every_n_train_steps!!!
   limit_val_batches: 16
 
 dataset:
   target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
   params:
     #! Base setting
-    batch_size: 2
+    batch_size: 4
     num_workers: 8
     val_num_workers: 4
@@ -24,7 +27,7 @@ dataset:
     val_data_list: tools/mini_trainset/preprocessed
     #! Image loading
-    cond_stage_key: "image" # image / text / image_text
+    cond_stage_key: "image"
     image_size: 518
     mean: &mean [0.5, 0.5, 0.5]
     std: &std [0.5, 0.5, 0.5]
@@ -55,63 +58,27 @@ model:
     first_stage_config:
       target: hy3dshape.models.autoencoders.ShapeVAE
      from_pretrained: tencent/Hunyuan3D-2.1
-      params:
-        num_latents: &num_latents 512
-        embed_dim: 64
-        num_freqs: 8
-        include_pi: false
-        heads: 16
-        width: 1024
-        num_encoder_layers: 8
-        num_decoder_layers: 16
-        qkv_bias: false
-        qk_norm: true
-        scale_factor: *z_scale_factor
-        geo_decoder_mlp_expand_ratio: 4
-        geo_decoder_downsample_ratio: 1
-        geo_decoder_ln_post: true
-        point_feats: 4
-        pc_size: *pc_size
-        pc_sharpedge_size: *pc_sharpedge_size
 
     cond_stage_config:
       target: hy3dshape.models.conditioner.SingleImageEncoder
       params:
+        drop_ratio: 0.1
         main_image_encoder:
-          type: DinoImageEncoder # dino large
+          type: DinoImageEncoder
           kwargs:
-            config:
-              attention_probs_dropout_prob: 0.0
-              drop_path_rate: 0.0
-              hidden_act: gelu
-              hidden_dropout_prob: 0.0
-              hidden_size: 1024
-              image_size: 518
-              initializer_range: 0.02
-              layer_norm_eps: 1.e-6
-              layerscale_value: 1.0
-              mlp_ratio: 4
-              model_type: dinov2
-              num_attention_heads: 16
-              num_channels: 3
-              num_hidden_layers: 24
-              patch_size: 14
-              qkv_bias: true
-              torch_dtype: float32
-              use_swiglu_ffn: false
+            version: 'facebook/dinov2-large'
             image_size: 518
         use_cls_token: true
 
     denoiser_cfg:
       target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
       params:
-        input_size: *num_latents
+        input_size: 4096
         in_channels: 64
-        hidden_size: 768
+        hidden_size: 2048
         context_dim: 1024
-        depth: 6
-        num_heads: 12
+        depth: 16
+        num_heads: 16
         qk_norm: true
         text_len: 1370
         with_decoupled_ca: false
@@ -147,7 +114,7 @@ model:
       scheduler:
         target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
         params:
-          warm_up_steps: 50 # 5000
+          warm_up_steps: 500 # 5000
           f_start: 1.e-6
           f_min: 1.e-3
           f_max: 1.0
@@ -163,7 +130,7 @@ callbacks:
   logger:
     target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
     params:
-      step_frequency: 100 # 10000
+      step_frequency: 1000 # 10000
       num_samples: 1
       sample_times: 1
       mean: *mean
@@ -176,5 +143,5 @@ callbacks:
   file_loggers:
     target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
     params:
-      step_frequency: 50 # 5000
+      step_frequency: 500 # 5000
       test_data_path: "tools/mini_testset/images.json"

View File

@@ -548,7 +548,7 @@ class PointCrossAttentionEncoder(nn.Module):
         if pc_sharpedge_size == 0:
             print(
-                f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is not given, using pc_size as pc_sharpedge_size')
+                f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is zero')
         else:
             print(
                 f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is given, using pc_size={pc_size}, pc_sharpedge_size={pc_sharpedge_size}')

View File

@@ -32,6 +32,7 @@ from transformers import (
     Dinov2Model,
     Dinov2Config,
 )
+from transformers import AutoImageProcessor, AutoModel
 
 
 def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
@@ -66,9 +67,10 @@ class ImageEncoder(nn.Module):
         super().__init__()
 
         if config is None:
-            self.model = self.MODEL_CLASS.from_pretrained(version)
+            self.model = AutoModel.from_pretrained(version)
         else:
             self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config))
+
         self.model.eval()
         self.model.requires_grad_(False)
         self.use_cls_token = use_cls_token
@@ -240,11 +242,26 @@ class SingleImageEncoder(nn.Module):
     def __init__(
         self,
         main_image_encoder,
+        drop_ratio=0.0
     ):
         super().__init__()
         self.main_image_encoder = build_image_encoder(main_image_encoder)
+        self.drop_ratio = drop_ratio
+        self.disable_drop = True
 
     def forward(self, image, mask=None, **kwargs):
+        outputs = {
+            'main': self.main_image_encoder(image, mask=mask, **kwargs),
+        }
+        if self.disable_drop:
+            return outputs
+        else:
+            random_p = torch.rand(len(image), device='cuda')
+            remain_bool_tensor = random_p > self.drop_ratio
+            outputs['main'] *= remain_bool_tensor.view(-1, 1, 1)
+            return outputs
         outputs = {
             'main': self.main_image_encoder(image, mask=mask, **kwargs),
         }
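
Note: the drop_ratio path added above is classifier-free-guidance-style condition dropout. When disable_drop is off, each sample's image embedding is zeroed with probability drop_ratio, so the model also sees an unconditional (all-zero) context during training. A minimal standalone sketch of the same masking logic (function name and shapes are illustrative, not the repo API):

import torch

def drop_conditions(cond: torch.Tensor, drop_ratio: float = 0.1) -> torch.Tensor:
    # cond: (batch, num_tokens, dim) conditioning embeddings.
    # Zero an entire sample's conditioning with probability drop_ratio.
    random_p = torch.rand(cond.shape[0], device=cond.device)
    keep = (random_p > drop_ratio).view(-1, 1, 1)
    return cond * keep  # dropped rows become the all-zero unconditional embedding

cond = torch.randn(4, 257, 1024)
dropped = drop_conditions(cond, drop_ratio=0.1)  # on average ~10% of rows zeroed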

View File

@@ -22,6 +22,8 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
+import os
+import yaml
 import math
 import numpy as np
@@ -31,6 +33,7 @@ import torch.nn.functional as F
 from einops import rearrange
 
 from .moe_layers import MoEBlock
+from ...utils import logger, synchronize_timer, smart_load_model
 
 
 def modulate(x, shift, scale):
@@ -464,6 +467,74 @@ class FinalLayer(nn.Module):
 
 class HunYuanDiTPlain(nn.Module):
+    @classmethod
+    @synchronize_timer('HunYuanDiTPlain Model Loading')
+    def from_single_file(
+        cls,
+        ckpt_path,
+        config_path,
+        device='cuda',
+        dtype=torch.float16,
+        use_safetensors=None,
+        **kwargs,
+    ):
+        # load config
+        with open(config_path, 'r') as f:
+            config = yaml.safe_load(f)
+
+        # load ckpt
+        if use_safetensors:
+            ckpt_path = ckpt_path.replace('.ckpt', '.safetensors')
+        if not os.path.exists(ckpt_path):
+            raise FileNotFoundError(f"Model file {ckpt_path} not found")
+        logger.info(f"Loading model from {ckpt_path}")
+
+        if use_safetensors:
+            import safetensors.torch
+            ckpt = safetensors.torch.load_file(ckpt_path, device='cpu')
+        else:
+            ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True)
+
+        if 'model' in ckpt:
+            ckpt = ckpt['model']
+        if 'model' in config:
+            config = config['model']
+        model_kwargs = config['params']
+        model_kwargs.update(kwargs)
+
+        model = cls(**model_kwargs)
+        model.load_state_dict(ckpt)
+        model.to(device=device, dtype=dtype)
+        return model
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_path,
+        device='cuda',
+        dtype=torch.float16,
+        use_safetensors=False,
+        variant='fp16',
+        subfolder='hunyuan3d-dit-v2-1',
+        **kwargs,
+    ):
+        config_path, ckpt_path = smart_load_model(
+            model_path,
+            subfolder=subfolder,
+            use_safetensors=use_safetensors,
+            variant=variant
+        )
+        return cls.from_single_file(
+            ckpt_path,
+            config_path,
+            device=device,
+            dtype=dtype,
+            use_safetensors=use_safetensors,
+            **kwargs
+        )
+
     def __init__(
         self,
         input_size=1024,
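
Note: the loaders added above follow the diffusers-style from_pretrained pattern: smart_load_model resolves a config/checkpoint pair, and from_single_file builds the module from the YAML's model params before loading the state dict. A hedged usage sketch, with defaults taken from the signatures above (variant='fp16', subfolder='hunyuan3d-dit-v2-1'):

import torch
from hy3dshape.models.denoisers.hunyuandit import HunYuanDiTPlain

# Resolve the config + weights for the DiT denoiser and load it standalone.
model = HunYuanDiTPlain.from_pretrained(
    'tencent/Hunyuan3D-2.1',
    device='cuda',
    dtype=torch.float16,
)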

View File

@@ -256,10 +256,7 @@ class Diffuser(pl.LightningModule):
     def forward(self, batch):
         with torch.autocast(device_type="cuda", dtype=torch.bfloat16): #float32 for text
             contexts = self.cond_stage_model(image=batch.get('image'), text=batch.get('text'), mask=batch.get('mask'))
-            # t5_text = contexts['t5_text']['prompt_embeds']
-            # nan_count = torch.isnan(t5_text).sum()
-            # if nan_count > 0:
-            #     print("t5_text has %d NaN values"%(nan_count))
         with torch.autocast(device_type="cuda", dtype=torch.float16):
             with torch.no_grad():
                 latents = self.first_stage_model.encode(batch[self.first_stage_key], sample_posterior=True)
@@ -333,9 +330,6 @@ class Diffuser(pl.LightningModule):
         image = batch.get("image", None)
         mask = batch.get('mask', None)
-        # if not isinstance(image, torch.Tensor): print(image.shape)
-        # if isinstance(mask, torch.Tensor): print(mask.shape)
         outputs = self.pipeline(image=image,
                                 mask=mask,
                                 generator=generator,
@@ -350,5 +344,6 @@ class Diffuser(pl.LightningModule):
                 f.write(traceback.format_exc())
                 f.write("\n")
             outputs = [None]
+        self.cond_stage_model.disable_drop = False
         return [outputs]

View File

@@ -323,7 +323,9 @@ class ImageConditionalFixASLDiffuserLogger(Callback):
         save_path = os.path.join(visual_dir, os.path.basename(image_path))
         save_path = os.path.splitext(save_path)[0] + '.glb'
-        print(image_path)
+        if isinstance(image_path, str):
+            print(image_path)
 
         with torch.no_grad():
             mesh = pl_module.sample(batch={"image": image_path}, **self.kwargs)[0][0]
         if isinstance(mesh, tuple) and len(mesh)==2:

View File

@@ -190,7 +190,7 @@ if __name__ == "__main__":
         precision=amp_type,
         callbacks=callbacks,
         accelerator="gpu",
-        devices=training_cfg.num_gpus,
+        devices=args.num_gpus,
         num_nodes=training_cfg.num_nodes,
         strategy=ddp_strategy,
         gradient_clip_val=training_cfg.get('gradient_clip_val'),

View File

@@ -13,7 +13,6 @@
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 from PIL import Image
-from hy3dshape.rembg import BackgroundRemover
 
 from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline
@@ -21,10 +20,12 @@ model_path = 'tencent/Hunyuan3D-2.1'
 pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path)
 
 image_path = 'demos/demo.png'
 image = Image.open(image_path).convert("RGBA")
 if image.mode == 'RGB':
     rembg = BackgroundRemover()
     image = rembg(image)
+image = image_path
 
 mesh = pipeline_shapegen(image=image)[0]
 mesh.export('demo.glb')
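
Note: after this change the demo hands the pipeline a file path instead of a PIL image, which implies the pipeline's image preprocessing accepts both input types. Both call styles as they appear in this commit (nothing beyond the diff is assumed):

from PIL import Image
from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline

pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained('tencent/Hunyuan3D-2.1')
mesh = pipeline(image='demos/demo.png')[0]  # path string, as in the updated demo
# mesh = pipeline(image=Image.open('demos/demo.png').convert("RGBA"))[0]  # PIL image, as before
mesh.export('demo.glb')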

View File

@@ -0,0 +1,51 @@
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
from PIL import Image
from hy3dshape.rembg import BackgroundRemover
from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline
model_path = 'tencent/Hunyuan3D-2.1'
pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path)
import torch
import yaml
from hy3dshape.utils import instantiate_from_config
# For example, you can convert deepspeed weights to a single file
# cd output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00004000.ckpt
# python3 zero_to_fp32.py ./ ./out --max_shard_size 30GB
# then you can get output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00004000.ckpt/out/pytorch_model.bin
ckpt_cfg_path = 'output_folder/dit/overfitting_depth_16_token_4096_lr1e4_uc/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml'
ckpt_path = 'output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00004000.ckpt/out/pytorch_model.bin'
config = yaml.safe_load(open(ckpt_cfg_path, 'r'))
model = instantiate_from_config(config['model']['params']['denoiser_cfg'])
sd = torch.load(ckpt_path)
sd = {k.replace('_forward_module.model.', ''):v for k,v in sd.items()}
msg = model.load_state_dict(sd)
print(msg)
model = model.cuda().half()
pipeline_shapegen.model = model
image = 'tools/mini_testset/images/015.png'
# image = Image.open(image_path).convert("RGBA")
# if image.mode == 'RGB':
# rembg = BackgroundRemover()
# image = rembg(image)
# mesh = pipeline_shapegen(image=image, guidance_scale=1.0)[0]
mesh = pipeline_shapegen(image=image)[0]
mesh.export('demo.glb')

View File

@@ -35,12 +35,11 @@ export NCCL_DEBUG=WARN
 node_num=$1
 node_rank=$2
-master_ip=$3
-config=$4
-output_dir=$5
+num_gpu_per_node=$3
+master_ip=$4
+config=$5
+output_dir=$6
 
-# config='configs/dit-from-scratch-overfitting-flowmatching-dinog518-bf16-lr1e4-1024.yaml'
-# output_dir='output_folder/dit/overfitting_10'
 
 echo node_num $node_num
 echo node_rank $node_rank
@@ -64,7 +63,8 @@ NCCL_IB_GID_INDEX=3 \
 NCCL_NVLS_ENABLE=0 \
 python3 main.py \
     --num_nodes $node_num \
-    --num_gpus 8 \
+    --num_gpus $num_gpu_per_node \
     --config $config \
     --output_dir $output_dir \
     --deepspeed
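
Note: with the GPU count now the third positional argument, a single-node launch reads as follows (this matches the call made by the new train_demo.sh below):

# args: node_num node_rank num_gpu_per_node master_ip config output_dir
bash scripts/train_deepspeed.sh 1 0 8 0.0.0.0 \
    configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml \
    output_folder/dit/overfitting_depth_16_token_4096_lr1e4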

hy3dshape/train_demo.sh Normal file
View File

@@ -0,0 +1,15 @@
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export num_gpu_per_node=8
# export CUDA_VISIBLE_DEVICES=0
# export num_gpu_per_node=1
export node_num=1
export node_rank=0
export master_ip=0.0.0.0 # set your master_ip
# export config=configs/hunyuandit-finetuning-flowmatching-dinol518-bf16-lr1e5-4096.yaml
# export output_dir=output_folder/dit/fintuning_lr1e5
export config=configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml
export output_dir=output_folder/dit/overfitting_depth_16_token_4096_lr1e4
bash scripts/train_deepspeed.sh $node_num $node_rank $num_gpu_per_node $master_ip $config $output_dir