fix shape training
This commit is contained in:
@@ -1,174 +0,0 @@
|
||||
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
|
||||
|
||||
training:
|
||||
steps: 10_0000_0000
|
||||
use_amp: true
|
||||
amp_type: "bf16"
|
||||
base_lr: 1.e-5
|
||||
gradient_clip_val: 1.0
|
||||
gradient_clip_algorithm: "norm"
|
||||
every_n_train_steps: 2000 # 5000
|
||||
val_check_interval: 50 # 4096
|
||||
limit_val_batches: 16
|
||||
|
||||
dataset:
|
||||
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
|
||||
params:
|
||||
#! Base setting
|
||||
batch_size: 4
|
||||
num_workers: 8
|
||||
val_num_workers: 4
|
||||
|
||||
# Data
|
||||
train_data_list: tools/mini_trainset/preprocessed
|
||||
val_data_list: tools/mini_trainset/preprocessed
|
||||
|
||||
#! Image loading
|
||||
cond_stage_key: "image" # image / text / image_text
|
||||
image_size: 518
|
||||
mean: &mean [0.5, 0.5, 0.5]
|
||||
std: &std [0.5, 0.5, 0.5]
|
||||
|
||||
#! Point cloud sampling
|
||||
pc_size: &pc_size 30720
|
||||
pc_sharpedge_size: &pc_sharpedge_size 30720
|
||||
sharpedge_label: &sharpedge_label true
|
||||
return_normal: true
|
||||
|
||||
#! Augmentation
|
||||
padding: true
|
||||
|
||||
model:
|
||||
target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
|
||||
params:
|
||||
first_stage_key: "surface"
|
||||
cond_stage_key: "image"
|
||||
scale_by_std: false
|
||||
z_scale_factor: &z_scale_factor 0.9990943042622529 # 1 / 1.0009065167661184
|
||||
torch_compile: false
|
||||
|
||||
# ema_config:
|
||||
# ema_model: LitEma
|
||||
# ema_decay: 0.999
|
||||
# ema_inference: false
|
||||
|
||||
first_stage_config:
|
||||
target: hy3dshape.models.autoencoders.ShapeVAE
|
||||
from_pretrained: tencent/Hunyuan3D-2.1
|
||||
params:
|
||||
num_latents: &num_latents 512
|
||||
embed_dim: 64
|
||||
num_freqs: 8
|
||||
include_pi: false
|
||||
heads: 16
|
||||
width: 1024
|
||||
point_feats: 4
|
||||
num_decoder_layers: 16
|
||||
pc_size: *pc_size
|
||||
pc_sharpedge_size: *pc_sharpedge_size
|
||||
qkv_bias: false
|
||||
qk_norm: true
|
||||
scale_factor: *z_scale_factor
|
||||
geo_decoder_mlp_expand_ratio: 4
|
||||
geo_decoder_downsample_ratio: 1
|
||||
geo_decoder_ln_post: true
|
||||
|
||||
cond_stage_config:
|
||||
target: hy3dshape.models.conditioner.SingleImageEncoder
|
||||
params:
|
||||
main_image_encoder:
|
||||
type: DinoImageEncoder # dino giant
|
||||
kwargs:
|
||||
config:
|
||||
attention_probs_dropout_prob: 0.0
|
||||
drop_path_rate: 0.0
|
||||
hidden_act: gelu
|
||||
hidden_dropout_prob: 0.0
|
||||
hidden_size: 1536
|
||||
image_size: 518
|
||||
initializer_range: 0.02
|
||||
layer_norm_eps: 1.e-6
|
||||
layerscale_value: 1.0
|
||||
mlp_ratio: 4
|
||||
model_type: dinov2
|
||||
num_attention_heads: 24
|
||||
num_channels: 3
|
||||
num_hidden_layers: 40
|
||||
patch_size: 14
|
||||
qkv_bias: true
|
||||
torch_dtype: float32
|
||||
use_swiglu_ffn: true
|
||||
image_size: 518
|
||||
|
||||
denoiser_cfg:
|
||||
target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT
|
||||
params:
|
||||
ckpt_path: ~/.cache/hy3dgen/tencent/Hunyuan3D-2-1-Shape/dit/model.fp16.ckpt
|
||||
input_size: *num_latents
|
||||
context_in_dim: 1536
|
||||
hidden_size: 1024
|
||||
mlp_ratio: 4.0
|
||||
num_heads: 16
|
||||
depth: 16
|
||||
depth_single_blocks: 32
|
||||
axes_dim: [64]
|
||||
theta: 10000
|
||||
qkv_bias: true
|
||||
use_pe: false
|
||||
force_norm_fp32: true
|
||||
|
||||
scheduler_cfg:
|
||||
transport:
|
||||
target: hy3dshape.models.diffusion.transport.create_transport
|
||||
params:
|
||||
path_type: Linear
|
||||
prediction: velocity
|
||||
sampler:
|
||||
target: hy3dshape.models.diffusion.transport.Sampler
|
||||
params: {}
|
||||
ode_params:
|
||||
sampling_method: euler # dopri5 ...
|
||||
num_steps: &num_steps 50
|
||||
|
||||
optimizer_cfg:
|
||||
optimizer:
|
||||
target: torch.optim.AdamW
|
||||
params:
|
||||
betas: [0.9, 0.99]
|
||||
eps: 1.e-6
|
||||
weight_decay: 1.e-2
|
||||
|
||||
scheduler:
|
||||
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
|
||||
params:
|
||||
warm_up_steps: 50 # 5000
|
||||
f_start: 1.e-6
|
||||
f_min: 1.e-3
|
||||
f_max: 1.0
|
||||
|
||||
pipeline_cfg:
|
||||
target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
|
||||
|
||||
image_processor_cfg:
|
||||
target: hy3dshape.preprocessors.ImageProcessorV2
|
||||
params: {}
|
||||
|
||||
callbacks:
|
||||
logger:
|
||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
|
||||
params:
|
||||
step_frequency: 100 # 10000
|
||||
num_samples: 1
|
||||
sample_times: 1
|
||||
mean: *mean
|
||||
std: *std
|
||||
bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
|
||||
octree_depth: 8
|
||||
num_chunks: 50000
|
||||
mc_level: 0.0
|
||||
|
||||
file_loggers:
|
||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
|
||||
params:
|
||||
step_frequency: 50 # 5000
|
||||
test_data_path: "tools/mini_testset/images.json"
|
||||
@@ -1,173 +0,0 @@
|
||||
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
|
||||
|
||||
training:
|
||||
steps: 10_0000_0000
|
||||
use_amp: true
|
||||
amp_type: "bf16"
|
||||
base_lr: 1e-4
|
||||
gradient_clip_val: 1.0
|
||||
gradient_clip_algorithm: "norm"
|
||||
every_n_train_steps: 2000 # 5000
|
||||
val_check_interval: 50 # 4096
|
||||
limit_val_batches: 16
|
||||
|
||||
dataset:
|
||||
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
|
||||
params:
|
||||
#! Base setting
|
||||
batch_size: 2
|
||||
num_workers: 8
|
||||
val_num_workers: 4
|
||||
|
||||
# Data
|
||||
train_data_list: tools/mini_trainset/preprocessed
|
||||
val_data_list: tools/mini_trainset/preprocessed
|
||||
|
||||
#! Image loading
|
||||
cond_stage_key: "image" # image / text / image_text
|
||||
image_size: 518
|
||||
mean: &mean [0.5, 0.5, 0.5]
|
||||
std: &std [0.5, 0.5, 0.5]
|
||||
|
||||
#! Point cloud sampling
|
||||
pc_size: &pc_size 10240
|
||||
pc_sharpedge_size: &pc_sharpedge_size 10240
|
||||
sharpedge_label: &sharpedge_label true
|
||||
return_normal: true
|
||||
|
||||
#! Augmentation
|
||||
padding: true
|
||||
|
||||
model:
|
||||
target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
|
||||
params:
|
||||
first_stage_key: "surface"
|
||||
cond_stage_key: "image"
|
||||
scale_by_std: false
|
||||
z_scale_factor: &z_scale_factor 0.9990943042622529 # 1 / 1.0009065167661184
|
||||
torch_compile: false
|
||||
|
||||
# ema_config:
|
||||
# ema_model: LitEma
|
||||
# ema_decay: 0.999
|
||||
# ema_inference: false
|
||||
|
||||
first_stage_config:
|
||||
target: hy3dshape.models.autoencoders.ShapeVAE
|
||||
from_pretrained: tencent/Hunyuan3D-2.1
|
||||
params:
|
||||
num_latents: &num_latents 512
|
||||
embed_dim: 64
|
||||
num_freqs: 8
|
||||
include_pi: false
|
||||
heads: 16
|
||||
width: 1024
|
||||
point_feats: 4
|
||||
num_decoder_layers: 16
|
||||
pc_size: *pc_size
|
||||
pc_sharpedge_size: *pc_sharpedge_size
|
||||
qkv_bias: false
|
||||
qk_norm: true
|
||||
scale_factor: *z_scale_factor
|
||||
geo_decoder_mlp_expand_ratio: 4
|
||||
geo_decoder_downsample_ratio: 1
|
||||
geo_decoder_ln_post: true
|
||||
|
||||
cond_stage_config:
|
||||
target: hy3dshape.models.conditioner.SingleImageEncoder
|
||||
params:
|
||||
main_image_encoder:
|
||||
type: DinoImageEncoder # dino giant
|
||||
kwargs:
|
||||
config:
|
||||
attention_probs_dropout_prob: 0.0
|
||||
drop_path_rate: 0.0
|
||||
hidden_act: gelu
|
||||
hidden_dropout_prob: 0.0
|
||||
hidden_size: 1536
|
||||
image_size: 518
|
||||
initializer_range: 0.02
|
||||
layer_norm_eps: 1.e-6
|
||||
layerscale_value: 1.0
|
||||
mlp_ratio: 4
|
||||
model_type: dinov2
|
||||
num_attention_heads: 24
|
||||
num_channels: 3
|
||||
num_hidden_layers: 40
|
||||
patch_size: 14
|
||||
qkv_bias: true
|
||||
torch_dtype: float32
|
||||
use_swiglu_ffn: true
|
||||
image_size: 518
|
||||
|
||||
denoiser_cfg:
|
||||
target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT
|
||||
params:
|
||||
input_size: *num_latents
|
||||
context_in_dim: 1536
|
||||
hidden_size: 1024
|
||||
mlp_ratio: 4.0
|
||||
num_heads: 16
|
||||
depth: 8
|
||||
depth_single_blocks: 16
|
||||
axes_dim: [64]
|
||||
theta: 10000
|
||||
qkv_bias: true
|
||||
use_pe: false
|
||||
force_norm_fp32: true
|
||||
|
||||
scheduler_cfg:
|
||||
transport:
|
||||
target: hy3dshape.models.diffusion.transport.create_transport
|
||||
params:
|
||||
path_type: Linear
|
||||
prediction: velocity
|
||||
sampler:
|
||||
target: hy3dshape.models.diffusion.transport.Sampler
|
||||
params: {}
|
||||
ode_params:
|
||||
sampling_method: euler # dopri5 ...
|
||||
num_steps: &num_steps 50
|
||||
|
||||
optimizer_cfg:
|
||||
optimizer:
|
||||
target: torch.optim.AdamW
|
||||
params:
|
||||
betas: [0.9, 0.99]
|
||||
eps: 1.e-6
|
||||
weight_decay: 1.e-2
|
||||
|
||||
scheduler:
|
||||
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
|
||||
params:
|
||||
warm_up_steps: 50 # 5000
|
||||
f_start: 1.e-6
|
||||
f_min: 1.e-3
|
||||
f_max: 1.0
|
||||
|
||||
pipeline_cfg:
|
||||
target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
|
||||
|
||||
image_processor_cfg:
|
||||
target: hy3dshape.preprocessors.ImageProcessorV2
|
||||
params: {}
|
||||
|
||||
callbacks:
|
||||
logger:
|
||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
|
||||
params:
|
||||
step_frequency: 100 # 10000
|
||||
num_samples: 1
|
||||
sample_times: 1
|
||||
mean: *mean
|
||||
std: *std
|
||||
bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
|
||||
octree_depth: 8
|
||||
num_chunks: 50000
|
||||
mc_level: 0.0
|
||||
|
||||
file_loggers:
|
||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
|
||||
params:
|
||||
step_frequency: 50 # 5000
|
||||
test_data_path: "tools/mini_testset/images.json"
|
||||
@@ -1,4 +1,5 @@
|
||||
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
|
||||
name: "HunyuanDiT flowmatching; VAE: 4096 token length; ImageEncoder: DINO-v2 Large; ImageSize: 518"
|
||||
# trained successfully on 8 x H20 with 98G memory
|
||||
|
||||
training:
|
||||
steps: 10_0000_0000
|
||||
@@ -8,7 +9,8 @@ training:
|
||||
gradient_clip_val: 1.0
|
||||
gradient_clip_algorithm: "norm"
|
||||
every_n_train_steps: 2000 # 5000
|
||||
val_check_interval: 50 # 4096
|
||||
val_check_interval: 200 # 4096
|
||||
# val_check_interval must be smaller than every_n_train_steps!!!
|
||||
limit_val_batches: 16
|
||||
|
||||
dataset:
|
||||
@@ -24,7 +26,7 @@ dataset:
|
||||
val_data_list: tools/mini_trainset/preprocessed
|
||||
|
||||
#! Image loading
|
||||
cond_stage_key: "image" # image / text / image_text
|
||||
cond_stage_key: "image"
|
||||
image_size: 518
|
||||
mean: &mean [0.5, 0.5, 0.5]
|
||||
std: &std [0.5, 0.5, 0.5]
|
||||
@@ -55,73 +57,21 @@ model:
|
||||
first_stage_config:
|
||||
target: hy3dshape.models.autoencoders.ShapeVAE
|
||||
from_pretrained: tencent/Hunyuan3D-2.1
|
||||
params:
|
||||
num_latents: &num_latents 4096
|
||||
embed_dim: 64
|
||||
num_freqs: 8
|
||||
include_pi: false
|
||||
heads: 16
|
||||
width: 1024
|
||||
num_encoder_layers: 8
|
||||
num_decoder_layers: 16
|
||||
qkv_bias: false
|
||||
qk_norm: true
|
||||
scale_factor: *z_scale_factor
|
||||
geo_decoder_mlp_expand_ratio: 4
|
||||
geo_decoder_downsample_ratio: 1
|
||||
geo_decoder_ln_post: true
|
||||
point_feats: 4
|
||||
pc_size: *pc_size
|
||||
pc_sharpedge_size: *pc_sharpedge_size
|
||||
|
||||
cond_stage_config:
|
||||
target: hy3dshape.models.conditioner.SingleImageEncoder
|
||||
params:
|
||||
drop_ratio: 0.1
|
||||
main_image_encoder:
|
||||
type: DinoImageEncoder # dino large
|
||||
type: DinoImageEncoder
|
||||
kwargs:
|
||||
config:
|
||||
attention_probs_dropout_prob: 0.0
|
||||
drop_path_rate: 0.0
|
||||
hidden_act: gelu
|
||||
hidden_dropout_prob: 0.0
|
||||
hidden_size: 1024
|
||||
image_size: 518
|
||||
initializer_range: 0.02
|
||||
layer_norm_eps: 1.e-6
|
||||
layerscale_value: 1.0
|
||||
mlp_ratio: 4
|
||||
model_type: dinov2
|
||||
num_attention_heads: 16
|
||||
num_channels: 3
|
||||
num_hidden_layers: 24
|
||||
patch_size: 14
|
||||
qkv_bias: true
|
||||
torch_dtype: float32
|
||||
use_swiglu_ffn: false
|
||||
version: 'facebook/dinov2-large'
|
||||
image_size: 518
|
||||
use_cls_token: true
|
||||
|
||||
|
||||
denoiser_cfg:
|
||||
target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
|
||||
params:
|
||||
input_size: *num_latents
|
||||
in_channels: 64
|
||||
hidden_size: 2048
|
||||
context_dim: 1024
|
||||
depth: 21
|
||||
num_heads: 16
|
||||
qk_norm: true
|
||||
text_len: 1370
|
||||
with_decoupled_ca: false
|
||||
use_attention_pooling: false
|
||||
qk_norm_type: 'rms'
|
||||
qkv_bias: false
|
||||
use_pos_emb: false
|
||||
num_moe_layers: 6
|
||||
num_experts: 8
|
||||
moe_top_k: 2
|
||||
from_pretrained: tencent/Hunyuan3D-2.1
|
||||
|
||||
scheduler_cfg:
|
||||
transport:
|
||||
@@ -163,7 +113,7 @@ callbacks:
|
||||
logger:
|
||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
|
||||
params:
|
||||
step_frequency: 100 # 10000
|
||||
step_frequency: 1000 # 10000
|
||||
num_samples: 1
|
||||
sample_times: 1
|
||||
mean: *mean
|
||||
@@ -176,5 +126,5 @@ callbacks:
|
||||
file_loggers:
|
||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
|
||||
params:
|
||||
step_frequency: 50 # 5000
|
||||
step_frequency: 500 # 5000
|
||||
test_data_path: "tools/mini_testset/images.json"
|
||||
@@ -1,180 +0,0 @@
|
||||
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
|
||||
|
||||
training:
|
||||
steps: 10_0000_0000
|
||||
use_amp: true
|
||||
amp_type: "bf16"
|
||||
base_lr: 1e-4
|
||||
gradient_clip_val: 1.0
|
||||
gradient_clip_algorithm: "norm"
|
||||
every_n_train_steps: 2000 # 5000
|
||||
val_check_interval: 50 # 4096
|
||||
limit_val_batches: 16
|
||||
|
||||
dataset:
|
||||
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
|
||||
params:
|
||||
#! Base setting
|
||||
batch_size: 2
|
||||
num_workers: 8
|
||||
val_num_workers: 4
|
||||
|
||||
# Data
|
||||
train_data_list: tools/mini_trainset/preprocessed
|
||||
val_data_list: tools/mini_trainset/preprocessed
|
||||
|
||||
#! Image loading
|
||||
cond_stage_key: "image" # image / text / image_text
|
||||
image_size: 518
|
||||
mean: &mean [0.5, 0.5, 0.5]
|
||||
std: &std [0.5, 0.5, 0.5]
|
||||
|
||||
#! Point cloud sampling
|
||||
pc_size: &pc_size 81920
|
||||
pc_sharpedge_size: &pc_sharpedge_size 0
|
||||
sharpedge_label: &sharpedge_label true
|
||||
return_normal: true
|
||||
|
||||
#! Augmentation
|
||||
padding: true
|
||||
|
||||
model:
|
||||
target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
|
||||
params:
|
||||
first_stage_key: "surface"
|
||||
cond_stage_key: "image"
|
||||
scale_by_std: false
|
||||
z_scale_factor: &z_scale_factor 1.0039506158752403
|
||||
torch_compile: false
|
||||
|
||||
# ema_config:
|
||||
# ema_model: LitEma
|
||||
# ema_decay: 0.999
|
||||
# ema_inference: false
|
||||
|
||||
first_stage_config:
|
||||
target: hy3dshape.models.autoencoders.ShapeVAE
|
||||
from_pretrained: tencent/Hunyuan3D-2.1
|
||||
params:
|
||||
num_latents: &num_latents 4096
|
||||
embed_dim: 64
|
||||
num_freqs: 8
|
||||
include_pi: false
|
||||
heads: 16
|
||||
width: 1024
|
||||
num_encoder_layers: 8
|
||||
num_decoder_layers: 16
|
||||
qkv_bias: false
|
||||
qk_norm: true
|
||||
scale_factor: *z_scale_factor
|
||||
geo_decoder_mlp_expand_ratio: 4
|
||||
geo_decoder_downsample_ratio: 1
|
||||
geo_decoder_ln_post: true
|
||||
point_feats: 4
|
||||
pc_size: *pc_size
|
||||
pc_sharpedge_size: *pc_sharpedge_size
|
||||
|
||||
cond_stage_config:
|
||||
target: hy3dshape.models.conditioner.SingleImageEncoder
|
||||
params:
|
||||
main_image_encoder:
|
||||
type: DinoImageEncoder # dino large
|
||||
kwargs:
|
||||
config:
|
||||
attention_probs_dropout_prob: 0.0
|
||||
drop_path_rate: 0.0
|
||||
hidden_act: gelu
|
||||
hidden_dropout_prob: 0.0
|
||||
hidden_size: 1024
|
||||
image_size: 518
|
||||
initializer_range: 0.02
|
||||
layer_norm_eps: 1.e-6
|
||||
layerscale_value: 1.0
|
||||
mlp_ratio: 4
|
||||
model_type: dinov2
|
||||
num_attention_heads: 16
|
||||
num_channels: 3
|
||||
num_hidden_layers: 24
|
||||
patch_size: 14
|
||||
qkv_bias: true
|
||||
torch_dtype: float32
|
||||
use_swiglu_ffn: false
|
||||
image_size: 518
|
||||
use_cls_token: true
|
||||
|
||||
|
||||
denoiser_cfg:
|
||||
target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
|
||||
params:
|
||||
input_size: *num_latents
|
||||
in_channels: 64
|
||||
hidden_size: 2048
|
||||
context_dim: 1024
|
||||
depth: 11
|
||||
num_heads: 16
|
||||
qk_norm: true
|
||||
text_len: 1370
|
||||
with_decoupled_ca: false
|
||||
use_attention_pooling: false
|
||||
qk_norm_type: 'rms'
|
||||
qkv_bias: false
|
||||
use_pos_emb: false
|
||||
num_moe_layers: 6
|
||||
num_experts: 8
|
||||
moe_top_k: 2
|
||||
|
||||
scheduler_cfg:
|
||||
transport:
|
||||
target: hy3dshape.models.diffusion.transport.create_transport
|
||||
params:
|
||||
path_type: Linear
|
||||
prediction: velocity
|
||||
sampler:
|
||||
target: hy3dshape.models.diffusion.transport.Sampler
|
||||
params: {}
|
||||
ode_params:
|
||||
sampling_method: euler # dopri5 ...
|
||||
num_steps: &num_steps 50
|
||||
|
||||
optimizer_cfg:
|
||||
optimizer:
|
||||
target: torch.optim.AdamW
|
||||
params:
|
||||
betas: [0.9, 0.99]
|
||||
eps: 1.e-6
|
||||
weight_decay: 1.e-2
|
||||
|
||||
scheduler:
|
||||
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
|
||||
params:
|
||||
warm_up_steps: 50 # 5000
|
||||
f_start: 1.e-6
|
||||
f_min: 1.e-3
|
||||
f_max: 1.0
|
||||
|
||||
pipeline_cfg:
|
||||
target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
|
||||
|
||||
image_processor_cfg:
|
||||
target: hy3dshape.preprocessors.ImageProcessorV2
|
||||
params: {}
|
||||
|
||||
callbacks:
|
||||
logger:
|
||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
|
||||
params:
|
||||
step_frequency: 100 # 10000
|
||||
num_samples: 1
|
||||
sample_times: 1
|
||||
mean: *mean
|
||||
std: *std
|
||||
bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
|
||||
octree_depth: 8
|
||||
num_chunks: 50000
|
||||
mc_level: 0.0
|
||||
|
||||
file_loggers:
|
||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
|
||||
params:
|
||||
step_frequency: 50 # 5000
|
||||
test_data_path: "tools/mini_testset/images.json"
|
||||
@@ -1,4 +1,6 @@
|
||||
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
|
||||
name: "HunyuanDiT flowmatching; VAE: 4096 token length; ImageEncoder: DINO-v2 Large; ImageSize: 518"
|
||||
# overfitting runs successfully and costs 68G memory under the current settings
|
||||
# you can adjust the model architecture or batch_size to fit your GPU memory
|
||||
|
||||
training:
|
||||
steps: 10_0000_0000
|
||||
@@ -8,14 +10,15 @@ training:
|
||||
gradient_clip_val: 1.0
|
||||
gradient_clip_algorithm: "norm"
|
||||
every_n_train_steps: 2000 # 5000
|
||||
val_check_interval: 50 # 4096
|
||||
val_check_interval: 200 # 4096
|
||||
# val_check_interval must be smaller than every_n_train_steps!!!
|
||||
limit_val_batches: 16
|
||||
|
||||
dataset:
|
||||
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
|
||||
params:
|
||||
#! Base setting
|
||||
batch_size: 2
|
||||
batch_size: 4
|
||||
num_workers: 8
|
||||
val_num_workers: 4
|
||||
|
||||
@@ -24,7 +27,7 @@ dataset:
|
||||
val_data_list: tools/mini_trainset/preprocessed
|
||||
|
||||
#! Image loading
|
||||
cond_stage_key: "image" # image / text / image_text
|
||||
cond_stage_key: "image"
|
||||
image_size: 518
|
||||
mean: &mean [0.5, 0.5, 0.5]
|
||||
std: &std [0.5, 0.5, 0.5]
|
||||
@@ -55,63 +58,27 @@ model:
|
||||
first_stage_config:
|
||||
target: hy3dshape.models.autoencoders.ShapeVAE
|
||||
from_pretrained: tencent/Hunyuan3D-2.1
|
||||
params:
|
||||
num_latents: &num_latents 512
|
||||
embed_dim: 64
|
||||
num_freqs: 8
|
||||
include_pi: false
|
||||
heads: 16
|
||||
width: 1024
|
||||
num_encoder_layers: 8
|
||||
num_decoder_layers: 16
|
||||
qkv_bias: false
|
||||
qk_norm: true
|
||||
scale_factor: *z_scale_factor
|
||||
geo_decoder_mlp_expand_ratio: 4
|
||||
geo_decoder_downsample_ratio: 1
|
||||
geo_decoder_ln_post: true
|
||||
point_feats: 4
|
||||
pc_size: *pc_size
|
||||
pc_sharpedge_size: *pc_sharpedge_size
|
||||
|
||||
cond_stage_config:
|
||||
target: hy3dshape.models.conditioner.SingleImageEncoder
|
||||
params:
|
||||
drop_ratio: 0.1
|
||||
main_image_encoder:
|
||||
type: DinoImageEncoder # dino large
|
||||
type: DinoImageEncoder
|
||||
kwargs:
|
||||
config:
|
||||
attention_probs_dropout_prob: 0.0
|
||||
drop_path_rate: 0.0
|
||||
hidden_act: gelu
|
||||
hidden_dropout_prob: 0.0
|
||||
hidden_size: 1024
|
||||
image_size: 518
|
||||
initializer_range: 0.02
|
||||
layer_norm_eps: 1.e-6
|
||||
layerscale_value: 1.0
|
||||
mlp_ratio: 4
|
||||
model_type: dinov2
|
||||
num_attention_heads: 16
|
||||
num_channels: 3
|
||||
num_hidden_layers: 24
|
||||
patch_size: 14
|
||||
qkv_bias: true
|
||||
torch_dtype: float32
|
||||
use_swiglu_ffn: false
|
||||
version: 'facebook/dinov2-large'
|
||||
image_size: 518
|
||||
use_cls_token: true
|
||||
|
||||
|
||||
denoiser_cfg:
|
||||
target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
|
||||
params:
|
||||
input_size: *num_latents
|
||||
input_size: 4096
|
||||
in_channels: 64
|
||||
hidden_size: 768
|
||||
hidden_size: 2048
|
||||
context_dim: 1024
|
||||
depth: 6
|
||||
num_heads: 12
|
||||
depth: 16
|
||||
num_heads: 16
|
||||
qk_norm: true
|
||||
text_len: 1370
|
||||
with_decoupled_ca: false
|
||||
@@ -147,7 +114,7 @@ model:
|
||||
scheduler:
|
||||
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
|
||||
params:
|
||||
warm_up_steps: 50 # 5000
|
||||
warm_up_steps: 500 # 5000
|
||||
f_start: 1.e-6
|
||||
f_min: 1.e-3
|
||||
f_max: 1.0
|
||||
@@ -163,7 +130,7 @@ callbacks:
|
||||
logger:
|
||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
|
||||
params:
|
||||
step_frequency: 100 # 10000
|
||||
step_frequency: 1000 # 10000
|
||||
num_samples: 1
|
||||
sample_times: 1
|
||||
mean: *mean
|
||||
@@ -176,5 +143,5 @@ callbacks:
|
||||
file_loggers:
|
||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
|
||||
params:
|
||||
step_frequency: 50 # 5000
|
||||
step_frequency: 500 # 5000
|
||||
test_data_path: "tools/mini_testset/images.json"
|
||||
Reference in New Issue
Block a user