fix shape training
This commit is contained in:
@@ -1,174 +0,0 @@
|
|||||||
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
|
|
||||||
|
|
||||||
training:
|
|
||||||
steps: 10_0000_0000
|
|
||||||
use_amp: true
|
|
||||||
amp_type: "bf16"
|
|
||||||
base_lr: 1.e-5
|
|
||||||
gradient_clip_val: 1.0
|
|
||||||
gradient_clip_algorithm: "norm"
|
|
||||||
every_n_train_steps: 2000 # 5000
|
|
||||||
val_check_interval: 50 # 4096
|
|
||||||
limit_val_batches: 16
|
|
||||||
|
|
||||||
dataset:
|
|
||||||
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
|
|
||||||
params:
|
|
||||||
#! Base setting
|
|
||||||
batch_size: 4
|
|
||||||
num_workers: 8
|
|
||||||
val_num_workers: 4
|
|
||||||
|
|
||||||
# Data
|
|
||||||
train_data_list: tools/mini_trainset/preprocessed
|
|
||||||
val_data_list: tools/mini_trainset/preprocessed
|
|
||||||
|
|
||||||
#! Image loading
|
|
||||||
cond_stage_key: "image" # image / text / image_text
|
|
||||||
image_size: 518
|
|
||||||
mean: &mean [0.5, 0.5, 0.5]
|
|
||||||
std: &std [0.5, 0.5, 0.5]
|
|
||||||
|
|
||||||
#! Point cloud sampling
|
|
||||||
pc_size: &pc_size 30720
|
|
||||||
pc_sharpedge_size: &pc_sharpedge_size 30720
|
|
||||||
sharpedge_label: &sharpedge_label true
|
|
||||||
return_normal: true
|
|
||||||
|
|
||||||
#! Augmentation
|
|
||||||
padding: true
|
|
||||||
|
|
||||||
model:
|
|
||||||
target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
|
|
||||||
params:
|
|
||||||
first_stage_key: "surface"
|
|
||||||
cond_stage_key: "image"
|
|
||||||
scale_by_std: false
|
|
||||||
z_scale_factor: &z_scale_factor 0.9990943042622529 # 1 / 1.0009065167661184
|
|
||||||
torch_compile: false
|
|
||||||
|
|
||||||
# ema_config:
|
|
||||||
# ema_model: LitEma
|
|
||||||
# ema_decay: 0.999
|
|
||||||
# ema_inference: false
|
|
||||||
|
|
||||||
first_stage_config:
|
|
||||||
target: hy3dshape.models.autoencoders.ShapeVAE
|
|
||||||
from_pretrained: tencent/Hunyuan3D-2.1
|
|
||||||
params:
|
|
||||||
num_latents: &num_latents 512
|
|
||||||
embed_dim: 64
|
|
||||||
num_freqs: 8
|
|
||||||
include_pi: false
|
|
||||||
heads: 16
|
|
||||||
width: 1024
|
|
||||||
point_feats: 4
|
|
||||||
num_decoder_layers: 16
|
|
||||||
pc_size: *pc_size
|
|
||||||
pc_sharpedge_size: *pc_sharpedge_size
|
|
||||||
qkv_bias: false
|
|
||||||
qk_norm: true
|
|
||||||
scale_factor: *z_scale_factor
|
|
||||||
geo_decoder_mlp_expand_ratio: 4
|
|
||||||
geo_decoder_downsample_ratio: 1
|
|
||||||
geo_decoder_ln_post: true
|
|
||||||
|
|
||||||
cond_stage_config:
|
|
||||||
target: hy3dshape.models.conditioner.SingleImageEncoder
|
|
||||||
params:
|
|
||||||
main_image_encoder:
|
|
||||||
type: DinoImageEncoder # dino giant
|
|
||||||
kwargs:
|
|
||||||
config:
|
|
||||||
attention_probs_dropout_prob: 0.0
|
|
||||||
drop_path_rate: 0.0
|
|
||||||
hidden_act: gelu
|
|
||||||
hidden_dropout_prob: 0.0
|
|
||||||
hidden_size: 1536
|
|
||||||
image_size: 518
|
|
||||||
initializer_range: 0.02
|
|
||||||
layer_norm_eps: 1.e-6
|
|
||||||
layerscale_value: 1.0
|
|
||||||
mlp_ratio: 4
|
|
||||||
model_type: dinov2
|
|
||||||
num_attention_heads: 24
|
|
||||||
num_channels: 3
|
|
||||||
num_hidden_layers: 40
|
|
||||||
patch_size: 14
|
|
||||||
qkv_bias: true
|
|
||||||
torch_dtype: float32
|
|
||||||
use_swiglu_ffn: true
|
|
||||||
image_size: 518
|
|
||||||
|
|
||||||
denoiser_cfg:
|
|
||||||
target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT
|
|
||||||
params:
|
|
||||||
ckpt_path: ~/.cache/hy3dgen/tencent/Hunyuan3D-2-1-Shape/dit/model.fp16.ckpt
|
|
||||||
input_size: *num_latents
|
|
||||||
context_in_dim: 1536
|
|
||||||
hidden_size: 1024
|
|
||||||
mlp_ratio: 4.0
|
|
||||||
num_heads: 16
|
|
||||||
depth: 16
|
|
||||||
depth_single_blocks: 32
|
|
||||||
axes_dim: [64]
|
|
||||||
theta: 10000
|
|
||||||
qkv_bias: true
|
|
||||||
use_pe: false
|
|
||||||
force_norm_fp32: true
|
|
||||||
|
|
||||||
scheduler_cfg:
|
|
||||||
transport:
|
|
||||||
target: hy3dshape.models.diffusion.transport.create_transport
|
|
||||||
params:
|
|
||||||
path_type: Linear
|
|
||||||
prediction: velocity
|
|
||||||
sampler:
|
|
||||||
target: hy3dshape.models.diffusion.transport.Sampler
|
|
||||||
params: {}
|
|
||||||
ode_params:
|
|
||||||
sampling_method: euler # dopri5 ...
|
|
||||||
num_steps: &num_steps 50
|
|
||||||
|
|
||||||
optimizer_cfg:
|
|
||||||
optimizer:
|
|
||||||
target: torch.optim.AdamW
|
|
||||||
params:
|
|
||||||
betas: [0.9, 0.99]
|
|
||||||
eps: 1.e-6
|
|
||||||
weight_decay: 1.e-2
|
|
||||||
|
|
||||||
scheduler:
|
|
||||||
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
|
|
||||||
params:
|
|
||||||
warm_up_steps: 50 # 5000
|
|
||||||
f_start: 1.e-6
|
|
||||||
f_min: 1.e-3
|
|
||||||
f_max: 1.0
|
|
||||||
|
|
||||||
pipeline_cfg:
|
|
||||||
target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
|
|
||||||
|
|
||||||
image_processor_cfg:
|
|
||||||
target: hy3dshape.preprocessors.ImageProcessorV2
|
|
||||||
params: {}
|
|
||||||
|
|
||||||
callbacks:
|
|
||||||
logger:
|
|
||||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
|
|
||||||
params:
|
|
||||||
step_frequency: 100 # 10000
|
|
||||||
num_samples: 1
|
|
||||||
sample_times: 1
|
|
||||||
mean: *mean
|
|
||||||
std: *std
|
|
||||||
bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
|
|
||||||
octree_depth: 8
|
|
||||||
num_chunks: 50000
|
|
||||||
mc_level: 0.0
|
|
||||||
|
|
||||||
file_loggers:
|
|
||||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
|
|
||||||
params:
|
|
||||||
step_frequency: 50 # 5000
|
|
||||||
test_data_path: "tools/mini_testset/images.json"
|
|
||||||
@@ -1,173 +0,0 @@
|
|||||||
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
|
|
||||||
|
|
||||||
training:
|
|
||||||
steps: 10_0000_0000
|
|
||||||
use_amp: true
|
|
||||||
amp_type: "bf16"
|
|
||||||
base_lr: 1e-4
|
|
||||||
gradient_clip_val: 1.0
|
|
||||||
gradient_clip_algorithm: "norm"
|
|
||||||
every_n_train_steps: 2000 # 5000
|
|
||||||
val_check_interval: 50 # 4096
|
|
||||||
limit_val_batches: 16
|
|
||||||
|
|
||||||
dataset:
|
|
||||||
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
|
|
||||||
params:
|
|
||||||
#! Base setting
|
|
||||||
batch_size: 2
|
|
||||||
num_workers: 8
|
|
||||||
val_num_workers: 4
|
|
||||||
|
|
||||||
# Data
|
|
||||||
train_data_list: tools/mini_trainset/preprocessed
|
|
||||||
val_data_list: tools/mini_trainset/preprocessed
|
|
||||||
|
|
||||||
#! Image loading
|
|
||||||
cond_stage_key: "image" # image / text / image_text
|
|
||||||
image_size: 518
|
|
||||||
mean: &mean [0.5, 0.5, 0.5]
|
|
||||||
std: &std [0.5, 0.5, 0.5]
|
|
||||||
|
|
||||||
#! Point cloud sampling
|
|
||||||
pc_size: &pc_size 10240
|
|
||||||
pc_sharpedge_size: &pc_sharpedge_size 10240
|
|
||||||
sharpedge_label: &sharpedge_label true
|
|
||||||
return_normal: true
|
|
||||||
|
|
||||||
#! Augmentation
|
|
||||||
padding: true
|
|
||||||
|
|
||||||
model:
|
|
||||||
target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
|
|
||||||
params:
|
|
||||||
first_stage_key: "surface"
|
|
||||||
cond_stage_key: "image"
|
|
||||||
scale_by_std: false
|
|
||||||
z_scale_factor: &z_scale_factor 0.9990943042622529 # 1 / 1.0009065167661184
|
|
||||||
torch_compile: false
|
|
||||||
|
|
||||||
# ema_config:
|
|
||||||
# ema_model: LitEma
|
|
||||||
# ema_decay: 0.999
|
|
||||||
# ema_inference: false
|
|
||||||
|
|
||||||
first_stage_config:
|
|
||||||
target: hy3dshape.models.autoencoders.ShapeVAE
|
|
||||||
from_pretrained: tencent/Hunyuan3D-2.1
|
|
||||||
params:
|
|
||||||
num_latents: &num_latents 512
|
|
||||||
embed_dim: 64
|
|
||||||
num_freqs: 8
|
|
||||||
include_pi: false
|
|
||||||
heads: 16
|
|
||||||
width: 1024
|
|
||||||
point_feats: 4
|
|
||||||
num_decoder_layers: 16
|
|
||||||
pc_size: *pc_size
|
|
||||||
pc_sharpedge_size: *pc_sharpedge_size
|
|
||||||
qkv_bias: false
|
|
||||||
qk_norm: true
|
|
||||||
scale_factor: *z_scale_factor
|
|
||||||
geo_decoder_mlp_expand_ratio: 4
|
|
||||||
geo_decoder_downsample_ratio: 1
|
|
||||||
geo_decoder_ln_post: true
|
|
||||||
|
|
||||||
cond_stage_config:
|
|
||||||
target: hy3dshape.models.conditioner.SingleImageEncoder
|
|
||||||
params:
|
|
||||||
main_image_encoder:
|
|
||||||
type: DinoImageEncoder # dino giant
|
|
||||||
kwargs:
|
|
||||||
config:
|
|
||||||
attention_probs_dropout_prob: 0.0
|
|
||||||
drop_path_rate: 0.0
|
|
||||||
hidden_act: gelu
|
|
||||||
hidden_dropout_prob: 0.0
|
|
||||||
hidden_size: 1536
|
|
||||||
image_size: 518
|
|
||||||
initializer_range: 0.02
|
|
||||||
layer_norm_eps: 1.e-6
|
|
||||||
layerscale_value: 1.0
|
|
||||||
mlp_ratio: 4
|
|
||||||
model_type: dinov2
|
|
||||||
num_attention_heads: 24
|
|
||||||
num_channels: 3
|
|
||||||
num_hidden_layers: 40
|
|
||||||
patch_size: 14
|
|
||||||
qkv_bias: true
|
|
||||||
torch_dtype: float32
|
|
||||||
use_swiglu_ffn: true
|
|
||||||
image_size: 518
|
|
||||||
|
|
||||||
denoiser_cfg:
|
|
||||||
target: hy3dshape.models.denoisers.hunyuan3ddit.Hunyuan3DDiT
|
|
||||||
params:
|
|
||||||
input_size: *num_latents
|
|
||||||
context_in_dim: 1536
|
|
||||||
hidden_size: 1024
|
|
||||||
mlp_ratio: 4.0
|
|
||||||
num_heads: 16
|
|
||||||
depth: 8
|
|
||||||
depth_single_blocks: 16
|
|
||||||
axes_dim: [64]
|
|
||||||
theta: 10000
|
|
||||||
qkv_bias: true
|
|
||||||
use_pe: false
|
|
||||||
force_norm_fp32: true
|
|
||||||
|
|
||||||
scheduler_cfg:
|
|
||||||
transport:
|
|
||||||
target: hy3dshape.models.diffusion.transport.create_transport
|
|
||||||
params:
|
|
||||||
path_type: Linear
|
|
||||||
prediction: velocity
|
|
||||||
sampler:
|
|
||||||
target: hy3dshape.models.diffusion.transport.Sampler
|
|
||||||
params: {}
|
|
||||||
ode_params:
|
|
||||||
sampling_method: euler # dopri5 ...
|
|
||||||
num_steps: &num_steps 50
|
|
||||||
|
|
||||||
optimizer_cfg:
|
|
||||||
optimizer:
|
|
||||||
target: torch.optim.AdamW
|
|
||||||
params:
|
|
||||||
betas: [0.9, 0.99]
|
|
||||||
eps: 1.e-6
|
|
||||||
weight_decay: 1.e-2
|
|
||||||
|
|
||||||
scheduler:
|
|
||||||
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
|
|
||||||
params:
|
|
||||||
warm_up_steps: 50 # 5000
|
|
||||||
f_start: 1.e-6
|
|
||||||
f_min: 1.e-3
|
|
||||||
f_max: 1.0
|
|
||||||
|
|
||||||
pipeline_cfg:
|
|
||||||
target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
|
|
||||||
|
|
||||||
image_processor_cfg:
|
|
||||||
target: hy3dshape.preprocessors.ImageProcessorV2
|
|
||||||
params: {}
|
|
||||||
|
|
||||||
callbacks:
|
|
||||||
logger:
|
|
||||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
|
|
||||||
params:
|
|
||||||
step_frequency: 100 # 10000
|
|
||||||
num_samples: 1
|
|
||||||
sample_times: 1
|
|
||||||
mean: *mean
|
|
||||||
std: *std
|
|
||||||
bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
|
|
||||||
octree_depth: 8
|
|
||||||
num_chunks: 50000
|
|
||||||
mc_level: 0.0
|
|
||||||
|
|
||||||
file_loggers:
|
|
||||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
|
|
||||||
params:
|
|
||||||
step_frequency: 50 # 5000
|
|
||||||
test_data_path: "tools/mini_testset/images.json"
|
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
|
name: "HunyuanDiT flowmatching; VAE: 4096 token length; ImageEncoder: DINO-v2 Large; ImageSize: 518"
|
||||||
|
# Trains successfully on 8 x H20 GPUs with 98G of memory each
|
||||||
|
|
||||||
training:
|
training:
|
||||||
steps: 10_0000_0000
|
steps: 10_0000_0000
|
||||||
@@ -8,7 +9,8 @@ training:
|
|||||||
gradient_clip_val: 1.0
|
gradient_clip_val: 1.0
|
||||||
gradient_clip_algorithm: "norm"
|
gradient_clip_algorithm: "norm"
|
||||||
every_n_train_steps: 2000 # 5000
|
every_n_train_steps: 2000 # 5000
|
||||||
val_check_interval: 50 # 4096
|
val_check_interval: 200 # 4096
|
||||||
|
# val_check_interval must be smaller than every_n_train_steps!!!
|
||||||
limit_val_batches: 16
|
limit_val_batches: 16
|
||||||
|
|
||||||
dataset:
|
dataset:
|
||||||
@@ -24,7 +26,7 @@ dataset:
|
|||||||
val_data_list: tools/mini_trainset/preprocessed
|
val_data_list: tools/mini_trainset/preprocessed
|
||||||
|
|
||||||
#! Image loading
|
#! Image loading
|
||||||
cond_stage_key: "image" # image / text / image_text
|
cond_stage_key: "image"
|
||||||
image_size: 518
|
image_size: 518
|
||||||
mean: &mean [0.5, 0.5, 0.5]
|
mean: &mean [0.5, 0.5, 0.5]
|
||||||
std: &std [0.5, 0.5, 0.5]
|
std: &std [0.5, 0.5, 0.5]
|
||||||
@@ -55,73 +57,21 @@ model:
|
|||||||
first_stage_config:
|
first_stage_config:
|
||||||
target: hy3dshape.models.autoencoders.ShapeVAE
|
target: hy3dshape.models.autoencoders.ShapeVAE
|
||||||
from_pretrained: tencent/Hunyuan3D-2.1
|
from_pretrained: tencent/Hunyuan3D-2.1
|
||||||
params:
|
|
||||||
num_latents: &num_latents 4096
|
|
||||||
embed_dim: 64
|
|
||||||
num_freqs: 8
|
|
||||||
include_pi: false
|
|
||||||
heads: 16
|
|
||||||
width: 1024
|
|
||||||
num_encoder_layers: 8
|
|
||||||
num_decoder_layers: 16
|
|
||||||
qkv_bias: false
|
|
||||||
qk_norm: true
|
|
||||||
scale_factor: *z_scale_factor
|
|
||||||
geo_decoder_mlp_expand_ratio: 4
|
|
||||||
geo_decoder_downsample_ratio: 1
|
|
||||||
geo_decoder_ln_post: true
|
|
||||||
point_feats: 4
|
|
||||||
pc_size: *pc_size
|
|
||||||
pc_sharpedge_size: *pc_sharpedge_size
|
|
||||||
|
|
||||||
cond_stage_config:
|
cond_stage_config:
|
||||||
target: hy3dshape.models.conditioner.SingleImageEncoder
|
target: hy3dshape.models.conditioner.SingleImageEncoder
|
||||||
params:
|
params:
|
||||||
|
drop_ratio: 0.1
|
||||||
main_image_encoder:
|
main_image_encoder:
|
||||||
type: DinoImageEncoder # dino large
|
type: DinoImageEncoder
|
||||||
kwargs:
|
kwargs:
|
||||||
config:
|
version: 'facebook/dinov2-large'
|
||||||
attention_probs_dropout_prob: 0.0
|
|
||||||
drop_path_rate: 0.0
|
|
||||||
hidden_act: gelu
|
|
||||||
hidden_dropout_prob: 0.0
|
|
||||||
hidden_size: 1024
|
|
||||||
image_size: 518
|
|
||||||
initializer_range: 0.02
|
|
||||||
layer_norm_eps: 1.e-6
|
|
||||||
layerscale_value: 1.0
|
|
||||||
mlp_ratio: 4
|
|
||||||
model_type: dinov2
|
|
||||||
num_attention_heads: 16
|
|
||||||
num_channels: 3
|
|
||||||
num_hidden_layers: 24
|
|
||||||
patch_size: 14
|
|
||||||
qkv_bias: true
|
|
||||||
torch_dtype: float32
|
|
||||||
use_swiglu_ffn: false
|
|
||||||
image_size: 518
|
image_size: 518
|
||||||
use_cls_token: true
|
use_cls_token: true
|
||||||
|
|
||||||
|
|
||||||
denoiser_cfg:
|
denoiser_cfg:
|
||||||
target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
|
target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
|
||||||
params:
|
from_pretrained: tencent/Hunyuan3D-2.1
|
||||||
input_size: *num_latents
|
|
||||||
in_channels: 64
|
|
||||||
hidden_size: 2048
|
|
||||||
context_dim: 1024
|
|
||||||
depth: 21
|
|
||||||
num_heads: 16
|
|
||||||
qk_norm: true
|
|
||||||
text_len: 1370
|
|
||||||
with_decoupled_ca: false
|
|
||||||
use_attention_pooling: false
|
|
||||||
qk_norm_type: 'rms'
|
|
||||||
qkv_bias: false
|
|
||||||
use_pos_emb: false
|
|
||||||
num_moe_layers: 6
|
|
||||||
num_experts: 8
|
|
||||||
moe_top_k: 2
|
|
||||||
|
|
||||||
scheduler_cfg:
|
scheduler_cfg:
|
||||||
transport:
|
transport:
|
||||||
@@ -163,7 +113,7 @@ callbacks:
|
|||||||
logger:
|
logger:
|
||||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
|
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
|
||||||
params:
|
params:
|
||||||
step_frequency: 100 # 10000
|
step_frequency: 1000 # 10000
|
||||||
num_samples: 1
|
num_samples: 1
|
||||||
sample_times: 1
|
sample_times: 1
|
||||||
mean: *mean
|
mean: *mean
|
||||||
@@ -176,5 +126,5 @@ callbacks:
|
|||||||
file_loggers:
|
file_loggers:
|
||||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
|
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
|
||||||
params:
|
params:
|
||||||
step_frequency: 50 # 5000
|
step_frequency: 500 # 5000
|
||||||
test_data_path: "tools/mini_testset/images.json"
|
test_data_path: "tools/mini_testset/images.json"
|
||||||
@@ -1,180 +0,0 @@
|
|||||||
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
|
|
||||||
|
|
||||||
training:
|
|
||||||
steps: 10_0000_0000
|
|
||||||
use_amp: true
|
|
||||||
amp_type: "bf16"
|
|
||||||
base_lr: 1e-4
|
|
||||||
gradient_clip_val: 1.0
|
|
||||||
gradient_clip_algorithm: "norm"
|
|
||||||
every_n_train_steps: 2000 # 5000
|
|
||||||
val_check_interval: 50 # 4096
|
|
||||||
limit_val_batches: 16
|
|
||||||
|
|
||||||
dataset:
|
|
||||||
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
|
|
||||||
params:
|
|
||||||
#! Base setting
|
|
||||||
batch_size: 2
|
|
||||||
num_workers: 8
|
|
||||||
val_num_workers: 4
|
|
||||||
|
|
||||||
# Data
|
|
||||||
train_data_list: tools/mini_trainset/preprocessed
|
|
||||||
val_data_list: tools/mini_trainset/preprocessed
|
|
||||||
|
|
||||||
#! Image loading
|
|
||||||
cond_stage_key: "image" # image / text / image_text
|
|
||||||
image_size: 518
|
|
||||||
mean: &mean [0.5, 0.5, 0.5]
|
|
||||||
std: &std [0.5, 0.5, 0.5]
|
|
||||||
|
|
||||||
#! Point cloud sampling
|
|
||||||
pc_size: &pc_size 81920
|
|
||||||
pc_sharpedge_size: &pc_sharpedge_size 0
|
|
||||||
sharpedge_label: &sharpedge_label true
|
|
||||||
return_normal: true
|
|
||||||
|
|
||||||
#! Augmentation
|
|
||||||
padding: true
|
|
||||||
|
|
||||||
model:
|
|
||||||
target: hy3dshape.models.diffusion.flow_matching_sit.Diffuser
|
|
||||||
params:
|
|
||||||
first_stage_key: "surface"
|
|
||||||
cond_stage_key: "image"
|
|
||||||
scale_by_std: false
|
|
||||||
z_scale_factor: &z_scale_factor 1.0039506158752403
|
|
||||||
torch_compile: false
|
|
||||||
|
|
||||||
# ema_config:
|
|
||||||
# ema_model: LitEma
|
|
||||||
# ema_decay: 0.999
|
|
||||||
# ema_inference: false
|
|
||||||
|
|
||||||
first_stage_config:
|
|
||||||
target: hy3dshape.models.autoencoders.ShapeVAE
|
|
||||||
from_pretrained: tencent/Hunyuan3D-2.1
|
|
||||||
params:
|
|
||||||
num_latents: &num_latents 4096
|
|
||||||
embed_dim: 64
|
|
||||||
num_freqs: 8
|
|
||||||
include_pi: false
|
|
||||||
heads: 16
|
|
||||||
width: 1024
|
|
||||||
num_encoder_layers: 8
|
|
||||||
num_decoder_layers: 16
|
|
||||||
qkv_bias: false
|
|
||||||
qk_norm: true
|
|
||||||
scale_factor: *z_scale_factor
|
|
||||||
geo_decoder_mlp_expand_ratio: 4
|
|
||||||
geo_decoder_downsample_ratio: 1
|
|
||||||
geo_decoder_ln_post: true
|
|
||||||
point_feats: 4
|
|
||||||
pc_size: *pc_size
|
|
||||||
pc_sharpedge_size: *pc_sharpedge_size
|
|
||||||
|
|
||||||
cond_stage_config:
|
|
||||||
target: hy3dshape.models.conditioner.SingleImageEncoder
|
|
||||||
params:
|
|
||||||
main_image_encoder:
|
|
||||||
type: DinoImageEncoder # dino large
|
|
||||||
kwargs:
|
|
||||||
config:
|
|
||||||
attention_probs_dropout_prob: 0.0
|
|
||||||
drop_path_rate: 0.0
|
|
||||||
hidden_act: gelu
|
|
||||||
hidden_dropout_prob: 0.0
|
|
||||||
hidden_size: 1024
|
|
||||||
image_size: 518
|
|
||||||
initializer_range: 0.02
|
|
||||||
layer_norm_eps: 1.e-6
|
|
||||||
layerscale_value: 1.0
|
|
||||||
mlp_ratio: 4
|
|
||||||
model_type: dinov2
|
|
||||||
num_attention_heads: 16
|
|
||||||
num_channels: 3
|
|
||||||
num_hidden_layers: 24
|
|
||||||
patch_size: 14
|
|
||||||
qkv_bias: true
|
|
||||||
torch_dtype: float32
|
|
||||||
use_swiglu_ffn: false
|
|
||||||
image_size: 518
|
|
||||||
use_cls_token: true
|
|
||||||
|
|
||||||
|
|
||||||
denoiser_cfg:
|
|
||||||
target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
|
|
||||||
params:
|
|
||||||
input_size: *num_latents
|
|
||||||
in_channels: 64
|
|
||||||
hidden_size: 2048
|
|
||||||
context_dim: 1024
|
|
||||||
depth: 11
|
|
||||||
num_heads: 16
|
|
||||||
qk_norm: true
|
|
||||||
text_len: 1370
|
|
||||||
with_decoupled_ca: false
|
|
||||||
use_attention_pooling: false
|
|
||||||
qk_norm_type: 'rms'
|
|
||||||
qkv_bias: false
|
|
||||||
use_pos_emb: false
|
|
||||||
num_moe_layers: 6
|
|
||||||
num_experts: 8
|
|
||||||
moe_top_k: 2
|
|
||||||
|
|
||||||
scheduler_cfg:
|
|
||||||
transport:
|
|
||||||
target: hy3dshape.models.diffusion.transport.create_transport
|
|
||||||
params:
|
|
||||||
path_type: Linear
|
|
||||||
prediction: velocity
|
|
||||||
sampler:
|
|
||||||
target: hy3dshape.models.diffusion.transport.Sampler
|
|
||||||
params: {}
|
|
||||||
ode_params:
|
|
||||||
sampling_method: euler # dopri5 ...
|
|
||||||
num_steps: &num_steps 50
|
|
||||||
|
|
||||||
optimizer_cfg:
|
|
||||||
optimizer:
|
|
||||||
target: torch.optim.AdamW
|
|
||||||
params:
|
|
||||||
betas: [0.9, 0.99]
|
|
||||||
eps: 1.e-6
|
|
||||||
weight_decay: 1.e-2
|
|
||||||
|
|
||||||
scheduler:
|
|
||||||
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
|
|
||||||
params:
|
|
||||||
warm_up_steps: 50 # 5000
|
|
||||||
f_start: 1.e-6
|
|
||||||
f_min: 1.e-3
|
|
||||||
f_max: 1.0
|
|
||||||
|
|
||||||
pipeline_cfg:
|
|
||||||
target: hy3dshape.pipelines.Hunyuan3DDiTFlowMatchingPipeline
|
|
||||||
|
|
||||||
image_processor_cfg:
|
|
||||||
target: hy3dshape.preprocessors.ImageProcessorV2
|
|
||||||
params: {}
|
|
||||||
|
|
||||||
callbacks:
|
|
||||||
logger:
|
|
||||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
|
|
||||||
params:
|
|
||||||
step_frequency: 100 # 10000
|
|
||||||
num_samples: 1
|
|
||||||
sample_times: 1
|
|
||||||
mean: *mean
|
|
||||||
std: *std
|
|
||||||
bounds: [-1.01, -1.01, -1.01, 1.01, 1.01, 1.01]
|
|
||||||
octree_depth: 8
|
|
||||||
num_chunks: 50000
|
|
||||||
mc_level: 0.0
|
|
||||||
|
|
||||||
file_loggers:
|
|
||||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
|
|
||||||
params:
|
|
||||||
step_frequency: 50 # 5000
|
|
||||||
test_data_path: "tools/mini_testset/images.json"
|
|
||||||
@@ -1,4 +1,6 @@
|
|||||||
name: "DiT: Flux large flowmatching; VAE: 1024 token length; ImageEncoder: DINO Giant; ImageSize: 518"
|
name: "HunyuanDiT flowmatching; VAE: 4096 token length; ImageEncoder: DINO-v2 Large; ImageSize: 518"
|
||||||
|
# Overfitting runs successfully and costs about 68G of GPU memory under the current settings
|
||||||
|
# You can adjust the model architecture or batch_size to fit your GPU memory
|
||||||
|
|
||||||
training:
|
training:
|
||||||
steps: 10_0000_0000
|
steps: 10_0000_0000
|
||||||
@@ -8,14 +10,15 @@ training:
|
|||||||
gradient_clip_val: 1.0
|
gradient_clip_val: 1.0
|
||||||
gradient_clip_algorithm: "norm"
|
gradient_clip_algorithm: "norm"
|
||||||
every_n_train_steps: 2000 # 5000
|
every_n_train_steps: 2000 # 5000
|
||||||
val_check_interval: 50 # 4096
|
val_check_interval: 200 # 4096
|
||||||
|
# val_check_interval must be smaller than every_n_train_steps!!!
|
||||||
limit_val_batches: 16
|
limit_val_batches: 16
|
||||||
|
|
||||||
dataset:
|
dataset:
|
||||||
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
|
target: hy3dshape.data.dit_asl.AlignedShapeLatentModule
|
||||||
params:
|
params:
|
||||||
#! Base setting
|
#! Base setting
|
||||||
batch_size: 2
|
batch_size: 4
|
||||||
num_workers: 8
|
num_workers: 8
|
||||||
val_num_workers: 4
|
val_num_workers: 4
|
||||||
|
|
||||||
@@ -24,7 +27,7 @@ dataset:
|
|||||||
val_data_list: tools/mini_trainset/preprocessed
|
val_data_list: tools/mini_trainset/preprocessed
|
||||||
|
|
||||||
#! Image loading
|
#! Image loading
|
||||||
cond_stage_key: "image" # image / text / image_text
|
cond_stage_key: "image"
|
||||||
image_size: 518
|
image_size: 518
|
||||||
mean: &mean [0.5, 0.5, 0.5]
|
mean: &mean [0.5, 0.5, 0.5]
|
||||||
std: &std [0.5, 0.5, 0.5]
|
std: &std [0.5, 0.5, 0.5]
|
||||||
@@ -55,63 +58,27 @@ model:
|
|||||||
first_stage_config:
|
first_stage_config:
|
||||||
target: hy3dshape.models.autoencoders.ShapeVAE
|
target: hy3dshape.models.autoencoders.ShapeVAE
|
||||||
from_pretrained: tencent/Hunyuan3D-2.1
|
from_pretrained: tencent/Hunyuan3D-2.1
|
||||||
params:
|
|
||||||
num_latents: &num_latents 512
|
|
||||||
embed_dim: 64
|
|
||||||
num_freqs: 8
|
|
||||||
include_pi: false
|
|
||||||
heads: 16
|
|
||||||
width: 1024
|
|
||||||
num_encoder_layers: 8
|
|
||||||
num_decoder_layers: 16
|
|
||||||
qkv_bias: false
|
|
||||||
qk_norm: true
|
|
||||||
scale_factor: *z_scale_factor
|
|
||||||
geo_decoder_mlp_expand_ratio: 4
|
|
||||||
geo_decoder_downsample_ratio: 1
|
|
||||||
geo_decoder_ln_post: true
|
|
||||||
point_feats: 4
|
|
||||||
pc_size: *pc_size
|
|
||||||
pc_sharpedge_size: *pc_sharpedge_size
|
|
||||||
|
|
||||||
cond_stage_config:
|
cond_stage_config:
|
||||||
target: hy3dshape.models.conditioner.SingleImageEncoder
|
target: hy3dshape.models.conditioner.SingleImageEncoder
|
||||||
params:
|
params:
|
||||||
|
drop_ratio: 0.1
|
||||||
main_image_encoder:
|
main_image_encoder:
|
||||||
type: DinoImageEncoder # dino large
|
type: DinoImageEncoder
|
||||||
kwargs:
|
kwargs:
|
||||||
config:
|
version: 'facebook/dinov2-large'
|
||||||
attention_probs_dropout_prob: 0.0
|
|
||||||
drop_path_rate: 0.0
|
|
||||||
hidden_act: gelu
|
|
||||||
hidden_dropout_prob: 0.0
|
|
||||||
hidden_size: 1024
|
|
||||||
image_size: 518
|
|
||||||
initializer_range: 0.02
|
|
||||||
layer_norm_eps: 1.e-6
|
|
||||||
layerscale_value: 1.0
|
|
||||||
mlp_ratio: 4
|
|
||||||
model_type: dinov2
|
|
||||||
num_attention_heads: 16
|
|
||||||
num_channels: 3
|
|
||||||
num_hidden_layers: 24
|
|
||||||
patch_size: 14
|
|
||||||
qkv_bias: true
|
|
||||||
torch_dtype: float32
|
|
||||||
use_swiglu_ffn: false
|
|
||||||
image_size: 518
|
image_size: 518
|
||||||
use_cls_token: true
|
use_cls_token: true
|
||||||
|
|
||||||
|
|
||||||
denoiser_cfg:
|
denoiser_cfg:
|
||||||
target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
|
target: hy3dshape.models.denoisers.hunyuandit.HunYuanDiTPlain
|
||||||
params:
|
params:
|
||||||
input_size: *num_latents
|
input_size: 4096
|
||||||
in_channels: 64
|
in_channels: 64
|
||||||
hidden_size: 768
|
hidden_size: 2048
|
||||||
context_dim: 1024
|
context_dim: 1024
|
||||||
depth: 6
|
depth: 16
|
||||||
num_heads: 12
|
num_heads: 16
|
||||||
qk_norm: true
|
qk_norm: true
|
||||||
text_len: 1370
|
text_len: 1370
|
||||||
with_decoupled_ca: false
|
with_decoupled_ca: false
|
||||||
@@ -147,7 +114,7 @@ model:
|
|||||||
scheduler:
|
scheduler:
|
||||||
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
|
target: hy3dshape.utils.trainings.lr_scheduler.LambdaWarmUpCosineFactorScheduler
|
||||||
params:
|
params:
|
||||||
warm_up_steps: 50 # 5000
|
warm_up_steps: 500 # 5000
|
||||||
f_start: 1.e-6
|
f_start: 1.e-6
|
||||||
f_min: 1.e-3
|
f_min: 1.e-3
|
||||||
f_max: 1.0
|
f_max: 1.0
|
||||||
@@ -163,7 +130,7 @@ callbacks:
|
|||||||
logger:
|
logger:
|
||||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
|
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalASLDiffuserLogger
|
||||||
params:
|
params:
|
||||||
step_frequency: 100 # 10000
|
step_frequency: 1000 # 10000
|
||||||
num_samples: 1
|
num_samples: 1
|
||||||
sample_times: 1
|
sample_times: 1
|
||||||
mean: *mean
|
mean: *mean
|
||||||
@@ -176,5 +143,5 @@ callbacks:
|
|||||||
file_loggers:
|
file_loggers:
|
||||||
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
|
target: hy3dshape.utils.trainings.mesh_log_callback.ImageConditionalFixASLDiffuserLogger
|
||||||
params:
|
params:
|
||||||
step_frequency: 50 # 5000
|
step_frequency: 500 # 5000
|
||||||
test_data_path: "tools/mini_testset/images.json"
|
test_data_path: "tools/mini_testset/images.json"
|
||||||
@@ -548,7 +548,7 @@ class PointCrossAttentionEncoder(nn.Module):
|
|||||||
|
|
||||||
if pc_sharpedge_size == 0:
|
if pc_sharpedge_size == 0:
|
||||||
print(
|
print(
|
||||||
f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is not given, using pc_size as pc_sharpedge_size')
|
f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is zero')
|
||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is given, using pc_size={pc_size}, pc_sharpedge_size={pc_sharpedge_size}')
|
f'PointCrossAttentionEncoder INFO: pc_sharpedge_size is given, using pc_size={pc_size}, pc_sharpedge_size={pc_sharpedge_size}')
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ from transformers import (
|
|||||||
Dinov2Model,
|
Dinov2Model,
|
||||||
Dinov2Config,
|
Dinov2Config,
|
||||||
)
|
)
|
||||||
|
from transformers import AutoImageProcessor, AutoModel
|
||||||
|
|
||||||
|
|
||||||
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
|
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
|
||||||
@@ -66,9 +67,10 @@ class ImageEncoder(nn.Module):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
if config is None:
|
if config is None:
|
||||||
self.model = self.MODEL_CLASS.from_pretrained(version)
|
self.model = AutoModel.from_pretrained(version)
|
||||||
else:
|
else:
|
||||||
self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config))
|
self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config))
|
||||||
|
|
||||||
self.model.eval()
|
self.model.eval()
|
||||||
self.model.requires_grad_(False)
|
self.model.requires_grad_(False)
|
||||||
self.use_cls_token = use_cls_token
|
self.use_cls_token = use_cls_token
|
||||||
@@ -240,11 +242,26 @@ class SingleImageEncoder(nn.Module):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
main_image_encoder,
|
main_image_encoder,
|
||||||
|
drop_ratio=0.0
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.main_image_encoder = build_image_encoder(main_image_encoder)
|
self.main_image_encoder = build_image_encoder(main_image_encoder)
|
||||||
|
self.drop_ratio = drop_ratio
|
||||||
|
self.disable_drop = True
|
||||||
|
|
||||||
def forward(self, image, mask=None, **kwargs):
|
def forward(self, image, mask=None, **kwargs):
|
||||||
|
outputs = {
|
||||||
|
'main': self.main_image_encoder(image, mask=mask, **kwargs),
|
||||||
|
}
|
||||||
|
if self.disable_drop:
|
||||||
|
return outputs
|
||||||
|
else:
|
||||||
|
random_p = torch.rand(len(image), device='cuda')
|
||||||
|
remain_bool_tensor = random_p > self.drop_ratio
|
||||||
|
outputs['main'] *= remain_bool_tensor.view(-1,1,1)
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
|
||||||
outputs = {
|
outputs = {
|
||||||
'main': self.main_image_encoder(image, mask=mask, **kwargs),
|
'main': self.main_image_encoder(image, mask=mask, **kwargs),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,6 +22,8 @@
|
|||||||
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
||||||
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
||||||
|
|
||||||
|
import os
|
||||||
|
import yaml
|
||||||
import math
|
import math
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -31,6 +33,7 @@ import torch.nn.functional as F
|
|||||||
from einops import rearrange
|
from einops import rearrange
|
||||||
|
|
||||||
from .moe_layers import MoEBlock
|
from .moe_layers import MoEBlock
|
||||||
|
from ...utils import logger, synchronize_timer, smart_load_model
|
||||||
|
|
||||||
|
|
||||||
def modulate(x, shift, scale):
|
def modulate(x, shift, scale):
|
||||||
@@ -464,6 +467,74 @@ class FinalLayer(nn.Module):
|
|||||||
|
|
||||||
class HunYuanDiTPlain(nn.Module):
|
class HunYuanDiTPlain(nn.Module):
|
||||||
|
|
||||||
|
@classmethod
@synchronize_timer('HunYuanDiTPlain Model Loading')
def from_single_file(
    cls,
    ckpt_path,
    config_path,
    device='cuda',
    dtype=torch.float16,
    use_safetensors=None,
    **kwargs,
):
    """Instantiate the model from a YAML config and load weights from one checkpoint file.

    Args:
        ckpt_path: Path to the checkpoint file. When ``use_safetensors`` is truthy
            and the file name ends in ``.ckpt``, the extension is swapped for
            ``.safetensors`` before loading.
        config_path: Path to a YAML file. If it has a top-level ``model`` key, that
            sub-mapping is used; its ``params`` entry supplies the constructor kwargs.
        device: Device the constructed model is moved to.
        dtype: Dtype the constructed model is cast to.
        use_safetensors: If truthy, load with ``safetensors.torch.load_file``;
            otherwise load with ``torch.load(..., weights_only=True)``.
        **kwargs: Extra constructor arguments; they override entries from the config.

    Returns:
        The constructed model with the checkpoint weights loaded.

    Raises:
        FileNotFoundError: If the resolved checkpoint path does not exist.
    """
    # load config
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)

    # load ckpt
    if use_safetensors:
        # Swap only the file extension. A blanket str.replace('.ckpt', ...) would
        # also rewrite directory components such as 'ckpt-step=00004000.ckpt/',
        # producing a path that does not exist.
        stem, ext = os.path.splitext(ckpt_path)
        if ext == '.ckpt':
            ckpt_path = stem + '.safetensors'
    if not os.path.exists(ckpt_path):
        raise FileNotFoundError(f"Model file {ckpt_path} not found")

    logger.info(f"Loading model from {ckpt_path}")
    if use_safetensors:
        # Imported lazily so safetensors is only required when actually used.
        import safetensors.torch
        ckpt = safetensors.torch.load_file(ckpt_path, device='cpu')
    else:
        ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True)

    # Unwrap the nested 'model' entry when the checkpoint / config carry one.
    if 'model' in ckpt:
        ckpt = ckpt['model']
    if 'model' in config:
        config = config['model']

    model_kwargs = config['params']
    model_kwargs.update(kwargs)  # caller-supplied kwargs win over the config

    model = cls(**model_kwargs)
    model.load_state_dict(ckpt)
    model.to(device=device, dtype=dtype)
    return model
|
||||||
|
|
||||||
|
@classmethod
def from_pretrained(
    cls,
    model_path,
    device='cuda',
    dtype=torch.float16,
    use_safetensors=False,
    variant='fp16',
    subfolder='hunyuan3d-dit-v2-1',
    **kwargs,
):
    """Resolve a pretrained model location and load it via ``from_single_file``.

    Args:
        model_path: Model identifier or path handed to ``smart_load_model``.
        device: Device forwarded to ``from_single_file``.
        dtype: Dtype forwarded to ``from_single_file``.
        use_safetensors: Forwarded both to ``smart_load_model`` and to
            ``from_single_file``.
        variant: Forwarded to ``smart_load_model``.
        subfolder: Forwarded to ``smart_load_model``.
        **kwargs: Passed through unchanged to ``from_single_file``.

    Returns:
        Whatever ``from_single_file`` returns for the resolved paths.
    """
    # smart_load_model yields the pair (config_path, ckpt_path), in that order.
    resolved = smart_load_model(
        model_path,
        subfolder=subfolder,
        use_safetensors=use_safetensors,
        variant=variant,
    )
    config_path, ckpt_path = resolved

    load_options = dict(
        device=device,
        dtype=dtype,
        use_safetensors=use_safetensors,
    )
    return cls.from_single_file(ckpt_path, config_path, **load_options, **kwargs)
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
input_size=1024,
|
input_size=1024,
|
||||||
|
|||||||
@@ -256,10 +256,7 @@ class Diffuser(pl.LightningModule):
|
|||||||
def forward(self, batch):
|
def forward(self, batch):
|
||||||
with torch.autocast(device_type="cuda", dtype=torch.bfloat16): #float32 for text
|
with torch.autocast(device_type="cuda", dtype=torch.bfloat16): #float32 for text
|
||||||
contexts = self.cond_stage_model(image=batch.get('image'), text=batch.get('text'), mask=batch.get('mask'))
|
contexts = self.cond_stage_model(image=batch.get('image'), text=batch.get('text'), mask=batch.get('mask'))
|
||||||
# t5_text = contexts['t5_text']['prompt_embeds']
|
|
||||||
# nan_count = torch.isnan(t5_text).sum()
|
|
||||||
# if nan_count > 0:
|
|
||||||
# print("t5_text has %d NaN values"%(nan_count))
|
|
||||||
with torch.autocast(device_type="cuda", dtype=torch.float16):
|
with torch.autocast(device_type="cuda", dtype=torch.float16):
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
latents = self.first_stage_model.encode(batch[self.first_stage_key], sample_posterior=True)
|
latents = self.first_stage_model.encode(batch[self.first_stage_key], sample_posterior=True)
|
||||||
@@ -333,9 +330,6 @@ class Diffuser(pl.LightningModule):
|
|||||||
image = batch.get("image", None)
|
image = batch.get("image", None)
|
||||||
mask = batch.get('mask', None)
|
mask = batch.get('mask', None)
|
||||||
|
|
||||||
# if not isinstance(image, torch.Tensor): print(image.shape)
|
|
||||||
# if isinstance(mask, torch.Tensor): print(mask.shape)
|
|
||||||
|
|
||||||
outputs = self.pipeline(image=image,
|
outputs = self.pipeline(image=image,
|
||||||
mask=mask,
|
mask=mask,
|
||||||
generator=generator,
|
generator=generator,
|
||||||
@@ -350,5 +344,6 @@ class Diffuser(pl.LightningModule):
|
|||||||
f.write(traceback.format_exc())
|
f.write(traceback.format_exc())
|
||||||
f.write("\n")
|
f.write("\n")
|
||||||
outputs = [None]
|
outputs = [None]
|
||||||
|
|
||||||
self.cond_stage_model.disable_drop = False
|
self.cond_stage_model.disable_drop = False
|
||||||
return [outputs]
|
return [outputs]
|
||||||
|
|||||||
@@ -323,7 +323,9 @@ class ImageConditionalFixASLDiffuserLogger(Callback):
|
|||||||
save_path = os.path.join(visual_dir, os.path.basename(image_path))
|
save_path = os.path.join(visual_dir, os.path.basename(image_path))
|
||||||
save_path = os.path.splitext(save_path)[0] + '.glb'
|
save_path = os.path.splitext(save_path)[0] + '.glb'
|
||||||
|
|
||||||
|
if isinstance(image_path, str):
|
||||||
print(image_path)
|
print(image_path)
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
mesh = pl_module.sample(batch={"image": image_path}, **self.kwargs)[0][0]
|
mesh = pl_module.sample(batch={"image": image_path}, **self.kwargs)[0][0]
|
||||||
if isinstance(mesh, tuple) and len(mesh)==2:
|
if isinstance(mesh, tuple) and len(mesh)==2:
|
||||||
|
|||||||
@@ -190,7 +190,7 @@ if __name__ == "__main__":
|
|||||||
precision=amp_type,
|
precision=amp_type,
|
||||||
callbacks=callbacks,
|
callbacks=callbacks,
|
||||||
accelerator="gpu",
|
accelerator="gpu",
|
||||||
devices=training_cfg.num_gpus,
|
devices=args.num_gpus,
|
||||||
num_nodes=training_cfg.num_nodes,
|
num_nodes=training_cfg.num_nodes,
|
||||||
strategy=ddp_strategy,
|
strategy=ddp_strategy,
|
||||||
gradient_clip_val=training_cfg.get('gradient_clip_val'),
|
gradient_clip_val=training_cfg.get('gradient_clip_val'),
|
||||||
|
|||||||
@@ -13,7 +13,6 @@
|
|||||||
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from hy3dshape.rembg import BackgroundRemover
|
from hy3dshape.rembg import BackgroundRemover
|
||||||
from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline
|
from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline
|
||||||
|
|
||||||
@@ -21,10 +20,12 @@ model_path = 'tencent/Hunyuan3D-2.1'
|
|||||||
pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path)
|
pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path)
|
||||||
|
|
||||||
image_path = 'demos/demo.png'
|
image_path = 'demos/demo.png'
|
||||||
|
|
||||||
image = Image.open(image_path).convert("RGBA")
|
image = Image.open(image_path).convert("RGBA")
|
||||||
if image.mode == 'RGB':
|
if image.mode == 'RGB':
|
||||||
rembg = BackgroundRemover()
|
rembg = BackgroundRemover()
|
||||||
image = rembg(image)
|
image = rembg(image)
|
||||||
|
|
||||||
|
image = image_path
|
||||||
mesh = pipeline_shapegen(image=image)[0]
|
mesh = pipeline_shapegen(image=image)[0]
|
||||||
mesh.export('demo.glb')
|
mesh.export('demo.glb')
|
||||||
|
|||||||
51
hy3dshape/minimal_demo_with_ckpt.py
Normal file
51
hy3dshape/minimal_demo_with_ckpt.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
|
||||||
|
# except for the third-party components listed below.
|
||||||
|
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
|
||||||
|
# in the respective licenses of these third-party components.
|
||||||
|
# Users must comply with all terms and conditions of original licenses of these third-party
|
||||||
|
# components and must ensure that the usage of the third party components adheres to
|
||||||
|
# all relevant laws and regulations.
|
||||||
|
|
||||||
|
# For avoidance of doubts, Hunyuan 3D means the large language models and
|
||||||
|
# their software and algorithms, including trained model weights, parameters (including
|
||||||
|
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
|
||||||
|
# fine-tuning enabling code and other elements of the foregoing made publicly available
|
||||||
|
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
||||||
|
|
||||||
|
import torch
import yaml
from PIL import Image

from hy3dshape.rembg import BackgroundRemover
from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline
from hy3dshape.utils import instantiate_from_config

# Demo: run the released shape-generation pipeline, but with the denoiser swapped
# for one loaded from a locally trained checkpoint.

model_path = 'tencent/Hunyuan3D-2.1'
pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path)

# For example, you can convert deepspeed weights to a single file
# cd output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00004000.ckpt
# python3 zero_to_fp32.py ./ ./out --max_shard_size 30GB
# then you can get output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00004000.ckpt/out/pytorch_model.bin
# NOTE(review): the config below lives under '..._lr1e4_uc' while the checkpoint
# lives under '..._lr1e4' -- confirm both belong to the same training run.
ckpt_cfg_path = 'output_folder/dit/overfitting_depth_16_token_4096_lr1e4_uc/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml'
ckpt_path = 'output_folder/dit/overfitting_depth_16_token_4096_lr1e4/ckpt/ckpt-step=00004000.ckpt/out/pytorch_model.bin'

# Use a context manager so the config file handle is closed deterministically.
with open(ckpt_cfg_path, 'r') as cfg_file:
    config = yaml.safe_load(cfg_file)
model = instantiate_from_config(config['model']['params']['denoiser_cfg'])

# Load onto CPU first and restrict unpickling to tensors/containers; the model is
# moved to the GPU explicitly below.
sd = torch.load(ckpt_path, map_location='cpu', weights_only=True)
# Drop the '_forward_module.model.' wrapper prefix from the key names so they
# match the bare denoiser's parameter names.
sd = {k.replace('_forward_module.model.', ''): v for k, v in sd.items()}
msg = model.load_state_dict(sd)
print(msg)
model = model.cuda().half()
pipeline_shapegen.model = model

# The pipeline is called with an image path here rather than a PIL image.
image = 'tools/mini_testset/images/015.png'

# image = Image.open(image_path).convert("RGBA")
# if image.mode == 'RGB':
#     rembg = BackgroundRemover()
#     image = rembg(image)

# mesh = pipeline_shapegen(image=image, guidance_scale=1.0)[0]
mesh = pipeline_shapegen(image=image)[0]
mesh.export('demo.glb')
|
||||||
@@ -35,12 +35,11 @@ export NCCL_DEBUG=WARN
|
|||||||
|
|
||||||
node_num=$1
|
node_num=$1
|
||||||
node_rank=$2
|
node_rank=$2
|
||||||
master_ip=$3
|
num_gpu_per_node=$3
|
||||||
config=$4
|
master_ip=$4
|
||||||
output_dir=$5
|
config=$5
|
||||||
|
output_dir=$6
|
||||||
|
|
||||||
# config='configs/dit-from-scratch-overfitting-flowmatching-dinog518-bf16-lr1e4-1024.yaml'
|
|
||||||
# output_dir='output_folder/dit/overfitting_10'
|
|
||||||
|
|
||||||
echo node_num $node_num
|
echo node_num $node_num
|
||||||
echo node_rank $node_rank
|
echo node_rank $node_rank
|
||||||
@@ -64,7 +63,8 @@ NCCL_IB_GID_INDEX=3 \
|
|||||||
NCCL_NVLS_ENABLE=0 \
|
NCCL_NVLS_ENABLE=0 \
|
||||||
python3 main.py \
|
python3 main.py \
|
||||||
--num_nodes $node_num \
|
--num_nodes $node_num \
|
||||||
--num_gpus 8 \
|
--num_gpus $num_gpu_per_node \
|
||||||
--config $config \
|
--config $config \
|
||||||
--output_dir $output_dir \
|
--output_dir $output_dir \
|
||||||
--deepspeed
|
--deepspeed
|
||||||
|
|
||||||
|
|||||||
15
hy3dshape/train_demo.sh
Normal file
15
hy3dshape/train_demo.sh
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# Single-node demo launcher for scripts/train_deepspeed.sh.
# Edit the values below, then run this script from the hy3dshape directory.

# GPUs visible to the job and how many of them each node uses.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export num_gpu_per_node=8
# export CUDA_VISIBLE_DEVICES=0
# export num_gpu_per_node=1

export node_num=1
export node_rank=0
export master_ip=0.0.0.0 # set your master_ip

# export config=configs/hunyuandit-finetuning-flowmatching-dinol518-bf16-lr1e5-4096.yaml
# export output_dir=output_folder/dit/fintuning_lr1e5
export config=configs/hunyuandit-mini-overfitting-flowmatching-dinol518-bf16-lr1e4-4096.yaml
export output_dir=output_folder/dit/overfitting_depth_16_token_4096_lr1e4

# Positional order expected by train_deepspeed.sh:
#   node_num node_rank num_gpu_per_node master_ip config output_dir
# Quote every expansion so a path containing spaces (or an empty value) cannot
# shift the positional arguments.
bash scripts/train_deepspeed.sh "$node_num" "$node_rank" "$num_gpu_per_node" "$master_ip" "$config" "$output_dir"
|
||||||
Reference in New Issue
Block a user