diff --git a/gradio_app.py b/gradio_app.py index c111320..3b767b4 100644 --- a/gradio_app.py +++ b/gradio_app.py @@ -49,6 +49,11 @@ import numpy as np from hy3dshape.utils import logger from hy3dpaint.convert_utils import create_glb_with_pbr_materials +# Globals for lazy load/unload +i23d_worker = None +tex_pipeline = None +tex_conf = None + MAX_SEED = 1e7 ENV = "Local" # "Huggingface" @@ -220,36 +225,45 @@ height="{height}" width="100%" frameborder="0">' # on next request — no CPU intermediate, VRAM freed immediately. # --------------------------------------------------------------------------- -def _unload_i23d_worker(): - """Delete shape model from GPU and free VRAM.""" +# --------------------------------------------------------------------------- +# VRAM management helpers (used when --low_vram_mode is set) +# +# Strategy: +# i23d (shape model, ~7.25GB VRAM): +# → kept in system RAM between requests via .to('cpu')/.to('cuda') +# → no disk re-read, no OOM on reload, fast GPU↔CPU switch +# +# tex_pipeline (texture model, ~6.59GB VRAM): +# → fully del'd after each use (no CPU copy kept) +# → reloaded from cached weights on next texture request +# → tex config/weights are HF-cached so reload is fast (~20s) +# +# This ensures the two models never simultaneously occupy VRAM. +# --------------------------------------------------------------------------- + +def _offload_i23d_to_cpu(): + """Move shape model tensors to CPU RAM, freeing its VRAM.""" global i23d_worker - del i23d_worker - i23d_worker = None - gc.collect() - torch.cuda.empty_cache() + if i23d_worker is not None: + i23d_worker.to('cpu') + torch.cuda.empty_cache() -def _ensure_i23d_worker(): - """Reload shape model to GPU if it was previously unloaded.""" +def _restore_i23d_to_gpu(): + """Move shape model tensors back to GPU from CPU RAM.""" global i23d_worker - if i23d_worker is None: - from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline - logger.info("Reloading shape model to GPU...") - i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( - args.model_path, - subfolder=args.subfolder, - use_safetensors=False, - device=args.device, - ) + if i23d_worker is not None: + i23d_worker.to(args.device) def _unload_tex_pipeline(): - """Delete texture pipeline from GPU and free VRAM.""" + """Delete texture pipeline entirely, freeing its VRAM.""" global tex_pipeline - del tex_pipeline - tex_pipeline = None - gc.collect() - torch.cuda.empty_cache() + if tex_pipeline is not None: + del tex_pipeline + tex_pipeline = None + gc.collect() + torch.cuda.empty_cache() def _ensure_tex_pipeline(): @@ -346,7 +360,7 @@ def _gen_shape( start_time = time.time() if args.low_vram_mode: - _ensure_i23d_worker() + _restore_i23d_to_gpu() generator = torch.Generator() generator = generator.manual_seed(int(seed)) @@ -430,10 +444,10 @@ def generation_all( text_path = os.path.join(save_folder, f'textured_mesh.obj') - # In low_vram_mode: unload shape model entirely (del, no CPU copy) to free VRAM, - # then load texture pipeline on demand. Shape model reloads lazily on next request. + # In low_vram_mode: move shape model to CPU RAM (keeps weights in RAM, no disk re-read), + # then load texture pipeline on demand. Shape model stays in RAM to avoid OOM on reload. if args.low_vram_mode: - _unload_i23d_worker() + _offload_i23d_to_cpu() _ensure_tex_pipeline() path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False)