diff --git a/gradio_app.py b/gradio_app.py
index 3b767b4..e03ecbb 100644
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -220,49 +220,63 @@ height="{height}" width="100%" frameborder="0">'
 """
 
 # ---------------------------------------------------------------------------
-# VRAM management helpers (used when --low_vram_mode is set)
-# Models are unloaded (del'd) before the other model runs, then reloaded
-# on next request — no CPU intermediate, VRAM freed immediately.
-# ---------------------------------------------------------------------------
-
 # ---------------------------------------------------------------------------
 # VRAM management helpers (used when --low_vram_mode is set)
 #
-# Strategy:
+# Strategy (RTX 3080, 16GB RAM / 20GB VRAM):
+# Both models are fully deleted between uses — neither is kept in CPU RAM.
+# This prevents the 16GB RAM from being exhausted when loading one model
+# while the other is still resident in CPU memory.
+#
 # i23d (shape model, ~7.25GB VRAM):
-# → kept in system RAM between requests via .to('cpu')/.to('cuda')
-# → no disk re-read, no OOM on reload, fast GPU↔CPU switch
+# → fully del'd after each shape generation
+# → reloaded from HF-cached weights on next shape request (~20-30s)
 #
 # tex_pipeline (texture model, ~6.59GB VRAM):
-# → fully del'd after each use (no CPU copy kept)
-# → reloaded from cached weights on next texture request
-# → tex config/weights are HF-cached so reload is fast (~20s)
+# → fully del'd after each texture generation
+# → reloaded from HF-cached weights on next texture request (~20s)
 #
-# This ensures the two models never simultaneously occupy VRAM.
+# gc.collect() + empty_cache() before each load ensures previous tensors
+# are freed before the new checkpoint is staged in CPU RAM.
 # ---------------------------------------------------------------------------
 
-def _offload_i23d_to_cpu():
-    """Move shape model tensors to CPU RAM, freeing its VRAM."""
+def _unload_i23d_worker():
+    """Delete shape model entirely, freeing VRAM and CPU RAM."""
     global i23d_worker
     if i23d_worker is not None:
-        i23d_worker.to('cpu')
+        logger.info("Unloading shape model from memory...")
+        del i23d_worker
+        i23d_worker = None
+        gc.collect()
+        gc.collect()
         torch.cuda.empty_cache()
 
 
-def _restore_i23d_to_gpu():
-    """Move shape model tensors back to GPU from CPU RAM."""
+def _ensure_i23d_worker():
+    """Load shape model to GPU if not already loaded."""
     global i23d_worker
-    if i23d_worker is not None:
-        i23d_worker.to(args.device)
+    if i23d_worker is None:
+        logger.info("Reloading shape model to GPU...")
+        gc.collect()
+        torch.cuda.empty_cache()
+        from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline
+        i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
+            args.model_path,
+            subfolder=args.subfolder,
+            use_safetensors=False,
+            device=args.device,
+        )
 
 
 def _unload_tex_pipeline():
     """Delete texture pipeline entirely, freeing its VRAM."""
     global tex_pipeline
     if tex_pipeline is not None:
+        logger.info("Unloading texture pipeline from memory...")
         del tex_pipeline
         tex_pipeline = None
         gc.collect()
+        gc.collect()
         torch.cuda.empty_cache()
 
 
@@ -270,6 +284,8 @@ def _ensure_tex_pipeline():
     """Load texture pipeline to GPU if not already loaded."""
     global tex_pipeline
     if tex_pipeline is None and tex_conf is not None:
+        gc.collect()
+        torch.cuda.empty_cache()
         from hy3dpaint.textureGenPipeline import Hunyuan3DPaintPipeline
         logger.info("Loading texture pipeline to GPU...")
         tex_pipeline = Hunyuan3DPaintPipeline(tex_conf)
@@ -360,7 +376,7 @@ def _gen_shape(
     start_time = time.time()
 
     if args.low_vram_mode:
-        _restore_i23d_to_gpu()
+        _ensure_i23d_worker()
 
     generator = torch.Generator()
     generator = generator.manual_seed(int(seed))
@@ -444,10 +460,11 @@ def generation_all(
 
     text_path = os.path.join(save_folder, f'textured_mesh.obj')
 
-    # In low_vram_mode: move shape model to CPU RAM (keeps weights in RAM, no disk re-read),
-    # then load texture pipeline on demand. Shape model stays in RAM to avoid OOM on reload.
+    # In low_vram_mode: fully delete shape model from RAM before loading texture
+    # pipeline. With only 16GB system RAM, keeping i23d in CPU RAM (~7GB) while
+    # loading tex (~7GB) would exceed available memory and trigger OOM Killer.
     if args.low_vram_mode:
-        _offload_i23d_to_cpu()
+        _unload_i23d_worker()
         _ensure_tex_pipeline()
     path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False)
 
@@ -910,17 +927,22 @@ if __name__ == '__main__':
     from hy3dshape.rembg import BackgroundRemover
 
     rmbg_worker = BackgroundRemover()
-    i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
-        args.model_path,
-        subfolder=args.subfolder,
-        use_safetensors=False,
-        device=args.device,
-    )
-    if args.enable_flashvdm:
-        mc_algo = 'mc' if args.device in ['cpu', 'mps'] else args.mc_algo
-        i23d_worker.enable_flashvdm(mc_algo=mc_algo)
-    if args.compile:
-        i23d_worker.compile()
+    if args.low_vram_mode:
+        # Defer i23d loading to first request — saves ~7.25GB VRAM at startup
+        # and avoids keeping it in RAM while tex pipeline loads.
+        logger.info("low_vram_mode: shape model will be loaded on first request")
+    else:
+        i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
+            args.model_path,
+            subfolder=args.subfolder,
+            use_safetensors=False,
+            device=args.device,
+        )
+        if args.enable_flashvdm:
+            mc_algo = 'mc' if args.device in ['cpu', 'mps'] else args.mc_algo
+            i23d_worker.enable_flashvdm(mc_algo=mc_algo)
+        if args.compile:
+            i23d_worker.compile()
 
     floater_remove_worker = FloaterRemover()
     degenerate_face_remove_worker = DegenerateFaceRemover()