fix(gradio): fix OOM killer on second request in low_vram_mode

Root cause: _ensure_i23d_worker() reloaded the model from disk via from_pretrained(),
which loads the ~7GB checkpoint into CPU RAM. If Python's GC had not yet freed the
previously del'd tensors, the old and new copies sat in RAM simultaneously → OOM Killer.

Fix: hybrid strategy per model type:
  i23d (shape, ~7.25GB VRAM):
    .to('cpu') ↔ .to('cuda') — stays in RAM, no disk IO, fast switch
  tex_pipeline (texture, ~6.59GB VRAM):
    del + gc + empty_cache ↔ reload from HF cache — full VRAM release

Renamed helpers:
  _unload_i23d_worker()  → _offload_i23d_to_cpu()
  _ensure_i23d_worker()  → _restore_i23d_to_gpu()
  (tex helpers unchanged)

VRAM timeline per request in low_vram_mode:
  shape gen: i23d on GPU (7.25GB), tex unloaded
  → _offload_i23d_to_cpu(): i23d→RAM (0GB VRAM)
  → _ensure_tex_pipeline(): tex loads (6.59GB)
  texture gen: tex on GPU (6.59GB), i23d in RAM
  → _unload_tex_pipeline(): tex del'd (0GB VRAM)
  next request: _restore_i23d_to_gpu(): RAM→GPU (7.25GB)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Akasei
2026-03-16 22:05:08 +08:00
parent 9bee8e1844
commit 76c36e53eb

View File

@@ -49,6 +49,11 @@ import numpy as np
from hy3dshape.utils import logger
from hy3dpaint.convert_utils import create_glb_with_pbr_materials
# Globals for lazy load/unload.
# i23d_worker: image-to-3D shape pipeline; None until loaded (set lazily).
# tex_pipeline: texture-generation pipeline; None until loaded (set lazily).
# tex_conf: texture pipeline configuration; None until loaded.
i23d_worker = None
tex_pipeline = None
tex_conf = None
MAX_SEED = 1e7  # upper bound for random seeds; callers cast to int before use
ENV = "Local" # "Huggingface"
@@ -220,36 +225,45 @@ height="{height}" width="100%" frameborder="0"></iframe>'
# on next request — no CPU intermediate, VRAM freed immediately.
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# VRAM management helpers (used when --low_vram_mode is set)
#
# Strategy:
#   i23d (shape model, ~7.25GB VRAM):
#     → kept in system RAM between requests via .to('cpu')/.to('cuda')
#     → no disk re-read, no OOM on reload, fast GPU↔CPU switch
#
#   tex_pipeline (texture model, ~6.59GB VRAM):
#     → fully del'd after each use (no CPU copy kept)
#     → reloaded from cached weights on next texture request
#     → tex config/weights are HF-cached so reload is fast (~20s)
#
# This ensures the two models never simultaneously occupy VRAM.
# ---------------------------------------------------------------------------
def _offload_i23d_to_cpu():
    """Move shape model tensors to CPU RAM, freeing its VRAM.

    Unlike a del + from_pretrained() reload, this keeps the ~7GB of
    weights resident in system RAM, so restoring to GPU needs no disk
    I/O and never holds two checkpoint copies in RAM at once.
    No-op if the shape model has not been loaded yet.
    """
    global i23d_worker
    if i23d_worker is not None:
        i23d_worker.to('cpu')
        # Release the now-unreferenced CUDA blocks back to the driver.
        torch.cuda.empty_cache()
def _restore_i23d_to_gpu():
    """Move shape model tensors back to GPU from CPU RAM.

    Counterpart of _offload_i23d_to_cpu(): the weights stayed in system
    RAM, so this is a fast device transfer with no disk reload.
    No-op if the shape model has not been loaded yet.
    """
    global i23d_worker
    if i23d_worker is not None:
        # args.device is the CLI-selected target device (e.g. 'cuda').
        i23d_worker.to(args.device)
def _unload_tex_pipeline():
    """Delete texture pipeline entirely, freeing its VRAM.

    The texture model keeps no CPU copy; it is re-created from the
    HF-cached weights on the next texture request (_ensure_tex_pipeline).
    No-op if the pipeline is not currently loaded.
    """
    global tex_pipeline
    if tex_pipeline is not None:
        del tex_pipeline
        tex_pipeline = None
        # Force collection of the dropped pipeline before releasing the
        # cached CUDA blocks, so the VRAM actually returns to the driver.
        gc.collect()
        torch.cuda.empty_cache()
def _ensure_tex_pipeline():
@@ -346,7 +360,7 @@ def _gen_shape(
start_time = time.time()
if args.low_vram_mode:
_ensure_i23d_worker()
_restore_i23d_to_gpu()
generator = torch.Generator()
generator = generator.manual_seed(int(seed))
@@ -430,10 +444,10 @@ def generation_all(
text_path = os.path.join(save_folder, f'textured_mesh.obj')
# In low_vram_mode: unload shape model entirely (del, no CPU copy) to free VRAM,
# then load texture pipeline on demand. Shape model reloads lazily on next request.
# In low_vram_mode: move shape model to CPU RAM (keeps weights in RAM, no disk re-read),
# then load texture pipeline on demand. Shape model stays in RAM to avoid OOM on reload.
if args.low_vram_mode:
_unload_i23d_worker()
_offload_i23d_to_cpu()
_ensure_tex_pipeline()
path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False)