fix(gradio): prevent OOM on 16GB RAM by fully deleting models between uses
Previous hybrid strategy (i23d in CPU RAM, tex del'd) still caused OOM:
- i23d in CPU RAM: ~7GB
- tex loading from disk: ~7GB peak in RAM before GPU transfer
- Total: ~14GB > 16GB system RAM → OOM Killer

New strategy: fully delete both models between uses. Neither model persists in CPU RAM between requests. Peak RAM during any load: ~7GB (one model staging to GPU).

Changes:
- Replace _offload_i23d_to_cpu/_restore_i23d_to_gpu with _unload_i23d_worker/_ensure_i23d_worker (full del + reload)
- Add double gc.collect() + empty_cache before each load
- Skip i23d startup load in low_vram_mode (load on first request)
- Both models reload from local HF cache (~20-30s each)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
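For context, the per-request flow in low_vram_mode after this change, condensed from the hunks below. Argument lists are elided with "...", the mesh variable name is illustrative, and the closing _unload_tex_pipeline() call site is not shown in this diff; it follows from the "fully delete both models between uses" strategy and the pre-existing helper of that name.

    # Shape request (_gen_shape): only the shape model ever occupies VRAM.
    _ensure_i23d_worker()        # reload from local HF cache if it was deleted (~20-30s)
    mesh = i23d_worker(...)      # ~7.25GB VRAM while sampling

    # Texture request (generation_all): drop the shape model completely first, so
    # peak system RAM stays near ~7GB (one checkpoint) instead of ~14GB (two).
    _unload_i23d_worker()        # del + double gc.collect() + torch.cuda.empty_cache()
    _ensure_tex_pipeline()       # gc.collect() + empty_cache(), then load (~6.59GB VRAM)
    path_textured = tex_pipeline(mesh_path=path, image_path=image, ...)
    _unload_tex_pipeline()       # free texture weights again before the next request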
@@ -220,49 +220,63 @@ height="{height}" width="100%" frameborder="0"></iframe>'
 """
 
 # ---------------------------------------------------------------------------
-# VRAM management helpers (used when --low_vram_mode is set)
-# Models are unloaded (del'd) before the other model runs, then reloaded
-# on next request — no CPU intermediate, VRAM freed immediately.
-# ---------------------------------------------------------------------------
-
 # ---------------------------------------------------------------------------
 # VRAM management helpers (used when --low_vram_mode is set)
 #
-# Strategy:
+# Strategy (RTX 3080, 16GB RAM / 20GB VRAM):
+# Both models are fully deleted between uses — neither is kept in CPU RAM.
+# This prevents the 16GB RAM from being exhausted when loading one model
+# while the other is still resident in CPU memory.
+#
 # i23d (shape model, ~7.25GB VRAM):
-# → kept in system RAM between requests via .to('cpu')/.to('cuda')
-# → no disk re-read, no OOM on reload, fast GPU↔CPU switch
+# → fully del'd after each shape generation
+# → reloaded from HF-cached weights on next shape request (~20-30s)
 #
 # tex_pipeline (texture model, ~6.59GB VRAM):
-# → fully del'd after each use (no CPU copy kept)
-# → reloaded from cached weights on next texture request
-# → tex config/weights are HF-cached so reload is fast (~20s)
+# → fully del'd after each texture generation
+# → reloaded from HF-cached weights on next texture request (~20s)
 #
-# This ensures the two models never simultaneously occupy VRAM.
+# gc.collect() + empty_cache() before each load ensures previous tensors
+# are freed before the new checkpoint is staged in CPU RAM.
 # ---------------------------------------------------------------------------
 
-def _offload_i23d_to_cpu():
-    """Move shape model tensors to CPU RAM, freeing its VRAM."""
+def _unload_i23d_worker():
+    """Delete shape model entirely, freeing VRAM and CPU RAM."""
     global i23d_worker
     if i23d_worker is not None:
-        i23d_worker.to('cpu')
+        logger.info("Unloading shape model from memory...")
+        del i23d_worker
+        i23d_worker = None
+        gc.collect()
+        gc.collect()
         torch.cuda.empty_cache()
 
 
-def _restore_i23d_to_gpu():
-    """Move shape model tensors back to GPU from CPU RAM."""
+def _ensure_i23d_worker():
+    """Load shape model to GPU if not already loaded."""
     global i23d_worker
-    if i23d_worker is not None:
-        i23d_worker.to(args.device)
+    if i23d_worker is None:
+        logger.info("Reloading shape model to GPU...")
+        gc.collect()
+        torch.cuda.empty_cache()
+        from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline
+        i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
+            args.model_path,
+            subfolder=args.subfolder,
+            use_safetensors=False,
+            device=args.device,
+        )
 
 
 def _unload_tex_pipeline():
     """Delete texture pipeline entirely, freeing its VRAM."""
     global tex_pipeline
     if tex_pipeline is not None:
+        logger.info("Unloading texture pipeline from memory...")
         del tex_pipeline
         tex_pipeline = None
         gc.collect()
+        gc.collect()
         torch.cuda.empty_cache()
 
 
@@ -270,6 +284,8 @@ def _ensure_tex_pipeline():
     """Load texture pipeline to GPU if not already loaded."""
     global tex_pipeline
     if tex_pipeline is None and tex_conf is not None:
+        gc.collect()
+        torch.cuda.empty_cache()
         from hy3dpaint.textureGenPipeline import Hunyuan3DPaintPipeline
         logger.info("Loading texture pipeline to GPU...")
         tex_pipeline = Hunyuan3DPaintPipeline(tex_conf)
@@ -360,7 +376,7 @@ def _gen_shape(
     start_time = time.time()
 
     if args.low_vram_mode:
-        _restore_i23d_to_gpu()
+        _ensure_i23d_worker()
 
     generator = torch.Generator()
     generator = generator.manual_seed(int(seed))
@@ -444,10 +460,11 @@ def generation_all(
 
     text_path = os.path.join(save_folder, f'textured_mesh.obj')
 
-    # In low_vram_mode: move shape model to CPU RAM (keeps weights in RAM, no disk re-read),
-    # then load texture pipeline on demand. Shape model stays in RAM to avoid OOM on reload.
+    # In low_vram_mode: fully delete shape model from RAM before loading texture
+    # pipeline. With only 16GB system RAM, keeping i23d in CPU RAM (~7GB) while
+    # loading tex (~7GB) would exceed available memory and trigger OOM Killer.
     if args.low_vram_mode:
-        _offload_i23d_to_cpu()
+        _unload_i23d_worker()
         _ensure_tex_pipeline()
 
     path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False)
@@ -910,6 +927,11 @@ if __name__ == '__main__':
     from hy3dshape.rembg import BackgroundRemover
 
     rmbg_worker = BackgroundRemover()
-    i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
-        args.model_path,
-        subfolder=args.subfolder,
+    if args.low_vram_mode:
+        # Defer i23d loading to first request — saves ~7.25GB VRAM at startup
+        # and avoids keeping it in RAM while tex pipeline loads.
+        logger.info("low_vram_mode: shape model will be loaded on first request")
+    else:
+        i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
+            args.model_path,
+            subfolder=args.subfolder,