fix(gradio): fix OOM killer on second request in low_vram_mode

Root cause: _ensure_i23d_worker() reloaded the model from disk via from_pretrained(),
which loads the ~7GB checkpoint into CPU RAM. If Python's GC had not yet freed the
previously del'd tensors, the old and new copies sat in RAM simultaneously → OOM Killer.

Fix: hybrid strategy per model type:
  i23d (shape, ~7.25GB VRAM):
    .to('cpu') ↔ .to('cuda') — stays in RAM, no disk IO, fast switch
  tex_pipeline (texture, ~6.59GB VRAM):
    del + gc + empty_cache ↔ reload from HF cache — full VRAM release

Renamed helpers:
  _unload_i23d_worker()  → _offload_i23d_to_cpu()
  _ensure_i23d_worker()  → _restore_i23d_to_gpu()
  (tex helpers unchanged)

VRAM timeline per request in low_vram_mode:
  shape gen: i23d on GPU (7.25GB), tex unloaded
  → _offload_i23d_to_cpu(): i23d→RAM (0GB VRAM)
  → _ensure_tex_pipeline(): tex loads (6.59GB)
  texture gen: tex on GPU (6.59GB), i23d in RAM
  → _unload_tex_pipeline(): tex del'd (0GB VRAM)
  next request: _restore_i23d_to_gpu(): RAM→GPU (7.25GB)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Akasei
2026-03-16 22:05:08 +08:00
parent 9bee8e1844
commit 76c36e53eb

View File

@@ -49,6 +49,11 @@ import numpy as np
from hy3dshape.utils import logger
from hy3dpaint.convert_utils import create_glb_with_pbr_materials
# Globals for lazy load/unload.
# i23d_worker: image-to-3D shape pipeline; None until loaded (set lazily).
# tex_pipeline: texture-generation pipeline; None until loaded (set lazily).
# tex_conf: texture pipeline configuration; None until loaded.
i23d_worker = None
tex_pipeline = None
tex_conf = None
MAX_SEED = 1e7  # upper bound for random seeds; callers cast to int before use
ENV = "Local" # "Huggingface"
@@ -220,36 +225,45 @@ height="{height}" width="100%" frameborder="0"></iframe>'
# on next request — no CPU intermediate, VRAM freed immediately.
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# VRAM management helpers (used when --low_vram_mode is set)
#
# Strategy:
#   i23d (shape model, ~7.25GB VRAM):
#     → kept in system RAM between requests via .to('cpu')/.to('cuda')
#     → no disk re-read, no OOM on reload, fast GPU↔CPU switch
#
#   tex_pipeline (texture model, ~6.59GB VRAM):
#     → fully del'd after each use (no CPU copy kept)
#     → reloaded from cached weights on next texture request
#     → tex config/weights are HF-cached so reload is fast (~20s)
#
# This ensures the two models never simultaneously occupy VRAM.
# ---------------------------------------------------------------------------
def _offload_i23d_to_cpu():
    """Move shape model tensors to CPU RAM, freeing its VRAM.

    Unlike a del + from_pretrained() reload, this keeps the ~7GB of
    weights resident in system RAM, so restoring to GPU needs no disk
    I/O and never holds two checkpoint copies in RAM at once.
    No-op if the shape model has not been loaded yet.
    """
    global i23d_worker
    if i23d_worker is not None:
        i23d_worker.to('cpu')
        # Release the now-unreferenced CUDA blocks back to the driver.
        torch.cuda.empty_cache()
def _restore_i23d_to_gpu():
    """Move shape model tensors back to GPU from CPU RAM.

    Counterpart of _offload_i23d_to_cpu(): the weights stayed in system
    RAM, so this is a fast device transfer with no disk reload.
    No-op if the shape model has not been loaded yet.
    """
    global i23d_worker
    if i23d_worker is not None:
        # args.device is the CLI-selected target device (e.g. 'cuda').
        i23d_worker.to(args.device)
def _unload_tex_pipeline():
    """Delete texture pipeline entirely, freeing its VRAM.

    The texture model keeps no CPU copy; it is re-created from the
    HF-cached weights on the next texture request (_ensure_tex_pipeline).
    No-op if the pipeline is not currently loaded.
    """
    global tex_pipeline
    if tex_pipeline is not None:
        del tex_pipeline
        tex_pipeline = None
        # Force collection of the dropped pipeline before releasing the
        # cached CUDA blocks, so the VRAM actually returns to the driver.
        gc.collect()
        torch.cuda.empty_cache()
def _ensure_tex_pipeline():
@@ -346,7 +360,7 @@ def _gen_shape(
start_time = time.time()
if args.low_vram_mode:
_ensure_i23d_worker()
_restore_i23d_to_gpu()
generator = torch.Generator()
generator = generator.manual_seed(int(seed))
@@ -430,10 +444,10 @@ def generation_all(
text_path = os.path.join(save_folder, f'textured_mesh.obj')
# In low_vram_mode: unload shape model entirely (del, no CPU copy) to free VRAM,
# then load texture pipeline on demand. Shape model reloads lazily on next request.
# In low_vram_mode: move shape model to CPU RAM (keeps weights in RAM, no disk re-read),
# then load texture pipeline on demand. Shape model stays in RAM to avoid OOM on reload.
if args.low_vram_mode:
_unload_i23d_worker()
_offload_i23d_to_cpu()
_ensure_tex_pipeline()
path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False)