fix(gradio): prevent OOM on 16GB RAM by fully deleting models between uses

Previous hybrid strategy (i23d in CPU RAM, tex del'd) still caused OOM:
- i23d in CPU RAM: ~7GB
- tex loading from disk: ~7GB peak in RAM before GPU transfer
- Total: ~14GB of model weights alone, leaving <2GB of the 16GB system RAM
  for the OS and server → OOM Killer (probe sketch below)
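
To reproduce this accounting, a minimal RSS probe around each load is enough.
A standalone sketch using psutil (not part of this change):

    import os
    import psutil

    def log_rss(tag: str) -> None:
        # Resident set size of the current process, in GiB.
        rss_gib = psutil.Process(os.getpid()).memory_info().rss / 2**30
        print(f"[mem] {tag}: {rss_gib:.2f} GiB")

    # e.g. call log_rss("before tex load") and log_rss("after tex load")
    # around the pipeline constructor to see the staging peak.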

New strategy: fully delete both models between uses.
Neither model persists in CPU RAM between requests.
Peak RAM during any load: ~7GB (one model staging to GPU).
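
Distilled to its core, the pattern is the following. A generic sketch
(names are illustrative, not the exact gradio_app helpers):

    import gc
    import torch

    model = None  # module-level handle

    def unload_model() -> None:
        global model
        model = None              # drop the only strong reference to the weights
        gc.collect()              # collect the model object itself
        gc.collect()              # second pass for anything freed by the first
        torch.cuda.empty_cache()  # hand freed CUDA blocks back to the driver

    def ensure_model(load_fn):
        global model
        if model is None:
            gc.collect()
            torch.cuda.empty_cache()  # clear stale blocks before staging new weights
            model = load_fn()         # reload from the local HF cache
        return model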

Changes:
- Replace _offload_i23d_to_cpu/_restore_i23d_to_gpu with
  _unload_i23d_worker/_ensure_i23d_worker (full del + reload)
- Add double gc.collect() + empty_cache before each load
- Skip i23d startup load in low_vram_mode (load on first request)
- Both models reload from local HF cache (~20-30s each); see the
  request-flow sketch below
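
End to end, a low-VRAM request then takes this shape (condensed from the
diff below; signatures simplified):

    def generate_textured(image, save_folder):
        # Shape stage: _gen_shape() calls _ensure_i23d_worker() itself.
        mesh_path = _gen_shape(image)
        # Free the ~7GB shape model before the ~7GB texture model is staged.
        _unload_i23d_worker()
        _ensure_tex_pipeline()
        out_path = os.path.join(save_folder, 'textured_mesh.obj')
        return tex_pipeline(mesh_path=mesh_path, image_path=image,
                            output_mesh_path=out_path, save_glb=False)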

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Akasei committed 2026-03-16 22:39:03 +08:00
parent 474001da6b
commit 3cd767a18d


@@ -220,49 +220,63 @@ height="{height}" width="100%" frameborder="0"></iframe>'
"""
# ---------------------------------------------------------------------------
# VRAM management helpers (used when --low_vram_mode is set)
# Models are unloaded (del'd) before the other model runs, then reloaded
# on next request — no CPU intermediate, VRAM freed immediately.
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# VRAM management helpers (used when --low_vram_mode is set)
#
# Strategy:
# Strategy (RTX 3080, 16GB RAM / 20GB VRAM):
# Both models are fully deleted between uses — neither is kept in CPU RAM.
# This prevents the 16GB RAM from being exhausted when loading one model
# while the other is still resident in CPU memory.
#
# i23d (shape model, ~7.25GB VRAM):
# → kept in system RAM between requests via .to('cpu')/.to('cuda')
# → no disk re-read, no OOM on reload, fast GPU↔CPU switch
# → fully del'd after each shape generation
# → reloaded from HF-cached weights on next shape request (~20-30s)
#
# tex_pipeline (texture model, ~6.59GB VRAM):
# → fully del'd after each use (no CPU copy kept)
# → reloaded from cached weights on next texture request
# → tex config/weights are HF-cached so reload is fast (~20s)
# → fully del'd after each texture generation
# → reloaded from HF-cached weights on next texture request (~20s)
#
# This ensures the two models never simultaneously occupy VRAM.
# gc.collect() + empty_cache() before each load ensures previous tensors
# are freed before the new checkpoint is staged in CPU RAM.
# ---------------------------------------------------------------------------
-def _offload_i23d_to_cpu():
-    """Move shape model tensors to CPU RAM, freeing its VRAM."""
+def _unload_i23d_worker():
+    """Delete shape model entirely, freeing VRAM and CPU RAM."""
     global i23d_worker
     if i23d_worker is not None:
-        i23d_worker.to('cpu')
+        logger.info("Unloading shape model from memory...")
+        del i23d_worker
+        i23d_worker = None
     gc.collect()
+    gc.collect()
     torch.cuda.empty_cache()


-def _restore_i23d_to_gpu():
-    """Move shape model tensors back to GPU from CPU RAM."""
+def _ensure_i23d_worker():
+    """Load shape model to GPU if not already loaded."""
     global i23d_worker
-    if i23d_worker is not None:
-        i23d_worker.to(args.device)
+    if i23d_worker is None:
+        logger.info("Reloading shape model to GPU...")
+        gc.collect()
+        torch.cuda.empty_cache()
+        from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline
+        i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
+            args.model_path,
+            subfolder=args.subfolder,
+            use_safetensors=False,
+            device=args.device,
+        )


 def _unload_tex_pipeline():
     """Delete texture pipeline entirely, freeing its VRAM."""
     global tex_pipeline
     if tex_pipeline is not None:
+        logger.info("Unloading texture pipeline from memory...")
         del tex_pipeline
         tex_pipeline = None
     gc.collect()
+    gc.collect()
     torch.cuda.empty_cache()
@@ -270,6 +284,8 @@ def _ensure_tex_pipeline():
"""Load texture pipeline to GPU if not already loaded."""
global tex_pipeline
if tex_pipeline is None and tex_conf is not None:
gc.collect()
torch.cuda.empty_cache()
from hy3dpaint.textureGenPipeline import Hunyuan3DPaintPipeline
logger.info("Loading texture pipeline to GPU...")
tex_pipeline = Hunyuan3DPaintPipeline(tex_conf)
@@ -360,7 +376,7 @@ def _gen_shape(
     start_time = time.time()

     if args.low_vram_mode:
-        _restore_i23d_to_gpu()
+        _ensure_i23d_worker()

     generator = torch.Generator()
     generator = generator.manual_seed(int(seed))
@@ -444,10 +460,11 @@ def generation_all(
     text_path = os.path.join(save_folder, f'textured_mesh.obj')

-    # In low_vram_mode: move shape model to CPU RAM (keeps weights in RAM, no disk re-read),
-    # then load texture pipeline on demand. Shape model stays in RAM to avoid OOM on reload.
+    # In low_vram_mode: fully delete shape model from RAM before loading texture
+    # pipeline. With only 16GB system RAM, keeping i23d in CPU RAM (~7GB) while
+    # loading tex (~7GB) would exceed available memory and trigger OOM Killer.
     if args.low_vram_mode:
-        _offload_i23d_to_cpu()
+        _unload_i23d_worker()
         _ensure_tex_pipeline()

     path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False)
@@ -910,17 +927,22 @@ if __name__ == '__main__':
     from hy3dshape.rembg import BackgroundRemover
     rmbg_worker = BackgroundRemover()

-    i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
-        args.model_path,
-        subfolder=args.subfolder,
-        use_safetensors=False,
-        device=args.device,
-    )
-    if args.enable_flashvdm:
-        mc_algo = 'mc' if args.device in ['cpu', 'mps'] else args.mc_algo
-        i23d_worker.enable_flashvdm(mc_algo=mc_algo)
-    if args.compile:
-        i23d_worker.compile()
+    if args.low_vram_mode:
+        # Defer i23d loading to first request — saves ~7.25GB VRAM at startup
+        # and avoids keeping it in RAM while tex pipeline loads.
+        logger.info("low_vram_mode: shape model will be loaded on first request")
+    else:
+        i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
+            args.model_path,
+            subfolder=args.subfolder,
+            use_safetensors=False,
+            device=args.device,
+        )
+        if args.enable_flashvdm:
+            mc_algo = 'mc' if args.device in ['cpu', 'mps'] else args.mc_algo
+            i23d_worker.enable_flashvdm(mc_algo=mc_algo)
+        if args.compile:
+            i23d_worker.compile()

     floater_remove_worker = FloaterRemover()
     degenerate_face_remove_worker = DegenerateFaceRemover()
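
A quick way to verify that deletion actually releases the weights: a
standalone check, independent of this repo, using a small stand-in module:

    import gc
    import weakref
    import torch

    m = torch.nn.Linear(4096, 4096).cuda()
    ref = weakref.ref(m)
    before = torch.cuda.memory_allocated()

    m = None                      # drop the last reference, as the helpers do
    gc.collect()
    gc.collect()
    torch.cuda.empty_cache()

    assert ref() is None, "weights are still referenced somewhere"
    print(f"freed {(before - torch.cuda.memory_allocated()) / 2**20:.1f} MiB")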