fix(gradio): prevent OOM on 16GB RAM by fully deleting models between uses
Previous hybrid strategy (i23d in CPU RAM, tex del'd) still caused OOM:
- i23d in CPU RAM: ~7GB
- tex loading from disk: ~7GB peak in RAM before GPU transfer
- Total: ~14GB, which exceeds the memory actually available on a 16GB system → OOM Killer

New strategy: fully delete both models between uses. Neither model persists in CPU RAM between requests. Peak RAM during any load: ~7GB (one model staging to GPU).

Changes:
- Replace _offload_i23d_to_cpu/_restore_i23d_to_gpu with _unload_i23d_worker/_ensure_i23d_worker (full del + reload)
- Add double gc.collect() + empty_cache before each load
- Skip i23d startup load in low_vram_mode (load on first request)
- Both models reload from local HF cache (~20-30s each)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -220,49 +220,63 @@ height="{height}" width="100%" frameborder="0"></iframe>'
|
||||
"""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# VRAM management helpers (used when --low_vram_mode is set)
|
||||
# Models are unloaded (del'd) before the other model runs, then reloaded
|
||||
# on next request — no CPU intermediate, VRAM freed immediately.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# VRAM management helpers (used when --low_vram_mode is set)
|
||||
#
|
||||
# Strategy:
|
||||
# Strategy (RTX 3080, 16GB RAM / 20GB VRAM):
|
||||
# Both models are fully deleted between uses — neither is kept in CPU RAM.
|
||||
# This prevents the 16GB RAM from being exhausted when loading one model
|
||||
# while the other is still resident in CPU memory.
|
||||
#
|
||||
# i23d (shape model, ~7.25GB VRAM):
|
||||
# → kept in system RAM between requests via .to('cpu')/.to('cuda')
|
||||
# → no disk re-read, no OOM on reload, fast GPU↔CPU switch
|
||||
# → fully del'd after each shape generation
|
||||
# → reloaded from HF-cached weights on next shape request (~20-30s)
|
||||
#
|
||||
# tex_pipeline (texture model, ~6.59GB VRAM):
|
||||
# → fully del'd after each use (no CPU copy kept)
|
||||
# → reloaded from cached weights on next texture request
|
||||
# → tex config/weights are HF-cached so reload is fast (~20s)
|
||||
# → fully del'd after each texture generation
|
||||
# → reloaded from HF-cached weights on next texture request (~20s)
|
||||
#
|
||||
# This ensures the two models never simultaneously occupy VRAM.
|
||||
# gc.collect() + empty_cache() before each load ensures previous tensors
|
||||
# are freed before the new checkpoint is staged in CPU RAM.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _offload_i23d_to_cpu():
|
||||
"""Move shape model tensors to CPU RAM, freeing its VRAM."""
|
||||
def _unload_i23d_worker():
|
||||
"""Delete shape model entirely, freeing VRAM and CPU RAM."""
|
||||
global i23d_worker
|
||||
if i23d_worker is not None:
|
||||
i23d_worker.to('cpu')
|
||||
logger.info("Unloading shape model from memory...")
|
||||
del i23d_worker
|
||||
i23d_worker = None
|
||||
gc.collect()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
def _restore_i23d_to_gpu():
|
||||
"""Move shape model tensors back to GPU from CPU RAM."""
|
||||
def _ensure_i23d_worker():
|
||||
"""Load shape model to GPU if not already loaded."""
|
||||
global i23d_worker
|
||||
if i23d_worker is not None:
|
||||
i23d_worker.to(args.device)
|
||||
if i23d_worker is None:
|
||||
logger.info("Reloading shape model to GPU...")
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline
|
||||
i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
|
||||
args.model_path,
|
||||
subfolder=args.subfolder,
|
||||
use_safetensors=False,
|
||||
device=args.device,
|
||||
)
|
||||
|
||||
|
||||
def _unload_tex_pipeline():
|
||||
"""Delete texture pipeline entirely, freeing its VRAM."""
|
||||
global tex_pipeline
|
||||
if tex_pipeline is not None:
|
||||
logger.info("Unloading texture pipeline from memory...")
|
||||
del tex_pipeline
|
||||
tex_pipeline = None
|
||||
gc.collect()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@@ -270,6 +284,8 @@ def _ensure_tex_pipeline():
|
||||
"""Load texture pipeline to GPU if not already loaded."""
|
||||
global tex_pipeline
|
||||
if tex_pipeline is None and tex_conf is not None:
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
from hy3dpaint.textureGenPipeline import Hunyuan3DPaintPipeline
|
||||
logger.info("Loading texture pipeline to GPU...")
|
||||
tex_pipeline = Hunyuan3DPaintPipeline(tex_conf)
|
||||
@@ -360,7 +376,7 @@ def _gen_shape(
|
||||
start_time = time.time()
|
||||
|
||||
if args.low_vram_mode:
|
||||
_restore_i23d_to_gpu()
|
||||
_ensure_i23d_worker()
|
||||
|
||||
generator = torch.Generator()
|
||||
generator = generator.manual_seed(int(seed))
|
||||
@@ -444,10 +460,11 @@ def generation_all(
|
||||
|
||||
text_path = os.path.join(save_folder, f'textured_mesh.obj')
|
||||
|
||||
# In low_vram_mode: move shape model to CPU RAM (keeps weights in RAM, no disk re-read),
|
||||
# then load texture pipeline on demand. Shape model stays in RAM to avoid OOM on reload.
|
||||
# In low_vram_mode: fully delete shape model from RAM before loading texture
|
||||
# pipeline. With only 16GB system RAM, keeping i23d in CPU RAM (~7GB) while
|
||||
# loading tex (~7GB) would exceed available memory and trigger OOM Killer.
|
||||
if args.low_vram_mode:
|
||||
_offload_i23d_to_cpu()
|
||||
_unload_i23d_worker()
|
||||
_ensure_tex_pipeline()
|
||||
|
||||
path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False)
|
||||
@@ -910,17 +927,22 @@ if __name__ == '__main__':
|
||||
from hy3dshape.rembg import BackgroundRemover
|
||||
|
||||
rmbg_worker = BackgroundRemover()
|
||||
i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
|
||||
args.model_path,
|
||||
subfolder=args.subfolder,
|
||||
use_safetensors=False,
|
||||
device=args.device,
|
||||
)
|
||||
if args.enable_flashvdm:
|
||||
mc_algo = 'mc' if args.device in ['cpu', 'mps'] else args.mc_algo
|
||||
i23d_worker.enable_flashvdm(mc_algo=mc_algo)
|
||||
if args.compile:
|
||||
i23d_worker.compile()
|
||||
if args.low_vram_mode:
|
||||
# Defer i23d loading to first request — saves ~7.25GB VRAM at startup
|
||||
# and avoids keeping it in RAM while tex pipeline loads.
|
||||
logger.info("low_vram_mode: shape model will be loaded on first request")
|
||||
else:
|
||||
i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
|
||||
args.model_path,
|
||||
subfolder=args.subfolder,
|
||||
use_safetensors=False,
|
||||
device=args.device,
|
||||
)
|
||||
if args.enable_flashvdm:
|
||||
mc_algo = 'mc' if args.device in ['cpu', 'mps'] else args.mc_algo
|
||||
i23d_worker.enable_flashvdm(mc_algo=mc_algo)
|
||||
if args.compile:
|
||||
i23d_worker.compile()
|
||||
|
||||
floater_remove_worker = FloaterRemover()
|
||||
degenerate_face_remove_worker = DegenerateFaceRemover()
|
||||
|
||||
Reference in New Issue
Block a user