fix(gradio): prevent OOM on 16GB RAM by fully deleting models between uses
Previous hybrid strategy (i23d in CPU RAM, tex del'd) still caused OOM:
- i23d in CPU RAM: ~7GB
- tex loading from disk: ~7GB peak in RAM before GPU transfer
- Total: ~14GB > 16GB system RAM → OOM Killer

New strategy: fully delete both models between uses. Neither model persists in CPU RAM between requests. Peak RAM during any load: ~7GB (one model staging to GPU).

Changes:
- Replace _offload_i23d_to_cpu/_restore_i23d_to_gpu with _unload_i23d_worker/_ensure_i23d_worker (full del + reload)
- Add double gc.collect() + empty_cache before each load
- Skip i23d startup load in low_vram_mode (load on first request)
- Both models reload from local HF cache (~20-30s each)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
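For context, the per-request flow in low_vram_mode after this change, condensed from the hunks below. Argument lists are elided with "...", the mesh variable name is illustrative, and the closing _unload_tex_pipeline() call site is not shown in this diff; it follows from the "fully delete both models between uses" strategy and the pre-existing helper of that name.

    # Shape request (_gen_shape): only the shape model ever occupies VRAM.
    _ensure_i23d_worker()        # reload from local HF cache if it was deleted (~20-30s)
    mesh = i23d_worker(...)      # ~7.25GB VRAM while sampling

    # Texture request (generation_all): drop the shape model completely first, so
    # peak system RAM stays near ~7GB (one checkpoint) instead of ~14GB (two).
    _unload_i23d_worker()        # del + double gc.collect() + torch.cuda.empty_cache()
    _ensure_tex_pipeline()       # gc.collect() + empty_cache(), then load (~6.59GB VRAM)
    path_textured = tex_pipeline(mesh_path=path, image_path=image, ...)
    _unload_tex_pipeline()       # free texture weights again before the next request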
@@ -220,49 +220,63 @@ height="{height}" width="100%" frameborder="0"></iframe>'
 """
 
 # ---------------------------------------------------------------------------
-# VRAM management helpers (used when --low_vram_mode is set)
-# Models are unloaded (del'd) before the other model runs, then reloaded
-# on next request — no CPU intermediate, VRAM freed immediately.
-# ---------------------------------------------------------------------------
-
 # ---------------------------------------------------------------------------
 # VRAM management helpers (used when --low_vram_mode is set)
 #
-# Strategy:
+# Strategy (RTX 3080, 16GB RAM / 20GB VRAM):
+# Both models are fully deleted between uses — neither is kept in CPU RAM.
+# This prevents the 16GB RAM from being exhausted when loading one model
+# while the other is still resident in CPU memory.
+#
 # i23d (shape model, ~7.25GB VRAM):
-# → kept in system RAM between requests via .to('cpu')/.to('cuda')
-# → no disk re-read, no OOM on reload, fast GPU↔CPU switch
+# → fully del'd after each shape generation
+# → reloaded from HF-cached weights on next shape request (~20-30s)
 #
 # tex_pipeline (texture model, ~6.59GB VRAM):
-# → fully del'd after each use (no CPU copy kept)
-# → reloaded from cached weights on next texture request
-# → tex config/weights are HF-cached so reload is fast (~20s)
+# → fully del'd after each texture generation
+# → reloaded from HF-cached weights on next texture request (~20s)
 #
-# This ensures the two models never simultaneously occupy VRAM.
+# gc.collect() + empty_cache() before each load ensures previous tensors
+# are freed before the new checkpoint is staged in CPU RAM.
 # ---------------------------------------------------------------------------
 
-def _offload_i23d_to_cpu():
-    """Move shape model tensors to CPU RAM, freeing its VRAM."""
+def _unload_i23d_worker():
+    """Delete shape model entirely, freeing VRAM and CPU RAM."""
     global i23d_worker
     if i23d_worker is not None:
-        i23d_worker.to('cpu')
+        logger.info("Unloading shape model from memory...")
+        del i23d_worker
+        i23d_worker = None
+        gc.collect()
+        gc.collect()
         torch.cuda.empty_cache()
 
 
-def _restore_i23d_to_gpu():
-    """Move shape model tensors back to GPU from CPU RAM."""
+def _ensure_i23d_worker():
+    """Load shape model to GPU if not already loaded."""
     global i23d_worker
-    if i23d_worker is not None:
-        i23d_worker.to(args.device)
+    if i23d_worker is None:
+        logger.info("Reloading shape model to GPU...")
+        gc.collect()
+        torch.cuda.empty_cache()
+        from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline
+        i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
+            args.model_path,
+            subfolder=args.subfolder,
+            use_safetensors=False,
+            device=args.device,
+        )
 
 
 def _unload_tex_pipeline():
     """Delete texture pipeline entirely, freeing its VRAM."""
     global tex_pipeline
     if tex_pipeline is not None:
+        logger.info("Unloading texture pipeline from memory...")
         del tex_pipeline
         tex_pipeline = None
         gc.collect()
+        gc.collect()
         torch.cuda.empty_cache()
 
 
@@ -270,6 +284,8 @@ def _ensure_tex_pipeline():
     """Load texture pipeline to GPU if not already loaded."""
     global tex_pipeline
     if tex_pipeline is None and tex_conf is not None:
+        gc.collect()
+        torch.cuda.empty_cache()
         from hy3dpaint.textureGenPipeline import Hunyuan3DPaintPipeline
         logger.info("Loading texture pipeline to GPU...")
         tex_pipeline = Hunyuan3DPaintPipeline(tex_conf)
@@ -360,7 +376,7 @@ def _gen_shape(
     start_time = time.time()
 
     if args.low_vram_mode:
-        _restore_i23d_to_gpu()
+        _ensure_i23d_worker()
 
     generator = torch.Generator()
     generator = generator.manual_seed(int(seed))
@@ -444,10 +460,11 @@ def generation_all(
 
     text_path = os.path.join(save_folder, f'textured_mesh.obj')
 
-    # In low_vram_mode: move shape model to CPU RAM (keeps weights in RAM, no disk re-read),
-    # then load texture pipeline on demand. Shape model stays in RAM to avoid OOM on reload.
+    # In low_vram_mode: fully delete shape model from RAM before loading texture
+    # pipeline. With only 16GB system RAM, keeping i23d in CPU RAM (~7GB) while
+    # loading tex (~7GB) would exceed available memory and trigger OOM Killer.
     if args.low_vram_mode:
-        _offload_i23d_to_cpu()
+        _unload_i23d_worker()
         _ensure_tex_pipeline()
 
     path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False)
@@ -910,6 +927,11 @@ if __name__ == '__main__':
     from hy3dshape.rembg import BackgroundRemover
 
     rmbg_worker = BackgroundRemover()
-    i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
-        args.model_path,
-        subfolder=args.subfolder,
+    if args.low_vram_mode:
+        # Defer i23d loading to first request — saves ~7.25GB VRAM at startup
+        # and avoids keeping it in RAM while tex pipeline loads.
+        logger.info("low_vram_mode: shape model will be loaded on first request")
+    else:
+        i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
+            args.model_path,
+            subfolder=args.subfolder,