fix: adaptive VRAM strategy + force rembg CPU to prevent OOM
Two root causes of CUDA OOM fixed:
1. onnxruntime-gpu CUDAExecutionProvider pre-allocated ~12GB VRAM arena
for bria-rmbg background removal, starving PyTorch models.
Fix: force CPUExecutionProvider in BackgroundRemover (rembg is
lightweight, runs fine on CPU, frees all VRAM for shape/tex).
2. Previous 'always delete' strategy was wasteful on high-RAM machines.
New adaptive strategy checks available system RAM at runtime:
- RAM >= 16GB free: offload i23d to CPU (.to('cpu')) — fast, ~1s
- RAM < 16GB free: full del + reload from disk — safe, ~20-30s
This gives instant model switching on 32GB+ machines while keeping
16GB machines safe from OOM Killer.
Helper functions:
- _prepare_for_tex(): adaptive offload/delete based on RAM check
- _ensure_i23d_worker(): restore from CPU (fast) or disk (slow)
- _get_available_ram_gb(): reads /proc/meminfo
- _can_offload_to_cpu(): threshold check with logging
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -223,40 +223,85 @@ height="{height}" width="100%" frameborder="0"></iframe>'
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# VRAM management helpers (used when --low_vram_mode is set)
|
# VRAM management helpers (used when --low_vram_mode is set)
|
||||||
#
|
#
|
||||||
# Strategy (RTX 3080, 16GB RAM / 20GB VRAM):
|
# Adaptive strategy based on available system RAM:
|
||||||
# Both models are fully deleted between uses — neither is kept in CPU RAM.
|
|
||||||
# This prevents the 16GB RAM from being exhausted when loading one model
|
|
||||||
# while the other is still resident in CPU memory.
|
|
||||||
#
|
#
|
||||||
# i23d (shape model, ~7.25GB VRAM):
|
# When switching from shape → texture (or vice versa):
|
||||||
# → fully del'd after each shape generation
|
# 1. Check available RAM via /proc/meminfo
|
||||||
# → reloaded from HF-cached weights on next shape request (~20-30s)
|
# 2. If enough RAM to hold a model in CPU while loading the other (~17GB):
|
||||||
|
# → .to('cpu') the outgoing model (fast, no disk reload needed later)
|
||||||
|
# 3. If RAM is tight:
|
||||||
|
# → fully del the outgoing model, reload from disk later (~20-30s)
|
||||||
#
|
#
|
||||||
# tex_pipeline (texture model, ~6.59GB VRAM):
|
# This allows machines with ≥32GB RAM to swap models instantly,
|
||||||
# → fully del'd after each texture generation
|
# while 16GB machines safely fall back to disk reload.
|
||||||
# → reloaded from HF-cached weights on next texture request (~20s)
|
|
||||||
#
|
|
||||||
# gc.collect() + empty_cache() before each load ensures previous tensors
|
|
||||||
# are freed before the new checkpoint is staged in CPU RAM.
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _unload_i23d_worker():
|
# Approximate RAM required (GB) to hold one model in CPU while loading another.
|
||||||
"""Delete shape model entirely, freeing VRAM and CPU RAM."""
|
# Model weights: ~7GB each. Loading from disk stages ~7GB temporarily.
|
||||||
global i23d_worker
|
# Total: 7 (existing in CPU) + 7 (loading new) + 2 (OS headroom) = 16GB.
|
||||||
if i23d_worker is not None:
|
_RAM_THRESHOLD_GB = 16.0
|
||||||
logger.info("Unloading shape model from memory...")
|
|
||||||
|
# Track whether i23d is offloaded to CPU RAM (vs deleted entirely).
|
||||||
|
_i23d_on_cpu = False
|
||||||
|
|
||||||
|
|
||||||
|
def _get_available_ram_gb():
|
||||||
|
"""Return available system RAM in GB from /proc/meminfo."""
|
||||||
|
try:
|
||||||
|
with open('/proc/meminfo') as f:
|
||||||
|
for line in f:
|
||||||
|
if line.startswith('MemAvailable:'):
|
||||||
|
return int(line.split()[1]) / (1024 * 1024)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def _can_offload_to_cpu():
    """Check if there's enough RAM to keep a model in CPU while loading another.

    Returns:
        bool: True when available system RAM is at least ``_RAM_THRESHOLD_GB``,
        meaning the outgoing model can be parked in CPU memory (fast path);
        False when it must be fully deleted and reloaded from disk later
        (safe path).
    """
    available = _get_available_ram_gb()
    can = available >= _RAM_THRESHOLD_GB
    # Lazy %-style arguments: the message is only formatted when the INFO
    # level is actually enabled (logging best practice, avoids f-string work).
    logger.info(
        "RAM check: %.1fGB available, need %.0fGB for CPU offload → %s",
        available,
        _RAM_THRESHOLD_GB,
        "CPU offload (fast)" if can else "full delete (safe)",
    )
    return can
|
||||||
|
|
||||||
|
|
||||||
|
def _prepare_for_tex():
    """Free VRAM from shape model before loading texture pipeline."""
    global i23d_worker, _i23d_on_cpu

    if i23d_worker is not None:
        if _can_offload_to_cpu():
            # Fast path: park the weights in system RAM so the next shape
            # request can restore them with a quick device transfer.
            logger.info("Offloading shape model to CPU RAM (fast path)...")
            i23d_worker.to('cpu')
            _i23d_on_cpu = True
            torch.cuda.empty_cache()
        else:
            # Safe path: RAM is tight, so drop the model completely; a later
            # shape request pays the disk-reload cost instead of risking OOM.
            logger.info("Deleting shape model entirely (safe path, limited RAM)...")
            del i23d_worker
            i23d_worker = None
            _i23d_on_cpu = False
            gc.collect()
            torch.cuda.empty_cache()

    _ensure_tex_pipeline()
|
||||||
|
|
||||||
|
|
||||||
def _ensure_i23d_worker():
|
def _ensure_i23d_worker():
|
||||||
"""Load shape model to GPU if not already loaded."""
|
"""Load shape model to GPU — from CPU RAM (fast) or disk (slow)."""
|
||||||
global i23d_worker
|
global i23d_worker, _i23d_on_cpu
|
||||||
if i23d_worker is None:
|
if i23d_worker is not None and _i23d_on_cpu:
|
||||||
logger.info("Reloading shape model to GPU...")
|
logger.info("Restoring shape model from CPU to GPU (fast path)...")
|
||||||
|
i23d_worker.to(args.device)
|
||||||
|
_i23d_on_cpu = False
|
||||||
|
elif i23d_worker is None:
|
||||||
|
logger.info("Reloading shape model from disk to GPU (slow path)...")
|
||||||
gc.collect()
|
gc.collect()
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline
|
from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline
|
||||||
@@ -266,6 +311,8 @@ def _ensure_i23d_worker():
|
|||||||
use_safetensors=False,
|
use_safetensors=False,
|
||||||
device=args.device,
|
device=args.device,
|
||||||
)
|
)
|
||||||
|
_i23d_on_cpu = False
|
||||||
|
# else: already on GPU, nothing to do
|
||||||
|
|
||||||
|
|
||||||
def _unload_tex_pipeline():
|
def _unload_tex_pipeline():
|
||||||
@@ -460,12 +507,10 @@ def generation_all(
|
|||||||
|
|
||||||
text_path = os.path.join(save_folder, f'textured_mesh.obj')
|
text_path = os.path.join(save_folder, f'textured_mesh.obj')
|
||||||
|
|
||||||
# In low_vram_mode: fully delete shape model from RAM before loading texture
|
# In low_vram_mode: adaptively offload shape model (CPU or delete based on
|
||||||
# pipeline. With only 16GB system RAM, keeping i23d in CPU RAM (~7GB) while
|
# available RAM), then load texture pipeline.
|
||||||
# loading tex (~7GB) would exceed available memory and trigger OOM Killer.
|
|
||||||
if args.low_vram_mode:
|
if args.low_vram_mode:
|
||||||
_unload_i23d_worker()
|
_prepare_for_tex()
|
||||||
_ensure_tex_pipeline()
|
|
||||||
|
|
||||||
path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False)
|
path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False)
|
||||||
|
|
||||||
|
|||||||
@@ -13,12 +13,21 @@
|
|||||||
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
import onnxruntime as ort
|
||||||
from rembg import remove, new_session
|
from rembg import remove, new_session
|
||||||
|
|
||||||
|
|
||||||
class BackgroundRemover():
|
class BackgroundRemover():
|
||||||
def __init__(self, model_name: str = "bria-rmbg"):
    """Create a background-removal session that runs on CPU only.

    Forcing CPUExecutionProvider prevents onnxruntime's CUDA memory arena
    from pre-allocating VRAM that the PyTorch shape/texture models need.
    Background removal is lightweight and runs fast on CPU.

    Args:
        model_name: rembg model identifier passed to ``new_session``.
    """
    # Belt and braces: patching ort.get_device only helps rembg versions
    # whose session setup consults get_device(); versions that call
    # ort.get_available_providers() ignore it. Passing providers explicitly
    # to new_session covers those, so both mechanisms are applied.
    _orig_get_device = ort.get_device
    ort.get_device = lambda: "CPU"
    try:
        self.session = new_session(
            model_name, providers=["CPUExecutionProvider"]
        )
    finally:
        # Always restore the original function, even if session init fails.
        ort.get_device = _orig_get_device
||||||
|
|
||||||
def __call__(self, image: Image.Image):
|
def __call__(self, image: Image.Image):
|
||||||
output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0])
|
output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0])
|
||||||
|
|||||||
Reference in New Issue
Block a user