diff --git a/gradio_app.py b/gradio_app.py index e03ecbb..b38b11d 100644 --- a/gradio_app.py +++ b/gradio_app.py @@ -223,40 +223,85 @@ height="{height}" width="100%" frameborder="0">' # --------------------------------------------------------------------------- # VRAM management helpers (used when --low_vram_mode is set) # -# Strategy (RTX 3080, 16GB RAM / 20GB VRAM): -# Both models are fully deleted between uses — neither is kept in CPU RAM. -# This prevents the 16GB RAM from being exhausted when loading one model -# while the other is still resident in CPU memory. +# Adaptive strategy based on available system RAM: # -# i23d (shape model, ~7.25GB VRAM): -# → fully del'd after each shape generation -# → reloaded from HF-cached weights on next shape request (~20-30s) +# When switching from shape → texture (or vice versa): +# 1. Check available RAM via /proc/meminfo +# 2. If enough RAM to hold a model in CPU while loading the other (~16GB): +# → .to('cpu') the outgoing model (fast, no disk reload needed later) +# 3. If RAM is tight: +# → fully del the outgoing model, reload from disk later (~20-30s) # -# tex_pipeline (texture model, ~6.59GB VRAM): -# → fully del'd after each texture generation -# → reloaded from HF-cached weights on next texture request (~20s) # -# gc.collect() + empty_cache() before each load ensures previous tensors -# are freed before the new checkpoint is staged in CPU RAM. +# This allows machines with ≥32GB RAM to swap models instantly, +# while 16GB machines safely fall back to disk reload. # --------------------------------------------------------------------------- -def _unload_i23d_worker(): - """Delete shape model entirely, freeing VRAM and CPU RAM.""" - global i23d_worker - if i23d_worker is not None: - logger.info("Unloading shape model from memory...") +# Approximate RAM required (GB) to hold one model in CPU while loading another. +# Model weights: ~7GB each. Loading from disk stages ~7GB temporarily. 
+# Total: 7 (existing in CPU) + 7 (loading new) + 2 (OS headroom) = 16GB. +_RAM_THRESHOLD_GB = 16.0 + +# Track whether i23d is offloaded to CPU RAM (vs deleted entirely). +_i23d_on_cpu = False + + +def _get_available_ram_gb(): + """Return available system RAM in GB from /proc/meminfo.""" + try: + with open('/proc/meminfo') as f: + for line in f: + if line.startswith('MemAvailable:'): + return int(line.split()[1]) / (1024 * 1024) + except Exception: + pass + return 0.0 + + +def _can_offload_to_cpu(): + """Check if there's enough RAM to keep a model in CPU while loading another.""" + available = _get_available_ram_gb() + can = available >= _RAM_THRESHOLD_GB + logger.info( + f"RAM check: {available:.1f}GB available, " + f"need {_RAM_THRESHOLD_GB:.0f}GB for CPU offload → " + f"{'CPU offload (fast)' if can else 'full delete (safe)'}" + ) + return can + + +def _prepare_for_tex(): + """Free VRAM from shape model before loading texture pipeline.""" + global i23d_worker, _i23d_on_cpu + if i23d_worker is None: + _ensure_tex_pipeline() + return + + if _can_offload_to_cpu(): + logger.info("Offloading shape model to CPU RAM (fast path)...") + i23d_worker.to('cpu') + _i23d_on_cpu = True + torch.cuda.empty_cache() + else: + logger.info("Deleting shape model entirely (safe path, limited RAM)...") del i23d_worker i23d_worker = None + _i23d_on_cpu = False gc.collect() gc.collect() torch.cuda.empty_cache() + _ensure_tex_pipeline() + def _ensure_i23d_worker(): - """Load shape model to GPU if not already loaded.""" - global i23d_worker - if i23d_worker is None: - logger.info("Reloading shape model to GPU...") + """Load shape model to GPU — from CPU RAM (fast) or disk (slow).""" + global i23d_worker, _i23d_on_cpu + if i23d_worker is not None and _i23d_on_cpu: + logger.info("Restoring shape model from CPU to GPU (fast path)...") + i23d_worker.to(args.device) + _i23d_on_cpu = False + elif i23d_worker is None: + logger.info("Reloading shape model from disk to GPU (slow path)...") 
gc.collect() torch.cuda.empty_cache() from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline @@ -266,6 +311,8 @@ def _ensure_i23d_worker(): use_safetensors=False, device=args.device, ) + _i23d_on_cpu = False + # else: already on GPU, nothing to do def _unload_tex_pipeline(): @@ -460,12 +507,10 @@ def generation_all( text_path = os.path.join(save_folder, f'textured_mesh.obj') - # In low_vram_mode: fully delete shape model from RAM before loading texture - # pipeline. With only 16GB system RAM, keeping i23d in CPU RAM (~7GB) while - # loading tex (~7GB) would exceed available memory and trigger OOM Killer. + # In low_vram_mode: adaptively offload shape model (CPU or delete based on + # available RAM), then load texture pipeline. if args.low_vram_mode: - _unload_i23d_worker() - _ensure_tex_pipeline() + _prepare_for_tex() path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False) diff --git a/hy3dshape/hy3dshape/rembg.py b/hy3dshape/hy3dshape/rembg.py index bd94de3..92d4d79 100644 --- a/hy3dshape/hy3dshape/rembg.py +++ b/hy3dshape/hy3dshape/rembg.py @@ -13,12 +13,21 @@ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. from PIL import Image +import onnxruntime as ort from rembg import remove, new_session class BackgroundRemover(): def __init__(self, model_name: str = "bria-rmbg"): - self.session = new_session(model_name) + # Force CPU-only execution for onnxruntime to prevent CUDA arena + # from consuming ~12GB+ VRAM that PyTorch models need. + # Background removal is lightweight and runs fast on CPU. + _orig = ort.get_device + ort.get_device = lambda: "CPU" + try: + self.session = new_session(model_name) + finally: + ort.get_device = _orig def __call__(self, image: Image.Image): output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0])