fix(oom): use mmap=True for checkpoint loading + malloc_trim + expandable_segments

Root cause: torch.load() reads 6.9GB .ckpt into Python heap + model params
in CPU RAM = ~14GB peak, exceeding 16GB system RAM → OOM Killer.

Fix 1 - mmap=True on all torch.load() calls (torch 2.7 supports this):
  With mmap, checkpoint storage is file-backed (not heap). Only the model
  parameters (also ~7GB) exist in physical RAM during loading. Peak RAM
  drops from ~14GB to ~7GB — within safe limits on 16GB machines.
  Files changed: pipelines.py, hunyuan3ddit.py, model.py (×2), flow_matching_sit.py

Fix 2 - malloc_trim(0) after every gc.collect():
  Forces glibc to return freed heap pages to OS immediately, so Python's
  memory pool doesn't hoard freed model memory before the next load.

Fix 3 - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True:
  Prevents CUDA allocator fragmentation between model switches.

Fix 4 - Adaptive threshold recalculated:
  With mmap loading, loading a model requires ~7.5GB (model params) not
  14GB. CPU offload threshold lowered from 16GB → 10.5GB, enabling the fast
  path on machines that previously fell short of the 16GB headroom check.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Akasei
2026-03-16 23:18:16 +08:00
parent 6534f4ba15
commit f192c86c60
46 changed files with 334079 additions and 10 deletions

View File

@@ -34,6 +34,8 @@ import random
import shutil
import subprocess
import time
import ctypes
import ctypes.util
from glob import glob
from pathlib import Path
@@ -49,6 +51,18 @@ import numpy as np
from hy3dshape.utils import logger
from hy3dpaint.convert_utils import create_glb_with_pbr_materials
# Force OS to reclaim freed heap pages, reducing Python's RSS after model deletion.
# malloc_trim is glibc-specific; on other libcs (musl, macOS, Windows) either the
# library load or the symbol lookup can fail, so BOTH are best-effort: an
# unguarded ctypes.CDLL here would crash the whole module at import time.
try:
    _libc = ctypes.CDLL(ctypes.util.find_library("c") or "libc.so.6", use_errno=True)
except OSError:  # no loadable C library (e.g. Windows) — trimming becomes a no-op
    _libc = None


def _malloc_trim():
    """Ask glibc to return freed heap pages to the OS (no-op off glibc).

    Called after gc.collect() so Python's freed model memory is handed back
    to the kernel immediately instead of lingering in the allocator's pool.
    Never raises: cleanup must not take down the pipeline.
    """
    try:
        if _libc is not None:
            _libc.malloc_trim(0)
    except Exception:
        # AttributeError when the libc lacks malloc_trim, or any call failure.
        pass


# Allow CUDA allocator to use expandable segments, reducing fragmentation.
# setdefault so an operator-provided PYTORCH_CUDA_ALLOC_CONF is respected.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
# Globals for lazy load/unload
i23d_worker = None
tex_pipeline = None
@@ -237,9 +251,10 @@ height="{height}" width="100%" frameborder="0"></iframe>'
# ---------------------------------------------------------------------------
# Approximate RAM required (GB) to hold one model in CPU while loading another.
# Model weights: ~7GB each. Loading from disk stages ~7GB temporarily.
# Total: 7 (existing in CPU) + 7 (loading new) + 2 (OS headroom) = 16GB.
_RAM_THRESHOLD_GB = 16.0
# With mmap=True loading, staging a model needs ~0 extra heap RAM.
# So threshold = size of model in CPU RAM = ~7.5GB, plus 3GB headroom = 10.5GB.
# With 16GB total, we need at least ~10.5GB free to safely offload i23d to CPU.
_RAM_THRESHOLD_GB = 10.5
# Track whether i23d is offloaded to CPU RAM (vs deleted entirely).
_i23d_on_cpu = False
@@ -258,12 +273,12 @@ def _get_available_ram_gb():
def _can_offload_to_cpu():
"""Check if there's enough RAM to keep a model in CPU while loading another."""
"""Check if there's enough RAM to keep i23d in CPU while loading tex."""
available = _get_available_ram_gb()
can = available >= _RAM_THRESHOLD_GB
logger.info(
f"RAM check: {available:.1f}GB available, "
f"need {_RAM_THRESHOLD_GB:.0f}GB for CPU offload → "
f"need {_RAM_THRESHOLD_GB:.1f}GB for CPU offload → "
f"{'CPU offload (fast)' if can else 'full delete (safe)'}"
)
return can
@@ -280,6 +295,8 @@ def _prepare_for_tex():
logger.info("Offloading shape model to CPU RAM (fast path)...")
i23d_worker.to('cpu')
_i23d_on_cpu = True
gc.collect()
_malloc_trim()
torch.cuda.empty_cache()
else:
logger.info("Deleting shape model entirely (safe path, limited RAM)...")
@@ -288,6 +305,7 @@ def _prepare_for_tex():
_i23d_on_cpu = False
gc.collect()
gc.collect()
_malloc_trim()
torch.cuda.empty_cache()
_ensure_tex_pipeline()
@@ -303,6 +321,7 @@ def _ensure_i23d_worker():
elif i23d_worker is None:
logger.info("Reloading shape model from disk to GPU (slow path)...")
gc.collect()
_malloc_trim()
torch.cuda.empty_cache()
from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline
i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
@@ -324,6 +343,7 @@ def _unload_tex_pipeline():
tex_pipeline = None
gc.collect()
gc.collect()
_malloc_trim()
torch.cuda.empty_cache()
@@ -332,6 +352,7 @@ def _ensure_tex_pipeline():
global tex_pipeline
if tex_pipeline is None and tex_conf is not None:
gc.collect()
_malloc_trim()
torch.cuda.empty_cache()
from hy3dpaint.textureGenPipeline import Hunyuan3DPaintPipeline
logger.info("Loading texture pipeline to GPU...")