fix(oom): use mmap=True for checkpoint loading + malloc_trim + expandable_segments
Root cause: torch.load() reads 6.9GB .ckpt into Python heap + model params in CPU RAM = ~14GB peak, exceeding 16GB system RAM → OOM Killer. Fix 1 - mmap=True on all torch.load() calls (torch 2.7 supports this): With mmap, checkpoint storage is file-backed (not heap). Only the model parameters (also ~7GB) exist in physical RAM during loading. Peak RAM drops from ~14GB to ~7GB — within safe limits on 16GB machines. Files changed: pipelines.py, hunyuan3ddit.py, model.py (×2), flow_matching_sit.py. Fix 2 - malloc_trim(0) after every gc.collect(): Forces glibc to return freed heap pages to the OS immediately, so Python's memory pool doesn't hoard freed model memory before the next load. Fix 3 - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True: Prevents CUDA allocator fragmentation between model switches. Fix 4 - Adaptive threshold recalculated: With mmap loading, reloading a model requires ~7.5GB (model params), not 14GB. CPU offload threshold lowered from 16GB → 10.5GB, enabling the fast path on machines with more headroom. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -34,6 +34,8 @@ import random
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
import ctypes
|
||||
import ctypes.util
|
||||
from glob import glob
|
||||
from pathlib import Path
|
||||
|
||||
@@ -49,6 +51,18 @@ import numpy as np
|
||||
from hy3dshape.utils import logger
|
||||
from hy3dpaint.convert_utils import create_glb_with_pbr_materials
|
||||
|
||||
# Force OS to reclaim freed heap pages, reducing Python's RSS after model deletion.
|
||||
_libc = ctypes.CDLL(ctypes.util.find_library("c") or "libc.so.6", use_errno=True)
|
||||
|
||||
def _malloc_trim():
|
||||
try:
|
||||
_libc.malloc_trim(0)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Allow CUDA allocator to use expandable segments, reducing fragmentation.
# Must be in the environment before the CUDA caching allocator initializes;
# setdefault() respects an operator-provided override.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

# Globals for lazy load/unload
# i23d_worker: image-to-3D shape pipeline (Hunyuan3DDiTFlowMatchingPipeline);
#   None until first use, may be CPU-offloaded or deleted to free memory.
i23d_worker = None
# tex_pipeline: texture pipeline — presumably Hunyuan3DPaintPipeline (loaded in
#   _ensure_tex_pipeline); None until first use, unloaded between model switches.
tex_pipeline = None
|
||||
@@ -237,9 +251,10 @@ height="{height}" width="100%" frameborder="0"></iframe>'
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Approximate RAM required (GB) to hold one model in CPU while loading another.
|
||||
# Model weights: ~7GB each. Loading from disk stages ~7GB temporarily.
|
||||
# Total: 7 (existing in CPU) + 7 (loading new) + 2 (OS headroom) = 16GB.
|
||||
_RAM_THRESHOLD_GB = 16.0
|
||||
# With mmap=True loading, staging a model needs ~0 extra heap RAM.
|
||||
# So threshold = size of model in CPU RAM = ~7.5GB, plus 3GB headroom = 10.5GB.
|
||||
# With 16GB total, we need at least ~10.5GB free to safely offload i23d to CPU.
|
||||
_RAM_THRESHOLD_GB = 10.5
|
||||
|
||||
# Track whether i23d is offloaded to CPU RAM (vs deleted entirely).
|
||||
_i23d_on_cpu = False
|
||||
@@ -258,12 +273,12 @@ def _get_available_ram_gb():
|
||||
|
||||
|
||||
def _can_offload_to_cpu():
|
||||
"""Check if there's enough RAM to keep a model in CPU while loading another."""
|
||||
"""Check if there's enough RAM to keep i23d in CPU while loading tex."""
|
||||
available = _get_available_ram_gb()
|
||||
can = available >= _RAM_THRESHOLD_GB
|
||||
logger.info(
|
||||
f"RAM check: {available:.1f}GB available, "
|
||||
f"need {_RAM_THRESHOLD_GB:.0f}GB for CPU offload → "
|
||||
f"need {_RAM_THRESHOLD_GB:.1f}GB for CPU offload → "
|
||||
f"{'CPU offload (fast)' if can else 'full delete (safe)'}"
|
||||
)
|
||||
return can
|
||||
@@ -280,6 +295,8 @@ def _prepare_for_tex():
|
||||
logger.info("Offloading shape model to CPU RAM (fast path)...")
|
||||
i23d_worker.to('cpu')
|
||||
_i23d_on_cpu = True
|
||||
gc.collect()
|
||||
_malloc_trim()
|
||||
torch.cuda.empty_cache()
|
||||
else:
|
||||
logger.info("Deleting shape model entirely (safe path, limited RAM)...")
|
||||
@@ -288,6 +305,7 @@ def _prepare_for_tex():
|
||||
_i23d_on_cpu = False
|
||||
gc.collect()
|
||||
gc.collect()
|
||||
_malloc_trim()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
_ensure_tex_pipeline()
|
||||
@@ -303,6 +321,7 @@ def _ensure_i23d_worker():
|
||||
elif i23d_worker is None:
|
||||
logger.info("Reloading shape model from disk to GPU (slow path)...")
|
||||
gc.collect()
|
||||
_malloc_trim()
|
||||
torch.cuda.empty_cache()
|
||||
from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline
|
||||
i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
|
||||
@@ -324,6 +343,7 @@ def _unload_tex_pipeline():
|
||||
tex_pipeline = None
|
||||
gc.collect()
|
||||
gc.collect()
|
||||
_malloc_trim()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@@ -332,6 +352,7 @@ def _ensure_tex_pipeline():
|
||||
global tex_pipeline
|
||||
if tex_pipeline is None and tex_conf is not None:
|
||||
gc.collect()
|
||||
_malloc_trim()
|
||||
torch.cuda.empty_cache()
|
||||
from hy3dpaint.textureGenPipeline import Hunyuan3DPaintPipeline
|
||||
logger.info("Loading texture pipeline to GPU...")
|
||||
|
||||
Reference in New Issue
Block a user