fix(oom): use mmap=True for checkpoint loading + malloc_trim + expandable_segments

Root cause: torch.load() reads 6.9GB .ckpt into Python heap + model params
in CPU RAM = ~14GB peak, exceeding 16GB system RAM → OOM Killer.

Fix 1 - mmap=True on all torch.load() calls (torch 2.7 supports this):
  With mmap, checkpoint storage is file-backed (not heap). Only the model
  parameters (also ~7GB) exist in physical RAM during loading. Peak RAM
  drops from ~14GB to ~7GB — within safe limits on 16GB machines.
  Files changed: pipelines.py, hunyuan3ddit.py, model.py (×2), flow_matching_sit.py

Fix 2 - malloc_trim(0) after every gc.collect():
  Forces glibc to return freed heap pages to OS immediately, so Python's
  memory pool doesn't hoard freed model memory before the next load.

Fix 3 - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True:
  Prevents CUDA allocator fragmentation between model switches.

Fix 4 - Adaptive threshold recalculated:
  With mmap loading, loading a model requires ~7.5GB (model params) not
  14GB. CPU offload threshold lowered from 16GB → 10.5GB, enabling the fast
  path on machines that previously fell short of the 16GB headroom check.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Akasei
2026-03-16 23:18:16 +08:00
parent 6534f4ba15
commit f192c86c60
46 changed files with 334079 additions and 10 deletions

View File

@@ -34,6 +34,8 @@ import random
import shutil
import subprocess
import time
import ctypes
import ctypes.util
from glob import glob
from pathlib import Path
@@ -49,6 +51,18 @@ import numpy as np
from hy3dshape.utils import logger
from hy3dpaint.convert_utils import create_glb_with_pbr_materials
# Force OS to reclaim freed heap pages, reducing Python's RSS after model deletion.
# malloc_trim is glibc-specific; on other libcs (musl, macOS, Windows) either the
# library load or the symbol lookup can fail, so BOTH are best-effort: an
# unguarded ctypes.CDLL here would crash the whole module at import time.
try:
    _libc = ctypes.CDLL(ctypes.util.find_library("c") or "libc.so.6", use_errno=True)
except OSError:  # no loadable C library (e.g. Windows) — trimming becomes a no-op
    _libc = None


def _malloc_trim():
    """Ask glibc to return freed heap pages to the OS (no-op off glibc).

    Called after gc.collect() so Python's freed model memory is handed back
    to the kernel immediately instead of lingering in the allocator's pool.
    Never raises: cleanup must not take down the pipeline.
    """
    try:
        if _libc is not None:
            _libc.malloc_trim(0)
    except Exception:
        # AttributeError when the libc lacks malloc_trim, or any call failure.
        pass


# Allow CUDA allocator to use expandable segments, reducing fragmentation.
# setdefault so an operator-provided PYTORCH_CUDA_ALLOC_CONF is respected.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
# Globals for lazy load/unload
i23d_worker = None
tex_pipeline = None
@@ -237,9 +251,10 @@ height="{height}" width="100%" frameborder="0"></iframe>'
# ---------------------------------------------------------------------------
# Approximate RAM required (GB) to hold one model in CPU while loading another.
# Model weights: ~7GB each. Loading from disk stages ~7GB temporarily.
# Total: 7 (existing in CPU) + 7 (loading new) + 2 (OS headroom) = 16GB.
_RAM_THRESHOLD_GB = 16.0
# With mmap=True loading, staging a model needs ~0 extra heap RAM.
# So threshold = size of model in CPU RAM = ~7.5GB, plus 3GB headroom = 10.5GB.
# With 16GB total, we need at least ~10.5GB free to safely offload i23d to CPU.
_RAM_THRESHOLD_GB = 10.5
# Track whether i23d is offloaded to CPU RAM (vs deleted entirely).
_i23d_on_cpu = False
@@ -258,12 +273,12 @@ def _get_available_ram_gb():
def _can_offload_to_cpu():
"""Check if there's enough RAM to keep a model in CPU while loading another."""
"""Check if there's enough RAM to keep i23d in CPU while loading tex."""
available = _get_available_ram_gb()
can = available >= _RAM_THRESHOLD_GB
logger.info(
f"RAM check: {available:.1f}GB available, "
f"need {_RAM_THRESHOLD_GB:.0f}GB for CPU offload → "
f"need {_RAM_THRESHOLD_GB:.1f}GB for CPU offload → "
f"{'CPU offload (fast)' if can else 'full delete (safe)'}"
)
return can
@@ -280,6 +295,8 @@ def _prepare_for_tex():
logger.info("Offloading shape model to CPU RAM (fast path)...")
i23d_worker.to('cpu')
_i23d_on_cpu = True
gc.collect()
_malloc_trim()
torch.cuda.empty_cache()
else:
logger.info("Deleting shape model entirely (safe path, limited RAM)...")
@@ -288,6 +305,7 @@ def _prepare_for_tex():
_i23d_on_cpu = False
gc.collect()
gc.collect()
_malloc_trim()
torch.cuda.empty_cache()
_ensure_tex_pipeline()
@@ -303,6 +321,7 @@ def _ensure_i23d_worker():
elif i23d_worker is None:
logger.info("Reloading shape model from disk to GPU (slow path)...")
gc.collect()
_malloc_trim()
torch.cuda.empty_cache()
from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline
i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
@@ -324,6 +343,7 @@ def _unload_tex_pipeline():
tex_pipeline = None
gc.collect()
gc.collect()
_malloc_trim()
torch.cuda.empty_cache()
@@ -332,6 +352,7 @@ def _ensure_tex_pipeline():
global tex_pipeline
if tex_pipeline is None and tex_conf is not None:
gc.collect()
_malloc_trim()
torch.cuda.empty_cache()
from hy3dpaint.textureGenPipeline import Hunyuan3DPaintPipeline
logger.info("Loading texture pipeline to GPU...")