diff --git a/gradio_app.py b/gradio_app.py index ff5e1d8..c0cd5fd 100644 --- a/gradio_app.py +++ b/gradio_app.py @@ -51,10 +51,25 @@ import numpy as np from hy3dshape.utils import logger from hy3dpaint.convert_utils import create_glb_with_pbr_materials -# Force OS to reclaim freed heap pages, reducing Python's RSS after model deletion. +# ── glibc malloc tuning ─────────────────────────────────────────────────────── +# NOTE: glibc reads MALLOC_* env tunables once at process startup, so the +# setdefault() calls below cannot retune this already-running process — they +# only propagate to child processes. The mallopt() calls below tune us live: +# M_MMAP_THRESHOLD (-3): allocations >= 1 MB use anonymous mmap and are +# returned to the OS via munmap on free (PyTorch tensors are all >> 1 MB). +# TODO(review): add _libc.mallopt(-8, 1) (M_ARENA_MAX) to cap arenas in-process. +os.environ.setdefault("MALLOC_ARENA_MAX", "1") +os.environ.setdefault("MALLOC_MMAP_THRESHOLD_", "1048576") # 1 MB + _libc = ctypes.CDLL(ctypes.util.find_library("c") or "libc.so.6", use_errno=True) +try: + _libc.mallopt(-3, 1024 * 1024) # M_MMAP_THRESHOLD = 1 MB (runtime) + _libc.mallopt(-1, 128 * 1024) # M_TRIM_THRESHOLD = 128 KB (trim aggressively) +except Exception: + pass def _malloc_trim(): + """Return all free heap pages to the OS (glibc brk-based heap).""" try: _libc.malloc_trim(0) except Exception: @@ -285,13 +300,18 @@ def _can_offload_to_cpu(): def _prepare_for_tex(): - """Free VRAM from shape model before loading texture pipeline.""" + """Free VRAM from shape model before loading texture pipeline. + + In low_vram_mode the shape model is always fully deleted so that its + ~7.25 GB of VRAM is completely free before the texture pipeline loads. + CPU-offload path is only considered when low_vram_mode is disabled. 
+ """ global i23d_worker, _i23d_on_cpu if i23d_worker is None: _ensure_tex_pipeline() return - if _can_offload_to_cpu(): + if not args.low_vram_mode and _can_offload_to_cpu(): logger.info("Offloading shape model to CPU RAM (fast path)...") i23d_worker.to('cpu') _i23d_on_cpu = True @@ -299,7 +319,7 @@ def _prepare_for_tex(): _malloc_trim() torch.cuda.empty_cache() else: - logger.info("Deleting shape model entirely (safe path, limited RAM)...") + logger.info("Deleting shape model entirely (low_vram path)...") del i23d_worker i23d_worker = None _i23d_on_cpu = False @@ -312,14 +332,17 @@ def _ensure_i23d_worker(): - """Load shape model to GPU — from CPU RAM (fast) or disk (slow).""" + """Load shape model to GPU. + + In low_vram_mode always reload from disk (CPU-offload path is never used). + """ global i23d_worker, _i23d_on_cpu - if i23d_worker is not None and _i23d_on_cpu: + if not args.low_vram_mode and i23d_worker is not None and _i23d_on_cpu: logger.info("Restoring shape model from CPU to GPU (fast path)...") i23d_worker.to(args.device) _i23d_on_cpu = False elif i23d_worker is None: - logger.info("Reloading shape model from disk to GPU (slow path)...") + logger.info("Reloading shape model from disk to GPU...") gc.collect() _malloc_trim() torch.cuda.empty_cache() @@ -487,6 +510,27 @@ def generation_all( num_chunks=200000, randomize_seed: bool = False, ): + # NOTE(review): the [MEM] RSS/VRAM logging below is a temporary profiling aid + def _rss_mb(): + try: + with open('/proc/self/status') as _f: + for _l in _f: + if _l.startswith('VmRSS:'): + return int(_l.split()[1]) // 1024 + except Exception: + pass + return 0 + def _rlog(label): + vram = torch.cuda.memory_allocated() // (1024*1024) + logger.info(f"[MEM] {label:40s} RSS={_rss_mb():6d} MB VRAM={vram:5d} MB") + + # Proactively free any memory left over from previous generations so that + # fresh model loading starts from the lowest possible RSS baseline. 
+ gc.collect(2) + _malloc_trim() + torch.cuda.empty_cache() + _rlog("generation_all start") + start_time_0 = time.time() mesh, image, save_folder, stats, seed = _gen_shape( caption, @@ -503,18 +547,12 @@ def generation_all( num_chunks=num_chunks, randomize_seed=randomize_seed, ) + _rlog("after _gen_shape") path = export_mesh(mesh, save_folder, textured=False) - print(path) print('='*40) - # tmp_time = time.time() - # mesh = floater_remove_worker(mesh) - # mesh = degenerate_face_remove_worker(mesh) - # logger.info("---Postprocessing takes %s seconds ---" % (time.time() - tmp_time)) - # stats['time']['postprocessing'] = time.time() - tmp_time - tmp_time = time.time() mesh = face_reduce_worker(mesh) @@ -523,22 +561,25 @@ def generation_all( logger.info("---Face Reduction takes %s seconds ---" % (time.time() - tmp_time)) stats['time']['face reduction'] = time.time() - tmp_time + _rlog("after face reduction") tmp_time = time.time() text_path = os.path.join(save_folder, f'textured_mesh.obj') - # In low_vram_mode: adaptively offload shape model (CPU or delete based on - # available RAM), then load texture pipeline. + # In low_vram_mode: delete shape model then load texture pipeline. if args.low_vram_mode: _prepare_for_tex() + _rlog("after _prepare_for_tex (shape deleted, tex loaded)") path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False) + _rlog("after tex_pipeline inference") # Unload texture pipeline after use so VRAM is free for the next shape request. 
if args.low_vram_mode: _unload_tex_pipeline() - + _rlog("after _unload_tex_pipeline") + logger.info("---Texture Generation takes %s seconds ---" % (time.time() - tmp_time)) stats['time']['texture generation'] = time.time() - tmp_time @@ -555,6 +596,7 @@ def generation_all( width=HTML_WIDTH, textured=True) if args.low_vram_mode: torch.cuda.empty_cache() + _rlog("generation_all complete") return ( gr.update(value=path), gr.update(value=glb_path_textured), diff --git a/hy3dshape/hy3dshape/models/autoencoders/model.py b/hy3dshape/hy3dshape/models/autoencoders/model.py index 5497177..6c44704 100644 --- a/hy3dshape/hy3dshape/models/autoencoders/model.py +++ b/hy3dshape/hy3dshape/models/autoencoders/model.py @@ -149,7 +149,7 @@ class VectsetVAE(nn.Module): model_kwargs.update(kwargs) model = cls(**model_kwargs) - model.load_state_dict(ckpt) + model.load_state_dict(ckpt, assign=True) model.to(device=device, dtype=dtype) return model @@ -189,7 +189,7 @@ class VectsetVAE(nn.Module): if k.startswith(ik): print("Deleting key {} from state_dict.".format(k)) del state_dict[k] - missing, unexpected = self.load_state_dict(state_dict, strict=False) + missing, unexpected = self.load_state_dict(state_dict, strict=False, assign=True) print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys") if len(missing) > 0: print(f"Missing Keys: {missing}") diff --git a/hy3dshape/hy3dshape/pipelines.py b/hy3dshape/hy3dshape/pipelines.py index 57b36df..e622c48 100644 --- a/hy3dshape/hy3dshape/pipelines.py +++ b/hy3dshape/hy3dshape/pipelines.py @@ -166,14 +166,16 @@ class Hunyuan3DDiTPipeline: ckpt[model_name][new_key] = value else: ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True, mmap=True) - # load model + # load model — use assign=True so mmap fp16 tensors are assigned directly as + # parameters (no fp16→fp32 widening copy), keeping CPU anon-rss near zero. 
model = instantiate_from_config(config['model']) - model.load_state_dict(ckpt['model']) + model.load_state_dict(ckpt['model'], assign=True) vae = instantiate_from_config(config['vae']) - vae.load_state_dict(ckpt['vae'], strict=False) + vae.load_state_dict(ckpt['vae'], strict=False, assign=True) conditioner = instantiate_from_config(config['conditioner']) if 'conditioner' in ckpt: - conditioner.load_state_dict(ckpt['conditioner']) + conditioner.load_state_dict(ckpt['conditioner'], assign=True) + del ckpt # free mmap file-backed pages now that params hold their own refs image_processor = instantiate_from_config(config['image_processor']) scheduler = instantiate_from_config(config['scheduler'])