diff --git a/gradio_app.py b/gradio_app.py index ff5e1d8..c0cd5fd 100644 --- a/gradio_app.py +++ b/gradio_app.py @@ -51,10 +51,25 @@ import numpy as np from hy3dshape.utils import logger from hy3dpaint.convert_utils import create_glb_with_pbr_materials -# Force OS to reclaim freed heap pages, reducing Python's RSS after model deletion. +# ── glibc malloc tuning ─────────────────────────────────────────────────────── +# NOTE: glibc reads MALLOC_* env tunables once at process startup, so the +# setdefault() calls below cannot retune this already-running process — they +# only propagate to child processes. The mallopt() calls below tune us live: +# M_MMAP_THRESHOLD (-3): allocations >= 1 MB use anonymous mmap and are +# returned to the OS via munmap on free (PyTorch tensors are all >> 1 MB). +# TODO(review): add _libc.mallopt(-8, 1) (M_ARENA_MAX) to cap arenas in-process. +os.environ.setdefault("MALLOC_ARENA_MAX", "1") +os.environ.setdefault("MALLOC_MMAP_THRESHOLD_", "1048576") # 1 MB + _libc = ctypes.CDLL(ctypes.util.find_library("c") or "libc.so.6", use_errno=True) +try: + _libc.mallopt(-3, 1024 * 1024) # M_MMAP_THRESHOLD = 1 MB (runtime) + _libc.mallopt(-1, 128 * 1024) # M_TRIM_THRESHOLD = 128 KB (trim aggressively) +except Exception: + pass def _malloc_trim(): + """Return all free heap pages to the OS (glibc brk-based heap).""" try: _libc.malloc_trim(0) except Exception: @@ -285,13 +300,18 @@ def _can_offload_to_cpu(): def _prepare_for_tex(): - """Free VRAM from shape model before loading texture pipeline.""" + """Free VRAM from shape model before loading texture pipeline. + + In low_vram_mode the shape model is always fully deleted so that its + ~7.25 GB of VRAM is completely free before the texture pipeline loads. + CPU-offload path is only considered when low_vram_mode is disabled. 
+ """ global i23d_worker, _i23d_on_cpu if i23d_worker is None: _ensure_tex_pipeline() return - if _can_offload_to_cpu(): + if not args.low_vram_mode and _can_offload_to_cpu(): logger.info("Offloading shape model to CPU RAM (fast path)...") i23d_worker.to('cpu') _i23d_on_cpu = True @@ -299,7 +319,7 @@ def _prepare_for_tex(): _malloc_trim() torch.cuda.empty_cache() else: - logger.info("Deleting shape model entirely (safe path, limited RAM)...") + logger.info("Deleting shape model entirely (low_vram path)...") del i23d_worker i23d_worker = None _i23d_on_cpu = False @@ -312,14 +332,17 @@ def _ensure_i23d_worker(): - """Load shape model to GPU — from CPU RAM (fast) or disk (slow).""" + """Load shape model to GPU. + + In low_vram_mode always reload from disk (CPU-offload path is never used). + """ global i23d_worker, _i23d_on_cpu - if i23d_worker is not None and _i23d_on_cpu: + if not args.low_vram_mode and i23d_worker is not None and _i23d_on_cpu: logger.info("Restoring shape model from CPU to GPU (fast path)...") i23d_worker.to(args.device) _i23d_on_cpu = False elif i23d_worker is None: - logger.info("Reloading shape model from disk to GPU (slow path)...") + logger.info("Reloading shape model from disk to GPU...") gc.collect() _malloc_trim() torch.cuda.empty_cache() @@ -487,6 +510,27 @@ def generation_all( num_chunks=200000, randomize_seed: bool = False, ): + # NOTE(review): the [MEM] RSS/VRAM logging below is a temporary profiling aid + def _rss_mb(): + try: + with open('/proc/self/status') as _f: + for _l in _f: + if _l.startswith('VmRSS:'): + return int(_l.split()[1]) // 1024 + except Exception: + pass + return 0 + def _rlog(label): + vram = torch.cuda.memory_allocated() // (1024*1024) + logger.info(f"[MEM] {label:40s} RSS={_rss_mb():6d} MB VRAM={vram:5d} MB") + + # Proactively free any memory left over from previous generations so that + # fresh model loading starts from the lowest possible RSS baseline. 
+ gc.collect(2) + _malloc_trim() + torch.cuda.empty_cache() + _rlog("generation_all start") + start_time_0 = time.time() mesh, image, save_folder, stats, seed = _gen_shape( caption, @@ -503,18 +547,12 @@ def generation_all( num_chunks=num_chunks, randomize_seed=randomize_seed, ) + _rlog("after _gen_shape") path = export_mesh(mesh, save_folder, textured=False) - print(path) print('='*40) - # tmp_time = time.time() - # mesh = floater_remove_worker(mesh) - # mesh = degenerate_face_remove_worker(mesh) - # logger.info("---Postprocessing takes %s seconds ---" % (time.time() - tmp_time)) - # stats['time']['postprocessing'] = time.time() - tmp_time - tmp_time = time.time() mesh = face_reduce_worker(mesh) @@ -523,22 +561,25 @@ def generation_all( logger.info("---Face Reduction takes %s seconds ---" % (time.time() - tmp_time)) stats['time']['face reduction'] = time.time() - tmp_time + _rlog("after face reduction") tmp_time = time.time() text_path = os.path.join(save_folder, f'textured_mesh.obj') - # In low_vram_mode: adaptively offload shape model (CPU or delete based on - # available RAM), then load texture pipeline. + # In low_vram_mode: delete shape model then load texture pipeline. if args.low_vram_mode: _prepare_for_tex() + _rlog("after _prepare_for_tex (shape deleted, tex loaded)") path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False) + _rlog("after tex_pipeline inference") # Unload texture pipeline after use so VRAM is free for the next shape request. 
if args.low_vram_mode: _unload_tex_pipeline() - + _rlog("after _unload_tex_pipeline") + logger.info("---Texture Generation takes %s seconds ---" % (time.time() - tmp_time)) stats['time']['texture generation'] = time.time() - tmp_time @@ -555,6 +596,7 @@ def generation_all( width=HTML_WIDTH, textured=True) if args.low_vram_mode: torch.cuda.empty_cache() + _rlog("generation_all complete") return ( gr.update(value=path), gr.update(value=glb_path_textured), diff --git a/hy3dshape/hy3dshape/models/autoencoders/model.py b/hy3dshape/hy3dshape/models/autoencoders/model.py index 5497177..6c44704 100644 --- a/hy3dshape/hy3dshape/models/autoencoders/model.py +++ b/hy3dshape/hy3dshape/models/autoencoders/model.py @@ -149,7 +149,7 @@ class VectsetVAE(nn.Module): model_kwargs.update(kwargs) model = cls(**model_kwargs) - model.load_state_dict(ckpt) + model.load_state_dict(ckpt, assign=True) model.to(device=device, dtype=dtype) return model @@ -189,7 +189,7 @@ class VectsetVAE(nn.Module): if k.startswith(ik): print("Deleting key {} from state_dict.".format(k)) del state_dict[k] - missing, unexpected = self.load_state_dict(state_dict, strict=False) + missing, unexpected = self.load_state_dict(state_dict, strict=False, assign=True) print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys") if len(missing) > 0: print(f"Missing Keys: {missing}") diff --git a/hy3dshape/hy3dshape/pipelines.py b/hy3dshape/hy3dshape/pipelines.py index 57b36df..e622c48 100644 --- a/hy3dshape/hy3dshape/pipelines.py +++ b/hy3dshape/hy3dshape/pipelines.py @@ -166,14 +166,16 @@ class Hunyuan3DDiTPipeline: ckpt[model_name][new_key] = value else: ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True, mmap=True) - # load model + # load model — use assign=True so mmap fp16 tensors are assigned directly as + # parameters (no fp16→fp32 widening copy), keeping CPU anon-rss near zero. 
model = instantiate_from_config(config['model']) - model.load_state_dict(ckpt['model']) + model.load_state_dict(ckpt['model'], assign=True) vae = instantiate_from_config(config['vae']) - vae.load_state_dict(ckpt['vae'], strict=False) + vae.load_state_dict(ckpt['vae'], strict=False, assign=True) conditioner = instantiate_from_config(config['conditioner']) if 'conditioner' in ckpt: - conditioner.load_state_dict(ckpt['conditioner']) + conditioner.load_state_dict(ckpt['conditioner'], assign=True) + del ckpt # free mmap file-backed pages now that params hold their own refs image_processor = instantiate_from_config(config['image_processor']) scheduler = instantiate_from_config(config['scheduler'])