diff --git a/gradio_app.py b/gradio_app.py
index c111320..3b767b4 100644
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -49,6 +49,11 @@ import numpy as np
 from hy3dshape.utils import logger
 from hy3dpaint.convert_utils import create_glb_with_pbr_materials
 
+# Globals for lazy load/unload; None means "not currently loaded".
+i23d_worker = None
+tex_pipeline = None
+tex_conf = None
+
 
 MAX_SEED = 1e7
 ENV = "Local" # "Huggingface"
@@ -220,36 +225,45 @@ height="{height}" width="100%" frameborder="0"></iframe>'
 # on next request — no CPU intermediate, VRAM freed immediately.
 # ---------------------------------------------------------------------------
 
-def _unload_i23d_worker():
-    """Delete shape model from GPU and free VRAM."""
+# ---------------------------------------------------------------------------
+# VRAM management helpers (used when --low_vram_mode is set)
+#
+# Strategy:
+#   i23d (shape model, ~7.25GB VRAM):
+#     → kept in system RAM between requests via .to('cpu')/.to(args.device)
+#     → no disk re-read, no OOM on reload, fast GPU↔CPU switch
+#
+#   tex_pipeline (texture model, ~6.59GB VRAM):
+#     → fully deleted after each use (no CPU copy kept)
+#     → reloaded from cached weights on next texture request
+#     → tex config/weights are HF-cached so reload is fast (~20s)
+#
+# Intended invariant: the two models never occupy VRAM at the same time.
+# ---------------------------------------------------------------------------
+
+def _offload_i23d_to_cpu():
+    """Move the shape model to CPU RAM and release cached VRAM; no-op if not loaded."""
     global i23d_worker
-    del i23d_worker
-    i23d_worker = None
-    gc.collect()
-    torch.cuda.empty_cache()
+    if i23d_worker is not None:
+        i23d_worker.to('cpu')
+        torch.cuda.empty_cache()
 
 
-def _ensure_i23d_worker():
-    """Reload shape model to GPU if it was previously unloaded."""
+def _restore_i23d_to_gpu():
+    """Move the shape model back to args.device; no-op if it is not loaded."""
     global i23d_worker
-    if i23d_worker is None:
-        from hy3dshape import Hunyuan3DDiTFlowMatchingPipeline
-        logger.info("Reloading shape model to GPU...")
-        i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
-            args.model_path,
-            subfolder=args.subfolder,
-            use_safetensors=False,
-            device=args.device,
-        )
+    if i23d_worker is not None:
+        i23d_worker.to(args.device)
 
 
 def _unload_tex_pipeline():
-    """Delete texture pipeline from GPU and free VRAM."""
+    """Delete texture pipeline entirely, freeing its VRAM."""
     global tex_pipeline
-    del tex_pipeline
-    tex_pipeline = None
-    gc.collect()
-    torch.cuda.empty_cache()
+    if tex_pipeline is not None:
+        del tex_pipeline
+        tex_pipeline = None
+        gc.collect()
+        torch.cuda.empty_cache()
 
 
 def _ensure_tex_pipeline():
@@ -346,7 +360,7 @@ def _gen_shape(
     start_time = time.time()
 
     if args.low_vram_mode:
-        _ensure_i23d_worker()
+        _restore_i23d_to_gpu()
 
     generator = torch.Generator()
     generator = generator.manual_seed(int(seed))
@@ -430,10 +444,10 @@ def generation_all(
 
     text_path = os.path.join(save_folder, f'textured_mesh.obj')
 
-    # In low_vram_mode: unload shape model entirely (del, no CPU copy) to free VRAM,
-    # then load texture pipeline on demand. Shape model reloads lazily on next request.
+    # In low_vram_mode: move shape model to CPU RAM (keeps weights in RAM, no disk re-read),
+    # then load texture pipeline on demand. Shape model stays in RAM to avoid OOM on reload.
     if args.low_vram_mode:
-        _unload_i23d_worker()
+        _offload_i23d_to_cpu()
         _ensure_tex_pipeline()
 
     path_textured = tex_pipeline(mesh_path=path, image_path=image, output_mesh_path=text_path, save_glb=False)