diff --git a/batch_generate.py b/batch_generate.py
new file mode 100644
index 0000000..a743d50
--- /dev/null
+++ b/batch_generate.py
@@ -0,0 +1,257 @@
+"""
+Batch 3D model generation script for Hunyuan3D-2.1.
+Optimized for RTX 3080 (20GB VRAM) by sequential model loading:
+  Phase 1: Load shape model → generate all meshes → unload
+  Phase 2: Load texture model → texture all meshes → unload
+"""
+
+import sys
+sys.path.insert(0, './hy3dshape')  # relative to the current working directory
+sys.path.insert(0, './hy3dpaint')  # inserted at index 0 last, so this entry is searched first
+
+try:  # best-effort: a missing or failing torchvision_fix only produces the warning below
+    from torchvision_fix import apply_fix
+    apply_fix()
+except Exception as e:
+    print(f"Warning: torchvision fix: {e}")
+
+import os
+import gc
+import time
+import glob
+import torch
+import trimesh
+import numpy as np
+from PIL import Image
+from pathlib import Path
+
+INPUT_DIR = "test/images"  # folder main() scans for input images
+OUTPUT_DIR = "test/models"  # root folder; phase 1 creates one subfolder per image stem
+MODEL_PATH = "tencent/Hunyuan3D-2.1"  # first argument of Hunyuan3DDiTFlowMatchingPipeline.from_pretrained
+SUBFOLDER = "hunyuan3d-dit-v2-1"  # passed as subfolder= to from_pretrained
+
+SHAPE_STEPS = 50  # num_inference_steps for the shape pipeline
+GUIDANCE_SCALE = 7.5  # guidance_scale for the shape pipeline
+SEED = 1234  # seeds a fresh torch.Generator for every image
+OCTREE_RESOLUTION = 256  # octree_resolution for the shape pipeline
+NUM_CHUNKS = 200000  # num_chunks for the shape pipeline
+
+TEXGEN_MAX_VIEWS = 6  # first positional argument of Hunyuan3DPaintConfig
+TEXGEN_RESOLUTION = 512  # second positional argument of Hunyuan3DPaintConfig
+
+
+def clear_gpu():
+    """Aggressively free GPU memory: run the garbage collector, then empty PyTorch's CUDA cache."""
+    gc.collect()  # collect unreachable Python objects before touching the CUDA cache
+    if torch.cuda.is_available():  # the CUDA calls are skipped entirely on machines without a GPU
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()  # block until outstanding CUDA work has finished
+
+
+def get_image_files(input_dir):
+    """Return sorted paths of supported images in ``input_dir`` (extension match is case-insensitive)."""
+    extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.webp'}
+    # listdir instead of glob: matches mixed-case suffixes (".Jpg") and is immune to glob metacharacters in the path.
+    names = os.listdir(input_dir or os.curdir) if os.path.isdir(input_dir or os.curdir) else []
+    return sorted(
+        os.path.join(input_dir, name) for name in names
+        if not name.startswith('.') and os.path.splitext(name)[1].lower() in extensions)
+
+
+def phase1_shape_generation(image_files, output_dir):
+    """Phase 1: Load shape model, generate all meshes, unload.
+
+    Each image gets a folder ``output_dir/<image stem>`` holding ``white_mesh.obj``
+    and ``input.png``; an existing ``white_mesh.obj`` is reused. Returns a dict of
+    stem -> {"image", "mesh", "dir"}; on failure "mesh" is None and "error" is set.
+    """
+    print("\n" + "=" * 60)
+    print("PHASE 1: Shape Generation")
+    print("=" * 60)
+
+    from hy3dshape.rembg import BackgroundRemover
+    from hy3dshape.pipelines import Hunyuan3DDiTFlowMatchingPipeline
+    from hy3dshape.pipelines import export_to_trimesh
+    from hy3dshape import FaceReducer
+
+    print("Loading shape model...")
+    t0 = time.time()
+    rmbg = BackgroundRemover()
+    pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
+        MODEL_PATH, subfolder=SUBFOLDER, use_safetensors=False, device='cuda'
+    )
+    face_reducer = FaceReducer()
+    print(f"Shape model loaded in {time.time()-t0:.1f}s")
+    print(f"GPU memory: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
+
+    results = {}
+    total = len(image_files)
+    for i, img_path in enumerate(image_files):
+        name = Path(img_path).stem
+        item_dir = os.path.join(output_dir, name)
+        os.makedirs(item_dir, exist_ok=True)
+        mesh_path = os.path.join(item_dir, "white_mesh.obj")
+        if os.path.exists(mesh_path):
+            print(f"[{i+1}/{total}] {name}: shape exists, skipping")
+            results[name] = {"image": img_path, "mesh": mesh_path, "dir": item_dir}
+            continue
+
+        print(f"\n[{i+1}/{total}] Generating shape for: {name}")
+        t1 = time.time()
+        try:
+            # The mode is always RGBA after convert(), so opacity is judged from the
+            # alpha channel alone: a minimum alpha above 250 means no cut-out exists yet.
+            image = Image.open(img_path).convert("RGBA")
+            if image.getchannel("A").getextrema()[0] > 250:
+                image = rmbg(image.convert("RGB"))
+            generator = torch.Generator().manual_seed(SEED)
+            outputs = pipeline(
+                image=image,
+                num_inference_steps=SHAPE_STEPS,
+                guidance_scale=GUIDANCE_SCALE,
+                generator=generator,
+                octree_resolution=OCTREE_RESOLUTION,
+                num_chunks=NUM_CHUNKS,
+                output_type='mesh',
+            )
+            # Face reduction for texture gen compatibility
+            mesh = face_reducer(export_to_trimesh(outputs)[0])
+            # Write input.png first and the mesh last: the mesh file is the "shape
+            # exists" marker tested above, so it must appear only once the item is complete.
+            image.save(os.path.join(item_dir, "input.png"))
+            mesh.export(mesh_path, include_normals=False)
+
+            results[name] = {"image": img_path, "mesh": mesh_path, "dir": item_dir}
+            print(f"  Done in {time.time()-t1:.1f}s | faces: {mesh.faces.shape[0]}")
+        except Exception as e:
+            print(f"  ERROR: {e}")
+            import traceback; traceback.print_exc()
+            results[name] = {"image": img_path, "mesh": None, "dir": item_dir, "error": str(e)}
+        clear_gpu()
+
+    # Unload shape model
+    print("\nUnloading shape model...")
+    del pipeline, rmbg, face_reducer
+    clear_gpu()
+    print(f"GPU memory after unload: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
+    return results
+
+
+def phase2_texture_generation(results, output_dir):
+    """Phase 2: Load texture model, texture all meshes, unload.
+
+    Entries of ``results`` (the phase 1 mapping) with a falsy "mesh" are skipped; the rest
+    gain "textured_obj"/"textured_glb" or "tex_error". Returns ``results``; ``output_dir`` is unused.
+    """
+    print("\n" + "=" * 60)
+    print("PHASE 2: Texture Generation")
+    print("=" * 60)
+
+    meshes_to_texture = {k: v for k, v in results.items() if v.get("mesh")}
+    if not meshes_to_texture:
+        print("No meshes to texture!")
+        return results
+
+    from hy3dpaint.textureGenPipeline import Hunyuan3DPaintPipeline, Hunyuan3DPaintConfig
+    from hy3dpaint.convert_utils import create_glb_with_pbr_materials
+
+    print("Loading texture model...")
+    t0 = time.time()
+    conf = Hunyuan3DPaintConfig(TEXGEN_MAX_VIEWS, TEXGEN_RESOLUTION)
+    conf.realesrgan_ckpt_path = "hy3dpaint/ckpt/RealESRGAN_x4plus.pth"
+    conf.multiview_cfg_path = "hy3dpaint/cfgs/hunyuan-paint-pbr.yaml"
+    conf.custom_pipeline = "hy3dpaint/hunyuanpaintpbr"
+    tex_pipeline = Hunyuan3DPaintPipeline(conf)
+    print(f"Texture model loaded in {time.time()-t0:.1f}s")
+    print(f"GPU memory: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
+
+    for i, (name, info) in enumerate(meshes_to_texture.items()):
+        item_dir = info["dir"]
+        mesh_path = info["mesh"]
+        # Phase 1 saves the background-removed image as input.png for texturing; fall back to the original if absent.
+        prepared = os.path.join(item_dir, "input.png")
+        img_path = prepared if os.path.exists(prepared) else info["image"]
+        textured_obj = os.path.join(item_dir, "textured_mesh.obj")
+        textured_glb = os.path.join(item_dir, "textured_mesh.glb")
+        if os.path.exists(textured_glb):
+            print(f"[{i+1}/{len(meshes_to_texture)}] {name}: textured mesh exists, skipping")
+            results[name]["textured_glb"] = textured_glb
+            continue
+
+        print(f"\n[{i+1}/{len(meshes_to_texture)}] Texturing: {name}")
+        t1 = time.time()
+        try:
+            output_path = tex_pipeline(
+                mesh_path=mesh_path,
+                image_path=img_path,
+                output_mesh_path=textured_obj,
+                save_glb=False,
+            )
+            # Swap only the final extension: str.replace('.obj', ...) would also rewrite ".obj" inside a folder or item name.
+            base = os.path.splitext(output_path)[0]
+            textures = {
+                'albedo': base + '.jpg',
+                'metallic': base + '_metallic.jpg',
+                'roughness': base + '_roughness.jpg',
+            }
+            create_glb_with_pbr_materials(output_path, textures, textured_glb)
+            results[name]["textured_obj"] = output_path
+            results[name]["textured_glb"] = textured_glb
+            print(f"  Done in {time.time()-t1:.1f}s")
+            print(f"  Output: {textured_glb}")
+        except Exception as e:
+            print(f"  ERROR: {e}")
+            import traceback; traceback.print_exc()
+            results[name]["tex_error"] = str(e)
+        clear_gpu()
+
+    # Unload texture model
+    print("\nUnloading texture model...")
+    del tex_pipeline
+    clear_gpu()
+    print(f"GPU memory after unload: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
+    return results
+
+
+def main():
+    """Run shape generation and then texturing for every image in INPUT_DIR, and print a summary."""
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    image_files = get_image_files(INPUT_DIR)
+    if not image_files:
+        print(f"No images found in {INPUT_DIR}")
+        return
+
+    print(f"Found {len(image_files)} images:")
+    for image_file in image_files:
+        print(f"  - {os.path.basename(image_file)}")
+
+    started_at = time.time()
+    # Phase 1 unloads the shape model before phase 2 loads the texture model.
+    shapes = phase1_shape_generation(image_files, OUTPUT_DIR)
+    results = phase2_texture_generation(shapes, OUTPUT_DIR)
+
+    banner = "=" * 60
+    print("\n" + banner)
+    print("SUMMARY")
+    print(banner)
+    total_time = time.time() - started_at
+    success = sum(bool(v.get("textured_glb")) for v in results.values())
+    shape_only = sum(bool(v.get("mesh") and not v.get("textured_glb")) for v in results.values())
+    failed = sum(not v.get("mesh") for v in results.values())
+
+    for name, info in results.items():
+        if info.get("textured_glb"):
+            status = "✓ textured"
+        elif info.get("mesh"):
+            status = "△ shape only"
+        else:
+            status = "✗ failed"
+        print(f"  {name}: {status}")
+
+    print(f"\nTotal: {len(results)} | Success: {success} | Shape only: {shape_only} | Failed: {failed}")
+    print(f"Total time: {total_time:.1f}s ({total_time/60:.1f}m)")
+    print(f"Output directory: {os.path.abspath(OUTPUT_DIR)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hy3dpaint/DifferentiableRenderer/mesh_utils.py b/hy3dpaint/DifferentiableRenderer/mesh_utils.py
index 5b3e831..359a40d 100644
--- a/hy3dpaint/DifferentiableRenderer/mesh_utils.py
+++ b/hy3dpaint/DifferentiableRenderer/mesh_utils.py
@@ -14,7 +14,6 @@
 
 import os
 import cv2
-import bpy
 import math
 import numpy as np
 from io import StringIO
@@ -197,8 +196,15 @@ def save_mesh(mesh_path, vtx_pos, pos_idx, vtx_uv, uv_idx, texture, metallic=Non
     )
 
 
+def _get_bpy():
+    """Import Blender's ``bpy`` module at call time and return it."""
+    import bpy as blender_module
+    return blender_module
+
+
 def _setup_blender_scene():
     """Setup Blender scene for conversion."""
+    bpy = _get_bpy()
     if "convert" not in bpy.data.scenes:
         bpy.data.scenes.new("convert")
     bpy.context.window.scene = bpy.data.scenes["convert"]
@@ -206,6 +212,7 @@ def _setup_blender_scene():
 
 def _clear_scene_objects():
     """Clear all objects from current Blender scene."""
+    bpy = _get_bpy()
     for obj in bpy.context.scene.objects:
         obj.select_set(True)
         bpy.data.objects.remove(obj, do_unlink=True)
@@ -213,6 +220,7 @@ def _clear_scene_objects():
 
 def _select_mesh_objects():
     """Select all mesh objects in scene."""
+    bpy = _get_bpy()
     bpy.ops.object.select_all(action="DESELECT")
     for obj in bpy.context.scene.objects:
         if obj.type == "MESH":
@@ -224,6 +232,7 @@ def _merge_vertices_if_needed(merge_vertices: bool):
     if not merge_vertices:
         return
 
+    bpy = _get_bpy()
     for obj in bpy.context.selected_objects:
         if obj.type == "MESH":
             bpy.context.view_layer.objects.active = obj
@@ -235,6 +244,7 @@ def _merge_vertices_if_needed(merge_vertices: bool):
 
 def _apply_shading(shade_type: str, auto_smooth_angle: float):
     """Apply shading to selected objects."""
+    bpy = _get_bpy()
     shading_ops = {
         "SMOOTH": lambda: bpy.ops.object.shade_smooth(),
         "FLAT": lambda: bpy.ops.object.shade_flat(),
@@ -247,6 +257,7 @@ def _apply_shading(shade_type: str, auto_smooth_angle: float):
 
 def _apply_auto_smooth(auto_smooth_angle: float):
     """Apply auto smooth based on Blender version."""
+    bpy = _get_bpy()
     angle_rad = math.radians(auto_smooth_angle)
 
     if bpy.app.version < (4, 1, 0):
@@ -266,6 +277,7 @@ def convert_obj_to_glb(
 ) -> bool:
     """Convert OBJ file to GLB format using Blender."""
     try:
+        bpy = _get_bpy()
         _setup_blender_scene()
         _clear_scene_objects()
 
diff --git a/hy3dpaint/custom_rasterizer/setup.py b/hy3dpaint/custom_rasterizer/setup.py
index 15192e9..780f57b 100644
--- a/hy3dpaint/custom_rasterizer/setup.py
+++ b/hy3dpaint/custom_rasterizer/setup.py
@@ -13,11 +13,18 @@
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
 from setuptools import setup, find_packages
+import os
 import torch
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
 
 # build custom rasterizer
 
+# CUDA include path: prefer conda env CUDA headers to match torch's CUDA version (assumes conda uses the targets/ layout — TODO confirm)
+_cuda_home = os.environ.get("CUDA_HOME", os.environ.get("CUDA_PATH", "/usr/local/cuda"))  # CUDA_HOME wins, then CUDA_PATH, then the default
+_cuda_include = os.path.join(_cuda_home, "targets", "x86_64-linux", "include")  # path is specific to x86_64 Linux
+if not os.path.isdir(_cuda_include):
+    _cuda_include = os.path.join(_cuda_home, "include")  # fallback: <toolkit root>/include
+
 custom_rasterizer_module = CUDAExtension(
     "custom_rasterizer_kernel",
     [
@@ -25,6 +32,13 @@ custom_rasterizer_module = CUDAExtension(
         "lib/custom_rasterizer_kernel/grid_neighbor.cpp",
         "lib/custom_rasterizer_kernel/rasterizer_gpu.cu",
     ],
+    include_dirs=[_cuda_include],
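+    # NOTE: both flag lists below are empty, so no extra defines are passed to nvcc or the host compiler.
+    # If glibc 2.38+ declarations of sinpi/cospi/etc conflict with CUDA 12.8 crt/math_functions.h, -D__GLIBC_USE_IEC_60559_FUNCS_EXT_C23=0 is the define intended for these lists.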
+    extra_compile_args={
+        "nvcc": [],
+        "cxx": [],
+    },
 )
 
 setup(
diff --git a/hy3dpaint/textureGenPipeline.py b/hy3dpaint/textureGenPipeline.py
index a582892..2dedbcf 100644
--- a/hy3dpaint/textureGenPipeline.py
+++ b/hy3dpaint/textureGenPipeline.py
@@ -25,7 +25,27 @@ from utils.multiview_utils import multiviewDiffusionNet
 from utils.pipeline_utils import ViewProcessor
 from utils.image_super_utils import imageSuperNet
 from utils.uvwrap_utils import mesh_uv_wrap
-from DifferentiableRenderer.mesh_utils import convert_obj_to_glb
+try:
+    from DifferentiableRenderer.mesh_utils import convert_obj_to_glb
+except Exception as e:
+    print(f"Warning: Could not import convert_obj_to_glb from DifferentiableRenderer.mesh_utils: {e}")
+
+    def convert_obj_to_glb(src_path, dst_path, *args, **kwargs):
+        """Best-effort OBJ -> GLB conversion via trimesh for when the Blender-based converter cannot be imported.
+
+        Extra arguments are accepted and ignored so calls written for the real converter still work.
+        Returns True on success and False on failure. No placeholder is written on failure: an empty
+        .glb is not a loadable asset and would make "file exists" checks report a conversion that never happened.
+        """
+        try:
+            import trimesh
+            trimesh.load(src_path).export(dst_path)
+        except Exception as ex:
+            print(f"Fallback convert failed: {ex}")
+            return False
+        print(f"Fallback convert_obj_to_glb: exported {dst_path} using trimesh")
+        return True
+
 import warnings
 
 warnings.filterwarnings("ignore")