From 7facf4b45931636b72da4a51de61898393f1608d Mon Sep 17 00:00:00 2001
From: qinmaohui <820102443@qq.com>
Date: Thu, 10 Jul 2025 17:22:47 +0800
Subject: [PATCH 1/4] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=9C=A8windows=E7=B3=BB?=
=?UTF-8?q?=E7=BB=9F=E4=B8=AD=E5=AE=89=E8=A3=85custom=5Frastorizer?=
=?UTF-8?q?=E6=8A=A5=E9=94=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.idea/.gitignore | 8 ++++
.idea/Hunyuan3D-2.1.iml | 12 +++++
.../inspectionProfiles/profiles_settings.xml | 6 +++
.idea/misc.xml | 4 ++
.idea/modules.xml | 8 ++++
.idea/vcs.xml | 6 +++
.../grid_neighbor.cpp | 44 +++++++++----------
.../custom_rasterizer_kernel/rasterizer.cpp | 22 +++++-----
.../lib/custom_rasterizer_kernel/rasterizer.h | 3 +-
.../rasterizer_gpu.cu | 22 +++++-----
10 files changed, 89 insertions(+), 46 deletions(-)
create mode 100644 .idea/.gitignore
create mode 100644 .idea/Hunyuan3D-2.1.iml
create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
create mode 100644 .idea/misc.xml
create mode 100644 .idea/modules.xml
create mode 100644 .idea/vcs.xml
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..35410ca
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# 默认忽略的文件
+/shelf/
+/workspace.xml
+# 基于编辑器的 HTTP 客户端请求
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/Hunyuan3D-2.1.iml b/.idea/Hunyuan3D-2.1.iml
new file mode 100644
index 0000000..41f56a8
--- /dev/null
+++ b/.idea/Hunyuan3D-2.1.iml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..cfe85d6
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..23b18b0
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp
index f02bcba..cbe21e1 100644
--- a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp
+++ b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp
@@ -312,7 +312,7 @@ std::vector> build_hierarchy(std::vector view_layer_normals, int num_level, int resolution)
{
if (view_layer_positions.size() != 3 || num_level < 1) {
- printf("Alert! We require 3 layers and at least 1 level! (%d %d)\n", view_layer_positions.size(), num_level);
+ printf("Alert! We require 3 layers and at least 1 level! (%zu %d)\n", view_layer_positions.size(), num_level);
return {{},{},{},{}};
}
@@ -394,8 +394,8 @@ std::vector> build_hierarchy(std::vector grid_evencorners(grids.size());
std::vector grid_oddcorners(grids.size());
- texture_positions[0] = torch::zeros({seq2pos.size() / 3, 3}, float_options);
- texture_positions[1] = torch::zeros({seq2pos.size() / 3}, float_options);
+ texture_positions[0] = torch::zeros({static_cast<int64_t>(seq2pos.size() / 3), 3}, float_options);
+ texture_positions[1] = torch::zeros({static_cast<int64_t>(seq2pos.size() / 3)}, float_options);
float* positions_out_ptr = texture_positions[0].data_ptr();
memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size());
positions_out_ptr = texture_positions[1].data_ptr();
@@ -404,25 +404,24 @@ std::vector> build_hierarchy(std::vector();
+ grid_neighbors[i] = torch::zeros({static_cast<int64_t>(grids[i].seq2grid.size()), 9}, int64_options);
+ int64_t* nptr = grid_neighbors[i].data_ptr<int64_t>();
for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) {
nptr[j] = grids[i].seq2neighbor[j];
}
- grid_evencorners[i] = torch::zeros({grids[i].seq2evencorner.size()}, int64_options);
- grid_oddcorners[i] = torch::zeros({grids[i].seq2oddcorner.size()}, int64_options);
- long* dptr = grid_evencorners[i].data_ptr();
+ grid_evencorners[i] = torch::zeros({static_cast<int64_t>(grids[i].seq2evencorner.size())}, int64_options);
+ int64_t* dptr = grid_evencorners[i].data_ptr<int64_t>();
for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) {
dptr[j] = grids[i].seq2evencorner[j];
}
- dptr = grid_oddcorners[i].data_ptr();
+ dptr = grid_oddcorners[i].data_ptr<int64_t>();
for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) {
dptr[j] = grids[i].seq2oddcorner[j];
}
if (i + 1 < grids.size()) {
- grid_downsamples[i] = torch::zeros({grids[i].downsample_seq.size()}, int64_options);
- long* dptr = grid_downsamples[i].data_ptr();
+ grid_downsamples[i] = torch::zeros({static_cast<int64_t>(grids[i].downsample_seq.size())}, int64_options);
+ int64_t* dptr = grid_downsamples[i].data_ptr<int64_t>();
for (int j = 0; j < grids[i].downsample_seq.size(); ++j) {
dptr[j] = grids[i].downsample_seq[j];
}
@@ -439,7 +438,7 @@ std::vector> build_hierarchy_with_feat(
int num_level, int resolution)
{
if (view_layer_positions.size() != 3 || num_level < 1) {
- printf("Alert! We require 3 layers and at least 1 level! (%d %d)\n", view_layer_positions.size(), num_level);
+ printf("Alert! We require 3 layers and at least 1 level! (%zu %d)\n", view_layer_positions.size(), num_level);
return {{},{},{},{}};
}
@@ -534,9 +533,9 @@ std::vector> build_hierarchy_with_feat(
std::vector grid_evencorners(grids.size());
std::vector grid_oddcorners(grids.size());
- texture_positions[0] = torch::zeros({seq2pos.size() / 3, 3}, float_options);
- texture_positions[1] = torch::zeros({seq2pos.size() / 3}, float_options);
- texture_feats[0] = torch::zeros({seq2feat.size() / feat_channel, feat_channel}, float_options);
+ texture_positions[0] = torch::zeros({static_cast<int64_t>(seq2pos.size() / 3), 3}, float_options);
+ texture_positions[1] = torch::zeros({static_cast<int64_t>(seq2pos.size() / 3)}, float_options);
+ texture_feats[0] = torch::zeros({static_cast<int64_t>(seq2feat.size() / feat_channel), static_cast<int64_t>(feat_channel)}, float_options);
float* positions_out_ptr = texture_positions[0].data_ptr();
memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size());
positions_out_ptr = texture_positions[1].data_ptr();
@@ -547,24 +546,23 @@ std::vector> build_hierarchy_with_feat(
memcpy(feats_out_ptr, seq2feat.data(), sizeof(float) * seq2feat.size());
for (int i = 0; i < grids.size(); ++i) {
- grid_neighbors[i] = torch::zeros({grids[i].seq2grid.size(), 9}, int64_options);
- long* nptr = grid_neighbors[i].data_ptr();
+ grid_neighbors[i] = torch::zeros({static_cast<int64_t>(grids[i].seq2grid.size()), 9}, int64_options);
+ int64_t* nptr = grid_neighbors[i].data_ptr<int64_t>();
for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) {
nptr[j] = grids[i].seq2neighbor[j];
}
- grid_evencorners[i] = torch::zeros({grids[i].seq2evencorner.size()}, int64_options);
- grid_oddcorners[i] = torch::zeros({grids[i].seq2oddcorner.size()}, int64_options);
- long* dptr = grid_evencorners[i].data_ptr();
+ grid_evencorners[i] = torch::zeros({static_cast<int64_t>(grids[i].seq2evencorner.size())}, int64_options);
+ int64_t* dptr = grid_evencorners[i].data_ptr<int64_t>();
for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) {
dptr[j] = grids[i].seq2evencorner[j];
}
- dptr = grid_oddcorners[i].data_ptr();
+ dptr = grid_oddcorners[i].data_ptr<int64_t>();
for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) {
dptr[j] = grids[i].seq2oddcorner[j];
}
if (i + 1 < grids.size()) {
- grid_downsamples[i] = torch::zeros({grids[i].downsample_seq.size()}, int64_options);
- long* dptr = grid_downsamples[i].data_ptr();
+ grid_downsamples[i] = torch::zeros({static_cast<int64_t>(grids[i].downsample_seq.size())}, int64_options);
+ int64_t* dptr = grid_downsamples[i].data_ptr<int64_t>();
for (int j = 0; j < grids[i].downsample_seq.size(); ++j) {
dptr[j] = grids[i].downsample_seq[j];
}
diff --git a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp
index b3ff69f..c02f1f9 100644
--- a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp
+++ b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp
@@ -1,6 +1,6 @@
#include "rasterizer.h"
-void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) {
+void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, int64_t* zbuffer, float* d, float occlusion_truncation) {
float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0]));
float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0]));
float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1]));
@@ -18,7 +18,7 @@ void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width
if (isBarycentricCoordInBounds(baryCentricCoordinate)) {
int pixel = py * width + px;
if (zbuffer == 0) {
- zbuffer[pixel] = (INT64)(idx + 1);
+ zbuffer[pixel] = (int64_t)(idx + 1);
continue;
}
@@ -29,7 +29,7 @@ void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width
}
int z_quantize = depth * (2<<17);
- INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1);
+ int64_t token = (int64_t)z_quantize * MAXINT + (int64_t)(idx + 1);
if (depth < depth_thres)
continue;
zbuffer[pixel] = std::min(zbuffer[pixel], token);
@@ -38,10 +38,10 @@ void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width
}
}
-void barycentricFromImgcoordCPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces,
+void barycentricFromImgcoordCPU(float* V, int* F, int* findices, int64_t* zbuffer, int width, int height, int num_vertices, int num_faces,
float* barycentric_map, int pix)
{
- INT64 f = zbuffer[pix] % MAXINT;
+ int64_t f = zbuffer[pix] % MAXINT;
if (f == (MAXINT-1)) {
findices[pix] = 0;
barycentric_map[pix * 3] = 0;
@@ -78,7 +78,7 @@ void barycentricFromImgcoordCPU(float* V, int* F, int* findices, INT64* zbuffer,
barycentric_map[pix * 3 + 2] = barycentric[2];
}
-void rasterizeImagecoordsKernelCPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces, int f)
+void rasterizeImagecoordsKernelCPU(float* V, int* F, float* d, int64_t* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces, int f)
{
float* vt0_ptr = V + (F[f * 3] * 4);
float* vt1_ptr = V + (F[f * 3 + 1] * 4);
@@ -99,25 +99,25 @@ std::vector rasterize_image_cpu(torch::Tensor V, torch::Tensor F,
auto options = torch::TensorOptions().dtype(torch::kInt32).requires_grad(false);
auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false);
auto findices = torch::zeros({height, width}, options);
- INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1);
- auto z_min = torch::ones({height, width}, INT64_options) * (long)maxint;
+ int64_t maxint = (int64_t)MAXINT * (int64_t)MAXINT + (MAXINT - 1);
+ auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint;
if (!use_depth_prior) {
for (int i = 0; i < num_faces; ++i) {
rasterizeImagecoordsKernelCPU(V.data_ptr(), F.data_ptr(), 0,
- (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i);
+ (int64_t*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i);
}
} else {
for (int i = 0; i < num_faces; ++i)
rasterizeImagecoordsKernelCPU(V.data_ptr(), F.data_ptr(), D.data_ptr(),
- (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i);
+ (int64_t*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i);
}
auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false);
auto barycentric = torch::zeros({height, width, 3}, float_options);
for (int i = 0; i < width * height; ++i)
barycentricFromImgcoordCPU(V.data_ptr(), F.data_ptr(),
- findices.data_ptr(), (INT64*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr(), i);
+ findices.data_ptr(), (int64_t*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr(), i);
return {findices, barycentric};
}
diff --git a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h
index cf4f987..8e198b1 100644
--- a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h
+++ b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h
@@ -5,8 +5,9 @@
#include
#include
#include // For CUDA context
+#include <cstdint>
-#define INT64 unsigned long long
+#define INT64 int64_t
#define MAXINT 2147483647
__host__ __device__ inline float calculateSignedArea2(float* a, float* b, float* c) {
diff --git a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu
index 709c1b8..ba6ab91 100644
--- a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu
+++ b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu
@@ -1,6 +1,6 @@
#include "rasterizer.h"
-__device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) {
+__device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, uint64_t* zbuffer, float* d, float occlusion_truncation) {
float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0]));
float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0]));
float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1]));
@@ -18,7 +18,7 @@ __device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2
if (isBarycentricCoordInBounds(baryCentricCoordinate)) {
int pixel = py * width + px;
if (zbuffer == 0) {
- atomicExch(&zbuffer[pixel], (INT64)(idx + 1));
+ atomicExch(&zbuffer[pixel], (uint64_t)(idx + 1));
continue;
}
float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + baryCentricCoordinate[2] * vt2[2];
@@ -28,7 +28,7 @@ __device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2
}
int z_quantize = depth * (2<<17);
- INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1);
+ uint64_t token = (uint64_t)z_quantize * MAXINT + (uint64_t)(idx + 1);
if (depth < depth_thres)
continue;
atomicMin(&zbuffer[pixel], token);
@@ -37,13 +37,13 @@ __device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2
}
}
-__global__ void barycentricFromImgcoordGPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces,
+__global__ void barycentricFromImgcoordGPU(float* V, int* F, int* findices, uint64_t* zbuffer, int width, int height, int num_vertices, int num_faces,
float* barycentric_map)
{
int pix = blockIdx.x * blockDim.x + threadIdx.x;
if (pix >= width * height)
return;
- INT64 f = zbuffer[pix] % MAXINT;
+ uint64_t f = zbuffer[pix] % MAXINT;
if (f == (MAXINT-1)) {
findices[pix] = 0;
barycentric_map[pix * 3] = 0;
@@ -80,7 +80,7 @@ __global__ void barycentricFromImgcoordGPU(float* V, int* F, int* findices, INT6
barycentric_map[pix * 3 + 2] = barycentric[2];
}
-__global__ void rasterizeImagecoordsKernelGPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces)
+__global__ void rasterizeImagecoordsKernelGPU(float* V, int* F, float* d, uint64_t* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces)
{
int f = blockIdx.x * blockDim.x + threadIdx.x;
if (f >= num_faces)
@@ -107,21 +107,21 @@ std::vector rasterize_image_gpu(torch::Tensor V, torch::Tensor F,
auto options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA, device_id).requires_grad(false);
auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA, device_id).requires_grad(false);
auto findices = torch::zeros({height, width}, options);
- INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1);
- auto z_min = torch::ones({height, width}, INT64_options) * (long)maxint;
+ uint64_t maxint = (uint64_t)MAXINT * (uint64_t)MAXINT + (MAXINT - 1);
+ auto z_min = torch::ones({height, width}, INT64_options) * (uint64_t)maxint;
if (!use_depth_prior) {
rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), 0,
- (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces);
+ (uint64_t*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces);
} else {
rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), D.data_ptr(),
- (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces);
+ (uint64_t*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces);
}
auto float_options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA, device_id).requires_grad(false);
auto barycentric = torch::zeros({height, width, 3}, float_options);
barycentricFromImgcoordGPU<<<(width * height + 255)/256, 256>>>(V.data_ptr(), F.data_ptr(),
- findices.data_ptr(), (INT64*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr());
+ findices.data_ptr(), (uint64_t*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr());
return {findices, barycentric};
}
From 48a2a2bab9fc0690263e77deec3830ce8e0dfd33 Mon Sep 17 00:00:00 2001
From: qinmaohui <820102443@qq.com>
Date: Thu, 10 Jul 2025 17:29:07 +0800
Subject: [PATCH 2/4] =?UTF-8?q?=E5=88=A0=E9=99=A4=E5=A4=9A=E4=BD=99?=
=?UTF-8?q?=E6=96=87=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.idea/.gitignore | 8 --------
.idea/Hunyuan3D-2.1.iml | 12 ------------
.idea/inspectionProfiles/profiles_settings.xml | 6 ------
.idea/misc.xml | 4 ----
.idea/modules.xml | 8 --------
.idea/vcs.xml | 6 ------
6 files changed, 44 deletions(-)
delete mode 100644 .idea/.gitignore
delete mode 100644 .idea/Hunyuan3D-2.1.iml
delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml
delete mode 100644 .idea/misc.xml
delete mode 100644 .idea/modules.xml
delete mode 100644 .idea/vcs.xml
diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index 35410ca..0000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,8 +0,0 @@
-# 默认忽略的文件
-/shelf/
-/workspace.xml
-# 基于编辑器的 HTTP 客户端请求
-/httpRequests/
-# Datasource local storage ignored files
-/dataSources/
-/dataSources.local.xml
diff --git a/.idea/Hunyuan3D-2.1.iml b/.idea/Hunyuan3D-2.1.iml
deleted file mode 100644
index 41f56a8..0000000
--- a/.idea/Hunyuan3D-2.1.iml
+++ /dev/null
@@ -1,12 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
deleted file mode 100644
index 105ce2d..0000000
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index cfe85d6..0000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,4 +0,0 @@
-
-
-
-
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 23b18b0..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 35eb1dd..0000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
From 928f41b28980bcb272c64f5f089d464110f76d82 Mon Sep 17 00:00:00 2001
From: qinmaohui <820102443@qq.com>
Date: Wed, 10 Sep 2025 09:04:36 +0800
Subject: [PATCH 3/4] =?UTF-8?q?=E5=B0=86=E5=8E=9F=E6=96=87=E4=BB=B6?=
=?UTF-8?q?=E6=81=A2=E5=A4=8D=EF=BC=8C=E6=96=B0=E5=BB=BAcustom=5Frasterize?=
=?UTF-8?q?r=5Fkernel=5Ffor=5Fwindows=E6=96=87=E4=BB=B6=E5=A4=B9=E6=94=BE?=
=?UTF-8?q?=E7=BD=AE=E4=BF=AE=E6=94=B9=E7=9A=84=E6=96=87=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../grid_neighbor.cpp | 46 +-
.../custom_rasterizer_kernel/rasterizer.cpp | 24 +-
.../lib/custom_rasterizer_kernel/rasterizer.h | 3 +-
.../rasterizer_gpu.cu | 24 +-
.../__init__.py | 0
.../grid_neighbor.cpp | 572 ++++++++++++++++++
.../rasterizer.cpp | 139 +++++
.../rasterizer.h | 55 ++
.../rasterizer_gpu.cu | 127 ++++
9 files changed, 942 insertions(+), 48 deletions(-)
create mode 100644 hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/__init__.py
create mode 100644 hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/grid_neighbor.cpp
create mode 100644 hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/rasterizer.cpp
create mode 100644 hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/rasterizer.h
create mode 100644 hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/rasterizer_gpu.cu
diff --git a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp
index cbe21e1..c741c04 100644
--- a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp
+++ b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp
@@ -312,7 +312,7 @@ std::vector> build_hierarchy(std::vector view_layer_normals, int num_level, int resolution)
{
if (view_layer_positions.size() != 3 || num_level < 1) {
- printf("Alert! We require 3 layers and at least 1 level! (%zu %d)\n", view_layer_positions.size(), num_level);
+ printf("Alert! We require 3 layers and at least 1 level! (%d %d)\n", view_layer_positions.size(), num_level);
return {{},{},{},{}};
}
@@ -394,8 +394,8 @@ std::vector> build_hierarchy(std::vector grid_evencorners(grids.size());
std::vector grid_oddcorners(grids.size());
- texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), 3}, float_options);
- texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options);
+ texture_positions[0] = torch::zeros({seq2pos.size() / 3, 3}, float_options);
+ texture_positions[1] = torch::zeros({seq2pos.size() / 3}, float_options);
float* positions_out_ptr = texture_positions[0].data_ptr();
memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size());
positions_out_ptr = texture_positions[1].data_ptr();
@@ -404,24 +404,25 @@ std::vector> build_hierarchy(std::vector(grids[i].seq2grid.size()), 9}, int64_options);
- int64_t* nptr = grid_neighbors[i].data_ptr();
+ grid_neighbors[i] = torch::zeros({grids[i].seq2grid.size(), 9}, int64_options);
+ long* nptr = grid_neighbors[i].data_ptr();
for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) {
nptr[j] = grids[i].seq2neighbor[j];
}
- grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options);
- int64_t* dptr = grid_evencorners[i].data_ptr();
+ grid_evencorners[i] = torch::zeros({grids[i].seq2evencorner.size()}, int64_options);
+ grid_oddcorners[i] = torch::zeros({grids[i].seq2oddcorner.size()}, int64_options);
+ long* dptr = grid_evencorners[i].data_ptr();
for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) {
dptr[j] = grids[i].seq2evencorner[j];
}
- dptr = grid_oddcorners[i].data_ptr();
+ dptr = grid_oddcorners[i].data_ptr();
for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) {
dptr[j] = grids[i].seq2oddcorner[j];
}
if (i + 1 < grids.size()) {
- grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options);
- int64_t* dptr = grid_downsamples[i].data_ptr();
+ grid_downsamples[i] = torch::zeros({grids[i].downsample_seq.size()}, int64_options);
+ long* dptr = grid_downsamples[i].data_ptr();
for (int j = 0; j < grids[i].downsample_seq.size(); ++j) {
dptr[j] = grids[i].downsample_seq[j];
}
@@ -438,7 +439,7 @@ std::vector> build_hierarchy_with_feat(
int num_level, int resolution)
{
if (view_layer_positions.size() != 3 || num_level < 1) {
- printf("Alert! We require 3 layers and at least 1 level! (%zu %d)\n", view_layer_positions.size(), num_level);
+ printf("Alert! We require 3 layers and at least 1 level! (%d %d)\n", view_layer_positions.size(), num_level);
return {{},{},{},{}};
}
@@ -533,9 +534,9 @@ std::vector> build_hierarchy_with_feat(
std::vector grid_evencorners(grids.size());
std::vector grid_oddcorners(grids.size());
- texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), 3}, float_options);
- texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options);
- texture_feats[0] = torch::zeros({static_cast(seq2feat.size() / feat_channel), static_cast(feat_channel)}, float_options);
+ texture_positions[0] = torch::zeros({seq2pos.size() / 3, 3}, float_options);
+ texture_positions[1] = torch::zeros({seq2pos.size() / 3}, float_options);
+ texture_feats[0] = torch::zeros({seq2feat.size() / feat_channel, feat_channel}, float_options);
float* positions_out_ptr = texture_positions[0].data_ptr();
memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size());
positions_out_ptr = texture_positions[1].data_ptr();
@@ -546,27 +547,28 @@ std::vector> build_hierarchy_with_feat(
memcpy(feats_out_ptr, seq2feat.data(), sizeof(float) * seq2feat.size());
for (int i = 0; i < grids.size(); ++i) {
- grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), 9}, int64_options);
- int64_t* nptr = grid_neighbors[i].data_ptr();
+ grid_neighbors[i] = torch::zeros({grids[i].seq2grid.size(), 9}, int64_options);
+ long* nptr = grid_neighbors[i].data_ptr();
for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) {
nptr[j] = grids[i].seq2neighbor[j];
}
- grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options);
- int64_t* dptr = grid_evencorners[i].data_ptr();
+ grid_evencorners[i] = torch::zeros({grids[i].seq2evencorner.size()}, int64_options);
+ grid_oddcorners[i] = torch::zeros({grids[i].seq2oddcorner.size()}, int64_options);
+ long* dptr = grid_evencorners[i].data_ptr();
for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) {
dptr[j] = grids[i].seq2evencorner[j];
}
- dptr = grid_oddcorners[i].data_ptr();
+ dptr = grid_oddcorners[i].data_ptr();
for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) {
dptr[j] = grids[i].seq2oddcorner[j];
}
if (i + 1 < grids.size()) {
- grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options);
- int64_t* dptr = grid_downsamples[i].data_ptr();
+ grid_downsamples[i] = torch::zeros({grids[i].downsample_seq.size()}, int64_options);
+ long* dptr = grid_downsamples[i].data_ptr();
for (int j = 0; j < grids[i].downsample_seq.size(); ++j) {
dptr[j] = grids[i].downsample_seq[j];
}
}
}
return {texture_positions, texture_feats, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners};
-}
+}
\ No newline at end of file
diff --git a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp
index c02f1f9..2375636 100644
--- a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp
+++ b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp
@@ -1,6 +1,6 @@
#include "rasterizer.h"
-void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, int64_t* zbuffer, float* d, float occlusion_truncation) {
+void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) {
float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0]));
float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0]));
float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1]));
@@ -18,7 +18,7 @@ void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width
if (isBarycentricCoordInBounds(baryCentricCoordinate)) {
int pixel = py * width + px;
if (zbuffer == 0) {
- zbuffer[pixel] = (int64_t)(idx + 1);
+ zbuffer[pixel] = (INT64)(idx + 1);
continue;
}
@@ -29,7 +29,7 @@ void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width
}
int z_quantize = depth * (2<<17);
- int64_t token = (int64_t)z_quantize * MAXINT + (int64_t)(idx + 1);
+ INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1);
if (depth < depth_thres)
continue;
zbuffer[pixel] = std::min(zbuffer[pixel], token);
@@ -38,10 +38,10 @@ void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width
}
}
-void barycentricFromImgcoordCPU(float* V, int* F, int* findices, int64_t* zbuffer, int width, int height, int num_vertices, int num_faces,
+void barycentricFromImgcoordCPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces,
float* barycentric_map, int pix)
{
- int64_t f = zbuffer[pix] % MAXINT;
+ INT64 f = zbuffer[pix] % MAXINT;
if (f == (MAXINT-1)) {
findices[pix] = 0;
barycentric_map[pix * 3] = 0;
@@ -78,7 +78,7 @@ void barycentricFromImgcoordCPU(float* V, int* F, int* findices, int64_t* zbuffe
barycentric_map[pix * 3 + 2] = barycentric[2];
}
-void rasterizeImagecoordsKernelCPU(float* V, int* F, float* d, int64_t* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces, int f)
+void rasterizeImagecoordsKernelCPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces, int f)
{
float* vt0_ptr = V + (F[f * 3] * 4);
float* vt1_ptr = V + (F[f * 3 + 1] * 4);
@@ -99,25 +99,25 @@ std::vector rasterize_image_cpu(torch::Tensor V, torch::Tensor F,
auto options = torch::TensorOptions().dtype(torch::kInt32).requires_grad(false);
auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false);
auto findices = torch::zeros({height, width}, options);
- int64_t maxint = (int64_t)MAXINT * (int64_t)MAXINT + (MAXINT - 1);
- auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint;
+ INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1);
+ auto z_min = torch::ones({height, width}, INT64_options) * (long)maxint;
if (!use_depth_prior) {
for (int i = 0; i < num_faces; ++i) {
rasterizeImagecoordsKernelCPU(V.data_ptr(), F.data_ptr(), 0,
- (int64_t*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i);
+ (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i);
}
} else {
for (int i = 0; i < num_faces; ++i)
rasterizeImagecoordsKernelCPU(V.data_ptr(), F.data_ptr(), D.data_ptr(),
- (int64_t*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i);
+ (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i);
}
auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false);
auto barycentric = torch::zeros({height, width, 3}, float_options);
for (int i = 0; i < width * height; ++i)
barycentricFromImgcoordCPU(V.data_ptr(), F.data_ptr(),
- findices.data_ptr(), (int64_t*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr(), i);
+ findices.data_ptr(), (INT64*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr(), i);
return {findices, barycentric};
}
@@ -136,4 +136,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("rasterize_image", &rasterize_image, "Custom image rasterization");
m.def("build_hierarchy", &build_hierarchy, "Custom image rasterization");
m.def("build_hierarchy_with_feat", &build_hierarchy_with_feat, "Custom image rasterization");
-}
+}
\ No newline at end of file
diff --git a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h
index 8e198b1..cf4f987 100644
--- a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h
+++ b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h
@@ -5,9 +5,8 @@
#include
#include
#include // For CUDA context
-#include
-#define INT64 int64_t
+#define INT64 unsigned long long
#define MAXINT 2147483647
__host__ __device__ inline float calculateSignedArea2(float* a, float* b, float* c) {
diff --git a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu
index ba6ab91..373f78d 100644
--- a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu
+++ b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu
@@ -1,6 +1,6 @@
#include "rasterizer.h"
-__device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, uint64_t* zbuffer, float* d, float occlusion_truncation) {
+__device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) {
float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0]));
float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0]));
float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1]));
@@ -18,7 +18,7 @@ __device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2
if (isBarycentricCoordInBounds(baryCentricCoordinate)) {
int pixel = py * width + px;
if (zbuffer == 0) {
- atomicExch(&zbuffer[pixel], (uint64_t)(idx + 1));
+ atomicExch(&zbuffer[pixel], (INT64)(idx + 1));
continue;
}
float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + baryCentricCoordinate[2] * vt2[2];
@@ -28,7 +28,7 @@ __device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2
}
int z_quantize = depth * (2<<17);
- uint64_t token = (uint64_t)z_quantize * MAXINT + (uint64_t)(idx + 1);
+ INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1);
if (depth < depth_thres)
continue;
atomicMin(&zbuffer[pixel], token);
@@ -37,13 +37,13 @@ __device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2
}
}
-__global__ void barycentricFromImgcoordGPU(float* V, int* F, int* findices, uint64_t* zbuffer, int width, int height, int num_vertices, int num_faces,
+__global__ void barycentricFromImgcoordGPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces,
float* barycentric_map)
{
int pix = blockIdx.x * blockDim.x + threadIdx.x;
if (pix >= width * height)
return;
- uint64_t f = zbuffer[pix] % MAXINT;
+ INT64 f = zbuffer[pix] % MAXINT;
if (f == (MAXINT-1)) {
findices[pix] = 0;
barycentric_map[pix * 3] = 0;
@@ -80,7 +80,7 @@ __global__ void barycentricFromImgcoordGPU(float* V, int* F, int* findices, uint
barycentric_map[pix * 3 + 2] = barycentric[2];
}
-__global__ void rasterizeImagecoordsKernelGPU(float* V, int* F, float* d, uint64_t* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces)
+__global__ void rasterizeImagecoordsKernelGPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces)
{
int f = blockIdx.x * blockDim.x + threadIdx.x;
if (f >= num_faces)
@@ -107,21 +107,21 @@ std::vector rasterize_image_gpu(torch::Tensor V, torch::Tensor F,
auto options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA, device_id).requires_grad(false);
auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA, device_id).requires_grad(false);
auto findices = torch::zeros({height, width}, options);
- uint64_t maxint = (uint64_t)MAXINT * (uint64_t)MAXINT + (MAXINT - 1);
- auto z_min = torch::ones({height, width}, INT64_options) * (uint64_t)maxint;
+ INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1);
+ auto z_min = torch::ones({height, width}, INT64_options) * (long)maxint;
if (!use_depth_prior) {
rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), 0,
- (uint64_t*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces);
+ (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces);
} else {
rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), D.data_ptr(),
- (uint64_t*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces);
+ (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces);
}
auto float_options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA, device_id).requires_grad(false);
auto barycentric = torch::zeros({height, width, 3}, float_options);
barycentricFromImgcoordGPU<<<(width * height + 255)/256, 256>>>(V.data_ptr(), F.data_ptr(),
- findices.data_ptr(), (uint64_t*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr());
+ findices.data_ptr(), (INT64*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr());
return {findices, barycentric};
-}
+}
\ No newline at end of file
diff --git a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/__init__.py b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/grid_neighbor.cpp b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/grid_neighbor.cpp
new file mode 100644
index 0000000..cbe21e1
--- /dev/null
+++ b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/grid_neighbor.cpp
@@ -0,0 +1,572 @@
+#include "rasterizer.h"
+#include
+
// Quantize a position in [-1, 1]^3 into a flat voxel key:
// (x * resolution + y) * resolution + z with each axis in [0, resolution).
inline int pos2key(float* p, int resolution) {
    const int ix = (p[0] * 0.5 + 0.5) * resolution;
    const int iy = (p[1] * 0.5 + 0.5) * resolution;
    const int iz = (p[2] * 0.5 + 0.5) * resolution;
    return (ix * resolution + iy) * resolution + iz;
}
+
// Inverse of pos2key: decode a flat voxel key into the voxel-center
// position in [-1, 1]^3.
inline void key2pos(int key, int resolution, float* p) {
    const int iz = key % resolution;
    const int iy = (key / resolution) % resolution;
    const int ix = key / (resolution * resolution);
    p[0] = ((ix + 0.5) / resolution - 0.5) * 2;
    p[1] = ((iy + 0.5) / resolution - 0.5) * 2;
    p[2] = ((iz + 0.5) / resolution - 0.5) * 2;
}
+
// Decode a flat voxel key into an off-center ("corner") sample position:
// offsets 0.75 / 0.25 / 0.75 instead of the 0.5 voxel center.
inline void key2cornerpos(int key, int resolution, float* p) {
    const int iz = key % resolution;
    const int iy = (key / resolution) % resolution;
    const int ix = key / (resolution * resolution);
    p[0] = ((ix + 0.75) / resolution - 0.5) * 2;
    p[1] = ((iy + 0.25) / resolution - 0.5) * 2;
    p[2] = ((iz + 0.75) / resolution - 0.5) * 2;
}
+
+inline float* pos_ptr(int l, int i, int j, torch::Tensor t) {
+ float* pdata = t.data_ptr();
+ int height = t.size(1);
+ int width = t.size(2);
+ return &pdata[((l * height + i) * width + j) * 4];
+}
+
// One sparse voxel-grid level of the hierarchy.
// FIX: container element types were stripped in the patch text; restored.
struct Grid
{
    std::vector<int> seq2oddcorner;         // 1 if the cell owns an "odd" corner sample
    std::vector<int> seq2evencorner;        // 1 if the cell owns an "even" corner sample
    std::vector<int> seq2grid;              // sequence index -> packed voxel key
    std::vector<int> seq2normal;            // dominant-axis label (0/1/2) per cell
    std::vector<int> seq2neighbor;          // 9 neighbor seq indices per cell, -1 = none
    std::unordered_map<int, int> grid2seq;  // packed voxel key -> sequence index
    std::vector<int> downsample_seq;        // per fine cell: seq index in coarser level
    int num_origin_seq;                     // number of cells before padding
    int resolution;                         // voxels per axis at this level
    int stride;                             // pixel stride in the view images
};
+
+inline void pos_from_seq(Grid& grid, int seq, float* p) {
+ auto k = grid.seq2grid[seq];
+ key2pos(k, grid.resolution, p);
+}
+
+inline int fetch_seq(Grid& grid, int l, int i, int j, torch::Tensor pdata) {
+ float* p = pos_ptr(l, i, j, pdata);
+ if (p[3] == 0)
+ return -1;
+ auto key = pos2key(p, grid.resolution);
+ int seq = grid.grid2seq[key];
+ return seq;
+}
+
+inline int fetch_last_seq(Grid& grid, int i, int j, torch::Tensor pdata) {
+ int num_layers = pdata.size(0);
+ int l = 0;
+ int idx = fetch_seq(grid, l, i, j, pdata);
+ while (l < num_layers - 1) {
+ l += 1;
+ int new_idx = fetch_seq(grid, l, i, j, pdata);
+ if (new_idx == -1)
+ break;
+ idx = new_idx;
+ }
+ return idx;
+}
+
+inline int fetch_nearest_seq(Grid& grid, int i, int j, int dim, float d, torch::Tensor pdata) {
+ float p[3];
+ float max_dist = 1e10;
+ int best_idx = -1;
+ int num_layers = pdata.size(0);
+ for (int l = 0; l < num_layers; ++l) {
+ int idx = fetch_seq(grid, l, i, j, pdata);
+ if (idx == -1)
+ break;
+ pos_from_seq(grid, idx, p);
+ float dist = std::abs(d - p[(dim + 2) % 3]);
+ if (dist < max_dist) {
+ max_dist = dist;
+ best_idx = idx;
+ }
+ }
+ return best_idx;
+}
+
+inline int fetch_nearest_seq_layer(Grid& grid, int i, int j, int dim, float d, torch::Tensor pdata) {
+ float p[3];
+ float max_dist = 1e10;
+ int best_layer = -1;
+ int num_layers = pdata.size(0);
+ for (int l = 0; l < num_layers; ++l) {
+ int idx = fetch_seq(grid, l, i, j, pdata);
+ if (idx == -1)
+ break;
+ pos_from_seq(grid, idx, p);
+ float dist = std::abs(d - p[(dim + 2) % 3]);
+ if (dist < max_dist) {
+ max_dist = dist;
+ best_layer = l;
+ }
+ }
+ return best_layer;
+}
+
+void FetchNeighbor(Grid& grid, int seq, float* pos, int dim, int boundary_info, std::vector& view_layer_positions,
+ int* output_indices)
+{
+ auto t = view_layer_positions[dim];
+ int height = t.size(1);
+ int width = t.size(2);
+ int top = 0;
+ int ci = 0;
+ int cj = 0;
+ if (dim == 0) {
+ ci = (pos[1]/2+0.5)*height;
+ cj = (pos[0]/2+0.5)*width;
+ }
+ else if (dim == 1) {
+ ci = (pos[1]/2+0.5)*height;
+ cj = (pos[2]/2+0.5)*width;
+ }
+ else {
+ ci = (-pos[2]/2+0.5)*height;
+ cj = (pos[0]/2+0.5)*width;
+ }
+ int stride = grid.stride;
+ for (int ni = ci + stride; ni >= ci - stride; ni -= stride) {
+ for (int nj = cj - stride; nj <= cj + stride; nj += stride) {
+ int idx = -1;
+ if (ni == ci && nj == cj)
+ idx = seq;
+ else if (!(ni < 0 || ni >= height || nj < 0 || nj >= width)) {
+ if (boundary_info == -1)
+ idx = fetch_seq(grid, 0, ni, nj, t);
+ else if (boundary_info == 1)
+ idx = fetch_last_seq(grid, ni, nj, t);
+ else
+ idx = fetch_nearest_seq(grid, ni, nj, dim, pos[(dim + 2) % 3], t);
+ }
+ output_indices[top] = idx;
+ top += 1;
+ }
+ }
+}
+
+void DownsampleGrid(Grid& src, Grid& tar)
+{
+ src.downsample_seq.resize(src.seq2grid.size(), -1);
+ tar.resolution = src.resolution / 2;
+ tar.stride = src.stride * 2;
+ float pos[3];
+ std::vector seq2normal_count;
+ for (int i = 0; i < src.seq2grid.size(); ++i) {
+ key2pos(src.seq2grid[i], src.resolution, pos);
+ int k = pos2key(pos, tar.resolution);
+ int s = seq2normal_count.size();
+ if (!tar.grid2seq.count(k)) {
+ tar.grid2seq[k] = tar.seq2grid.size();
+ tar.seq2grid.emplace_back(k);
+ seq2normal_count.emplace_back(0);
+ seq2normal_count.emplace_back(0);
+ seq2normal_count.emplace_back(0);
+ //tar.seq2normal.emplace_back(src.seq2normal[i]);
+ } else {
+ s = tar.grid2seq[k] * 3;
+ }
+ seq2normal_count[s + src.seq2normal[i]] += 1;
+ src.downsample_seq[i] = tar.grid2seq[k];
+ }
+ tar.seq2normal.resize(seq2normal_count.size() / 3);
+ for (int i = 0; i < seq2normal_count.size(); i += 3) {
+ int t = 0;
+ for (int j = 1; j < 3; ++j) {
+ if (seq2normal_count[i + j] > seq2normal_count[i + t])
+ t = j;
+ }
+ tar.seq2normal[i / 3] = t;
+ }
+}
+
+void NeighborGrid(Grid& grid, std::vector view_layer_positions, int v)
+{
+ grid.seq2evencorner.resize(grid.seq2grid.size(), 0);
+ grid.seq2oddcorner.resize(grid.seq2grid.size(), 0);
+ std::unordered_set visited_seq;
+ for (int vd = 0; vd < 3; ++vd) {
+ auto t = view_layer_positions[vd];
+ auto t0 = view_layer_positions[v];
+ int height = t.size(1);
+ int width = t.size(2);
+ int num_layers = t.size(0);
+ int num_view_layers = t0.size(0);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ for (int l = 0; l < num_layers; ++l) {
+ int seq = fetch_seq(grid, l, i, j, t);
+ if (seq == -1)
+ break;
+ int dim = grid.seq2normal[seq];
+ if (dim != v)
+ continue;
+
+ float pos[3];
+ pos_from_seq(grid, seq, pos);
+
+ int ci = 0;
+ int cj = 0;
+ if (dim == 0) {
+ ci = (pos[1]/2+0.5)*height;
+ cj = (pos[0]/2+0.5)*width;
+ }
+ else if (dim == 1) {
+ ci = (pos[1]/2+0.5)*height;
+ cj = (pos[2]/2+0.5)*width;
+ }
+ else {
+ ci = (-pos[2]/2+0.5)*height;
+ cj = (pos[0]/2+0.5)*width;
+ }
+
+ if ((ci % (grid.stride * 2) < grid.stride) && (cj % (grid.stride * 2) >= grid.stride))
+ grid.seq2evencorner[seq] = 1;
+
+ if ((ci % (grid.stride * 2) >= grid.stride) && (cj % (grid.stride * 2) < grid.stride))
+ grid.seq2oddcorner[seq] = 1;
+
+ bool is_boundary = false;
+ if (vd == v) {
+ if (l == 0 || l == num_layers - 1)
+ is_boundary = true;
+ else {
+ int seq_new = fetch_seq(grid, l + 1, i, j, t);
+ if (seq_new == -1)
+ is_boundary = true;
+ }
+ }
+ int boundary_info = 0;
+ if (is_boundary && (l == 0))
+ boundary_info = -1;
+ else if (is_boundary)
+ boundary_info = 1;
+ if (visited_seq.count(seq))
+ continue;
+ visited_seq.insert(seq);
+
+ FetchNeighbor(grid, seq, pos, dim, boundary_info, view_layer_positions, &grid.seq2neighbor[seq * 9]);
+ }
+ }
+ }
+ }
+}
+
+void PadGrid(Grid& src, Grid& tar, std::vector& view_layer_positions) {
+ auto& downsample_seq = src.downsample_seq;
+ auto& seq2evencorner = src.seq2evencorner;
+ auto& seq2oddcorner = src.seq2oddcorner;
+ int indices[9];
+ std::vector mapped_even_corners(tar.seq2grid.size(), 0);
+ std::vector mapped_odd_corners(tar.seq2grid.size(), 0);
+ for (int i = 0; i < downsample_seq.size(); ++i) {
+ if (seq2evencorner[i] > 0) {
+ mapped_even_corners[downsample_seq[i]] = 1;
+ }
+ if (seq2oddcorner[i] > 0) {
+ mapped_odd_corners[downsample_seq[i]] = 1;
+ }
+ }
+ auto& tar_seq2normal = tar.seq2normal;
+ auto& tar_seq2grid = tar.seq2grid;
+ for (int i = 0; i < tar_seq2grid.size(); ++i) {
+ if (mapped_even_corners[i] == 1 && mapped_odd_corners[i] == 1)
+ continue;
+ auto k = tar_seq2grid[i];
+ float p[3];
+ key2cornerpos(k, tar.resolution, p);
+
+ int src_key = pos2key(p, src.resolution);
+ if (!src.grid2seq.count(src_key)) {
+ int seq = src.seq2grid.size();
+ src.grid2seq[src_key] = seq;
+ src.seq2evencorner.emplace_back((mapped_even_corners[i] == 0));
+ src.seq2oddcorner.emplace_back((mapped_odd_corners[i] == 0));
+ src.seq2grid.emplace_back(src_key);
+ src.seq2normal.emplace_back(tar_seq2normal[i]);
+ FetchNeighbor(src, seq, p, tar_seq2normal[i], 0, view_layer_positions, indices);
+ for (int j = 0; j < 9; ++j) {
+ src.seq2neighbor.emplace_back(indices[j]);
+ }
+ src.downsample_seq.emplace_back(i);
+ } else {
+ int seq = src.grid2seq[src_key];
+ if (mapped_even_corners[i] == 0)
+ src.seq2evencorner[seq] = 1;
+ if (mapped_odd_corners[i] == 0)
+ src.seq2oddcorner[seq] = 1;
+ }
+ }
+}
+
+std::vector> build_hierarchy(std::vector view_layer_positions,
+ std::vector view_layer_normals, int num_level, int resolution)
+{
+ if (view_layer_positions.size() != 3 || num_level < 1) {
+ printf("Alert! We require 3 layers and at least 1 level! (%zu %d)\n", view_layer_positions.size(), num_level);
+ return {{},{},{},{}};
+ }
+
+ std::vector grids;
+ grids.resize(num_level);
+
+ std::vector seq2pos;
+ auto& seq2grid = grids[0].seq2grid;
+ auto& seq2normal = grids[0].seq2normal;
+ auto& grid2seq = grids[0].grid2seq;
+ grids[0].resolution = resolution;
+ grids[0].stride = 1;
+
+ auto int64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false);
+ auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false);
+
+ for (int v = 0; v < 3; ++v) {
+ int num_layers = view_layer_positions[v].size(0);
+ int height = view_layer_positions[v].size(1);
+ int width = view_layer_positions[v].size(2);
+ float* data = view_layer_positions[v].data_ptr();
+ float* data_normal = view_layer_normals[v].data_ptr();
+ for (int l = 0; l < num_layers; ++l) {
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ float* p = &data[(i * width + j) * 4];
+ float* n = &data_normal[(i * width + j) * 3];
+ if (p[3] == 0)
+ continue;
+ auto k = pos2key(p, resolution);
+ if (!grid2seq.count(k)) {
+ int dim = 0;
+ for (int d = 0; d < 3; ++d) {
+ if (std::abs(n[d]) > std::abs(n[dim]))
+ dim = d;
+ }
+ dim = (dim + 1) % 3;
+ grid2seq[k] = seq2grid.size();
+ seq2grid.emplace_back(k);
+ seq2pos.push_back(p[0]);
+ seq2pos.push_back(p[1]);
+ seq2pos.push_back(p[2]);
+ seq2normal.emplace_back(dim);
+ }
+ }
+ }
+ data += (height * width * 4);
+ data_normal += (height * width * 3);
+ }
+ }
+
+ for (int i = 0; i < num_level - 1; ++i) {
+ DownsampleGrid(grids[i], grids[i + 1]);
+ }
+
+ for (int l = 0; l < num_level; ++l) {
+ grids[l].seq2neighbor.resize(grids[l].seq2grid.size() * 9, -1);
+ grids[l].num_origin_seq = grids[l].seq2grid.size();
+ for (int d = 0; d < 3; ++d) {
+ NeighborGrid(grids[l], view_layer_positions, d);
+ }
+ }
+
+ for (int i = num_level - 2; i >= 0; --i) {
+ PadGrid(grids[i], grids[i + 1], view_layer_positions);
+ }
+ for (int i = grids[0].num_origin_seq; i < grids[0].seq2grid.size(); ++i) {
+ int k = grids[0].seq2grid[i];
+ float p[3];
+ key2pos(k, grids[0].resolution, p);
+ seq2pos.push_back(p[0]);
+ seq2pos.push_back(p[1]);
+ seq2pos.push_back(p[2]);
+ }
+
+ std::vector texture_positions(2);
+ std::vector grid_neighbors(grids.size());
+ std::vector grid_downsamples(grids.size() - 1);
+ std::vector grid_evencorners(grids.size());
+ std::vector grid_oddcorners(grids.size());
+
+ texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), 3}, float_options);
+ texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options);
+ float* positions_out_ptr = texture_positions[0].data_ptr();
+ memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size());
+ positions_out_ptr = texture_positions[1].data_ptr();
+ for (int i = 0; i < grids[0].seq2grid.size(); ++i) {
+ positions_out_ptr[i] = (i < grids[0].num_origin_seq);
+ }
+
+ for (int i = 0; i < grids.size(); ++i) {
+ grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), 9}, int64_options);
+ int64_t* nptr = grid_neighbors[i].data_ptr();
+ for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) {
+ nptr[j] = grids[i].seq2neighbor[j];
+ }
+
+ grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options);
+ int64_t* dptr = grid_evencorners[i].data_ptr();
+ for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) {
+ dptr[j] = grids[i].seq2evencorner[j];
+ }
+ dptr = grid_oddcorners[i].data_ptr();
+ for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) {
+ dptr[j] = grids[i].seq2oddcorner[j];
+ }
+ if (i + 1 < grids.size()) {
+ grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options);
+ int64_t* dptr = grid_downsamples[i].data_ptr();
+ for (int j = 0; j < grids[i].downsample_seq.size(); ++j) {
+ dptr[j] = grids[i].downsample_seq[j];
+ }
+ }
+
+ }
+ return {texture_positions, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners};
+}
+
+std::vector> build_hierarchy_with_feat(
+ std::vector view_layer_positions,
+ std::vector view_layer_normals,
+ std::vector view_layer_feats,
+ int num_level, int resolution)
+{
+ if (view_layer_positions.size() != 3 || num_level < 1) {
+ printf("Alert! We require 3 layers and at least 1 level! (%zu %d)\n", view_layer_positions.size(), num_level);
+ return {{},{},{},{}};
+ }
+
+ std::vector grids;
+ grids.resize(num_level);
+
+ std::vector seq2pos;
+ std::vector seq2feat;
+ auto& seq2grid = grids[0].seq2grid;
+ auto& seq2normal = grids[0].seq2normal;
+ auto& grid2seq = grids[0].grid2seq;
+ grids[0].resolution = resolution;
+ grids[0].stride = 1;
+
+ auto int64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false);
+ auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false);
+
+ int feat_channel = 3;
+ for (int v = 0; v < 3; ++v) {
+ int num_layers = view_layer_positions[v].size(0);
+ int height = view_layer_positions[v].size(1);
+ int width = view_layer_positions[v].size(2);
+ float* data = view_layer_positions[v].data_ptr();
+ float* data_normal = view_layer_normals[v].data_ptr();
+ float* data_feat = view_layer_feats[v].data_ptr();
+ feat_channel = view_layer_feats[v].size(3);
+ for (int l = 0; l < num_layers; ++l) {
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ float* p = &data[(i * width + j) * 4];
+ float* n = &data_normal[(i * width + j) * 3];
+ float* f = &data_feat[(i * width + j) * feat_channel];
+ if (p[3] == 0)
+ continue;
+ auto k = pos2key(p, resolution);
+ if (!grid2seq.count(k)) {
+ int dim = 0;
+ for (int d = 0; d < 3; ++d) {
+ if (std::abs(n[d]) > std::abs(n[dim]))
+ dim = d;
+ }
+ dim = (dim + 1) % 3;
+ grid2seq[k] = seq2grid.size();
+ seq2grid.emplace_back(k);
+ seq2pos.push_back(p[0]);
+ seq2pos.push_back(p[1]);
+ seq2pos.push_back(p[2]);
+ for (int c = 0; c < feat_channel; ++c) {
+ seq2feat.emplace_back(f[c]);
+ }
+ seq2normal.emplace_back(dim);
+ }
+ }
+ }
+ data += (height * width * 4);
+ data_normal += (height * width * 3);
+ data_feat += (height * width * feat_channel);
+ }
+ }
+
+ for (int i = 0; i < num_level - 1; ++i) {
+ DownsampleGrid(grids[i], grids[i + 1]);
+ }
+
+ for (int l = 0; l < num_level; ++l) {
+ grids[l].seq2neighbor.resize(grids[l].seq2grid.size() * 9, -1);
+ grids[l].num_origin_seq = grids[l].seq2grid.size();
+ for (int d = 0; d < 3; ++d) {
+ NeighborGrid(grids[l], view_layer_positions, d);
+ }
+ }
+
+ for (int i = num_level - 2; i >= 0; --i) {
+ PadGrid(grids[i], grids[i + 1], view_layer_positions);
+ }
+ for (int i = grids[0].num_origin_seq; i < grids[0].seq2grid.size(); ++i) {
+ int k = grids[0].seq2grid[i];
+ float p[3];
+ key2pos(k, grids[0].resolution, p);
+ seq2pos.push_back(p[0]);
+ seq2pos.push_back(p[1]);
+ seq2pos.push_back(p[2]);
+ for (int c = 0; c < feat_channel; ++c) {
+ seq2feat.emplace_back(0.5);
+ }
+ }
+
+ std::vector texture_positions(2);
+ std::vector texture_feats(1);
+ std::vector grid_neighbors(grids.size());
+ std::vector grid_downsamples(grids.size() - 1);
+ std::vector grid_evencorners(grids.size());
+ std::vector grid_oddcorners(grids.size());
+
+ texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), 3}, float_options);
+ texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options);
+ texture_feats[0] = torch::zeros({static_cast(seq2feat.size() / feat_channel), static_cast(feat_channel)}, float_options);
+ float* positions_out_ptr = texture_positions[0].data_ptr();
+ memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size());
+ positions_out_ptr = texture_positions[1].data_ptr();
+ for (int i = 0; i < grids[0].seq2grid.size(); ++i) {
+ positions_out_ptr[i] = (i < grids[0].num_origin_seq);
+ }
+ float* feats_out_ptr = texture_feats[0].data_ptr();
+ memcpy(feats_out_ptr, seq2feat.data(), sizeof(float) * seq2feat.size());
+
+ for (int i = 0; i < grids.size(); ++i) {
+ grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), 9}, int64_options);
+ int64_t* nptr = grid_neighbors[i].data_ptr();
+ for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) {
+ nptr[j] = grids[i].seq2neighbor[j];
+ }
+ grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options);
+ int64_t* dptr = grid_evencorners[i].data_ptr();
+ for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) {
+ dptr[j] = grids[i].seq2evencorner[j];
+ }
+ dptr = grid_oddcorners[i].data_ptr();
+ for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) {
+ dptr[j] = grids[i].seq2oddcorner[j];
+ }
+ if (i + 1 < grids.size()) {
+ grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options);
+ int64_t* dptr = grid_downsamples[i].data_ptr();
+ for (int j = 0; j < grids[i].downsample_seq.size(); ++j) {
+ dptr[j] = grids[i].downsample_seq[j];
+ }
+ }
+ }
+ return {texture_positions, texture_feats, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners};
+}
diff --git a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/rasterizer.cpp b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/rasterizer.cpp
new file mode 100644
index 0000000..c02f1f9
--- /dev/null
+++ b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/rasterizer.cpp
@@ -0,0 +1,139 @@
+#include "rasterizer.h"
+
+void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, int64_t* zbuffer, float* d, float occlusion_truncation) {
+ float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0]));
+ float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0]));
+ float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1]));
+ float y_max = std::max(vt0[1], std::max(vt1[1],vt2[1]));
+
+ for (int px = x_min; px < x_max + 1; ++px) {
+ if (px < 0 || px >= width)
+ continue;
+ for (int py = y_min; py < y_max + 1; ++py) {
+ if (py < 0 || py >= height)
+ continue;
+ float vt[2] = {px + 0.5, py + 0.5};
+ float baryCentricCoordinate[3];
+ calculateBarycentricCoordinate(vt0, vt1, vt2, vt, baryCentricCoordinate);
+ if (isBarycentricCoordInBounds(baryCentricCoordinate)) {
+ int pixel = py * width + px;
+ if (zbuffer == 0) {
+ zbuffer[pixel] = (int64_t)(idx + 1);
+ continue;
+ }
+
+ float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + baryCentricCoordinate[2] * vt2[2];
+ float depth_thres = 0;
+ if (d) {
+ depth_thres = d[pixel] * 0.49999f + 0.5f + occlusion_truncation;
+ }
+
+ int z_quantize = depth * (2<<17);
+ int64_t token = (int64_t)z_quantize * MAXINT + (int64_t)(idx + 1);
+ if (depth < depth_thres)
+ continue;
+ zbuffer[pixel] = std::min(zbuffer[pixel], token);
+ }
+ }
+ }
+}
+
+void barycentricFromImgcoordCPU(float* V, int* F, int* findices, int64_t* zbuffer, int width, int height, int num_vertices, int num_faces,
+ float* barycentric_map, int pix)
+{
+ int64_t f = zbuffer[pix] % MAXINT;
+ if (f == (MAXINT-1)) {
+ findices[pix] = 0;
+ barycentric_map[pix * 3] = 0;
+ barycentric_map[pix * 3 + 1] = 0;
+ barycentric_map[pix * 3 + 2] = 0;
+ return;
+ }
+ findices[pix] = f;
+ f -= 1;
+ float barycentric[3] = {0, 0, 0};
+ if (f >= 0) {
+ float vt[2] = {float(pix % width) + 0.5f, float(pix / width) + 0.5f};
+ float* vt0_ptr = V + (F[f * 3] * 4);
+ float* vt1_ptr = V + (F[f * 3 + 1] * 4);
+ float* vt2_ptr = V + (F[f * 3 + 2] * 4);
+
+ float vt0[2] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f};
+ float vt1[2] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f};
+ float vt2[2] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f};
+
+ calculateBarycentricCoordinate(vt0, vt1, vt2, vt, barycentric);
+
+ barycentric[0] = barycentric[0] / vt0_ptr[3];
+ barycentric[1] = barycentric[1] / vt1_ptr[3];
+ barycentric[2] = barycentric[2] / vt2_ptr[3];
+ float w = 1.0f / (barycentric[0] + barycentric[1] + barycentric[2]);
+ barycentric[0] *= w;
+ barycentric[1] *= w;
+ barycentric[2] *= w;
+
+ }
+ barycentric_map[pix * 3] = barycentric[0];
+ barycentric_map[pix * 3 + 1] = barycentric[1];
+ barycentric_map[pix * 3 + 2] = barycentric[2];
+}
+
+void rasterizeImagecoordsKernelCPU(float* V, int* F, float* d, int64_t* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces, int f)
+{
+ float* vt0_ptr = V + (F[f * 3] * 4);
+ float* vt1_ptr = V + (F[f * 3 + 1] * 4);
+ float* vt2_ptr = V + (F[f * 3 + 2] * 4);
+
+ float vt0[3] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f, vt0_ptr[2] / vt0_ptr[3] * 0.49999f + 0.5f};
+ float vt1[3] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f, vt1_ptr[2] / vt1_ptr[3] * 0.49999f + 0.5f};
+ float vt2[3] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f, vt2_ptr[2] / vt2_ptr[3] * 0.49999f + 0.5f};
+
+ rasterizeTriangleCPU(f, vt0, vt1, vt2, width, height, zbuffer, d, occlusion_trunc);
+}
+
+std::vector rasterize_image_cpu(torch::Tensor V, torch::Tensor F, torch::Tensor D,
+ int width, int height, float occlusion_truncation, int use_depth_prior)
+{
+ int num_faces = F.size(0);
+ int num_vertices = V.size(0);
+ auto options = torch::TensorOptions().dtype(torch::kInt32).requires_grad(false);
+ auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false);
+ auto findices = torch::zeros({height, width}, options);
+ int64_t maxint = (int64_t)MAXINT * (int64_t)MAXINT + (MAXINT - 1);
+ auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint;
+
+ if (!use_depth_prior) {
+ for (int i = 0; i < num_faces; ++i) {
+ rasterizeImagecoordsKernelCPU(V.data_ptr(), F.data_ptr(), 0,
+ (int64_t*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i);
+ }
+ } else {
+ for (int i = 0; i < num_faces; ++i)
+ rasterizeImagecoordsKernelCPU(V.data_ptr(), F.data_ptr(), D.data_ptr(),
+ (int64_t*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i);
+ }
+
+ auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false);
+ auto barycentric = torch::zeros({height, width, 3}, float_options);
+ for (int i = 0; i < width * height; ++i)
+ barycentricFromImgcoordCPU(V.data_ptr(), F.data_ptr(),
+ findices.data_ptr(), (int64_t*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr(), i);
+
+ return {findices, barycentric};
+}
+
+std::vector rasterize_image(torch::Tensor V, torch::Tensor F, torch::Tensor D,
+ int width, int height, float occlusion_truncation, int use_depth_prior)
+{
+ int device_id = V.get_device();
+ if (device_id == -1)
+ return rasterize_image_cpu(V, F, D, width, height, occlusion_truncation, use_depth_prior);
+ else
+ return rasterize_image_gpu(V, F, D, width, height, occlusion_truncation, use_depth_prior);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("rasterize_image", &rasterize_image, "Custom image rasterization");
+ m.def("build_hierarchy", &build_hierarchy, "Custom image rasterization");
+ m.def("build_hierarchy_with_feat", &build_hierarchy_with_feat, "Custom image rasterization");
+}
diff --git a/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/rasterizer.h b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/rasterizer.h
new file mode 100644
index 0000000..8e198b1
--- /dev/null
+++ b/hy3dpaint/custom_rasterizer/lib/custom_rasterizer_kernel_for_windows/rasterizer.h
@@ -0,0 +1,55 @@
+#ifndef RASTERIZER_H_
+#define RASTERIZER_H_
+
+#include
+#include
+#include
+#include // For CUDA context
+#include
+
+#define INT64 int64_t
+#define MAXINT 2147483647
+
// Let this header compile with a plain host compiler (the patch targets a
// Windows build of the CPU-only kernel): when not under nvcc and the CUDA
// headers have not defined the qualifiers, expand them to nothing.
#ifndef __CUDACC__
#ifndef __host__
#define __host__
#endif
#ifndef __device__
#define __device__
#endif
#endif

// Twice the signed area of triangle (a, b, c) in 2D; the sign encodes
// the winding order of the vertices.
__host__ __device__ inline float calculateSignedArea2(float* a, float* b, float* c) {
    return ((c[0] - a[0]) * (b[1] - a[1]) - (b[0] - a[0]) * (c[1] - a[1]));
}
+
+__host__ __device__ inline void calculateBarycentricCoordinate(float* a, float* b, float* c, float* p,
+ float* barycentric)
+{
+ float beta_tri = calculateSignedArea2(a, p, c);
+ float gamma_tri = calculateSignedArea2(a, b, p);
+ float area = calculateSignedArea2(a, b, c);
+ if (area == 0) {
+ barycentric[0] = -1.0;
+ barycentric[1] = -1.0;
+ barycentric[2] = -1.0;
+ return;
+ }
+ float tri_inv = 1.0 / area;
+ float beta = beta_tri * tri_inv;
+ float gamma = gamma_tri * tri_inv;
+ float alpha = 1.0 - beta - gamma;
+ barycentric[0] = alpha;
+ barycentric[1] = beta;
+ barycentric[2] = gamma;
+}
+
// Host-compiler fallback for the CUDA qualifiers (see note above; the
// #ifndef guard makes this safe to repeat).
#ifndef __CUDACC__
#ifndef __host__
#define __host__
#endif
#ifndef __device__
#define __device__
#endif
#endif

// True when all three barycentric coordinates lie in [0, 1], i.e. the
// sample point is inside (or on the boundary of) the triangle.
__host__ __device__ inline bool isBarycentricCoordInBounds(float* barycentricCoord) {
    return barycentricCoord[0] >= 0.0 && barycentricCoord[0] <= 1.0 &&
           barycentricCoord[1] >= 0.0 && barycentricCoord[1] <= 1.0 &&
           barycentricCoord[2] >= 0.0 && barycentricCoord[2] <= 1.0;
}
+
+std::vector