From 0e87bc58a771c3a24fcc4f28353ddf3e19be6be0 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:30:12 +0800
Subject: [PATCH 01/57] more

---
 debug_print/__init__.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index addae96..4cbc802 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -1,6 +1,31 @@
+from typing import Dict
+
 import torch
 from ._kernels import print_tensor as print_tensor_kernel
 
 
+class _Buffer:
+    def __init__(self, device_index: int):
+        self._tensor = torch.zeros((10_000_000,), dtype=torch.char, device=f"cuda:{device_index}")
+        self._used_len = 0
+
+    def allocate(self, size: int):
+        output = self._tensor[self._used_len: self._used_len + size]
+        self._used_len += size
+        assert self._used_len <= len(self._tensor)
+        return output
+
+
+class _DebugPrinter:
+    def __init__(self):
+        # Can be optimized
+        self._buffers: Dict[int, _Buffer] = {
+            device_index: _Buffer(device_index=device_index)
+            for device_index in range(torch.cuda.device_count())
+        }
+
+    def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
+
+
 def print_tensor(x: torch.Tensor, print_ptr: bool = False):
     print_tensor_kernel(x, print_ptr)

From 741959ec067a696c4f7383dc150c04599474e1d5 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:31:21 +0800
Subject: [PATCH 02/57] more

---
 debug_print/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 4cbc802..2280ed8 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -25,6 +25,10 @@ def __init__(self):
         }
 
     def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
+        if len(name) > 0:
+            name_bytes = TODO
+            name_buffer = self._buffers[x.device].allocate(len(name_bytes) + 1)
+            name_buffer.copy_(TODO)
 
 
 def print_tensor(x: torch.Tensor, print_ptr: bool = False):

From 805b35e260245ae9228e24b1545fc392e975d195 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:32:14 +0800
Subject: [PATCH 03/57] more

---
 debug_print/__init__.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 2280ed8..671b259 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -1,4 +1,4 @@
-from typing import Dict
+from typing import Dict, Optional
 
 import torch
 from ._kernels import print_tensor as print_tensor_kernel
@@ -31,5 +31,14 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
             name_buffer.copy_(TODO)
 
 
-def print_tensor(x: torch.Tensor, print_ptr: bool = False):
-    print_tensor_kernel(x, print_ptr)
+_printer: Optional[_DebugPrinter] = None
+
+
+def initialize():
+    global _printer
+    assert _printer is None
+    _printer = _DebugPrinter()
+
+
+def print_tensor(x: torch.Tensor, name: str = "", print_ptr: bool = False):
+    _printer(x=x, name=name, print_ptr=print_ptr)

From 7b0f41ce26747c5c2eb3846227290f475ff2df35 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:32:37 +0800
Subject: [PATCH 04/57] more

---
 debug_print/__init__.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 671b259..c85a9c2 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -1,7 +1,7 @@
 from typing import Dict, Optional
 
 import torch
-from ._kernels import print_tensor as print_tensor_kernel
+from ._kernels import print_tensor as _print_tensor_kernel
 
 
 class _Buffer:
@@ -29,6 +29,9 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
             name_bytes = TODO
             name_buffer = self._buffers[x.device].allocate(len(name_bytes) + 1)
             name_buffer.copy_(TODO)
+        else:
+            name_buffer = None
+        _print_tensor_kernel(x, name_buffer, print_ptr)
 
 
 _printer: Optional[_DebugPrinter] = None

From e09b2969a606162dc1098ac775212dc47b75a636 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:34:21 +0800
Subject: [PATCH 05/57] more

---
 debug_print/__init__.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index c85a9c2..8832e2a 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -26,9 +26,10 @@ def __init__(self):
 
     def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
         if len(name) > 0:
-            name_bytes = TODO
-            name_buffer = self._buffers[x.device].allocate(len(name_bytes) + 1)
-            name_buffer.copy_(TODO)
+            name_bytes = name.encode("utf-8")
+            name_buffer = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
+            tmp = torch.empty(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
+            name_buffer.copy_(tmp.to(name_buffer.device).view(torch.char))
         else:
             name_buffer = None
         _print_tensor_kernel(x, name_buffer, print_ptr)

From a6c756d6748aea01010f64cd276fa260a5284c46 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:36:32 +0800
Subject: [PATCH 06/57] more

---
 csrc/debug_print.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 5527ba4..6a9c056 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -102,9 +102,12 @@ __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
   printf("\n");
 }
 
-void PrintTensor(torch::Tensor x, bool print_ptr) {
+void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name, bool print_ptr) {
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index());
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
+
+  const char* name_ptr = name.has_value() ? name->data_ptr() : nullptr;
+
   if (x.is_floating_point()) {
     if (x.dim() == 1) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(

From d0e7b5488f3f1138bfc48013c46d5654ddf4f110 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:37:08 +0800
Subject: [PATCH 07/57] more

---
 csrc/debug_print.cu | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 6a9c056..b600243 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -107,27 +107,28 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name, bool print
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
 
   const char* name_ptr = name.has_value() ? name->data_ptr() : nullptr;
+  const int name_len = name.has_value() ? name->numel() : 0;
 
   if (x.is_floating_point()) {
     if (x.dim() == 1) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor1D", ([&] {
             PrintFloatTensor1D<<<1, 1, 0, stream>>>(
-                x.data_ptr<scalar_t>(), x.stride(0), x.numel(), print_ptr);
+                x.data_ptr<scalar_t>(), x.stride(0), x.numel(), name_ptr, name_len, print_ptr);
           }));
     } else if (x.dim() == 2) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor2D", ([&] {
             PrintFloatTensor2D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.size(1), x.stride(0), x.stride(1),
-                x.numel(), print_ptr);
+                x.numel(), name_ptr, name_len, print_ptr);
           }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor3D", ([&] {
             PrintFloatTensor3D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.size(1), x.size(2), x.stride(0),
-                x.stride(1), x.stride(2), x.numel(), print_ptr);
+                x.stride(1), x.stride(2), x.numel(), name_ptr, name_len, print_ptr);
           }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
@@ -143,21 +144,21 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name, bool print
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor1D", ([&] {
                                    PrintIntTensor1D<<<1, 1, 0, stream>>>(
                                        x.data_ptr<scalar_t>(), x.stride(0),
-                                       x.numel(), print_ptr);
+                                       x.numel(), name_ptr, name_len, print_ptr);
                                  }));
     } else if (x.dim() == 2) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] {
                                    PrintIntTensor2D<<<1, 1, 0, stream>>>(
                                        x.data_ptr<scalar_t>(), x.size(1),
                                        x.stride(0), x.stride(1), x.numel(),
-                                       print_ptr);
+                                       name_ptr, name_len, print_ptr);
                                  }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor3D", ([&] {
                                    PrintIntTensor3D<<<1, 1, 0, stream>>>(
                                        x.data_ptr<scalar_t>(), x.size(1),
                                        x.size(2), x.stride(0), x.stride(1),
-                                       x.stride(2), x.numel(), print_ptr);
+                                       x.stride(2), x.numel(), name_ptr, name_len, print_ptr);
                                  }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher

From b66aa9a60f2cbeb400a369a35ac20c1bbbf23051 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:37:39 +0800
Subject: [PATCH 08/57] more

---
 csrc/debug_print.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index b600243..efd017a 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -18,7 +18,7 @@
 template <typename float_t>
 __global__ void PrintFloatTensor1D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
-                                   const bool print_ptr) {
+                                   const char* name_ptr, const int name_len, const bool print_ptr) {
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }
@@ -30,7 +30,7 @@
 template <typename int_t>
 __global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t stride_0,
-                                 const size_t n, const bool print_ptr) {
+                                 const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }
@@ -44,7 +44,7 @@
 template <typename float_t>
 __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const size_t shape_0, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
-                                   const bool print_ptr) {
+                                   const char* name_ptr, const int name_len, const bool print_ptr) {
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }
@@ -58,7 +58,7 @@
 template <typename int_t>
 __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0,
                                  const size_t stride_1, const size_t stride_0,
-                                 const size_t n, const bool print_ptr) {
+                                 const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }
@@ -74,7 +74,7 @@
 template <typename float_t>
 __global__ void PrintFloatTensor3D(float_t *__restrict__ x,
                                    const size_t shape_1, const size_t shape_0,
                                    const size_t stride_2, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
-                                   const bool print_ptr) {
+                                   const char* name_ptr, const int name_len, const bool print_ptr) {
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }
@@ -90,7 +90,7 @@
 template <typename int_t>
 __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
                                  const size_t shape_0, const size_t stride_2,
                                  const size_t stride_1, const size_t stride_0,
-                                 const size_t n, const bool print_ptr) {
+                                 const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }

From f43e6a7127edb31fc25d3faf72266c7e4012ac42 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:37:51 +0800
Subject: [PATCH 09/57] more

---
 csrc/debug_print.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index efd017a..61e7e69 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -102,12 +102,12 @@ __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
   printf("\n");
 }
 
-void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name, bool print_ptr) {
+void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, bool print_ptr) {
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index());
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
 
-  const char* name_ptr = name.has_value() ? name->data_ptr() : nullptr;
-  const int name_len = name.has_value() ? name->numel() : 0;
+  const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr() : nullptr;
+  const int name_len = name_buffer.has_value() ? name_buffer->numel() : 0;
 
   if (x.is_floating_point()) {
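The _Buffer class introduced in PATCH 01 is a bump allocator: one large CUDA byte tensor is allocated up front, and allocate() hands out consecutive slices without ever freeing anything. Because every slice aliases the same preallocated storage, a device pointer taken from it stays valid for as long as the process runs, which is what later patches rely on when such a pointer gets baked into a captured CUDA graph. A minimal standalone sketch of the idea (class name and capacity are illustrative, not part of the patches):

    import torch

    class BumpBuffer:
        # Hands out slices of one preallocated device tensor; never frees.
        def __init__(self, device_index: int, capacity: int = 1_000_000):
            self._tensor = torch.zeros((capacity,), dtype=torch.uint8, device=f"cuda:{device_index}")
            self._used_len = 0

        def allocate(self, size: int) -> torch.Tensor:
            out = self._tensor[self._used_len:self._used_len + size]
            self._used_len += size
            assert self._used_len <= self._tensor.numel(), "buffer exhausted"
            return out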
From d0a9ba40cf136cc217f3ea09c449b733e8c61b1f Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:38:39 +0800
Subject: [PATCH 10/57] more

---
 csrc/debug_print.cu | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 61e7e69..6335d94 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -15,13 +15,17 @@
     TYPE, NAME,                                                              \
     AT_DISPATCH_CASE_FLOATING_AND_REDUCED_FLOATING_TYPES(__VA_ARGS__))
 
+__device__ void PrintCommon(void* x, const char* name_ptr, const int name_len, const bool print_ptr) {
+  if (print_ptr) {
+    printf("addr: %lld\n", x);
+  }
+}
+
 template <typename float_t>
 __global__ void PrintFloatTensor1D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
                                    const char* name_ptr, const int name_len, const bool print_ptr) {
-  if (print_ptr) {
-    printf("addr: %lld\n", x);
-  }
+  PrintCommon(x, name_ptr, name_len, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[i * stride_0]));
   }
@@ -30,7 +34,5 @@
 template <typename int_t>
 __global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t stride_0,
                                  const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
-  if (print_ptr) {
-    printf("addr: %lld\n", x);
-  }
+  PrintCommon(x, name_ptr, name_len, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[i * stride_0]));
@@ -44,7 +46,5 @@
 __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const size_t shape_0, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
                                    const char* name_ptr, const int name_len, const bool print_ptr) {
-  if (print_ptr) {
-    printf("addr: %lld\n", x);
-  }
+  PrintCommon(x, name_ptr, name_len, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ",
            float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]));
@@ -58,7 +58,5 @@
 __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0,
                                  const size_t stride_1, const size_t stride_0,
                                  const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
-  if (print_ptr) {
-    printf("addr: %lld\n", x);
-  }
+  PrintCommon(x, name_ptr, name_len, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ",
            int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]));
@@ -74,7 +72,5 @@
 __global__ void PrintFloatTensor3D(float_t *__restrict__ x,
                                    const size_t shape_1, const size_t shape_0,
                                    const size_t stride_2, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
                                    const char* name_ptr, const int name_len, const bool print_ptr) {
-  if (print_ptr) {
-    printf("addr: %lld\n", x);
-  }
+  PrintCommon(x, name_ptr, name_len, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[(i / shape_0 / shape_1) * stride_2 +
                              ((i / shape_0) % shape_1) * stride_1 +
                              (i % shape_0) * stride_0]));
@@ -90,7 +86,5 @@
 __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
                                  const size_t shape_0, const size_t stride_2,
                                  const size_t stride_1, const size_t stride_0,
                                  const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
-  if (print_ptr) {
-    printf("addr: %lld\n", x);
-  }
+  PrintCommon(x, name_ptr, name_len, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[(i / shape_0 / shape_1) * stride_2 +
                                ((i / shape_0) % shape_1) * stride_1 +
                                (i % shape_0) * stride_0]));
   }
   printf("\n");
 }

From 47636bf5e0e10dc8b1c47ed989c8f815b48d8d30 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:39:04 +0800
Subject: [PATCH 11/57] more

---
 csrc/debug_print.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 6335d94..5afe4b7 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -15,7 +15,10 @@
     TYPE, NAME,                                                              \
     AT_DISPATCH_CASE_FLOATING_AND_REDUCED_FLOATING_TYPES(__VA_ARGS__))
 
-__device__ void PrintCommon(void* x, const char* name_ptr, const int name_len, const bool print_ptr) {
+__device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr) {
+  if (name_ptr != nullptr) {
+    printf("name: %s\n", name_ptr);
+  }
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }

From d5973fc3383f9b84793a77e37781f881ff110157 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:39:26 +0800
Subject: [PATCH 12/57] more

---
 csrc/debug_print.cu | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 5afe4b7..7086d80 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -27,8 +27,8 @@ __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr
 template <typename float_t>
 __global__ void PrintFloatTensor1D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
-                                   const char* name_ptr, const int name_len, const bool print_ptr) {
-  PrintCommon(x, name_ptr, name_len, print_ptr);
+                                   const char* name_ptr, const bool print_ptr) {
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[i * stride_0]));
   }
@@ -37,8 +37,8 @@
 template <typename int_t>
 __global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t stride_0,
-                                 const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
-  PrintCommon(x, name_ptr, name_len, print_ptr);
+                                 const size_t n, const char* name_ptr, const bool print_ptr) {
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[i * stride_0]));
   }
@@ -49,8 +49,8 @@
 __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const size_t shape_0, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
-                                   const char* name_ptr, const int name_len, const bool print_ptr) {
-  PrintCommon(x, name_ptr, name_len, print_ptr);
+                                   const char* name_ptr, const bool print_ptr) {
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ",
            float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]));
@@ -61,8 +61,8 @@
 __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0,
                                  const size_t stride_1, const size_t stride_0,
-                                 const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
-  PrintCommon(x, name_ptr, name_len, print_ptr);
+                                 const size_t n, const char* name_ptr, const bool print_ptr) {
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ",
            int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]));
@@ -75,8 +75,8 @@
 __global__ void PrintFloatTensor3D(float_t *__restrict__ x,
                                    const size_t shape_1, const size_t shape_0,
                                    const size_t stride_2, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
-                                   const char* name_ptr, const int name_len, const bool print_ptr) {
-  PrintCommon(x, name_ptr, name_len, print_ptr);
+                                   const char* name_ptr, const bool print_ptr) {
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[(i / shape_0 / shape_1) * stride_2 +
                              ((i / shape_0) % shape_1) * stride_1 +
@@ -89,8 +89,8 @@
 __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
                                  const size_t shape_0, const size_t stride_2,
                                  const size_t stride_1, const size_t stride_0,
-                                 const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
-  PrintCommon(x, name_ptr, name_len, print_ptr);
+                                 const size_t n, const char* name_ptr, const bool print_ptr) {
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[(i / shape_0 / shape_1) * stride_2 +
                                ((i / shape_0) % shape_1) * stride_1 +
                                (i % shape_0) * stride_0]));
@@ -104,28 +104,27 @@
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
 
   const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr() : nullptr;
-  const int name_len = name_buffer.has_value() ? name_buffer->numel() : 0;
 
   if (x.is_floating_point()) {
     if (x.dim() == 1) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor1D", ([&] {
             PrintFloatTensor1D<<<1, 1, 0, stream>>>(
-                x.data_ptr<scalar_t>(), x.stride(0), x.numel(), name_ptr, name_len, print_ptr);
+                x.data_ptr<scalar_t>(), x.stride(0), x.numel(), name_ptr, print_ptr);
           }));
     } else if (x.dim() == 2) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor2D", ([&] {
             PrintFloatTensor2D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.size(1), x.stride(0), x.stride(1),
-                x.numel(), name_ptr, name_len, print_ptr);
+                x.numel(), name_ptr, print_ptr);
           }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor3D", ([&] {
             PrintFloatTensor3D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.size(1), x.size(2), x.stride(0),
-                x.stride(1), x.stride(2), x.numel(), name_ptr, name_len, print_ptr);
+                x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr);
           }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
@@ -140,21 +141,21 @@
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor1D", ([&] {
                                    PrintIntTensor1D<<<1, 1, 0, stream>>>(
                                        x.data_ptr<scalar_t>(), x.stride(0),
-                                       x.numel(), name_ptr, name_len, print_ptr);
+                                       x.numel(), name_ptr, print_ptr);
                                  }));
     } else if (x.dim() == 2) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] {
                                    PrintIntTensor2D<<<1, 1, 0, stream>>>(
                                        x.data_ptr<scalar_t>(), x.size(1),
                                        x.stride(0), x.stride(1), x.numel(),
-                                       name_ptr, name_len, print_ptr);
+                                       name_ptr, print_ptr);
                                  }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor3D", ([&] {
                                    PrintIntTensor3D<<<1, 1, 0, stream>>>(
                                        x.data_ptr<scalar_t>(), x.size(1),
                                        x.size(2), x.stride(0), x.stride(1),
-                                       x.stride(2), x.numel(), name_ptr, name_len, print_ptr);
+                                       x.stride(2), x.numel(), name_ptr, print_ptr);
                                  }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher

From 76106c3497b47c008be8f100ecb41dffcfa54efe Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:40:26 +0800
Subject: [PATCH 13/57] more

---
 example.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example.py b/example.py
index f731d77..c62f4ba 100644
--- a/example.py
+++ b/example.py
@@ -27,9 +27,9 @@
     z = x @ y
     debug_print.print_tensor(z)
     z1 = z @ y
-    debug_print.print_tensor(z1[..., 0])
+    debug_print.print_tensor(z1[..., 0], name="This is name for part of z1")
     z2 = z1 @ y
-    debug_print.print_tensor(z2)
+    debug_print.print_tensor(z2, name="This is name for z2")
 
 x.copy_(torch.randn(2, 2))
 y.copy_(torch.ones(2, 2))

From b8095396edf170a0d7a85a9a997bee595f33dffc Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:42:28 +0800
Subject: [PATCH 14/57] more

---
 csrc/debug_print.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 7086d80..b3b3189 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -103,7 +103,7 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index());
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
 
-  const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr() : nullptr;
+  const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr::() : nullptr;
 
   if (x.is_floating_point()) {

From bc3cd589f7da94c266368309d90a54011e880c3f Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:50:13 +0800
Subject: [PATCH 15/57] more

---
 csrc/debug_print.cu | 2 +-
 example.py          | 2 ++
 setup.py            | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index b3b3189..c5e265a 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -103,7 +103,7 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index());
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
 
-  const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr::() : nullptr;
+  const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr() : nullptr;
 
   if (x.is_floating_point()) {

diff --git a/example.py b/example.py
index c62f4ba..66d31ef 100644
--- a/example.py
+++ b/example.py
@@ -1,6 +1,8 @@
 import torch
 import debug_print
 
+print(f"{vars(debug_print)=} {dir(debug_print)=}")
+debug_print.initialize()
 
 x = torch.rand(3, 4, 5).to(0)
 debug_print.print_tensor(x)

diff --git a/setup.py b/setup.py
index dedb087..2ba3a10 100644
--- a/setup.py
+++ b/setup.py
@@ -4,6 +4,7 @@
 setup(
     name="debug_print",
     version="0.0.2",
+    packages=['debug_print'],
     ext_modules=[
         CUDAExtension(
             name="debug_print._kernels",
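PATCH 16 below settles how a tensor's name travels to the device: the Python side encodes the string as UTF-8, appends a 0 byte so the bytes form a valid C string, stages them in a CPU tensor, and copies that into a slice of the preallocated GPU buffer; the kernel can then print the pointer with printf("%s"). A minimal sketch of just the encoding step (the function name here is illustrative):

    import torch

    def encode_name(name: str, device: torch.device) -> torch.Tensor:
        # UTF-8 bytes plus a trailing NUL, staged on CPU, then moved to the GPU.
        data = list(name.encode("utf-8")) + [0]
        return torch.tensor(data, dtype=torch.uint8, device="cpu").to(device)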
From 0ffd3e8ba94dc9390d663880bcf591df5ec2e12b Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:53:50 +0800
Subject: [PATCH 16/57] more

---
 csrc/debug_print.cu     | 2 +-
 debug_print/__init__.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index c5e265a..c25a068 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -103,7 +103,7 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index());
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
 
-  const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr() : nullptr;
+  const char* name_ptr = name_buffer.has_value() ? reinterpret_cast<const char*>(name_buffer->data_ptr()) : nullptr;
 
   if (x.is_floating_point()) {

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 8832e2a..c301dc1 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -28,8 +28,8 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
         if len(name) > 0:
             name_bytes = name.encode("utf-8")
             name_buffer = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
-            tmp = torch.empty(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
-            name_buffer.copy_(tmp.to(name_buffer.device).view(torch.char))
+            tmp = torch.empty(list(name_bytes) + [0], dtype=torch.uint8, device="cpu").to(name_buffer.device)
+            name_buffer.copy_(tmp)
         else:
             name_buffer = None
         _print_tensor_kernel(x, name_buffer, print_ptr)

From 4876b98189671fec67906d7c4bbd724b4c083cce Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:54:39 +0800
Subject: [PATCH 17/57] more

---
 example.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/example.py b/example.py
index 66d31ef..d34706c 100644
--- a/example.py
+++ b/example.py
@@ -1,7 +1,6 @@
 import torch
 import debug_print
 
-print(f"{vars(debug_print)=} {dir(debug_print)=}")
 debug_print.initialize()
 
 x = torch.rand(3, 4, 5).to(0)

From 6dd6a8b34e7ede16383bf0d8a13ccff0c585456f Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:55:01 +0800
Subject: [PATCH 18/57] more

---
 debug_print/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index c301dc1..353fba6 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -6,7 +6,7 @@
 class _Buffer:
     def __init__(self, device_index: int):
-        self._tensor = torch.zeros((10_000_000,), dtype=torch.char, device=f"cuda:{device_index}")
+        self._tensor = torch.zeros((10_000_000,), dtype=torch.uint8, device=f"cuda:{device_index}")
         self._used_len = 0

From 8cc9bce09e522aea75aa5c0c3715fcdf45cc9db4 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:55:46 +0800
Subject: [PATCH 19/57] more

---
 debug_print/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 353fba6..7e08354 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -28,7 +28,7 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
         if len(name) > 0:
             name_bytes = name.encode("utf-8")
             name_buffer = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
-            tmp = torch.empty(list(name_bytes) + [0], dtype=torch.uint8, device="cpu").to(name_buffer.device)
+            tmp = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu").to(name_buffer.device)
             name_buffer.copy_(tmp)

From a3ca231c173debed4b086a3d5b33a7c5c897f6e9 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:57:22 +0800
Subject: [PATCH 20/57] more

---
 debug_print/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 7e08354..f9acdc9 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -28,7 +28,8 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
         if len(name) > 0:
             name_bytes = name.encode("utf-8")
             name_buffer = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
-            tmp = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu").to(name_buffer.device)
+            tmp = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
+            tmp = tmp.to(name_buffer.device)
             name_buffer.copy_(tmp)
         else:
             name_buffer = None

From 6c18f170d1b4c99108638cfbaf93bfe1aa9c4ad3 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:59:56 +0800
Subject: [PATCH 21/57] more

---
 debug_print/__init__.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index f9acdc9..d6018e5 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from typing import Dict, Optional
 
 import torch
 from ._kernels import print_tensor as _print_tensor_kernel
@@ -16,6 +17,15 @@ def allocate(self, size: int):
         return output
 
 
+@dataclass
+class _CopyTask:
+    src: torch.Tensor
+    dst: torch.Tensor
+
+    def execute(self):
+        self.dst.copy_(self.src)
+
+
 class _DebugPrinter:
     def __init__(self):
         # Can be optimized
@@ -28,9 +38,9 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
         if len(name) > 0:
             name_bytes = name.encode("utf-8")
             name_buffer = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
-            tmp = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
-            tmp = tmp.to(name_buffer.device)
-            name_buffer.copy_(tmp)
+            name_cpu = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
+            name_cpu = name_cpu.to(name_buffer.device)
+            name_buffer.copy_(name_cpu)
         else:
             name_buffer = None
         _print_tensor_kernel(x, name_buffer, print_ptr)

From 14a850d54aac7bbad242f93b0410b697cfee7da0 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:00:14 +0800
Subject: [PATCH 22/57] more

---
 debug_print/__init__.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index d6018e5..796dfbd 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -37,13 +37,12 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
         if len(name) > 0:
             name_bytes = name.encode("utf-8")
-            name_buffer = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
+            name_buffer_gpu = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
             name_cpu = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
-            name_cpu = name_cpu.to(name_buffer.device)
-            name_buffer.copy_(name_cpu)
+            copy_task = _CopyTask(src=name_cpu, dst=name_buffer_gpu)
         else:
-            name_buffer = None
-        _print_tensor_kernel(x, name_buffer, print_ptr)
+            name_buffer_gpu = None
+        _print_tensor_kernel(x, name_buffer_gpu, print_ptr)

From e0d2ed9756409bb09e98d1984bcbaff195411044 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:01:00 +0800
Subject: [PATCH 23/57] more

---
 debug_print/__init__.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 796dfbd..e5c0757 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Dict, Optional
+from typing import Dict, Optional, List
 
 import torch
 from ._kernels import print_tensor as _print_tensor_kernel
@@ -33,17 +33,21 @@ def __init__(self):
             device_index: _Buffer(device_index=device_index)
             for device_index in range(torch.cuda.device_count())
         }
+        self._pending_copy_tasks: List[_CopyTask] = []
 
     def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
-        if len(name) > 0:
-            name_bytes = name.encode("utf-8")
-            name_buffer_gpu = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
-            name_cpu = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
-            copy_task = _CopyTask(src=name_cpu, dst=name_buffer_gpu)
-        else:
-            name_buffer_gpu = None
+        name_buffer_gpu = self._compute_name_buffer_gpu(name=name)
         _print_tensor_kernel(x, name_buffer_gpu, print_ptr)
 
+    def _compute_name_buffer_gpu(self, name: str):
+        if len(name) == 0:
+            return None
+
+        name_bytes = name.encode("utf-8")
+        name_buffer_gpu = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
+        name_cpu = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
+        copy_task = _CopyTask(src=name_cpu, dst=name_buffer_gpu)
+

From 5018cf4fc972687edfab6df2d1b053c7318a0580 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:01:24 +0800
Subject: [PATCH 24/57] more

---
 debug_print/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index e5c0757..2c34238 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -48,6 +48,11 @@ def _compute_name_buffer_gpu(self, name: str):
         name_cpu = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
         copy_task = _CopyTask(src=name_cpu, dst=name_buffer_gpu)
 
+        if torch.cuda.is_current_stream_capturing():
+            self._pending_copy_tasks.append(copy_task)
+        else:
+            copy_task.execute()
+

From 6eb37e2f9132b1cc1166f208d903a7cdab8beec9 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:02:18 +0800
Subject: [PATCH 25/57] more

---
 debug_print/__init__.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 2c34238..23e8a4e 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -35,16 +35,21 @@ def __init__(self):
         }
         self._pending_copy_tasks: List[_CopyTask] = []
 
+    def post_initialize(self):
+        for copy_task in self._pending_copy_tasks:
+            copy_task.execute()
+        self._pending_copy_tasks.clear()
+
     def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
-        name_buffer_gpu = self._compute_name_buffer_gpu(name=name)
+        name_buffer_gpu = self._compute_name_buffer_gpu(name=name, device_index=x.device.index)
         _print_tensor_kernel(x, name_buffer_gpu, print_ptr)
 
-    def _compute_name_buffer_gpu(self, name: str):
+    def _compute_name_buffer_gpu(self, name: str, device_index: int):
         if len(name) == 0:
             return None
 
         name_bytes = name.encode("utf-8")
-        name_buffer_gpu = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
+        name_buffer_gpu = self._buffers[device_index].allocate(len(name_bytes) + 1)
         name_cpu = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
         copy_task = _CopyTask(src=name_cpu, dst=name_buffer_gpu)
 
@@ -53,6 +58,8 @@ def _compute_name_buffer_gpu(self, name: str):
         else:
             copy_task.execute()
 
+        return name_buffer_gpu
+
 
 _printer: Optional[_DebugPrinter] = None
 
@@ -63,5 +70,9 @@ def initialize():
     _printer = _DebugPrinter()
 
 
+def post_initialize():
+    _printer.post_initialize()
+
+
 def print_tensor(x: torch.Tensor, name: str = "", print_ptr: bool = False):
     _printer(x=x, name=name, print_ptr=print_ptr)
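Patches 21 through 25 above deal with CUDA graph capture: while a stream is capturing, eagerly issuing the host-to-device copy of the name bytes is not safe, so the copy is wrapped in a _CopyTask, queued when torch.cuda.is_current_stream_capturing() is true, and flushed once after capture returns via post_initialize(). This works because the kernel launch recorded into the graph only needs the destination pointer, and that pointer is stable: it points into the preallocated buffer. The control flow, reduced to a standalone sketch (function names are illustrative):

    import torch

    pending = []

    def copy_now_or_later(src: torch.Tensor, dst: torch.Tensor):
        if torch.cuda.is_current_stream_capturing():
            # Inside capture: defer. The captured kernel only holds dst's
            # address, which stays valid because dst aliases the big buffer.
            pending.append((src, dst))
        else:
            dst.copy_(src)

    def flush_pending():
        # Call once, after graph capture has finished.
        for src, dst in pending:
            dst.copy_(src)
        pending.clear()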
From bdbf3791f76faffac8d8df4ef14d6f395aedc866 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:03:11 +0800
Subject: [PATCH 26/57] more

---
 example.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/example.py b/example.py
index d34706c..020da60 100644
--- a/example.py
+++ b/example.py
@@ -32,6 +32,8 @@
     z2 = z1 @ y
     debug_print.print_tensor(z2, name="This is name for z2")
 
+debug_print.post_initialize()
+
 x.copy_(torch.randn(2, 2))
 y.copy_(torch.ones(2, 2))
 print("start replay...")

From 96eaae434ea24d971e9cfa74572d7cb2ee717628 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:03:59 +0800
Subject: [PATCH 27/57] more

---
 example.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/example.py b/example.py
index 020da60..0a7d315 100644
--- a/example.py
+++ b/example.py
@@ -3,6 +3,7 @@
 
 debug_print.initialize()
 
+print("demo without cuda graph...")
 x = torch.rand(3, 4, 5).to(0)
 debug_print.print_tensor(x)
 debug_print.print_tensor(x[..., 0:3])
@@ -10,6 +11,7 @@
 debug_print.print_tensor(x[..., 0])
 debug_print.print_tensor(x[0:1, 1:3, 0:4])
 
+print("start warmup...")
 s = torch.cuda.Stream()
 s.wait_stream(torch.cuda.current_stream())
 x = torch.empty(2, 2).half().to(0)
@@ -21,6 +23,7 @@
     z2 = z1 @ y
 
+print("start graph capture...")
 g = torch.cuda.CUDAGraph()
 with torch.cuda.graph(g, stream=s):
     debug_print.print_tensor(x)

From c014aab19b4119c0b18c59f91fcd0b56295e5d78 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:06:59 +0800
Subject: [PATCH 28/57] more

---
 csrc/debug_print.cu | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index c25a068..60bc46f 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -52,8 +52,9 @@ __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const char* name_ptr, const bool print_ptr) {
   PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
-    printf("%.4f, ",
-           float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]));
+    printf("%.4f%c ",
+           float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
+           (i % shape_0 == 0) ? ";" : ",");
   }
   printf("\n");
 }
@@ -64,8 +65,9 @@ __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0,
                                  const size_t n, const char* name_ptr, const bool print_ptr) {
   PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
-    printf("%lld, ",
-           int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]));
+    printf("%lld%c ",
+           int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
+           (i % shape_0 == 0) ? ";" : ",");
   }
   printf("\n");
 }
@@ -78,9 +80,10 @@ __global__ void PrintFloatTensor3D(float_t *__restrict__ x,
                                    const char* name_ptr, const bool print_ptr) {
   PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
-    printf("%.4f, ", float(x[(i / shape_0 / shape_1) * stride_2 +
+    printf("%.4f%c ", float(x[(i / shape_0 / shape_1) * stride_2 +
                               ((i / shape_0) % shape_1) * stride_1 +
-                              (i % shape_0) * stride_0]));
+                              (i % shape_0) * stride_0]),
+           ((i % (shape_0 * shape_1) == 0) || (i % shape_0 == 0)) ? ";" : ",");
   }
   printf("\n");
 }
@@ -92,9 +95,10 @@ __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
                                  const size_t n, const char* name_ptr, const bool print_ptr) {
   PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
-    printf("%lld, ", int64_t(x[(i / shape_0 / shape_1) * stride_2 +
+    printf("%lld%c ", int64_t(x[(i / shape_0 / shape_1) * stride_2 +
                                 ((i / shape_0) % shape_1) * stride_1 +
-                                (i % shape_0) * stride_0]));
+                                (i % shape_0) * stride_0]),
+           ((i % (shape_0 * shape_1) == 0) || (i % shape_0 == 0)) ? ";" : ",");
   }
   printf("\n");
 }

From 06abf6cdbef6c021fc4a132a8b5cac4a4ad6acb3 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:08:12 +0800
Subject: [PATCH 29/57] more

---
 csrc/debug_print.cu     | 2 +-
 debug_print/__init__.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 60bc46f..d750472 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -103,7 +103,7 @@ __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
   printf("\n");
 }
 
-void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, bool print_ptr) {
+void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, bool print_ptr, bool print_shape) {
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index());
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 23e8a4e..b38ad3b 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -40,9 +40,9 @@ def post_initialize(self):
             copy_task.execute()
         self._pending_copy_tasks.clear()
 
-    def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
+    def __call__(self, x: torch.Tensor, name: str, print_ptr: bool, print_shape: bool):
         name_buffer_gpu = self._compute_name_buffer_gpu(name=name, device_index=x.device.index)
-        _print_tensor_kernel(x, name_buffer_gpu, print_ptr)
+        _print_tensor_kernel(x, name_buffer_gpu, print_ptr, print_shape)
 
     def _compute_name_buffer_gpu(self, name: str, device_index: int):
         if len(name) == 0:
@@ -74,5 +74,5 @@ def post_initialize():
     _printer.post_initialize()
 
 
-def print_tensor(x: torch.Tensor, name: str = "", print_ptr: bool = False):
-    _printer(x=x, name=name, print_ptr=print_ptr)
+def print_tensor(x: torch.Tensor, name: str = "", print_ptr: bool = False, print_shape: bool = False):
+    _printer(x=x, name=name, print_ptr=print_ptr, print_shape=print_shape)
From 577afe27d23a5d8927bb99320346a7920f0fc73e Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:10:40 +0800
Subject: [PATCH 30/57] more

---
 csrc/debug_print.cu | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index d750472..bd12481 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -15,7 +15,7 @@
     TYPE, NAME,                                                              \
     AT_DISPATCH_CASE_FLOATING_AND_REDUCED_FLOATING_TYPES(__VA_ARGS__))
 
-__device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr) {
+__device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr, const bool print_shape) {
   if (name_ptr != nullptr) {
     printf("name: %s\n", name_ptr);
   }
@@ -27,8 +27,8 @@ __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr
 template <typename float_t>
 __global__ void PrintFloatTensor1D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
-                                   const char* name_ptr, const bool print_ptr) {
-  PrintCommon(x, name_ptr, print_ptr);
+                                   const char* name_ptr, const bool print_ptr, const bool print_shape) {
+  PrintCommon(x, name_ptr, print_ptr, print_shape);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[i * stride_0]));
   }
@@ -37,8 +37,8 @@
 template <typename int_t>
 __global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t stride_0,
-                                 const size_t n, const char* name_ptr, const bool print_ptr) {
-  PrintCommon(x, name_ptr, print_ptr);
+                                 const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) {
+  PrintCommon(x, name_ptr, print_ptr, print_shape);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[i * stride_0]));
   }
@@ -49,8 +49,8 @@
 __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const size_t shape_0, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
-                                   const char* name_ptr, const bool print_ptr) {
-  PrintCommon(x, name_ptr, print_ptr);
+                                   const char* name_ptr, const bool print_ptr, const bool print_shape) {
+  PrintCommon(x, name_ptr, print_ptr, print_shape);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f%c ",
            float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
@@ -62,8 +62,8 @@
 __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0,
                                  const size_t stride_1, const size_t stride_0,
-                                 const size_t n, const char* name_ptr, const bool print_ptr) {
-  PrintCommon(x, name_ptr, print_ptr);
+                                 const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) {
+  PrintCommon(x, name_ptr, print_ptr, print_shape);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld%c ",
            int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
@@ -77,8 +77,8 @@
 __global__ void PrintFloatTensor3D(float_t *__restrict__ x,
                                    const size_t shape_1, const size_t shape_0,
                                    const size_t stride_2, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
-                                   const char* name_ptr, const bool print_ptr) {
-  PrintCommon(x, name_ptr, print_ptr);
+                                   const char* name_ptr, const bool print_ptr, const bool print_shape) {
+  PrintCommon(x, name_ptr, print_ptr, print_shape);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f%c ", float(x[(i / shape_0 / shape_1) * stride_2 +
                               ((i / shape_0) % shape_1) * stride_1 +
@@ -92,8 +92,9 @@
 __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
                                  const size_t shape_0, const size_t stride_2,
                                  const size_t stride_1, const size_t stride_0,
-                                 const size_t n, const char* name_ptr, const bool print_ptr) {
-  PrintCommon(x, name_ptr, print_ptr);
+                                 const size_t n, const char* name_ptr,
+                                 const bool print_ptr, const bool print_shape) {
+  PrintCommon(x, name_ptr, print_ptr, print_shape);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld%c ", int64_t(x[(i / shape_0 / shape_1) * stride_2 +
                                 ((i / shape_0) % shape_1) * stride_1 +
@@ -115,21 +115,21 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor1D", ([&] {
             PrintFloatTensor1D<<<1, 1, 0, stream>>>(
-                x.data_ptr<scalar_t>(), x.stride(0), x.numel(), name_ptr, print_ptr);
+                x.data_ptr<scalar_t>(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape);
           }));
     } else if (x.dim() == 2) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
          x.scalar_type(), "PrintFloatTensor2D", ([&] {
            PrintFloatTensor2D<<<1, 1, 0, stream>>>(
                x.data_ptr<scalar_t>(), x.size(1), x.stride(0), x.stride(1),
-               x.numel(), name_ptr, print_ptr);
+               x.numel(), name_ptr, print_ptr, print_shape);
          }));
    } else if (x.dim() == 3) {
      AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
          x.scalar_type(), "PrintFloatTensor3D", ([&] {
            PrintFloatTensor3D<<<1, 1, 0, stream>>>(
                x.data_ptr<scalar_t>(), x.size(1), x.size(2), x.stride(0),
-               x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr);
+               x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
          }));
    } else {
      // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
@@ -144,21 +145,21 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor1D", ([&] {
                                   PrintIntTensor1D<<<1, 1, 0, stream>>>(
                                       x.data_ptr<scalar_t>(), x.stride(0),
-                                      x.numel(), name_ptr, print_ptr);
+                                      x.numel(), name_ptr, print_ptr, print_shape);
                                 }));
    } else if (x.dim() == 2) {
      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] {
                                   PrintIntTensor2D<<<1, 1, 0, stream>>>(
                                       x.data_ptr<scalar_t>(), x.size(1),
                                       x.stride(0), x.stride(1), x.numel(),
-                                      name_ptr, print_ptr);
+                                      name_ptr, print_ptr, print_shape);
                                 }));
    } else if (x.dim() == 3) {
      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor3D", ([&] {
                                   PrintIntTensor3D<<<1, 1, 0, stream>>>(
                                       x.data_ptr<scalar_t>(), x.size(1),
                                       x.size(2), x.stride(0), x.stride(1),
-                                      x.stride(2), x.numel(), name_ptr, print_ptr);
+                                      x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
                                 }));
    } else {
      // NOTE(Zihao): I'm just too lazy to do this, codegen for higher

From e66e5619d793b973ba28ce70f0f752fb5e9b8078 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:10:59 +0800
Subject: [PATCH 31/57] more

---
 csrc/debug_print.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index bd12481..2fea4c2 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -28,7 +28,7 @@ template <typename float_t>
 __global__ void PrintFloatTensor1D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
                                    const char* name_ptr, const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr, print_shape);
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[i * stride_0]));
   }
@@ -38,7 +38,7 @@ template <typename int_t>
 __global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t stride_0,
                                  const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr, print_shape);
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[i * stride_0]));
   }
@@ -50,7 +50,7 @@ __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
                                    const char* name_ptr, const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr, print_shape);
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f%c ",
            float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
@@ -63,7 +63,7 @@ __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0,
                                  const size_t stride_1, const size_t stride_0,
                                  const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr, print_shape);
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld%c ",
            int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
@@ -78,7 +78,7 @@ __global__ void PrintFloatTensor3D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
                                    const char* name_ptr, const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr, print_shape);
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f%c ", float(x[(i / shape_0 / shape_1) * stride_2 +
                               ((i / shape_0) % shape_1) * stride_1 +
@@ -93,7 +93,7 @@ __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
                                  const size_t n, const char* name_ptr,
                                  const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr, print_shape);
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld%c ", int64_t(x[(i / shape_0 / shape_1) * stride_2 +
                                 ((i / shape_0) % shape_1) * stride_1 +
                                 (i % shape_0) * stride_0]),

From c7a901c9bfcc577276ff2a92c6b2d0d1d9d36fc5 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:12:18 +0800
Subject: [PATCH 32/57] more

---
 csrc/debug_print.cu | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 2fea4c2..0e96cbb 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -29,6 +29,9 @@ __global__ void PrintFloatTensor1D(float_t *__restrict__ x,
                                    const char* name_ptr, const bool print_ptr, const bool print_shape) {
   PrintCommon(x, name_ptr, print_ptr);
+  if (print_shape) {
+    printf("shape=(%d), stride=(%d)", (int) n, (int) stride_0);
+  }
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[i * stride_0]));
   }
@@ -42,6 +45,9 @@ __global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t stride_0,
                                  const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) {
   PrintCommon(x, name_ptr, print_ptr);
+  if (print_shape) {
+    printf("shape=(%d), stride=(%d)", (int) n, (int) stride_0);
+  }
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[i * stride_0]));
   }
@@ -57,6 +63,9 @@ __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const char* name_ptr, const bool print_ptr, const bool print_shape) {
   PrintCommon(x, name_ptr, print_ptr);
+  if (print_shape) {
+    printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, , (int) stride_0);
+  }
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f%c ",
            float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
@@ -130,14 +139,14 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor2D", ([&] {
             PrintFloatTensor2D<<<1, 1, 0, stream>>>(
-                x.data_ptr<scalar_t>(), x.size(1), x.stride(0), x.stride(1),
+                x.data_ptr<scalar_t>(), x.size(0), x.size(1), x.stride(0), x.stride(1),
                 x.numel(), name_ptr, print_ptr, print_shape);
           }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor3D", ([&] {
             PrintFloatTensor3D<<<1, 1, 0, stream>>>(
-                x.data_ptr<scalar_t>(), x.size(1), x.size(2), x.stride(0),
+                x.data_ptr<scalar_t>(), x.size(0), x.size(1), x.size(2), x.stride(0),
                 x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
           }));
     } else {
@@ -159,14 +168,14 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
     } else if (x.dim() == 2) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] {
                                    PrintIntTensor2D<<<1, 1, 0, stream>>>(
-                                       x.data_ptr<scalar_t>(), x.size(1),
+                                       x.data_ptr<scalar_t>(), x.size(0), x.size(1),
                                        x.stride(0), x.stride(1), x.numel(),
                                        name_ptr, print_ptr, print_shape);
                                  }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor3D", ([&] {
                                    PrintIntTensor3D<<<1, 1, 0, stream>>>(
-                                       x.data_ptr<scalar_t>(), x.size(1),
+                                       x.data_ptr<scalar_t>(), x.size(0), x.size(1),
                                        x.size(2), x.stride(0), x.stride(1),
                                        x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
                                  }));
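Throughout these kernels a single flat loop index i enumerates all n elements, and the per-dimension indices are recovered with integer division and modulo before being multiplied by the corresponding strides. Note the parameter naming: in the 2D case, shape_0 is the size of the innermost dimension (the call site passes x.size(1) for it, or x.size(0) plus x.size(1) after the patch above). A pure-Python rendition of the 2D offset computation, for reference only:

    # offset(i) = (i // inner) * stride_outer + (i % inner) * stride_inner
    def offsets_2d(inner, stride_outer, stride_inner, n):
        return [(i // inner) * stride_outer + (i % inner) * stride_inner for i in range(n)]

    # A contiguous 2x3 tensor has strides (3, 1); the flat loop visits 0..5 in order.
    assert offsets_2d(3, 3, 1, 6) == [0, 1, 2, 3, 4, 5]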
deletion(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 0e96cbb..790d5a1 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -58,7 +58,7 @@ __global__ void PrintFloatTensor2D(float_t *__restrict__ x, const char* name_ptr, const bool print_ptr, const bool print_shape) { PrintCommon(x, name_ptr, print_ptr); if (print_shape) { - printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, , (int) stride_0); + printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, (int) shape_1, (int) stride_0, (int) stride_1); } for (size_t i = 0; i < n; ++i) { printf("%.4f%c ", @@ -73,6 +73,9 @@ __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0, const size_t stride_1, const size_t stride_0, const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) { PrintCommon(x, name_ptr, print_ptr); + if (print_shape) { + printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, (int) shape_1, (int) stride_0, (int) stride_1); + } for (size_t i = 0; i < n; ++i) { printf("%lld%c ", int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]), @@ -88,6 +91,10 @@ __global__ void PrintFloatTensor3D(float_t *__restrict__ x, const size_t stride_0, const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) { PrintCommon(x, name_ptr, print_ptr); + if (print_shape) { + printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", + (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2); + } for (size_t i = 0; i < n; ++i) { printf("%.4f%c ", float(x[(i / shape_0 / shape_1) * stride_2 + ((i / shape_0) % shape_1) * stride_1 + @@ -104,6 +111,10 @@ __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1, const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) { PrintCommon(x, name_ptr, print_ptr); + if (print_shape) { + printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", + (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2); + } for (size_t i = 0; i < n; ++i) { printf("%lld%c ", int64_t(x[(i / shape_0 / shape_1) * stride_2 + ((i / shape_0) % shape_1) * stride_1 + From aa0aa6ff823c3e15e066615e23426ef7b65124e7 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:16:08 +0800 Subject: [PATCH 34/57] more --- csrc/debug_print.cu | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 790d5a1..ec79a39 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -24,8 +24,19 @@ __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr, } } +template +__device__ void PrintElem(scalar_t value) { + if constexpr (std::is_floating_point::value) { + printf("%.4f, ", float(x[i * stride_0])); + } else if constexpr (std::is_integral::value) { + printf("%lld, ", static_cast(x[i * stride_0])); + } else { + printf("?, "); + } +} + template -__global__ void PrintFloatTensor1D(float_t *__restrict__ x, +__global__ void PrintTensor1D(float_t *__restrict__ x, const size_t stride_0, const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) { PrintCommon(x, name_ptr, print_ptr); @@ -33,20 +44,7 @@ __global__ void PrintFloatTensor1D(float_t *__restrict__ x, printf("shape=(%d), stride=(%d)", (int) n, (int) stride_0); } for (size_t i = 0; i < n; ++i) { - printf("%.4f, ", float(x[i * stride_0])); - } - printf("\n"); -} - -template -__global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t 
-                                 const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr);
-  if (print_shape) {
-    printf("shape=(%d), stride=(%d)", (int) n, (int) stride_0);
-  }
-  for (size_t i = 0; i < n; ++i) {
-    printf("%lld, ", int64_t(x[i * stride_0]));
+    PrintElem(x[i * stride_0]);
   }
   printf("\n");
 }

From 5d6773972100712bcf934ebfbf9a4a3988a1919b Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:16:33 +0800
Subject: [PATCH 35/57] more

---
 csrc/debug_print.cu | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index ec79a39..df78cf6 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -36,15 +36,18 @@ __device__ void PrintElem(scalar_t value) {
 }
 
 template <typename float_t>
-__global__ void PrintTensor1D(float_t *__restrict__ x,
-                              const size_t stride_0, const size_t n,
-                              const char* name_ptr, const bool print_ptr, const bool print_shape) {
+__global__ void PrintTensor1D(
+    float_t *__restrict__ x,
+    const size_t shape_0,
+    const size_t stride_0,
+    const char* name_ptr, const bool print_ptr, const bool print_shape
+) {
   PrintCommon(x, name_ptr, print_ptr);
   if (print_shape) {
     printf("shape=(%d), stride=(%d)", (int) n, (int) stride_0);
   }
-  for (size_t i = 0; i < n; ++i) {
-    PrintElem(x[i * stride_0]);
+  for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
+    PrintElem(x[index_0 * stride_0]);
   }
   printf("\n");
 }
";" : ","); - } - printf("\n"); -} - -template -__global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0, - const size_t stride_1, const size_t stride_0, - const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) { +__global__ void PrintTensor2D( + float_t *__restrict__ x, + const size_t shape_0, const size_t shape_1, + const size_t stride_0, const size_t stride_1, + const char* name_ptr, const bool print_ptr, const bool print_shape +) { PrintCommon(x, name_ptr, print_ptr); if (print_shape) { printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, (int) shape_1, (int) stride_0, (int) stride_1); } - for (size_t i = 0; i < n; ++i) { - printf("%lld%c ", - int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]), - (i % shape_0 == 0) ? ";" : ","); + for (size_t index_0 = 0; index_0 < shape_0; ++index_0) { + for (size_t index_1 = 0; index_1 < shape_1; ++index_1) { + PrintElem(x[index_0 * stride_0 + index_1 * stride_1]); + } + printf("; "); } printf("\n"); } template -__global__ void PrintFloatTensor3D(float_t *__restrict__ x, - const size_t shape_1, const size_t shape_0, - const size_t stride_2, const size_t stride_1, - const size_t stride_0, const size_t n, - const char* name_ptr, const bool print_ptr, const bool print_shape) { - PrintCommon(x, name_ptr, print_ptr); - if (print_shape) { - printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", - (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2); - } - for (size_t i = 0; i < n; ++i) { - printf("%.4f%c ", float(x[(i / shape_0 / shape_1) * stride_2 + - ((i / shape_0) % shape_1) * stride_1 + - (i % shape_0) * stride_0]), - ((i % (shape_0 * shape_1) == 0) || (i % shape_0 == 0)) ? ";" : ","); - } - printf("\n"); -} - -template -__global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1, - const size_t shape_0, const size_t stride_2, - const size_t stride_1, const size_t stride_0, - const size_t n, const char* name_ptr, - const bool print_ptr, const bool print_shape) { +__global__ void PrintTensor3D( + float_t *__restrict__ x, + const size_t shape_0, const size_t shape_1, const size_t shape_2, + const size_t stride_0, const size_t stride_1, const size_t stride_2, + const char* name_ptr, const bool print_ptr, const bool print_shape +) { PrintCommon(x, name_ptr, print_ptr); if (print_shape) { - printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", - (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2); + printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2); } - for (size_t i = 0; i < n; ++i) { - printf("%lld%c ", int64_t(x[(i / shape_0 / shape_1) * stride_2 + - ((i / shape_0) % shape_1) * stride_1 + - (i % shape_0) * stride_0]), - ((i % (shape_0 * shape_1) == 0) || (i % shape_0 == 0)) ? 
";" : ","); + for (size_t index_0 = 0; index_0 < shape_0; ++index_0) { + for (size_t index_1 = 0; index_1 < shape_1; ++index_1) { + for (size_t index_2 = 0; index_2 < shape_2; ++index_2) { + PrintElem(x[index_0 * stride_0 + index_1 * stride_1]); + } + printf("; "); + } + printf("; "); } printf("\n"); } From bacaf8c353025b9382f4b79562dfb81ac8547281 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:18:24 +0800 Subject: [PATCH 37/57] more --- csrc/debug_print.cu | 84 ++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 59 deletions(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index eac027b..e4d3a22 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -101,67 +101,33 @@ void PrintTensor(torch::Tensor x, std::optional name_buffer, bool const char* name_ptr = name_buffer.has_value() ? reinterpret_cast(name_buffer->data_ptr()) : nullptr; - if (x.is_floating_point()) { - if (x.dim() == 1) { - AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( - x.scalar_type(), "PrintFloatTensor1D", ([&] { - PrintFloatTensor1D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape); - })); - } else if (x.dim() == 2) { - AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( - x.scalar_type(), "PrintFloatTensor2D", ([&] { - PrintFloatTensor2D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), x.stride(0), x.stride(1), - x.numel(), name_ptr, print_ptr, print_shape); - })); - } else if (x.dim() == 3) { - AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( - x.scalar_type(), "PrintFloatTensor3D", ([&] { - PrintFloatTensor3D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), x.size(2), x.stride(0), - x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); - })); - } else { - // NOTE(Zihao): I'm just too lazy to do this, codegen for higher - // dimensions should be a better idea - TORCH_CHECK(false, "Input dimension not supported."); - } - cudaError_t status = cudaGetLastError(); - TORCH_CHECK(status == cudaSuccess, - "PrintFloatTensor failed with error " + - std::string(cudaGetErrorString(status))); + if (x.dim() == 1) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintFloatTensor1D", ([&] { + PrintFloatTensor1D<<<1, 1, 0, stream>>>( + x.data_ptr(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape); + })); + } else if (x.dim() == 2) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintFloatTensor2D", ([&] { + PrintFloatTensor2D<<<1, 1, 0, stream>>>( + x.data_ptr(), x.size(0), x.size(1), x.stride(0), x.stride(1), + x.numel(), name_ptr, print_ptr, print_shape); + })); + } else if (x.dim() == 3) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintFloatTensor3D", ([&] { + PrintFloatTensor3D<<<1, 1, 0, stream>>>( + x.data_ptr(), x.size(0), x.size(1), x.size(2), x.stride(0), + x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); + })); } else { - if (x.dim() == 1) { - AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor1D", ([&] { - PrintIntTensor1D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.stride(0), - x.numel(), name_ptr, print_ptr, print_shape); - })); - } else if (x.dim() == 2) { - AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] { - PrintIntTensor2D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), - x.stride(0), x.stride(1), x.numel(), - name_ptr, print_ptr, print_shape); - })); - } else if (x.dim() == 3) { - AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), 
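The unified PrintTensor kernels above address every element through explicit (shape, stride) arithmetic instead of assuming a contiguous layout, which is what lets sliced views such as x[..., 0] print their true contents. For reference, the same traversal written as a host-side Python sketch (strided_2d_items is an illustrative helper, not part of this repo):

    # Mirror of PrintTensor2D's nested loops: element (i0, i1) lives at
    # flat offset i0 * stride[0] + i1 * stride[1].
    def strided_2d_items(flat, shape, stride):
        return [
            [flat[i0 * stride[0] + i1 * stride[1]] for i1 in range(shape[1])]
            for i0 in range(shape[0])
        ]

    # A transposed view of a row-major 2x3 buffer: shape=(3, 2), stride=(1, 3).
    assert strided_2d_items(list(range(6)), (3, 2), (1, 3)) == [[0, 3], [1, 4], [2, 5]]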
"PrintIntTensor3D", ([&] { - PrintIntTensor3D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), - x.size(2), x.stride(0), x.stride(1), - x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); - })); - } else { - // NOTE(Zihao): I'm just too lazy to do this, codegen for higher - // dimensions should be a better idea - TORCH_CHECK(false, "Input dimension not supported."); - } - cudaError_t status = cudaGetLastError(); - TORCH_CHECK(status == cudaSuccess, - "PrintIntTensor failed with error " + - std::string(cudaGetErrorString(status))); + // NOTE(Zihao): I'm just too lazy to do this, codegen for higher + // dimensions should be a better idea + TORCH_CHECK(false, "Input dimension not supported."); } + cudaError_t status = cudaGetLastError(); + TORCH_CHECK(status == cudaSuccess, "PrintTensor failed with error " + std::string(cudaGetErrorString(status))); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { From 9a4b5ffaae55116c731a5b2fe0bbae1af0ab4107 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:18:47 +0800 Subject: [PATCH 38/57] Revert "more" This reverts commit bacaf8c353025b9382f4b79562dfb81ac8547281. --- csrc/debug_print.cu | 84 +++++++++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index e4d3a22..eac027b 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -101,33 +101,67 @@ void PrintTensor(torch::Tensor x, std::optional name_buffer, bool const char* name_ptr = name_buffer.has_value() ? reinterpret_cast(name_buffer->data_ptr()) : nullptr; - if (x.dim() == 1) { - AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( - x.scalar_type(), "PrintFloatTensor1D", ([&] { - PrintFloatTensor1D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape); - })); - } else if (x.dim() == 2) { - AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( - x.scalar_type(), "PrintFloatTensor2D", ([&] { - PrintFloatTensor2D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), x.stride(0), x.stride(1), - x.numel(), name_ptr, print_ptr, print_shape); - })); - } else if (x.dim() == 3) { - AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( - x.scalar_type(), "PrintFloatTensor3D", ([&] { - PrintFloatTensor3D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), x.size(2), x.stride(0), - x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); - })); + if (x.is_floating_point()) { + if (x.dim() == 1) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintFloatTensor1D", ([&] { + PrintFloatTensor1D<<<1, 1, 0, stream>>>( + x.data_ptr(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape); + })); + } else if (x.dim() == 2) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintFloatTensor2D", ([&] { + PrintFloatTensor2D<<<1, 1, 0, stream>>>( + x.data_ptr(), x.size(0), x.size(1), x.stride(0), x.stride(1), + x.numel(), name_ptr, print_ptr, print_shape); + })); + } else if (x.dim() == 3) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintFloatTensor3D", ([&] { + PrintFloatTensor3D<<<1, 1, 0, stream>>>( + x.data_ptr(), x.size(0), x.size(1), x.size(2), x.stride(0), + x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); + })); + } else { + // NOTE(Zihao): I'm just too lazy to do this, codegen for higher + // dimensions should be a better idea + TORCH_CHECK(false, "Input dimension not supported."); + } + cudaError_t status = cudaGetLastError(); + 
+    TORCH_CHECK(status == cudaSuccess,
+                "PrintFloatTensor failed with error " +
+                    std::string(cudaGetErrorString(status)));
   } else {
-    // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
-    // dimensions should be a better idea
-    TORCH_CHECK(false, "Input dimension not supported.");
+    if (x.dim() == 1) {
+      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor1D", ([&] {
+                                   PrintIntTensor1D<<<1, 1, 0, stream>>>(
+                                       x.data_ptr<scalar_t>(), x.stride(0),
+                                       x.numel(), name_ptr, print_ptr, print_shape);
+                                 }));
+    } else if (x.dim() == 2) {
+      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] {
+                                   PrintIntTensor2D<<<1, 1, 0, stream>>>(
+                                       x.data_ptr<scalar_t>(), x.size(0), x.size(1),
+                                       x.stride(0), x.stride(1), x.numel(),
+                                       name_ptr, print_ptr, print_shape);
+                                 }));
+    } else if (x.dim() == 3) {
+      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor3D", ([&] {
+                                   PrintIntTensor3D<<<1, 1, 0, stream>>>(
+                                       x.data_ptr<scalar_t>(), x.size(0), x.size(1),
+                                       x.size(2), x.stride(0), x.stride(1),
+                                       x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
+                                 }));
+    } else {
+      // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
+      // dimensions should be a better idea
+      TORCH_CHECK(false, "Input dimension not supported.");
+    }
+    cudaError_t status = cudaGetLastError();
+    TORCH_CHECK(status == cudaSuccess,
+                "PrintIntTensor failed with error " +
+                    std::string(cudaGetErrorString(status)));
   }
-  cudaError_t status = cudaGetLastError();
-  TORCH_CHECK(status == cudaSuccess, "PrintTensor failed with error " + std::string(cudaGetErrorString(status)));
 }
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

From 9f1e6e4f5b3ecdb92ad43cdc4a974fff23d3141c Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:19:09 +0800
Subject: [PATCH 39/57] more

---
 csrc/debug_print.cu | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index eac027b..86c775e 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -104,21 +104,21 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, bool
   if (x.is_floating_point()) {
     if (x.dim() == 1) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
-          x.scalar_type(), "PrintFloatTensor1D", ([&] {
-            PrintFloatTensor1D<<<1, 1, 0, stream>>>(
+          x.scalar_type(), "PrintTensor1D", ([&] {
+            PrintTensor1D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape);
           }));
     } else if (x.dim() == 2) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
-          x.scalar_type(), "PrintFloatTensor2D", ([&] {
-            PrintFloatTensor2D<<<1, 1, 0, stream>>>(
+          x.scalar_type(), "PrintTensor2D", ([&] {
+            PrintTensor2D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.size(0), x.size(1), x.stride(0), x.stride(1),
                 x.numel(), name_ptr, print_ptr, print_shape);
           }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
-          x.scalar_type(), "PrintFloatTensor3D", ([&] {
-            PrintFloatTensor3D<<<1, 1, 0, stream>>>(
+          x.scalar_type(), "PrintTensor3D", ([&] {
+            PrintTensor3D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.size(0), x.size(1), x.size(2), x.stride(0),
                 x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
           }));
@@ -129,25 +129,25 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, bool
     }
     cudaError_t status = cudaGetLastError();
     TORCH_CHECK(status == cudaSuccess,
-                "PrintFloatTensor failed with error " +
+                "PrintTensor failed with error " +
                     std::string(cudaGetErrorString(status)));
   } else {
     if (x.dim() == 1) {
-      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor1D", ([&] {
"PrintIntTensor1D", ([&] { - PrintIntTensor1D<<<1, 1, 0, stream>>>( + AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor1D", ([&] { + PrintTensor1D<<<1, 1, 0, stream>>>( x.data_ptr(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape); })); } else if (x.dim() == 2) { - AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] { - PrintIntTensor2D<<<1, 1, 0, stream>>>( + AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor2D", ([&] { + PrintTensor2D<<<1, 1, 0, stream>>>( x.data_ptr(), x.size(0), x.size(1), x.stride(0), x.stride(1), x.numel(), name_ptr, print_ptr, print_shape); })); } else if (x.dim() == 3) { - AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor3D", ([&] { - PrintIntTensor3D<<<1, 1, 0, stream>>>( + AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor3D", ([&] { + PrintTensor3D<<<1, 1, 0, stream>>>( x.data_ptr(), x.size(0), x.size(1), x.size(2), x.stride(0), x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); @@ -159,7 +159,7 @@ void PrintTensor(torch::Tensor x, std::optional name_buffer, bool } cudaError_t status = cudaGetLastError(); TORCH_CHECK(status == cudaSuccess, - "PrintIntTensor failed with error " + + "PrintTensor failed with error " + std::string(cudaGetErrorString(status))); } } From 821eb26d17b5be4ea55aa3ee713dc4a5febab20c Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:20:35 +0800 Subject: [PATCH 40/57] more --- csrc/debug_print.cu | 49 +++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 86c775e..3ceab93 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -106,21 +106,28 @@ void PrintTensor(torch::Tensor x, std::optional name_buffer, bool AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( x.scalar_type(), "PrintTensor1D", ([&] { PrintTensor1D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape); + x.data_ptr(), + x.size(0), x.stride(0), + name_ptr, print_ptr, print_shape + ); })); } else if (x.dim() == 2) { AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( x.scalar_type(), "PrintTensor2D", ([&] { PrintTensor2D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), x.stride(0), x.stride(1), - x.numel(), name_ptr, print_ptr, print_shape); + x.data_ptr(), + x.size(0), x.size(1), x.stride(0), x.stride(1), + name_ptr, print_ptr, print_shape + ); })); } else if (x.dim() == 3) { AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( x.scalar_type(), "PrintTensor3D", ([&] { PrintTensor3D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), x.size(2), x.stride(0), - x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); + x.data_ptr(), + x.size(0), x.size(1), x.size(2), x.stride(0), x.stride(1), x.stride(2), + name_ptr, print_ptr, print_shape + ); })); } else { // NOTE(Zihao): I'm just too lazy to do this, codegen for higher @@ -134,24 +141,28 @@ void PrintTensor(torch::Tensor x, std::optional name_buffer, bool } else { if (x.dim() == 1) { AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor1D", ([&] { - PrintTensor1D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.stride(0), - x.numel(), name_ptr, print_ptr, print_shape); - })); + PrintTensor1D<<<1, 1, 0, stream>>>( + x.data_ptr(), + x.size(0), x.stride(0), + name_ptr, print_ptr, print_shape + ); + })); } else if (x.dim() == 2) { AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor2D", ([&] { - PrintTensor2D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), - 
-                                       x.stride(0), x.stride(1), x.numel(),
-                                       name_ptr, print_ptr, print_shape);
-                                 }));
+        PrintTensor2D<<<1, 1, 0, stream>>>(
+            x.data_ptr<scalar_t>(),
+            x.size(0), x.size(1), x.stride(0), x.stride(1),
+            name_ptr, print_ptr, print_shape
+        );
+      }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor3D", ([&] {
-                                   PrintTensor3D<<<1, 1, 0, stream>>>(
-                                       x.data_ptr<scalar_t>(), x.size(0), x.size(1),
-                                       x.size(2), x.stride(0), x.stride(1),
-                                       x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
-                                 }));
+        PrintTensor3D<<<1, 1, 0, stream>>>(
+            x.data_ptr<scalar_t>(),
+            x.size(0), x.size(1), x.size(2), x.stride(0), x.stride(1), x.stride(2),
+            name_ptr, print_ptr, print_shape
+        );
+      }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
       // dimensions should be a better idea

From 8d619e621deb5f474db59e07962ae7d32f7e9fe7 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:21:21 +0800
Subject: [PATCH 41/57] more

---
 csrc/debug_print.cu | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 3ceab93..0363f20 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -46,9 +46,11 @@ __global__ void PrintTensor1D(
   if (print_shape) {
     printf("shape=(%d), stride=(%d)", (int) shape_0, (int) stride_0);
   }
+  printf("[");
   for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
     PrintElem(x[index_0 * stride_0]);
   }
+  printf("]");
   printf("\n");
 }
 
@@ -63,12 +65,15 @@ __global__ void PrintTensor2D(
   if (print_shape) {
     printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, (int) shape_1, (int) stride_0, (int) stride_1);
   }
+  printf("[");
   for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
+    printf("[");
     for (size_t index_1 = 0; index_1 < shape_1; ++index_1) {
       PrintElem(x[index_0 * stride_0 + index_1 * stride_1]);
     }
-    printf("; ");
+    printf("]");
   }
+  printf("]");
   printf("\n");
 }
 
@@ -83,15 +88,19 @@ __global__ void PrintTensor3D(
   if (print_shape) {
     printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2);
   }
+  printf("[");
   for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
+    printf("[");
     for (size_t index_1 = 0; index_1 < shape_1; ++index_1) {
+      printf("[");
      for (size_t index_2 = 0; index_2 < shape_2; ++index_2) {
        PrintElem(x[index_0 * stride_0 + index_1 * stride_1]);
      }
-      printf("; ");
+      printf("]");
    }
-    printf("; ");
+    printf("]");
   }
+  printf("]");
   printf("\n");
 }
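With the brackets added above, device-side output nests the same way Python lists nest (modulo trailing separators), so a kernel print can be eyeballed against a host-side reference. An illustrative cross-check only:

    import torch

    t = torch.tensor([[1, 2, 3], [3, 4, 5]], dtype=torch.int32)
    # tolist() produces the nesting that the kernel's brackets encode
    print(t.tolist())  # [[1, 2, 3], [3, 4, 5]]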
name="for 3D") + print("start warmup...") s = torch.cuda.Stream() s.wait_stream(torch.cuda.current_stream()) @@ -22,7 +33,6 @@ z1 = z @ y z2 = z1 @ y - print("start graph capture...") g = torch.cuda.CUDAGraph() with torch.cuda.graph(g, stream=s): From 4fcee3d3b7a7fa0c15f6d9a0aeec88d166cebd9b Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:23:25 +0800 Subject: [PATCH 43/57] more --- csrc/debug_print.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 0363f20..51ba3bd 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -27,9 +27,9 @@ __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr, template __device__ void PrintElem(scalar_t value) { if constexpr (std::is_floating_point::value) { - printf("%.4f, ", float(x[i * stride_0])); + printf("%.4f, ", float(value)); } else if constexpr (std::is_integral::value) { - printf("%lld, ", static_cast(x[i * stride_0])); + printf("%lld, ", static_cast(value)); } else { printf("?, "); } From cf18d097434b5bfb951d2e133e118bee3bac7f25 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:24:03 +0800 Subject: [PATCH 44/57] more --- csrc/debug_print.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 51ba3bd..5cdcaa4 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -71,7 +71,7 @@ __global__ void PrintTensor2D( for (size_t index_1 = 0; index_1 < shape_1; ++index_1) { PrintElem(x[index_0 * stride_0 + index_1 * stride_1]); } - printf("]"); + printf("], "); } printf("]"); printf("\n"); @@ -96,9 +96,9 @@ __global__ void PrintTensor3D( for (size_t index_2 = 0; index_2 < shape_2; ++index_2) { PrintElem(x[index_0 * stride_0 + index_1 * stride_1]); } - printf("]"); + printf("], "); } - printf("]"); + printf("], "); } printf("]"); printf("\n"); From 8a521360e30f21e11c3f20ef1a13f7cb9bdd486b Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:24:15 +0800 Subject: [PATCH 45/57] more --- csrc/debug_print.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 5cdcaa4..220e67c 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -15,7 +15,7 @@ TYPE, NAME, \ AT_DISPATCH_CASE_FLOATING_AND_REDUCED_FLOATING_TYPES(__VA_ARGS__)) -__device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr, const bool print_shape) { +__device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr) { if (name_ptr != nullptr) { printf("name: %s\n", name_ptr); } From 768d952c452c30222cfb54054cf701d9857872f0 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:26:38 +0800 Subject: [PATCH 46/57] more --- debug_print/__init__.py | 1 + example.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/debug_print/__init__.py b/debug_print/__init__.py index b38ad3b..14f369a 100644 --- a/debug_print/__init__.py +++ b/debug_print/__init__.py @@ -41,6 +41,7 @@ def post_initialize(self): self._pending_copy_tasks.clear() def __call__(self, x: torch.Tensor, name: str, print_ptr: bool, print_shape: bool): + assert x.is_cuda, f"{x.device} must be on cuda" name_buffer_gpu = self._compute_name_buffer_gpu(name=name, device_index=x.device.index) _print_tensor_kernel(x, name_buffer_gpu, print_ptr, print_shape) diff --git a/example.py b/example.py index a74c067..0a311f5 100644 --- a/example.py +++ b/example.py @@ -12,15 +12,16 @@ 
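Because the print is an ordinary stream-ordered kernel launch, it can be captured into a CUDA graph and fires again on every replay; staging names in a long-lived device buffer (rather than passing a host string) is what keeps the pointer baked into the capture valid. A condensed usage sketch modeled on example.py, assuming debug_print has already been initialized:

    import torch
    import debug_print

    x = torch.randn(16, 16, device="cuda:0")

    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):  # warmup, as example.py does before capture
        y = x @ x

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g, stream=s):
        z = x @ x
        debug_print.print_tensor(z, name="z inside graph")

    g.replay()  # the captured print runs on each replay
    torch.cuda.synchronize()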
From 768d952c452c30222cfb54054cf701d9857872f0 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:26:38 +0800
Subject: [PATCH 46/57] more

---
 debug_print/__init__.py |  1 +
 example.py              | 15 ++++++++-------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index b38ad3b..14f369a 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -41,6 +41,7 @@ def post_initialize(self):
         self._pending_copy_tasks.clear()
 
     def __call__(self, x: torch.Tensor, name: str, print_ptr: bool, print_shape: bool):
+        assert x.is_cuda, f"{x.device} must be on cuda"
         name_buffer_gpu = self._compute_name_buffer_gpu(name=name, device_index=x.device.index)
         _print_tensor_kernel(x, name_buffer_gpu, print_ptr, print_shape)

diff --git a/example.py b/example.py
index a74c067..0a311f5 100644
--- a/example.py
+++ b/example.py
@@ -12,15 +12,16 @@
 debug_print.print_tensor(x[0:1, 1:3, 0:4])
 
 print("demo for all types...")
-debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32), name="for int32")
-debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int64), name="for int64")
-debug_print.print_tensor(torch.tensor([1.5, 2.5, 3.5], dtype=torch.float), name="for float")
+debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32, device="cuda:0"), name="for int32")
+debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int64, device="cuda:0"), name="for int64")
+debug_print.print_tensor(torch.tensor([1.5, 2.5, 3.5], dtype=torch.float, device="cuda:0"), name="for float")
 
 print("demo for all dims...")
-debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32), name="for 1D")
-debug_print.print_tensor(torch.tensor([[1, 2, 3], [3, 4, 5]], dtype=torch.int32), name="for 2D")
-debug_print.print_tensor(torch.tensor([[[1, 2, 3], [3, 4, 5]], [[10, 20, 30], [30, 40, 50]]], dtype=torch.int32),
-                         name="for 3D")
+debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32, device="cuda:0"), name="for 1D")
+debug_print.print_tensor(torch.tensor([[1, 2, 3], [3, 4, 5]], dtype=torch.int32, device="cuda:0"), name="for 2D")
+debug_print.print_tensor(
+    torch.tensor([[[1, 2, 3], [3, 4, 5]], [[10, 20, 30], [30, 40, 50]]], dtype=torch.int32, device="cuda:0"),
+    name="for 3D")
 
 print("start warmup...")
 s = torch.cuda.Stream()

From 567f62d9bbd1a0843137681973c6bb16ee22b11f Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:27:42 +0800
Subject: [PATCH 47/57] more

---
 csrc/debug_print.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 220e67c..b337aa8 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -31,7 +31,7 @@ __device__ void PrintElem(scalar_t value) {
   } else if constexpr (std::is_integral<scalar_t>::value) {
     printf("%lld, ", static_cast<int64_t>(value));
   } else {
-    printf("?, ");
+    static_assert(false, "unsupported scalar_t type");
   }
 }

From 1610757e554e88cded33c82e1db4b41c5a25c964 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:28:49 +0800
Subject: [PATCH 48/57] more

---
 csrc/debug_print.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index b337aa8..5befa41 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -24,6 +24,9 @@ __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr)
   }
 }
 
+template <typename T>
+struct always_false : std::false_type {};
+
 template <typename scalar_t>
 __device__ void PrintElem(scalar_t value) {
   if constexpr (std::is_floating_point<scalar_t>::value) {
@@ -31,7 +34,7 @@ __device__ void PrintElem(scalar_t value) {
   } else if constexpr (std::is_integral<scalar_t>::value) {
     printf("%lld, ", static_cast<int64_t>(value));
   } else {
-    static_assert(false, "unsupported scalar_t type");
+    static_assert(always_false<scalar_t>::value, "PrintElem: unsupported scalar_t type");
   }
 }

From c79d34698711fb1d15232ffd79f140596afb5951 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:30:24 +0800
Subject: [PATCH 49/57] more

---
 csrc/debug_print.cu | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 5befa41..cc66caa 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -24,12 +24,21 @@ __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr)
   }
 }
 
+template <typename T>
+struct is_my_floating_point : std::is_floating_point<T> {};
+
+template <>
+struct is_my_floating_point<c10::Half> : std::true_type {};
+
+template <>
+struct is_my_floating_point<c10::BFloat16> : std::true_type {};
+
 template <typename T>
 struct always_false : std::false_type {};
 
 template <typename scalar_t>
 __device__ void PrintElem(scalar_t value) {
-  if constexpr (std::is_floating_point<scalar_t>::value) {
+  if constexpr (is_my_floating_point<scalar_t>::value) {
     printf("%.4f, ", float(value));
   } else if constexpr (std::is_integral<scalar_t>::value) {
     printf("%lld, ", static_cast<int64_t>(value));

From 2288462651555d2111016142cb7e4622e6484b82 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:32:34 +0800
Subject: [PATCH 50/57] more

---
 csrc/debug_print.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index cc66caa..d34972f 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -106,7 +106,7 @@ __global__ void PrintTensor3D(
     for (size_t index_1 = 0; index_1 < shape_1; ++index_1) {
       printf("[");
       for (size_t index_2 = 0; index_2 < shape_2; ++index_2) {
-        PrintElem(x[index_0 * stride_0 + index_1 * stride_1]);
+        PrintElem(x[index_0 * stride_0 + index_1 * stride_1 + index_2 * stride_2]);
       }
       printf("], ");

From e7c6395f2ce13b9ac4359849171de0c8c962e6bc Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:33:53 +0800
Subject: [PATCH 51/57] more

---
 csrc/debug_print.cu | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index d34972f..27c1b0d 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -17,10 +17,10 @@
 __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr) {
   if (name_ptr != nullptr) {
-    printf("name: %s\n", name_ptr);
+    printf("name=%s, ", name_ptr);
   }
   if (print_ptr) {
-    printf("addr: %lld\n", x);
+    printf("addr=%lld, ", x);
   }
 }
@@ -58,12 +58,11 @@ __global__ void PrintTensor1D(
   if (print_shape) {
     printf("shape=(%d), stride=(%d)", (int) shape_0, (int) stride_0);
   }
-  printf("[");
+  printf("\n[");
   for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
     PrintElem(x[index_0 * stride_0]);
   }
-  printf("]");
-  printf("\n");
+  printf("]\n");
 }
@@ -76,8 +75,7 @@ __global__ void PrintTensor2D(
   if (print_shape) {
     printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, (int) shape_1, (int) stride_0, (int) stride_1);
   }
-  printf("[");
+  printf("\n[");
   for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
     printf("[");
     for (size_t index_1 = 0; index_1 < shape_1; ++index_1) {
       PrintElem(x[index_0 * stride_0 + index_1 * stride_1]);
     }
     printf("], ");
   }
-  printf("]");
-  printf("\n");
+  printf("]\n");
 }
@@ -98,8 +96,7 @@ __global__ void PrintTensor3D(
   if (print_shape) {
     printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2);
   }
-  printf("[");
+  printf("\n[");
   for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
     printf("[");
     for (size_t index_1 = 0; index_1 < shape_1; ++index_1) {
       printf("[");
       for (size_t index_2 = 0; index_2 < shape_2; ++index_2) {
         PrintElem(x[index_0 * stride_0 + index_1 * stride_1 + index_2 * stride_2]);
       }
       printf("], ");
     }
     printf("], ");
   }
-  printf("]");
-  printf("\n");
+  printf("]\n");
 }

From 7e6e9e62fba068ca867278af1e2b7aaca0f1c396 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:34:41 +0800
Subject: [PATCH 52/57] more

---
 example.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/example.py b/example.py
index 0a311f5..b0a7d3c 100644
--- a/example.py
+++ b/example.py
@@ -12,16 +12,16 @@
 debug_print.print_tensor(x[0:1, 1:3, 0:4])
 
 print("demo for all types...")
-debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32, device="cuda:0"), name="for int32")
-debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int64, device="cuda:0"), name="for int64")
-debug_print.print_tensor(torch.tensor([1.5, 2.5, 3.5], dtype=torch.float, device="cuda:0"), name="for float")
+debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32, device="cuda:0"), name="for int32", print_shape=True, print_ptr=True)
+debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int64, device="cuda:0"), name="for int64", print_shape=True, print_ptr=True)
+debug_print.print_tensor(torch.tensor([1.5, 2.5, 3.5], dtype=torch.float, device="cuda:0"), name="for float", print_shape=True, print_ptr=True)
 
 print("demo for all dims...")
-debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32, device="cuda:0"), name="for 1D")
-debug_print.print_tensor(torch.tensor([[1, 2, 3], [3, 4, 5]], dtype=torch.int32, device="cuda:0"), name="for 2D")
+debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32, device="cuda:0"), name="for 1D", print_shape=True, print_ptr=True)
+debug_print.print_tensor(torch.tensor([[1, 2, 3], [3, 4, 5]], dtype=torch.int32, device="cuda:0"), name="for 2D", print_shape=True, print_ptr=True)
 debug_print.print_tensor(
     torch.tensor([[[1, 2, 3], [3, 4, 5]], [[10, 20, 30], [30, 40, 50]]], dtype=torch.int32, device="cuda:0"),
-    name="for 3D")
+    name="for 3D", print_shape=True, print_ptr=True)
@@ -42,7 +42,7 @@
     z = x @ y
     debug_print.print_tensor(z)
     z1 = z @ y
-    debug_print.print_tensor(z1[..., 0], name="This is name for part of z1")
+    debug_print.print_tensor(z1[..., 0], name="This is name for part of z1", print_shape=True, print_ptr=True)
     z2 = z1 @ y
     debug_print.print_tensor(z2, name="This is name for z2")

From f33a38c3a2baa2151e45872b090ad0addfcd9dce Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 18:29:08 +0800
Subject: [PATCH 53/57] more

---
 debug_print/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 14f369a..fc46ca4 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -31,7 +31,8 @@ def __init__(self):
         # Can be optimized
         self._buffers: Dict[int, _Buffer] = {
             device_index: _Buffer(device_index=device_index)
-            for device_index in range(torch.cuda.device_count())
+            # for device_index in range(torch.cuda.device_count())
+            for device_index in [torch.cuda.current_device()]
         }
         self._pending_copy_tasks: List[_CopyTask] = []
 
@@ -75,5 +76,5 @@ def post_initialize():
     _printer.post_initialize()
 
 
-def print_tensor(x: torch.Tensor, name: str = "", print_ptr: bool = False, print_shape: bool = False):
+def print_tensor(x: torch.Tensor, name: str = "", print_ptr: bool = True, print_shape: bool = True):
     _printer(x=x, name=name, print_ptr=print_ptr, print_shape=print_shape)
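With the defaults flipped to True, pointer and shape/stride metadata become opt-out rather than opt-in. Illustrative call sites against the new signature:

    debug_print.print_tensor(x)  # prints addr= and shape=/stride= by default
    debug_print.print_tensor(x, name="x", print_ptr=False, print_shape=False)  # values only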
initialized") + return _printer = _DebugPrinter() From 29bf99284f1d6f78e8edbd70ef93b3535b7093e4 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 19:37:55 +0800 Subject: [PATCH 55/57] more --- debug_print/__init__.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/debug_print/__init__.py b/debug_print/__init__.py index c4db2fa..b2865ac 100644 --- a/debug_print/__init__.py +++ b/debug_print/__init__.py @@ -27,13 +27,12 @@ def execute(self): class _DebugPrinter: - def __init__(self): + def __init__(self, device_id: Optional[int]): + if device_id is None: + device_id = torch.cuda.current_device() + # Can be optimized - self._buffers: Dict[int, _Buffer] = { - device_index: _Buffer(device_index=device_index) - # for device_index in range(torch.cuda.device_count()) - for device_index in [torch.cuda.current_device()] - } + self._buffers: Dict[int, _Buffer] = {device_id: _Buffer(device_index=device_id)} self._pending_copy_tasks: List[_CopyTask] = [] def post_initialize(self): @@ -66,12 +65,12 @@ def _compute_name_buffer_gpu(self, name: str, device_index: int): _printer: Optional[_DebugPrinter] = None -def initialize(): +def initialize(device_id: int): global _printer if _printer is not None: print("debug_print.initialize skip since already initialized") return - _printer = _DebugPrinter() + _printer = _DebugPrinter(device_id=device_id) def post_initialize(): From 7aca9d8a8ba96976449bd946798e67120cd45fe2 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 19:43:40 +0800 Subject: [PATCH 56/57] more --- csrc/debug_print.cu | 47 +++++++++++++++++++++++++++++++++++++++++++++ example.py | 8 ++++++++ 2 files changed, 55 insertions(+) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 27c1b0d..40ccbc5 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -113,6 +113,36 @@ __global__ void PrintTensor3D( printf("]\n"); } +template +__global__ void PrintTensor4D( + float_t *__restrict__ x, + const size_t shape_0, const size_t shape_1, const size_t shape_2, const size_t shape_3, + const size_t stride_0, const size_t stride_1, const size_t stride_2, const size_t stride_3, + const char* name_ptr, const bool print_ptr, const bool print_shape +) { + PrintCommon(x, name_ptr, print_ptr); + if (print_shape) { + printf("shape=(%d, %d, %d, %d), stride=(%d, %d, %d, %d)", (int) shape_0, (int) shape_1, (int) shape_2, (int) shape_3, (int) stride_0, (int) stride_1, (int) stride_2, (int) stride_3); + } + printf("\n["); + for (size_t index_0 = 0; index_0 < shape_0; ++index_0) { + printf("["); + for (size_t index_1 = 0; index_1 < shape_1; ++index_1) { + printf("["); + for (size_t index_2 = 0; index_2 < shape_2; ++index_2) { + printf("["); + for (size_t index_3 = 0; index_3 < shape_3; ++index_3) { + PrintElem(x[index_0 * stride_0 + index_1 * stride_1 + index_2 * stride_2 + index_3 * stride_3]); + } + printf("], "); + } + printf("], "); + } + printf("], "); + } + printf("]\n"); +} + void PrintTensor(torch::Tensor x, std::optional name_buffer, bool print_ptr, bool print_shape) { cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index()); TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor"); @@ -147,6 +177,15 @@ void PrintTensor(torch::Tensor x, std::optional name_buffer, bool name_ptr, print_ptr, print_shape ); })); + } else if (x.dim() == 4) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintTensor4D", ([&] { + PrintTensor4D<<<1, 1, 0, stream>>>( + x.data_ptr(), + x.size(0), x.size(1), x.size(2), 
+              x.size(0), x.size(1), x.size(2), x.size(3), x.stride(0), x.stride(1), x.stride(2), x.stride(3),
+              name_ptr, print_ptr, print_shape
+            );
+          }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
       // dimensions should be a better idea
@@ -181,6 +220,14 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, bool
           name_ptr, print_ptr, print_shape
         );
       }));
+    } else if (x.dim() == 4) {
+      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor4D", ([&] {
+        PrintTensor4D<<<1, 1, 0, stream>>>(
+          x.data_ptr<scalar_t>(),
+          x.size(0), x.size(1), x.size(2), x.size(3), x.stride(0), x.stride(1), x.stride(2), x.stride(3),
+          name_ptr, print_ptr, print_shape
+        );
+      }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
       // dimensions should be a better idea

diff --git a/example.py b/example.py
index b0a7d3c..b24ba69 100644
--- a/example.py
+++ b/example.py
@@ -22,6 +22,14 @@
 debug_print.print_tensor(
     torch.tensor([[[1, 2, 3], [3, 4, 5]], [[10, 20, 30], [30, 40, 50]]], dtype=torch.int32, device="cuda:0"),
     name="for 3D", print_shape=True, print_ptr=True)
+debug_print.print_tensor(
+    torch.tensor(
+        [
+            [[[1, 2, 3], [3, 4, 5]], [[10, 20, 30], [30, 40, 50]]],
+            [[[-1, -2, -3], [-3, -4, -5]], [[-10, -20, -30], [-30, -40, -50]]],
+        ],
+        dtype=torch.int32, device="cuda:0"),
+    name="for 4D", print_shape=True, print_ptr=True)
 
 print("start warmup...")
 s = torch.cuda.Stream()

From 797597a3b79153057ce6fa2dfccf10128f2b31ad Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 19:45:14 +0800
Subject: [PATCH 57/57] more

---
 debug_print/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index b2865ac..d6b71ed 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -65,7 +65,7 @@
 _printer: Optional[_DebugPrinter] = None
 
 
-def initialize(device_id: int):
+def initialize(device_id: Optional[int] = None):
     global _printer
     if _printer is not None:
         print("debug_print.initialize skip since already initialized")
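The series thus ends with a module that can print 1D-4D float and integer CUDA tensors with optional names, and that initializes its staging buffer on a single chosen device. An end-to-end sketch against this final state:

    import torch
    import debug_print

    debug_print.initialize()  # device_id=None falls back to torch.cuda.current_device()

    x = torch.arange(16, dtype=torch.int32, device="cuda").reshape(2, 2, 2, 2)
    debug_print.print_tensor(x, name="for 4D")  # print_ptr/print_shape default to True
    torch.cuda.synchronize()  # flush device-side printf output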