From 0e87bc58a771c3a24fcc4f28353ddf3e19be6be0 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:30:12 +0800
Subject: [PATCH 01/57] more

---
 debug_print/__init__.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index addae96..4cbc802 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -1,6 +1,31 @@
+from typing import Dict
+
 import torch
 from ._kernels import print_tensor as print_tensor_kernel
 
 
+class _Buffer:
+    def __init__(self, device_index: int):
+        self._tensor = torch.zeros((10_000_000,), dtype=torch.char, device=f"cuda:{device_index}")
+        self._used_len = 0
+
+    def allocate(self, size: int):
+        output = self._tensor[self._used_len: self._used_len + size]
+        self._used_len += size
+        assert self._used_len <= len(self._tensor)
+        return output
+
+
+class _DebugPrinter:
+    def __init__(self):
+        # Can be optimized
+        self._buffers: Dict[int, _Buffer] = {
+            device_index: _Buffer(device_index=device_index)
+            for device_index in range(torch.cuda.device_count())
+        }
+
+    def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
+
+
 def print_tensor(x: torch.Tensor, print_ptr: bool = False):
     print_tensor_kernel(x, print_ptr)

From 741959ec067a696c4f7383dc150c04599474e1d5 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:31:21 +0800
Subject: [PATCH 02/57] more

---
 debug_print/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 4cbc802..2280ed8 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -25,6 +25,10 @@ def __init__(self):
         }
 
     def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
+        if len(name) > 0:
+            name_bytes = TODO
+            name_buffer = self._buffers[x.device].allocate(len(name_bytes) + 1)
+            name_buffer.copy_(TODO)
 
 
 def print_tensor(x: torch.Tensor, print_ptr: bool = False):

From 805b35e260245ae9228e24b1545fc392e975d195 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:32:14 +0800
Subject: [PATCH 03/57] more

---
 debug_print/__init__.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 2280ed8..671b259 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -1,4 +1,4 @@
-from typing import Dict
+from typing import Dict, Optional
 
 import torch
 from ._kernels import print_tensor as print_tensor_kernel
@@ -31,5 +31,14 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
             name_buffer.copy_(TODO)
 
 
-def print_tensor(x: torch.Tensor, print_ptr: bool = False):
-    print_tensor_kernel(x, print_ptr)
+_printer: Optional[_DebugPrinter] = None
+
+
+def initialize():
+    global _printer
+    assert _printer is None
+    _printer = _DebugPrinter()
+
+
+def print_tensor(x: torch.Tensor, name: str = "", print_ptr: bool = False):
+    _printer(x=x, name=name, print_ptr=print_ptr)

From 7b0f41ce26747c5c2eb3846227290f475ff2df35 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:32:37 +0800
Subject: [PATCH 04/57] more

---
 debug_print/__init__.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 671b259..c85a9c2 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -1,7 +1,7 @@
 from typing import Dict, Optional
 
 import torch
-from ._kernels import print_tensor as print_tensor_kernel
+from ._kernels import print_tensor as _print_tensor_kernel
 
 
 class _Buffer:
@@ -29,6 +29,9 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
             name_bytes = TODO
             name_buffer = self._buffers[x.device].allocate(len(name_bytes) + 1)
             name_buffer.copy_(TODO)
+        else:
+            name_buffer = None
+        _print_tensor_kernel(x, name_buffer, print_ptr)
 
 
 _printer: Optional[_DebugPrinter] = None

From e09b2969a606162dc1098ac775212dc47b75a636 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:34:21 +0800
Subject: [PATCH 05/57] more

---
 debug_print/__init__.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index c85a9c2..8832e2a 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -26,9 +26,10 @@ def __init__(self):
 
     def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
         if len(name) > 0:
-            name_bytes = TODO
-            name_buffer = self._buffers[x.device].allocate(len(name_bytes) + 1)
-            name_buffer.copy_(TODO)
+            name_bytes = name.encode("utf-8")
+            name_buffer = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
+            tmp = torch.empty(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
+            name_buffer.copy_(tmp.to(name_buffer.device).view(torch.char))
         else:
             name_buffer = None
         _print_tensor_kernel(x, name_buffer, print_ptr)

From a6c756d6748aea01010f64cd276fa260a5284c46 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:36:32 +0800
Subject: [PATCH 06/57] more

---
 csrc/debug_print.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 5527ba4..6a9c056 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -102,9 +102,12 @@ __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
   printf("\n");
 }
 
-void PrintTensor(torch::Tensor x, bool print_ptr) {
+void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name, bool print_ptr) {
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index());
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
+
+  const char* name_ptr = name.has_value() ? name->data_ptr() : nullptr;
+
   if (x.is_floating_point()) {
     if (x.dim() == 1) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(

From d0e7b5488f3f1138bfc48013c46d5654ddf4f110 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:37:08 +0800
Subject: [PATCH 07/57] more

---
 csrc/debug_print.cu | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 6a9c056..b600243 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -107,27 +107,28 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name, bool print
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
 
   const char* name_ptr = name.has_value() ? name->data_ptr() : nullptr;
+  const int name_len = name.has_value() ? name->numel() : 0;
 
   if (x.is_floating_point()) {
     if (x.dim() == 1) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor1D", ([&] {
             PrintFloatTensor1D<<<1, 1, 0, stream>>>(
-                x.data_ptr<scalar_t>(), x.stride(0), x.numel(), print_ptr);
+                x.data_ptr<scalar_t>(), x.stride(0), x.numel(), name_ptr, name_len, print_ptr);
           }));
     } else if (x.dim() == 2) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor2D", ([&] {
             PrintFloatTensor2D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.size(1), x.stride(0), x.stride(1),
-                x.numel(), print_ptr);
+                x.numel(), name_ptr, name_len, print_ptr);
           }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor3D", ([&] {
             PrintFloatTensor3D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.size(1), x.size(2), x.stride(0),
-                x.stride(1), x.stride(2), x.numel(), print_ptr);
+                x.stride(1), x.stride(2), x.numel(), name_ptr, name_len, print_ptr);
           }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
@@ -143,21 +144,21 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name, bool print
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor1D", ([&] {
                                    PrintIntTensor1D<<<1, 1, 0, stream>>>(
                                        x.data_ptr<scalar_t>(), x.stride(0),
-                                       x.numel(), print_ptr);
+                                       x.numel(), name_ptr, name_len, print_ptr);
                                  }));
     } else if (x.dim() == 2) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] {
                                    PrintIntTensor2D<<<1, 1, 0, stream>>>(
                                        x.data_ptr<scalar_t>(), x.size(1),
                                        x.stride(0), x.stride(1), x.numel(),
-                                       print_ptr);
+                                       name_ptr, name_len, print_ptr);
                                  }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor3D", ([&] {
                                    PrintIntTensor3D<<<1, 1, 0, stream>>>(
                                        x.data_ptr<scalar_t>(), x.size(1),
                                        x.size(2), x.stride(0), x.stride(1),
-                                       x.stride(2), x.numel(), print_ptr);
+                                       x.stride(2), x.numel(), name_ptr, name_len, print_ptr);
                                  }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher

From b66aa9a60f2cbeb400a369a35ac20c1bbbf23051 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:37:39 +0800
Subject: [PATCH 08/57] more

---
 csrc/debug_print.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index b600243..efd017a 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -18,7 +18,7 @@
 template <typename float_t>
 __global__ void PrintFloatTensor1D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
-                                   const bool print_ptr) {
+                                   const char* name_ptr, const int name_len, const bool print_ptr) {
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }
@@ -30,7 +30,7 @@
 template <typename int_t>
 __global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t stride_0,
-                                 const size_t n, const bool print_ptr) {
+                                 const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }
@@ -44,7 +44,7 @@
 template <typename float_t>
 __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const size_t shape_0, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
-                                   const bool print_ptr) {
+                                   const char* name_ptr, const int name_len, const bool print_ptr) {
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }
@@ -58,7 +58,7 @@
 template <typename int_t>
 __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0,
                                  const size_t stride_1, const size_t stride_0,
-                                 const size_t n, const bool print_ptr) {
+                                 const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }
@@ -74,7 +74,7 @@
 template <typename float_t>
 __global__ void PrintFloatTensor3D(float_t *__restrict__ x,
                                    const size_t shape_1, const size_t shape_0,
                                    const size_t stride_2, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
-                                   const bool print_ptr) {
+                                   const char* name_ptr, const int name_len, const bool print_ptr) {
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }
@@ -90,7 +90,7 @@
 template <typename int_t>
 __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
                                  const size_t shape_0, const size_t stride_2,
                                  const size_t stride_1, const size_t stride_0,
-                                 const size_t n, const bool print_ptr) {
+                                 const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }

From f43e6a7127edb31fc25d3faf72266c7e4012ac42 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:37:51 +0800
Subject: [PATCH 09/57] more

---
 csrc/debug_print.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index efd017a..61e7e69 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -102,12 +102,12 @@ __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
   printf("\n");
 }
 
-void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name, bool print_ptr) {
+void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, bool print_ptr) {
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index());
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
 
-  const char* name_ptr = name.has_value() ? name->data_ptr() : nullptr;
-  const int name_len = name.has_value() ? name->numel() : 0;
+  const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr() : nullptr;
+  const int name_len = name_buffer.has_value() ? name_buffer->numel() : 0;
 
   if (x.is_floating_point()) {
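The _Buffer class introduced in PATCH 01 is a bump allocator: one large CUDA byte tensor is allocated up front, and allocate() hands out consecutive slices without ever freeing anything. Because every slice aliases the same preallocated storage, a device pointer taken from it stays valid for as long as the process runs, which is what later patches rely on when such a pointer gets baked into a captured CUDA graph. A minimal standalone sketch of the idea (class name and capacity are illustrative, not part of the patches):

    import torch

    class BumpBuffer:
        # Hands out slices of one preallocated device tensor; never frees.
        def __init__(self, device_index: int, capacity: int = 1_000_000):
            self._tensor = torch.zeros((capacity,), dtype=torch.uint8, device=f"cuda:{device_index}")
            self._used_len = 0

        def allocate(self, size: int) -> torch.Tensor:
            out = self._tensor[self._used_len:self._used_len + size]
            self._used_len += size
            assert self._used_len <= self._tensor.numel(), "buffer exhausted"
            return out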
From d0a9ba40cf136cc217f3ea09c449b733e8c61b1f Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:38:39 +0800
Subject: [PATCH 10/57] more

---
 csrc/debug_print.cu | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 61e7e69..6335d94 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -15,13 +15,17 @@
     TYPE, NAME,                                                              \
     AT_DISPATCH_CASE_FLOATING_AND_REDUCED_FLOATING_TYPES(__VA_ARGS__))
 
+__device__ void PrintCommon(void* x, const char* name_ptr, const int name_len, const bool print_ptr) {
+  if (print_ptr) {
+    printf("addr: %lld\n", x);
+  }
+}
+
 template <typename float_t>
 __global__ void PrintFloatTensor1D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
                                    const char* name_ptr, const int name_len, const bool print_ptr) {
-  if (print_ptr) {
-    printf("addr: %lld\n", x);
-  }
+  PrintCommon(x, name_ptr, name_len, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[i * stride_0]));
   }
@@ -30,7 +34,5 @@
 template <typename int_t>
 __global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t stride_0,
                                  const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
-  if (print_ptr) {
-    printf("addr: %lld\n", x);
-  }
+  PrintCommon(x, name_ptr, name_len, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[i * stride_0]));
@@ -44,7 +46,5 @@
 __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const size_t shape_0, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
                                    const char* name_ptr, const int name_len, const bool print_ptr) {
-  if (print_ptr) {
-    printf("addr: %lld\n", x);
-  }
+  PrintCommon(x, name_ptr, name_len, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ",
            float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]));
@@ -58,7 +58,5 @@
 __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0,
                                  const size_t stride_1, const size_t stride_0,
                                  const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
-  if (print_ptr) {
-    printf("addr: %lld\n", x);
-  }
+  PrintCommon(x, name_ptr, name_len, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ",
            int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]));
@@ -74,7 +72,5 @@
 __global__ void PrintFloatTensor3D(float_t *__restrict__ x,
                                    const size_t shape_1, const size_t shape_0,
                                    const size_t stride_2, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
                                    const char* name_ptr, const int name_len, const bool print_ptr) {
-  if (print_ptr) {
-    printf("addr: %lld\n", x);
-  }
+  PrintCommon(x, name_ptr, name_len, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[(i / shape_0 / shape_1) * stride_2 +
                              ((i / shape_0) % shape_1) * stride_1 +
                              (i % shape_0) * stride_0]));
@@ -90,7 +86,5 @@
 __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
                                  const size_t shape_0, const size_t stride_2,
                                  const size_t stride_1, const size_t stride_0,
                                  const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
-  if (print_ptr) {
-    printf("addr: %lld\n", x);
-  }
+  PrintCommon(x, name_ptr, name_len, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[(i / shape_0 / shape_1) * stride_2 +
                                ((i / shape_0) % shape_1) * stride_1 +
                                (i % shape_0) * stride_0]));
   }
   printf("\n");
 }

From 47636bf5e0e10dc8b1c47ed989c8f815b48d8d30 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:39:04 +0800
Subject: [PATCH 11/57] more

---
 csrc/debug_print.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 6335d94..5afe4b7 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -15,7 +15,10 @@
     TYPE, NAME,                                                              \
     AT_DISPATCH_CASE_FLOATING_AND_REDUCED_FLOATING_TYPES(__VA_ARGS__))
 
-__device__ void PrintCommon(void* x, const char* name_ptr, const int name_len, const bool print_ptr) {
+__device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr) {
+  if (name_ptr != nullptr) {
+    printf("name: %s\n", name_ptr);
+  }
   if (print_ptr) {
     printf("addr: %lld\n", x);
   }

From d5973fc3383f9b84793a77e37781f881ff110157 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:39:26 +0800
Subject: [PATCH 12/57] more

---
 csrc/debug_print.cu | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 5afe4b7..7086d80 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -27,8 +27,8 @@ __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr
 template <typename float_t>
 __global__ void PrintFloatTensor1D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
-                                   const char* name_ptr, const int name_len, const bool print_ptr) {
-  PrintCommon(x, name_ptr, name_len, print_ptr);
+                                   const char* name_ptr, const bool print_ptr) {
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[i * stride_0]));
   }
@@ -37,8 +37,8 @@
 template <typename int_t>
 __global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t stride_0,
-                                 const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
-  PrintCommon(x, name_ptr, name_len, print_ptr);
+                                 const size_t n, const char* name_ptr, const bool print_ptr) {
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[i * stride_0]));
   }
@@ -49,8 +49,8 @@
 __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const size_t shape_0, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
-                                   const char* name_ptr, const int name_len, const bool print_ptr) {
-  PrintCommon(x, name_ptr, name_len, print_ptr);
+                                   const char* name_ptr, const bool print_ptr) {
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ",
            float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]));
@@ -61,8 +61,8 @@
 __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0,
                                  const size_t stride_1, const size_t stride_0,
-                                 const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
-  PrintCommon(x, name_ptr, name_len, print_ptr);
+                                 const size_t n, const char* name_ptr, const bool print_ptr) {
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ",
            int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]));
@@ -75,8 +75,8 @@
 __global__ void PrintFloatTensor3D(float_t *__restrict__ x,
                                    const size_t shape_1, const size_t shape_0,
                                    const size_t stride_2, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
-                                   const char* name_ptr, const int name_len, const bool print_ptr) {
-  PrintCommon(x, name_ptr, name_len, print_ptr);
+                                   const char* name_ptr, const bool print_ptr) {
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[(i / shape_0 / shape_1) * stride_2 +
                              ((i / shape_0) % shape_1) * stride_1 +
@@ -89,8 +89,8 @@
 __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
                                  const size_t shape_0, const size_t stride_2,
                                  const size_t stride_1, const size_t stride_0,
-                                 const size_t n, const char* name_ptr, const int name_len, const bool print_ptr) {
-  PrintCommon(x, name_ptr, name_len, print_ptr);
+                                 const size_t n, const char* name_ptr, const bool print_ptr) {
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[(i / shape_0 / shape_1) * stride_2 +
                                ((i / shape_0) % shape_1) * stride_1 +
                                (i % shape_0) * stride_0]));
@@ -104,28 +104,27 @@
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
 
   const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr() : nullptr;
-  const int name_len = name_buffer.has_value() ? name_buffer->numel() : 0;
 
   if (x.is_floating_point()) {
     if (x.dim() == 1) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor1D", ([&] {
             PrintFloatTensor1D<<<1, 1, 0, stream>>>(
-                x.data_ptr<scalar_t>(), x.stride(0), x.numel(), name_ptr, name_len, print_ptr);
+                x.data_ptr<scalar_t>(), x.stride(0), x.numel(), name_ptr, print_ptr);
           }));
     } else if (x.dim() == 2) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor2D", ([&] {
             PrintFloatTensor2D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.size(1), x.stride(0), x.stride(1),
-                x.numel(), name_ptr, name_len, print_ptr);
+                x.numel(), name_ptr, print_ptr);
           }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor3D", ([&] {
             PrintFloatTensor3D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.size(1), x.size(2), x.stride(0),
-                x.stride(1), x.stride(2), x.numel(), name_ptr, name_len, print_ptr);
+                x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr);
           }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
@@ -140,21 +141,21 @@
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor1D", ([&] {
                                    PrintIntTensor1D<<<1, 1, 0, stream>>>(
                                        x.data_ptr<scalar_t>(), x.stride(0),
-                                       x.numel(), name_ptr, name_len, print_ptr);
+                                       x.numel(), name_ptr, print_ptr);
                                  }));
     } else if (x.dim() == 2) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] {
                                    PrintIntTensor2D<<<1, 1, 0, stream>>>(
                                        x.data_ptr<scalar_t>(), x.size(1),
                                        x.stride(0), x.stride(1), x.numel(),
-                                       name_ptr, name_len, print_ptr);
+                                       name_ptr, print_ptr);
                                  }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor3D", ([&] {
                                    PrintIntTensor3D<<<1, 1, 0, stream>>>(
                                        x.data_ptr<scalar_t>(), x.size(1),
                                        x.size(2), x.stride(0), x.stride(1),
-                                       x.stride(2), x.numel(), name_ptr, name_len, print_ptr);
+                                       x.stride(2), x.numel(), name_ptr, print_ptr);
                                  }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher

From 76106c3497b47c008be8f100ecb41dffcfa54efe Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:40:26 +0800
Subject: [PATCH 13/57] more

---
 example.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example.py b/example.py
index f731d77..c62f4ba 100644
--- a/example.py
+++ b/example.py
@@ -27,9 +27,9 @@
     z = x @ y
     debug_print.print_tensor(z)
     z1 = z @ y
-    debug_print.print_tensor(z1[..., 0])
+    debug_print.print_tensor(z1[..., 0], name="This is name for part of z1")
     z2 = z1 @ y
-    debug_print.print_tensor(z2)
+    debug_print.print_tensor(z2, name="This is name for z2")
 
 x.copy_(torch.randn(2, 2))
 y.copy_(torch.ones(2, 2))

From b8095396edf170a0d7a85a9a997bee595f33dffc Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:42:28 +0800
Subject: [PATCH 14/57] more

---
 csrc/debug_print.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 7086d80..b3b3189 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -103,7 +103,7 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index());
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
 
-  const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr() : nullptr;
+  const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr::() : nullptr;
 
   if (x.is_floating_point()) {

From bc3cd589f7da94c266368309d90a54011e880c3f Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:50:13 +0800
Subject: [PATCH 15/57] more

---
 csrc/debug_print.cu | 2 +-
 example.py          | 2 ++
 setup.py            | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index b3b3189..c5e265a 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -103,7 +103,7 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index());
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
 
-  const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr::() : nullptr;
+  const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr() : nullptr;
 
   if (x.is_floating_point()) {

diff --git a/example.py b/example.py
index c62f4ba..66d31ef 100644
--- a/example.py
+++ b/example.py
@@ -1,6 +1,8 @@
 import torch
 import debug_print
 
+print(f"{vars(debug_print)=} {dir(debug_print)=}")
+debug_print.initialize()
 
 x = torch.rand(3, 4, 5).to(0)
 debug_print.print_tensor(x)

diff --git a/setup.py b/setup.py
index dedb087..2ba3a10 100644
--- a/setup.py
+++ b/setup.py
@@ -4,6 +4,7 @@
 setup(
     name="debug_print",
     version="0.0.2",
+    packages=['debug_print'],
     ext_modules=[
         CUDAExtension(
             name="debug_print._kernels",
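PATCH 16 below settles how a tensor's name travels to the device: the Python side encodes the string as UTF-8, appends a 0 byte so the bytes form a valid C string, stages them in a CPU tensor, and copies that into a slice of the preallocated GPU buffer; the kernel can then print the pointer with printf("%s"). A minimal sketch of just the encoding step (the function name here is illustrative):

    import torch

    def encode_name(name: str, device: torch.device) -> torch.Tensor:
        # UTF-8 bytes plus a trailing NUL, staged on CPU, then moved to the GPU.
        data = list(name.encode("utf-8")) + [0]
        return torch.tensor(data, dtype=torch.uint8, device="cpu").to(device)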
From 0ffd3e8ba94dc9390d663880bcf591df5ec2e12b Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:53:50 +0800
Subject: [PATCH 16/57] more

---
 csrc/debug_print.cu     | 2 +-
 debug_print/__init__.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index c5e265a..c25a068 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -103,7 +103,7 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index());
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");
 
-  const char* name_ptr = name_buffer.has_value() ? name_buffer->data_ptr() : nullptr;
+  const char* name_ptr = name_buffer.has_value() ? reinterpret_cast<const char*>(name_buffer->data_ptr()) : nullptr;
 
   if (x.is_floating_point()) {

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 8832e2a..c301dc1 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -28,8 +28,8 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
         if len(name) > 0:
             name_bytes = name.encode("utf-8")
             name_buffer = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
-            tmp = torch.empty(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
-            name_buffer.copy_(tmp.to(name_buffer.device).view(torch.char))
+            tmp = torch.empty(list(name_bytes) + [0], dtype=torch.uint8, device="cpu").to(name_buffer.device)
+            name_buffer.copy_(tmp)
         else:
             name_buffer = None
         _print_tensor_kernel(x, name_buffer, print_ptr)

From 4876b98189671fec67906d7c4bbd724b4c083cce Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:54:39 +0800
Subject: [PATCH 17/57] more

---
 example.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/example.py b/example.py
index 66d31ef..d34706c 100644
--- a/example.py
+++ b/example.py
@@ -1,7 +1,6 @@
 import torch
 import debug_print
 
-print(f"{vars(debug_print)=} {dir(debug_print)=}")
 debug_print.initialize()
 
 x = torch.rand(3, 4, 5).to(0)

From 6dd6a8b34e7ede16383bf0d8a13ccff0c585456f Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:55:01 +0800
Subject: [PATCH 18/57] more

---
 debug_print/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index c301dc1..353fba6 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -6,7 +6,7 @@
 class _Buffer:
     def __init__(self, device_index: int):
-        self._tensor = torch.zeros((10_000_000,), dtype=torch.char, device=f"cuda:{device_index}")
+        self._tensor = torch.zeros((10_000_000,), dtype=torch.uint8, device=f"cuda:{device_index}")
         self._used_len = 0

From 8cc9bce09e522aea75aa5c0c3715fcdf45cc9db4 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:55:46 +0800
Subject: [PATCH 19/57] more

---
 debug_print/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 353fba6..7e08354 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -28,7 +28,7 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
         if len(name) > 0:
             name_bytes = name.encode("utf-8")
             name_buffer = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
-            tmp = torch.empty(list(name_bytes) + [0], dtype=torch.uint8, device="cpu").to(name_buffer.device)
+            tmp = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu").to(name_buffer.device)
             name_buffer.copy_(tmp)

From a3ca231c173debed4b086a3d5b33a7c5c897f6e9 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:57:22 +0800
Subject: [PATCH 20/57] more

---
 debug_print/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 7e08354..f9acdc9 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -28,7 +28,8 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
         if len(name) > 0:
             name_bytes = name.encode("utf-8")
             name_buffer = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
-            tmp = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu").to(name_buffer.device)
+            tmp = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
+            tmp = tmp.to(name_buffer.device)
             name_buffer.copy_(tmp)
         else:
             name_buffer = None

From 6c18f170d1b4c99108638cfbaf93bfe1aa9c4ad3 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 16:59:56 +0800
Subject: [PATCH 21/57] more

---
 debug_print/__init__.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index f9acdc9..d6018e5 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from typing import Dict, Optional
 
 import torch
 from ._kernels import print_tensor as _print_tensor_kernel
@@ -16,6 +17,15 @@ def allocate(self, size: int):
         return output
 
 
+@dataclass
+class _CopyTask:
+    src: torch.Tensor
+    dst: torch.Tensor
+
+    def execute(self):
+        self.dst.copy_(self.src)
+
+
 class _DebugPrinter:
     def __init__(self):
         # Can be optimized
@@ -28,9 +38,9 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
         if len(name) > 0:
             name_bytes = name.encode("utf-8")
             name_buffer = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
-            tmp = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
-            tmp = tmp.to(name_buffer.device)
-            name_buffer.copy_(tmp)
+            name_cpu = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
+            name_cpu = name_cpu.to(name_buffer.device)
+            name_buffer.copy_(name_cpu)
         else:
             name_buffer = None
         _print_tensor_kernel(x, name_buffer, print_ptr)

From 14a850d54aac7bbad242f93b0410b697cfee7da0 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:00:14 +0800
Subject: [PATCH 22/57] more

---
 debug_print/__init__.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index d6018e5..796dfbd 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -37,13 +37,12 @@ def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
         if len(name) > 0:
             name_bytes = name.encode("utf-8")
-            name_buffer = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
+            name_buffer_gpu = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
             name_cpu = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
-            name_cpu = name_cpu.to(name_buffer.device)
-            name_buffer.copy_(name_cpu)
+            copy_task = _CopyTask(src=name_cpu, dst=name_buffer_gpu)
         else:
-            name_buffer = None
-        _print_tensor_kernel(x, name_buffer, print_ptr)
+            name_buffer_gpu = None
+        _print_tensor_kernel(x, name_buffer_gpu, print_ptr)

From e0d2ed9756409bb09e98d1984bcbaff195411044 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:01:00 +0800
Subject: [PATCH 23/57] more

---
 debug_print/__init__.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 796dfbd..e5c0757 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Dict, Optional
+from typing import Dict, Optional, List
 
 import torch
 from ._kernels import print_tensor as _print_tensor_kernel
@@ -33,17 +33,21 @@ def __init__(self):
             device_index: _Buffer(device_index=device_index)
             for device_index in range(torch.cuda.device_count())
         }
+        self._pending_copy_tasks: List[_CopyTask] = []
 
     def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
-        if len(name) > 0:
-            name_bytes = name.encode("utf-8")
-            name_buffer_gpu = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
-            name_cpu = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
-            copy_task = _CopyTask(src=name_cpu, dst=name_buffer_gpu)
-        else:
-            name_buffer_gpu = None
+        name_buffer_gpu = self._compute_name_buffer_gpu(name=name)
         _print_tensor_kernel(x, name_buffer_gpu, print_ptr)
 
+    def _compute_name_buffer_gpu(self, name: str):
+        if len(name) == 0:
+            return None
+
+        name_bytes = name.encode("utf-8")
+        name_buffer_gpu = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
+        name_cpu = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
+        copy_task = _CopyTask(src=name_cpu, dst=name_buffer_gpu)
+

From 5018cf4fc972687edfab6df2d1b053c7318a0580 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:01:24 +0800
Subject: [PATCH 24/57] more

---
 debug_print/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index e5c0757..2c34238 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -48,6 +48,11 @@ def _compute_name_buffer_gpu(self, name: str):
         name_cpu = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
         copy_task = _CopyTask(src=name_cpu, dst=name_buffer_gpu)
 
+        if torch.cuda.is_current_stream_capturing():
+            self._pending_copy_tasks.append(copy_task)
+        else:
+            copy_task.execute()
+

From 6eb37e2f9132b1cc1166f208d903a7cdab8beec9 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:02:18 +0800
Subject: [PATCH 25/57] more

---
 debug_print/__init__.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 2c34238..23e8a4e 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -35,16 +35,21 @@ def __init__(self):
         }
         self._pending_copy_tasks: List[_CopyTask] = []
 
+    def post_initialize(self):
+        for copy_task in self._pending_copy_tasks:
+            copy_task.execute()
+        self._pending_copy_tasks.clear()
+
     def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
-        name_buffer_gpu = self._compute_name_buffer_gpu(name=name)
+        name_buffer_gpu = self._compute_name_buffer_gpu(name=name, device_index=x.device.index)
         _print_tensor_kernel(x, name_buffer_gpu, print_ptr)
 
-    def _compute_name_buffer_gpu(self, name: str):
+    def _compute_name_buffer_gpu(self, name: str, device_index: int):
         if len(name) == 0:
             return None
 
         name_bytes = name.encode("utf-8")
-        name_buffer_gpu = self._buffers[x.device.index].allocate(len(name_bytes) + 1)
+        name_buffer_gpu = self._buffers[device_index].allocate(len(name_bytes) + 1)
         name_cpu = torch.tensor(list(name_bytes) + [0], dtype=torch.uint8, device="cpu")
         copy_task = _CopyTask(src=name_cpu, dst=name_buffer_gpu)
 
@@ -53,6 +58,8 @@ def _compute_name_buffer_gpu(self, name: str):
         else:
             copy_task.execute()
 
+        return name_buffer_gpu
+
 
 _printer: Optional[_DebugPrinter] = None
 
@@ -63,5 +70,9 @@ def initialize():
     _printer = _DebugPrinter()
 
 
+def post_initialize():
+    _printer.post_initialize()
+
+
 def print_tensor(x: torch.Tensor, name: str = "", print_ptr: bool = False):
     _printer(x=x, name=name, print_ptr=print_ptr)
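Patches 21 through 25 above deal with CUDA graph capture: while a stream is capturing, eagerly issuing the host-to-device copy of the name bytes is not safe, so the copy is wrapped in a _CopyTask, queued when torch.cuda.is_current_stream_capturing() is true, and flushed once after capture returns via post_initialize(). This works because the kernel launch recorded into the graph only needs the destination pointer, and that pointer is stable: it points into the preallocated buffer. The control flow, reduced to a standalone sketch (function names are illustrative):

    import torch

    pending = []

    def copy_now_or_later(src: torch.Tensor, dst: torch.Tensor):
        if torch.cuda.is_current_stream_capturing():
            # Inside capture: defer. The captured kernel only holds dst's
            # address, which stays valid because dst aliases the big buffer.
            pending.append((src, dst))
        else:
            dst.copy_(src)

    def flush_pending():
        # Call once, after graph capture has finished.
        for src, dst in pending:
            dst.copy_(src)
        pending.clear()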
From bdbf3791f76faffac8d8df4ef14d6f395aedc866 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:03:11 +0800
Subject: [PATCH 26/57] more

---
 example.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/example.py b/example.py
index d34706c..020da60 100644
--- a/example.py
+++ b/example.py
@@ -32,6 +32,8 @@
     z2 = z1 @ y
     debug_print.print_tensor(z2, name="This is name for z2")
 
+debug_print.post_initialize()
+
 x.copy_(torch.randn(2, 2))
 y.copy_(torch.ones(2, 2))
 print("start replay...")

From 96eaae434ea24d971e9cfa74572d7cb2ee717628 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:03:59 +0800
Subject: [PATCH 27/57] more

---
 example.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/example.py b/example.py
index 020da60..0a7d315 100644
--- a/example.py
+++ b/example.py
@@ -3,6 +3,7 @@
 
 debug_print.initialize()
 
+print("demo without cuda graph...")
 x = torch.rand(3, 4, 5).to(0)
 debug_print.print_tensor(x)
 debug_print.print_tensor(x[..., 0:3])
@@ -10,6 +11,7 @@
 debug_print.print_tensor(x[..., 0])
 debug_print.print_tensor(x[0:1, 1:3, 0:4])
 
+print("start warmup...")
 s = torch.cuda.Stream()
 s.wait_stream(torch.cuda.current_stream())
 x = torch.empty(2, 2).half().to(0)
@@ -21,6 +23,7 @@
     z2 = z1 @ y
 
+print("start graph capture...")
 g = torch.cuda.CUDAGraph()
 with torch.cuda.graph(g, stream=s):
     debug_print.print_tensor(x)

From c014aab19b4119c0b18c59f91fcd0b56295e5d78 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:06:59 +0800
Subject: [PATCH 28/57] more

---
 csrc/debug_print.cu | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index c25a068..60bc46f 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -52,8 +52,9 @@ __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const char* name_ptr, const bool print_ptr) {
   PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
-    printf("%.4f, ",
-           float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]));
+    printf("%.4f%c ",
+           float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
+           (i % shape_0 == 0) ? ";" : ",");
   }
   printf("\n");
 }
@@ -64,8 +65,9 @@ __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0,
                                  const size_t n, const char* name_ptr, const bool print_ptr) {
   PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
-    printf("%lld, ",
-           int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]));
+    printf("%lld%c ",
+           int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
+           (i % shape_0 == 0) ? ";" : ",");
   }
   printf("\n");
 }
@@ -78,9 +80,10 @@ __global__ void PrintFloatTensor3D(float_t *__restrict__ x,
                                    const char* name_ptr, const bool print_ptr) {
   PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
-    printf("%.4f, ", float(x[(i / shape_0 / shape_1) * stride_2 +
+    printf("%.4f%c ", float(x[(i / shape_0 / shape_1) * stride_2 +
                               ((i / shape_0) % shape_1) * stride_1 +
-                              (i % shape_0) * stride_0]));
+                              (i % shape_0) * stride_0]),
+           ((i % (shape_0 * shape_1) == 0) || (i % shape_0 == 0)) ? ";" : ",");
   }
   printf("\n");
 }
@@ -92,9 +95,10 @@ __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
                                  const size_t n, const char* name_ptr, const bool print_ptr) {
   PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
-    printf("%lld, ", int64_t(x[(i / shape_0 / shape_1) * stride_2 +
+    printf("%lld%c ", int64_t(x[(i / shape_0 / shape_1) * stride_2 +
                                 ((i / shape_0) % shape_1) * stride_1 +
-                                (i % shape_0) * stride_0]));
+                                (i % shape_0) * stride_0]),
+           ((i % (shape_0 * shape_1) == 0) || (i % shape_0 == 0)) ? ";" : ",");
   }
   printf("\n");
 }

From 06abf6cdbef6c021fc4a132a8b5cac4a4ad6acb3 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:08:12 +0800
Subject: [PATCH 29/57] more

---
 csrc/debug_print.cu     | 2 +-
 debug_print/__init__.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 60bc46f..d750472 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -103,7 +103,7 @@ __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
   printf("\n");
 }
 
-void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, bool print_ptr) {
+void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, bool print_ptr, bool print_shape) {
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index());
   TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor");

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 23e8a4e..b38ad3b 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -40,9 +40,9 @@ def post_initialize(self):
             copy_task.execute()
         self._pending_copy_tasks.clear()
 
-    def __call__(self, x: torch.Tensor, name: str = "", print_ptr: bool = False):
+    def __call__(self, x: torch.Tensor, name: str, print_ptr: bool, print_shape: bool):
         name_buffer_gpu = self._compute_name_buffer_gpu(name=name, device_index=x.device.index)
-        _print_tensor_kernel(x, name_buffer_gpu, print_ptr)
+        _print_tensor_kernel(x, name_buffer_gpu, print_ptr, print_shape)
 
     def _compute_name_buffer_gpu(self, name: str, device_index: int):
         if len(name) == 0:
@@ -74,5 +74,5 @@ def post_initialize():
     _printer.post_initialize()
 
 
-def print_tensor(x: torch.Tensor, name: str = "", print_ptr: bool = False):
-    _printer(x=x, name=name, print_ptr=print_ptr)
+def print_tensor(x: torch.Tensor, name: str = "", print_ptr: bool = False, print_shape: bool = False):
+    _printer(x=x, name=name, print_ptr=print_ptr, print_shape=print_shape)
From 577afe27d23a5d8927bb99320346a7920f0fc73e Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:10:40 +0800
Subject: [PATCH 30/57] more

---
 csrc/debug_print.cu | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index d750472..bd12481 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -15,7 +15,7 @@
     TYPE, NAME,                                                              \
     AT_DISPATCH_CASE_FLOATING_AND_REDUCED_FLOATING_TYPES(__VA_ARGS__))
 
-__device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr) {
+__device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr, const bool print_shape) {
   if (name_ptr != nullptr) {
     printf("name: %s\n", name_ptr);
   }
@@ -27,8 +27,8 @@ __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr
 template <typename float_t>
 __global__ void PrintFloatTensor1D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
-                                   const char* name_ptr, const bool print_ptr) {
-  PrintCommon(x, name_ptr, print_ptr);
+                                   const char* name_ptr, const bool print_ptr, const bool print_shape) {
+  PrintCommon(x, name_ptr, print_ptr, print_shape);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[i * stride_0]));
   }
@@ -37,8 +37,8 @@
 template <typename int_t>
 __global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t stride_0,
-                                 const size_t n, const char* name_ptr, const bool print_ptr) {
-  PrintCommon(x, name_ptr, print_ptr);
+                                 const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) {
+  PrintCommon(x, name_ptr, print_ptr, print_shape);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[i * stride_0]));
   }
@@ -49,8 +49,8 @@
 __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const size_t shape_0, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
-                                   const char* name_ptr, const bool print_ptr) {
-  PrintCommon(x, name_ptr, print_ptr);
+                                   const char* name_ptr, const bool print_ptr, const bool print_shape) {
+  PrintCommon(x, name_ptr, print_ptr, print_shape);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f%c ",
            float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
@@ -62,8 +62,8 @@
 __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0,
                                  const size_t stride_1, const size_t stride_0,
-                                 const size_t n, const char* name_ptr, const bool print_ptr) {
-  PrintCommon(x, name_ptr, print_ptr);
+                                 const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) {
+  PrintCommon(x, name_ptr, print_ptr, print_shape);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld%c ",
            int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
@@ -77,8 +77,8 @@
 __global__ void PrintFloatTensor3D(float_t *__restrict__ x,
                                    const size_t shape_1, const size_t shape_0,
                                    const size_t stride_2, const size_t stride_1,
                                    const size_t stride_0, const size_t n,
-                                   const char* name_ptr, const bool print_ptr) {
-  PrintCommon(x, name_ptr, print_ptr);
+                                   const char* name_ptr, const bool print_ptr, const bool print_shape) {
+  PrintCommon(x, name_ptr, print_ptr, print_shape);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f%c ", float(x[(i / shape_0 / shape_1) * stride_2 +
                               ((i / shape_0) % shape_1) * stride_1 +
@@ -92,8 +92,9 @@
 __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
                                  const size_t shape_0, const size_t stride_2,
                                  const size_t stride_1, const size_t stride_0,
-                                 const size_t n, const char* name_ptr, const bool print_ptr) {
-  PrintCommon(x, name_ptr, print_ptr);
+                                 const size_t n, const char* name_ptr,
+                                 const bool print_ptr, const bool print_shape) {
+  PrintCommon(x, name_ptr, print_ptr, print_shape);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld%c ", int64_t(x[(i / shape_0 / shape_1) * stride_2 +
                                 ((i / shape_0) % shape_1) * stride_1 +
@@ -115,21 +115,21 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor1D", ([&] {
             PrintFloatTensor1D<<<1, 1, 0, stream>>>(
-                x.data_ptr<scalar_t>(), x.stride(0), x.numel(), name_ptr, print_ptr);
+                x.data_ptr<scalar_t>(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape);
           }));
     } else if (x.dim() == 2) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
          x.scalar_type(), "PrintFloatTensor2D", ([&] {
            PrintFloatTensor2D<<<1, 1, 0, stream>>>(
                x.data_ptr<scalar_t>(), x.size(1), x.stride(0), x.stride(1),
-               x.numel(), name_ptr, print_ptr);
+               x.numel(), name_ptr, print_ptr, print_shape);
          }));
    } else if (x.dim() == 3) {
      AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
          x.scalar_type(), "PrintFloatTensor3D", ([&] {
            PrintFloatTensor3D<<<1, 1, 0, stream>>>(
                x.data_ptr<scalar_t>(), x.size(1), x.size(2), x.stride(0),
-               x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr);
+               x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
          }));
    } else {
      // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
@@ -144,21 +145,21 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor1D", ([&] {
                                   PrintIntTensor1D<<<1, 1, 0, stream>>>(
                                       x.data_ptr<scalar_t>(), x.stride(0),
-                                      x.numel(), name_ptr, print_ptr);
+                                      x.numel(), name_ptr, print_ptr, print_shape);
                                 }));
    } else if (x.dim() == 2) {
      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] {
                                   PrintIntTensor2D<<<1, 1, 0, stream>>>(
                                       x.data_ptr<scalar_t>(), x.size(1),
                                       x.stride(0), x.stride(1), x.numel(),
-                                      name_ptr, print_ptr);
+                                      name_ptr, print_ptr, print_shape);
                                 }));
    } else if (x.dim() == 3) {
      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor3D", ([&] {
                                   PrintIntTensor3D<<<1, 1, 0, stream>>>(
                                       x.data_ptr<scalar_t>(), x.size(1),
                                       x.size(2), x.stride(0), x.stride(1),
-                                      x.stride(2), x.numel(), name_ptr, print_ptr);
+                                      x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
                                 }));
    } else {
      // NOTE(Zihao): I'm just too lazy to do this, codegen for higher

From e66e5619d793b973ba28ce70f0f752fb5e9b8078 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:10:59 +0800
Subject: [PATCH 31/57] more

---
 csrc/debug_print.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index bd12481..2fea4c2 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -28,7 +28,7 @@ template <typename float_t>
 __global__ void PrintFloatTensor1D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
                                    const char* name_ptr, const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr, print_shape);
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[i * stride_0]));
   }
@@ -38,7 +38,7 @@ template <typename int_t>
 __global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t stride_0,
                                  const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr, print_shape);
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[i * stride_0]));
   }
@@ -50,7 +50,7 @@ __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
                                    const char* name_ptr, const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr, print_shape);
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f%c ",
            float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
@@ -63,7 +63,7 @@ __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0,
                                  const size_t stride_1, const size_t stride_0,
                                  const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr, print_shape);
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld%c ",
            int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
@@ -78,7 +78,7 @@ __global__ void PrintFloatTensor3D(float_t *__restrict__ x,
                                    const size_t stride_0, const size_t n,
                                    const char* name_ptr, const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr, print_shape);
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f%c ", float(x[(i / shape_0 / shape_1) * stride_2 +
                               ((i / shape_0) % shape_1) * stride_1 +
@@ -93,7 +93,7 @@ __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1,
                                  const size_t n, const char* name_ptr,
                                  const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr, print_shape);
+  PrintCommon(x, name_ptr, print_ptr);
   for (size_t i = 0; i < n; ++i) {
     printf("%lld%c ", int64_t(x[(i / shape_0 / shape_1) * stride_2 +
                                 ((i / shape_0) % shape_1) * stride_1 +
                                 (i % shape_0) * stride_0]),

From c7a901c9bfcc577276ff2a92c6b2d0d1d9d36fc5 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:12:18 +0800
Subject: [PATCH 32/57] more

---
 csrc/debug_print.cu | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 2fea4c2..0e96cbb 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -29,6 +29,9 @@ __global__ void PrintFloatTensor1D(float_t *__restrict__ x,
                                    const char* name_ptr, const bool print_ptr, const bool print_shape) {
   PrintCommon(x, name_ptr, print_ptr);
+  if (print_shape) {
+    printf("shape=(%d), stride=(%d)", (int) n, (int) stride_0);
+  }
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f, ", float(x[i * stride_0]));
   }
@@ -42,6 +45,9 @@ __global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t stride_0,
                                  const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) {
   PrintCommon(x, name_ptr, print_ptr);
+  if (print_shape) {
+    printf("shape=(%d), stride=(%d)", (int) n, (int) stride_0);
+  }
   for (size_t i = 0; i < n; ++i) {
     printf("%lld, ", int64_t(x[i * stride_0]));
   }
@@ -57,6 +63,9 @@ __global__ void PrintFloatTensor2D(float_t *__restrict__ x,
                                    const char* name_ptr, const bool print_ptr, const bool print_shape) {
   PrintCommon(x, name_ptr, print_ptr);
+  if (print_shape) {
+    printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, , (int) stride_0);
+  }
   for (size_t i = 0; i < n; ++i) {
     printf("%.4f%c ",
            float(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]),
@@ -130,14 +139,14 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor2D", ([&] {
             PrintFloatTensor2D<<<1, 1, 0, stream>>>(
-                x.data_ptr<scalar_t>(), x.size(1), x.stride(0), x.stride(1),
+                x.data_ptr<scalar_t>(), x.size(0), x.size(1), x.stride(0), x.stride(1),
                 x.numel(), name_ptr, print_ptr, print_shape);
           }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
           x.scalar_type(), "PrintFloatTensor3D", ([&] {
             PrintFloatTensor3D<<<1, 1, 0, stream>>>(
-                x.data_ptr<scalar_t>(), x.size(1), x.size(2), x.stride(0),
+                x.data_ptr<scalar_t>(), x.size(0), x.size(1), x.size(2), x.stride(0),
                 x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
           }));
     } else {
@@ -159,14 +168,14 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, boo
     } else if (x.dim() == 2) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] {
                                    PrintIntTensor2D<<<1, 1, 0, stream>>>(
-                                       x.data_ptr<scalar_t>(), x.size(1),
+                                       x.data_ptr<scalar_t>(), x.size(0), x.size(1),
                                        x.stride(0), x.stride(1), x.numel(),
                                        name_ptr, print_ptr, print_shape);
                                  }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor3D", ([&] {
                                    PrintIntTensor3D<<<1, 1, 0, stream>>>(
-                                       x.data_ptr<scalar_t>(), x.size(1),
+                                       x.data_ptr<scalar_t>(), x.size(0), x.size(1),
                                        x.size(2), x.stride(0), x.stride(1),
                                        x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
                                  }));
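Throughout these kernels a single flat loop index i enumerates all n elements, and the per-dimension indices are recovered with integer division and modulo before being multiplied by the corresponding strides. Note the parameter naming: in the 2D case, shape_0 is the size of the innermost dimension (the call site passes x.size(1) for it, or x.size(0) plus x.size(1) after the patch above). A pure-Python rendition of the 2D offset computation, for reference only:

    # offset(i) = (i // inner) * stride_outer + (i % inner) * stride_inner
    def offsets_2d(inner, stride_outer, stride_inner, n):
        return [(i // inner) * stride_outer + (i % inner) * stride_inner for i in range(n)]

    # A contiguous 2x3 tensor has strides (3, 1); the flat loop visits 0..5 in order.
    assert offsets_2d(3, 3, 1, 6) == [0, 1, 2, 3, 4, 5]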
deletion(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 0e96cbb..790d5a1 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -58,7 +58,7 @@ __global__ void PrintFloatTensor2D(float_t *__restrict__ x, const char* name_ptr, const bool print_ptr, const bool print_shape) { PrintCommon(x, name_ptr, print_ptr); if (print_shape) { - printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, , (int) stride_0); + printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, (int) shape_1, (int) stride_0, (int) stride_1); } for (size_t i = 0; i < n; ++i) { printf("%.4f%c ", @@ -73,6 +73,9 @@ __global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0, const size_t stride_1, const size_t stride_0, const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) { PrintCommon(x, name_ptr, print_ptr); + if (print_shape) { + printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, (int) shape_1, (int) stride_0, (int) stride_1); + } for (size_t i = 0; i < n; ++i) { printf("%lld%c ", int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]), @@ -88,6 +91,10 @@ __global__ void PrintFloatTensor3D(float_t *__restrict__ x, const size_t stride_0, const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) { PrintCommon(x, name_ptr, print_ptr); + if (print_shape) { + printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", + (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2); + } for (size_t i = 0; i < n; ++i) { printf("%.4f%c ", float(x[(i / shape_0 / shape_1) * stride_2 + ((i / shape_0) % shape_1) * stride_1 + @@ -104,6 +111,10 @@ __global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1, const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) { PrintCommon(x, name_ptr, print_ptr); + if (print_shape) { + printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", + (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2); + } for (size_t i = 0; i < n; ++i) { printf("%lld%c ", int64_t(x[(i / shape_0 / shape_1) * stride_2 + ((i / shape_0) % shape_1) * stride_1 + From aa0aa6ff823c3e15e066615e23426ef7b65124e7 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:16:08 +0800 Subject: [PATCH 34/57] more --- csrc/debug_print.cu | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 790d5a1..ec79a39 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -24,8 +24,19 @@ __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr, } } +template +__device__ void PrintElem(scalar_t value) { + if constexpr (std::is_floating_point::value) { + printf("%.4f, ", float(x[i * stride_0])); + } else if constexpr (std::is_integral::value) { + printf("%lld, ", static_cast(x[i * stride_0])); + } else { + printf("?, "); + } +} + template -__global__ void PrintFloatTensor1D(float_t *__restrict__ x, +__global__ void PrintTensor1D(float_t *__restrict__ x, const size_t stride_0, const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) { PrintCommon(x, name_ptr, print_ptr); @@ -33,20 +44,7 @@ __global__ void PrintFloatTensor1D(float_t *__restrict__ x, printf("shape=(%d), stride=(%d)", (int) n, (int) stride_0); } for (size_t i = 0; i < n; ++i) { - printf("%.4f, ", float(x[i * stride_0])); - } - printf("\n"); -} - -template -__global__ void PrintIntTensor1D(int_t *__restrict__ x, const size_t 
-                                 const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) {
-  PrintCommon(x, name_ptr, print_ptr);
-  if (print_shape) {
-    printf("shape=(%d), stride=(%d)", (int) n, (int) stride_0);
-  }
-  for (size_t i = 0; i < n; ++i) {
-    printf("%lld, ", int64_t(x[i * stride_0]));
+    PrintElem(x[i * stride_0]);
   }
   printf("\n");
 }

From 5d6773972100712bcf934ebfbf9a4a3988a1919b Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:16:33 +0800
Subject: [PATCH 35/57] more

---
 csrc/debug_print.cu | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index ec79a39..df78cf6 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -36,15 +36,18 @@ __device__ void PrintElem(scalar_t value) {
 }
 
 template <typename float_t>
-__global__ void PrintTensor1D(float_t *__restrict__ x,
-                              const size_t stride_0, const size_t n,
-                              const char* name_ptr, const bool print_ptr, const bool print_shape) {
+__global__ void PrintTensor1D(
+    float_t *__restrict__ x,
+    const size_t shape_0,
+    const size_t stride_0,
+    const char* name_ptr, const bool print_ptr, const bool print_shape
+) {
   PrintCommon(x, name_ptr, print_ptr);
   if (print_shape) {
     printf("shape=(%d), stride=(%d)", (int) n, (int) stride_0);
   }
-  for (size_t i = 0; i < n; ++i) {
-    PrintElem(x[i * stride_0]);
+  for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
+    PrintElem(x[index_0 * stride_0]);
   }
   printf("\n");
 }
";" : ","); - } - printf("\n"); -} - -template -__global__ void PrintIntTensor2D(int_t *__restrict__ x, const size_t shape_0, - const size_t stride_1, const size_t stride_0, - const size_t n, const char* name_ptr, const bool print_ptr, const bool print_shape) { +__global__ void PrintTensor2D( + float_t *__restrict__ x, + const size_t shape_0, const size_t shape_1, + const size_t stride_0, const size_t stride_1, + const char* name_ptr, const bool print_ptr, const bool print_shape +) { PrintCommon(x, name_ptr, print_ptr); if (print_shape) { printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, (int) shape_1, (int) stride_0, (int) stride_1); } - for (size_t i = 0; i < n; ++i) { - printf("%lld%c ", - int64_t(x[(i / shape_0) * stride_1 + (i % shape_0) * stride_0]), - (i % shape_0 == 0) ? ";" : ","); + for (size_t index_0 = 0; index_0 < shape_0; ++index_0) { + for (size_t index_1 = 0; index_1 < shape_1; ++index_1) { + PrintElem(x[index_0 * stride_0 + index_1 * stride_1]); + } + printf("; "); } printf("\n"); } template -__global__ void PrintFloatTensor3D(float_t *__restrict__ x, - const size_t shape_1, const size_t shape_0, - const size_t stride_2, const size_t stride_1, - const size_t stride_0, const size_t n, - const char* name_ptr, const bool print_ptr, const bool print_shape) { - PrintCommon(x, name_ptr, print_ptr); - if (print_shape) { - printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", - (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2); - } - for (size_t i = 0; i < n; ++i) { - printf("%.4f%c ", float(x[(i / shape_0 / shape_1) * stride_2 + - ((i / shape_0) % shape_1) * stride_1 + - (i % shape_0) * stride_0]), - ((i % (shape_0 * shape_1) == 0) || (i % shape_0 == 0)) ? ";" : ","); - } - printf("\n"); -} - -template -__global__ void PrintIntTensor3D(int_t *__restrict__ x, const size_t shape_1, - const size_t shape_0, const size_t stride_2, - const size_t stride_1, const size_t stride_0, - const size_t n, const char* name_ptr, - const bool print_ptr, const bool print_shape) { +__global__ void PrintTensor3D( + float_t *__restrict__ x, + const size_t shape_0, const size_t shape_1, const size_t shape_2, + const size_t stride_0, const size_t stride_1, const size_t stride_2, + const char* name_ptr, const bool print_ptr, const bool print_shape +) { PrintCommon(x, name_ptr, print_ptr); if (print_shape) { - printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", - (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2); + printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2); } - for (size_t i = 0; i < n; ++i) { - printf("%lld%c ", int64_t(x[(i / shape_0 / shape_1) * stride_2 + - ((i / shape_0) % shape_1) * stride_1 + - (i % shape_0) * stride_0]), - ((i % (shape_0 * shape_1) == 0) || (i % shape_0 == 0)) ? 
";" : ","); + for (size_t index_0 = 0; index_0 < shape_0; ++index_0) { + for (size_t index_1 = 0; index_1 < shape_1; ++index_1) { + for (size_t index_2 = 0; index_2 < shape_2; ++index_2) { + PrintElem(x[index_0 * stride_0 + index_1 * stride_1]); + } + printf("; "); + } + printf("; "); } printf("\n"); } From bacaf8c353025b9382f4b79562dfb81ac8547281 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:18:24 +0800 Subject: [PATCH 37/57] more --- csrc/debug_print.cu | 84 ++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 59 deletions(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index eac027b..e4d3a22 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -101,67 +101,33 @@ void PrintTensor(torch::Tensor x, std::optional name_buffer, bool const char* name_ptr = name_buffer.has_value() ? reinterpret_cast(name_buffer->data_ptr()) : nullptr; - if (x.is_floating_point()) { - if (x.dim() == 1) { - AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( - x.scalar_type(), "PrintFloatTensor1D", ([&] { - PrintFloatTensor1D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape); - })); - } else if (x.dim() == 2) { - AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( - x.scalar_type(), "PrintFloatTensor2D", ([&] { - PrintFloatTensor2D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), x.stride(0), x.stride(1), - x.numel(), name_ptr, print_ptr, print_shape); - })); - } else if (x.dim() == 3) { - AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( - x.scalar_type(), "PrintFloatTensor3D", ([&] { - PrintFloatTensor3D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), x.size(2), x.stride(0), - x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); - })); - } else { - // NOTE(Zihao): I'm just too lazy to do this, codegen for higher - // dimensions should be a better idea - TORCH_CHECK(false, "Input dimension not supported."); - } - cudaError_t status = cudaGetLastError(); - TORCH_CHECK(status == cudaSuccess, - "PrintFloatTensor failed with error " + - std::string(cudaGetErrorString(status))); + if (x.dim() == 1) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintFloatTensor1D", ([&] { + PrintFloatTensor1D<<<1, 1, 0, stream>>>( + x.data_ptr(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape); + })); + } else if (x.dim() == 2) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintFloatTensor2D", ([&] { + PrintFloatTensor2D<<<1, 1, 0, stream>>>( + x.data_ptr(), x.size(0), x.size(1), x.stride(0), x.stride(1), + x.numel(), name_ptr, print_ptr, print_shape); + })); + } else if (x.dim() == 3) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintFloatTensor3D", ([&] { + PrintFloatTensor3D<<<1, 1, 0, stream>>>( + x.data_ptr(), x.size(0), x.size(1), x.size(2), x.stride(0), + x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); + })); } else { - if (x.dim() == 1) { - AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor1D", ([&] { - PrintIntTensor1D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.stride(0), - x.numel(), name_ptr, print_ptr, print_shape); - })); - } else if (x.dim() == 2) { - AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] { - PrintIntTensor2D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), - x.stride(0), x.stride(1), x.numel(), - name_ptr, print_ptr, print_shape); - })); - } else if (x.dim() == 3) { - AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), 
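The unified PrintTensor kernels above address every element through explicit (shape, stride) arithmetic instead of assuming a contiguous layout, which is what lets sliced views such as x[..., 0] print their true contents. For reference, the same traversal written as a host-side Python sketch (strided_2d_items is an illustrative helper, not part of this repo):

    # Mirror of PrintTensor2D's nested loops: element (i0, i1) lives at
    # flat offset i0 * stride[0] + i1 * stride[1].
    def strided_2d_items(flat, shape, stride):
        return [
            [flat[i0 * stride[0] + i1 * stride[1]] for i1 in range(shape[1])]
            for i0 in range(shape[0])
        ]

    # A transposed view of a row-major 2x3 buffer: shape=(3, 2), stride=(1, 3).
    assert strided_2d_items(list(range(6)), (3, 2), (1, 3)) == [[0, 3], [1, 4], [2, 5]]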
"PrintIntTensor3D", ([&] { - PrintIntTensor3D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), - x.size(2), x.stride(0), x.stride(1), - x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); - })); - } else { - // NOTE(Zihao): I'm just too lazy to do this, codegen for higher - // dimensions should be a better idea - TORCH_CHECK(false, "Input dimension not supported."); - } - cudaError_t status = cudaGetLastError(); - TORCH_CHECK(status == cudaSuccess, - "PrintIntTensor failed with error " + - std::string(cudaGetErrorString(status))); + // NOTE(Zihao): I'm just too lazy to do this, codegen for higher + // dimensions should be a better idea + TORCH_CHECK(false, "Input dimension not supported."); } + cudaError_t status = cudaGetLastError(); + TORCH_CHECK(status == cudaSuccess, "PrintTensor failed with error " + std::string(cudaGetErrorString(status))); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { From 9a4b5ffaae55116c731a5b2fe0bbae1af0ab4107 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:18:47 +0800 Subject: [PATCH 38/57] Revert "more" This reverts commit bacaf8c353025b9382f4b79562dfb81ac8547281. --- csrc/debug_print.cu | 84 +++++++++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index e4d3a22..eac027b 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -101,33 +101,67 @@ void PrintTensor(torch::Tensor x, std::optional name_buffer, bool const char* name_ptr = name_buffer.has_value() ? reinterpret_cast(name_buffer->data_ptr()) : nullptr; - if (x.dim() == 1) { - AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( - x.scalar_type(), "PrintFloatTensor1D", ([&] { - PrintFloatTensor1D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape); - })); - } else if (x.dim() == 2) { - AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( - x.scalar_type(), "PrintFloatTensor2D", ([&] { - PrintFloatTensor2D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), x.stride(0), x.stride(1), - x.numel(), name_ptr, print_ptr, print_shape); - })); - } else if (x.dim() == 3) { - AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( - x.scalar_type(), "PrintFloatTensor3D", ([&] { - PrintFloatTensor3D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), x.size(2), x.stride(0), - x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); - })); + if (x.is_floating_point()) { + if (x.dim() == 1) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintFloatTensor1D", ([&] { + PrintFloatTensor1D<<<1, 1, 0, stream>>>( + x.data_ptr(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape); + })); + } else if (x.dim() == 2) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintFloatTensor2D", ([&] { + PrintFloatTensor2D<<<1, 1, 0, stream>>>( + x.data_ptr(), x.size(0), x.size(1), x.stride(0), x.stride(1), + x.numel(), name_ptr, print_ptr, print_shape); + })); + } else if (x.dim() == 3) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintFloatTensor3D", ([&] { + PrintFloatTensor3D<<<1, 1, 0, stream>>>( + x.data_ptr(), x.size(0), x.size(1), x.size(2), x.stride(0), + x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); + })); + } else { + // NOTE(Zihao): I'm just too lazy to do this, codegen for higher + // dimensions should be a better idea + TORCH_CHECK(false, "Input dimension not supported."); + } + cudaError_t status = cudaGetLastError(); + 
+    TORCH_CHECK(status == cudaSuccess,
+                "PrintFloatTensor failed with error " +
+                    std::string(cudaGetErrorString(status)));
   } else {
-    // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
-    // dimensions should be a better idea
-    TORCH_CHECK(false, "Input dimension not supported.");
+    if (x.dim() == 1) {
+      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor1D", ([&] {
+                                   PrintIntTensor1D<<<1, 1, 0, stream>>>(
+                                       x.data_ptr<scalar_t>(), x.stride(0),
+                                       x.numel(), name_ptr, print_ptr, print_shape);
+                                 }));
+    } else if (x.dim() == 2) {
+      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] {
+                                   PrintIntTensor2D<<<1, 1, 0, stream>>>(
+                                       x.data_ptr<scalar_t>(), x.size(0), x.size(1),
+                                       x.stride(0), x.stride(1), x.numel(),
+                                       name_ptr, print_ptr, print_shape);
+                                 }));
+    } else if (x.dim() == 3) {
+      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor3D", ([&] {
+                                   PrintIntTensor3D<<<1, 1, 0, stream>>>(
+                                       x.data_ptr<scalar_t>(), x.size(0), x.size(1),
+                                       x.size(2), x.stride(0), x.stride(1),
+                                       x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
+                                 }));
+    } else {
+      // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
+      // dimensions should be a better idea
+      TORCH_CHECK(false, "Input dimension not supported.");
+    }
+    cudaError_t status = cudaGetLastError();
+    TORCH_CHECK(status == cudaSuccess,
+                "PrintIntTensor failed with error " +
+                    std::string(cudaGetErrorString(status)));
   }
-  cudaError_t status = cudaGetLastError();
-  TORCH_CHECK(status == cudaSuccess, "PrintTensor failed with error " + std::string(cudaGetErrorString(status)));
 }
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

From 9f1e6e4f5b3ecdb92ad43cdc4a974fff23d3141c Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:19:09 +0800
Subject: [PATCH 39/57] more

---
 csrc/debug_print.cu | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index eac027b..86c775e 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -104,21 +104,21 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, bool
   if (x.is_floating_point()) {
     if (x.dim() == 1) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
-          x.scalar_type(), "PrintFloatTensor1D", ([&] {
-            PrintFloatTensor1D<<<1, 1, 0, stream>>>(
+          x.scalar_type(), "PrintTensor1D", ([&] {
+            PrintTensor1D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape);
           }));
     } else if (x.dim() == 2) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
-          x.scalar_type(), "PrintFloatTensor2D", ([&] {
-            PrintFloatTensor2D<<<1, 1, 0, stream>>>(
+          x.scalar_type(), "PrintTensor2D", ([&] {
+            PrintTensor2D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.size(0), x.size(1), x.stride(0), x.stride(1),
                 x.numel(), name_ptr, print_ptr, print_shape);
           }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES(
-          x.scalar_type(), "PrintFloatTensor3D", ([&] {
-            PrintFloatTensor3D<<<1, 1, 0, stream>>>(
+          x.scalar_type(), "PrintTensor3D", ([&] {
+            PrintTensor3D<<<1, 1, 0, stream>>>(
                 x.data_ptr<scalar_t>(), x.size(0), x.size(1), x.size(2), x.stride(0),
                 x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
           }));
@@ -129,25 +129,25 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, bool
     }
     cudaError_t status = cudaGetLastError();
     TORCH_CHECK(status == cudaSuccess,
-                "PrintFloatTensor failed with error " +
+                "PrintTensor failed with error " +
                     std::string(cudaGetErrorString(status)));
   } else {
     if (x.dim() == 1) {
-      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor1D", ([&] {
"PrintIntTensor1D", ([&] { - PrintIntTensor1D<<<1, 1, 0, stream>>>( + AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor1D", ([&] { + PrintTensor1D<<<1, 1, 0, stream>>>( x.data_ptr(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape); })); } else if (x.dim() == 2) { - AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor2D", ([&] { - PrintIntTensor2D<<<1, 1, 0, stream>>>( + AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor2D", ([&] { + PrintTensor2D<<<1, 1, 0, stream>>>( x.data_ptr(), x.size(0), x.size(1), x.stride(0), x.stride(1), x.numel(), name_ptr, print_ptr, print_shape); })); } else if (x.dim() == 3) { - AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintIntTensor3D", ([&] { - PrintIntTensor3D<<<1, 1, 0, stream>>>( + AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor3D", ([&] { + PrintTensor3D<<<1, 1, 0, stream>>>( x.data_ptr(), x.size(0), x.size(1), x.size(2), x.stride(0), x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); @@ -159,7 +159,7 @@ void PrintTensor(torch::Tensor x, std::optional name_buffer, bool } cudaError_t status = cudaGetLastError(); TORCH_CHECK(status == cudaSuccess, - "PrintIntTensor failed with error " + + "PrintTensor failed with error " + std::string(cudaGetErrorString(status))); } } From 821eb26d17b5be4ea55aa3ee713dc4a5febab20c Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:20:35 +0800 Subject: [PATCH 40/57] more --- csrc/debug_print.cu | 49 +++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 86c775e..3ceab93 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -106,21 +106,28 @@ void PrintTensor(torch::Tensor x, std::optional name_buffer, bool AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( x.scalar_type(), "PrintTensor1D", ([&] { PrintTensor1D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.stride(0), x.numel(), name_ptr, print_ptr, print_shape); + x.data_ptr(), + x.size(0), x.stride(0), + name_ptr, print_ptr, print_shape + ); })); } else if (x.dim() == 2) { AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( x.scalar_type(), "PrintTensor2D", ([&] { PrintTensor2D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), x.stride(0), x.stride(1), - x.numel(), name_ptr, print_ptr, print_shape); + x.data_ptr(), + x.size(0), x.size(1), x.stride(0), x.stride(1), + name_ptr, print_ptr, print_shape + ); })); } else if (x.dim() == 3) { AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( x.scalar_type(), "PrintTensor3D", ([&] { PrintTensor3D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), x.size(2), x.stride(0), - x.stride(1), x.stride(2), x.numel(), name_ptr, print_ptr, print_shape); + x.data_ptr(), + x.size(0), x.size(1), x.size(2), x.stride(0), x.stride(1), x.stride(2), + name_ptr, print_ptr, print_shape + ); })); } else { // NOTE(Zihao): I'm just too lazy to do this, codegen for higher @@ -134,24 +141,28 @@ void PrintTensor(torch::Tensor x, std::optional name_buffer, bool } else { if (x.dim() == 1) { AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor1D", ([&] { - PrintTensor1D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.stride(0), - x.numel(), name_ptr, print_ptr, print_shape); - })); + PrintTensor1D<<<1, 1, 0, stream>>>( + x.data_ptr(), + x.size(0), x.stride(0), + name_ptr, print_ptr, print_shape + ); + })); } else if (x.dim() == 2) { AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor2D", ([&] { - PrintTensor2D<<<1, 1, 0, stream>>>( - x.data_ptr(), x.size(0), x.size(1), - 
-                                       x.stride(0), x.stride(1), x.numel(),
-                                       name_ptr, print_ptr, print_shape);
-                                 }));
+        PrintTensor2D<<<1, 1, 0, stream>>>(
+            x.data_ptr<scalar_t>(),
+            x.size(0), x.size(1), x.stride(0), x.stride(1),
+            name_ptr, print_ptr, print_shape
+        );
+      }));
     } else if (x.dim() == 3) {
       AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor3D", ([&] {
-                                   PrintTensor3D<<<1, 1, 0, stream>>>(
-                                       x.data_ptr<scalar_t>(), x.size(0), x.size(1),
-                                       x.size(2), x.stride(0), x.stride(1),
-                                       x.stride(2), x.numel(), name_ptr, print_ptr, print_shape);
-                                 }));
+        PrintTensor3D<<<1, 1, 0, stream>>>(
+            x.data_ptr<scalar_t>(),
+            x.size(0), x.size(1), x.size(2), x.stride(0), x.stride(1), x.stride(2),
+            name_ptr, print_ptr, print_shape
+        );
+      }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
       // dimensions should be a better idea

From 8d619e621deb5f474db59e07962ae7d32f7e9fe7 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:21:21 +0800
Subject: [PATCH 41/57] more

---
 csrc/debug_print.cu | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 3ceab93..0363f20 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -46,9 +46,11 @@ __global__ void PrintTensor1D(
   if (print_shape) {
     printf("shape=(%d), stride=(%d)", (int) shape_0, (int) stride_0);
   }
+  printf("[");
   for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
     PrintElem(x[index_0 * stride_0]);
   }
+  printf("]");
   printf("\n");
 }
 
@@ -63,12 +65,15 @@ __global__ void PrintTensor2D(
   if (print_shape) {
     printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, (int) shape_1, (int) stride_0, (int) stride_1);
   }
+  printf("[");
   for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
+    printf("[");
     for (size_t index_1 = 0; index_1 < shape_1; ++index_1) {
       PrintElem(x[index_0 * stride_0 + index_1 * stride_1]);
     }
-    printf("; ");
+    printf("]");
   }
+  printf("]");
   printf("\n");
 }
 
@@ -83,15 +88,19 @@ __global__ void PrintTensor3D(
   if (print_shape) {
     printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2);
   }
+  printf("[");
   for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
+    printf("[");
     for (size_t index_1 = 0; index_1 < shape_1; ++index_1) {
+      printf("[");
      for (size_t index_2 = 0; index_2 < shape_2; ++index_2) {
        PrintElem(x[index_0 * stride_0 + index_1 * stride_1]);
      }
-      printf("; ");
+      printf("]");
    }
-    printf("; ");
+    printf("]");
   }
+  printf("]");
   printf("\n");
 }
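With the brackets added above, device-side output nests the same way Python lists nest (modulo trailing separators), so a kernel print can be eyeballed against a host-side reference. An illustrative cross-check only:

    import torch

    t = torch.tensor([[1, 2, 3], [3, 4, 5]], dtype=torch.int32)
    # tolist() produces the nesting that the kernel's brackets encode
    print(t.tolist())  # [[1, 2, 3], [3, 4, 5]]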
name="for 3D") + print("start warmup...") s = torch.cuda.Stream() s.wait_stream(torch.cuda.current_stream()) @@ -22,7 +33,6 @@ z1 = z @ y z2 = z1 @ y - print("start graph capture...") g = torch.cuda.CUDAGraph() with torch.cuda.graph(g, stream=s): From 4fcee3d3b7a7fa0c15f6d9a0aeec88d166cebd9b Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:23:25 +0800 Subject: [PATCH 43/57] more --- csrc/debug_print.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 0363f20..51ba3bd 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -27,9 +27,9 @@ __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr, template __device__ void PrintElem(scalar_t value) { if constexpr (std::is_floating_point::value) { - printf("%.4f, ", float(x[i * stride_0])); + printf("%.4f, ", float(value)); } else if constexpr (std::is_integral::value) { - printf("%lld, ", static_cast(x[i * stride_0])); + printf("%lld, ", static_cast(value)); } else { printf("?, "); } From cf18d097434b5bfb951d2e133e118bee3bac7f25 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:24:03 +0800 Subject: [PATCH 44/57] more --- csrc/debug_print.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 51ba3bd..5cdcaa4 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -71,7 +71,7 @@ __global__ void PrintTensor2D( for (size_t index_1 = 0; index_1 < shape_1; ++index_1) { PrintElem(x[index_0 * stride_0 + index_1 * stride_1]); } - printf("]"); + printf("], "); } printf("]"); printf("\n"); @@ -96,9 +96,9 @@ __global__ void PrintTensor3D( for (size_t index_2 = 0; index_2 < shape_2; ++index_2) { PrintElem(x[index_0 * stride_0 + index_1 * stride_1]); } - printf("]"); + printf("], "); } - printf("]"); + printf("], "); } printf("]"); printf("\n"); From 8a521360e30f21e11c3f20ef1a13f7cb9bdd486b Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:24:15 +0800 Subject: [PATCH 45/57] more --- csrc/debug_print.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 5cdcaa4..220e67c 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -15,7 +15,7 @@ TYPE, NAME, \ AT_DISPATCH_CASE_FLOATING_AND_REDUCED_FLOATING_TYPES(__VA_ARGS__)) -__device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr, const bool print_shape) { +__device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr) { if (name_ptr != nullptr) { printf("name: %s\n", name_ptr); } From 768d952c452c30222cfb54054cf701d9857872f0 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 17:26:38 +0800 Subject: [PATCH 46/57] more --- debug_print/__init__.py | 1 + example.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/debug_print/__init__.py b/debug_print/__init__.py index b38ad3b..14f369a 100644 --- a/debug_print/__init__.py +++ b/debug_print/__init__.py @@ -41,6 +41,7 @@ def post_initialize(self): self._pending_copy_tasks.clear() def __call__(self, x: torch.Tensor, name: str, print_ptr: bool, print_shape: bool): + assert x.is_cuda, f"{x.device} must be on cuda" name_buffer_gpu = self._compute_name_buffer_gpu(name=name, device_index=x.device.index) _print_tensor_kernel(x, name_buffer_gpu, print_ptr, print_shape) diff --git a/example.py b/example.py index a74c067..0a311f5 100644 --- a/example.py +++ b/example.py @@ -12,15 +12,16 @@ 
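Because the print is an ordinary stream-ordered kernel launch, it can be captured into a CUDA graph and fires again on every replay; staging names in a long-lived device buffer (rather than passing a host string) is what keeps the pointer baked into the capture valid. A condensed usage sketch modeled on example.py, assuming debug_print has already been initialized:

    import torch
    import debug_print

    x = torch.randn(16, 16, device="cuda:0")

    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):  # warmup, as example.py does before capture
        y = x @ x

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g, stream=s):
        z = x @ x
        debug_print.print_tensor(z, name="z inside graph")

    g.replay()  # the captured print runs on each replay
    torch.cuda.synchronize()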
From 768d952c452c30222cfb54054cf701d9857872f0 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:26:38 +0800
Subject: [PATCH 46/57] more

---
 debug_print/__init__.py |  1 +
 example.py              | 15 ++++++++-------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index b38ad3b..14f369a 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -41,6 +41,7 @@ def post_initialize(self):
         self._pending_copy_tasks.clear()
 
     def __call__(self, x: torch.Tensor, name: str, print_ptr: bool, print_shape: bool):
+        assert x.is_cuda, f"{x.device} must be on cuda"
         name_buffer_gpu = self._compute_name_buffer_gpu(name=name, device_index=x.device.index)
         _print_tensor_kernel(x, name_buffer_gpu, print_ptr, print_shape)

diff --git a/example.py b/example.py
index a74c067..0a311f5 100644
--- a/example.py
+++ b/example.py
@@ -12,15 +12,16 @@
 debug_print.print_tensor(x[0:1, 1:3, 0:4])
 
 print("demo for all types...")
-debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32), name="for int32")
-debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int64), name="for int64")
-debug_print.print_tensor(torch.tensor([1.5, 2.5, 3.5], dtype=torch.float), name="for float")
+debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32, device="cuda:0"), name="for int32")
+debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int64, device="cuda:0"), name="for int64")
+debug_print.print_tensor(torch.tensor([1.5, 2.5, 3.5], dtype=torch.float, device="cuda:0"), name="for float")
 
 print("demo for all dims...")
-debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32), name="for 1D")
-debug_print.print_tensor(torch.tensor([[1, 2, 3], [3, 4, 5]], dtype=torch.int32), name="for 2D")
-debug_print.print_tensor(torch.tensor([[[1, 2, 3], [3, 4, 5]], [[10, 20, 30], [30, 40, 50]]], dtype=torch.int32),
-                         name="for 3D")
+debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32, device="cuda:0"), name="for 1D")
+debug_print.print_tensor(torch.tensor([[1, 2, 3], [3, 4, 5]], dtype=torch.int32, device="cuda:0"), name="for 2D")
+debug_print.print_tensor(
+    torch.tensor([[[1, 2, 3], [3, 4, 5]], [[10, 20, 30], [30, 40, 50]]], dtype=torch.int32, device="cuda:0"),
+    name="for 3D")
 
 print("start warmup...")
 s = torch.cuda.Stream()

From 567f62d9bbd1a0843137681973c6bb16ee22b11f Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:27:42 +0800
Subject: [PATCH 47/57] more

---
 csrc/debug_print.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 220e67c..b337aa8 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -31,7 +31,7 @@ __device__ void PrintElem(scalar_t value) {
   } else if constexpr (std::is_integral<scalar_t>::value) {
     printf("%lld, ", static_cast<int64_t>(value));
   } else {
-    printf("?, ");
+    static_assert(false, "unsupported scalar_t type");
   }
 }

From 1610757e554e88cded33c82e1db4b41c5a25c964 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:28:49 +0800
Subject: [PATCH 48/57] more

---
 csrc/debug_print.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index b337aa8..5befa41 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -24,6 +24,9 @@ __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr)
   }
 }
 
+template <typename T>
+struct always_false : std::false_type {};
+
 template <typename scalar_t>
 __device__ void PrintElem(scalar_t value) {
   if constexpr (std::is_floating_point<scalar_t>::value) {
@@ -31,7 +34,7 @@ __device__ void PrintElem(scalar_t value) {
   } else if constexpr (std::is_integral<scalar_t>::value) {
     printf("%lld, ", static_cast<int64_t>(value));
   } else {
-    static_assert(false, "unsupported scalar_t type");
+    static_assert(always_false<scalar_t>::value, "PrintElem: unsupported scalar_t type");
   }
 }

From c79d34698711fb1d15232ffd79f140596afb5951 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:30:24 +0800
Subject: [PATCH 49/57] more

---
 csrc/debug_print.cu | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index 5befa41..cc66caa 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -24,12 +24,21 @@ __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr)
   }
 }
 
+template <typename T>
+struct is_my_floating_point : std::is_floating_point<T> {};
+
+template <>
+struct is_my_floating_point<c10::Half> : std::true_type {};
+
+template <>
+struct is_my_floating_point<c10::BFloat16> : std::true_type {};
+
 template <typename T>
 struct always_false : std::false_type {};
 
 template <typename scalar_t>
 __device__ void PrintElem(scalar_t value) {
-  if constexpr (std::is_floating_point<scalar_t>::value) {
+  if constexpr (is_my_floating_point<scalar_t>::value) {
     printf("%.4f, ", float(value));
   } else if constexpr (std::is_integral<scalar_t>::value) {
     printf("%lld, ", static_cast<int64_t>(value));

From 2288462651555d2111016142cb7e4622e6484b82 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:32:34 +0800
Subject: [PATCH 50/57] more

---
 csrc/debug_print.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index cc66caa..d34972f 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -106,7 +106,7 @@ __global__ void PrintTensor3D(
     for (size_t index_1 = 0; index_1 < shape_1; ++index_1) {
       printf("[");
       for (size_t index_2 = 0; index_2 < shape_2; ++index_2) {
-        PrintElem(x[index_0 * stride_0 + index_1 * stride_1]);
+        PrintElem(x[index_0 * stride_0 + index_1 * stride_1 + index_2 * stride_2]);
       }
       printf("], ");

From e7c6395f2ce13b9ac4359849171de0c8c962e6bc Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:33:53 +0800
Subject: [PATCH 51/57] more

---
 csrc/debug_print.cu | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu
index d34972f..27c1b0d 100644
--- a/csrc/debug_print.cu
+++ b/csrc/debug_print.cu
@@ -17,10 +17,10 @@
 __device__ void PrintCommon(void* x, const char* name_ptr, const bool print_ptr) {
   if (name_ptr != nullptr) {
-    printf("name: %s\n", name_ptr);
+    printf("name=%s, ", name_ptr);
   }
   if (print_ptr) {
-    printf("addr: %lld\n", x);
+    printf("addr=%lld, ", x);
   }
 }
@@ -58,12 +58,11 @@ __global__ void PrintTensor1D(
   if (print_shape) {
     printf("shape=(%d), stride=(%d)", (int) shape_0, (int) stride_0);
   }
-  printf("[");
+  printf("\n[");
   for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
     PrintElem(x[index_0 * stride_0]);
   }
-  printf("]");
-  printf("\n");
+  printf("]\n");
 }
@@ -76,8 +75,7 @@ __global__ void PrintTensor2D(
   if (print_shape) {
     printf("shape=(%d, %d), stride=(%d, %d)", (int) shape_0, (int) shape_1, (int) stride_0, (int) stride_1);
   }
-  printf("[");
+  printf("\n[");
   for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
     printf("[");
     for (size_t index_1 = 0; index_1 < shape_1; ++index_1) {
       PrintElem(x[index_0 * stride_0 + index_1 * stride_1]);
     }
     printf("], ");
   }
-  printf("]");
-  printf("\n");
+  printf("]\n");
 }
@@ -98,8 +96,7 @@ __global__ void PrintTensor3D(
   if (print_shape) {
     printf("shape=(%d, %d, %d), stride=(%d, %d, %d)", (int) shape_0, (int) shape_1, (int) shape_2, (int) stride_0, (int) stride_1, (int) stride_2);
   }
-  printf("[");
+  printf("\n[");
   for (size_t index_0 = 0; index_0 < shape_0; ++index_0) {
     printf("[");
     for (size_t index_1 = 0; index_1 < shape_1; ++index_1) {
       printf("[");
       for (size_t index_2 = 0; index_2 < shape_2; ++index_2) {
         PrintElem(x[index_0 * stride_0 + index_1 * stride_1 + index_2 * stride_2]);
       }
       printf("], ");
     }
     printf("], ");
   }
-  printf("]");
-  printf("\n");
+  printf("]\n");
 }

From 7e6e9e62fba068ca867278af1e2b7aaca0f1c396 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 17:34:41 +0800
Subject: [PATCH 52/57] more

---
 example.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/example.py b/example.py
index 0a311f5..b0a7d3c 100644
--- a/example.py
+++ b/example.py
@@ -12,16 +12,16 @@
 debug_print.print_tensor(x[0:1, 1:3, 0:4])
 
 print("demo for all types...")
-debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32, device="cuda:0"), name="for int32")
-debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int64, device="cuda:0"), name="for int64")
-debug_print.print_tensor(torch.tensor([1.5, 2.5, 3.5], dtype=torch.float, device="cuda:0"), name="for float")
+debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32, device="cuda:0"), name="for int32", print_shape=True, print_ptr=True)
+debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int64, device="cuda:0"), name="for int64", print_shape=True, print_ptr=True)
+debug_print.print_tensor(torch.tensor([1.5, 2.5, 3.5], dtype=torch.float, device="cuda:0"), name="for float", print_shape=True, print_ptr=True)
 
 print("demo for all dims...")
-debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32, device="cuda:0"), name="for 1D")
-debug_print.print_tensor(torch.tensor([[1, 2, 3], [3, 4, 5]], dtype=torch.int32, device="cuda:0"), name="for 2D")
+debug_print.print_tensor(torch.tensor([3, 4, 5], dtype=torch.int32, device="cuda:0"), name="for 1D", print_shape=True, print_ptr=True)
+debug_print.print_tensor(torch.tensor([[1, 2, 3], [3, 4, 5]], dtype=torch.int32, device="cuda:0"), name="for 2D", print_shape=True, print_ptr=True)
 debug_print.print_tensor(
     torch.tensor([[[1, 2, 3], [3, 4, 5]], [[10, 20, 30], [30, 40, 50]]], dtype=torch.int32, device="cuda:0"),
-    name="for 3D")
+    name="for 3D", print_shape=True, print_ptr=True)
@@ -42,7 +42,7 @@
     z = x @ y
     debug_print.print_tensor(z)
     z1 = z @ y
-    debug_print.print_tensor(z1[..., 0], name="This is name for part of z1")
+    debug_print.print_tensor(z1[..., 0], name="This is name for part of z1", print_shape=True, print_ptr=True)
     z2 = z1 @ y
     debug_print.print_tensor(z2, name="This is name for z2")

From f33a38c3a2baa2151e45872b090ad0addfcd9dce Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 18:29:08 +0800
Subject: [PATCH 53/57] more

---
 debug_print/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index 14f369a..fc46ca4 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -31,7 +31,8 @@ def __init__(self):
         # Can be optimized
         self._buffers: Dict[int, _Buffer] = {
             device_index: _Buffer(device_index=device_index)
-            for device_index in range(torch.cuda.device_count())
+            # for device_index in range(torch.cuda.device_count())
+            for device_index in [torch.cuda.current_device()]
         }
         self._pending_copy_tasks: List[_CopyTask] = []
 
@@ -75,5 +76,5 @@ def post_initialize():
     _printer.post_initialize()
 
 
-def print_tensor(x: torch.Tensor, name: str = "", print_ptr: bool = False, print_shape: bool = False):
+def print_tensor(x: torch.Tensor, name: str = "", print_ptr: bool = True, print_shape: bool = True):
     _printer(x=x, name=name, print_ptr=print_ptr, print_shape=print_shape)
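With the defaults flipped to True, pointer and shape/stride metadata become opt-out rather than opt-in. Illustrative call sites against the new signature:

    debug_print.print_tensor(x)  # prints addr= and shape=/stride= by default
    debug_print.print_tensor(x, name="x", print_ptr=False, print_shape=False)  # values only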
initialized") + return _printer = _DebugPrinter() From 29bf99284f1d6f78e8edbd70ef93b3535b7093e4 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 19:37:55 +0800 Subject: [PATCH 55/57] more --- debug_print/__init__.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/debug_print/__init__.py b/debug_print/__init__.py index c4db2fa..b2865ac 100644 --- a/debug_print/__init__.py +++ b/debug_print/__init__.py @@ -27,13 +27,12 @@ def execute(self): class _DebugPrinter: - def __init__(self): + def __init__(self, device_id: Optional[int]): + if device_id is None: + device_id = torch.cuda.current_device() + # Can be optimized - self._buffers: Dict[int, _Buffer] = { - device_index: _Buffer(device_index=device_index) - # for device_index in range(torch.cuda.device_count()) - for device_index in [torch.cuda.current_device()] - } + self._buffers: Dict[int, _Buffer] = {device_id: _Buffer(device_index=device_id)} self._pending_copy_tasks: List[_CopyTask] = [] def post_initialize(self): @@ -66,12 +65,12 @@ def _compute_name_buffer_gpu(self, name: str, device_index: int): _printer: Optional[_DebugPrinter] = None -def initialize(): +def initialize(device_id: int): global _printer if _printer is not None: print("debug_print.initialize skip since already initialized") return - _printer = _DebugPrinter() + _printer = _DebugPrinter(device_id=device_id) def post_initialize(): From 7aca9d8a8ba96976449bd946798e67120cd45fe2 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Thu, 26 Jun 2025 19:43:40 +0800 Subject: [PATCH 56/57] more --- csrc/debug_print.cu | 47 +++++++++++++++++++++++++++++++++++++++++++++ example.py | 8 ++++++++ 2 files changed, 55 insertions(+) diff --git a/csrc/debug_print.cu b/csrc/debug_print.cu index 27c1b0d..40ccbc5 100644 --- a/csrc/debug_print.cu +++ b/csrc/debug_print.cu @@ -113,6 +113,36 @@ __global__ void PrintTensor3D( printf("]\n"); } +template +__global__ void PrintTensor4D( + float_t *__restrict__ x, + const size_t shape_0, const size_t shape_1, const size_t shape_2, const size_t shape_3, + const size_t stride_0, const size_t stride_1, const size_t stride_2, const size_t stride_3, + const char* name_ptr, const bool print_ptr, const bool print_shape +) { + PrintCommon(x, name_ptr, print_ptr); + if (print_shape) { + printf("shape=(%d, %d, %d, %d), stride=(%d, %d, %d, %d)", (int) shape_0, (int) shape_1, (int) shape_2, (int) shape_3, (int) stride_0, (int) stride_1, (int) stride_2, (int) stride_3); + } + printf("\n["); + for (size_t index_0 = 0; index_0 < shape_0; ++index_0) { + printf("["); + for (size_t index_1 = 0; index_1 < shape_1; ++index_1) { + printf("["); + for (size_t index_2 = 0; index_2 < shape_2; ++index_2) { + printf("["); + for (size_t index_3 = 0; index_3 < shape_3; ++index_3) { + PrintElem(x[index_0 * stride_0 + index_1 * stride_1 + index_2 * stride_2 + index_3 * stride_3]); + } + printf("], "); + } + printf("], "); + } + printf("], "); + } + printf("]\n"); +} + void PrintTensor(torch::Tensor x, std::optional name_buffer, bool print_ptr, bool print_shape) { cudaStream_t stream = c10::cuda::getCurrentCUDAStream(x.device().index()); TORCH_CHECK(x.is_cuda(), "The input tensor should be a CUDA tensor"); @@ -147,6 +177,15 @@ void PrintTensor(torch::Tensor x, std::optional name_buffer, bool name_ptr, print_ptr, print_shape ); })); + } else if (x.dim() == 4) { + AT_DISPATCH_FLOATING_AND_REDUCED_FLOATING_TYPES( + x.scalar_type(), "PrintTensor4D", ([&] { + PrintTensor4D<<<1, 1, 0, stream>>>( + x.data_ptr(), + x.size(0), x.size(1), x.size(2), 
+              x.size(0), x.size(1), x.size(2), x.size(3), x.stride(0), x.stride(1), x.stride(2), x.stride(3),
+              name_ptr, print_ptr, print_shape
+            );
+          }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
       // dimensions should be a better idea
@@ -181,6 +220,14 @@ void PrintTensor(torch::Tensor x, std::optional<torch::Tensor> name_buffer, bool
           name_ptr, print_ptr, print_shape
         );
       }));
+    } else if (x.dim() == 4) {
+      AT_DISPATCH_INTEGRAL_TYPES(x.scalar_type(), "PrintTensor4D", ([&] {
+        PrintTensor4D<<<1, 1, 0, stream>>>(
+          x.data_ptr<scalar_t>(),
+          x.size(0), x.size(1), x.size(2), x.size(3), x.stride(0), x.stride(1), x.stride(2), x.stride(3),
+          name_ptr, print_ptr, print_shape
+        );
+      }));
     } else {
       // NOTE(Zihao): I'm just too lazy to do this, codegen for higher
       // dimensions should be a better idea

diff --git a/example.py b/example.py
index b0a7d3c..b24ba69 100644
--- a/example.py
+++ b/example.py
@@ -22,6 +22,14 @@
 debug_print.print_tensor(
     torch.tensor([[[1, 2, 3], [3, 4, 5]], [[10, 20, 30], [30, 40, 50]]], dtype=torch.int32, device="cuda:0"),
     name="for 3D", print_shape=True, print_ptr=True)
+debug_print.print_tensor(
+    torch.tensor(
+        [
+            [[[1, 2, 3], [3, 4, 5]], [[10, 20, 30], [30, 40, 50]]],
+            [[[-1, -2, -3], [-3, -4, -5]], [[-10, -20, -30], [-30, -40, -50]]],
+        ],
+        dtype=torch.int32, device="cuda:0"),
+    name="for 4D", print_shape=True, print_ptr=True)
 
 print("start warmup...")
 s = torch.cuda.Stream()

From 797597a3b79153057ce6fa2dfccf10128f2b31ad Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Thu, 26 Jun 2025 19:45:14 +0800
Subject: [PATCH 57/57] more

---
 debug_print/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/debug_print/__init__.py b/debug_print/__init__.py
index b2865ac..d6b71ed 100644
--- a/debug_print/__init__.py
+++ b/debug_print/__init__.py
@@ -65,7 +65,7 @@
 _printer: Optional[_DebugPrinter] = None
 
 
-def initialize(device_id: int):
+def initialize(device_id: Optional[int] = None):
     global _printer
     if _printer is not None:
         print("debug_print.initialize skip since already initialized")
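The series thus ends with a module that can print 1D-4D float and integer CUDA tensors with optional names, and that initializes its staging buffer on a single chosen device. An end-to-end sketch against this final state:

    import torch
    import debug_print

    debug_print.initialize()  # device_id=None falls back to torch.cuda.current_device()

    x = torch.arange(16, dtype=torch.int32, device="cuda").reshape(2, 2, 2, 2)
    debug_print.print_tensor(x, name="for 4D")  # print_ptr/print_shape default to True
    torch.cuda.synchronize()  # flush device-side printf output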