[NVPTX][NFC] Rearrange the TMA-S2G intrinsics #144903
This patch moves the TMA S2G intrinsics into their own set of loops. This is in preparation for adding im2colw/w128 modes to the G2S intrinsics (the S2G intrinsics do not support those modes).

Signed-off-by: Durgadoss R <[email protected]>
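For reference, a minimal LLVM IR sketch of the two intrinsic families being regrouped, with the argument layout read off the TableGen definitions in the diff below; the value names are illustrative, not part of the patch:

```llvm
; Shared-to-global TMA tensor copy, 3-D tile mode: shared-memory source,
; tensormap pointer, one i32 per tensor dimension, a cache-hint value,
; and an i1 flag selecting whether the hint is applied.
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.3d(
    ptr addrspace(3) %src, ptr %tensormap,
    i32 %d0, i32 %d1, i32 %d2,
    i64 %cache_hint, i1 %use_cache_hint)

; The reduction variants share the same argument layout; only the name
; carries the reduction op (add/min/max/inc/dec/and/or/xor) and the mode.
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.3d(
    ptr addrspace(3) %src, ptr %tensormap,
    i32 %d0, i32 %d1, i32 %d2,
    i64 %cache_hint, i1 %use_cache_hint)
```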
@llvm/pr-subscribers-llvm-ir

Author: Durgadoss R (durga4github)

Changes

This patch moves the TMA S2G intrinsics into their own set of loops.

Full diff: https://github.com/llvm/llvm-project/pull/144903.diff

1 file affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 410a0dea2bf57..0375f29ad8906 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -2013,9 +2013,36 @@ class DefaultAttrsIntrinsicFlags<list<LLVMType> ret_types,
!foreach(i, !range(flags),
ImmArg<ArgIndex<!add(i, !size(param_types))>>))>;
-// Intrinsics for Tensor Copy using TMA
-// G2S -> From Global to Shared memory variants
-// S2G -> From Shared to Global memory variants
+// TMA Tensor Copy Intrinsics: S2G -> From Shared to Global memory variants
+foreach dim = 1...5 in {
+ defvar tensor_dim_args = !listsplat(llvm_i32_ty, dim);
+ foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
+ def int_nvvm_cp_async_bulk_tensor_s2g_ # mode # _ # dim # d :
+ DefaultAttrsIntrinsicFlags<[],
+ !listconcat([llvm_shared_ptr_ty, // src_smem_ptr
+ llvm_ptr_ty], // tensormap_ptr
+ tensor_dim_args, // actual tensor dims
+ [llvm_i64_ty]), // cache_hint
+ [llvm_i1_ty], // Flag for cache_hint
+ [IntrConvergent,
+ ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
+ NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
+
+ // Intrinsics for TMA Copy with reduction
+ foreach red_op = ["add", "min", "max", "inc", "dec", "and", "or", "xor"] in
+ def int_nvvm_cp_async_bulk_tensor_reduce_ # red_op # _ # mode # _ # dim # d :
+ DefaultAttrsIntrinsicFlags<[],
+ !listconcat([llvm_shared_ptr_ty, // src_smem_ptr
+ llvm_ptr_ty], // tensormap_ptr
+ tensor_dim_args, // actual tensor dims
+ [llvm_i64_ty]), // cache_hint
+ [llvm_i1_ty], // Flag for cache_hint
+ [IntrConvergent, ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
+ NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
+ }
+}
+
+// TMA Tensor Copy Intrinsics: G2S -> From Global to Shared memory variants
foreach dim = 1...5 in {
defvar tensor_dim_args = !listsplat(llvm_i32_ty, dim);
@@ -2045,17 +2072,6 @@ foreach dim = 1...5 in {
def int_nvvm_cp_async_bulk_tensor_g2s_ # mode # _ # dim # d :
DefaultAttrsIntrinsicFlags<[], g2s_params, g2s_flags, g2s_props>;
- def int_nvvm_cp_async_bulk_tensor_s2g_ # mode # _ # dim # d :
- DefaultAttrsIntrinsicFlags<[],
- !listconcat([llvm_shared_ptr_ty, // src_smem_ptr
- llvm_ptr_ty], // tensormap_ptr
- tensor_dim_args, // actual tensor dims
- [llvm_i64_ty]), // cache_hint
- [llvm_i1_ty], // Flag for cache_hint
- [IntrConvergent,
- ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
- NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
-
def int_nvvm_cp_async_bulk_tensor_prefetch_ # mode # _ # dim # d :
DefaultAttrsIntrinsicFlags<[],
!listconcat([llvm_ptr_ty], // tensormap_ptr
@@ -2065,18 +2081,6 @@ foreach dim = 1...5 in {
[llvm_i1_ty], // Flag for cache_hint
[IntrConvergent,
ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
-
- // Intrinsics for TMA Copy with reduction
- foreach red_op = ["add", "min", "max", "inc", "dec", "and", "or", "xor"] in
- def int_nvvm_cp_async_bulk_tensor_reduce_ # red_op # _ # mode # _ # dim # d :
- DefaultAttrsIntrinsicFlags<[],
- !listconcat([llvm_shared_ptr_ty, // src_smem_ptr
- llvm_ptr_ty], // tensormap_ptr
- tensor_dim_args, // actual tensor dims
- [llvm_i64_ty]), // cache_hint
- [llvm_i1_ty], // Flag for cache_hint
- [IntrConvergent, ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
- NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
}
}
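With this rearrangement, the S2G loop expands to tile variants for dimensions 1–5 and, due to the `!if(!ge(dim, 3), ...)` guard, im2col variants only for dimensions 3–5, each paired with the eight reduce variants. A sketch of a call site, assuming the tile.3d variant and illustrative value names:

```llvm
; Copy a 3-D tile from shared memory back to the global tensor described
; by %tmap; the cache-hint flag is 0, so the i64 hint operand is ignored.
call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.3d(
    ptr addrspace(3) %src, ptr %tmap,
    i32 %x, i32 %y, i32 %z,
    i64 0, i1 0)
```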
@llvm/pr-subscribers-backend-nvptx

Author: Durgadoss R (durga4github)

Changes

This patch moves the TMA S2G intrinsics into their own set of loops.

Full diff: https://github.com/llvm/llvm-project/pull/144903.diff