From c0b57f98288bda97999f2a7c919f3929ee3dfe1d Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 10:52:55 -0700 Subject: [PATCH 01/19] [SYCL][NFC] Prepare algorithms for non-uniformity To avoid duplicating logic and introducing even more overloads of the group algorithms, it is desirable to move some of the implementation details into the detail::spirv namespace. This commit makes a few changes to enable that to happen: - spirv:: functions with a Group template now take a group object, to enable run-time information (e.g. group membership) to pass through. - ControlBarrier and the OpGroup* instruction used to implement reduce/scan now forward to spirv::, similar to other group functions and algorithms. - The calc helper used to map functors to SPIR-V instructions is updated to use the new spirv:: functions, instead of calling __spirv intrinsics. Signed-off-by: John Pennycook --- sycl/include/sycl/detail/spirv.hpp | 80 +++++++++++++++++---- sycl/include/sycl/detail/type_traits.hpp | 23 ++++++ sycl/include/sycl/ext/oneapi/functional.hpp | 25 ++----- sycl/include/sycl/ext/oneapi/sub_group.hpp | 15 ++-- sycl/include/sycl/group_algorithm.hpp | 37 +++++----- sycl/include/sycl/group_barrier.hpp | 25 ++----- 6 files changed, 124 insertions(+), 81 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 2da8ee1a7db90..7b46609931176 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -94,11 +95,11 @@ void GenericCall(const Functor &ApplyToBytes) { } } -template bool GroupAll(bool pred) { +template bool GroupAll(Group g, bool pred) { return __spirv_GroupAll(group_scope::value, pred); } -template bool GroupAny(bool pred) { +template bool GroupAny(Group g, bool pred) { return __spirv_GroupAny(group_scope::value, pred); } @@ -157,7 +158,7 @@ template <> struct GroupId<::sycl::ext::oneapi::sub_group> { using type = uint32_t; }; template -EnableIfNativeBroadcast GroupBroadcast(T x, IdT local_id) { +EnableIfNativeBroadcast GroupBroadcast(Group, T x, IdT local_id) { using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(local_id); using OCLT = detail::ConvertToOpenCLType_t; @@ -168,14 +169,14 @@ EnableIfNativeBroadcast GroupBroadcast(T x, IdT local_id) { return __spirv_GroupBroadcast(group_scope::value, OCLX, OCLId); } template -EnableIfBitcastBroadcast GroupBroadcast(T x, IdT local_id) { +EnableIfBitcastBroadcast GroupBroadcast(Group g, T x, IdT local_id) { using BroadcastT = ConvertToNativeBroadcastType_t; auto BroadcastX = bit_cast(x); - BroadcastT Result = GroupBroadcast(BroadcastX, local_id); + BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id); return bit_cast(Result); } template -EnableIfGenericBroadcast GroupBroadcast(T x, IdT local_id) { +EnableIfGenericBroadcast GroupBroadcast(Group g, T x, IdT local_id) { // Initialize with x to support type T without default constructor T Result = x; char *XBytes = reinterpret_cast(&x); @@ -183,7 +184,7 @@ EnableIfGenericBroadcast GroupBroadcast(T x, IdT local_id) { auto BroadcastBytes = [=](size_t Offset, size_t Size) { uint64_t BroadcastX, BroadcastResult; std::memcpy(&BroadcastX, XBytes + Offset, Size); - BroadcastResult = GroupBroadcast(BroadcastX, local_id); + BroadcastResult = GroupBroadcast(g, BroadcastX, local_id); std::memcpy(ResultBytes + Offset, &BroadcastResult, Size); }; GenericCall(BroadcastBytes); @@ -192,9 +193,10 @@ EnableIfGenericBroadcast GroupBroadcast(T x, IdT local_id) { // Broadcast with vector local index template -EnableIfNativeBroadcast GroupBroadcast(T x, id local_id) { +EnableIfNativeBroadcast GroupBroadcast(Group g, T x, + id local_id) { if (Dimensions == 1) { - return GroupBroadcast(x, local_id[0]); + return GroupBroadcast(g, x, local_id[0]); } using IdT = vec; using OCLT = detail::ConvertToOpenCLType_t; @@ -209,16 +211,18 @@ EnableIfNativeBroadcast GroupBroadcast(T x, id local_id) { return __spirv_GroupBroadcast(group_scope::value, OCLX, OCLId); } template -EnableIfBitcastBroadcast GroupBroadcast(T x, id local_id) { +EnableIfBitcastBroadcast GroupBroadcast(Group g, T x, + id local_id) { using BroadcastT = ConvertToNativeBroadcastType_t; auto BroadcastX = bit_cast(x); - BroadcastT Result = GroupBroadcast(BroadcastX, local_id); + BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id); return bit_cast(Result); } template -EnableIfGenericBroadcast GroupBroadcast(T x, id local_id) { +EnableIfGenericBroadcast GroupBroadcast(Group g, T x, + id local_id) { if (Dimensions == 1) { - return GroupBroadcast(x, local_id[0]); + return GroupBroadcast(g, x, local_id[0]); } // Initialize with x to support type T without default constructor T Result = x; @@ -801,6 +805,56 @@ EnableIfGenericShuffle SubgroupShuffleUp(T x, uint32_t delta) { return Result; } +template +typename std::enable_if_t< + ext::oneapi::experimental::is_fixed_topology_group_v> +ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { + __spirv_ControlBarrier(group_scope::Scope, getScope(FenceScope), + getMemorySemanticsMask(Order) | + __spv::MemorySemanticsMask::SubgroupMemory | + __spv::MemorySemanticsMask::WorkgroupMemory | + __spv::MemorySemanticsMask::CrossWorkgroupMemory); +} + +#define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction) \ + template \ + typename std::enable_if_t< \ + ext::oneapi::experimental::is_fixed_topology_group_v, T> \ + Group##Instruction(Group G, __spv::GroupOperation Op, T x) { \ + using ConvertedT = detail::ConvertToOpenCLType_t; \ + \ + using OCLT = \ + conditional_t() || \ + std::is_same(), \ + cl_int, \ + conditional_t() || \ + std::is_same(), \ + cl_uint, ConvertedT>>; \ + OCLT Arg = x; \ + OCLT Ret = __spirv_Group##Instruction(group_scope::value, \ + static_cast(Op), Arg); \ + return Ret; \ + } + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(UMin) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMin) + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMax) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(UMax) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMax) + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(IAdd) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(FAdd) + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(IMulKHR) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMulKHR) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(CMulINTEL) + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseOrKHR) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseXorKHR) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseAndKHR) + } // namespace spirv } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index 474ced1a5fe91..33039cf7b389e 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -27,6 +27,29 @@ struct sub_group; namespace experimental { template class group_with_scratchpad; +template struct is_fixed_topology_group : std::false_type {}; + +template +inline constexpr bool is_fixed_topology_group_v = + is_fixed_topology_group::value; + +#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP +template <> struct is_fixed_topology_group : std::true_type {}; +#endif + +template +struct is_fixed_topology_group> : std::true_type {}; + +template <> +struct is_fixed_topology_group : std::true_type { +}; + +template struct is_user_constructed_group : std::false_type {}; + +template +inline constexpr bool is_user_constructed_group_v = + is_user_constructed_group::value; + namespace detail { template struct is_group_helper : std::false_type {}; diff --git a/sycl/include/sycl/ext/oneapi/functional.hpp b/sycl/include/sycl/ext/oneapi/functional.hpp index b8be0c6573620..66454b75577ab 100644 --- a/sycl/include/sycl/ext/oneapi/functional.hpp +++ b/sycl/include/sycl/ext/oneapi/functional.hpp @@ -61,21 +61,9 @@ struct GroupOpTag< }; #define __SYCL_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation) \ - template \ - static T calc(GroupTag, T x, BinaryOperation) { \ - using ConvertedT = detail::ConvertToOpenCLType_t; \ - \ - using OCLT = \ - conditional_t() || \ - std::is_same(), \ - cl_int, \ - conditional_t() || \ - std::is_same(), \ - cl_uint, ConvertedT>>; \ - OCLT Arg = x; \ - OCLT Ret = \ - __spirv_Group##SPIRVOperation(S, static_cast(O), Arg); \ - return Ret; \ + template <__spv::GroupOperation O, typename Group, typename T> \ + static T calc(Group g, GroupTag, T x, BinaryOperation) { \ + return sycl::detail::spirv::Group##SPIRVOperation(g, O, x); \ } // calc for sycl function objects @@ -105,10 +93,11 @@ __SYCL_CALC_OVERLOAD(GroupOpIUnsigned, BitwiseAndKHR, sycl::bit_and) #undef __SYCL_CALC_OVERLOAD -template class BinaryOperation> -static T calc(typename GroupOpTag::type, T x, BinaryOperation) { - return calc(typename GroupOpTag::type(), x, BinaryOperation()); +static T calc(Group g, typename GroupOpTag::type, T x, + BinaryOperation) { + return calc(g, typename GroupOpTag::type(), x, BinaryOperation()); } } // namespace detail diff --git a/sycl/include/sycl/ext/oneapi/sub_group.hpp b/sycl/include/sycl/ext/oneapi/sub_group.hpp index 643a9245cb799..52de964cbbc1e 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group.hpp @@ -645,9 +645,8 @@ struct sub_group { "sycl::ext::oneapi::reduce instead.") EnableIfIsScalarArithmetic reduce(T x, BinaryOperation op) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc( - typename sycl::detail::GroupOpTag::type(), x, op); + return sycl::detail::calc<__spv::GroupOperation::Reduce>( + typename sycl::detail::GroupOpTag::type(), *this, x, op); #else (void)x; (void)op; @@ -676,9 +675,8 @@ struct sub_group { "sycl::ext::oneapi::exclusive_scan instead.") EnableIfIsScalarArithmetic exclusive_scan(T x, BinaryOperation op) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc( - typename sycl::detail::GroupOpTag::type(), x, op); + return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( + typename sycl::detail::GroupOpTag::type(), *this, x, op); #else (void)x; (void)op; @@ -715,9 +713,8 @@ struct sub_group { "sycl::ext::oneapi::inclusive_scan instead.") EnableIfIsScalarArithmetic inclusive_scan(T x, BinaryOperation op) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc( - typename sycl::detail::GroupOpTag::type(), x, op); + return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( + typename sycl::detail::GroupOpTag::type(), *this, x, op); #else (void)x; (void)op; diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index 1fa39d5ba3b5c..e55cd6d0d11e3 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -197,7 +197,7 @@ detail::enable_if_t<(is_group_v> && detail::is_multiplies::value)) && detail::is_native_op::value), T> -reduce_over_group(Group, T x, BinaryOperation binary_op) { +reduce_over_group(Group g, T x, BinaryOperation binary_op) { // FIXME: Do not special-case for half precision static_assert( std::is_same::value || @@ -205,9 +205,8 @@ reduce_over_group(Group, T x, BinaryOperation binary_op) { std::is_same::value), "Result type of binary_op must match reduction accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc::value>( - typename sycl::detail::GroupOpTag::type(), x, binary_op); + return sycl::detail::calc<__spv::GroupOperation::Reduce>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); #else throw runtime_error("Group algorithms are not supported on host.", PI_ERROR_INVALID_DEVICE); @@ -376,9 +375,9 @@ joint_reduce(Group g, Ptr first, Ptr last, T init, BinaryOperation binary_op) { // ---- any_of_group template detail::enable_if_t>, bool> -any_of_group(Group, bool pred) { +any_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupAny(pred); + return sycl::detail::spirv::GroupAny(g, pred); #else (void)pred; throw runtime_error("Group algorithms are not supported on host.", @@ -415,9 +414,9 @@ joint_any_of(Group g, Ptr first, Ptr last, Predicate pred) { // ---- all_of_group template detail::enable_if_t>, bool> -all_of_group(Group, bool pred) { +all_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupAll(pred); + return sycl::detail::spirv::GroupAll(g, pred); #else (void)pred; throw runtime_error("Group algorithms are not supported on host.", @@ -454,9 +453,9 @@ joint_all_of(Group g, Ptr first, Ptr last, Predicate pred) { // ---- none_of_group template detail::enable_if_t>, bool> -none_of_group(Group, bool pred) { +none_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupAll(!pred); + return sycl::detail::spirv::GroupAll(g, !pred); #else (void)pred; throw runtime_error("Group algorithms are not supported on host.", @@ -571,9 +570,9 @@ detail::enable_if_t<(is_group_v> && (std::is_trivially_copyable::value || detail::is_vec::value)), T> -group_broadcast(Group, T x, typename Group::id_type local_id) { +group_broadcast(Group g, T x, typename Group::id_type local_id) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupBroadcast(x, local_id); + return sycl::detail::spirv::GroupBroadcast(g, x, local_id); #else (void)x; (void)local_id; @@ -628,16 +627,15 @@ detail::enable_if_t<(is_group_v> && detail::is_multiplies::value)) && detail::is_native_op::value), T> -exclusive_scan_over_group(Group, T x, BinaryOperation binary_op) { +exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { // FIXME: Do not special-case for half precision static_assert(std::is_same::value || (std::is_same::value && std::is_same::value), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc::value>( - typename sycl::detail::GroupOpTag::type(), x, binary_op); + return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); #else throw runtime_error("Group algorithms are not supported on host.", PI_ERROR_INVALID_DEVICE); @@ -862,16 +860,15 @@ detail::enable_if_t<(is_group_v> && detail::is_multiplies::value)) && detail::is_native_op::value), T> -inclusive_scan_over_group(Group, T x, BinaryOperation binary_op) { +inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { // FIXME: Do not special-case for half precision static_assert(std::is_same::value || (std::is_same::value && std::is_same::value), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc::value>( - typename sycl::detail::GroupOpTag::type(), x, binary_op); + return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); #else throw runtime_error("Group algorithms are not supported on host.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/group_barrier.hpp b/sycl/include/sycl/group_barrier.hpp index 218f1909348c6..af1cb49e1e68e 100644 --- a/sycl/include/sycl/group_barrier.hpp +++ b/sycl/include/sycl/group_barrier.hpp @@ -20,31 +20,14 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace detail { -template struct group_barrier_scope {}; -template <> struct group_barrier_scope { - constexpr static auto Scope = __spv::Scope::Subgroup; -}; -template struct group_barrier_scope> { - constexpr static auto Scope = __spv::Scope::Workgroup; -}; -} // namespace detail - template typename std::enable_if>::type -group_barrier(Group, memory_scope FenceScope = Group::fence_scope) { - (void)FenceScope; -#ifdef __SYCL_DEVICE_ONLY__ +group_barrier(Group G, memory_scope FenceScope = Group::fence_scope) { // Per SYCL spec, group_barrier must perform both control barrier and memory // fence operations. All work-items execute a release fence prior to - // barrier and acquire fence afterwards. The rest of semantics flags specify - // which type of memory this behavior is applied to. - __spirv_ControlBarrier(detail::group_barrier_scope::Scope, - sycl::detail::spirv::getScope(FenceScope), - __spv::MemorySemanticsMask::SequentiallyConsistent | - __spv::MemorySemanticsMask::SubgroupMemory | - __spv::MemorySemanticsMask::WorkgroupMemory | - __spv::MemorySemanticsMask::CrossWorkgroupMemory); + // barrier and acquire fence afterwards. +#ifdef __SYCL_DEVICE_ONLY__ + detail::spirv::ControlBarrier(G, FenceScope, memory_order::seq_cst); #else throw sycl::runtime_error("Barriers are not supported on host device", PI_ERROR_INVALID_DEVICE); From 483ef0bd4604697aa438f1485c77964404b94ffb Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 11:09:02 -0700 Subject: [PATCH 02/19] [SYCL][NFC] Adjust some detail namespaces Nested detail namespaces cause problems for name lookup. Signed-off-by: John Pennycook --- .../ext/oneapi/experimental/ballot_group.hpp | 2 +- .../experimental/non_uniform_groups.hpp | 39 ++++++------------- .../experimental/opportunistic_group.hpp | 2 +- .../ext/oneapi/experimental/tangle_group.hpp | 2 +- .../sycl/ext/oneapi/sub_group_mask.hpp | 19 ++++++--- 5 files changed, 28 insertions(+), 36 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp index 04adcca79fcff..9e5ca0fc5e962 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp @@ -41,7 +41,7 @@ template class ballot_group { id_type get_local_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return detail::CallerPositionInMask(Mask); + return sycl::detail::CallerPositionInMask(Mask); #else throw runtime_error("Non-uniform groups are not supported on host device.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 8899ae74ae985..74a5b0c1e2d94 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -13,49 +13,34 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace ext::oneapi::experimental { -template struct is_fixed_topology_group : std::false_type {}; - -template -inline constexpr bool is_fixed_topology_group_v = - is_fixed_topology_group::value; - -#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP -template <> struct is_fixed_topology_group : std::true_type {}; -#endif - -template -struct is_fixed_topology_group> : std::true_type {}; - -template <> struct is_fixed_topology_group : std::true_type {}; - -template struct is_user_constructed_group : std::false_type {}; - -template -inline constexpr bool is_user_constructed_group_v = - is_user_constructed_group::value; - -#ifdef __SYCL_DEVICE_ONLY__ -// TODO: This may need to be generalized beyond uint32_t for big masks namespace detail { -uint32_t CallerPositionInMask(sub_group_mask Mask) { - // FIXME: It would be nice to be able to jump straight to an __ocl_vec_t + +inline sycl::vec ExtractMask(ext::oneapi::sub_group_mask Mask) { sycl::marray TmpMArray; Mask.extract_bits(TmpMArray); sycl::vec MemberMask; for (int i = 0; i < 4; ++i) { MemberMask[i] = TmpMArray[i]; } + return MemberMask; +} + +#ifdef __SYCL_DEVICE_ONLY__ +// TODO: This may need to be generalized beyond uint32_t for big masks +inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) { + sycl::vec MemberMask = ExtractMask(Mask); auto OCLMask = sycl::detail::ConvertToOpenCLType_t>(MemberMask); return __spirv_GroupNonUniformBallotBitCount( __spv::Scope::Subgroup, (int)__spv::GroupOperation::ExclusiveScan, OCLMask); } -} // namespace detail #endif +} // namespace detail + } // namespace ext::oneapi::experimental + } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp index 1e20719aa472f..74fd03608a1cd 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp @@ -40,7 +40,7 @@ class opportunistic_group { id_type get_local_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return detail::CallerPositionInMask(Mask); + return sycl::detail::CallerPositionInMask(Mask); #else throw runtime_error("Non-uniform groups are not supported on host device.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp index d832b644ef6d9..518cb1f118736 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp @@ -41,7 +41,7 @@ template class tangle_group { id_type get_local_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return detail::CallerPositionInMask(Mask); + return sycl::detail::CallerPositionInMask(Mask); #else throw runtime_error("Non-uniform groups are not supported on host device.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index b07b993f06acf..a96d8a3e433a5 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -18,6 +18,13 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { class Builder; + +namespace spirv { + +template struct group_scope; + +} // namespace spirv + } // namespace detail namespace ext::oneapi { @@ -30,7 +37,7 @@ namespace ext::oneapi { #endif struct sub_group_mask { - friend class detail::Builder; + friend class sycl::detail::Builder; using BitsType = BITS_TYPE; static constexpr size_t max_bits = @@ -223,7 +230,7 @@ struct sub_group_mask { : Bits(rhs.Bits), bits_num(rhs.bits_num) {} template - friend detail::enable_if_t< + friend sycl::detail::enable_if_t< std::is_same, sub_group>::value, sub_group_mask> group_ballot(Group g, bool predicate); @@ -266,17 +273,17 @@ struct sub_group_mask { }; template -detail::enable_if_t, sub_group>::value, - sub_group_mask> +sycl::detail::enable_if_t, sub_group>::value, + sub_group_mask> group_ballot(Group g, bool predicate) { (void)g; #ifdef __SYCL_DEVICE_ONLY__ auto res = __spirv_GroupNonUniformBallot( - detail::spirv::group_scope::value, predicate); + sycl::detail::spirv::group_scope::value, predicate); BITS_TYPE val = res[0]; if constexpr (sizeof(BITS_TYPE) == 8) val |= ((BITS_TYPE)res[1]) << 32; - return detail::Builder::createSubGroupMask( + return sycl::detail::Builder::createSubGroupMask( val, g.get_max_local_range()[0]); #else (void)predicate; From 2e1567802a2191e2976177d63b67fbcfec77ce65 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 11:11:32 -0700 Subject: [PATCH 03/19] [SYCL] Add ballot_group support to algorithms Enables the following functions to be used with ballot_group arguments: - group_barrier - group_broadcast - any_of_group - all_of_group - none_of_group - reduce_over_group - exclusive_scan_over_group - inclusive_scan_over_group Signed-off-by: John Pennycook --- sycl/include/CL/__spirv/spirv_ops.hpp | 62 ++++++++++ sycl/include/sycl/detail/spirv.hpp | 110 +++++++++++++++++- .../ext/oneapi/experimental/ballot_group.hpp | 15 ++- .../experimental/non_uniform_groups.hpp | 24 ++++ 4 files changed, 206 insertions(+), 5 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 9cd947c21e8d7..2feeff8c52660 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -953,6 +953,68 @@ __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT int __spirv_GroupNonUniformBallotFindLSB(__spv::Scope::Flag, __ocl_vec_t) noexcept; +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT + __spirv_GroupNonUniformBroadcast(__spv::Scope::Flag, ValueT, IdT); + +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool +__spirv_GroupNonUniformAll(__spv::Scope::Flag, bool); + +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool +__spirv_GroupNonUniformAny(__spv::Scope::Flag, bool); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformSMin(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformUMin(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFMin(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformSMax(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformUMax(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFMax(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformIAdd(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFAdd(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformIMul(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFMul(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformBitwiseOr(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformBitwiseXor(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT); + extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void __clc_BarrierInitialize(int64_t *state, int32_t expected_count) noexcept; diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 7b46609931176..253f9edb86b48 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -24,6 +24,9 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace ext { namespace oneapi { struct sub_group; +namespace experimental { +template struct ballot_group; +} // namespace experimental } // namespace oneapi } // namespace ext @@ -57,6 +60,11 @@ template <> struct group_scope<::sycl::ext::oneapi::sub_group> { static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup; }; +template +struct group_scope> { + static constexpr __spv::Scope::Flag value = group_scope::value; +}; + // Generic shuffles and broadcasts may require multiple calls to // intrinsics, and should use the fewest broadcasts possible // - Loop over chunks until remaining bytes < chunk size @@ -98,10 +106,32 @@ void GenericCall(const Functor &ApplyToBytes) { template bool GroupAll(Group g, bool pred) { return __spirv_GroupAll(group_scope::value, pred); } +template +bool GroupAll(ext::oneapi::experimental::ballot_group g, + bool pred) { + // Each ballot_group implicitly represents two groups + // We have to force each half down different control flow + if (g.get_group_id() == 1) { + return __spirv_GroupNonUniformAll(group_scope::value, pred); + } else { + return __spirv_GroupNonUniformAll(group_scope::value, pred); + } +} template bool GroupAny(Group g, bool pred) { return __spirv_GroupAny(group_scope::value, pred); } +template +bool GroupAny(ext::oneapi::experimental::ballot_group g, + bool pred) { + // Each ballot_group implicitly represents two groups + // We have to force each half down different control flow + if (g.get_group_id() == 1) { + return __spirv_GroupNonUniformAny(group_scope::value, pred); + } else { + return __spirv_GroupNonUniformAny(group_scope::value, pred); + } +} // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic // FIXME: Do not special-case for half once all backends support all data types. @@ -168,6 +198,33 @@ EnableIfNativeBroadcast GroupBroadcast(Group, T x, IdT local_id) { OCLIdT OCLId = detail::convertDataToType(GroupLocalId); return __spirv_GroupBroadcast(group_scope::value, OCLX, OCLId); } +template +EnableIfNativeBroadcast +GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, + T x, IdT local_id) { + // Remap local_id to its original numbering in ParentGroup + auto LocalId = detail::IdToMaskPosition(g, local_id); + + // TODO: Refactor to avoid duplication after design settles + using GroupIdT = typename GroupId::type; + GroupIdT GroupLocalId = static_cast(LocalId); + using OCLT = detail::ConvertToOpenCLType_t; + using WidenedT = WidenOpenCLTypeTo32_t; + using OCLIdT = detail::ConvertToOpenCLType_t; + WidenedT OCLX = detail::convertDataToType(x); + OCLIdT OCLId = detail::convertDataToType(GroupLocalId); + + // Each ballot_group implicitly represents two groups + // We have to force each half down different control flow + if (g.get_group_id() == 1) { + return __spirv_GroupNonUniformBroadcast(group_scope::value, + OCLX, OCLId); + } else { + return __spirv_GroupNonUniformBroadcast(group_scope::value, + OCLX, OCLId); + } +} + template EnableIfBitcastBroadcast GroupBroadcast(Group g, T x, IdT local_id) { using BroadcastT = ConvertToNativeBroadcastType_t; @@ -210,6 +267,13 @@ EnableIfNativeBroadcast GroupBroadcast(Group g, T x, OCLIdT OCLId = detail::convertDataToType(VecId); return __spirv_GroupBroadcast(group_scope::value, OCLX, OCLId); } +template +EnableIfNativeBroadcast +GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, + T x, id<1> local_id) { + // Limited to 1D indices for now because ParentGroup must be sub-group + return GroupBroadcast(g, x, local_id[0]); +} template EnableIfBitcastBroadcast GroupBroadcast(Group g, T x, id local_id) { @@ -231,7 +295,7 @@ EnableIfGenericBroadcast GroupBroadcast(Group g, T x, auto BroadcastBytes = [=](size_t Offset, size_t Size) { uint64_t BroadcastX, BroadcastResult; std::memcpy(&BroadcastX, XBytes + Offset, Size); - BroadcastResult = GroupBroadcast(BroadcastX, local_id); + BroadcastResult = GroupBroadcast(g, BroadcastX, local_id); std::memcpy(ResultBytes + Offset, &BroadcastResult, Size); }; GenericCall(BroadcastBytes); @@ -816,6 +880,26 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { __spv::MemorySemanticsMask::CrossWorkgroupMemory); } +template +typename std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v> +ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { +#if defined(__SPIR__) + // SPIR-V does not define an instruction to synchronize partial groups. + // However, most (possibly all?) of the current SPIR-V targets execute + // work-items in lockstep, so we can probably get away with a MemoryBarrier. + // TODO: Replace this if SPIR-V defines a NonUniformControlBarrier + __spirv_MemoryBarrier(getScope(FenceScope), + getMemorySemanticsMask(Order) | + __spv::MemorySemanticsMask::SubgroupMemory | + __spv::MemorySemanticsMask::WorkgroupMemory | + __spv::MemorySemanticsMask::CrossWorkgroupMemory); +#elif defined(__NVPTX__) + // TODO: Call syncwarp with appropriate mask extracted from the group +#endif +} + +// TODO: Refactor to avoid duplication after design settles #define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction) \ template \ typename std::enable_if_t< \ @@ -834,6 +918,30 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { OCLT Ret = __spirv_Group##Instruction(group_scope::value, \ static_cast(Op), Arg); \ return Ret; \ + } \ + \ + template \ + T Group##Instruction(ext::oneapi::experimental::ballot_group g, \ + __spv::GroupOperation Op, T x) { \ + using ConvertedT = detail::ConvertToOpenCLType_t; \ + \ + using OCLT = \ + conditional_t() || \ + std::is_same(), \ + cl_int, \ + conditional_t() || \ + std::is_same(), \ + cl_uint, ConvertedT>>; \ + OCLT Arg = x; \ + /* Each ballot_group implicitly represents two groups */ \ + /* We have to force each half down different control flow */ \ + auto Scope = group_scope::value; \ + auto OpInt = static_cast(Op); \ + if (g.get_group_id() == 1) { \ + return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ + } else { \ + return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ + } \ } __SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin) diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp index 9e5ca0fc5e962..fcdce42652075 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp @@ -112,15 +112,17 @@ template class ballot_group { #endif } -private: - sub_group_mask Mask; - bool Predicate; - protected: + const sub_group_mask Mask; + const bool Predicate; + ballot_group(sub_group_mask m, bool p) : Mask(m), Predicate(p) {} friend ballot_group get_ballot_group(ParentGroup g, bool predicate); + + friend uint32_t sycl::detail::IdToMaskPosition>( + ballot_group Group, uint32_t Id); }; template @@ -149,5 +151,10 @@ template struct is_user_constructed_group> : std::true_type {}; } // namespace ext::oneapi::experimental + +template +struct is_group> + : std::true_type {}; + } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 74a5b0c1e2d94..0efe7465a6e10 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -38,8 +38,32 @@ inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) { } #endif +template +inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) { + // TODO: This will need to be optimized + sycl::vec MemberMask = ExtractMask(Group.Mask); + uint32_t Count = 0; + for (int i = 0; i < 4; ++i) { + for (int b = 0; b < 32; ++b) { + if (MemberMask[i] & (1 << b)) { + if (Count == Id) { + return i * 32 + b; + } + Count++; + } + } + } + __builtin_unreachable(); + return Count; +} + } // namespace detail +namespace ext::oneapi::experimental { + +// Forward declarations of non-uniform group types for algorithm definitions +template class ballot_group; + } // namespace ext::oneapi::experimental } // __SYCL_INLINE_VER_NAMESPACE(_V1) From 64e3f9f3c9e4dfdffd3ab4cb926a4d97f1d72fd3 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 12:59:35 -0700 Subject: [PATCH 04/19] SYCL_EXTERNAL => __DPCPP_SYCL_EXTERNAL --- sycl/include/CL/__spirv/spirv_ops.hpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 2feeff8c52660..3f1f78eef58f0 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -964,55 +964,55 @@ __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool __spirv_GroupNonUniformAny(__spv::Scope::Flag, bool); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformSMin(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformUMin(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformFMin(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformSMax(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformUMax(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformFMax(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformIAdd(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformFAdd(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformIMul(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformFMul(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformBitwiseOr(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformBitwiseXor(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT); extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void From 2ec71982570e36aa7cd45a1ba0b164918b300798 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 13:04:22 -0700 Subject: [PATCH 05/19] Add extra include for vec<> --- sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 0efe7465a6e10..ce65ea1f6f71f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -9,6 +9,7 @@ #pragma once #include #include +#include #include namespace sycl { From a25bdd2bfa55781ad76d300a8a3b692c638efb7c Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 13:05:20 -0700 Subject: [PATCH 06/19] Do not mix struct/class in definitions --- sycl/include/sycl/detail/spirv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 253f9edb86b48..2611e02f3be86 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -25,7 +25,7 @@ namespace ext { namespace oneapi { struct sub_group; namespace experimental { -template struct ballot_group; +template class ballot_group; } // namespace experimental } // namespace oneapi } // namespace ext From 8dc124abffe30c9ec39674039d2faa94d2ad7ae1 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 14:30:00 -0700 Subject: [PATCH 07/19] Fix a few missed renames --- sycl/include/sycl/detail/spirv.hpp | 2 +- sycl/include/sycl/ext/oneapi/group_algorithm.hpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 2611e02f3be86..7565703c9eba6 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -873,7 +873,7 @@ template typename std::enable_if_t< ext::oneapi::experimental::is_fixed_topology_group_v> ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { - __spirv_ControlBarrier(group_scope::Scope, getScope(FenceScope), + __spirv_ControlBarrier(group_scope::value, getScope(FenceScope), getMemorySemanticsMask(Order) | __spv::MemorySemanticsMask::SubgroupMemory | __spv::MemorySemanticsMask::WorkgroupMemory | diff --git a/sycl/include/sycl/ext/oneapi/group_algorithm.hpp b/sycl/include/sycl/ext/oneapi/group_algorithm.hpp index 610dc6a14bdd4..6c29b65482067 100644 --- a/sycl/include/sycl/ext/oneapi/group_algorithm.hpp +++ b/sycl/include/sycl/ext/oneapi/group_algorithm.hpp @@ -146,9 +146,10 @@ __SYCL2020_DEPRECATED( detail::enable_if_t<(detail::is_generic_group::value && std::is_trivially_copyable::value && !detail::is_vector_arithmetic::value), - T> broadcast(Group, T x, typename Group::id_type local_id) { + T> broadcast(Group g, T x, + typename Group::id_type local_id) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupBroadcast(x, local_id); + return sycl::detail::spirv::GroupBroadcast(g, x, local_id); #else (void)x; (void)local_id; From af983f48f02b9f5fa457a1dd19be5dc9900fb4ac Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 14:56:02 -0700 Subject: [PATCH 08/19] Ensure Scope and GroupOperation are constexpr Fixes compilation at -O0. --- sycl/include/sycl/detail/spirv.hpp | 16 ++++++++-------- sycl/include/sycl/ext/oneapi/functional.hpp | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 7565703c9eba6..078961fc1a1b5 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -901,10 +901,10 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { // TODO: Refactor to avoid duplication after design settles #define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction) \ - template \ - typename std::enable_if_t< \ + template <__spv::GroupOperation Op, typename Group, typename T> \ + inline typename std::enable_if_t< \ ext::oneapi::experimental::is_fixed_topology_group_v, T> \ - Group##Instruction(Group G, __spv::GroupOperation Op, T x) { \ + Group##Instruction(Group G, T x) { \ using ConvertedT = detail::ConvertToOpenCLType_t; \ \ using OCLT = \ @@ -920,9 +920,9 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { return Ret; \ } \ \ - template \ - T Group##Instruction(ext::oneapi::experimental::ballot_group g, \ - __spv::GroupOperation Op, T x) { \ + template <__spv::GroupOperation Op, typename ParentGroup, typename T> \ + inline T Group##Instruction( \ + ext::oneapi::experimental::ballot_group g, T x) { \ using ConvertedT = detail::ConvertToOpenCLType_t; \ \ using OCLT = \ @@ -935,8 +935,8 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { OCLT Arg = x; \ /* Each ballot_group implicitly represents two groups */ \ /* We have to force each half down different control flow */ \ - auto Scope = group_scope::value; \ - auto OpInt = static_cast(Op); \ + constexpr auto Scope = group_scope::value; \ + constexpr auto OpInt = static_cast(Op); \ if (g.get_group_id() == 1) { \ return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ } else { \ diff --git a/sycl/include/sycl/ext/oneapi/functional.hpp b/sycl/include/sycl/ext/oneapi/functional.hpp index 66454b75577ab..7d8d269fde0fb 100644 --- a/sycl/include/sycl/ext/oneapi/functional.hpp +++ b/sycl/include/sycl/ext/oneapi/functional.hpp @@ -63,7 +63,7 @@ struct GroupOpTag< #define __SYCL_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation) \ template <__spv::GroupOperation O, typename Group, typename T> \ static T calc(Group g, GroupTag, T x, BinaryOperation) { \ - return sycl::detail::spirv::Group##SPIRVOperation(g, O, x); \ + return sycl::detail::spirv::Group##SPIRVOperation(g, x); \ } // calc for sycl function objects From 01ecf06172f949380d16ca2adfa08d6a559d23cb Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 28 Mar 2023 14:29:19 -0700 Subject: [PATCH 09/19] Fix nested detail:: namespace for group_ballot --- sycl/include/sycl/ext/oneapi/sub_group_mask.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index 822b863df03bb..88bcd543e8eaa 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -40,7 +40,7 @@ namespace ext::oneapi { // need to forward declare sub_group_mask first struct sub_group_mask; template -detail::enable_if_t, sub_group>::value, +sycl::detail::enable_if_t, sub_group>::value, sub_group_mask> group_ballot(Group g, bool predicate = true); From b2a4a115ca31de05fbf6a54a7767c4a0b50a1ce1 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 29 Mar 2023 08:52:15 -0700 Subject: [PATCH 10/19] Add basic tests for non-uniform groups Tests the ability to create an instance of each new group type, and the correctness of the core member functions. Signed-off-by: John Pennycook --- .../NonUniformGroups/ballot_group.cpp | 58 ++++++++++++++++ .../NonUniformGroups/cluster_group.cpp | 62 +++++++++++++++++ .../NonUniformGroups/is_fixed_topology.cpp | 12 ++++ .../NonUniformGroups/is_user_constructed.cpp | 14 ++++ .../NonUniformGroups/opportunistic_group.cpp | 68 ++++++++++++++++++ .../NonUniformGroups/tangle_group.cpp | 69 +++++++++++++++++++ 6 files changed, 283 insertions(+) create mode 100644 sycl/test-e2e/NonUniformGroups/ballot_group.cpp create mode 100644 sycl/test-e2e/NonUniformGroups/cluster_group.cpp create mode 100644 sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp create mode 100644 sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp create mode 100644 sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp create mode 100644 sycl/test-e2e/NonUniformGroups/tangle_group.cpp diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp new file mode 100644 index 0000000000000..955744b390c4a --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp @@ -0,0 +1,58 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items + bool Predicate = item.get_global_id() % 2 == 0; + auto BallotGroup = syclex::get_ballot_group(SG, Predicate); + + // Check function return values match Predicate + bool Match = true; + auto GroupID = (Predicate) ? 1 : 0; + Match &= (BallotGroup.get_group_id() == GroupID); + Match &= (BallotGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (BallotGroup.get_group_range() == 2); + Match &= (BallotGroup.get_local_range() == 16); + MatchAcc[WI] = Match; + LeaderAcc[WI] = BallotGroup.leader(); + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == (WI < 2)); + } + return 0; +} diff --git a/sycl/test-e2e/NonUniformGroups/cluster_group.cpp b/sycl/test-e2e/NonUniformGroups/cluster_group.cpp new file mode 100644 index 0000000000000..e1d7634191df3 --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/cluster_group.cpp @@ -0,0 +1,62 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +template class TestKernel; + +template void test() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + auto ClusterGroup = syclex::get_cluster_group(SG); + + bool Match = true; + Match &= (ClusterGroup.get_group_id() == (WI / ClusterSize)); + Match &= (ClusterGroup.get_local_id() == (WI % ClusterSize)); + Match &= (ClusterGroup.get_group_range() == (32 / ClusterSize)); + Match &= (ClusterGroup.get_local_range() == ClusterSize); + MatchAcc[WI] = Match; + LeaderAcc[WI] = ClusterGroup.leader(); + }; + CGH.parallel_for>(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == ((WI % ClusterSize) == 0)); + } +} + +int main() { + test<1>(); + test<2>(); + test<4>(); + test<8>(); + test<16>(); + test<32>(); + return 0; +} diff --git a/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp b/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp new file mode 100644 index 0000000000000..b3b6cd5ba4adf --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp @@ -0,0 +1,12 @@ +// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out + +#include +namespace syclex = sycl::ext::oneapi::experimental; + +#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP +static_assert(syclex::is_fixed_topology_group_v); +#endif +static_assert(syclex::is_fixed_topology_group_v>); +static_assert(syclex::is_fixed_topology_group_v>); +static_assert(syclex::is_fixed_topology_group_v>); +static_assert(syclex::is_fixed_topology_group_v); diff --git a/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp new file mode 100644 index 0000000000000..a3f0085d8eaa0 --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp @@ -0,0 +1,14 @@ +// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out + +#include +namespace syclex = sycl::ext::oneapi::experimental; + +static_assert( + syclex::is_user_constructed_group_v>); +static_assert(syclex::is_user_constructed_group_v< + syclex::cluster_group<1, sycl::sub_group>>); +static_assert(syclex::is_user_constructed_group_v< + syclex::cluster_group<2, sycl::sub_group>>); +static_assert( + syclex::is_user_constructed_group_v>); +static_assert(syclex::is_user_constructed_group_v); diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp new file mode 100644 index 0000000000000..925340cee1c6d --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp @@ -0,0 +1,68 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Due to the unpredictable runtime behavior of opportunistic groups, + // some values may change from run to run. Check they're in expected + // ranges and consistent with other groups. + if (item.get_global_id() % 2 == 0) { + auto OpportunisticGroup = + syclex::this_kernel::get_opportunistic_group(); + + bool Match = true; + Match &= (OpportunisticGroup.get_group_id() == 0); + Match &= (OpportunisticGroup.get_local_id() < + OpportunisticGroup.get_local_range()); + Match &= (OpportunisticGroup.get_group_range() == 1); + Match &= (OpportunisticGroup.get_local_linear_range() <= + SG.get_local_linear_range()); + MatchAcc[WI] = Match; + LeaderAcc[WI] = OpportunisticGroup.leader(); + } + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + uint32_t NumLeaders = 0; + for (int WI = 0; WI < 32; ++WI) { + if (WI % 2 == 0) { + assert(MatchAcc[WI] == true); + if (LeaderAcc[WI]) { + NumLeaders++; + } + } + } + assert(NumLeaders > 0); + return 0; +} diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp new file mode 100644 index 0000000000000..172a73ebdca87 --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp @@ -0,0 +1,69 @@ +// RUN: %clangxx -fsycl -fno-sycl-early-optimizations -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items via control flow + // Branches deliberately duplicated to test impact of optimizations + // This only reliably works with optimizations disabled right now + if (item.get_global_id() % 2 == 0) { + auto TangleGroup = syclex::get_tangle_group(SG); + + bool Match = true; + Match &= (TangleGroup.get_group_id() == 0); + Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (TangleGroup.get_group_range() == 1); + Match &= (TangleGroup.get_local_range() == 16); + MatchAcc[WI] = Match; + LeaderAcc[WI] = TangleGroup.leader(); + } else { + auto TangleGroup = syclex::get_tangle_group(SG); + + bool Match = true; + Match &= (TangleGroup.get_group_id() == 0); + Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (TangleGroup.get_group_range() == 1); + Match &= (TangleGroup.get_local_range() == 16); + MatchAcc[WI] = Match; + LeaderAcc[WI] = TangleGroup.leader(); + } + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == (WI < 2)); + } + return 0; +} From 68ab5bc57e14a284fab8ac4b980ff0145bb9b8ef Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 29 Mar 2023 08:54:51 -0700 Subject: [PATCH 11/19] Add tests for ballot_group algorithms This commit adds tests for using ballot_group and the following algorithms: - group_barrier - group_broadcast - any_of_group - all_of_group - none_of_group - reduce_over_group - exclusive_scan_over_group - inclusive_scan_over_group Signed-off-by: John Pennycook --- .../ballot_group_algorithms.cpp | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp new file mode 100644 index 0000000000000..a5ca68bd79556 --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp @@ -0,0 +1,131 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer TmpBuf{sycl::range{32}}; + sycl::buffer BarrierBuf{sycl::range{32}}; + sycl::buffer BroadcastBuf{sycl::range{32}}; + sycl::buffer AnyBuf{sycl::range{32}}; + sycl::buffer AllBuf{sycl::range{32}}; + sycl::buffer NoneBuf{sycl::range{32}}; + sycl::buffer ReduceBuf{sycl::range{32}}; + sycl::buffer ExScanBuf{sycl::range{32}}; + sycl::buffer IncScanBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor TmpAcc{TmpBuf, CGH, sycl::write_only}; + sycl::accessor BarrierAcc{BarrierBuf, CGH, sycl::write_only}; + sycl::accessor BroadcastAcc{BroadcastBuf, CGH, sycl::write_only}; + sycl::accessor AnyAcc{AnyBuf, CGH, sycl::write_only}; + sycl::accessor AllAcc{AllBuf, CGH, sycl::write_only}; + sycl::accessor NoneAcc{NoneBuf, CGH, sycl::write_only}; + sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only}; + sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only}; + sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items + bool Predicate = WI % 2 == 0; + auto BallotGroup = syclex::get_ballot_group(SG, Predicate); + + // Check all other members' writes are visible after a barrier + TmpAcc[WI] = 1; + sycl::group_barrier(BallotGroup); + size_t Visible = 0; + for (size_t Other = 0; Other < 32; ++Other) { + if (WI % 2 == Other % 2) { + Visible += TmpAcc[Other]; + } + } + BarrierAcc[WI] = Visible; + + // Simple check of group algorithms + uint32_t OriginalLID = SG.get_local_linear_id(); + uint32_t LID = BallotGroup.get_local_linear_id(); + + uint32_t BroadcastResult = + sycl::group_broadcast(BallotGroup, OriginalLID, 0); + if (Predicate) { + BroadcastAcc[WI] = (BroadcastResult == 0); + } else { + BroadcastAcc[WI] = (BroadcastResult == 1); + } + + bool AnyResult = sycl::any_of_group(BallotGroup, Predicate); + if (Predicate) { + AnyAcc[WI] = (AnyResult == true); + } else { + AnyAcc[WI] = (AnyResult == false); + } + + bool AllResult = sycl::all_of_group(BallotGroup, Predicate); + if (Predicate) { + AllAcc[WI] = (AllResult == true); + } else { + AllAcc[WI] = (AllResult == false); + } + + bool NoneResult = sycl::none_of_group(BallotGroup, Predicate); + if (Predicate) { + NoneAcc[WI] = (NoneResult == false); + } else { + NoneAcc[WI] = (NoneResult == true); + } + + uint32_t ReduceResult = + sycl::reduce_over_group(BallotGroup, 1, sycl::plus<>()); + ReduceAcc[WI] = + (ReduceResult == BallotGroup.get_local_linear_range()); + + uint32_t ExScanResult = + sycl::exclusive_scan_over_group(BallotGroup, 1, sycl::plus<>()); + ExScanAcc[WI] = (ExScanResult == LID); + + uint32_t IncScanResult = + sycl::inclusive_scan_over_group(BallotGroup, 1, sycl::plus<>()); + IncScanAcc[WI] = (IncScanResult == LID + 1); + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor BarrierAcc{BarrierBuf, sycl::read_only}; + sycl::host_accessor BroadcastAcc{BroadcastBuf, sycl::read_only}; + sycl::host_accessor AnyAcc{AnyBuf, sycl::read_only}; + sycl::host_accessor AllAcc{AllBuf, sycl::read_only}; + sycl::host_accessor NoneAcc{NoneBuf, sycl::read_only}; + sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only}; + sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only}; + sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(BarrierAcc[WI] == true); + assert(BroadcastAcc[WI] == true); + assert(AnyAcc[WI] == true); + assert(AllAcc[WI] == true); + assert(NoneAcc[WI] == true); + assert(ReduceAcc[WI] == true); + assert(ExScanAcc[WI] == true); + assert(IncScanAcc[WI] == true); + } + return 0; +} From 56e05cec5eafd06f9a53bb2e90dd222d2c8c7140 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 29 Mar 2023 09:31:33 -0700 Subject: [PATCH 12/19] Clarify intent of ballot_group control flow branch --- sycl/include/sycl/detail/spirv.hpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 078961fc1a1b5..bf02d7550cc03 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -109,8 +109,9 @@ template bool GroupAll(Group g, bool pred) { template bool GroupAll(ext::oneapi::experimental::ballot_group g, bool pred) { - // Each ballot_group implicitly represents two groups - // We have to force each half down different control flow + // ballot_group partitions its parent into two groups (0 and 1) + // We have to force each group down different control flow + // Work-items in the "false" group (0) may still be active if (g.get_group_id() == 1) { return __spirv_GroupNonUniformAll(group_scope::value, pred); } else { @@ -124,8 +125,9 @@ template bool GroupAny(Group g, bool pred) { template bool GroupAny(ext::oneapi::experimental::ballot_group g, bool pred) { - // Each ballot_group implicitly represents two groups - // We have to force each half down different control flow + // ballot_group partitions its parent into two groups (0 and 1) + // We have to force each group down different control flow + // Work-items in the "false" group (0) may still be active if (g.get_group_id() == 1) { return __spirv_GroupNonUniformAny(group_scope::value, pred); } else { @@ -214,8 +216,9 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, WidenedT OCLX = detail::convertDataToType(x); OCLIdT OCLId = detail::convertDataToType(GroupLocalId); - // Each ballot_group implicitly represents two groups - // We have to force each half down different control flow + // ballot_group partitions its parent into two groups (0 and 1) + // We have to force each group down different control flow + // Work-items in the "false" group (0) may still be active if (g.get_group_id() == 1) { return __spirv_GroupNonUniformBroadcast(group_scope::value, OCLX, OCLId); @@ -933,8 +936,9 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { std::is_same(), \ cl_uint, ConvertedT>>; \ OCLT Arg = x; \ - /* Each ballot_group implicitly represents two groups */ \ - /* We have to force each half down different control flow */ \ + /* ballot_group partitions its parent into two groups (0 and 1) */ \ + /* We have to force each group down different control flow */ \ + /* Work-items in the "false" group (0) may still be active */ \ constexpr auto Scope = group_scope::value; \ constexpr auto OpInt = static_cast(Op); \ if (g.get_group_id() == 1) { \ From bd4c3a2e59d885ec7f159a47bce8a0ef14619187 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Thu, 30 Mar 2023 11:20:28 -0700 Subject: [PATCH 13/19] Update comment formatting Co-authored-by: aelovikov-intel --- sycl/include/sycl/detail/spirv.hpp | 6 +++--- sycl/test-e2e/NonUniformGroups/ballot_group.cpp | 4 ++-- sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp | 6 +++--- sycl/test-e2e/NonUniformGroups/tangle_group.cpp | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index bf02d7550cc03..45a2ba2f4ff1b 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -204,10 +204,10 @@ template EnableIfNativeBroadcast GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, T x, IdT local_id) { - // Remap local_id to its original numbering in ParentGroup + // Remap local_id to its original numbering in ParentGroup. auto LocalId = detail::IdToMaskPosition(g, local_id); - // TODO: Refactor to avoid duplication after design settles + // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(LocalId); using OCLT = detail::ConvertToOpenCLType_t; @@ -274,7 +274,7 @@ template EnableIfNativeBroadcast GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, T x, id<1> local_id) { - // Limited to 1D indices for now because ParentGroup must be sub-group + // Limited to 1D indices for now because ParentGroup must be sub-group. return GroupBroadcast(g, x, local_id[0]); } template diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp index 955744b390c4a..05b1a82628063 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp @@ -31,11 +31,11 @@ int main() { auto WI = item.get_global_id(); auto SG = item.get_sub_group(); - // Split into odd and even work-items + // Split into odd and even work-items. bool Predicate = item.get_global_id() % 2 == 0; auto BallotGroup = syclex::get_ballot_group(SG, Predicate); - // Check function return values match Predicate + // Check function return values match Predicate. bool Match = true; auto GroupID = (Predicate) ? 1 : 0; Match &= (BallotGroup.get_group_id() == GroupID); diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp index a5ca68bd79556..9f250062b9108 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp @@ -45,11 +45,11 @@ int main() { auto WI = item.get_global_id(); auto SG = item.get_sub_group(); - // Split into odd and even work-items + // Split into odd and even work-items. bool Predicate = WI % 2 == 0; auto BallotGroup = syclex::get_ballot_group(SG, Predicate); - // Check all other members' writes are visible after a barrier + // Check all other members' writes are visible after a barrier. TmpAcc[WI] = 1; sycl::group_barrier(BallotGroup); size_t Visible = 0; @@ -60,7 +60,7 @@ int main() { } BarrierAcc[WI] = Visible; - // Simple check of group algorithms + // Simple check of group algorithms. uint32_t OriginalLID = SG.get_local_linear_id(); uint32_t LID = BallotGroup.get_local_linear_id(); diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp index 172a73ebdca87..9e57a48633e0a 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp @@ -31,9 +31,9 @@ int main() { auto WI = item.get_global_id(); auto SG = item.get_sub_group(); - // Split into odd and even work-items via control flow - // Branches deliberately duplicated to test impact of optimizations - // This only reliably works with optimizations disabled right now + // Split into odd and even work-items via control flow. + // Branches deliberately duplicated to test impact of optimizations. + // This only reliably works with optimizations disabled right now. if (item.get_global_id() % 2 == 0) { auto TangleGroup = syclex::get_tangle_group(SG); From ee1f3a300a1825dae3241de0176ee67fc50e3169 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Thu, 30 Mar 2023 11:23:04 -0700 Subject: [PATCH 14/19] Remove names of unused parameters --- sycl/include/sycl/detail/spirv.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 45a2ba2f4ff1b..9534cc5a4f9e9 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -103,7 +103,7 @@ void GenericCall(const Functor &ApplyToBytes) { } } -template bool GroupAll(Group g, bool pred) { +template bool GroupAll(Group, bool pred) { return __spirv_GroupAll(group_scope::value, pred); } template @@ -119,7 +119,7 @@ bool GroupAll(ext::oneapi::experimental::ballot_group g, } } -template bool GroupAny(Group g, bool pred) { +template bool GroupAny(Group, bool pred) { return __spirv_GroupAny(group_scope::value, pred); } template From c1579fdb6d9ed790ada481e9517f040538a8b484 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Thu, 30 Mar 2023 11:47:12 -0700 Subject: [PATCH 15/19] Improve readability of ballot_group tests --- sycl/test-e2e/NonUniformGroups/ballot_group.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp index 05b1a82628063..98fd7174208e5 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp @@ -32,14 +32,17 @@ int main() { auto SG = item.get_sub_group(); // Split into odd and even work-items. - bool Predicate = item.get_global_id() % 2 == 0; + bool Predicate = WI % 2 == 0; auto BallotGroup = syclex::get_ballot_group(SG, Predicate); // Check function return values match Predicate. + // NB: Test currently uses exactly one sub-group, but we use SG + // below in case this changes in future. bool Match = true; auto GroupID = (Predicate) ? 1 : 0; + auto LocalID = SG.get_local_id() / 2; Match &= (BallotGroup.get_group_id() == GroupID); - Match &= (BallotGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (BallotGroup.get_local_id() == LocalID); Match &= (BallotGroup.get_group_range() == 2); Match &= (BallotGroup.get_local_range() == 16); MatchAcc[WI] = Match; From 9df2196e1fb9db179500525c27b0c854c9b89932 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Thu, 30 Mar 2023 11:49:26 -0700 Subject: [PATCH 16/19] Check number of Visible updates in barrier test --- sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp index 9f250062b9108..3f313f640f5b8 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp @@ -58,7 +58,7 @@ int main() { Visible += TmpAcc[Other]; } } - BarrierAcc[WI] = Visible; + BarrierAcc[WI] = (Visible == BallotGroup.get_local_linear_range()); // Simple check of group algorithms. uint32_t OriginalLID = SG.get_local_linear_id(); From 7359d6b9797a2f3ea5d3caaa294cb6f78620a083 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Thu, 30 Mar 2023 11:51:11 -0700 Subject: [PATCH 17/19] Update any_of test to use per-work-item predicate --- sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp index 3f313f640f5b8..ba3af3c839fad 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp @@ -72,7 +72,7 @@ int main() { BroadcastAcc[WI] = (BroadcastResult == 1); } - bool AnyResult = sycl::any_of_group(BallotGroup, Predicate); + bool AnyResult = sycl::any_of_group(BallotGroup, (LID == 0)); if (Predicate) { AnyAcc[WI] = (AnyResult == true); } else { From 57a63245a2b0a7812cf302ea1d3725259e6225e9 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Thu, 30 Mar 2023 13:33:10 -0700 Subject: [PATCH 18/19] Attempt to satisfy clang-format --- .../include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp | 2 +- sycl/include/sycl/ext/oneapi/sub_group_mask.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index ce65ea1f6f71f..c7101fd198c83 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -9,8 +9,8 @@ #pragma once #include #include -#include #include +#include namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index 88bcd543e8eaa..44cc1ab2cfc8d 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -41,7 +41,7 @@ namespace ext::oneapi { struct sub_group_mask; template sycl::detail::enable_if_t, sub_group>::value, - sub_group_mask> + sub_group_mask> group_ballot(Group g, bool predicate = true); struct sub_group_mask { From 354179320cdde01a3817790595ad9cd9d882c16f Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Fri, 21 Apr 2023 07:57:56 -0700 Subject: [PATCH 19/19] Fix any_of_group test --- sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp index ba3af3c839fad..1667445ada44b 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp @@ -73,11 +73,7 @@ int main() { } bool AnyResult = sycl::any_of_group(BallotGroup, (LID == 0)); - if (Predicate) { - AnyAcc[WI] = (AnyResult == true); - } else { - AnyAcc[WI] = (AnyResult == false); - } + AnyAcc[WI] = (AnyResult == true); bool AllResult = sycl::all_of_group(BallotGroup, Predicate); if (Predicate) {