Skip to content

[MLIR][OpenMP] Improve Generic-SPMD kernel detection #137307

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 37 additions & 51 deletions mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1955,7 +1955,7 @@ LogicalResult TargetOp::verifyRegions() {
}

static Operation *
findCapturedOmpOp(Operation *rootOp, bool checkSingleMandatoryExec,
findCapturedOmpOp(Operation *rootOp,
llvm::function_ref<bool(Operation *)> siblingAllowedFn) {
assert(rootOp && "expected valid operation");

Expand Down Expand Up @@ -1983,19 +1983,17 @@ findCapturedOmpOp(Operation *rootOp, bool checkSingleMandatoryExec,
// (i.e. its block's successors can reach it) or if it's not guaranteed to
// be executed before all exits of the region (i.e. it doesn't dominate all
// blocks with no successors reachable from the entry block).
if (checkSingleMandatoryExec) {
Region *parentRegion = op->getParentRegion();
Block *parentBlock = op->getBlock();

for (Block *successor : parentBlock->getSuccessors())
if (successor->isReachable(parentBlock))
return WalkResult::interrupt();

for (Block &block : *parentRegion)
if (domInfo.isReachableFromEntry(&block) && block.hasNoSuccessors() &&
!domInfo.dominates(parentBlock, &block))
return WalkResult::interrupt();
}
Region *parentRegion = op->getParentRegion();
Block *parentBlock = op->getBlock();

for (Block *successor : parentBlock->getSuccessors())
if (successor->isReachable(parentBlock))
return WalkResult::interrupt();

for (Block &block : *parentRegion)
if (domInfo.isReachableFromEntry(&block) && block.hasNoSuccessors() &&
!domInfo.dominates(parentBlock, &block))
return WalkResult::interrupt();

// Don't capture this op if it has a not-allowed sibling, and stop recursing
// into nested operations.
Expand All @@ -2018,27 +2016,25 @@ Operation *TargetOp::getInnermostCapturedOmpOp() {

// Only allow OpenMP terminators and non-OpenMP ops that have known memory
// effects, but don't include a memory write effect.
return findCapturedOmpOp(
*this, /*checkSingleMandatoryExec=*/true, [&](Operation *sibling) {
if (!sibling)
return false;

if (ompDialect == sibling->getDialect())
return sibling->hasTrait<OpTrait::IsTerminator>();

if (auto memOp = dyn_cast<MemoryEffectOpInterface>(sibling)) {
SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>, 4>
effects;
memOp.getEffects(effects);
return !llvm::any_of(
effects, [&](MemoryEffects::EffectInstance &effect) {
return isa<MemoryEffects::Write>(effect.getEffect()) &&
isa<SideEffects::AutomaticAllocationScopeResource>(
effect.getResource());
});
}
return true;
return findCapturedOmpOp(*this, [&](Operation *sibling) {
if (!sibling)
return false;

if (ompDialect == sibling->getDialect())
return sibling->hasTrait<OpTrait::IsTerminator>();

if (auto memOp = dyn_cast<MemoryEffectOpInterface>(sibling)) {
SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>, 4>
effects;
memOp.getEffects(effects);
return !llvm::any_of(effects, [&](MemoryEffects::EffectInstance &effect) {
return isa<MemoryEffects::Write>(effect.getEffect()) &&
isa<SideEffects::AutomaticAllocationScopeResource>(
effect.getResource());
});
}
return true;
});
}

TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) {
Expand Down Expand Up @@ -2099,33 +2095,23 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) {
if (isa<LoopOp>(innermostWrapper))
return TargetRegionFlags::spmd | TargetRegionFlags::trip_count;

// Find single immediately nested captured omp.parallel and add spmd flag
// (generic-spmd case).
// Add spmd flag if there's a nested omp.parallel (generic-spmd case).
//
// TODO: This shouldn't have to be done here, as it is too easy to break.
// The openmp-opt pass should be updated to be able to promote kernels like
// this from "Generic" to "Generic-SPMD". However, the use of the
// `kmpc_distribute_static_loop` family of functions produced by the
// OMPIRBuilder for these kernels prevents that from working.
Dialect *ompDialect = targetOp->getDialect();
Operation *nestedCapture = findCapturedOmpOp(
capturedOp, /*checkSingleMandatoryExec=*/false,
[&](Operation *sibling) {
return sibling && (ompDialect != sibling->getDialect() ||
sibling->hasTrait<OpTrait::IsTerminator>());
});
bool hasParallel = capturedOp
->walk<WalkOrder::PreOrder>([](ParallelOp) {
return WalkResult::interrupt();
})
.wasInterrupted();

TargetRegionFlags result =
TargetRegionFlags::generic | TargetRegionFlags::trip_count;

if (!nestedCapture)
return result;

while (nestedCapture->getParentOp() != capturedOp)
nestedCapture = nestedCapture->getParentOp();

return isa<ParallelOp>(nestedCapture) ? result | TargetRegionFlags::spmd
: result;
return hasParallel ? result | TargetRegionFlags::spmd : result;
}
// Detect target-parallel-wsloop[-simd].
else if (isa<WsloopOp>(innermostWrapper)) {
Expand Down
143 changes: 97 additions & 46 deletions mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
// RUN: split-file %s %t
// RUN: mlir-translate -mlir-to-llvmir %t/host.mlir | FileCheck %s --check-prefix=HOST
// RUN: mlir-translate -mlir-to-llvmir %t/device.mlir | FileCheck %s --check-prefix=DEVICE

//--- host.mlir
// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s

module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-amd-amdhsa"]} {
llvm.func @main(%arg0 : !llvm.ptr) {
llvm.func @host(%arg0 : !llvm.ptr) {
%x = llvm.load %arg0 : !llvm.ptr -> i32
%0 = omp.map.info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr
omp.target host_eval(%x -> %lb, %x -> %ub, %x -> %step : i32, i32, i32) map_entries(%0 -> %ptr : !llvm.ptr) {
Expand All @@ -32,36 +28,36 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a
}
}

// HOST-LABEL: define void @main
// HOST: %omp_loop.tripcount = {{.*}}
// HOST-NEXT: br label %[[ENTRY:.*]]
// HOST: [[ENTRY]]:
// HOST: %[[TRIPCOUNT:.*]] = zext i32 %omp_loop.tripcount to i64
// HOST: %[[TRIPCOUNT_KARG:.*]] = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %[[KARGS:.*]], i32 0, i32 8
// HOST-NEXT: store i64 %[[TRIPCOUNT]], ptr %[[TRIPCOUNT_KARG]]
// HOST: %[[RESULT:.*]] = call i32 @__tgt_target_kernel({{.*}}, ptr %[[KARGS]])
// HOST-NEXT: %[[CMP:.*]] = icmp ne i32 %[[RESULT]], 0
// HOST-NEXT: br i1 %[[CMP]], label %[[OFFLOAD_FAILED:.*]], label %{{.*}}
// HOST: [[OFFLOAD_FAILED]]:
// HOST: call void @[[TARGET_OUTLINE:.*]]({{.*}})
// CHECK-LABEL: define void @host
// CHECK: %omp_loop.tripcount = {{.*}}
// CHECK-NEXT: br label %[[ENTRY:.*]]
// CHECK: [[ENTRY]]:
// CHECK: %[[TRIPCOUNT:.*]] = zext i32 %omp_loop.tripcount to i64
// CHECK: %[[TRIPCOUNT_KARG:.*]] = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %[[KARGS:.*]], i32 0, i32 8
// CHECK-NEXT: store i64 %[[TRIPCOUNT]], ptr %[[TRIPCOUNT_KARG]]
// CHECK: %[[RESULT:.*]] = call i32 @__tgt_target_kernel({{.*}}, ptr %[[KARGS]])
// CHECK-NEXT: %[[CMP:.*]] = icmp ne i32 %[[RESULT]], 0
// CHECK-NEXT: br i1 %[[CMP]], label %[[OFFLOAD_FAILED:.*]], label %{{.*}}
// CHECK: [[OFFLOAD_FAILED]]:
// CHECK: call void @[[TARGET_OUTLINE:.*]]({{.*}})

// HOST: define internal void @[[TARGET_OUTLINE]]
// HOST: call void{{.*}}@__kmpc_fork_teams({{.*}}, ptr @[[TEAMS_OUTLINE:.*]], {{.*}})
// CHECK: define internal void @[[TARGET_OUTLINE]]
// CHECK: call void{{.*}}@__kmpc_fork_teams({{.*}}, ptr @[[TEAMS_OUTLINE:.*]], {{.*}})

// HOST: define internal void @[[TEAMS_OUTLINE]]
// HOST: call void @[[DISTRIBUTE_OUTLINE:.*]]({{.*}})
// CHECK: define internal void @[[TEAMS_OUTLINE]]
// CHECK: call void @[[DISTRIBUTE_OUTLINE:.*]]({{.*}})

// HOST: define internal void @[[DISTRIBUTE_OUTLINE]]
// HOST: call void @__kmpc_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 92, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}})
// HOST: call void (ptr, i32, ptr, ...) @__kmpc_fork_call({{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], {{.*}})
// CHECK: define internal void @[[DISTRIBUTE_OUTLINE]]
// CHECK: call void @__kmpc_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 92, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}})
// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call({{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], {{.*}})

// HOST: define internal void @[[PARALLEL_OUTLINE]]
// HOST: call void @__kmpc_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 34, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}})
// CHECK: define internal void @[[PARALLEL_OUTLINE]]
// CHECK: call void @__kmpc_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 34, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}})

//--- device.mlir
// -----

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true, omp.is_gpu = true} {
llvm.func @main(%arg0 : !llvm.ptr) {
llvm.func @device(%arg0 : !llvm.ptr) {
%0 = omp.map.info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr
omp.target map_entries(%0 -> %ptr : !llvm.ptr) {
%x = llvm.load %ptr : !llvm.ptr -> i32
Expand All @@ -87,25 +83,80 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
}
}

// DEVICE: @[[KERNEL_NAME:.*]]_exec_mode = weak protected constant i8 [[EXEC_MODE:3]]
// DEVICE: @llvm.compiler.used = appending global [1 x ptr] [ptr @[[KERNEL_NAME]]_exec_mode], section "llvm.metadata"
// DEVICE: @[[KERNEL_NAME]]_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy {
// DEVICE-SAME: %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 [[EXEC_MODE]], {{.*}}},
// DEVICE-SAME: ptr @{{.*}}, ptr @{{.*}} }
// CHECK: @[[KERNEL_NAME:.*]]_exec_mode = weak protected constant i8 [[EXEC_MODE:3]]
// CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr @[[KERNEL_NAME]]_exec_mode], section "llvm.metadata"
// CHECK: @[[KERNEL_NAME]]_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy {
// CHECK-SAME: %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 [[EXEC_MODE]], {{.*}}},
// CHECK-SAME: ptr @{{.*}}, ptr @{{.*}} }

// CHECK: define weak_odr protected amdgpu_kernel void @[[KERNEL_NAME]]({{.*}})
// CHECK: %{{.*}} = call i32 @__kmpc_target_init(ptr @[[KERNEL_NAME]]_kernel_environment, {{.*}})
// CHECK: call void @[[TARGET_OUTLINE:.*]]({{.*}})
// CHECK: call void @__kmpc_target_deinit()

// CHECK: define internal void @[[TARGET_OUTLINE]]({{.*}})
// CHECK: call void @[[TEAMS_OUTLINE:.*]]({{.*}})

// CHECK: define internal void @[[TEAMS_OUTLINE]]({{.*}})
// CHECK: call void @__kmpc_distribute_static_loop{{.*}}({{.*}}, ptr @[[DISTRIBUTE_OUTLINE:[^,]*]], {{.*}})

// CHECK: define internal void @[[DISTRIBUTE_OUTLINE]]({{.*}})
// CHECK: call void @__kmpc_parallel_51(ptr {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], ptr {{.*}}, ptr {{.*}}, i64 {{.*}})

// CHECK: define internal void @[[PARALLEL_OUTLINE]]({{.*}})
// CHECK: call void @__kmpc_for_static_loop{{.*}}({{.*}})

// -----

// Device test with a multi-block loop body: the omp.loop_nest contains two
// omp.parallel ops spread across branching blocks (^bb0 -> ^bb2 -> ^bb1), so
// no single parallel op dominates all exits of the region. Per the FileCheck
// lines below, the kernel is still expected to get exec mode 3 — presumably
// Generic-SPMD promotion via the nested-parallel walk (see the
// getKernelExecFlags change in this PR) — TODO confirm against the C++ change.
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true, omp.is_gpu = true} {
  llvm.func @device2(%arg0 : !llvm.ptr) {
    // Map a single i32 by copy into the target region.
    %0 = omp.map.info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr
    omp.target map_entries(%0 -> %ptr : !llvm.ptr) {
      %x = llvm.load %ptr : !llvm.ptr -> i32
      omp.teams {
        omp.distribute {
          // Degenerate bounds (%x to %x) — the trip count is irrelevant here;
          // the test exercises the CFG shape, not the iteration space.
          omp.loop_nest (%iv1) : i32 = (%x) to (%x) step (%x) {
            // First parallel region, in the loop-nest entry block.
            omp.parallel {
              omp.terminator
            }
            // Jump over ^bb1 so block order differs from execution order.
            llvm.br ^bb2
          ^bb1:
            // Second parallel region, only reachable via ^bb2.
            omp.parallel {
              omp.terminator
            }
            omp.yield
          ^bb2:
            llvm.br ^bb1
          }
        }
        omp.terminator
      }
      omp.terminator
    }
    llvm.return
  }
}

// CHECK: @[[KERNEL_NAME:.*]]_exec_mode = weak protected constant i8 [[EXEC_MODE:3]]
// CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr @[[KERNEL_NAME]]_exec_mode], section "llvm.metadata"
// CHECK: @[[KERNEL_NAME]]_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy {
// CHECK-SAME: %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 [[EXEC_MODE]], {{.*}}},
// CHECK-SAME: ptr @{{.*}}, ptr @{{.*}} }

// DEVICE: define weak_odr protected amdgpu_kernel void @[[KERNEL_NAME]]({{.*}})
// DEVICE: %{{.*}} = call i32 @__kmpc_target_init(ptr @[[KERNEL_NAME]]_kernel_environment, {{.*}})
// DEVICE: call void @[[TARGET_OUTLINE:.*]]({{.*}})
// DEVICE: call void @__kmpc_target_deinit()
// CHECK: define weak_odr protected amdgpu_kernel void @[[KERNEL_NAME]]({{.*}})
// CHECK: %{{.*}} = call i32 @__kmpc_target_init(ptr @[[KERNEL_NAME]]_kernel_environment, {{.*}})
// CHECK: call void @[[TARGET_OUTLINE:.*]]({{.*}})
// CHECK: call void @__kmpc_target_deinit()

// DEVICE: define internal void @[[TARGET_OUTLINE]]({{.*}})
// DEVICE: call void @[[TEAMS_OUTLINE:.*]]({{.*}})
// CHECK: define internal void @[[TARGET_OUTLINE]]({{.*}})
// CHECK: call void @[[TEAMS_OUTLINE:.*]]({{.*}})

// DEVICE: define internal void @[[TEAMS_OUTLINE]]({{.*}})
// DEVICE: call void @__kmpc_distribute_static_loop{{.*}}({{.*}}, ptr @[[DISTRIBUTE_OUTLINE:[^,]*]], {{.*}})
// CHECK: define internal void @[[TEAMS_OUTLINE]]({{.*}})
// CHECK: call void @__kmpc_distribute_static_loop{{.*}}({{.*}}, ptr @[[DISTRIBUTE_OUTLINE:[^,]*]], {{.*}})

// DEVICE: define internal void @[[DISTRIBUTE_OUTLINE]]({{.*}})
// DEVICE: call void @__kmpc_parallel_51(ptr {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], ptr {{.*}}, ptr {{.*}}, i64 {{.*}})
// CHECK: define internal void @[[DISTRIBUTE_OUTLINE]]({{.*}})
// CHECK: call void @__kmpc_parallel_51(ptr {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, ptr @[[PARALLEL_OUTLINE0:.*]], ptr {{.*}}, ptr {{.*}}, i64 {{.*}})
// CHECK: call void @__kmpc_parallel_51(ptr {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, ptr @[[PARALLEL_OUTLINE1:.*]], ptr {{.*}}, ptr {{.*}}, i64 {{.*}})

// DEVICE: define internal void @[[PARALLEL_OUTLINE]]({{.*}})
// DEVICE: call void @__kmpc_for_static_loop{{.*}}({{.*}})
// CHECK: define internal void @[[PARALLEL_OUTLINE1]]({{.*}})
// CHECK: define internal void @[[PARALLEL_OUTLINE0]]({{.*}})