Skip to content

Commit eaa836c

Browse files
committed
[MLIR][OpenMP] Skip host omp ops when compiling for the target device
This patch separates the lowering dispatch for host and target devices. When compiling for the target device, an operation that is neither a top-level target operation (e.g. omp.target) nor located inside a target device code region is ignored, since it belongs to the host code.
1 parent 07a5667 commit eaa836c

8 files changed

+312
-118
lines changed

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Lines changed: 176 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -3042,6 +3042,172 @@ convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute,
30423042
return success();
30433043
}
30443044

3045+
/// Returns true if `op` is considered target device code: either it is nested
/// inside an `omp::TargetOp`, or its enclosing `LLVM::LLVMFuncOp` is marked
/// declare target with a device type other than `host`. Returns false in all
/// other cases.
static bool isTargetDeviceOp(Operation *op) {
3046+
// Assumes no reverse offloading: any op nested in a TargetOp counts as device code.
3047+
if (op->getParentOfType<omp::TargetOp>())
3048+
return true;
3049+
3050+
// Otherwise consult the declare target information of the enclosing function.
if (auto parentFn = op->getParentOfType<LLVM::LLVMFuncOp>())
3051+
if (auto declareTargetIface =
3052+
llvm::dyn_cast<mlir::omp::DeclareTargetInterface>(
3053+
parentFn.getOperation()))
3054+
if (declareTargetIface.isDeclareTarget() &&
3055+
declareTargetIface.getDeclareTargetDeviceType() !=
3056+
mlir::omp::DeclareTargetDeviceType::host)
3057+
return true;
3058+
3059+
return false;
3060+
}
3061+
3062+
/// Given an OpenMP MLIR operation, create the corresponding LLVM IR
3063+
/// (including OpenMP runtime calls).
/// Dispatches on the concrete type of `op` through a TypeSwitch and forwards
/// to the matching convertOmp* helper or OpenMPIRBuilder call. Op types that
/// need no lowering of their own return success without emitting anything;
/// op types without a case emit an "unsupported OpenMP operation" error.
3064+
static LogicalResult
3065+
convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
3066+
LLVM::ModuleTranslation &moduleTranslation) {
3067+
3068+
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
3069+
3070+
return llvm::TypeSwitch<Operation *, LogicalResult>(op)
3071+
.Case([&](omp::BarrierOp) {
3072+
ompBuilder->createBarrier(builder.saveIP(), llvm::omp::OMPD_barrier);
3073+
return success();
3074+
})
3075+
.Case([&](omp::TaskwaitOp) {
3076+
ompBuilder->createTaskwait(builder.saveIP());
3077+
return success();
3078+
})
3079+
.Case([&](omp::TaskyieldOp) {
3080+
ompBuilder->createTaskyield(builder.saveIP());
3081+
return success();
3082+
})
3083+
.Case([&](omp::FlushOp) {
3084+
// No support in OpenMP runtime function (__kmpc_flush) to accept
3085+
// the argument list.
3086+
// OpenMP standard states the following:
3087+
// "An implementation may implement a flush with a list by ignoring
3088+
// the list, and treating it the same as a flush without a list."
3089+
//
3090+
// The argument list is discarded so that a flush with a list is treated
3091+
// the same as a flush without a list.
3092+
ompBuilder->createFlush(builder.saveIP());
3093+
return success();
3094+
})
3095+
.Case([&](omp::ParallelOp op) {
3096+
return convertOmpParallel(op, builder, moduleTranslation);
3097+
})
3098+
.Case([&](omp::ReductionOp reductionOp) {
3099+
return convertOmpReductionOp(reductionOp, builder, moduleTranslation);
3100+
})
3101+
.Case([&](omp::MasterOp) {
3102+
return convertOmpMaster(*op, builder, moduleTranslation);
3103+
})
3104+
.Case([&](omp::CriticalOp) {
3105+
return convertOmpCritical(*op, builder, moduleTranslation);
3106+
})
3107+
.Case([&](omp::OrderedRegionOp) {
3108+
return convertOmpOrderedRegion(*op, builder, moduleTranslation);
3109+
})
3110+
.Case([&](omp::OrderedOp) {
3111+
return convertOmpOrdered(*op, builder, moduleTranslation);
3112+
})
3113+
.Case([&](omp::WsloopOp) {
3114+
return convertOmpWsloop(*op, builder, moduleTranslation);
3115+
})
3116+
.Case([&](omp::SimdLoopOp) {
3117+
return convertOmpSimdLoop(*op, builder, moduleTranslation);
3118+
})
3119+
.Case([&](omp::AtomicReadOp) {
3120+
return convertOmpAtomicRead(*op, builder, moduleTranslation);
3121+
})
3122+
.Case([&](omp::AtomicWriteOp) {
3123+
return convertOmpAtomicWrite(*op, builder, moduleTranslation);
3124+
})
3125+
.Case([&](omp::AtomicUpdateOp op) {
3126+
return convertOmpAtomicUpdate(op, builder, moduleTranslation);
3127+
})
3128+
.Case([&](omp::AtomicCaptureOp op) {
3129+
return convertOmpAtomicCapture(op, builder, moduleTranslation);
3130+
})
3131+
.Case([&](omp::SectionsOp) {
3132+
return convertOmpSections(*op, builder, moduleTranslation);
3133+
})
3134+
.Case([&](omp::SingleOp op) {
3135+
return convertOmpSingle(op, builder, moduleTranslation);
3136+
})
3137+
.Case([&](omp::TeamsOp op) {
3138+
return convertOmpTeams(op, builder, moduleTranslation);
3139+
})
3140+
.Case([&](omp::TaskOp op) {
3141+
return convertOmpTaskOp(op, builder, moduleTranslation);
3142+
})
3143+
.Case([&](omp::TaskgroupOp op) {
3144+
return convertOmpTaskgroupOp(op, builder, moduleTranslation);
3145+
})
3146+
.Case<omp::YieldOp, omp::TerminatorOp, omp::DeclareReductionOp,
3147+
omp::CriticalDeclareOp>([](auto op) {
3148+
// `yield` and `terminator` can be just omitted. The block structure
3149+
// was created in the region that handles their parent operation.
3150+
// `declare_reduction` will be used by reductions and is not
3151+
// converted directly, skip it.
3152+
// `critical.declare` is only used to declare names of critical
3153+
// sections which will be used by `critical` ops and hence can be
3154+
// ignored for lowering. The OpenMP IRBuilder will create unique
3155+
// name for critical section names.
3156+
return success();
3157+
})
3158+
.Case([&](omp::ThreadprivateOp) {
3159+
return convertOmpThreadprivate(*op, builder, moduleTranslation);
3160+
})
3161+
.Case<omp::TargetDataOp, omp::TargetEnterDataOp, omp::TargetExitDataOp,
3162+
omp::TargetUpdateOp>([&](auto op) {
3163+
return convertOmpTargetData(op, builder, moduleTranslation);
3164+
})
3165+
.Case([&](omp::TargetOp) {
3166+
return convertOmpTarget(*op, builder, moduleTranslation);
3167+
})
3168+
.Case<omp::MapInfoOp, omp::MapBoundsOp, omp::PrivateClauseOp>(
3169+
[&](auto op) {
3170+
// No-op, should be handled by relevant owning operations e.g.
3171+
// TargetOp, TargetEnterDataOp, TargetExitDataOp, TargetDataOp etc.
3172+
// and then discarded.
3173+
return success();
3174+
})
3175+
.Default([&](Operation *inst) {
3176+
return inst->emitError("unsupported OpenMP operation: ")
3177+
<< inst->getName();
3178+
});
3179+
}
3180+
3181+
/// Lowers an operation that is part of target device code. At present this
/// simply forwards to convertHostOrTargetOperation and returns its result.
static LogicalResult
3182+
convertTargetDeviceOp(Operation *op, llvm::IRBuilderBase &builder,
3183+
LLVM::ModuleTranslation &moduleTranslation) {
3184+
return convertHostOrTargetOperation(op, builder, moduleTranslation);
3185+
}
3186+
3187+
/// Converts only the `omp::TargetOp` / `omp::TargetDataOp` operations found
/// at or below `op`, leaving every other operation in the nest unconverted.
///
/// If `op` itself is one of those two op types it is converted directly.
/// Otherwise `op` is walked in pre-order and each TargetOp / TargetDataOp
/// encountered is converted. Returns failure if any of those conversions
/// fails, success otherwise.
static LogicalResult
3188+
convertTargetOpsInNest(Operation *op, llvm::IRBuilderBase &builder,
3189+
LLVM::ModuleTranslation &moduleTranslation) {
3190+
if (isa<omp::TargetOp>(op))
3191+
return convertOmpTarget(*op, builder, moduleTranslation);
3192+
if (isa<omp::TargetDataOp>(op))
3193+
return convertOmpTargetData(op, builder, moduleTranslation);
3194+
bool interrupted =
3195+
op->walk<WalkOrder::PreOrder>([&](Operation *oper) {
3196+
if (isa<omp::TargetOp>(oper)) {
3197+
// A failed conversion aborts the whole walk.
if (failed(convertOmpTarget(*oper, builder, moduleTranslation)))
3198+
return WalkResult::interrupt();
3199+
// Converted here; skip() keeps the walk from visiting its nested ops.
return WalkResult::skip();
3200+
}
3201+
if (isa<omp::TargetDataOp>(oper)) {
3202+
if (failed(convertOmpTargetData(oper, builder, moduleTranslation)))
3203+
return WalkResult::interrupt();
3204+
return WalkResult::skip();
3205+
}
3206+
// Any other op is not converted; keep looking inside it.
return WalkResult::advance();
3207+
}).wasInterrupted();
3208+
return failure(interrupted);
3209+
}
3210+
30453211
namespace {
30463212

30473213
/// Implementation of the dialect interface that converts operations belonging
@@ -3057,8 +3223,8 @@ class OpenMPDialectLLVMIRTranslationInterface
30573223
convertOperation(Operation *op, llvm::IRBuilderBase &builder,
30583224
LLVM::ModuleTranslation &moduleTranslation) const final;
30593225

3060-
/// Given an OpenMP MLIR attribute, create the corresponding LLVM-IR, runtime
3061-
/// calls, or operation amendments
3226+
/// Given an OpenMP MLIR attribute, create the corresponding LLVM-IR,
3227+
/// runtime calls, or operation amendments
30623228
LogicalResult
30633229
amendOperation(Operation *op, ArrayRef<llvm::Instruction *> instructions,
30643230
NamedAttribute attribute,
@@ -3163,116 +3329,15 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
31633329
LLVM::ModuleTranslation &moduleTranslation) const {
31643330

31653331
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
3332+
if (ompBuilder->Config.isTargetDevice()) {
3333+
if (isTargetDeviceOp(op)) {
3334+
return convertTargetDeviceOp(op, builder, moduleTranslation);
3335+
} else {
3336+
return convertTargetOpsInNest(op, builder, moduleTranslation);
3337+
}
3338+
}
31663339

3167-
return llvm::TypeSwitch<Operation *, LogicalResult>(op)
3168-
.Case([&](omp::BarrierOp) {
3169-
ompBuilder->createBarrier(builder.saveIP(), llvm::omp::OMPD_barrier);
3170-
return success();
3171-
})
3172-
.Case([&](omp::TaskwaitOp) {
3173-
ompBuilder->createTaskwait(builder.saveIP());
3174-
return success();
3175-
})
3176-
.Case([&](omp::TaskyieldOp) {
3177-
ompBuilder->createTaskyield(builder.saveIP());
3178-
return success();
3179-
})
3180-
.Case([&](omp::FlushOp) {
3181-
// No support in Openmp runtime function (__kmpc_flush) to accept
3182-
// the argument list.
3183-
// OpenMP standard states the following:
3184-
// "An implementation may implement a flush with a list by ignoring
3185-
// the list, and treating it the same as a flush without a list."
3186-
//
3187-
// The argument list is discarded so that, flush with a list is treated
3188-
// same as a flush without a list.
3189-
ompBuilder->createFlush(builder.saveIP());
3190-
return success();
3191-
})
3192-
.Case([&](omp::ParallelOp op) {
3193-
return convertOmpParallel(op, builder, moduleTranslation);
3194-
})
3195-
.Case([&](omp::ReductionOp reductionOp) {
3196-
return convertOmpReductionOp(reductionOp, builder, moduleTranslation);
3197-
})
3198-
.Case([&](omp::MasterOp) {
3199-
return convertOmpMaster(*op, builder, moduleTranslation);
3200-
})
3201-
.Case([&](omp::CriticalOp) {
3202-
return convertOmpCritical(*op, builder, moduleTranslation);
3203-
})
3204-
.Case([&](omp::OrderedRegionOp) {
3205-
return convertOmpOrderedRegion(*op, builder, moduleTranslation);
3206-
})
3207-
.Case([&](omp::OrderedOp) {
3208-
return convertOmpOrdered(*op, builder, moduleTranslation);
3209-
})
3210-
.Case([&](omp::WsloopOp) {
3211-
return convertOmpWsloop(*op, builder, moduleTranslation);
3212-
})
3213-
.Case([&](omp::SimdLoopOp) {
3214-
return convertOmpSimdLoop(*op, builder, moduleTranslation);
3215-
})
3216-
.Case([&](omp::AtomicReadOp) {
3217-
return convertOmpAtomicRead(*op, builder, moduleTranslation);
3218-
})
3219-
.Case([&](omp::AtomicWriteOp) {
3220-
return convertOmpAtomicWrite(*op, builder, moduleTranslation);
3221-
})
3222-
.Case([&](omp::AtomicUpdateOp op) {
3223-
return convertOmpAtomicUpdate(op, builder, moduleTranslation);
3224-
})
3225-
.Case([&](omp::AtomicCaptureOp op) {
3226-
return convertOmpAtomicCapture(op, builder, moduleTranslation);
3227-
})
3228-
.Case([&](omp::SectionsOp) {
3229-
return convertOmpSections(*op, builder, moduleTranslation);
3230-
})
3231-
.Case([&](omp::SingleOp op) {
3232-
return convertOmpSingle(op, builder, moduleTranslation);
3233-
})
3234-
.Case([&](omp::TeamsOp op) {
3235-
return convertOmpTeams(op, builder, moduleTranslation);
3236-
})
3237-
.Case([&](omp::TaskOp op) {
3238-
return convertOmpTaskOp(op, builder, moduleTranslation);
3239-
})
3240-
.Case([&](omp::TaskgroupOp op) {
3241-
return convertOmpTaskgroupOp(op, builder, moduleTranslation);
3242-
})
3243-
.Case<omp::YieldOp, omp::TerminatorOp, omp::DeclareReductionOp,
3244-
omp::CriticalDeclareOp>([](auto op) {
3245-
// `yield` and `terminator` can be just omitted. The block structure
3246-
// was created in the region that handles their parent operation.
3247-
// `declare_reduction` will be used by reductions and is not
3248-
// converted directly, skip it.
3249-
// `critical.declare` is only used to declare names of critical
3250-
// sections which will be used by `critical` ops and hence can be
3251-
// ignored for lowering. The OpenMP IRBuilder will create unique
3252-
// name for critical section names.
3253-
return success();
3254-
})
3255-
.Case([&](omp::ThreadprivateOp) {
3256-
return convertOmpThreadprivate(*op, builder, moduleTranslation);
3257-
})
3258-
.Case<omp::TargetDataOp, omp::TargetEnterDataOp, omp::TargetExitDataOp,
3259-
omp::TargetUpdateOp>([&](auto op) {
3260-
return convertOmpTargetData(op, builder, moduleTranslation);
3261-
})
3262-
.Case([&](omp::TargetOp) {
3263-
return convertOmpTarget(*op, builder, moduleTranslation);
3264-
})
3265-
.Case<omp::MapInfoOp, omp::MapBoundsOp, omp::PrivateClauseOp>(
3266-
[&](auto op) {
3267-
// No-op, should be handled by relevant owning operations e.g.
3268-
// TargetOp, TargetEnterDataOp, TargetExitDataOp, TargetDataOp etc.
3269-
// and then discarded
3270-
return success();
3271-
})
3272-
.Default([&](Operation *inst) {
3273-
return inst->emitError("unsupported OpenMP operation: ")
3274-
<< inst->getName();
3275-
});
3340+
return convertHostOrTargetOperation(op, builder, moduleTranslation);
32763341
}
32773342

32783343
void mlir::registerOpenMPDialectTranslation(DialectRegistry &registry) {

mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
// for nested omp do loop inside omp target region
55

66
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
7-
llvm.func @target_parallel_wsloop(%arg0: !llvm.ptr) attributes {
7+
llvm.func @target_parallel_wsloop(%arg0: !llvm.ptr) attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>,
88
target_cpu = "gfx90a",
9-
target_features = #llvm.target_features<["+gfx9-insts", "+wavefrontsize64"]>
10-
} {
9+
target_features = #llvm.target_features<["+gfx9-insts", "+wavefrontsize64"]>}
10+
{
1111
omp.parallel {
1212
%loop_ub = llvm.mlir.constant(9 : i32) : i32
1313
%loop_lb = llvm.mlir.constant(0 : i32) : i32
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
2+
3+
module attributes {omp.is_target_device = true, omp.is_gpu = true} {
4+
llvm.func @omp_target_region_() {
5+
%0 = llvm.mlir.constant(20 : i32) : i32
6+
%1 = llvm.mlir.constant(10 : i32) : i32
7+
%2 = llvm.mlir.constant(1 : i64) : i64
8+
%3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr
9+
%4 = llvm.mlir.constant(1 : i64) : i64
10+
%5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr
11+
%6 = llvm.mlir.constant(1 : i64) : i64
12+
%7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr
13+
llvm.store %1, %3 : i32, !llvm.ptr
14+
llvm.store %0, %5 : i32, !llvm.ptr
15+
omp.task {
16+
%map1 = omp.map.info var_ptr(%3 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
17+
%map2 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
18+
%map3 = omp.map.info var_ptr(%7 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
19+
omp.target map_entries(%map1 -> %arg0, %map2 -> %arg1, %map3 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
20+
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr):
21+
%8 = llvm.load %arg0 : !llvm.ptr -> i32
22+
%9 = llvm.load %arg1 : !llvm.ptr -> i32
23+
%10 = llvm.add %8, %9 : i32
24+
llvm.store %10, %arg2 : i32, !llvm.ptr
25+
omp.terminator
26+
}
27+
omp.terminator
28+
}
29+
llvm.return
30+
}
31+
32+
llvm.func @omp_target_no_map() {
33+
omp.target {
34+
omp.terminator
35+
}
36+
llvm.return
37+
}
38+
}
39+
40+
// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}_{{.*}}_omp_target_region__l19
41+
// CHECK: ret void

mlir/test/Target/LLVMIR/omptarget-teams-llvm.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
module attributes {omp.is_target_device = true} {
77
llvm.func @foo(i32)
8-
llvm.func @omp_target_teams_shared_simple(%arg0 : i32) {
8+
llvm.func @omp_target_teams_shared_simple(%arg0 : i32) attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} {
99
omp.teams {
1010
llvm.call @foo(%arg0) : (i32) -> ()
1111
omp.terminator

mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
// for nested omp do loop with collapse clause inside omp target region
55

66
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
7-
llvm.func @target_collapsed_wsloop(%arg0: !llvm.ptr) {
7+
llvm.func @target_collapsed_wsloop(%arg0: !llvm.ptr) attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} {
88
%loop_ub = llvm.mlir.constant(99 : i32) : i32
99
%loop_lb = llvm.mlir.constant(0 : i32) : i32
1010
%loop_step = llvm.mlir.constant(1 : index) : i32

0 commit comments

Comments
 (0)