Skip to content
21 changes: 7 additions & 14 deletions flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,13 @@ DataSharingProcessor::DataSharingProcessor(
}

void DataSharingProcessor::processStep1(
mlir::omp::PrivateClauseOps *clauseOps,
llvm::SmallVectorImpl<const semantics::Symbol *> *privateSyms) {
mlir::omp::PrivateClauseOps *clauseOps) {
collectSymbolsForPrivatization();
collectDefaultSymbols();
collectImplicitSymbols();
collectPreDeterminedSymbols();

privatize(clauseOps, privateSyms);
privatize(clauseOps);

insertBarrier();
}
Expand Down Expand Up @@ -415,16 +414,14 @@ void DataSharingProcessor::collectPreDeterminedSymbols() {
preDeterminedSymbols);
}

void DataSharingProcessor::privatize(
mlir::omp::PrivateClauseOps *clauseOps,
llvm::SmallVectorImpl<const semantics::Symbol *> *privateSyms) {
void DataSharingProcessor::privatize(mlir::omp::PrivateClauseOps *clauseOps) {
for (const semantics::Symbol *sym : allPrivatizedSymbols) {
if (const auto *commonDet =
sym->detailsIf<semantics::CommonBlockDetails>()) {
for (const auto &mem : commonDet->objects())
doPrivatize(&*mem, clauseOps, privateSyms);
doPrivatize(&*mem, clauseOps);
} else
doPrivatize(sym, clauseOps, privateSyms);
doPrivatize(sym, clauseOps);
}
}

Expand All @@ -441,9 +438,8 @@ void DataSharingProcessor::copyLastPrivatize(mlir::Operation *op) {
}
}

void DataSharingProcessor::doPrivatize(
const semantics::Symbol *sym, mlir::omp::PrivateClauseOps *clauseOps,
llvm::SmallVectorImpl<const semantics::Symbol *> *privateSyms) {
void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym,
mlir::omp::PrivateClauseOps *clauseOps) {
if (!useDelayedPrivatization) {
cloneSymbol(sym);
copyFirstPrivateSymbol(sym);
Expand Down Expand Up @@ -548,9 +544,6 @@ void DataSharingProcessor::doPrivatize(
clauseOps->privateVars.push_back(hsb.getAddr());
}

if (privateSyms)
privateSyms->push_back(sym);

symToPrivatizer[sym] = privatizerOp;
}

Expand Down
18 changes: 9 additions & 9 deletions flang/lib/Lower/OpenMP/DataSharingProcessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,18 +103,15 @@ class DataSharingProcessor {
void collectDefaultSymbols();
void collectImplicitSymbols();
void collectPreDeterminedSymbols();
void privatize(mlir::omp::PrivateClauseOps *clauseOps,
llvm::SmallVectorImpl<const semantics::Symbol *> *privateSyms);
void privatize(mlir::omp::PrivateClauseOps *clauseOps);
void defaultPrivatize(
mlir::omp::PrivateClauseOps *clauseOps,
llvm::SmallVectorImpl<const semantics::Symbol *> *privateSyms);
void implicitPrivatize(
mlir::omp::PrivateClauseOps *clauseOps,
llvm::SmallVectorImpl<const semantics::Symbol *> *privateSyms);
void
doPrivatize(const semantics::Symbol *sym,
mlir::omp::PrivateClauseOps *clauseOps,
llvm::SmallVectorImpl<const semantics::Symbol *> *privateSyms);
void doPrivatize(const semantics::Symbol *sym,
mlir::omp::PrivateClauseOps *clauseOps);
void copyLastPrivatize(mlir::Operation *op);
void insertLastPrivateCompare(mlir::Operation *op);
void cloneSymbol(const semantics::Symbol *sym);
Expand Down Expand Up @@ -145,15 +142,18 @@ class DataSharingProcessor {
// Step2 performs the copying for lastprivates and requires knowledge of the
// MLIR operation to insert the last private update. Step2 adds
// dealocation code as well.
void processStep1(
mlir::omp::PrivateClauseOps *clauseOps = nullptr,
llvm::SmallVectorImpl<const semantics::Symbol *> *privateSyms = nullptr);
void processStep1(mlir::omp::PrivateClauseOps *clauseOps = nullptr);
void processStep2(mlir::Operation *op, bool isLoop);

void setLoopIV(mlir::Value iv) {
assert(!loopIV && "Loop iteration variable already set");
loopIV = iv;
}

const llvm::SetVector<const semantics::Symbol *> &
getAllSymbolsToPrivatize() const {
return allPrivatizedSymbols;
}
};

} // namespace omp
Expand Down
72 changes: 60 additions & 12 deletions flang/lib/Lower/OpenMP/OpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -758,15 +758,33 @@ genBodyOfTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
llvm::ArrayRef<const semantics::Symbol *> mapSyms,
llvm::ArrayRef<mlir::Location> mapSymLocs,
llvm::ArrayRef<mlir::Type> mapSymTypes,
DataSharingProcessor &dsp,
const mlir::Location &currentLocation,
const ConstructQueue &queue, ConstructQueue::iterator item) {
assert(mapSymTypes.size() == mapSymLocs.size());

fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
mlir::Region &region = targetOp.getRegion();

auto *regionBlock =
firOpBuilder.createBlock(&region, {}, mapSymTypes, mapSymLocs);
mlir::OperandRange privateVars = targetOp.getPrivateVars();

llvm::SmallVector<mlir::Type> allRegionArgTypes;
allRegionArgTypes.reserve(mapSymTypes.size() +
targetOp.getPrivateVars().size());
llvm::transform(mapSymTypes, std::back_inserter(allRegionArgTypes),
[](mlir::Type t) { return t; });
llvm::transform(privateVars, std::back_inserter(allRegionArgTypes),
[](mlir::Value v) { return v.getType(); });

llvm::SmallVector<mlir::Location> allRegionArgLocs;
allRegionArgTypes.reserve(mapSymTypes.size() +
targetOp.getPrivateVars().size());
llvm::transform(mapSymLocs, std::back_inserter(allRegionArgLocs),
[](mlir::Location l) { return l; });
llvm::transform(privateVars, std::back_inserter(allRegionArgLocs),
[](mlir::Value v) { return v.getLoc(); });

auto *regionBlock = firOpBuilder.createBlock(&region, {}, allRegionArgTypes,
allRegionArgLocs);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a comment for this block of code. Is it possible to share any of it with the parallel operation? Or is it not possible due to the isolated from above nature of target?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Abstracted the common pieces into one template and commented it. Let me know if you don't like how I handled this or if it is too abstract and/or noisy.


// Clones the `bounds` placing them inside the target region and returns them.
auto cloneBound = [&](mlir::Value bound) {
Expand Down Expand Up @@ -830,6 +848,20 @@ genBodyOfTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
});
}

for (auto [argIndex, argSymbol] :
llvm::enumerate(dsp.getAllSymbolsToPrivatize())) {
argIndex = mapSyms.size() + argIndex;

const mlir::BlockArgument &arg = region.getArgument(argIndex);
converter.bindSymbol(*argSymbol,
hlfir::translateToExtendedValue(
currentLocation, firOpBuilder, hlfir::Entity{arg},
/*contiguousHint=*/
evaluate::IsSimplyContiguous(
*argSymbol, converter.getFoldingContext()))
.first);
}

// Check if cloning the bounds introduced any dependency on the outer region.
// If so, then either clone them as well if they are MemoryEffectFree, or else
// copy them to a new temporary and add them to the map and block_argument
Expand Down Expand Up @@ -907,6 +939,8 @@ genBodyOfTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
} else {
genNestedEvaluations(converter, eval);
}

dsp.processStep2(targetOp, /*isLoop=*/false);
}

template <typename OpTy, typename... Args>
Expand Down Expand Up @@ -1048,15 +1082,18 @@ static void genTargetClauses(
devicePtrSyms);
cp.processMap(loc, stmtCtx, clauseOps, &mapSyms, &mapLocs, &mapTypes);
cp.processThreadLimit(stmtCtx, clauseOps);
// TODO Support delayed privatization.

if (processHostOnlyClauses)
cp.processNowait(clauseOps);

cp.processTODO<clause::Allocate, clause::Defaultmap, clause::Firstprivate,
clause::InReduction, clause::Private, clause::Reduction,
clause::InReduction, clause::Reduction,
clause::UsesAllocators>(loc,
llvm::omp::Directive::OMPD_target);

// `target private(..)` is only supported in delayed privatization mode.
if (!enableDelayedPrivatization)
cp.processTODO<clause::Private>(loc, llvm::omp::Directive::OMPD_target);
}

static void genTargetDataClauses(
Expand Down Expand Up @@ -1289,7 +1326,6 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
lower::StatementContext stmtCtx;
mlir::omp::ParallelClauseOps clauseOps;
llvm::SmallVector<const semantics::Symbol *> privateSyms;
llvm::SmallVector<mlir::Type> reductionTypes;
llvm::SmallVector<const semantics::Symbol *> reductionSyms;
genParallelClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
Expand Down Expand Up @@ -1319,7 +1355,7 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
/*useDelayedPrivatization=*/true, &symTable);

if (privatize)
dsp.processStep1(&clauseOps, &privateSyms);
dsp.processStep1(&clauseOps);

auto genRegionEntryCB = [&](mlir::Operation *op) {
auto parallelOp = llvm::cast<mlir::omp::ParallelOp>(op);
Expand All @@ -1344,9 +1380,10 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
privateVarLocs);

llvm::SmallVector<const semantics::Symbol *> allSymbols = reductionSyms;
allSymbols.append(privateSyms);
allSymbols.append(dsp.getAllSymbolsToPrivatize().begin(),
dsp.getAllSymbolsToPrivatize().end());

for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) {
fir::ExtendedValue hostExV = converter.getSymbolExtendedValue(*arg);
converter.bindSymbol(*arg, hlfir::translateToExtendedValue(
loc, firOpBuilder, hlfir::Entity{prv},
/*contiguousHint=*/
Expand Down Expand Up @@ -1541,11 +1578,22 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
deviceAddrLocs, deviceAddrTypes, devicePtrSyms,
devicePtrLocs, devicePtrTypes);

llvm::SmallVector<const semantics::Symbol *> privateSyms;
DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval,
/*shouldCollectPreDeterminedSymbols=*/
lower::omp::isLastItemInQueue(item, queue),
/*useDelayedPrivatization=*/true, &symTable);
dsp.processStep1(&clauseOps);

// 5.8.1 Implicit Data-Mapping Attribute Rules
// The following code follows the implicit data-mapping rules to map all the
// symbols used inside the region that have not been explicitly mapped using
// the map clause.
// symbols used inside the region that do not have explicit data-environment
// attribute clauses (neither data-sharing; e.g. `private`, nor `map`
// clauses).
auto captureImplicitMap = [&](const semantics::Symbol &sym) {
if (dsp.getAllSymbolsToPrivatize().contains(&sym))
return;

if (llvm::find(mapSyms, &sym) == mapSyms.end()) {
mlir::Value baseOp = converter.getSymbolAddress(sym);
if (!baseOp)
Expand Down Expand Up @@ -1632,7 +1680,7 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,

auto targetOp = firOpBuilder.create<mlir::omp::TargetOp>(loc, clauseOps);
genBodyOfTargetOp(converter, symTable, semaCtx, eval, targetOp, mapSyms,
mapLocs, mapTypes, loc, queue, item);
mapLocs, mapTypes, dsp, loc, queue, item);
return targetOp;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
! Tests delayed privatization for `targets ... private(..)` for allocatables.

! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
! RUN: -o - %s 2>&1 | FileCheck %s
! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 \
! RUN: | FileCheck %s

subroutine target_allocatable
implicit none
integer, allocatable :: alloc_var

!$omp target private(alloc_var)
alloc_var = 10
!$omp end target
end subroutine target_allocatable

! CHECK-LABEL: omp.private {type = private}
! CHECK-SAME: @[[VAR_PRIVATIZER_SYM:.*]] :
! CHECK-SAME: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc {
! CHECK: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
! CHECK: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "alloc_var", {{.*}}}

! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : !fir.ref<!fir.box<!fir.heap<i32>>>
! CHECK-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
! CHECK-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64
! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64

! CHECK-NEXT: fir.if %[[ALLOC_COND]] {
! CHECK: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 {fir.must_be_heap = true, {{.*}}}
! CHECK-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
! CHECK-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>>
! CHECK-NEXT: } else {
! CHECK-NEXT: %[[ZERO_BITS:.*]] = fir.zero_bits !fir.heap<i32>
! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[ZERO_BITS]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>>
! CHECK-NEXT: }

! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]]
! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])

! CHECK-NEXT: } dealloc {
! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):

! CHECK-NEXT: %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]]
! CHECK-NEXT: %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]]
! CHECK-NEXT: %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]]
! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64
! CHECK-NEXT: %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64

! CHECK-NEXT: fir.if %[[PRIV_NULL_COND]] {
! CHECK: %[[PRIV_VAL_2:.*]] = fir.load %[[PRIV_ARG]]
! CHECK-NEXT: %[[PRIV_ADDR_2:.*]] = fir.box_addr %[[PRIV_VAL_2]]
! CHECK-NEXT: fir.freemem %[[PRIV_ADDR_2]]
! CHECK-NEXT: %[[ZEROS:.*]] = fir.zero_bits
! CHECK-NEXT: %[[ZEROS_BOX:.*]] = fir.embox %[[ZEROS]]
! CHECK-NEXT: fir.store %[[ZEROS_BOX]] to %[[PRIV_ARG]]
! CHECK-NEXT: }

! CHECK-NEXT: omp.yield
! CHECK-NEXT: }


! CHECK-LABEL: func.func @_QPtarget_allocatable() {

! CHECK: %[[VAR_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<i32>>
! CHECK-SAME: {bindc_name = "alloc_var", {{.*}}}
! CHECK: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]]

! CHECK: omp.target private(
! CHECK-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} : [[TYPE]]) {
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
! Tests delayed privatization for `targets ... private(..)` for simple variables.

! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
! RUN: -o - %s 2>&1 | FileCheck %s
! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 \
! RUN: | FileCheck %s

subroutine target_simple
implicit none
integer :: simple_var

!$omp target private(simple_var)
simple_var = 10
!$omp end target
end subroutine target_simple

! CHECK-LABEL: omp.private {type = private}
! CHECK-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
! CHECK: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
! CHECK: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "simple_var", {{.*}}}
! CHECK: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]]
! CHECK: omp.yield(%[[PRIV_DECL]]#0 : !fir.ref<i32>)
! CHECK: }

! CHECK-LABEL: func.func @_QPtarget_simple() {
! CHECK: %[[VAR_ALLOC:.*]] = fir.alloca i32 {bindc_name = "simple_var", {{.*}}}
! CHECK: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]]

! CHECK: omp.target private(
! CHECK-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} : !fir.ref<i32>) {
! CHECK: ^bb0(%[[REG_ARG:.*]]: !fir.ref<i32>):
! CHECK: %[[REG_DECL:.*]]:2 = hlfir.declare %[[REG_ARG]]
! CHECK: %[[C10:.*]] = arith.constant 10
! CHECK: hlfir.assign %[[C10]] to %[[REG_DECL]]#0
! CHECK: omp.terminator
! CHECK: }

! CHECK: return
! CHECK: }