Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ computeTransferSegments(int64_t totalElements, int64_t elementBits,
///
/// The gather_to_lds instruction requires:
/// - Source indices: per-lane divergent (each lane reads from a different
/// source location)
/// - Destination indices: subgroup-uniform (all lanes write to same LDS base)
/// - Destination indices: per-lane divergent (each lane writes to distinct
/// LDS location)
///
/// Index computation rules for each dimension:
///
Expand All @@ -159,7 +160,7 @@ computeTransferSegments(int64_t totalElements, int64_t elementBits,
///
/// Where:
/// - srcDimOffset: position with lane offset (divergent per lane)
/// - dstDimOffset: position without lane offset (uniform across subgroup)
/// - dstDimOffset: position with lane offset (divergent per lane)
/// - indices[dim]: index memref mapping dest positions to source positions
static std::pair<SmallVector<Value>, SmallVector<Value>>
generateGatherIndices(OpBuilder &rewriter, Location loc,
Expand Down Expand Up @@ -407,9 +408,10 @@ struct LowerCoalescedGatherDMAPattern final
SmallVector<Value> srcDimOffsets(outerDimOffsets);
llvm::append_range(srcDimOffsets, srcDelinearize.getResults());

// Destination indices: no lane offset (subgroup-uniform).
// Destination indices: include lane offset (divergent per lane) so
// each lane writes to its own distinct LDS location.
auto dstDelinearize = affine::AffineDelinearizeIndexOp::create(
rewriter, loc, linearOffsetVal, basis, /*hasOuterBound=*/true);
rewriter, loc, srcLinearOffset, basis, /*hasOuterBound=*/true);

SmallVector<Value> dstDimOffsets(outerDimOffsets);
llvm::append_range(dstDimOffsets, dstDelinearize.getResults());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -830,42 +830,32 @@ struct GPUConvertToCoalescedDMAPass final
FunctionOpInterface funcOp = getOperation();
MLIRContext *context = &getContext();

// Pre-check: decide whether all linalg.copy ops should be DMA-converted.
// Only activate when at least one copy already has use_global_load_dma
// (indicating DMA intent from upstream config, e.g. --iree-llvmgpu-use-
// direct-load). Collect all promoted copies (use_global_load_dma or
// derived_thread_config). If ALL are DMA-convertible, upgrade them all to
// use_global_load_dma. If ANY fails, downgrade them all to
// derived_thread_config.
// Pre-check: verify that all copies marked with use_global_load_dma are
// actually DMA-convertible. If any DMA-marked copy fails the check,
// downgrade ALL DMA-marked copies to derived_thread_config.
// Copies already marked with derived_thread_config are left unchanged —
// they should not be upgraded to use_global_load_dma because they may
// have shapes (e.g. scale operands) that are too small for DMA after
// per-warp tiling, leading to incorrect thread distribution.
// Note: GatherOps are excluded — they come from input IR (not from
// GPUPromoteMatmulOperands) and are handled independently by
// ConvertGatherToCoalescedDMA.
SmallVector<linalg::CopyOp> promotedCopies;
bool hasDMAIntent = false;
SmallVector<linalg::CopyOp> dmaCopies;
funcOp->walk([&](linalg::CopyOp copyOp) {
if (getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(copyOp)) {
hasDMAIntent = true;
promotedCopies.push_back(copyOp);
} else if (getLoweringConfig<IREE::GPU::DerivedThreadConfigAttr>(
copyOp)) {
promotedCopies.push_back(copyOp);
dmaCopies.push_back(copyOp);
}
});

if (hasDMAIntent) {
bool allConvertible = llvm::all_of(promotedCopies, isCopyDMAConvertible);
LLVM_DEBUG({
if (!allConvertible) {
llvm::dbgs() << "DMA pre-check: not all copies convertible, "
<< "downgrading " << promotedCopies.size()
if (!dmaCopies.empty()) {
bool allConvertible = llvm::all_of(dmaCopies, isCopyDMAConvertible);
if (!allConvertible) {
LLVM_DEBUG({
llvm::dbgs() << "DMA pre-check: not all DMA copies convertible, "
<< "downgrading " << dmaCopies.size()
<< " copies to derived_thread_config\n";
}
});
for (linalg::CopyOp copyOp : promotedCopies) {
if (allConvertible) {
setLoweringConfig(copyOp,
IREE::GPU::UseGlobalLoadDMAAttr::get(context));
} else {
});
for (linalg::CopyOp copyOp : dmaCopies) {
setLoweringConfig(copyOp,
IREE::GPU::DerivedThreadConfigAttr::get(context));
}
Expand Down
Loading
Loading