iree-org · lialan · Mar 12, 2026 · Mar 13, 2026
@@ -148,7 +148,8 @@ computeTransferSegments(int64_t totalElements, int64_t elementBits,
 ///
 /// The gather_to_lds instruction requires:
 ///   - Source indices: per-lane divergent (each lane reads from different loc)
-///   - Destination indices: subgroup-uniform (all lanes write to same LDS base)
+///   - Destination indices: per-lane divergent (each lane writes to a distinct
+///     LDS location)
 ///
 /// Index computation rules for each dimension:
 ///
@@ -159,7 +160,7 @@ computeTransferSegments(int64_t totalElements, int64_t elementBits,
 ///
 /// Where:
 ///   - srcDimOffset: position with lane offset (divergent per lane)
-///   - dstDimOffset: position without lane offset (uniform across subgroup)
+///   - dstDimOffset: position with lane offset (divergent per lane)
 ///   - indices[dim]: index memref mapping dest positions to source positions
 static std::pair<SmallVector<Value>, SmallVector<Value>>
 generateGatherIndices(OpBuilder &rewriter, Location loc,
@@ -407,9 +408,10 @@ struct LowerCoalescedGatherDMAPattern final
           SmallVector<Value> srcDimOffsets(outerDimOffsets);
           llvm::append_range(srcDimOffsets, srcDelinearize.getResults());
 
-          // Destination indices: no lane offset (subgroup-uniform).
+          // Destination indices: include lane offset (divergent per lane) so
+          // each lane writes to its own distinct LDS location.
           auto dstDelinearize = affine::AffineDelinearizeIndexOp::create(
-              rewriter, loc, linearOffsetVal, basis, /*hasOuterBound=*/true);
+              rewriter, loc, srcLinearOffset, basis, /*hasOuterBound=*/true);
 
           SmallVector<Value> dstDimOffsets(outerDimOffsets);
           llvm::append_range(dstDimOffsets, dstDelinearize.getResults());

@@ -830,42 +830,32 @@ struct GPUConvertToCoalescedDMAPass final
     FunctionOpInterface funcOp = getOperation();
     MLIRContext *context = &getContext();
 
-    // Pre-check: decide whether all linalg.copy ops should be DMA-converted.
-    // Only activate when at least one copy already has use_global_load_dma
-    // (indicating DMA intent from upstream config, e.g. --iree-llvmgpu-use-
-    // direct-load). Collect all promoted copies (use_global_load_dma or
-    // derived_thread_config). If ALL are DMA-convertible, upgrade them all to
-    // use_global_load_dma. If ANY fails, downgrade them all to
-    // derived_thread_config.
+    // Pre-check: verify that all copies marked with use_global_load_dma are
+    // actually DMA-convertible. If any DMA-marked copy fails the check,
+    // downgrade ALL DMA-marked copies to derived_thread_config.
+    // Copies already marked with derived_thread_config are left unchanged —
+    // they should not be upgraded to use_global_load_dma because they may
+    // have shapes (e.g. scale operands) that are too small for DMA after
+    // per-warp tiling, leading to incorrect thread distribution.
     // Note: GatherOps are excluded — they come from input IR (not from
     // GPUPromoteMatmulOperands) and are handled independently by
     // ConvertGatherToCoalescedDMA.
-    SmallVector<linalg::CopyOp> promotedCopies;
-    bool hasDMAIntent = false;
+    SmallVector<linalg::CopyOp> dmaCopies;
     funcOp->walk([&](linalg::CopyOp copyOp) {
       if (getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(copyOp)) {
-        hasDMAIntent = true;
-        promotedCopies.push_back(copyOp);
-      } else if (getLoweringConfig<IREE::GPU::DerivedThreadConfigAttr>(
-                     copyOp)) {
-        promotedCopies.push_back(copyOp);
+        dmaCopies.push_back(copyOp);
       }
     });
 
-    if (hasDMAIntent) {
-      bool allConvertible = llvm::all_of(promotedCopies, isCopyDMAConvertible);
-      LLVM_DEBUG({
-        if (!allConvertible) {
-          llvm::dbgs() << "DMA pre-check: not all copies convertible, "
-                       << "downgrading " << promotedCopies.size()
+    if (!dmaCopies.empty()) {
+      bool allConvertible = llvm::all_of(dmaCopies, isCopyDMAConvertible);
+      if (!allConvertible) {
+        LLVM_DEBUG({
+          llvm::dbgs() << "DMA pre-check: not all DMA copies convertible, "
+                       << "downgrading " << dmaCopies.size()
                        << " copies to derived_thread_config\n";
-        }
-      });
-      for (linalg::CopyOp copyOp : promotedCopies) {
-        if (allConvertible) {
-          setLoweringConfig(copyOp,
-                            IREE::GPU::UseGlobalLoadDMAAttr::get(context));
-        } else {
+        });
+        for (linalg::CopyOp copyOp : dmaCopies) {
           setLoweringConfig(copyOp,
                             IREE::GPU::DerivedThreadConfigAttr::get(context));
         }