Fix AMD GPU alloca address space errors (#433)

phambinhfin · web-flow · commit b3f5970805e9 · 2025-11-19T16:10:27.000+02:00
* Fix AMD GPU alloca address space errors

AMD GPUs require stack allocations (alloca instructions) to be in
address space 5 (private, i.e. per-thread scratch memory), not address
space 0 (flat/generic memory).

* Optimize AMDGPU allocas by keeping AS5 pointers throughout

* Fix AMDGPU allocas to use address space 5 in MLIR lowering
diff --git a/xla/codegen/emitters/transforms/lower_to_llvm.cc b/xla/codegen/emitters/transforms/lower_to_llvm.cc
@@ -140,6 +140,29 @@ class LowerToLLVMPass : public impl::LowerToLLVMPassBase<LowerToLLVMPass> {
                                    std::move(mathPatterns)))) {
       signalPassFailure();
     }
+    
+    // For AMDGPU, fix allocas to use address space 5 (private)
+    // AMDGPU requires allocas in AS5, but MLIR lowering creates them in AS0
+    if (device_spec_.IsAmdGpu()) {
+      getOperation()->walk([](mlir::LLVM::AllocaOp alloca) {
+        auto ptr_type = mlir::cast<mlir::LLVM::LLVMPointerType>(alloca.getResult().getType());
+        // Check if address space is 0 (default/generic)
+        if (ptr_type.getAddressSpace() == 0) {
+          mlir::OpBuilder builder(alloca);
+          // Create new alloca in address space 5
+          auto new_ptr_type = mlir::LLVM::LLVMPointerType::get(builder.getContext(), 5);
+          auto new_alloca = builder.create<mlir::LLVM::AllocaOp>(
+              alloca.getLoc(),
+              new_ptr_type,
+              alloca.getElemType(),
+              alloca.getArraySize(),
+              alloca.getAlignment().value_or(0));
+          alloca.replaceAllUsesWith(new_alloca.getResult());
+          alloca.erase();
+        }
+      });
+      VLOG(3) << "Fixed AMDGPU allocas to use address space 5";
+    }
   }
 
  private:
diff --git a/xla/service/llvm_ir/llvm_loop.cc b/xla/service/llvm_ir/llvm_loop.cc
@@ -105,8 +105,9 @@ void ForLoop::Emit(llvm::IRBuilderBase* b) {
   llvm::Function* func = preheader_bb_->getParent();
   b->SetInsertPoint(&func->getEntryBlock(),
                     func->getEntryBlock().getFirstInsertionPt());
-  llvm::Value* indvar_address = b->CreateAlloca(
-      start_index_->getType(), nullptr, GetQualifiedName("invar_address"));
+  // Use EmitAllocaAtFunctionEntryWithCount which handles AMD GPU address space correctly
+  llvm::Value* indvar_address = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+      start_index_->getType(), nullptr, GetQualifiedName("invar_address"), b, 0);
 
   // Preheader basic block.
   // Initialize induction variable starting index. Create branch to the header.
diff --git a/xla/service/llvm_ir/tuple_ops.cc b/xla/service/llvm_ir/tuple_ops.cc
@@ -82,9 +82,9 @@ std::vector<llvm::Value*> EmitTupleAllocasAtFunctionEntry(
     CHECK(ShapeUtil::IsScalar(element_shape));
     llvm::Type* type = llvm_ir::PrimitiveTypeToIrType(
         element_shape.element_type(), b->getContext());
-    llvm::AllocaInst* alloca = b->CreateAlloca(
-        type,
-        /*ArraySize=*/nullptr, AsStringRef(absl::StrCat("tuple_element_", i)));
+    // Use EmitAllocaAtFunctionEntry which handles AMD GPU address space correctly
+    llvm::AllocaInst* alloca = llvm_ir::EmitAllocaAtFunctionEntry(
+        type, absl::StrCat("tuple_element_", i), b);
     generated_allocas.push_back(alloca);
   }