Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/Development.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ There are several transformations (LLVM passes) done on the LLVM IR of the devic
* HipDefrost.cpp - removes freeze from instructions (workaround for the llvm-spirv translator).
* HipDynMem.cpp - replaces dynamically sized shared-memory variables (`extern __shared__ type variable[];`) with a kernel argument. This is because in OpenCL, dynamically-sized local memory can only be passed as kernel argument.
* HipEmitLoweredNames.cpp - required processing for hiprtcGetLoweredName()
* HipGlobalVariable.cpp - creates special kernels that handle access and modification of global scope variables.
* HipGlobalVariable.cpp - creates special kernels that handle access and modification of global scope variables. This pass lowers host-accessible global device variables (`__device__` and `__constant__`) by generating shadow kernels. These shadow kernels allow the runtime to query variable properties (size, alignment), bind device addresses, and initialize values. The pass handles both regular and templated variables, including those with COMDAT linkage (e.g., templated `__constant__` variables). Variables marked as `externally_initialized` are lowered regardless of COMDAT status.
* HipKernelArgSpiller.cpp - Reduces the size of large kernel parameter lists by spilling them into a device buffer
* HipLowerSwitch.cpp - Lowers switch instructions with a "non-standard" integer bitwidth (e.g. i4) to bitwidth supported by SPIRV-LLVM-Translator
* HipLowerZeroLengthArrays.cpp - Lowers occurrences of zero length array types (unsupported by SPIRV-LLVM-Translator)
Expand Down
22 changes: 18 additions & 4 deletions llvm_passes/HipGlobalVariables.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ static Instruction *createKernelStub(Module &M, StringRef Name,
F->setCallingConv(CallingConv::SPIR_KERNEL);
// HIP-CLang marks kernels hidden. Do the same here for consistency.
F->setVisibility(GlobalValue::HiddenVisibility);
// Mark the function as externally used to prevent it from being removed by GlobalDCEPass
F->setLinkage(GlobalValue::ExternalLinkage);
BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F);
IRBuilder<> B(BB);
return B.CreateRetVoid();
Expand Down Expand Up @@ -327,23 +329,35 @@ static bool shouldLower(const GlobalVariable &GVar) {
return false; // Already lowered.

// All host accessible global device variables are marked to be externally
// initialized and does not have COMDAT (so far).
if (!GVar.isExternallyInitialized() || GVar.hasComdat())
// initialized. For templated variables, we allow COMDAT linkage.
if (!GVar.isExternallyInitialized()) {
LLVM_DEBUG(dbgs() << "Skipping variable " << GVar.getName()
<< " - not externally initialized\n");
return false;
}

// String literals get an unnamed_addr attribute, we know by it to
// skip them.
if (GVar.hasAtLeastLocalUnnamedAddr())
if (GVar.hasAtLeastLocalUnnamedAddr()) {
LLVM_DEBUG(dbgs() << "Skipping variable " << GVar.getName()
<< " - has unnamed_addr\n");
return false;
}

// Only objects in cross-workgroup address space are considered. LLVM IR
// straight out from the HIP-Clang does not have objects in constant address
// space so we don't look for them.
if (GVar.getAddressSpace() != SpirvCrossWorkGroupAS) return false;
if (GVar.getAddressSpace() != SpirvCrossWorkGroupAS) {
LLVM_DEBUG(dbgs() << "Skipping variable " << GVar.getName()
<< " - wrong address space: " << GVar.getAddressSpace() << "\n");
return false;
}

// Catch globals with unexpected attributes.
assert(!GVar.isThreadLocal());

LLVM_DEBUG(dbgs() << "Lowering variable " << GVar.getName()
<< " - hasInitializer=" << GVar.hasInitializer() << "\n");
return true;
}

Expand Down
168 changes: 133 additions & 35 deletions src/CHIPBackend.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
/*
* Copyright (c) 2021-23 chipStar developers
*
Expand Down Expand Up @@ -58,8 +58,12 @@
const chipstar::DeviceVar *Var,
void *InfoBuffer) {
assert(M && Var && InfoBuffer);
auto *K = M->getKernelByName(std::string(ChipVarInfoPrefix) +
std::string(Var->getName()));

logTrace("queueVariableInfoShadowKernel: Var->getName() = '{}'", Var->getName());
std::string VarNameStr(Var->getName());
logTrace("queueVariableInfoShadowKernel: VarNameStr = '{}'", VarNameStr);

auto *K = M->getKernelByName(std::string(ChipVarInfoPrefix) + VarNameStr);
assert(K && "chipstar::Module is missing a shadow kernel?");
void *Args[] = {&InfoBuffer};
queueKernel(Q, K, Args);
Expand Down Expand Up @@ -135,11 +139,12 @@
}

chipstar::AllocationTracker::~AllocationTracker() {
LOCK(AllocationTrackerMtx); // Protect against concurrent access during cleanup

LOCK(
AllocationTrackerMtx); // Protect against concurrent access during cleanup

// Clear the maps first to prevent double-deletion
PtrToAllocInfo_.clear();

for (auto *Member : AllocInfos_) {
if (Member) {
delete Member;
Expand All @@ -153,9 +158,9 @@
if (!Ptr) {
return nullptr;
}

LOCK(AllocationTrackerMtx); // chipstar::AllocTracker::PtrToAllocInfo_

// In case that Ptr is the base of the allocation, check hash map directly
auto Found = PtrToAllocInfo_.count(const_cast<void *>(Ptr));
if (Found)
Expand Down Expand Up @@ -407,20 +412,26 @@
assert(Size && "Unexpected zero sized device variable.");
assert(Alignment && "Unexpected alignment requirement.");

logTrace("Variable '{}': Size={}, Alignment={}, HasInitializer={}",
Var->getName(), Size, Alignment, HasInitializer);
Var->setDevAddr(
Ctx->allocate(Size, Alignment, hipMemoryType::hipMemoryTypeDevice));
Var->markHasInitializer(HasInitializer);
// Sanity check for object sizes reported by the shadow kernels vs
// __hipRegisterVar. For hipRTC variables, getSize() may return 0 since
// they're not registered via __hipRegisterVar, so skip the check.
assert((Var->getSize() == 0 || Var->getSize() == Size) && "Object size discrepancy!");
// __hipRegisterVar. For device-only variables, we don't have __hipRegisterVar
// so the size is 0 - update it from the shadow kernel.
if (Var->getSize() == 0) {
// This is a device-only variable - update the size
const_cast<SPVVariable*>(Var->getSrcVar())->Size = Size;
}
assert(Var->getSize() == Size && "Object size discrepancy!");
queueVariableBindShadowKernel(Queue, this, Var);
}
Queue->finish();
DeviceVariablesAllocated_ = true;

Ctx->free(VarInfoBufD);

return hipSuccess;
}

Expand Down Expand Up @@ -466,8 +477,11 @@

bool QueuedKernels = false;
for (auto *Var : ChipVars_) {
logTrace("Checking variable '{}' for initialization: hasInitializer={}",
Var->getName(), Var->hasInitializer());
if (!Var->hasInitializer())
continue;
logTrace("Initializing variable '{}'", Var->getName());
queueVariableInitShadowKernel(Queue, this, Var);
QueuedKernels = true;
}
Expand Down Expand Up @@ -692,9 +706,9 @@
chipstar::Context *chipstar::Device::getContext() { return Ctx_; }
int chipstar::Device::getDeviceId() { return Idx_; }

chipstar::DeviceVar *chipstar::Device::getStatGlobalVar(const void *HostPtr) {
if (DeviceVarLookup_.count(HostPtr)) {
auto *Var = DeviceVarLookup_[HostPtr];
chipstar::DeviceVar *chipstar::Device::getStatGlobalVar(const void *Ptr) {
if (DeviceVarLookup_.count(Ptr)) {
auto *Var = DeviceVarLookup_[Ptr];
assert(Var->getDevAddr() && "Missing device pointer.");
return Var;
}
Expand Down Expand Up @@ -1146,6 +1160,8 @@
std::string NameTmp(Info.Name.begin(), Info.Name.end());
std::string VarInfoKernelName = std::string(ChipVarInfoPrefix) + NameTmp;

logTrace("Processing variable: {} with host pointer: {}", NameTmp, (const void*)Info.Ptr.Value);

if (!Mod->hasKernel(VarInfoKernelName)) {
// The kernel compilation pipe is allowed to remove device-side unused
// global variables from the device modules. This is utilized in the
Expand All @@ -1158,6 +1174,7 @@
Info.Name);
continue;
}
logTrace("Found shadow kernel for variable: {}", NameTmp);
auto *Var = new chipstar::DeviceVar(&Info);
Mod->addDeviceVariable(Var);

Expand All @@ -1166,12 +1183,86 @@
HostPtrToCompiledMod_[Info.Ptr] = Mod;
}

// Discover device-only variables (e.g., template instantiations) that weren't
// registered via __hipRegisterVar. These have shadow kernels but no host symbol.
// Static variables protected by DeviceVarMtx to ensure thread safety.
static int DummyHostPtr = 0;
// Use unique_ptr for automatic cleanup when the program exits
static std::vector<std::unique_ptr<SPVVariable>> SyntheticVars;

{
LOCK(DeviceVarMtx); // Protect static SyntheticVars from concurrent access
for (auto *Kernel : Mod->getKernels()) {
const std::string &KernelName = Kernel->getName();
size_t PrefixLen = strlen(ChipVarInfoPrefix);

// Check if this is a variable info shadow kernel
if (KernelName.length() <= PrefixLen ||
KernelName.substr(0, PrefixLen) != ChipVarInfoPrefix)
continue;

// Extract variable name
std::string VarName = KernelName.substr(PrefixLen);

// Check if we already processed this variable
bool AlreadyRegistered = false;
for (const auto &Info : SrcMod->Variables) {
std::string NameTmp(Info.Name.begin(), Info.Name.end());
if (NameTmp == VarName) {
AlreadyRegistered = true;
break;
}
}

if (AlreadyRegistered)
continue;

// Check if already in SyntheticVars (another thread may have added it)
bool AlreadySynthesized = false;
for (const auto &SV : SyntheticVars) {
if (SV->Name == VarName) {
AlreadySynthesized = true;
break;
}
}
if (AlreadySynthesized)
continue;

// This is a device-only variable - create a DeviceVar for it
logTrace("Found device-only variable: {} (no host symbol)", VarName);

// Create a synthetic SPVVariable for this device-only variable
// Use aggregate initialization since SPVVariable has no default constructor
auto *RawVar = new SPVVariable{
{const_cast<SPVModule *>(SrcMod), HostPtr(&DummyHostPtr), VarName}, 0};
std::unique_ptr<SPVVariable> SyntheticVar(RawVar);

auto *Var = new chipstar::DeviceVar(SyntheticVar.get());
Mod->addDeviceVariable(Var);

// Store the synthetic variable so it persists (unique_ptr handles cleanup)
SyntheticVars.push_back(std::move(SyntheticVar));

// Note: We don't add to DeviceVarLookup_ since there's no host pointer
}
}

#ifndef NDEBUG
{
LOCK(DeviceVarMtx); // chipstar::Device::HostPtrToCompiledMod_
assert((!Mod || (HostPtrToCompiledMod_.count(Ptr) &&
HostPtrToCompiledMod_[Ptr] == Mod)) &&
"Forgot to map the host pointers");
// For unregistered templated variables, we might not have a mapping
// In that case, log but continue with normal module creation/mapping
if (Mod && !HostPtrToCompiledMod_.count(Ptr)) {
logTrace("Host pointer {} not found in mapping - likely unregistered "
"templated variable",
static_cast<const void *>(Ptr));
}
// Only assert if we have a module AND it's not properly mapped
// Allow unregistered templated variables to proceed without mapping
if (Mod && HostPtrToCompiledMod_.count(Ptr)) {
assert(HostPtrToCompiledMod_[Ptr] == Mod &&
"Forgot to map the host pointers");
}
}
#endif

Expand Down Expand Up @@ -1283,17 +1374,17 @@

void chipstar::Context::reset() {
logDebug("Resetting Context: deleting allocations");

auto Dev = getDevice();

// Properly free all allocations and clean up AllocationTracker
for (auto &Ptr : AllocatedPtrs_) {
// Get allocation info before freeing
chipstar::AllocationInfo *AllocInfo = Dev->AllocTracker->getAllocInfo(Ptr);

// Free the memory
freeImpl(Ptr);

// Remove from AllocationTracker to prevent double-allocation errors
if (AllocInfo) {
Dev->AllocTracker->eraseRecord(AllocInfo);
Expand All @@ -1311,7 +1402,7 @@
if (!Ptr) {
return hipErrorInvalidValue;
}

chipstar::Device *ChipDev = ::Backend->getActiveDevice();
chipstar::AllocationInfo *AllocInfo =
ChipDev->AllocTracker->getAllocInfo(Ptr);
Expand Down Expand Up @@ -1368,29 +1459,33 @@
// Check if any threads called HIP APIs
usleep(100000);
int activeThreads = GlobalActiveThreads.load(std::memory_order_relaxed);

if (activeThreads <= 1) {
// Only main thread or no threads called HIP APIs
logDebug("waitForThreadExit: No additional threads detected (count: {})", activeThreads);
logDebug("waitForThreadExit: No additional threads detected (count: {})",
activeThreads);
return;
}

// Wait for threads to exit by polling the counter
logDebug("waitForThreadExit: Waiting for {} threads to exit", activeThreads - 1);

logDebug("waitForThreadExit: Waiting for {} threads to exit",
activeThreads - 1);

for (int i = 0; i < 50; i++) { // Max 5 seconds
pthread_yield();
usleep(100000); // 100ms per iteration

activeThreads = GlobalActiveThreads.load(std::memory_order_relaxed);
if (activeThreads <= 1) {
logDebug("waitForThreadExit: All threads exited (count: {})", activeThreads);
logDebug("waitForThreadExit: All threads exited (count: {})",
activeThreads);
return;
}
}

logWarn("waitForThreadExit: Timeout waiting for threads to exit (remaining: {})",
activeThreads - 1);

logWarn(
"waitForThreadExit: Timeout waiting for threads to exit (remaining: {})",
activeThreads - 1);

// Cleanup all queues
{
Expand Down Expand Up @@ -1438,7 +1533,8 @@

chipstar::Context *chipstar::Backend::getActiveContext() {
if (!::Backend) {
CHIPERR_LOG_AND_THROW("Backend not initialized", hipErrorInitializationError);
CHIPERR_LOG_AND_THROW("Backend not initialized",
hipErrorInitializationError);
}
LOCK(::Backend->ActiveCtxMtx); // reading Backend::ChipCtxStack
// assert(ChipCtxStack.size() > 0 && "Context stack is empty");
Expand All @@ -1451,7 +1547,8 @@

chipstar::Device *chipstar::Backend::getActiveDevice() {
if (!::Backend) {
CHIPERR_LOG_AND_THROW("Backend not initialized", hipErrorInitializationError);
CHIPERR_LOG_AND_THROW("Backend not initialized",
hipErrorInitializationError);
}
chipstar::Context *Ctx = getActiveContext();
return Ctx->getDevice();
Expand Down Expand Up @@ -1607,7 +1704,8 @@

chipstar::Queue *chipstar::Backend::findQueue(chipstar::Queue *ChipQueue) {
if (!::Backend) {
CHIPERR_LOG_AND_THROW("Backend not initialized", hipErrorInitializationError);
CHIPERR_LOG_AND_THROW("Backend not initialized",
hipErrorInitializationError);
}
auto Dev = ::Backend->getActiveDevice();
LOCK(Dev->QueueAddRemoveMtx);
Expand Down
1 change: 1 addition & 0 deletions src/CHIPBackend.hh
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,7 @@ public:

void *getDevAddr() const { return DevAddr_; }
void setDevAddr(void *Addr) { DevAddr_ = Addr; }
const SPVVariable *getSrcVar() const { return SrcVar_; }
std::string_view getName() const { return SrcVar_->Name; }
size_t getSize() const { return SrcVar_->Size; }
size_t getAlignment() const { return Alignment_; }
Expand Down
Loading