Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/TargetOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ class TargetOptions {
/// \brief If enabled, use precise square root
bool NVVMCudaPrecSqrt = false;

/// \brief If enabled, use precise division
bool NVVMCudaPrecDiv = false;

/// \brief If enabled, allow AMDGPU unsafe floating point atomics.
bool AllowAMDGPUUnsafeFPAtomics = false;

Expand Down
5 changes: 5 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -1446,6 +1446,11 @@ defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt",
PosFlag<SetTrue, [], [ClangOption, CC1Option], "Specify">,
NegFlag<SetFalse, [], [ClangOption, CC1Option], "Don't specify">,
BothFlags<[], [ClangOption], " that sqrt is correctly rounded (for CUDA devices)">>;
defm cuda_prec_div : BoolFOption<"cuda-prec-div",
TargetOpts<"NVVMCudaPrecDiv">, DefaultFalse,
PosFlag<SetTrue, [], [ClangOption, CC1Option], "Specify">,
NegFlag<SetFalse, [], [ClangOption, CC1Option], "Don't specify">,
BothFlags<[], [ClangOption], " that div is correctly rounded (for CUDA devices)">>;
}

def emit_static_lib : Flag<["--"], "emit-static-lib">,
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/CodeGen/CodeGenModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1346,6 +1346,8 @@ void CodeGenModule::Release() {
(CodeGenOpts.FPDenormalMode.Output != llvm::DenormalMode::IEEE));
getModule().addModuleFlag(llvm::Module::Max, "nvvm-reflect-prec-sqrt",
getTarget().getTargetOpts().NVVMCudaPrecSqrt);
getModule().addModuleFlag(llvm::Module::Max, "nvvm-reflect-prec-div",
getTarget().getTargetOpts().NVVMCudaPrecDiv);
}

if (LangOpts.SYCLIsDevice) {
Expand Down
8 changes: 6 additions & 2 deletions clang/lib/Driver/ToolChains/AMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1055,8 +1055,12 @@ llvm::SmallVector<std::string, 12> ROCMToolChain::getCommonDeviceLibNames(
options::OPT_fno_fast_math, false);
bool CorrectSqrt = false;
if (DeviceOffloadingKind == Action::OFK_SYCL) {
// When using SYCL, sqrt is only correctly rounded if the flag is specified
CorrectSqrt = DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt);
// When using SYCL, sqrt and div is only correctly rounded if the flag is
// specified
CorrectSqrt =
DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt) ||
DriverArgs.hasArg(options::OPT_foffload_fp32_prec_div) ||
DriverArgs.hasArg(options::OPT_foffload_fp32_prec_sqrt);
} else
CorrectSqrt = DriverArgs.hasFlag(
options::OPT_fhip_fp32_correctly_rounded_divide_sqrt,
Expand Down
5 changes: 4 additions & 1 deletion clang/lib/Driver/ToolChains/Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -965,8 +965,11 @@ void CudaToolChain::addClangTargetOptions(
if (DeviceOffloadingKind == Action::OFK_SYCL) {
SYCLInstallation.addSYCLIncludeArgs(DriverArgs, CC1Args);

if (DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt))
if (DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt) ||
DriverArgs.hasArg(options::OPT_foffload_fp32_prec_sqrt))
CC1Args.push_back("-fcuda-prec-sqrt");
if (DriverArgs.hasArg(options::OPT_foffload_fp32_prec_div))
CC1Args.push_back("-fcuda-prec-div");

bool FastRelaxedMath = DriverArgs.hasFlag(
options::OPT_ffast_math, options::OPT_fno_fast_math, false);
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGenCUDA/flush-denormals.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ extern "C" __device__ void foo() {}
// FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// NOFTZ-NOT: "denormal-fp-math-f32"

// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
// PTXFTZ:[[MODFLAG]] = !{i32 7, !"nvvm-reflect-ftz", i32 1}

// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
// PTXNOFTZ:[[MODFLAG]] = !{i32 7, !"nvvm-reflect-ftz", i32 0}
11 changes: 11 additions & 0 deletions clang/test/CodeGenCUDA/nvvm-reflect-prec-div.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm -fcuda-prec-div %s -o -| FileCheck --check-prefix=CHECK-ON %s
// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s

#include "Inputs/cuda.h"

// Check that the -fcuda-prec-div flag correctly sets the nvvm-reflect module flags.

extern "C" __device__ void foo() {}

// CHECK-ON: !{i32 7, !"nvvm-reflect-prec-div", i32 1}
// CHECK-OFF: !{i32 7, !"nvvm-reflect-prec-div", i32 0}
4 changes: 2 additions & 2 deletions clang/test/CodeGenSYCL/flush-denormals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
void foo() {}

// FTZ32: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// PTXFTZ32:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
// PTXFTZ32:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
// PTXFTZ32:[[MODFLAG]] = !{i32 7, !"nvvm-reflect-ftz", i32 1}

// FTZ: attributes #0 = {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign"
// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
// PTXFTZ:[[MODFLAG]] = !{i32 7, !"nvvm-reflect-ftz", i32 1}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,22 @@
// RUN: %s \
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s

// RUN: %clang -### \
// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa -fno-sycl-libspirv \
// RUN: -Xsycl-target-backend --offload-arch=gfx900 \
// RUN: -foffload-fp32-prec-sqrt \
// RUN: --rocm-path=%S/Inputs/rocm \
// RUN: %s \
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s

// RUN: %clang -### \
// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa -fno-sycl-libspirv \
// RUN: -Xsycl-target-backend --offload-arch=gfx900 \
// RUN: -foffload-fp32-prec-div \
// RUN: --rocm-path=%S/Inputs/rocm \
// RUN: %s \
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s

// CHECK-CORRECT: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc"

// RUN: %clang -### \
Expand All @@ -28,6 +44,22 @@
// RUN: %s \
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CONFLICT %s

// RUN: %clang -### \
// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa -fno-sycl-libspirv \
// RUN: -Xsycl-target-backend --offload-arch=gfx900 \
// RUN: -foffload-fp32-prec-sqrt -fno-hip-fp32-correctly-rounded-divide-sqrt \
// RUN: --rocm-path=%S/Inputs/rocm \
// RUN: %s \
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CONFLICT %s

// RUN: %clang -### \
// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa -fno-sycl-libspirv \
// RUN: -Xsycl-target-backend --offload-arch=gfx900 \
// RUN: -foffload-fp32-prec-div -fno-hip-fp32-correctly-rounded-divide-sqrt \
// RUN: --rocm-path=%S/Inputs/rocm \
// RUN: %s \
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CONFLICT %s

// CHECK-CONFLICT: warning: argument unused during compilation: '-fno-hip-fp32-correctly-rounded-divide-sqrt'
// CHECK-CONFLICT: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc"

Expand Down
18 changes: 18 additions & 0 deletions clang/test/Driver/sycl-nvptx-div.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// REQUIRES: nvptx-registered-target

// RUN: %clang -### -nocudalib \
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda \
// RUN: -foffload-fp32-prec-div \
// RUN: %s \
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s

// CHECK-CORRECT: "-fcuda-prec-div"

// RUN: %clang -### -nocudalib \
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda \
// RUN: %s \
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-APPROX %s

// CHECK-APPROX-NOT: "-fcuda-prec-div"

void func(){};
6 changes: 6 additions & 0 deletions clang/test/Driver/sycl-nvptx-sqrt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@
// RUN: %s \
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s

// RUN: %clang -### -nocudalib \
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda \
// RUN: -foffload-fp32-prec-sqrt \
// RUN: %s \
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s

// CHECK-CORRECT: "-fcuda-prec-sqrt"

// RUN: %clang -### -nocudalib \
Expand Down
11 changes: 7 additions & 4 deletions llvm/docs/NVPTXUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1041,16 +1041,19 @@ Flag Description
=========================== ======================================================
``__CUDA_FTZ=[0,1]`` Use optimized code paths that flush subnormals to zero
``__CUDA_PREC_SQRT=[0,1]`` Use precise square root
``__CUDA_PREC_DIV=[0,1]`` Use precise division
=========================== ======================================================

The value of these flags are determined by the "nvvm-reflect-ftz" and
"nvvm-reflect-prec-sqrt" module flags respectively.
The following sets the ftz flag to 1, and the precise sqrt flag to 1.
The value of these flags are determined by the "nvvm-reflect-ftz",
"nvvm-reflect-prec-sqrt" and "nvvm-reflect-prec-div" module flags respectively.
The following sets the ftz flag to 1, and the precise sqrt and div flag to 1.

.. code-block:: llvm

!llvm.module.flags = !{!0}
!llvm.module.flags = !{!0, !1, !2}
!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
!1 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1}
Copy link
Contributor Author

@MrSidims MrSidims Feb 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line seem to be lost during pulldown (at least git log/blame doesn't show any patches explicitly removing it), so I'm restoring it here.

!2 = !{i32 4, !"nvvm-reflect-prec-div", i32 1}

(``i32 4`` indicates that the value set here overrides the value in another
module we link with. See the `LangRef <LangRef.html#module-flags-metadata>`
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/NVPTX/NVVMReflect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,12 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt")))
ReflectVal = Flag->getSExtValue();
} else if (ReflectArg == "__CUDA_PREC_DIV") {
// Try to pull __CUDA_PREC_DIV from the nvvm-reflect-prec-div module
// flag.
if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
F.getParent()->getModuleFlag("nvvm-reflect-prec-div")))
ReflectVal = Flag->getSExtValue();
}

// If the immediate user is a simple comparison we want to simplify it.
Expand Down
10 changes: 9 additions & 1 deletion llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
declare i32 @__nvvm_reflect(ptr)
@str = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
@str.1 = private unnamed_addr addrspace(1) constant [17 x i8] c"__CUDA_PREC_SQRT\00"
@str.2 = private unnamed_addr addrspace(1) constant [16 x i8] c"__CUDA_PREC_DIV\00"

define i32 @foo() {
%call = call i32 @__nvvm_reflect(ptr addrspacecast (ptr addrspace(1) @str to ptr))
Expand All @@ -17,6 +18,13 @@ define i32 @foo_sqrt() {
ret i32 %call
}

!llvm.module.flags = !{!0, !1}
define i32 @foo_div() {
%call = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(1)* @str.2, i32 0, i32 0) to i8*))
; CHECK: ret i32 42
ret i32 %call
}

!llvm.module.flags = !{!0, !1, !2}
!0 = !{i32 4, !"nvvm-reflect-ftz", i32 42}
!1 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 42}
!2 = !{i32 4, !"nvvm-reflect-prec-div", i32 42}
Loading