Skip to content

Commit 3302a79

Browse files
author
Sidorov, Dmitry
committed
[SYCL][NVPTX][HIP] Propagate -foffload-fp32-prec-div/sqrt
It follows the approach from intel#5141 and intel#5309 adding intermediate fcuda-prec-div flag. Signed-off-by: Sidorov, Dmitry <dmitry.sidorov@intel.com>
1 parent 0dfb947 commit 3302a79

14 files changed

Lines changed: 113 additions & 12 deletions

File tree

clang/include/clang/Basic/TargetOptions.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ class TargetOptions {
7878
/// \brief If enabled, use precise square root
7979
bool NVVMCudaPrecSqrt = false;
8080

81+
/// \brief If enabled, use precise division
82+
bool NVVMCudaPrecDiv = false;
83+
8184
/// \brief If enabled, allow AMDGPU unsafe floating point atomics.
8285
bool AllowAMDGPUUnsafeFPAtomics = false;
8386

clang/include/clang/Driver/Options.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1446,6 +1446,11 @@ defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt",
14461446
PosFlag<SetTrue, [], [ClangOption, CC1Option], "Specify">,
14471447
NegFlag<SetFalse, [], [ClangOption, CC1Option], "Don't specify">,
14481448
BothFlags<[], [ClangOption], " that sqrt is correctly rounded (for CUDA devices)">>;
1449+
defm cuda_prec_div : BoolFOption<"cuda-prec-div",
1450+
TargetOpts<"NVVMCudaPrecDiv">, DefaultFalse,
1451+
PosFlag<SetTrue, [], [ClangOption, CC1Option], "Specify">,
1452+
NegFlag<SetFalse, [], [ClangOption, CC1Option], "Don't specify">,
1453+
BothFlags<[], [ClangOption], " that div is correctly rounded (for CUDA devices)">>;
14491454
}
14501455

14511456
def emit_static_lib : Flag<["--"], "emit-static-lib">,

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1346,6 +1346,8 @@ void CodeGenModule::Release() {
13461346
(CodeGenOpts.FPDenormalMode.Output != llvm::DenormalMode::IEEE));
13471347
getModule().addModuleFlag(llvm::Module::Max, "nvvm-reflect-prec-sqrt",
13481348
getTarget().getTargetOpts().NVVMCudaPrecSqrt);
1349+
getModule().addModuleFlag(llvm::Module::Max, "nvvm-reflect-prec-div",
1350+
getTarget().getTargetOpts().NVVMCudaPrecDiv);
13491351
}
13501352

13511353
if (LangOpts.SYCLIsDevice) {

clang/lib/Driver/ToolChains/AMDGPU.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,8 +1055,12 @@ llvm::SmallVector<std::string, 12> ROCMToolChain::getCommonDeviceLibNames(
10551055
options::OPT_fno_fast_math, false);
10561056
bool CorrectSqrt = false;
10571057
if (DeviceOffloadingKind == Action::OFK_SYCL) {
1058-
// When using SYCL, sqrt is only correctly rounded if the flag is specified
1059-
CorrectSqrt = DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt);
1058+
// When using SYCL, sqrt and div is only correctly rounded if the flag is
1059+
// specified
1060+
CorrectSqrt =
1061+
DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt) ||
1062+
DriverArgs.hasArg(options::OPT_foffload_fp32_prec_div) ||
1063+
DriverArgs.hasArg(options::OPT_foffload_fp32_prec_sqrt);
10601064
} else
10611065
CorrectSqrt = DriverArgs.hasFlag(
10621066
options::OPT_fhip_fp32_correctly_rounded_divide_sqrt,

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -965,8 +965,11 @@ void CudaToolChain::addClangTargetOptions(
965965
if (DeviceOffloadingKind == Action::OFK_SYCL) {
966966
SYCLInstallation.addSYCLIncludeArgs(DriverArgs, CC1Args);
967967

968-
if (DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt))
968+
if (DriverArgs.hasArg(options::OPT_fsycl_fp32_prec_sqrt) ||
969+
DriverArgs.hasArg(options::OPT_foffload_fp32_prec_sqrt))
969970
CC1Args.push_back("-fcuda-prec-sqrt");
971+
if (DriverArgs.hasArg(options::OPT_foffload_fp32_prec_div))
972+
CC1Args.push_back("-fcuda-prec-div");
970973

971974
bool FastRelaxedMath = DriverArgs.hasFlag(
972975
options::OPT_ffast_math, options::OPT_fno_fast_math, false);

clang/test/CodeGenCUDA/flush-denormals.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ extern "C" __device__ void foo() {}
4444
// FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
4545
// NOFTZ-NOT: "denormal-fp-math-f32"
4646

47-
// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
47+
// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
4848
// PTXFTZ:[[MODFLAG]] = !{i32 7, !"nvvm-reflect-ftz", i32 1}
4949

50-
// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
50+
// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
5151
// PTXNOFTZ:[[MODFLAG]] = !{i32 7, !"nvvm-reflect-ftz", i32 0}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm -fcuda-prec-div %s -o -| FileCheck --check-prefix=CHECK-ON %s
2+
// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s
3+
4+
#include "Inputs/cuda.h"
5+
6+
// Check that the -fcuda-prec-div flag correctly sets the nvvm-reflect module flags.
7+
8+
extern "C" __device__ void foo() {}
9+
10+
// CHECK-ON: !{i32 7, !"nvvm-reflect-prec-div", i32 1}
11+
// CHECK-OFF: !{i32 7, !"nvvm-reflect-prec-div", i32 0}

clang/test/CodeGenSYCL/flush-denormals.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010
void foo() {}
1111

1212
// FTZ32: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
13-
// PTXFTZ32:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
13+
// PTXFTZ32:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
1414
// PTXFTZ32:[[MODFLAG]] = !{i32 7, !"nvvm-reflect-ftz", i32 1}
1515

1616
// FTZ: attributes #0 = {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign"
17-
// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
17+
// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
1818
// PTXFTZ:[[MODFLAG]] = !{i32 7, !"nvvm-reflect-ftz", i32 1}

clang/test/Driver/sycl-amdgcn-sqrt.cpp renamed to clang/test/Driver/sycl-amdgcn-divide-sqrt.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,22 @@
99
// RUN: %s \
1010
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s
1111

12+
// RUN: %clang -### \
13+
// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa -fno-sycl-libspirv \
14+
// RUN: -Xsycl-target-backend --offload-arch=gfx900 \
15+
// RUN: -foffload-fp32-prec-sqrt \
16+
// RUN: --rocm-path=%S/Inputs/rocm \
17+
// RUN: %s \
18+
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s
19+
20+
// RUN: %clang -### \
21+
// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa -fno-sycl-libspirv \
22+
// RUN: -Xsycl-target-backend --offload-arch=gfx900 \
23+
// RUN: -foffload-fp32-prec-div \
24+
// RUN: --rocm-path=%S/Inputs/rocm \
25+
// RUN: %s \
26+
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s
27+
1228
// CHECK-CORRECT: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc"
1329

1430
// RUN: %clang -### \
@@ -28,6 +44,22 @@
2844
// RUN: %s \
2945
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CONFLICT %s
3046

47+
// RUN: %clang -### \
48+
// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa -fno-sycl-libspirv \
49+
// RUN: -Xsycl-target-backend --offload-arch=gfx900 \
50+
// RUN: -foffload-fp32-prec-sqrt -fno-hip-fp32-correctly-rounded-divide-sqrt \
51+
// RUN: --rocm-path=%S/Inputs/rocm \
52+
// RUN: %s \
53+
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CONFLICT %s
54+
55+
// RUN: %clang -### \
56+
// RUN: -fsycl -fsycl-targets=amdgcn-amd-amdhsa -fno-sycl-libspirv \
57+
// RUN: -Xsycl-target-backend --offload-arch=gfx900 \
58+
// RUN: -foffload-fp32-prec-div -fno-hip-fp32-correctly-rounded-divide-sqrt \
59+
// RUN: --rocm-path=%S/Inputs/rocm \
60+
// RUN: %s \
61+
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CONFLICT %s
62+
3163
// CHECK-CONFLICT: warning: argument unused during compilation: '-fno-hip-fp32-correctly-rounded-divide-sqrt'
3264
// CHECK-CONFLICT: "-mlink-builtin-bitcode" "{{.*}}/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc"
3365

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// REQUIRES: nvptx-registered-target
2+
3+
// RUN: %clang -### -nocudalib \
4+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda \
5+
// RUN: -foffload-fp32-prec-div \
6+
// RUN: %s \
7+
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-CORRECT %s
8+
9+
// CHECK-CORRECT: "-fcuda-prec-div"
10+
11+
// RUN: %clang -### -nocudalib \
12+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda \
13+
// RUN: %s \
14+
// RUN: 2>&1 | FileCheck --check-prefix=CHECK-APPROX %s
15+
16+
// CHECK-APPROX-NOT: "-fcuda-prec-div"
17+
18+
void func(){};

0 commit comments

Comments
 (0)