Skip to content

Commit 84c4715

Browse files
committed
Merge from 'sycl' to 'sycl-web' (#1)
CONFLICT (content): Merge conflict in llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp CONFLICT (content): Merge conflict in llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
2 parents 6851795 + 8fa64a9 commit 84c4715

47 files changed

Lines changed: 1755 additions & 544 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

clang/include/clang/Basic/AttrDocs.td

Lines changed: 129 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2503,26 +2503,77 @@ device kernel, the attribute is not ignored and it is propagated to the kernel.
25032503
[[intel::num_simd_work_items(N)]] void operator()() const {}
25042504
};
25052505

2506-
If the`` intel::reqd_work_group_size`` or ``cl::reqd_work_group_size``
2507-
attribute is specified on a declaration along with a
2508-
intel::num_simd_work_items attribute, the work group size attribute
2509-
argument (the first argument) must be evenly divisible by the argument specified
2510-
in the ``intel::num_simd_work_items`` attribute.
2506+
If the ``reqd_work_group_size`` attribute is specified on a declaration along
2507+
with ``num_simd_work_items``, the required work group size specified
2508+
by the ``num_simd_work_items`` attribute must evenly divide the index that
2509+
increments fastest in the ``reqd_work_group_size`` attribute.
2510+
2511+
The arguments to ``reqd_work_group_size`` are ordered based on which index
2512+
increments the fastest. In OpenCL, the first argument is the index that
2513+
increments the fastest, and in SYCL, the last argument is the index that
2514+
increments the fastest.
2515+
2516+
In OpenCL, all three arguments are required.
2517+
2518+
In SYCL, the attribute accepts either one, two, or three arguments; in each
2519+
form, the last (or only) argument is the index that increments fastest.
2520+
The number of arguments passed to the attribute must match the dimensionality
2521+
of the kernel the attribute is applied to.
25112522

25122523
.. code-block:: c++
25132524

2525+
// Note, '64' is evenly divisible by '4'; in SYCL, the last
2526+
// argument to the attribute is the one which increments fastest.
25142527
struct func {
25152528
[[intel::num_simd_work_items(4)]]
2516-
[[intel::reqd_work_group_size(64, 64, 64)]]
2529+
[[intel::reqd_work_group_size(7, 4, 64)]]
25172530
void operator()() const {}
25182531
};
25192532

2533+
// Note, '8' is evenly divisible by '8'; in SYCL, the last
2534+
// argument to the attribute is the one which increments fastest.
25202535
struct bar {
2521-
[[intel::reqd_work_group_size(64, 64, 64)]]
2536+
[[intel::reqd_work_group_size(1, 1, 8)]]
2537+
[[intel::num_simd_work_items(8)]]
2538+
void operator()() const {}
2539+
};
2540+
2541+
// Note, '10' is evenly divisible by '5'; in SYCL, the last
2542+
// argument to the attribute is the one which increments fastest.
2543+
[[cl::reqd_work_group_size(7, 5, 10)]]
2544+
[[intel::num_simd_work_items(5)]] void fun2() {}
2545+
2546+
// Note, '8' is evenly divisible by '4'; in SYCL, the last
2547+
// argument to the attribute is the one which increments fastest.
2548+
[[intel::num_simd_work_items(4)]]
2549+
[[cl::reqd_work_group_size(5, 4, 8)]] void fun3() {}
2550+
2551+
// Note, '8' is evenly divisible by '8'; in SYCL, the last
2552+
// argument to the attribute is the one which increments fastest.
2553+
struct func1 {
2554+
[[intel::num_simd_work_items(8)]]
2555+
[[cl::reqd_work_group_size(1, 1, 8)]]
2556+
void operator()() const {}
2557+
};
2558+
2559+
// Note, '8' is evenly divisible by '4'; in SYCL, the last
2560+
// argument to the attribute is the one which increments fastest.
2561+
struct bar1 {
2562+
[[cl::reqd_work_group_size(7, 4, 8)]]
25222563
[[intel::num_simd_work_items(4)]]
25232564
void operator()() const {}
25242565
};
25252566

2567+
// Note, '4' is evenly divisible by '2'; in SYCL, the last
2568+
// argument to the attribute is the one which increments fastest.
2569+
[[intel::num_simd_work_items(2)]]
2570+
__attribute__((reqd_work_group_size(3, 2, 4))) void test();
2571+
2572+
// Note, '8' is evenly divisible by '2'; in SYCL, the last
2573+
// argument to the attribute is the one which increments fastest.
2574+
__attribute__((reqd_work_group_size(3, 2, 8)))
2575+
[[intel::num_simd_work_items(2)]] void test();
2576+
25262577
}];
25272578
}
25282579

@@ -2636,6 +2687,77 @@ In OpenCL C, this attribute is available in GNU spelling
26362687

26372688
__kernel __attribute__((reqd_work_group_size(8, 16, 32))) void test() {}
26382689

2690+
The arguments to ``reqd_work_group_size`` are ordered based on which index
2691+
increments the fastest. In OpenCL, the first argument is the index that
2692+
increments the fastest, and in SYCL, the last argument is the index that
2693+
increments the fastest.
2694+
2695+
In OpenCL, all three arguments are required.
2696+
2697+
In SYCL, the attribute accepts either one, two, or three arguments; in each
2698+
form, the last (or only) argument is the index that increments fastest. The
2699+
number of arguments passed to the attribute must match the dimensionality of
2700+
the kernel the attribute is applied to.
2701+
2702+
If the ``reqd_work_group_size`` attribute is specified on a declaration along
2703+
with ``num_simd_work_items``, the required work group size specified by
2704+
``num_simd_work_items`` must evenly divide the index that increments fastest
2705+
in the ``reqd_work_group_size`` attribute.
2706+
2707+
.. code-block:: c++
2708+
2709+
// Note, '64' is evenly divisible by '4'; in SYCL, the last
2710+
// argument to the attribute is the one which increments fastest.
2711+
struct func {
2712+
[[intel::num_simd_work_items(4)]]
2713+
[[intel::reqd_work_group_size(7, 4, 64)]]
2714+
void operator()() const {}
2715+
};
2716+
2717+
// Note, '8' is evenly divisible by '8'; in SYCL, the last
2718+
// argument to the attribute is the one which increments fastest.
2719+
struct bar {
2720+
[[intel::reqd_work_group_size(1, 1, 8)]]
2721+
[[intel::num_simd_work_items(8)]]
2722+
void operator()() const {}
2723+
};
2724+
2725+
// Note, '10' is evenly divisible by '5'; in SYCL, the last
2726+
// argument to the attribute is the one which increments fastest.
2727+
[[cl::reqd_work_group_size(7, 5, 10)]]
2728+
[[intel::num_simd_work_items(5)]] void fun2() {}
2729+
2730+
// Note, '8' is evenly divisible by '4'; in SYCL, the last
2731+
// argument to the attribute is the one which increments fastest.
2732+
[[intel::num_simd_work_items(4)]]
2733+
[[cl::reqd_work_group_size(5, 4, 8)]] void fun3() {}
2734+
2735+
// Note, '8' is evenly divisible by '8'; in SYCL, the last
2736+
// argument to the attribute is the one which increments fastest.
2737+
struct func1 {
2738+
[[intel::num_simd_work_items(8)]]
2739+
[[cl::reqd_work_group_size(1, 1, 8)]]
2740+
void operator()() const {}
2741+
};
2742+
2743+
// Note, '8' is evenly divisible by '4'; in SYCL, the last
2744+
// argument to the attribute is the one which increments fastest.
2745+
struct bar1 {
2746+
[[cl::reqd_work_group_size(7, 4, 8)]]
2747+
[[intel::num_simd_work_items(4)]]
2748+
void operator()() const {}
2749+
};
2750+
2751+
// Note, '4' is evenly divisible by '2'; in SYCL, the last
2752+
// argument to the attribute is the one which increments fastest.
2753+
[[intel::num_simd_work_items(2)]]
2754+
__attribute__((reqd_work_group_size(3, 2, 4))) void test();
2755+
2756+
// Note, '8' is evenly divisible by '2'; in SYCL, the last
2757+
// argument to the attribute is the one which increments fastest.
2758+
__attribute__((reqd_work_group_size(3, 2, 8)))
2759+
[[intel::num_simd_work_items(2)]] void test();
2760+
26392761
}];
26402762
}
26412763

clang/include/clang/Basic/SyclOptReportHandler.h

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,21 +25,31 @@ class FunctionDecl;
2525
class SyclOptReportHandler {
2626
private:
2727
struct OptReportInfo {
28-
std::string KernelArgName;
28+
std::string KernelArgDescName; // Kernel argument name itself, or the name
29+
// of the parent class if the kernel argument
30+
// is a decomposed member.
2931
std::string KernelArgType;
3032
SourceLocation KernelArgLoc;
33+
unsigned KernelArgSize;
34+
std::string KernelArgDesc;
35+
std::string KernelArgDecomposedField;
3136

32-
OptReportInfo(std::string ArgName, std::string ArgType,
33-
SourceLocation ArgLoc)
34-
: KernelArgName(std::move(ArgName)), KernelArgType(std::move(ArgType)),
35-
KernelArgLoc(ArgLoc) {}
37+
OptReportInfo(std::string ArgDescName, std::string ArgType,
38+
SourceLocation ArgLoc, unsigned ArgSize, std::string ArgDesc,
39+
std::string ArgDecomposedField)
40+
: KernelArgDescName(std::move(ArgDescName)),
41+
KernelArgType(std::move(ArgType)), KernelArgLoc(ArgLoc),
42+
KernelArgSize(ArgSize), KernelArgDesc(std::move(ArgDesc)),
43+
KernelArgDecomposedField(std::move(ArgDecomposedField)) {}
3644
};
3745
llvm::DenseMap<const FunctionDecl *, SmallVector<OptReportInfo>> Map;
3846

3947
public:
40-
void AddKernelArgs(const FunctionDecl *FD, std::string ArgName,
41-
std::string ArgType, SourceLocation ArgLoc) {
42-
Map[FD].emplace_back(ArgName, ArgType, ArgLoc);
48+
void AddKernelArgs(const FunctionDecl *FD, StringRef ArgDescName,
49+
StringRef ArgType, SourceLocation ArgLoc, unsigned ArgSize,
50+
StringRef ArgDesc, StringRef ArgDecomposedField) {
51+
Map[FD].emplace_back(ArgDescName.data(), ArgType.data(), ArgLoc, ArgSize,
52+
ArgDesc.data(), ArgDecomposedField.data());
4353
}
4454
SmallVector<OptReportInfo> &GetInfo(const FunctionDecl *FD) {
4555
auto It = Map.find(FD);

clang/include/clang/Sema/Sema.h

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1813,6 +1813,29 @@ class Sema final {
18131813
LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/All)
18141814
};
18151815

1816+
private:
1817+
// A collection of pairs of undefined functions and their callers known
1818+
// to be reachable from a routine on the device (kernel or device function).
1819+
typedef std::pair<const FunctionDecl *, const FunctionDecl *> CallPair;
1820+
llvm::SmallVector<CallPair> UndefinedReachableFromSyclDevice;
1821+
1822+
public:
1823+
// Helper routine to add a Callee-Caller pair of FunctionDecl *
1824+
// to UndefinedReachableFromSyclDevice.
1825+
void addFDToReachableFromSyclDevice(const FunctionDecl *Callee,
1826+
const FunctionDecl *Caller) {
1827+
UndefinedReachableFromSyclDevice.push_back(std::make_pair(Callee, Caller));
1828+
}
1829+
// Helper routine to check if a pair of Callee-Caller FunctionDecl *
1830+
// is in UndefinedReachableFromSyclDevice.
1831+
bool isFDReachableFromSyclDevice(const FunctionDecl *Callee,
1832+
const FunctionDecl *Caller) {
1833+
return llvm::any_of(UndefinedReachableFromSyclDevice,
1834+
[Callee, Caller](const CallPair &P) {
1835+
return P.first == Callee && P.second == Caller;
1836+
});
1837+
}
1838+
18161839
/// A generic diagnostic builder for errors which may or may not be deferred.
18171840
///
18181841
/// In CUDA, there exist constructs (e.g. variable-length arrays, try/catch)
@@ -13333,7 +13356,8 @@ class Sema final {
1333313356
/// properly declared for device compilation.
1333413357
void finalizeSYCLDelayedAnalysis(const FunctionDecl *Caller,
1333513358
const FunctionDecl *Callee,
13336-
SourceLocation Loc);
13359+
SourceLocation Loc,
13360+
DeviceDiagnosticReason Reason);
1333713361

1333813362
/// Tells whether given variable is a SYCL explicit SIMD extension's "private
1333913363
/// global" variable - global variable in the private address space.

clang/lib/CodeGen/CodeGenFunction.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1515,14 +1515,18 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
15151515
for (auto ORI : llvm::enumerate(OptReportHandler.GetInfo(FD))) {
15161516
llvm::DiagnosticLocation DL =
15171517
SourceLocToDebugLoc(ORI.value().KernelArgLoc);
1518-
std::string KAN = ORI.value().KernelArgName;
1518+
StringRef NameInDesc = ORI.value().KernelArgDescName;
1519+
StringRef ArgType = ORI.value().KernelArgType;
1520+
StringRef ArgDesc = ORI.value().KernelArgDesc;
1521+
unsigned ArgSize = ORI.value().KernelArgSize;
1522+
StringRef ArgDecomposedField = ORI.value().KernelArgDecomposedField;
1523+
15191524
llvm::OptimizationRemark Remark("sycl", "Region", DL,
15201525
&Fn->getEntryBlock());
1521-
Remark << "Argument " << llvm::ore::NV("Argument", ORI.index())
1522-
<< " for function kernel: "
1523-
<< llvm::ore::NV(KAN.empty() ? "&" : "") << " " << Fn->getName()
1524-
<< "." << llvm::ore::NV(KAN.empty() ? " " : KAN) << "("
1525-
<< ORI.value().KernelArgType << ")";
1526+
Remark << "Arg " << llvm::ore::NV("Argument", ORI.index()) << ":"
1527+
<< ArgDesc << NameInDesc << " (" << ArgDecomposedField
1528+
<< "Type:" << ArgType << ", "
1529+
<< "Size: " << llvm::ore::NV("Argument", ArgSize) << ")";
15261530
ORE.emit(Remark);
15271531
}
15281532
}

clang/lib/Driver/ToolChains/SYCL.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ void SYCL::constructLLVMForeachCommand(Compilation &C, const JobAction &JA,
6161
std::unique_ptr<Command> InputCommand,
6262
const InputInfoList &InputFiles,
6363
const InputInfo &Output, const Tool *T,
64+
StringRef Increment,
6465
StringRef Ext = "out") {
6566
// Construct llvm-foreach command.
6667
// The llvm-foreach command looks like this:
@@ -80,6 +81,9 @@ void SYCL::constructLLVMForeachCommand(Compilation &C, const JobAction &JA,
8081
C.getArgs().MakeArgString("--out-file-list=" + OutputFileName));
8182
ForeachArgs.push_back(
8283
C.getArgs().MakeArgString("--out-replace=" + OutputFileName));
84+
if (!Increment.empty())
85+
ForeachArgs.push_back(
86+
C.getArgs().MakeArgString("--out-increment=" + Increment));
8387
ForeachArgs.push_back(C.getArgs().MakeArgString("--"));
8488
ForeachArgs.push_back(
8589
C.getArgs().MakeArgString(InputCommand->getExecutable()));
@@ -345,7 +349,7 @@ void SYCL::fpga::BackendCompiler::constructOpenCLAOTCommand(
345349
Exec, CmdArgs, None);
346350
if (!ForeachInputs.empty())
347351
constructLLVMForeachCommand(C, JA, std::move(Cmd), ForeachInputs, Output,
348-
this, ForeachExt);
352+
this, "", ForeachExt);
349353
else
350354
C.addCommand(std::move(Cmd));
351355
}
@@ -498,7 +502,7 @@ void SYCL::fpga::BackendCompiler::ConstructJob(
498502
Exec, CmdArgs, None);
499503
if (!ForeachInputs.empty())
500504
constructLLVMForeachCommand(C, JA, std::move(Cmd), ForeachInputs, Output,
501-
this, ForeachExt);
505+
this, ReportOptArg, ForeachExt);
502506
else
503507
C.addCommand(std::move(Cmd));
504508
}
@@ -537,7 +541,7 @@ void SYCL::gen::BackendCompiler::ConstructJob(Compilation &C,
537541
Exec, CmdArgs, None);
538542
if (!ForeachInputs.empty())
539543
constructLLVMForeachCommand(C, JA, std::move(Cmd), ForeachInputs, Output,
540-
this);
544+
this, "");
541545
else
542546
C.addCommand(std::move(Cmd));
543547
}
@@ -570,7 +574,7 @@ void SYCL::x86_64::BackendCompiler::ConstructJob(
570574
Exec, CmdArgs, None);
571575
if (!ForeachInputs.empty())
572576
constructLLVMForeachCommand(C, JA, std::move(Cmd), ForeachInputs, Output,
573-
this);
577+
this, "");
574578
else
575579
C.addCommand(std::move(Cmd));
576580
}

clang/lib/Driver/ToolChains/SYCL.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ void constructLLVMForeachCommand(Compilation &C, const JobAction &JA,
2424
std::unique_ptr<Command> InputCommand,
2525
const InputInfoList &InputFiles,
2626
const InputInfo &Output, const Tool *T,
27-
StringRef Ext);
27+
StringRef Increment, StringRef Ext);
2828

2929
// Runs llvm-spirv to convert spirv to bc, llvm-link, which links multiple LLVM
3030
// bitcode. Converts generated bc back to spirv using llvm-spirv, wraps with

clang/lib/Sema/Sema.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1662,7 +1662,7 @@ class DeferredDiagnosticsEmitter
16621662
S.finalizeOpenMPDelayedAnalysis(Caller, FD, Loc);
16631663
// Finalize analysis of SYCL-specific constructs.
16641664
if (Caller && S.LangOpts.SYCLIsDevice)
1665-
S.finalizeSYCLDelayedAnalysis(Caller, FD, Loc);
1665+
S.finalizeSYCLDelayedAnalysis(Caller, FD, Loc, RootReason);
16661666
if (Caller)
16671667
S.DeviceKnownEmittedFns[FD] = {Caller, Loc};
16681668
// Always emit deferred diagnostics for the direct users. This does not

clang/lib/Sema/SemaDecl.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18493,11 +18493,19 @@ Decl *Sema::getObjCDeclContext() const {
1849318493
}
1849418494

1849518495
Sema::DeviceDiagnosticReason Sema::getEmissionReason(const FunctionDecl *FD) {
18496+
// FIXME: This should really be a bitwise-or of the language modes.
1849618497
if (FD->hasAttr<SYCLSimdAttr>())
1849718498
return Sema::DeviceDiagnosticReason::Esimd;
18498-
else if (FD->hasAttr<SYCLDeviceAttr>() || FD->hasAttr<SYCLKernelAttr>())
18499+
if (FD->hasAttr<SYCLDeviceAttr>() || FD->hasAttr<SYCLKernelAttr>())
1849918500
return Sema::DeviceDiagnosticReason::Sycl;
18500-
// FIXME: Figure out the logic for OMP and CUDA.
18501+
// FIXME: Refine the logic for CUDA and OpenMP.
18502+
if (getLangOpts().CUDA)
18503+
return getLangOpts().CUDAIsDevice ? Sema::DeviceDiagnosticReason::CudaDevice
18504+
: Sema::DeviceDiagnosticReason::CudaHost;
18505+
if (getLangOpts().OpenMP)
18506+
return getLangOpts().OpenMPIsDevice
18507+
? Sema::DeviceDiagnosticReason::OmpDevice
18508+
: Sema::DeviceDiagnosticReason::OmpHost;
1850118509
return Sema::DeviceDiagnosticReason::All;
1850218510
}
1850318511

0 commit comments

Comments
 (0)