forked from intel/llvm
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathqueue_impl.hpp
More file actions
446 lines (385 loc) · 16.6 KB
/
queue_impl.hpp
File metadata and controls
446 lines (385 loc) · 16.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
//==------------------ queue_impl.hpp - SYCL queue -------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma once
#include <CL/sycl/context.hpp>
#include <CL/sycl/device.hpp>
#include <CL/sycl/event.hpp>
#include <CL/sycl/exception.hpp>
#include <CL/sycl/exception_list.hpp>
#include <CL/sycl/handler.hpp>
#include <CL/sycl/properties/context_properties.hpp>
#include <CL/sycl/properties/queue_properties.hpp>
#include <CL/sycl/property_list.hpp>
#include <CL/sycl/stl.hpp>
#include <detail/context_impl.hpp>
#include <detail/device_impl.hpp>
#include <detail/event_impl.hpp>
#include <detail/plugin.hpp>
#include <detail/scheduler/scheduler.hpp>
#include <detail/thread_pool.hpp>
#include <utility>
__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace detail {
/// Shared pointer to a context implementation object.
using ContextImplPtr = std::shared_ptr<detail::context_impl>;
/// Shared pointer to a device implementation object.
using DeviceImplPtr = shared_ptr_class<detail::device_impl>;
/// Sets max number of queues supported by FPGA RT. queue_impl uses it as the
/// upper bound on the number of backend queues it keeps per SYCL queue.
static constexpr size_t MaxNumQueues = 256;
/// Possible CUDA context types supported by PI CUDA backend
/// TODO: Implement this as a property once there is an extension document
enum class CUDAContextT : char { primary, custom };
/// Default context type created for CUDA backend
constexpr CUDAContextT DefaultContextType = CUDAContextT::custom;
/// Execution order requested when a backend queue is created: Ordered selects
/// in-order execution, OOO selects out-of-order execution.
enum QueueOrder { Ordered, OOO };
class queue_impl {
public:
/// Constructs a SYCL queue from a device using an async_handler and
/// property_list provided.
///
/// A new context is created from \p Device and construction is delegated to
/// the constructor taking a device and a context. The context is created
/// with the CUDA use_primary_context property only when DefaultContextType is
/// CUDAContextT::primary; otherwise it gets an empty property list.
///
/// \param Device is a SYCL device that is used to dispatch tasks submitted
/// to the queue.
/// \param AsyncHandler is a SYCL asynchronous exception handler.
/// \param PropList is a list of properties to use for queue construction.
queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler,
const property_list &PropList)
: queue_impl(Device,
detail::getSyclObjImpl(
context(createSyclObjFromImpl<device>(Device), {},
(DefaultContextType == CUDAContextT::primary)
? property_list{property::context::cuda::
use_primary_context()}
: property_list{})),
AsyncHandler, PropList){};
/// Constructs a SYCL queue with an async_handler and property_list provided
/// form a device and a context.
///
/// \param Device is a SYCL device that is used to dispatch tasks submitted
/// to the queue.
/// \param Context is a SYCL context to associate with the queue being
/// constructed.
/// \param AsyncHandler is a SYCL asynchronous exception handler.
/// \param PropList is a list of properties to use for queue construction.
queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context,
const async_handler &AsyncHandler, const property_list &PropList)
: MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler),
MPropList(PropList), MHostQueue(MDevice->is_host()) {
if (!Context->hasDevice(Device))
throw cl::sycl::invalid_parameter_error(
"Queue cannot be constructed with the given context and device "
"as the context does not contain the given device.",
PI_INVALID_DEVICE);
if (!MHostQueue) {
const QueueOrder QOrder =
MPropList.has_property<property::queue::in_order>()
? QueueOrder::Ordered
: QueueOrder::OOO;
MQueues.push_back(createQueue(QOrder));
}
}
/// Constructs a SYCL queue that wraps an existing plugin interoperability
/// queue handle.
///
/// The handle is stored as the primary backend queue, the device it targets
/// is queried from the plugin, and the handle is retained so this object
/// holds its own reference to it.
///
/// \param PiQueue is a raw PI queue handle.
/// \param Context is a SYCL context to associate with the queue being
/// constructed.
/// \param AsyncHandler is a SYCL asynchronous exception handler.
queue_impl(RT::PiQueue PiQueue, const ContextImplPtr &Context,
           const async_handler &AsyncHandler)
    : MContext(Context), MAsyncHandler(AsyncHandler), MHostQueue(false) {
  MQueues.push_back(pi::cast<RT::PiQueue>(PiQueue));

  const detail::plugin &Plugin = getPlugin();

  // Ask the plugin which device the wrapped queue was created for.
  // TODO catch an exception and put it to list of asynchronous exceptions
  RT::PiDevice QueueDevice{};
  Plugin.call<PiApiKind::piQueueGetInfo>(MQueues.front(), PI_QUEUE_INFO_DEVICE,
                                         sizeof(QueueDevice), &QueueDevice,
                                         nullptr);
  MDevice = DeviceImplPtr(
      new device_impl(QueueDevice, Context->getPlatformImpl()));

  // Take a reference of our own; the destructor releases it.
  // TODO catch an exception and put it to list of asynchronous exceptions
  Plugin.call<PiApiKind::piQueueRetain>(MQueues.front());
}
/// Hands any pending asynchronous exceptions to the async handler and then
/// releases every backend queue held by this object.
~queue_impl() {
  throw_asynchronous();
  if (MHostQueue)
    return;

  // MQueues[0] is the primary queue, but getExclusiveQueueHandleRef() may have
  // created further backend queues. Each of them holds its own reference, so
  // releasing only the first one would leak the rest.
  for (RT::PiQueue Queue : MQueues) {
    // A slot is pushed empty before its queue is created; if that creation
    // failed the slot is still empty and there is nothing to release.
    if (Queue != RT::PiQueue{})
      getPlugin().call<PiApiKind::piQueueRelease>(Queue);
  }
}
/// Retains the primary backend queue and returns it as an OpenCL handle.
///
/// \return an OpenCL interoperability queue handle.
/// \throw invalid_object_error if this is a host queue.
cl_command_queue get() {
  // Host queues have no backend handle to hand out.
  if (MHostQueue) {
    throw invalid_object_error(
        "This instance of queue doesn't support OpenCL interoperability",
        PI_INVALID_QUEUE);
  }

  getPlugin().call<PiApiKind::piQueueRetain>(MQueues[0]);
  return pi::cast<cl_command_queue>(MQueues[0]);
}
/// \return an associated SYCL context.
context get_context() const {
return createSyclObjFromImpl<context>(MContext);
}
/// \return the plugin of the context this queue is associated with.
const plugin &getPlugin() const { return MContext->getPlugin(); }
/// \return a shared pointer to the implementation of the associated context.
ContextImplPtr getContextImplPtr() const { return MContext; }
/// \return an associated SYCL device.
device get_device() const { return createSyclObjFromImpl<device>(MDevice); }
/// \return true if this queue is a SYCL host queue.
bool is_host() const { return MHostQueue; }
/// Queries SYCL queue for information.
///
/// The return type depends on information being queried.
///
/// \tparam Param is the info::queue descriptor selecting what is queried.
/// Only the declaration is visible here; the definitions live elsewhere.
template <info::queue Param>
typename info::param_traits<info::queue, Param>::return_type get_info() const;
/// Submits a command group function object to this queue, falling back to a
/// secondary queue if the submission throws.
///
/// Any exception thrown while submitting to this queue is recorded in this
/// queue's list of asynchronous exceptions, after which the same command
/// group is submitted to \p SecondQueue.
///
/// \param CGF is a function object containing command group.
/// \param Self is a shared_ptr to this queue.
/// \param SecondQueue is a shared_ptr to the secondary queue. It is
/// dereferenced unconditionally on the fallback path.
/// \param Loc is the code location of the submit call (default argument)
/// \return a SYCL event object, which corresponds to the queue the command
/// group is being enqueued on.
event submit(const function_class<void(handler &)> &CGF,
             const shared_ptr_class<queue_impl> &Self,
             const shared_ptr_class<queue_impl> &SecondQueue,
             const detail::code_location &Loc) {
  try {
    return submit_impl(CGF, Self, Loc);
  } catch (...) {
    const std::exception_ptr Failure = std::current_exception();
    {
      // Hold the lock only while the exception list is modified.
      std::lock_guard<mutex_class> Guard(MMutex);
      MExceptions.PushBack(Failure);
    }
    return SecondQueue->submit(CGF, SecondQueue, Loc);
  }
}
/// Submits a command group function object to the queue, in order to be
/// scheduled for execution on the device.
///
/// This overload has no secondary queue: it forwards straight to
/// submit_impl() and does not catch anything, so exceptions thrown during
/// submission propagate to the caller.
///
/// \param CGF is a function object containing command group.
/// \param Self is a shared_ptr to this queue.
/// \param Loc is the code location of the submit call (default argument)
/// \return a SYCL event object for the submitted command group.
event submit(const function_class<void(handler &)> &CGF,
const shared_ptr_class<queue_impl> &Self,
const detail::code_location &Loc) {
return submit_impl(CGF, Self, Loc);
}
/// Performs a blocking wait for the completion of all enqueued tasks in the
/// queue.
///
/// Synchronous errors will be reported through SYCL exceptions.
/// @param Loc is the code location of the submit call (default argument)
void wait(const detail::code_location &Loc = {});
/// \return a copy of the list of asynchronous exceptions recorded so far.
/// NOTE(review): MExceptions is read here without taking MMutex - confirm
/// that callers cannot race with reportAsyncException().
exception_list getExceptionList() const { return MExceptions; }
/// Waits for all enqueued tasks via wait() and then hands any recorded
/// asynchronous exceptions to the async handler via throw_asynchronous().
///
/// @param Loc is the code location of the submit call (default argument)
void wait_and_throw(const detail::code_location &Loc = {}) {
wait(Loc);
throw_asynchronous();
}
/// Delivers the asynchronous exceptions recorded so far to the async_handler
/// passed to the queue on construction.
///
/// This call does not wait for enqueued tasks. If no async_handler was
/// provided it returns immediately and the recorded exceptions stay in the
/// queue's list. Otherwise the recorded exceptions are taken out of the queue
/// under MMutex and, if there are any, passed to the handler in one call.
void throw_asynchronous() {
  if (!MAsyncHandler)
    return;

  exception_list Exceptions;
  {
    std::lock_guard<mutex_class> Lock(MMutex);
    // Swap with a fresh list instead of move-assigning: a moved-from list is
    // only guaranteed to be in a valid but unspecified state, while swapping
    // guarantees MExceptions is empty afterwards, so no exception can be
    // delivered twice.
    std::swap(Exceptions, MExceptions);
  }

  // Unlock the mutex before calling user-provided handler to avoid
  // potential deadlock if the same queue is somehow referenced in the
  // handler.
  if (Exceptions.size())
    MAsyncHandler(std::move(Exceptions));
}
/// Creates a backend (PI) queue for this queue's context and device.
///
/// Profiling is enabled on the new queue when the property list carries
/// property::queue::enable_profiling. If the plugin rejects the requested
/// properties while out-of-order support is still assumed, that assumption is
/// dropped and creation is retried once as an in-order queue.
///
/// \param Order specifies whether the queue being constructed as in-order
/// or out-of-order.
/// \return the handle of the created PI queue.
RT::PiQueue createQueue(QueueOrder Order) {
  RT::PiQueueProperties CreationFlags = 0;
  if (Order == QueueOrder::OOO)
    CreationFlags = PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
  if (MPropList.has_property<property::queue::enable_profiling>())
    CreationFlags |= PI_QUEUE_PROFILING_ENABLE;

  const detail::plugin &Plugin = getPlugin();
  assert(Plugin.getBackend() == MDevice->getPlugin().getBackend());

  RT::PiContext Context = MContext->getHandleRef();
  RT::PiDevice Device = MDevice->getHandleRef();
  RT::PiQueue Queue{};
  const RT::PiResult Error = Plugin.call_nocheck<PiApiKind::piQueueCreate>(
      Context, Device, CreationFlags, &Queue);

  // If creating out-of-order queue failed and this property is not
  // supported (for example, on FPGA), it will return
  // CL_INVALID_QUEUE_PROPERTIES and will try to create in-order queue.
  const bool RetryInOrder =
      MSupportOOO && Error == PI_INVALID_QUEUE_PROPERTIES;
  if (RetryInOrder) {
    MSupportOOO = false;
    return createQueue(QueueOrder::Ordered);
  }

  // Any other outcome is validated by the plugin, which reports failures.
  Plugin.checkPiResult(Error);
  return Queue;
}
/// Provides a backend queue for exclusive use by the next enqueue.
///
/// While fewer than MaxNumQueues backend queues exist, a new in-order queue
/// is created. Once the limit is reached the queues are reused in
/// round-robin order, and a reused queue is finished before it is returned.
///
/// \return a raw PI handle for a free queue. The returned handle is not
/// retained. It is caller responsibility to make sure queue is still alive.
RT::PiQueue &getExclusiveQueueHandleRef() {
  RT::PiQueue *PIQ = nullptr;
  bool ReuseQueue = false;
  {
    std::lock_guard<mutex_class> Lock(MMutex);

    // The slot is written after the lock is dropped and a reference to it is
    // returned to the caller, so slots must never move in memory. Reserving
    // the maximum size up front guarantees that a later push_back (possibly
    // from another thread) cannot reallocate the storage and leave those
    // pointers dangling. This is a no-op once the capacity is large enough.
    MQueues.reserve(MaxNumQueues);

    // To achieve parallelism for FPGA with in order execution model with
    // possibility of two kernels to share data with each other we shall
    // create a queue for every kernel enqueued.
    if (MQueues.size() < MaxNumQueues) {
      MQueues.push_back({});
      PIQ = &MQueues.back();
    } else {
      // If the limit of OpenCL queues is going to be exceeded - take the
      // earliest used queue, wait until it finished and then reuse it.
      PIQ = &MQueues[MNextQueueIdx];
      MNextQueueIdx = (MNextQueueIdx + 1) % MaxNumQueues;
      ReuseQueue = true;
    }
  }

  // Queue creation and the blocking finish run outside the lock.
  if (!ReuseQueue)
    *PIQ = createQueue(QueueOrder::Ordered);
  else
    getPlugin().call<PiApiKind::piQueueFinish>(*PIQ);
  return *PIQ;
}
/// Selects the backend queue to enqueue on: the primary queue while
/// out-of-order execution is supported, otherwise a queue obtained from
/// getExclusiveQueueHandleRef().
///
/// \return a raw PI queue handle. The returned handle is not retained. It
/// is caller responsibility to make sure queue is still alive.
RT::PiQueue &getHandleRef() {
  return MSupportOOO ? MQueues[0] : getExclusiveQueueHandleRef();
}
/// Forwards the query to the property list the queue was constructed with.
///
/// \return true if the queue was constructed with property specified by
/// PropertyT.
template <typename propertyT> bool has_property() const {
return MPropList.has_property<propertyT>();
}
/// Forwards the request to the property list the queue was constructed with.
///
/// \return a copy of the property of type PropertyT that the queue was
/// constructed with. If the queue was not constructed with the PropertyT
/// property, an invalid_object_error SYCL exception is thrown.
template <typename propertyT> propertyT get_property() const {
return MPropList.get_property<propertyT>();
}
/// Fills the memory pointed by a USM pointer with the value specified.
///
/// \param Self is a shared_ptr to this queue.
/// \param Ptr is a USM pointer to the memory to fill.
/// \param Value is a value to be set. Value is cast as an unsigned char.
/// \param Count is a number of bytes to fill.
/// \return an event representing fill operation.
event memset(const shared_ptr_class<queue_impl> &Self, void *Ptr, int Value,
size_t Count);
/// Copies data from one memory region to another, both pointed by
/// USM pointers.
///
/// \param Self is a shared_ptr to this queue.
/// \param Dest is a USM pointer to the destination memory.
/// \param Src is a USM pointer to the source memory.
/// \param Count is a number of bytes to copy.
/// \return an event for the copy operation (the definition is not visible in
/// this header).
event memcpy(const shared_ptr_class<queue_impl> &Self, void *Dest,
const void *Src, size_t Count);
/// Provides additional information to the underlying runtime about how
/// different allocations are used.
///
/// \param Self is a shared_ptr to this queue.
/// \param Ptr is a USM pointer to the allocation.
/// \param Length is a number of bytes in the allocation.
/// \param Advice is a device-defined advice for the specified allocation.
/// \return an event for the advise operation (the definition is not visible
/// in this header).
event mem_advise(const shared_ptr_class<queue_impl> &Self, const void *Ptr,
size_t Length, pi_mem_advice Advice);
/// Puts exception to the list of asynchronous exceptions.
///
/// The list is modified while holding MMutex.
///
/// \param ExceptionPtr is a pointer to exception to be put.
void reportAsyncException(const std::exception_ptr &ExceptionPtr) {
std::lock_guard<mutex_class> Lock(MMutex);
MExceptions.PushBack(ExceptionPtr);
}
/// Gives access to the queue's thread pool, creating it on first use by
/// calling initHostTaskAndEventCallbackThreadPool().
///
/// NOTE(review): the null check is done without any lock visible in this
/// function - confirm whether concurrent first calls are possible and
/// whether the init function serializes them.
///
/// \return a reference to the thread pool.
ThreadPool &getThreadPool() {
if (!MHostTaskThreadPool)
initHostTaskAndEventCallbackThreadPool();
return *MHostTaskThreadPool;
}
/// Gets the native handle of the SYCL queue.
///
/// \return a native handle.
pi_native_handle getNative() const;
private:
/// Runs a command group against a fresh handler and enqueues the result.
///
/// The handler is created for this queue, the code location is stored in it,
/// the command group function is invoked on it, and the event obtained from
/// finalizing the handler is registered with the queue before being returned.
///
/// \param CGF is a function object containing command group.
/// \param Self is a pointer to this queue.
/// \param Loc is the code location of the submit call (default argument)
/// \return a SYCL event representing submitted command group.
event submit_impl(const function_class<void(handler &)> &CGF,
                  const shared_ptr_class<queue_impl> &Self,
                  const detail::code_location &Loc) {
  handler CGH(Self, MHostQueue);
  CGH.saveCodeLoc(Loc);

  // Let the user code populate the handler, then turn it into an event.
  CGF(CGH);
  event Submitted = CGH.finalize();

  addEvent(Submitted);
  return Submitted;
}
// When instrumentation is enabled emits trace event for wait begin and
// returns the telemetry event generated for the wait
void *instrumentationProlog(const detail::code_location &CodeLoc,
string_class &Name, int32_t StreamID,
uint64_t &iid);
// Uses events generated by the Prolog and emits wait done event
void instrumentationEpilog(void *TelementryEvent, string_class &Name,
int32_t StreamID, uint64_t IId);
// Called by getThreadPool() when MHostTaskThreadPool is still empty.
// Presumably creates that thread pool - the definition is not in this header.
void initHostTaskAndEventCallbackThreadPool();
/// Stores a USM operation event that should be associated with the queue
///
/// \param Event is the event to be stored
void addUSMEvent(const event &Event);
/// Stores an event that should be associated with the queue
///
/// \param Event is the event to be stored
void addEvent(const event &Event);
/// Protects all the fields that can be changed by class' methods.
mutex_class MMutex;
/// Device the queue dispatches to.
DeviceImplPtr MDevice;
/// Context the queue is associated with; fixed at construction.
const ContextImplPtr MContext;
/// These events are tracked, but not owned, by the queue.
vector_class<std::weak_ptr<event_impl>> MEventsWeak;
/// Events without data dependencies (such as USM) need an owner,
/// additionally, USM operations are not added to the scheduler command graph,
/// queue is the only owner on the runtime side.
vector_class<event> MEventsShared;
/// Asynchronous exceptions recorded for this queue and not yet delivered to
/// the async handler.
exception_list MExceptions;
/// User-provided asynchronous exception handler; may be empty.
const async_handler MAsyncHandler;
/// Properties the queue was constructed with.
const property_list MPropList;
/// Backend queues held by this SYCL queue. Element 0 is the primary queue;
/// further elements are created by getExclusiveQueueHandleRef() (the path
/// used for FPGA devices without out-of-order support).
vector_class<RT::PiQueue> MQueues;
/// Index of the next element of MQueues to reuse once MaxNumQueues is reached.
size_t MNextQueueIdx = 0;
/// True when the queue targets the host device and has no backend queue.
const bool MHostQueue = false;
// Assume OOO support by default. Cleared by createQueue() when the plugin
// rejects the requested queue properties.
bool MSupportOOO = true;
// Thread pool for host task and event callbacks execution.
// The thread pool is instantiated upon the very first call to getThreadPool()
std::unique_ptr<ThreadPool> MHostTaskThreadPool;
};
} // namespace detail
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)