diff --git a/source/conf.py b/source/conf.py index 5ed241ac82..4c3af6a7b4 100644 --- a/source/conf.py +++ b/source/conf.py @@ -21,6 +21,8 @@ repo_root = '..' exec(open(join(repo_root, 'source', 'conf', 'common_conf.py')).read()) +extensions += ['notfound.extension'] # noqa: F821 + # -- Project information ----------------------------------------------------- project = u'oneAPI Specification' diff --git a/source/conf/common_conf.py b/source/conf/common_conf.py index 9e94a5230b..d1a0043fe4 100644 --- a/source/conf/common_conf.py +++ b/source/conf/common_conf.py @@ -23,14 +23,12 @@ ) extensions = [ - 'notfound.extension', 'sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.mathjax', 'sphinx.ext.ifconfig', - # 'sphinx.ext.imgconverter', 'sphinx.ext.viewcode', 'sphinx.ext.githubpages', 'sphinx.ext.graphviz', @@ -41,7 +39,6 @@ 'sphinx_substitution_extensions', 'sphinxcontrib.plantuml', 'breathe', - # 'vplapi', 'dalapi', # oneDAL API generator ] diff --git a/source/elements/dpcpp/source/conf.py b/source/elements/dpcpp/source/conf.py index 279a384a44..6fd922d8f1 100644 --- a/source/elements/dpcpp/source/conf.py +++ b/source/elements/dpcpp/source/conf.py @@ -18,10 +18,11 @@ # import os import sys -sys.path.insert(0, os.path.abspath(os.path.join('..','..','..','conf'))) -# element_conf needs to import this conf -sys.path.insert(0, os.path.abspath('.')) +from os.path import join + project = 'dpcpp' -from element_conf import * +repo_root = join('..', '..', '..', '..') +exec(open(join(repo_root, 'source', 'conf', 'common_conf.py')).read()) +exec(open(join(repo_root, 'source', 'conf', 'element_conf.py')).read()) diff --git a/source/elements/dpcpp/source/index.rst b/source/elements/dpcpp/source/index.rst index d265ed7140..1e5294715f 100644 --- a/source/elements/dpcpp/source/index.rst +++ b/source/elements/dpcpp/source/index.rst @@ -1,4 +1,4 @@ -.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation +.. SPDX-FileCopyrightText: 2019-2021 Intel Corporation .. .. SPDX-License-Identifier: CC-BY-4.0 @@ -28,8 +28,7 @@ on devices. The language is comprised of the following components: orchestrate the offloaded functions. - DPC++ Language extensions. A compliant DPC++ implementation must - support the specified language features. These include - unified shared memory (USM), ordered queues, and reductions. Some + support the specified language features. Some extensions are required only when the DPC++ implementation supports a specific class of device, as summarized in the `Extensions Table`_. An implementation supports a class of device if @@ -49,119 +48,64 @@ some additional extensions being required, and some DPC++ extensions no longer required if covered by newer C++ or SYCL versions directly. .. table:: DPC++ Extensions Table: Support requirements for DPC++ - implementations above SYCL 1.2.1 + implementations above SYCL 2020 :name: Extensions Table - =========================== ======================== ==================== ==================== ==================== ============= - Feature Where defined CPU GPU FPGA Test [#test]_ - =========================== ======================== ==================== ==================== ==================== ============= - Accessor simplifications `SYCL 2020 provisional`_ Required Required Required NA [#na]_ - bit_cast `SYCL 2020 provisional`_ Required Required Required NA [#na]_ - Deduction guides `SYCL 2020 provisional`_ Required Required Required NA [#na]_ - Device specific queries `SYCL 2020 provisional`_ Not required [#tmp]_ Not required [#tmp]_ Not required [#tmp]_ NA [#na]_ - Extended atomics `SYCL 2020 provisional`_ Required [#ea]_ Required [#ea]_ Not required [#tmp]_ NA [#na]_ - Kernel func type attributes `SYCL 2020 provisional`_ Required Required Required NA [#na]_ - In-order queues `SYCL 2020 provisional`_ Required Required Required NA [#na]_ - Math array `SYCL 2020 provisional`_ Not required [#tmp]_ Not required [#tmp]_ Not required [#tmp]_ NA [#na]_ - Optional lambda name `SYCL 2020 provisional`_ Required Required Required NA [#na]_ - Queue shortcuts `SYCL 2020 provisional`_ Required Required Required NA [#na]_ - Required work-group size `SYCL 2020 provisional`_ Required Required Required NA [#na]_ - Standard layout relaxed `SYCL 2020 provisional`_ Required Required Required NA [#na]_ - Unified Shared Memory `SYCL 2020 provisional`_ Required [#USM]_ Required [#USM]_ Required [#USM]_ `usm `__ - `Accessor properties`_ DPC++ extension Required [#aprop]_ Required [#aprop]_ Required [#aprop]_ NA [#na]_ - `CXX standard library`_ DPC++ extension Required Required Not required [#tmp]_ NA [#na]_ - `Data flow pipes`_ DPC++ extension Not required Not required Required `fpga_tests `__ - `Enqueued barriers`_ DPC++ extension Required Required Required NA [#na]_ - `Group algorithms`_ DPC++ extension Required Required Not required [#tmp]_ NA [#na]_ - `Group mask`_ DPC++ extension Not required [#tmp]_ Not required [#tmp]_ Not required [#tmp]_ NA [#na]_ - `Parallel for shortcuts`_ DPC++ extension Required Required Required NA [#na]_ - `Pinned memory property`_ DPC++ extension Required Required Required NA [#na]_ - `Reductions`_ DPC++ extension Required [#redc]_ Required [#redc]_ Not required [#tmp]_ NA [#na]_ - `Restrict all arguments`_ DPC++ extension Required Required Required NA [#na]_ - `Static local mem query`_ DPC++ extension Not required [#tmp]_ Not required [#tmp]_ Not required [#tmp]_ NA [#na]_ - `Sub-groups`_ DPC++ extension Required Required Not required `sub_group `__ - `Sub-group algorithms`_ DPC++ extension Required [#sga]_ Required [#sga]_ Not required `sub_group `__ - =========================== ======================== ==================== ==================== ==================== ============= + =========================== ==================== ==================== ==================== ============= + Feature CPU GPU FPGA Test [#test]_ + =========================== ==================== ==================== ==================== ============= + `Accessor properties`_ Required Required Required NA [#na]_ + `CXX standard library`_ Required Required Not required [#tmp]_ NA [#na]_ + `Data flow pipes`_ Not required Not required Required `fpga_tests `__ + `Enqueued barriers`_ Required Required Required NA [#na]_ + `Extended atomics`_ Required Required Required NA [#na]_ + `Filter selector`_ Required Required Required NA [#na]_ + `FPGA LSU controls`_ Not required Not required Required NA [#na]_ + `FPGA memory channel`_ Not required Not required Required NA [#na]_ + `FPGA register`_ Not required Not required Required NA [#na]_ + `FPGA selector`_ Required Required Required NA [#na]_ + `GPU device info`_ Required Required Required NA [#na]_ + `Level zero backend`_ Required [#lzero]_ Required [#lzero]_ Required [#lzero]_ NA [#na]_ + `Local memory allocations`_ Required Required Required NA [#na]_ + `Pinned memory property`_ Required Required Required NA [#na]_ + `Platform context`_ Required Required Required NA [#na]_ + `Restrict all arguments`_ Required Required Required NA [#na]_ + `Sub-group mask`_ Required Required Required NA [#na]_ + =========================== ==================== ==================== ==================== ============= .. ========================== ================ ================ ==================== ============= -.. _`Accessor properties`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/accessor_properties -.. _`CXX standard library`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/C-CXX-StandardLibrary -.. _`Data flow pipes`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/DataFlowPipes -.. _`Enqueued barriers`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/EnqueueBarrier -.. _`Group algorithms`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/GroupAlgorithms -.. _`Group mask`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/GroupMask -.. _`Parallel for shortcuts`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/ParallelForSimplification -.. _`Pinned memory property`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/UsePinnedMemoryProperty -.. _`Reductions`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/Reduction -.. _`Restrict all arguments`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/KernelRestrictAll -.. _`Static local mem query`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/StaticLocalMemoryQuery -.. _`Sub-groups`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/SubGroup -.. _`Sub-group algorithms`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/SubGroupAlgorithms +.. _`Accessor properties`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/accessor_properties/SYCL_ONEAPI_accessor_properties.asciidoc +.. _`CXX standard library`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/C-CXX-StandardLibrary/C-CXX-StandardLibrary.rst +.. _`Data flow pipes`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/DataFlowPipes/data_flow_pipes.asciidoc +.. _`Enqueued barriers`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/EnqueueBarrier/enqueue_barrier.asciidoc +.. _`Extended atomics`: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc +.. _`Filter selector`: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/FilterSelector/FilterSelector.adoc +.. _`FPGA LSU controls`: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/IntelFPGA/FPGALsu.md +.. _`FPGA memory channel`: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/MemChannel/MemChannel.asciidoc +.. _`FPGA register`: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/IntelFPGA/FPGAReg.md +.. _`FPGA selector`: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/IntelFPGA/FPGASelector.md +.. _`GPU device info`: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/IntelGPU/IntelGPUDeviceInfo.md +.. _`Level zero backend`: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/LevelZeroBackend/LevelZeroBackend.md +.. _`Local memory allocations`: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/LocalMemory/LocalMemory.asciidoc +.. _`Pinned memory property`: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/UsePinnedMemoryProperty/UsePinnedMemoryPropery.adoc +.. _`Platform context`: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/PlatformContext/PlatformContext.adoc +.. _`Restrict all arguments`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/KernelRestrictAll/SYCL_INTEL_kernel_restrict_all.asciidoc +.. _`Sub-group mask`: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/SubGroupMask/SubGroupMask.asciidoc + .. [#test] Test directory within `extension tests`_ -.. [#USM] Minimum of explicit USM support .. [#na] Not yet available. .. [#tmp] Likely to be required in the future +.. [#lzero] Required if the device backend is Level Zero. -.. [#redc] DPC++ requirement is for one dimensional reductions, single reduction variable support -.. [#sga] DPC++ requirement is for sub-group algorithms that have equivalent group algorithms -.. [#ea] DPC++ requirement does not include support for atomics in the generic address space -.. [#aprop] DPC++ requirement is for the general property mechanism, and not specific properties within it Detailed API and Language Descriptions -------------------------------------- -The `SYCL 1.2.1 Specification`_ describes the SYCL APIs and language. DPC++ extensions on top of SYCL -are described in the `SYCL Extensions`_ repository. Some features defined in the -`SYCL 2020 Provisional Specification`_ but not in the `SYCL 1.2.1 Specification`_ are required in -DPC++, as summarized in `Extensions Table`_, and most replace DPC++ extensions that were required in previous -versions of this specification. - - -A brief summary of the required features from `SYCL 2020 Provisional Specification`_ (above -`SYCL 1.2.1 Specification`_) follows: - -- Accessor simplifications - simplification of the accessor interface, reduction of verbosity in common - code, and removal of need to specify template arguments in common cases. Section 4.7.6 of the - `SYCL 2020 Provisional Specification`_. -- bit_cast - inclusion of C++20 (p0476r2) ``std::bit_cast`` as ``sycl::bit_cast``. Section 3.8.2 of the - `SYCL 2020 Provisional Specification`_. -- Deduction guides - simplifies common code patterns and reduces code length and verbosity by enabling - Class Template Argument Deduction (CTAD) from modern C++. Distributed throughout the - `SYCL 2020 Provisional Specification`_. -- Device specific queries - kernel property queries associated with a specific device. Section 4.12 of the - `SYCL 2020 Provisional Specification`_. -- Extended atomics - alignment with C++20 ``std::atomic_ref``, including some tweaks for memory models in SYCL. - Support for floating-point types and shorthand operators. Section 4.17.3 of the `SYCL 2020 Provisional Specification`_. - Additional atomic-related queries are defined in Table 4.19, and some changes to fences and barriers are reflected - in Section 4.17.1 (both in the `SYCL 2020 Provisional Specification`_). -- Kernel function type attributes - definition of kernel attributes as function type attributes that allows - them to be applied to lambdas. Definition of some core attributes. Section 5.7 of the - `SYCL 2020 Provisional Specification`_. -- In-order queues - defines simple in-order semantics for queues, to simplify common coding patterns. - Section 4.6.5 of the `SYCL 2020 Provisional Specification`_. -- Math array - contiguous fixed-size portable container. Section 4.16.3 of the - `SYCL 2020 Provisional Specification`_. -- Optional lambda name - removes requirement to manually name lambdas that define kernels. - Simplifies coding and enables composability with libraries. Lambdas can still be manually named, if - desired, such as when debugging or interfacing with a ``sycl::program`` object. - Section 4.14.2 of the `SYCL 2020 Provisional Specification`_. -- Queue shortcuts - defines kernel invocation functions directly on the queue classes, to simplify code patterns - where dependencies and/or accessors do not need to be created within the additional command group scope. Reduces - code verbosity in some common patterns. Section 4.6.5 of the `SYCL 2020 Provisional Specification`_. -- Required work-group size - defines an attribute that can be applied to kernels (including lambda definitions of kernels) - which signals that the kernel will only be invoked with a specific work-group size. This is an optimization attribute - that enables optimizations based on additional user-driven information. Section 5.7 of the - `SYCL 2020 Provisional Specification`_. -- Standard layout relaxed - removes the requirement that data shared by a host and device(s) must be C++ standard layout - types. Requires device compilers to validate layout compatibility. Section 4.14.4 of the `SYCL 2020 Provisional Specification`_. -- Unified Shared Memory (USM) - defines pointer based memory accesses and management interfaces. Provides - the ability to create allocations that are visible and have consistent pointer values across both - host and device(s). Different USM capability levels are defined, corresponding to different levels - of device and implementation support. Section 4.8 of the `SYCL 2020 Provisional Specification`_. - +The `SYCL 2020 Specification`_ describes the SYCL APIs and language. DPC++ extensions on top of SYCL +are described in the `SYCL Extensions`_ repository. A brief summary of the extensions is as follows: @@ -171,29 +115,29 @@ A brief summary of the extensions is as follows: used when describing algorithms for spatial architectures such as FPGAs. - Enqueued barriers - simplifies dependence creation and tracking for some common programming patterns by allowing coarser grained synchronization within a queue without manual creation of fine grained dependencies. -- Group algorithms - defines collective operations that operate across groups of work-items, including broadcast, - reduce, and scan. Improves productivity by providing common algorithms without explicit coding, and enables optimized - implementations to exist for combinations of device and runtime. -- Group mask - defines a type that can represent a set of work-items from a group, and collective operations that create - or operate on that type such as ballot and count. -- Parallel for shortcuts - simplification of common patterns such as invoking a kernel with a scalar range. +- Extended atomics - adds *atomic_accessor* on top of SYCL 2020 atomics. +- Filter selector - adds a device selector which consumes a string of filter definitions, and that can be used to + easily restrict the set of devices which are passed to the usual device selection mechanisms. +- FPGA LSU controls - tuning controls for FPGA load/store operations. +- FPGA memory channel - placement controls for data with external memory banks (e.g. DDR channel) for tuning + FPGA designs. +- FPGA register - tuning control for FPGA high performance pipelining. +- FPGA selector - adds a set of device selectors that make it easy to acquire an FPGA hardware or emulation device. +- GPU device info - adds GPU-specific queries around SIMD width, memory bandwidth, unique identifiers, and + topology of the compute structures. +- Level zero backend - defines interoperability with Level Zero as a backend to SYCL. +- Local memory allocations - adds ability for local memory allocations to be declared within a kernel, as opposed + to through an accessor that is passed to a kernel. Makes kernels more self contained and easier to read and optimize. - Pinned memory property - optimization indicating that a buffer should use a specific memory resource if possible, to accelerate movement of data between host and devices in some implementations. -- Reductions - provides a reduction abstraction to the ND-range form of *parallel_for*. Improves productivity - by providing the common reduction pattern without explicit coding, and enables optimized - implementations to exist for combinations of device, runtime, and reduction properties. +- Platform context - adds a default context per SYCL platform, which simplifies and improves performance in common + coding patterns. - Restrict all arguments - defines an attribute that can be applied to kernels (including lambda definitions of kernels) which signals that there will be no memory aliasing between any pointer arguments that are passed to or captured by a kernel. This is an optimization attribute that can have large impact when the developer knows more about the kernel arguments than a compiler can infer or safely assume. -- Static local memory query - query for the amount of local memory used by a compiler and unavailable for dynamic use. -- Subgroups - defines a grouping of work-items within a work-group. Synchronization - of work-items in a sub-group can occur independently of work-items in other sub-groups, and - sub-groups expose communication operations across work-items in the group. Subgroups commonly - map to SIMD hardware where it exists. -- Subgroup algorithms - defines collective operations across work-items in a sub-group that are available - only for sub-groups. Also enables algorithms from the more generic "group algorithms" extension as sub-group - collective operations. +- Sub-group mask - adds a new opaque type and operations on it, which can be used to represent and manage sets of + work-items within a sub-group. Open Source Implementation -------------------------- @@ -206,11 +150,10 @@ until the release notes are available). Testing ------- -A DPC++ implementation must pass: - - 1. The `extension tests`_ for any extension implemented from the `Extensions Table`_. - Each extension in the `Extensions Table`_ lists the name of the directory that contains - corresponding tests, within the `extension tests`_ tree. +A DPC++ implementation must pass the `extension tests`_ for any +extension implemented from the `Extensions Table`_. Each extension in +the `Extensions Table`_ lists the name of the directory that contains +corresponding tests, within the `extension tests`_ tree. Acknowledgment --------------- @@ -220,10 +163,7 @@ and the Khronos SYCL working group for their efforts defining and evolving the S .. _`C++ Standard`: https://isocpp.org/std/the-standard -.. _`SYCL 1.2.1 Specification`: https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf -.. _`SYCL 2020 Provisional Specification`: https://www.khronos.org/registry/SYCL/specs/sycl-2020-provisional.pdf -.. _`SYCL 2020 Provisional`: https://www.khronos.org/registry/SYCL/specs/sycl-2020-provisional.pdf -.. _`SYCL Adopters`: https://www.khronos.org/sycl/adopters/ +.. _`SYCL 2020 Specification`: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html .. _`SYCL Extensions`: https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions .. _`open source implementation`: https://github.com/intel/llvm/tree/sycl/ .. _`conformance test suite`: https://github.com/KhronosGroup/SYCL-CTS