Skip to content

Commit 93f2621

Browse files
Merge pull request #185 from nikitaxgusev/17_release
Intel(R) oneAPI Collective Communications Library (oneCCL) 2021.17
2 parents 4f1449d + 993878a commit 93f2621

174 files changed

Lines changed: 7540 additions & 2790 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CMakeLists.txt

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ option(ENABLE_UMF "Enable UMF support" TRUE)
7575
option(ENABLE_STUB_BACKEND "Enable stub backend" TRUE)
7676
option(ENABLE_LINKER_RUNPATH "Enable linker runpath flags" FALSE)
7777
option(ENABLE_OMP "Enable openMP extension for intra-node collectives" TRUE)
78+
option(ENABLE_SYCL_PER_KERNEL_COMPILE "Enable SYCL device code module assembly per kernel" TRUE)
7879

7980
option(USE_CODECOV_FLAGS "Calculate code coverage" FALSE)
8081
option(WITH_ASAN "Use address sanitizer, can only be used in Debug build" FALSE)
@@ -107,6 +108,7 @@ message(STATUS "Enable DRM support: ${ENABLE_DRM}")
107108
message(STATUS "Enable stub backend: ${ENABLE_STUB_BACKEND}")
108109
message(STATUS "Enable linker rpath flags: ${ENABLE_LINKER_RUNPATH}")
109110
message(STATUS "Enable openMP extension for intra-node collectives: ${ENABLE_OMP}")
111+
message(STATUS "Enable SYCL device code module assembly per kernel: ${ENABLE_SYCL_PER_KERNEL_COMPILE}")
110112

111113
add_definitions(-DCCL_C_COMPILER="${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
112114
add_definitions(-DCCL_CXX_COMPILER="${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
@@ -227,6 +229,12 @@ if (COMPUTE_BACKEND)
227229
endif()
228230
if (${COMPUTE_BACKEND} STREQUAL "dpcpp" AND ${CMAKE_CXX_COMPILER} MATCHES ".*icpx")
229231
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl")
232+
if (ENABLE_SYCL_PER_KERNEL_COMPILE AND "${CMAKE_BUILD_TYPE_CASE_INSENSITIVE}" STREQUAL "release")
233+
message(STATUS "Enabling SYCL device code module assembly per kernel")
234+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-device-code-split=per_kernel")
235+
else()
236+
message(STATUS "Skipping per-kernel SYCL split for ${CMAKE_BUILD_TYPE}")
237+
endif()
230238
endif()
231239
endif()
232240

@@ -342,8 +350,8 @@ file(GLOB spv_kernels "${PROJECT_SOURCE_DIR}/src/kernels/kernels.spv")
342350
endif()
343351

344352
set(CCL_MAJOR_VERSION "2021")
345-
set(CCL_MINOR_VERSION "16")
346-
set(CCL_UPDATE_VERSION "2")
353+
set(CCL_MINOR_VERSION "17")
354+
set(CCL_UPDATE_VERSION "0")
347355
set(CCL_PRODUCT_STATUS "Gold")
348356
string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ")
349357
get_vcs_properties("git")

cmake/ccl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,4 @@ setenv CCL_ROOT "$topdir"
6969

7070
prepend-path LD_LIBRARY_PATH "$topdir/lib"
7171
prepend-path LIBRARY_PATH "$topdir/lib"
72-
prepend-path CPATH "$topdir/include"
72+
prepend-path CPLUS_INCLUDE_PATH "$topdir/include"

cmake/setvars.sh.in

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,8 @@ fi
108108
LD_LIBRARY_PATH=$(prepend_path "${I_MPI_ROOT}/libfabric/lib" "${LD_LIBRARY_PATH:-}") ; export LD_LIBRARY_PATH
109109
FI_PROVIDER_PATH="${I_MPI_ROOT}/libfabric/lib/prov:/usr/lib64/libfabric"; export FI_PROVIDER_PATH
110110

111-
CPATH=$(prepend_path "${I_MPI_ROOT}/include" "${CPATH:-}"); export CPATH
111+
C_INCLUDE_PATH=$(prepend_path "${I_MPI_ROOT}/include" "${C_INCLUDE_PATH:-}"); export C_INCLUDE_PATH
112+
CPLUS_INCLUDE_PATH=$(prepend_path "${I_MPI_ROOT}/include" "${CPLUS_INCLUDE_PATH:-}"); export CPLUS_INCLUDE_PATH
112113
LD_LIBRARY_PATH=$(prepend_path "${I_MPI_ROOT}/lib" "${LD_LIBRARY_PATH:-}") ; export LD_LIBRARY_PATH
113114
LIBRARY_PATH=$(prepend_path "${I_MPI_ROOT}/lib" "${LIBRARY_PATH:-}"); export LIBRARY_PATH
114115
PATH="${I_MPI_ROOT}/bin:${PATH}"; export PATH

cmake/vars.sh.in

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ WORK_DIR=$(get_script_path "${vars_script_name:-}")
110110

111111
CCL_ROOT="$(cd "${WORK_DIR}"/../; pwd -P)"; export CCL_ROOT
112112

113-
CPATH=$(prepend_path "${CCL_ROOT}/include" "${CPATH:-}"); export CPATH
113+
CPLUS_INCLUDE_PATH=$(prepend_path "${CCL_ROOT}/include" "${CPLUS_INCLUDE_PATH:-}"); export CPLUS_INCLUDE_PATH
114114
CMAKE_PREFIX_PATH=$(prepend_path "${CCL_ROOT}/lib/cmake/oneCCL" "${CMAKE_PREFIX_PATH:-}"); export CMAKE_PREFIX_PATH
115115
LIBRARY_PATH=$(prepend_path "${CCL_ROOT}/lib" "${LIBRARY_PATH:-}"); export LIBRARY_PATH
116116
LD_LIBRARY_PATH=$(prepend_path "${CCL_ROOT}/lib" "${LD_LIBRARY_PATH:-}"); export LD_LIBRARY_PATH
@@ -136,7 +136,8 @@ if [ -z "${SETVARS_CALL:-}" ] ; then
136136

137137
if [ "$ccl_bundled_mpi" = "yes" ] ; then
138138
export I_MPI_ROOT="${CCL_ROOT}/opt/mpi"
139-
CPATH=$(prepend_path "${I_MPI_ROOT}/include" "${CPATH:-}"); export CPATH
139+
C_INCLUDE_PATH=$(prepend_path "${I_MPI_ROOT}/include" "${C_INCLUDE_PATH:-}"); export C_INCLUDE_PATH
140+
CPLUS_INCLUDE_PATH=$(prepend_path "${I_MPI_ROOT}/include" "${CPLUS_INCLUDE_PATH:-}"); export CPLUS_INCLUDE_PATH
140141
LD_LIBRARY_PATH=$(prepend_path "${I_MPI_ROOT}/lib" "${LD_LIBRARY_PATH:-}") ; export LD_LIBRARY_PATH
141142
LIBRARY_PATH=$(prepend_path "${I_MPI_ROOT}/lib" "${LIBRARY_PATH:-}"); export LIBRARY_PATH
142143
PATH="${I_MPI_ROOT}/bin:${PATH}"; export PATH

deps/hwloc/include/hwloc.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright © 2009 CNRS
3-
* Copyright © 2009-2024 Inria. All rights reserved.
3+
* Copyright © 2009-2025 Inria. All rights reserved.
44
* Copyright © 2009-2012 Université Bordeaux
55
* Copyright © 2009-2020 Cisco Systems, Inc. All rights reserved.
66
* See COPYING in top-level directory.
@@ -112,7 +112,7 @@ extern "C" {
112112
* Two stable releases of the same series usually have the same ::HWLOC_API_VERSION
113113
* even if their HWLOC_VERSION are different.
114114
*/
115-
#define HWLOC_API_VERSION 0x00020b00
115+
#define HWLOC_API_VERSION 0x00020c00
116116

117117
/** \brief Indicate at runtime which hwloc API version was used at build time.
118118
*
@@ -346,9 +346,10 @@ typedef enum {
346346
*
347347
* Some operating systems (e.g. Linux) may expose a single die per package
348348
* even if the hardware does not support dies at all. To avoid showing
349-
* such non-existing dies, the corresponding hwloc backend may filter them out.
349+
* such non-existing dies, hwloc will filter them out if all of them are
350+
* identical to packages.
350351
* This is functionally equivalent to ::HWLOC_TYPE_FILTER_KEEP_STRUCTURE
351-
* being enforced.
352+
* being enforced for Dies versus Packages.
352353
*/
353354

354355
HWLOC_OBJ_TYPE_MAX /**< \private Sentinel value */
@@ -1047,7 +1048,7 @@ HWLOC_DECLSPEC const char * hwloc_obj_type_string (hwloc_obj_type_t type) __hwlo
10471048
* If \p size is 0, \p string may safely be \c NULL.
10481049
*
10491050
* \return the number of characters that were actually written if not truncating,
1050-
* or that would have been written (not including the ending \\0).
1051+
* or that would have been written (not including the ending \c \0).
10511052
*/
10521053
HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size,
10531054
hwloc_obj_t obj,
@@ -1062,7 +1063,7 @@ HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_
10621063
* If \p size is 0, \p string may safely be \c NULL.
10631064
*
10641065
* \return the number of characters that were actually written if not truncating,
1065-
* or that would have been written (not including the ending \\0).
1066+
* or that would have been written (not including the ending \c \0).
10661067
*/
10671068
HWLOC_DECLSPEC int hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size,
10681069
hwloc_obj_t obj, const char * __hwloc_restrict separator,
@@ -2002,7 +2003,7 @@ HWLOC_DECLSPEC int hwloc_topology_set_xml(hwloc_topology_t __hwloc_restrict topo
20022003
* a file, as with hwloc_topology_set_xml()).
20032004
*
20042005
* Gather topology information from the XML memory buffer given at
2005-
* \p buffer and of length \p size (including an ending \0).
2006+
* \p buffer and of length \p size (including an ending \c \0).
20062007
* This buffer may have been filled earlier with
20072008
* hwloc_topology_export_xmlbuffer() in hwloc/export.h.
20082009
*

deps/hwloc/include/hwloc/autogen/config.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@
1212
#ifndef HWLOC_CONFIG_H
1313
#define HWLOC_CONFIG_H
1414

15-
#define HWLOC_VERSION "2.11.2rc2-git"
15+
#define HWLOC_VERSION "2.12.0rc2-git"
1616
#define HWLOC_VERSION_MAJOR 2
17-
#define HWLOC_VERSION_MINOR 11
18-
#define HWLOC_VERSION_RELEASE 2
17+
#define HWLOC_VERSION_MINOR 12
18+
#define HWLOC_VERSION_RELEASE 0
1919
#define HWLOC_VERSION_GREEK "rc2"
2020

2121
/* #undef HWLOC_PCI_COMPONENT_BUILTIN */

deps/hwloc/include/hwloc/bitmap.h

Lines changed: 67 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright © 2009 CNRS
3-
* Copyright © 2009-2023 Inria. All rights reserved.
3+
* Copyright © 2009-2024 Inria. All rights reserved.
44
* Copyright © 2009-2012 Université Bordeaux
55
* Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved.
66
* See COPYING in top-level directory.
@@ -113,77 +113,132 @@ HWLOC_DECLSPEC int hwloc_bitmap_copy(hwloc_bitmap_t dst, hwloc_const_bitmap_t sr
113113
* Bitmap/String Conversion
114114
*/
115115

116-
/** \brief Stringify a bitmap.
116+
/** \brief Stringify a bitmap in the default hwloc format.
117+
*
118+
* <b>Note that if the bitmap is a CPU or nodeset, it contains physical indexes.</b>
119+
*
120+
* Print the bits set inside a bitmap as a comma-separated list of hexadecimal 32-bit blocks.
121+
* A bitmap containing bits 1, 33, 34, and all from 64 to 95 is printed as <tt>"0xffffffff,0x00000006,0x00000002"</tt>.
117122
*
118123
* Up to \p buflen characters may be written in buffer \p buf.
119124
*
120125
* If \p buflen is 0, \p buf may safely be \c NULL.
121126
*
122127
* \return the number of characters that were actually written if not truncating,
123-
* or that would have been written (not including the ending \\0).
128+
* or that would have been written (not including the ending \c \0).
129+
* \return -1 on error.
124130
*/
125131
HWLOC_DECLSPEC int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
126132

127-
/** \brief Stringify a bitmap into a newly allocated string.
133+
/** \brief Stringify a bitmap into a newly allocated string in the default hwloc format.
128134
*
129-
* \return 0 on success, -1 on error.
135+
* <b>Note that if the bitmap is a CPU or nodeset, it contains physical indexes.</b>
136+
*
137+
* Print the bits set inside a bitmap as a comma-separated list of hexadecimal 32-bit blocks.
138+
* A bitmap containing bits 1, 33, 34, and all from 64 to 95 is printed as <tt>"0xffffffff,0x00000006,0x00000002"</tt>.
139+
*
140+
* \return the number of characters that were written (not including the ending \c \0).
141+
* \return -1 on error, for instance with \p errno set to \c ENOMEM on failure to allocate the output string.
130142
*/
131143
HWLOC_DECLSPEC int hwloc_bitmap_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
132144

133-
/** \brief Parse a bitmap string and stores it in bitmap \p bitmap.
145+
/** \brief Parse a bitmap string in the default hwloc format and store it in bitmap \p bitmap.
146+
*
147+
* <b>Note that if the bitmap is a CPU or nodeset, the input string must contain physical indexes.</b>
148+
*
149+
* The input string should be a comma-separated list of hexadecimal 32-bit blocks.
150+
* String <tt>"0xffffffff,0x6,0x2"</tt> is parsed as a bitmap containing all bits between 64 and 95,
151+
* and bits 33, 34 and 1.
134152
*
135153
* \return 0 on success, -1 on error.
136154
*/
137155
HWLOC_DECLSPEC int hwloc_bitmap_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
138156

139157
/** \brief Stringify a bitmap in the list format.
158+
*
159+
* <b>Note that if the bitmap is a CPU or nodeset, it contains physical indexes.</b>
140160
*
141161
* Lists are comma-separated indexes or ranges.
142162
* Ranges are dash separated indexes.
143-
* The last range may not have an ending indexes if the bitmap is infinitely set.
163+
* A bitmap containing bits 1, 33, 34, and all from 64 to 95 is printed as <tt>"1,33-34,64-95"</tt>.
164+
* The last range may not have an ending index if the bitmap is infinitely set.
144165
*
145166
* Up to \p buflen characters may be written in buffer \p buf.
146167
*
147168
* If \p buflen is 0, \p buf may safely be \c NULL.
148169
*
149170
* \return the number of characters that were actually written if not truncating,
150-
* or that would have been written (not including the ending \\0).
171+
* or that would have been written (not including the ending \c \0).
172+
* \return -1 on error.
151173
*/
152174
HWLOC_DECLSPEC int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
153175

154176
/** \brief Stringify a bitmap into a newly allocated list string.
155177
*
156-
* \return 0 on success, -1 on error.
178+
* <b>Note that if the bitmap is a CPU or nodeset, it contains physical indexes.</b>
179+
*
180+
* Lists are comma-separated indexes or ranges.
181+
* Ranges are dash separated indexes.
182+
* A bitmap containing bits 1, 33, 34, and all from 64 to 95 is printed as <tt>"1,33-34,64-95"</tt>.
183+
* The last range may not have an ending index if the bitmap is infinitely set.
184+
*
185+
* \return the number of characters that were written (not including the ending \c \0).
186+
* \return -1 on error, for instance with \p errno set to \c ENOMEM on failure to allocate the output string.
157187
*/
158188
HWLOC_DECLSPEC int hwloc_bitmap_list_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
159189

160190
/** \brief Parse a list string and store it in bitmap \p bitmap.
191+
*
192+
* <b>Note that if the bitmap is a CPU or nodeset, the input string must contain physical indexes.</b>
193+
*
194+
* Lists are comma-separated indexes or ranges.
195+
* Ranges are dash separated indexes.
196+
* String <tt>"1,33-34,64-95"</tt> is parsed as a bitmap containing bits 1, 33, 34, and all from 64 to 95.
197+
* The last range may not have an ending index if the bitmap is infinitely set.
161198
*
162199
* \return 0 on success, -1 on error.
163200
*/
164201
HWLOC_DECLSPEC int hwloc_bitmap_list_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
165202

166203
/** \brief Stringify a bitmap in the taskset-specific format.
167204
*
168-
* The taskset command manipulates bitmap strings that contain a single
205+
* <b>Note that if the bitmap is a CPU or nodeset, it contains physical indexes.</b>
206+
*
207+
* The taskset program manipulates bitmap strings that contain a single
169208
* (possibly very long) hexadecimal number starting with 0x.
209+
* A bitmap containing bits 1, 33, 34, and all from 64 to 95 is printed as <tt>"0xffffffff0000000600000002"</tt>.
170210
*
171211
* Up to \p buflen characters may be written in buffer \p buf.
172212
*
173213
* If \p buflen is 0, \p buf may safely be \c NULL.
174214
*
175215
* \return the number of characters that were actually written if not truncating,
176-
* or that would have been written (not including the ending \\0).
216+
* or that would have been written (not including the ending \c \0).
217+
* \return -1 on error.
177218
*/
178219
HWLOC_DECLSPEC int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
179220

180221
/** \brief Stringify a bitmap into a newly allocated taskset-specific string.
181222
*
182-
* \return 0 on success, -1 on error.
223+
* <b>Note that if the bitmap is a CPU or nodeset, it contains physical indexes.</b>
224+
*
225+
* The taskset program manipulates bitmap strings that contain a single
226+
* (possibly very long) hexadecimal number starting with 0x.
227+
* A bitmap containing bits 1, 33, 34, and all from 64 to 95 is printed as <tt>"0xffffffff0000000600000002"</tt>.
228+
*
229+
* \return the number of characters that were written (not including the ending \c \0).
230+
* \return -1 on error, for instance with \p errno set to \c ENOMEM on failure to allocate the output string.
183231
*/
184232
HWLOC_DECLSPEC int hwloc_bitmap_taskset_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
185233

186234
/** \brief Parse a taskset-specific bitmap string and store it in bitmap \p bitmap.
235+
*
236+
* <b>Note that if the bitmap is a CPU or nodeset, the input string must contain physical indexes.</b>
237+
*
238+
* The taskset program manipulates bitmap strings that contain a single
239+
* (possibly very long) hexadecimal number starting with 0x.
240+
* String <tt>"0xffffffff0000000600000002"</tt> is parsed as a bitmap containing all bits between 64 and 95,
241+
* and bits 33, 34 and 1.
187242
*
188243
* \return 0 on success, -1 on error.
189244
*/

deps/hwloc/include/hwloc/diff.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright © 2013-2023 Inria. All rights reserved.
2+
* Copyright © 2013-2024 Inria. All rights reserved.
33
* See COPYING in top-level directory.
44
*/
55

@@ -258,7 +258,7 @@ HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_diff_t diff, co
258258
/** \brief Load a list of topology differences from a XML buffer.
259259
*
260260
* Build a list of differences from the XML memory buffer given
261-
* at \p xmlbuffer and of length \p buflen (including an ending \0).
261+
* at \p xmlbuffer and of length \p buflen (including an ending \c \0).
262262
* This buffer may have been filled earlier with
263263
* hwloc_topology_diff_export_xmlbuffer().
264264
*
@@ -284,7 +284,7 @@ HWLOC_DECLSPEC int hwloc_topology_diff_load_xmlbuffer(const char *xmlbuffer, int
284284
* that contains the reference topology.
285285
* This attribute is given back when reading the diff from XML.
286286
*
287-
* The returned buffer ends with a \0 that is included in the returned
287+
* The returned buffer ends with a \c \0 that is included in the returned
288288
* length.
289289
*
290290
* \return 0 on success, -1 on error.

deps/hwloc/include/hwloc/distances.h

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright © 2010-2024 Inria. All rights reserved.
2+
* Copyright © 2010-2025 Inria. All rights reserved.
33
* See COPYING in top-level directory.
44
*/
55

@@ -227,17 +227,24 @@ enum hwloc_distances_transform_e {
227227
HWLOC_DISTANCES_TRANSFORM_LINKS = 1,
228228

229229
/** \brief Merge switches with multiple ports into a single object.
230-
* This currently only applies to NVSwitches where GPUs seem connected to different
231-
* separate switch ports in the NVLinkBandwidth matrix. This transformation will
232-
* replace all of them with the same port connected to all GPUs.
233-
* Other ports are removed by applying ::HWLOC_DISTANCES_TRANSFORM_REMOVE_NULL internally.
230+
*
231+
* This currently only applies to NVSwitches where GPUs seem connected
232+
* to different switch ports. Switch ports must be objects with subtype
233+
* "NVSwitch" as in the NVLinkBandwidth matrix.
234+
*
235+
* This transformation will replace all ports with only the first one,
236+
* now connected to all GPUs. Other ports are removed by applying
237+
* ::HWLOC_DISTANCES_TRANSFORM_REMOVE_NULL internally.
234238
* \hideinitializer
235239
*/
236240
HWLOC_DISTANCES_TRANSFORM_MERGE_SWITCH_PORTS = 2,
237241

238242
/** \brief Apply a transitive closure to the matrix to connect objects across switches.
239-
* This currently only applies to GPUs and NVSwitches in the NVLinkBandwidth matrix.
240-
* All pairs of GPUs will be reported as directly connected.
243+
*
244+
* All pairs of GPUs will be reported as directly connected instead of GPUs being
245+
* only connected to switches.
246+
*
247+
* Switch ports must be objects with subtype "NVSwitch" as in the NVLinkBandwidth matrix.
241248
* \hideinitializer
242249
*/
243250
HWLOC_DISTANCES_TRANSFORM_TRANSITIVE_CLOSURE = 3

0 commit comments

Comments
 (0)