From 5af340b42a5fcb3ee81330360a203bfdd4cfc18c Mon Sep 17 00:00:00 2001 From: Przemek Malon Date: Wed, 11 Jun 2025 09:47:52 +0100 Subject: [PATCH 1/7] [SYCL][UR][Bindless][E2E][Doc] Fix copy docs and implementation. This patch fixes the implementation of bindless image copies. Previously, source and destination pitch values were not being set correctly. This patch also updates the wording around the requirements for `ext_oneapi_copy`. A missing requrement was added. Namely that the `CopyExtent` parameter in the `ext_oneapi_copy` functions that take it, must not have `0` values in any of the three dimensions. The requirements for `ext_oneapi_copy` have also been re-written to prescribe what the functions expect, instead of providing a list of cases in which the function may fail. This should hopefully make it clearer and more prescriptive, rather than saying the copy function may fail if some condition is not met, we now say that the function requires that certain conditions be met. --- .../sycl_ext_oneapi_bindless_images.asciidoc | 74 +++++++++++-------- sycl/source/handler.cpp | 74 +++++++++++++------ .../source/adapters/cuda/image.cpp | 12 +-- 3 files changed, 99 insertions(+), 61 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc index dfe66f1fde7f8..59e04e9dca506 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc @@ -1335,51 +1335,63 @@ For the forms that take a USM pointer, the image memory must also have been allocated within the same context and device of the `queue`. The USM memory must be accessible on the queue's device. -The `ext_oneapi_copy` function variants that don't take offsets and extents may -fail in the following scenarios: +The `ext_oneapi_copy` function variants that do not take offsets and extents +must ensure that the following conditions are met.: -1. The `Src` and `Dest` memory was not allocated on the same device and -context of the queue. +1. The `Src` and `Dest` memory was allocated on the same device and context. -2. The `Src` and `Dest` memory regions, where `Src` or `Dest` can be either -on the host or device, do not have the same memory capacity, where the capacity -is calculate from the `width`, `height`, `depth`, `channel_order`, and +2. The `Src` and `Dest` memory regions, where `Src` or `Dest` can be either +on the host or device, have the same memory capacity, where the capacity +is calculated from the `width`, `height`, `depth`, `channel_order`, and `channel_type` members of the `image_descriptor` parameter. -The `ext_oneapi_copy` function variants that do take offsets and extents may -fail in the following scenarios: +The `ext_oneapi_copy` function variants that do take offsets and extents must +ensure that the following conditions are met. If a condition names a specific +parameter, it is only applicable to the function variants that take that +parameter. -1. The `Src` and `Dest` memory was not allocated on the same device and -context of the queue. +1. The `Src` and `Dest` memory was allocated on the same device and context. -2. The image descriptor passed does not match the image descriptor used to -allocate the image on the device. +2. The image descriptors passed match the image descriptors used to allocate +the image's memory on the device. -3. the `CopyExtent` describes a memory region larger than that which was -allocated on either the host or the device. +3. The `CopyExtent` describes a memory region that is not larger than that which +was allocated on either the host or the device. -4. The `HostExtent` describes a memory region larger than that which was -allocated on the host. +4. The `HostExtent` describes a memory region that is not larger than that which +was allocated on the host. -5. The `SrcExtent` describes a memory region larger than that which was -allocated, where `Src` can be either the host or device. +5. The `SrcExtent` describes a memory region that is not larger than that which +was allocated, where `Src` can be either on the host or on the device. -6. The `DestExtent` describes a memory region larger than that which was -allocated, where `Dest` can be either the host or device. +6. The `DestExtent` describes a memory region that is not larger than that which +was allocated, where `Dest` can be either on the host or on the device. -7. If `SrcOffset + CopyExtent` moves the memory sub-region outside the bounds -of the memory described by `Src`, irrespective of whether `Src` is on the host -or the device. +7. The `DeviceRowPitch` adheres to the alignment requirements outlined in the +"Pitch alignment restrictions and queries" section. -8. If `DestOffset + CopyExtent` moves the memory sub-region outside the bounds -of the memory described by `Dest`, irrespective of whether `Dest` is on the -host or the device. +8. The `DeviceRowPitch` is greater than or equal to the width of the image on +the device. -9. The `DeviceRowPitch` does not adhere to the alignment requirements -outlined in section "Pitch alignment restrictions and queries" +9. For the relevant dimensions, `SrcOffset + CopyExtent` does not move the +memory sub-region outside the bounds of the memory described by `Src`, +irrespective of whether `Src` is on the host or the device. The relevant +dimensions are `x` for 1D images; `x` and `y` for 2D images; and `x`, `y`, and +`z` for 3D images. `x`, `y`, and `z` correspond to indices `0`, `1`, and `2` of +the `SrcOffset` and `CopyExtent` parameters, respectively. + +10. For the relevant dimensions, `DestOffset + CopyExtent` does not move the +memory sub-region outside the bounds of the memory described by `Dest`, +irrespective of whether `Dest` is on the host or the device. The relevant +dimensions are `x` for 1D images, `x` and `y` for 2D images, and `x`, `y`, and +`z` for 3D images. `x`, `y`, and `z` correspond to indices `0`, `1`, and `2` of +the `SrcOffset` and `CopyExtent` parameters, respectively. + +11. The `CopyExtent`'s' `x`, `y`, and `z` dimensions must not be `0`. They must +be greater than or equal to `1`. Even if the image is 1D or 2D, the remaining +non-relevant dimension's values must be set to `1` in the `CopyExtent` +parameter. -10. The value of `DeviceRowPitch` is smaller than the width of the image on -the device. If copying of an image fails, `ext_oneapi_copy` will throw a `sycl::exception` with error code `sycl::errc::invalid`, and relay an error message back to the diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index b1cfeaf50a581..69c8f768f7406 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -96,6 +96,33 @@ void *getValueFromDynamicParameter( // Bindless image helpers +constexpr size_t get_channel_size( + const sycl::ext::oneapi::experimental::image_descriptor &Desc) { + switch (Desc.channel_type) { + case sycl::image_channel_type::fp16: + return sizeof(sycl::half); + case sycl::image_channel_type::fp32: + return sizeof(float); + case sycl::image_channel_type::snorm_int8: + case sycl::image_channel_type::unorm_int8: + case sycl::image_channel_type::signed_int8: + case sycl::image_channel_type::unsigned_int8: + return sizeof(uint8_t); + case sycl::image_channel_type::snorm_int16: + case sycl::image_channel_type::unorm_int16: + case sycl::image_channel_type::signed_int16: + case sycl::image_channel_type::unsigned_int16: + return sizeof(uint16_t); + case sycl::image_channel_type::signed_int32: + case sycl::image_channel_type::unsigned_int32: + return sizeof(uint32_t); + default: + throw sycl::exception(make_error_code(errc::invalid), + "Unsupported channel type"); + return 0; + } +} + // Fill image type and return depth or array_size static unsigned int fill_image_type(const ext::oneapi::experimental::image_descriptor &Desc, @@ -255,16 +282,8 @@ fill_copy_args(detail::handler_impl *impl, impl->MDstImageDesc.depth = DestExtent[2]; } - if (impl->MImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE) { - impl->MSrcImageDesc.rowPitch = 0; - impl->MDstImageDesc.rowPitch = DestPitch; - } else if (impl->MImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) { - impl->MSrcImageDesc.rowPitch = SrcPitch; - impl->MDstImageDesc.rowPitch = 0; - } else { - impl->MSrcImageDesc.rowPitch = SrcPitch; - impl->MDstImageDesc.rowPitch = DestPitch; - } + impl->MSrcImageDesc.rowPitch = SrcPitch; + impl->MDstImageDesc.rowPitch = DestPitch; } static void @@ -277,9 +296,11 @@ fill_copy_args(detail::handler_impl *impl, sycl::range<3> DestExtent = {0, 0, 0}, sycl::range<3> CopyExtent = {0, 0, 0}) { - fill_copy_args(impl, Desc, Desc, ImageCopyFlags, 0 /*SrcPitch*/, - 0 /*DestPitch*/, SrcOffset, SrcExtent, DestOffset, DestExtent, - CopyExtent); + size_t SrcPitch = SrcExtent[0] * Desc.num_channels * get_channel_size(Desc); + size_t DestPitch = DestExtent[0] * Desc.num_channels * get_channel_size(Desc); + + fill_copy_args(impl, Desc, Desc, ImageCopyFlags, SrcPitch, DestPitch, + SrcOffset, SrcExtent, DestOffset, DestExtent, CopyExtent); } static void @@ -307,8 +328,13 @@ fill_copy_args(detail::handler_impl *impl, sycl::range<3> DestExtent = {0, 0, 0}, sycl::range<3> CopyExtent = {0, 0, 0}) { - fill_copy_args(impl, SrcImgDesc, DestImgDesc, ImageCopyFlags, 0 /*SrcPitch*/, - 0 /*DestPitch*/, SrcOffset, SrcExtent, DestOffset, DestExtent, + size_t SrcPitch = + SrcExtent[0] * SrcImgDesc.num_channels * get_channel_size(SrcImgDesc); + size_t DestPitch = + DestExtent[0] * DestImgDesc.num_channels * get_channel_size(DestImgDesc); + + fill_copy_args(impl, SrcImgDesc, DestImgDesc, ImageCopyFlags, SrcPitch, + DestPitch, SrcOffset, SrcExtent, DestOffset, DestExtent, CopyExtent); } @@ -1595,10 +1621,10 @@ void handler::ext_oneapi_copy( get_pointer_type(Dest, createSyclObjFromImpl(impl->get_context()))); - if (ImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE || - ImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) { - detail::fill_copy_args(get_impl(), Desc, ImageCopyFlags, DeviceRowPitch, - DeviceRowPitch); + if (ImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE) { + detail::fill_copy_args(get_impl(), Desc, ImageCopyFlags, 0, DeviceRowPitch); + } else if (ImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) { + detail::fill_copy_args(get_impl(), Desc, ImageCopyFlags, DeviceRowPitch, 0); } else { throw sycl::exception(make_error_code(errc::invalid), "Copy Error: This copy function only performs host " @@ -1629,13 +1655,13 @@ void handler::ext_oneapi_copy( // Fill the host extent based on the type of copy. if (ImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE) { - detail::fill_copy_args(get_impl(), DeviceImgDesc, ImageCopyFlags, - DeviceRowPitch, DeviceRowPitch, SrcOffset, - HostExtent, DestOffset, {0, 0, 0}, CopyExtent); + detail::fill_copy_args(get_impl(), DeviceImgDesc, ImageCopyFlags, 0, + DeviceRowPitch, SrcOffset, HostExtent, DestOffset, + {0, 0, 0}, CopyExtent); } else if (ImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) { detail::fill_copy_args(get_impl(), DeviceImgDesc, ImageCopyFlags, - DeviceRowPitch, DeviceRowPitch, SrcOffset, {0, 0, 0}, - DestOffset, HostExtent, CopyExtent); + DeviceRowPitch, 0, SrcOffset, {0, 0, 0}, DestOffset, + HostExtent, CopyExtent); } else { throw sycl::exception(make_error_code(errc::invalid), "Copy Error: This copy function only performs host " diff --git a/unified-runtime/source/adapters/cuda/image.cpp b/unified-runtime/source/adapters/cuda/image.cpp index b9977eaf87035..ba751f9d8d70b 100644 --- a/unified-runtime/source/adapters/cuda/image.cpp +++ b/unified-runtime/source/adapters/cuda/image.cpp @@ -704,7 +704,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; - cpy_desc.srcPitch = pSrcImageDesc->width * PixelSizeBytes; + cpy_desc.srcPitch = pSrcImageDesc->rowPitch; if (pDstImageDesc->rowPitch == 0) { cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; @@ -725,7 +725,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; cpy_desc.srcHost = pSrc; - cpy_desc.srcPitch = pSrcImageDesc->width * PixelSizeBytes; + cpy_desc.srcPitch = pSrcImageDesc->rowPitch; cpy_desc.srcHeight = pSrcImageDesc->height; cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; @@ -745,7 +745,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; cpy_desc.srcHost = pSrc; - cpy_desc.srcPitch = pSrcImageDesc->width * PixelSizeBytes; + cpy_desc.srcPitch = pSrcImageDesc->rowPitch; cpy_desc.srcHeight = std::max(uint64_t{1}, pSrcImageDesc->height); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; @@ -793,7 +793,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; - cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes; + cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; cpy_desc.dstHost = pDst; if (pSrcImageDesc->rowPitch == 0) { @@ -818,7 +818,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcArray = as_CUArray(pSrc); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; cpy_desc.dstHost = pDst; - cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes; + cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstHeight = pDstImageDesc->height; cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; @@ -838,7 +838,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcArray = as_CUArray(pSrc); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; cpy_desc.dstHost = pDst; - cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes; + cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstHeight = std::max(uint64_t{1}, pDstImageDesc->height); cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height); From ce3822c74701d14d35bfd246d23645eacd9624a0 Mon Sep 17 00:00:00 2001 From: Przemek Malon Date: Mon, 23 Jun 2025 16:03:31 +0100 Subject: [PATCH 2/7] Update pitch values in HIP and L0 adapters --- unified-runtime/source/adapters/hip/image.cpp | 12 ++++++------ .../source/adapters/level_zero/image_common.cpp | 6 ++---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/unified-runtime/source/adapters/hip/image.cpp b/unified-runtime/source/adapters/hip/image.cpp index d33d7c262efde..3788e307b748d 100644 --- a/unified-runtime/source/adapters/hip/image.cpp +++ b/unified-runtime/source/adapters/hip/image.cpp @@ -704,7 +704,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; cpy_desc.dstY = pCopyRegion->dstOffset.y; - cpy_desc.srcPitch = pSrcImageDesc->width * PixelSizeBytes; + cpy_desc.srcPitch = pSrcImageDesc->rowPitch; if (pDstImageDesc->rowPitch == 0) { cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); @@ -727,7 +727,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeHost; cpy_desc.srcHost = pSrc; - cpy_desc.srcPitch = pSrcImageDesc->width * PixelSizeBytes; + cpy_desc.srcPitch = pSrcImageDesc->rowPitch; cpy_desc.srcHeight = pSrcImageDesc->height; cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); @@ -749,7 +749,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeHost; cpy_desc.srcHost = pSrc; - cpy_desc.srcPitch = pSrcImageDesc->width * PixelSizeBytes; + cpy_desc.srcPitch = pSrcImageDesc->rowPitch; cpy_desc.srcHeight = std::max(MinCopyHeight, pSrcImageDesc->height); cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); @@ -824,7 +824,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( } cpy_desc.dstMemoryType = hipMemoryTypeHost; cpy_desc.dstHost = pDst; - cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes; + cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; UR_CHECK_ERROR(hipMemcpyParam2DAsync(&cpy_desc, Stream)); @@ -840,7 +840,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcArray = static_cast(const_cast(pSrc)); cpy_desc.dstMemoryType = hipMemoryTypeHost; cpy_desc.dstHost = pDst; - cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes; + cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstHeight = pDstImageDesc->height; cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; @@ -863,7 +863,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcArray = static_cast(const_cast(pSrc)); cpy_desc.dstMemoryType = hipMemoryTypeHost; cpy_desc.dstHost = pDst; - cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes; + cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstHeight = std::max(MinCopyHeight, pDstImageDesc->height); cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; cpy_desc.Height = diff --git a/unified-runtime/source/adapters/level_zero/image_common.cpp b/unified-runtime/source/adapters/level_zero/image_common.cpp index d191b2a7c37cf..d51bc16ed1fcb 100644 --- a/unified-runtime/source/adapters/level_zero/image_common.cpp +++ b/unified-runtime/source/adapters/level_zero/image_common.cpp @@ -779,8 +779,7 @@ ur_result_t bindlessImagesHandleCopyFlags( switch (imageCopyFlags) { case UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE: { - uint32_t SrcRowPitch = - pSrcImageDesc->width * getPixelSizeBytes(pSrcImageFormat); + uint32_t SrcRowPitch = pSrcImageDesc->rowPitch; uint32_t SrcSlicePitch = SrcRowPitch * pSrcImageDesc->height; if (pDstImageDesc->rowPitch == 0) { // Copy to Non-USM memory @@ -824,8 +823,7 @@ ur_result_t bindlessImagesHandleCopyFlags( return UR_RESULT_SUCCESS; }; case UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST: { - uint32_t DstRowPitch = - pDstImageDesc->width * getPixelSizeBytes(pDstImageFormat); + uint32_t DstRowPitch = pDstImageDesc->rowPitch; uint32_t DstSlicePitch = DstRowPitch * pDstImageDesc->height; if (pSrcImageDesc->rowPitch == 0) { // Copy from Non-USM memory to host From 994a5973e1a546776d5b88c69827b09f078f11d7 Mon Sep 17 00:00:00 2001 From: Przemek Malon Date: Tue, 24 Jun 2025 09:10:21 +0100 Subject: [PATCH 3/7] Calculate and pass host row pitch required for L0 backend --- sycl/source/handler.cpp | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 69c8f768f7406..edd75b4c5eb34 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1621,10 +1621,17 @@ void handler::ext_oneapi_copy( get_pointer_type(Dest, createSyclObjFromImpl(impl->get_context()))); + // Calculate host pitch, where host memory is always assumed to be tightly + // packed. + size_t HostRowPitch = + Desc.width * Desc.num_channels * detail::get_channel_size(Desc); + if (ImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE) { - detail::fill_copy_args(get_impl(), Desc, ImageCopyFlags, 0, DeviceRowPitch); + detail::fill_copy_args(get_impl(), Desc, ImageCopyFlags, HostRowPitch, + DeviceRowPitch); } else if (ImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) { - detail::fill_copy_args(get_impl(), Desc, ImageCopyFlags, DeviceRowPitch, 0); + detail::fill_copy_args(get_impl(), Desc, ImageCopyFlags, DeviceRowPitch, + HostRowPitch); } else { throw sycl::exception(make_error_code(errc::invalid), "Copy Error: This copy function only performs host " @@ -1653,15 +1660,20 @@ void handler::ext_oneapi_copy( get_pointer_type(Dest, createSyclObjFromImpl(impl->get_context()))); + // Calculate host pitch, where host memory is always assumed to be tightly + // packed. + size_t HostRowPitch = DeviceImgDesc.width * DeviceImgDesc.num_channels * + detail::get_channel_size(DeviceImgDesc); + // Fill the host extent based on the type of copy. if (ImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE) { - detail::fill_copy_args(get_impl(), DeviceImgDesc, ImageCopyFlags, 0, - DeviceRowPitch, SrcOffset, HostExtent, DestOffset, - {0, 0, 0}, CopyExtent); + detail::fill_copy_args(get_impl(), DeviceImgDesc, ImageCopyFlags, + HostRowPitch, DeviceRowPitch, SrcOffset, HostExtent, + DestOffset, {0, 0, 0}, CopyExtent); } else if (ImageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) { detail::fill_copy_args(get_impl(), DeviceImgDesc, ImageCopyFlags, - DeviceRowPitch, 0, SrcOffset, {0, 0, 0}, DestOffset, - HostExtent, CopyExtent); + DeviceRowPitch, HostRowPitch, SrcOffset, {0, 0, 0}, + DestOffset, HostExtent, CopyExtent); } else { throw sycl::exception(make_error_code(errc::invalid), "Copy Error: This copy function only performs host " From 80ec0bef0f516624a57c9e9bfa9bc61d62e931f3 Mon Sep 17 00:00:00 2001 From: Przemek Malon Date: Tue, 24 Jun 2025 09:28:53 +0100 Subject: [PATCH 4/7] Fix mistake in host row pitch calculation --- sycl/source/handler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index edd75b4c5eb34..17a4d02dccd06 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1662,7 +1662,7 @@ void handler::ext_oneapi_copy( // Calculate host pitch, where host memory is always assumed to be tightly // packed. - size_t HostRowPitch = DeviceImgDesc.width * DeviceImgDesc.num_channels * + size_t HostRowPitch = HostExtent[0] * DeviceImgDesc.num_channels * detail::get_channel_size(DeviceImgDesc); // Fill the host extent based on the type of copy. From d6f2fcb0e37d77e733587881af9b1c2b6c50982a Mon Sep 17 00:00:00 2001 From: Przemek Malon Date: Tue, 29 Jul 2025 12:44:34 +0100 Subject: [PATCH 5/7] Extend subregion copy test coverage --- ...1D_subregion.cpp => copy_subregion_1D.cpp} | 238 +++++++++++++----- ...2D_subregion.cpp => copy_subregion_2D.cpp} | 201 ++++++++++++--- ...3D_subregion.cpp => copy_subregion_3D.cpp} | 77 +++++- 3 files changed, 403 insertions(+), 113 deletions(-) rename sycl/test-e2e/bindless_images/copies/{device_to_device_copy_1D_subregion.cpp => copy_subregion_1D.cpp} (64%) rename sycl/test-e2e/bindless_images/copies/{device_to_device_copy_2D_subregion.cpp => copy_subregion_2D.cpp} (61%) rename sycl/test-e2e/bindless_images/copies/{device_to_device_copy_3D_subregion.cpp => copy_subregion_3D.cpp} (61%) diff --git a/sycl/test-e2e/bindless_images/copies/device_to_device_copy_1D_subregion.cpp b/sycl/test-e2e/bindless_images/copies/copy_subregion_1D.cpp similarity index 64% rename from sycl/test-e2e/bindless_images/copies/device_to_device_copy_1D_subregion.cpp rename to sycl/test-e2e/bindless_images/copies/copy_subregion_1D.cpp index 357d67d78de9b..b3b836610d2e7 100644 --- a/sycl/test-e2e/bindless_images/copies/device_to_device_copy_1D_subregion.cpp +++ b/sycl/test-e2e/bindless_images/copies/copy_subregion_1D.cpp @@ -4,14 +4,15 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include -#include #include #include #include -// Uncomment to print additional test information -// #define VERBOSE_PRINT +#include +#include + +// Uncomment to print additional test information. +#define VERBOSE_PRINT namespace syclexp = sycl::ext::oneapi::experimental; @@ -21,45 +22,71 @@ void copy_image_mem_handle_to_image_mem_handle( const std::vector &dataIn2, sycl::device dev, sycl::queue q, std::vector &out) { - // Check that output image is double size of input images + // Check that output image is double size of input images. assert(outDesc.width == dataInDesc.width * 2); syclexp::image_mem imgMemSrc1(dataInDesc, dev, q.get_context()); syclexp::image_mem imgMemSrc2(dataInDesc, dev, q.get_context()); syclexp::image_mem imgMemDst(outDesc, dev, q.get_context()); - // Copy input data to device - q.ext_oneapi_copy(dataIn1.data(), imgMemSrc1.get_handle(), dataInDesc); - q.ext_oneapi_copy(dataIn2.data(), imgMemSrc2.get_handle(), dataInDesc); + // Copy host input data to device. + // Extent to copy. + sycl::range copyExtent = {dataInDesc.width / 2, 1, 1}; - q.wait_and_throw(); + // Copy first half of dataIn1 to first quarter of imgMemSrc1. + q.ext_oneapi_copy(dataIn1.data(), {0, 0, 0}, {dataInDesc.width, 0, 0}, + imgMemSrc1.get_handle(), {0, 0, 0}, dataInDesc, copyExtent); - // Extent to copy - sycl::range copyExtent = {dataInDesc.width / 2, 1, 1}; + // Copy second half of dataIn1 to second quarter of imgMemSrc1. + q.ext_oneapi_copy(dataIn1.data(), {dataInDesc.width / 2, 0, 0}, + {dataInDesc.width, 0, 0}, imgMemSrc1.get_handle(), + {dataInDesc.width / 2, 0, 0}, dataInDesc, copyExtent); + + // Copy first half of dataIn2 to third quarter of imgMemSrc2. + q.ext_oneapi_copy(dataIn2.data(), {0, 0, 0}, {dataInDesc.width, 0, 0}, + imgMemSrc2.get_handle(), {0, 0, 0}, dataInDesc, copyExtent); - // Copy first half of imgMemSrcOne to first quarter of imgMemDst + // Copy second half of dataIn2 to fourth quarter of imgMemSrc2. + q.ext_oneapi_copy(dataIn2.data(), {dataInDesc.width / 2, 0, 0}, + {dataInDesc.width, 0, 0}, imgMemSrc2.get_handle(), + {dataInDesc.width / 2, 0, 0}, dataInDesc, copyExtent); + + q.wait_and_throw(); + + // Copy data from device to device. + // Copy first half of imgMemSrc1 to first quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc1.get_handle(), {0, 0, 0}, dataInDesc, imgMemDst.get_handle(), {0, 0, 0}, outDesc, copyExtent); - // Copy second half of imgMemSrcOne to second quarter of imgMemDst + // Copy second half of imgMemSrc1 to second quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc1.get_handle(), {dataInDesc.width / 2, 0, 0}, dataInDesc, imgMemDst.get_handle(), {outDesc.width / 4, 0, 0}, outDesc, copyExtent); - // Copy first half of imgMemSrcTwo to third quarter of imgMemDst + // Copy first half of imgMemSrc2 to third quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc2.get_handle(), {0, 0, 0}, dataInDesc, imgMemDst.get_handle(), {outDesc.width / 2, 0, 0}, outDesc, copyExtent); - // Copy second half of imgMemSrcTwo to fourth quarter of imgMemDst + // Copy second half of imgMemSrc2 to fourth quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc2.get_handle(), {dataInDesc.width / 2, 0, 0}, dataInDesc, imgMemDst.get_handle(), {(outDesc.width / 4) * 3, 0, 0}, outDesc, copyExtent); q.wait_and_throw(); - // Copy out data to host - q.ext_oneapi_copy(imgMemDst.get_handle(), out.data(), outDesc); + // Copy device data back to host. + // Extent to copy. + copyExtent = {outDesc.width / 2, 1, 1}; + + // Copy first half of imgMemDst to first half of out data. + q.ext_oneapi_copy(imgMemDst.get_handle(), {0, 0, 0}, outDesc, out.data(), + {0, 0, 0}, {outDesc.width / 2, 0, 0}, copyExtent); + + // Copy second half of imgMemDst to second half of out data. + q.ext_oneapi_copy(imgMemDst.get_handle(), {outDesc.width / 2, 0, 0}, outDesc, + out.data(), {outDesc.width / 2, 0, 0}, + {outDesc.width / 2, 0, 0}, copyExtent); q.wait_and_throw(); } @@ -71,48 +98,74 @@ void copy_image_mem_handle_to_usm(const syclexp::image_descriptor &dataInDesc, sycl::device dev, sycl::queue q, std::vector &out) { - // Check that output image is double size of input images + // Check that output image is double size of input images. assert(outDesc.width == dataInDesc.width * 2); syclexp::image_mem imgMemSrc1(dataInDesc, dev, q.get_context()); syclexp::image_mem imgMemSrc2(dataInDesc, dev, q.get_context()); - // Allocate 1D device USM memory. Pitch set to zero as it is a 1D image + // Allocate 1D device USM memory. Pitch set to zero as it is a 1D image. size_t pitch = 0; size_t elements = outDesc.width * outDesc.num_channels; void *imgMemDst = sycl::malloc_device(elements, q); - // Copy input data to device - q.ext_oneapi_copy(dataIn1.data(), imgMemSrc1.get_handle(), dataInDesc); - q.ext_oneapi_copy(dataIn2.data(), imgMemSrc2.get_handle(), dataInDesc); + // Copy host input data to device. + // Extent to copy. + sycl::range copyExtent = {dataInDesc.width / 2, 1, 1}; - q.wait_and_throw(); + // Copy first half of dataIn1 to first half of imgMemSrc1. + q.ext_oneapi_copy(dataIn1.data(), {0, 0, 0}, {dataInDesc.width, 0, 0}, + imgMemSrc1.get_handle(), {0, 0, 0}, dataInDesc, copyExtent); - // Extent to copy - sycl::range copyExtent = {dataInDesc.width / 2, 1, 1}; + // Copy second half of dataIn1 to second half of imgMemSrc1. + q.ext_oneapi_copy(dataIn1.data(), {dataInDesc.width / 2, 0, 0}, + {dataInDesc.width, 0, 0}, imgMemSrc1.get_handle(), + {dataInDesc.width / 2, 0, 0}, dataInDesc, copyExtent); + + // Copy first half of dataIn2 to first half of imgMemSrc2. + q.ext_oneapi_copy(dataIn2.data(), {0, 0, 0}, {dataInDesc.width, 0, 0}, + imgMemSrc2.get_handle(), {0, 0, 0}, dataInDesc, copyExtent); - // Copy first half of imgMemSrcOne to first quarter of imgMemDst + // Copy second half of dataIn2 to second half of imgMemSrc2. + q.ext_oneapi_copy(dataIn2.data(), {dataInDesc.width / 2, 0, 0}, + {dataInDesc.width, 0, 0}, imgMemSrc2.get_handle(), + {dataInDesc.width / 2, 0, 0}, dataInDesc, copyExtent); + + q.wait_and_throw(); + + // Copy data from device to device. + // Copy first half of imgMemSrc1 to first quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc1.get_handle(), {0, 0, 0}, dataInDesc, imgMemDst, {0, 0, 0}, outDesc, pitch, copyExtent); - // Copy second half of imgMemSrcOne to second quarter of imgMemDst + // Copy second half of imgMemSrc1 to second quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc1.get_handle(), {dataInDesc.width / 2, 0, 0}, dataInDesc, imgMemDst, {outDesc.width / 4, 0, 0}, outDesc, pitch, copyExtent); - // Copy first half of imgMemSrcTwo to third quarter of imgMemDst + // Copy first half of imgMemSrc2 to third quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc2.get_handle(), {0, 0, 0}, dataInDesc, imgMemDst, {outDesc.width / 2, 0, 0}, outDesc, pitch, copyExtent); - // Copy second half of imgMemSrcTwo to fourth quarter of imgMemDst + // Copy second half of imgMemSrc2 to fourth quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc2.get_handle(), {dataInDesc.width / 2, 0, 0}, dataInDesc, imgMemDst, {(outDesc.width / 4) * 3, 0, 0}, outDesc, pitch, copyExtent); q.wait_and_throw(); - // Copy out data to host - q.ext_oneapi_copy(imgMemDst, out.data(), outDesc, pitch); + // Copy device data back to host. + // Extent to copy. + copyExtent = {outDesc.width / 2, 1, 1}; + + // Copy first half of imgMemDst to first half of out data. + q.ext_oneapi_copy(imgMemDst, {0, 0, 0}, out.data(), {0, 0, 0}, outDesc, pitch, + {outDesc.width, 0, 0}, copyExtent); + + // Copy second half of imgMemDst to second half of out data. + q.ext_oneapi_copy(imgMemDst, {outDesc.width / 2, 0, 0}, out.data(), + {outDesc.width / 2, 0, 0}, outDesc, pitch, + {outDesc.width, 0, 0}, copyExtent); q.wait_and_throw(); @@ -126,7 +179,7 @@ void copy_usm_to_image_mem_handle(const syclexp::image_descriptor &dataInDesc, sycl::device dev, sycl::queue q, std::vector &out) { - // Check that output image is double size of input images + // Check that output image is double size of input images. assert(outDesc.width == dataInDesc.width * 2); size_t pitchSrc1 = 0; @@ -137,38 +190,66 @@ void copy_usm_to_image_mem_handle(const syclexp::image_descriptor &dataInDesc, syclexp::image_mem imgMemDst(outDesc, dev, q.get_context()); - // Copy input data to device - q.ext_oneapi_copy(dataIn1.data(), imgMemSrc1, dataInDesc, pitchSrc1); - q.ext_oneapi_copy(dataIn2.data(), imgMemSrc2, dataInDesc, pitchSrc2); + // Copy host input data to device. + // Extent to copy. + sycl::range copyExtent = {dataInDesc.width / 2, 1, 1}; - q.wait_and_throw(); + // Copy first half of dataIn1 to first half of imgMemSrc1. + q.ext_oneapi_copy(dataIn1.data(), {0, 0, 0}, imgMemSrc1, {0, 0, 0}, + dataInDesc, pitchSrc1, {dataInDesc.width, 0, 0}, + copyExtent); - // Extent to copy - sycl::range copyExtent = {dataInDesc.width / 2, 1, 1}; + // Copy second half of dataIn1 to second half of imgMemSrc1. + q.ext_oneapi_copy(dataIn1.data(), {dataInDesc.width / 2, 0, 0}, imgMemSrc1, + {dataInDesc.width / 2, 0, 0}, dataInDesc, pitchSrc1, + {dataInDesc.width, 0, 0}, copyExtent); + + // Copy first half of dataIn2 to first half of imgMemSrc2. + q.ext_oneapi_copy(dataIn2.data(), {0, 0, 0}, imgMemSrc2, {0, 0, 0}, + dataInDesc, pitchSrc2, {dataInDesc.width, 0, 0}, + copyExtent); + + // Copy second half of dataIn2 to second half of imgMemSrc2. + q.ext_oneapi_copy(dataIn2.data(), {dataInDesc.width / 2, 0, 0}, imgMemSrc2, + {dataInDesc.width / 2, 0, 0}, dataInDesc, pitchSrc2, + {dataInDesc.width, 0, 0}, copyExtent); - // Copy first half of imgMemSrcOne to first quarter of imgMemDst + q.wait_and_throw(); + + // Copy data from device to device. + // Copy first half of imgMemSrcOne to first quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc1, {0, 0, 0}, dataInDesc, pitchSrc1, imgMemDst.get_handle(), {0, 0, 0}, outDesc, copyExtent); - // Copy second half of imgMemSrcOne to second quarter of imgMemDst + // Copy second half of imgMemSrcOne to second quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc1, {dataInDesc.width / 2, 0, 0}, dataInDesc, pitchSrc1, imgMemDst.get_handle(), {outDesc.width / 4, 0, 0}, outDesc, copyExtent); - // Copy first half of imgMemSrcTwo to third quarter of imgMemDst + // Copy first half of imgMemSrcTwo to third quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc2, {0, 0, 0}, dataInDesc, pitchSrc2, imgMemDst.get_handle(), {outDesc.width / 2, 0, 0}, outDesc, copyExtent); - // Copy second half of imgMemSrcTwo to fourth quarter of imgMemDst + // Copy second half of imgMemSrcTwo to fourth quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc2, {dataInDesc.width / 2, 0, 0}, dataInDesc, pitchSrc2, imgMemDst.get_handle(), {(outDesc.width / 4) * 3, 0, 0}, outDesc, copyExtent); q.wait_and_throw(); - // Copy out data to host - q.ext_oneapi_copy(imgMemDst.get_handle(), out.data(), outDesc); + // Copy device data back to host. + // Extent to copy. + copyExtent = {outDesc.width / 2, 1, 1}; + + // Copy first half of imgMemDst to first half of out data. + q.ext_oneapi_copy(imgMemDst.get_handle(), {0, 0, 0}, outDesc, out.data(), + {0, 0, 0}, {outDesc.width, 0, 0}, copyExtent); + + // Copy second half of imgMemDst to second half of out data. + q.ext_oneapi_copy(imgMemDst.get_handle(), {outDesc.width / 2, 0, 0}, outDesc, + out.data(), {outDesc.width / 2, 0, 0}, + {outDesc.width, 0, 0}, copyExtent); q.wait_and_throw(); @@ -182,7 +263,7 @@ void copy_usm_to_usm(const syclexp::image_descriptor &dataInDesc, const std::vector &dataIn2, sycl::device dev, sycl::queue q, std::vector &out) { - // Check that output image is double size of input images + // Check that output image is double size of input images. assert(outDesc.width == dataInDesc.width * 2); size_t pitchSrc1 = 0; @@ -191,43 +272,69 @@ void copy_usm_to_usm(const syclexp::image_descriptor &dataInDesc, void *imgMemSrc1 = sycl::malloc_device(elementsSrc, q); void *imgMemSrc2 = sycl::malloc_device(elementsSrc, q); - // syclexp::image_mem imgMemDst(outDesc, dev, q.get_context()); - size_t pitchDst = 0; size_t elementsDst = outDesc.width * outDesc.num_channels; void *imgMemDst = sycl::malloc_device(elementsDst, q); - // Copy input data to device - q.ext_oneapi_copy(dataIn1.data(), imgMemSrc1, dataInDesc, pitchSrc1); - q.ext_oneapi_copy(dataIn2.data(), imgMemSrc2, dataInDesc, pitchSrc2); + // Copy host input data to device. + // Extent to copy. + sycl::range copyExtent = {dataInDesc.width / 2, 1, 1}; - q.wait_and_throw(); + // Copy first half of dataIn1 to first half of imgMemSrc1. + q.ext_oneapi_copy(dataIn1.data(), {0, 0, 0}, imgMemSrc1, {0, 0, 0}, + dataInDesc, pitchSrc1, {dataInDesc.width, 0, 0}, + copyExtent); - // Extent to copy - sycl::range copyExtent = {dataInDesc.width / 2, 1, 1}; + // Copy second half of dataIn1 to second half of imgMemSrc1. + q.ext_oneapi_copy(dataIn1.data(), {dataInDesc.width / 2, 0, 0}, imgMemSrc1, + {dataInDesc.width / 2, 0, 0}, dataInDesc, pitchSrc1, + {dataInDesc.width, 0, 0}, copyExtent); - // Copy first half of imgMemSrcOne to first quarter of imgMemDst + // Copy first half of dataIn2 to first half of imgMemSrc2. + q.ext_oneapi_copy(dataIn2.data(), {0, 0, 0}, imgMemSrc2, {0, 0, 0}, + dataInDesc, pitchSrc2, {dataInDesc.width, 0, 0}, + copyExtent); + + // Copy second half of dataIn2 to second half of imgMemSrc2. + q.ext_oneapi_copy(dataIn2.data(), {dataInDesc.width / 2, 0, 0}, imgMemSrc2, + {dataInDesc.width / 2, 0, 0}, dataInDesc, pitchSrc2, + {dataInDesc.width, 0, 0}, copyExtent); + + q.wait_and_throw(); + + // Copy data from device to device. + // Copy first half of imgMemSrc1 to first quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc1, {0, 0, 0}, dataInDesc, pitchSrc1, imgMemDst, {0, 0, 0}, outDesc, pitchDst, copyExtent); - // Copy second half of imgMemSrcOne to second quarter of imgMemDst + // Copy second half of imgMemSrc1 to second quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc1, {dataInDesc.width / 2, 0, 0}, dataInDesc, pitchSrc1, imgMemDst, {outDesc.width / 4, 0, 0}, outDesc, pitchDst, copyExtent); - // Copy first half of imgMemSrcTwo to third quarter of imgMemDst + // Copy first half of imgMemSrc2 to third quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc2, {0, 0, 0}, dataInDesc, pitchSrc2, imgMemDst, {outDesc.width / 2, 0, 0}, outDesc, pitchDst, copyExtent); - // Copy second half of imgMemSrcTwo to fourth quarter of imgMemDst + // Copy second half of imgMemSrc2 to fourth quarter of imgMemDst. q.ext_oneapi_copy(imgMemSrc2, {dataInDesc.width / 2, 0, 0}, dataInDesc, pitchSrc2, imgMemDst, {(outDesc.width / 4) * 3, 0, 0}, outDesc, pitchDst, copyExtent); q.wait_and_throw(); - // Copy out data to host - q.ext_oneapi_copy(imgMemDst, out.data(), outDesc, pitchDst); + // Copy device data back to host. + // Extent to copy. + copyExtent = {outDesc.width / 2, 1, 1}; + + // Copy first half of imgMemDst to first half of out data. + q.ext_oneapi_copy(imgMemDst, {0, 0, 0}, out.data(), {0, 0, 0}, outDesc, + pitchDst, {outDesc.width, 0, 0}, copyExtent); + + // Copy second half of imgMemDst to second half of out data. + q.ext_oneapi_copy(imgMemDst, {outDesc.width / 2, 0, 0}, out.data(), + {outDesc.width / 2, 0, 0}, outDesc, pitchDst, + {outDesc.width, 0, 0}, copyExtent); q.wait_and_throw(); @@ -409,12 +516,11 @@ bool check_test(const std::vector &out, return validated; } -template +template bool run_copy_test(sycl::device &dev, sycl::queue &q, sycl::range<1> dims) { std::vector dataIn1(dims.size() / 2); std::vector dataIn2(dims.size() / 2); - std::vector out(dims.size()); + std::vector out(dims.size(), 0); std::vector expected(dims.size()); @@ -439,16 +545,22 @@ bool run_copy_test(sycl::device &dev, sycl::queue &q, sycl::range<1> dims) { validated = validated && check_test(out, expected); + std::fill(out.begin(), out.end(), 0); + copy_image_mem_handle_to_usm(dataInDesc, outDesc, dataIn1, dataIn2, dev, q, out); validated = validated && check_test(out, expected); + std::fill(out.begin(), out.end(), 0); + copy_usm_to_image_mem_handle(dataInDesc, outDesc, dataIn1, dataIn2, dev, q, out); validated = validated && check_test(out, expected); + std::fill(out.begin(), out.end(), 0); + copy_usm_to_usm(dataInDesc, outDesc, dataIn1, dataIn2, dev, q, out); validated = validated && check_test(out, expected); @@ -483,7 +595,5 @@ int main() { return 1; } - std::cout << "Tests passed\n"; - return 0; } diff --git a/sycl/test-e2e/bindless_images/copies/device_to_device_copy_2D_subregion.cpp b/sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp similarity index 61% rename from sycl/test-e2e/bindless_images/copies/device_to_device_copy_2D_subregion.cpp rename to sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp index d13a9146b2784..10d9378a38764 100644 --- a/sycl/test-e2e/bindless_images/copies/device_to_device_copy_2D_subregion.cpp +++ b/sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp @@ -10,8 +10,8 @@ #include #include -// Uncomment to print additional test information -// #define VERBOSE_PRINT +// Uncomment to print additional test information. +#define VERBOSE_PRINT namespace syclexp = sycl::ext::oneapi::experimental; @@ -21,15 +21,30 @@ void copy_image_mem_handle_to_image_mem_handle( syclexp::image_mem imgMemSrc(desc, dev, q.get_context()); syclexp::image_mem imgMemDst(desc, dev, q.get_context()); - // Copy input data to device - q.ext_oneapi_copy(dataIn.data(), imgMemSrc.get_handle(), desc); + // Copy host input data to device. + // Extent to copy. + sycl::range copyExtent = {desc.width / 2, desc.height / 2, 1}; + sycl::range hostExtent = {desc.width, desc.height, 0}; - q.wait_and_throw(); + // Copy four quarters of input data into device image memory. + q.ext_oneapi_copy(dataIn.data(), {0, 0, 0}, hostExtent, + imgMemSrc.get_handle(), {0, 0, 0}, desc, copyExtent); - // Extent to copy - sycl::range copyExtent = {desc.width / 2, desc.height / 2, 1}; + q.ext_oneapi_copy(dataIn.data(), {desc.width / 2, 0, 0}, hostExtent, + imgMemSrc.get_handle(), {desc.width / 2, 0, 0}, desc, + copyExtent); + + q.ext_oneapi_copy(dataIn.data(), {0, desc.height / 2, 0}, hostExtent, + imgMemSrc.get_handle(), {0, desc.height / 2, 0}, desc, + copyExtent); - // Copy four quarters of square into output image + q.ext_oneapi_copy(dataIn.data(), {desc.width / 2, desc.height / 2, 0}, + hostExtent, imgMemSrc.get_handle(), + {desc.width / 2, desc.height / 2, 0}, desc, copyExtent); + + q.wait_and_throw(); + + // Copy data from device to device, using four sub-region copies. q.ext_oneapi_copy(imgMemSrc.get_handle(), {0, 0, 0}, desc, imgMemDst.get_handle(), {0, 0, 0}, desc, copyExtent); @@ -48,8 +63,21 @@ void copy_image_mem_handle_to_image_mem_handle( q.wait_and_throw(); - // Copy out data to host - q.ext_oneapi_copy(imgMemDst.get_handle(), out.data(), desc); + // Copy device data back to host. + // Copy four quarters of device imgMemDst data to host out. + q.ext_oneapi_copy(imgMemDst.get_handle(), {0, 0, 0}, desc, out.data(), + {0, 0, 0}, hostExtent, copyExtent); + + q.ext_oneapi_copy(imgMemDst.get_handle(), {desc.width / 2, 0, 0}, desc, + out.data(), {desc.width / 2, 0, 0}, hostExtent, copyExtent); + + q.ext_oneapi_copy(imgMemDst.get_handle(), {0, desc.height / 2, 0}, desc, + out.data(), {0, desc.height / 2, 0}, hostExtent, + copyExtent); + + q.ext_oneapi_copy( + imgMemDst.get_handle(), {desc.width / 2, desc.height / 2, 0}, desc, + out.data(), {desc.width / 2, desc.height / 2, 0}, hostExtent, copyExtent); q.wait_and_throw(); } @@ -60,19 +88,33 @@ void copy_image_mem_handle_to_usm(const syclexp::image_descriptor &desc, std::vector &out) { syclexp::image_mem imgMemSrc(desc, dev, q.get_context()); - // Allocate 2D device USM memory + // Allocate 2D device USM memory. size_t pitch = 0; void *imgMemDst = syclexp::pitched_alloc_device(&pitch, desc, q); - // Copy input data to device - q.ext_oneapi_copy(dataIn.data(), imgMemSrc.get_handle(), desc); + // Copy host input data to device. + // Extent to copy. + sycl::range copyExtent = {desc.width / 2, desc.height / 2, 1}; - q.wait_and_throw(); + // Copy four quarters of input data into device image memory. + q.ext_oneapi_copy(dataIn.data(), {0, 0, 0}, {desc.width, desc.height, 0}, + imgMemSrc.get_handle(), {0, 0, 0}, desc, copyExtent); - // Extent to copy - sycl::range copyExtent = {desc.width / 2, desc.height / 2, 1}; + q.ext_oneapi_copy(dataIn.data(), {desc.width / 2, 0, 0}, + {desc.width, desc.height, 0}, imgMemSrc.get_handle(), + {desc.width / 2, 0, 0}, desc, copyExtent); + + q.ext_oneapi_copy(dataIn.data(), {0, desc.height / 2, 0}, + {desc.width, desc.height, 0}, imgMemSrc.get_handle(), + {0, desc.height / 2, 0}, desc, copyExtent); + + q.ext_oneapi_copy(dataIn.data(), {desc.width / 2, desc.height / 2, 0}, + {desc.width, desc.height, 0}, imgMemSrc.get_handle(), + {desc.width / 2, desc.height / 2, 0}, desc, copyExtent); - // Copy four quarters of square into output image + q.wait_and_throw(); + + // Copy data from device to device, using four sub-region copies. q.ext_oneapi_copy(imgMemSrc.get_handle(), {0, 0, 0}, desc, imgMemDst, {0, 0, 0}, desc, pitch, copyExtent); @@ -89,8 +131,22 @@ void copy_image_mem_handle_to_usm(const syclexp::image_descriptor &desc, q.wait_and_throw(); - // Copy out data to host - q.ext_oneapi_copy(imgMemDst, out.data(), desc, pitch); + // Copy device data back to host. + // Copy four quarters of device imgMemDst data to host out. + q.ext_oneapi_copy(imgMemDst, {0, 0, 0}, out.data(), {0, 0, 0}, desc, pitch, + {desc.width, desc.height, 0}, copyExtent); + + q.ext_oneapi_copy(imgMemDst, {desc.width / 2, 0, 0}, out.data(), + {desc.width / 2, 0, 0}, desc, pitch, + {desc.width, desc.height, 0}, copyExtent); + + q.ext_oneapi_copy(imgMemDst, {0, desc.height / 2, 0}, out.data(), + {0, desc.height / 2, 0}, desc, pitch, + {desc.width, desc.height, 0}, copyExtent); + + q.ext_oneapi_copy(imgMemDst, {desc.width / 2, desc.height / 2, 0}, out.data(), + {desc.width / 2, desc.height / 2, 0}, desc, pitch, + {desc.width, desc.height, 0}, copyExtent); q.wait_and_throw(); @@ -101,21 +157,36 @@ void copy_usm_to_image_mem_handle(const syclexp::image_descriptor &desc, const std::vector &dataIn, sycl::device dev, sycl::queue q, std::vector &out) { - // Allocate 2D device USM memory + // Allocate 2D device USM memory. size_t pitch = 0; void *imgMemSrc = syclexp::pitched_alloc_device(&pitch, desc, q); syclexp::image_mem imgMemDst(desc, dev, q.get_context()); - // Copy input data to device - q.ext_oneapi_copy(dataIn.data(), imgMemSrc, desc, pitch); + // Copy host input data to device. + // Extent to copy. + sycl::range copyExtent = {desc.width / 2, desc.height / 2, 1}; + sycl::range hostExtent = {desc.width, desc.height, 0}; - q.wait_and_throw(); + // Copy four quarters of input data into device image memory. + q.ext_oneapi_copy(dataIn.data(), {0, 0, 0}, imgMemSrc, {0, 0, 0}, desc, pitch, + hostExtent, copyExtent); - // Extent to copy - sycl::range copyExtent = {desc.width / 2, desc.height / 2, 1}; + q.ext_oneapi_copy(dataIn.data(), {desc.width / 2, 0, 0}, imgMemSrc, + {desc.width / 2, 0, 0}, desc, pitch, hostExtent, + copyExtent); + + q.ext_oneapi_copy(dataIn.data(), {0, desc.height / 2, 0}, imgMemSrc, + {0, desc.height / 2, 0}, desc, pitch, hostExtent, + copyExtent); + + q.ext_oneapi_copy(dataIn.data(), {desc.width / 2, desc.height / 2, 0}, + imgMemSrc, {desc.width / 2, desc.height / 2, 0}, desc, + pitch, hostExtent, copyExtent); + + q.wait_and_throw(); - // Copy four quarters of square into output image + // Copy data from device to device, using four sub-region copies. q.ext_oneapi_copy(imgMemSrc, {0, 0, 0}, desc, pitch, imgMemDst.get_handle(), {0, 0, 0}, desc, copyExtent); @@ -133,8 +204,23 @@ void copy_usm_to_image_mem_handle(const syclexp::image_descriptor &desc, q.wait_and_throw(); - // Copy out data to host - q.ext_oneapi_copy(imgMemDst.get_handle(), out.data(), desc); + // Copy device data back to host. + // Copy four quarters of device imgMemDst data to host out. + q.ext_oneapi_copy(imgMemDst.get_handle(), {0, 0, 0}, desc, out.data(), + {0, 0, 0}, {desc.width, desc.height, 0}, copyExtent); + + q.ext_oneapi_copy(imgMemDst.get_handle(), {desc.width / 2, 0, 0}, desc, + out.data(), {desc.width / 2, 0, 0}, + {desc.width, desc.height, 0}, copyExtent); + + q.ext_oneapi_copy(imgMemDst.get_handle(), {0, desc.height / 2, 0}, desc, + out.data(), {0, desc.height / 2, 0}, + {desc.width, desc.height, 0}, copyExtent); + + q.ext_oneapi_copy(imgMemDst.get_handle(), + {desc.width / 2, desc.height / 2, 0}, desc, out.data(), + {desc.width / 2, desc.height / 2, 0}, + {desc.width, desc.height, 0}, copyExtent); q.wait_and_throw(); @@ -144,22 +230,36 @@ void copy_usm_to_image_mem_handle(const syclexp::image_descriptor &desc, void copy_usm_to_usm(const syclexp::image_descriptor &desc, const std::vector &dataIn, sycl::device dev, sycl::queue q, std::vector &out) { - // Allocate 2D device USM memory + // Allocate 2D device USM memory. size_t pitchSrc = 0; void *imgMemSrc = syclexp::pitched_alloc_device(&pitchSrc, desc, q); size_t pitchDst = 0; void *imgMemDst = syclexp::pitched_alloc_device(&pitchDst, desc, q); - // Copy input data to device - q.ext_oneapi_copy(dataIn.data(), imgMemSrc, desc, pitchSrc); + // Copy host input data to device. + // Extent to copy. + sycl::range copyExtent = {desc.width / 2, desc.height / 2, 1}; + sycl::range hostExtent = {desc.width, desc.height, 0}; - q.wait_and_throw(); + q.ext_oneapi_copy(dataIn.data(), {0, 0, 0}, imgMemSrc, {0, 0, 0}, desc, + pitchSrc, hostExtent, copyExtent); - // Extent to copy - sycl::range copyExtent = {desc.width / 2, desc.height / 2, 1}; + q.ext_oneapi_copy(dataIn.data(), {desc.width / 2, 0, 0}, imgMemSrc, + {desc.width / 2, 0, 0}, desc, pitchSrc, hostExtent, + copyExtent); + + q.ext_oneapi_copy(dataIn.data(), {0, desc.height / 2, 0}, imgMemSrc, + {0, desc.height / 2, 0}, desc, pitchSrc, hostExtent, + copyExtent); + + q.ext_oneapi_copy(dataIn.data(), {desc.width / 2, desc.height / 2, 0}, + imgMemSrc, {desc.width / 2, desc.height / 2, 0}, desc, + pitchSrc, hostExtent, copyExtent); - // Copy four quarters of square into output image + q.wait_and_throw(); + + // Copy four quarters of square into output image. q.ext_oneapi_copy(imgMemSrc, {0, 0, 0}, desc, pitchSrc, imgMemDst, {0, 0, 0}, desc, pitchDst, copyExtent); @@ -177,8 +277,25 @@ void copy_usm_to_usm(const syclexp::image_descriptor &desc, q.wait_and_throw(); - // Copy out data to host - q.ext_oneapi_copy(imgMemDst, out.data(), desc, pitchDst); + // // Copy device data to host. + // q.ext_oneapi_copy(imgMemDst, out.data(), desc, pitchDst); + + // Copy device data back to host. + // Copy four quarters of device imgMemDst data to host out. + q.ext_oneapi_copy(imgMemDst, {0, 0, 0}, out.data(), {0, 0, 0}, desc, pitchDst, + hostExtent, copyExtent); + + q.ext_oneapi_copy(imgMemDst, {desc.width / 2, 0, 0}, out.data(), + {desc.width / 2, 0, 0}, desc, pitchDst, hostExtent, + copyExtent); + + q.ext_oneapi_copy(imgMemDst, {0, desc.height / 2, 0}, out.data(), + {0, desc.height / 2, 0}, desc, pitchDst, hostExtent, + copyExtent); + + q.ext_oneapi_copy(imgMemDst, {desc.width / 2, desc.height / 2, 0}, out.data(), + {desc.width / 2, desc.height / 2, 0}, desc, pitchDst, + hostExtent, copyExtent); q.wait_and_throw(); @@ -352,7 +469,7 @@ bool run_copy_test(sycl::device &dev, sycl::queue &q, sycl::range<2> dims) { std::vector expected(dims.size()); std::iota(expected.begin(), expected.end(), 0); - std::vector out(dims.size()); + std::vector out(dims.size(), 0); syclexp::image_descriptor desc = syclexp::image_descriptor(dims, channelNum, channelType); @@ -364,14 +481,20 @@ bool run_copy_test(sycl::device &dev, sycl::queue &q, sycl::range<2> dims) { validated = validated && check_test(out, expected); + std::fill(out.begin(), out.end(), 0); + copy_image_mem_handle_to_usm(desc, dataIn, dev, q, out); validated = validated && check_test(out, expected); + std::fill(out.begin(), out.end(), 0); + copy_usm_to_image_mem_handle(desc, dataIn, dev, q, out); validated = validated && check_test(out, expected); + std::fill(out.begin(), out.end(), 0); + copy_usm_to_usm(desc, dataIn, dev, q, out); validated = validated && check_test(out, expected); @@ -405,7 +528,5 @@ int main() { return 1; } - std::cout << "Tests passed\n"; - return 0; } diff --git a/sycl/test-e2e/bindless_images/copies/device_to_device_copy_3D_subregion.cpp b/sycl/test-e2e/bindless_images/copies/copy_subregion_3D.cpp similarity index 61% rename from sycl/test-e2e/bindless_images/copies/device_to_device_copy_3D_subregion.cpp rename to sycl/test-e2e/bindless_images/copies/copy_subregion_3D.cpp index 9c0d34e423cb9..147435fb5806a 100644 --- a/sycl/test-e2e/bindless_images/copies/device_to_device_copy_3D_subregion.cpp +++ b/sycl/test-e2e/bindless_images/copies/copy_subregion_3D.cpp @@ -19,13 +19,45 @@ void copy_image_mem_handle_to_image_mem_handle( syclexp::image_mem imgMemSrc(desc, dev, q.get_context()); syclexp::image_mem imgMemDst(desc, dev, q.get_context()); - // Copy input data to device - q.ext_oneapi_copy(dataIn.data(), imgMemSrc.get_handle(), desc); + // Copy host input data to device. + // Extent to copy. + sycl::range copyExtent = {desc.width / 2, desc.height / 2, desc.depth / 2}; + sycl::range hostExtent = {desc.width, desc.height, desc.depth}; - q.wait_and_throw(); + // Copy eight quadrants of input data into device image memory. + q.ext_oneapi_copy(dataIn.data(), {0, 0, 0}, hostExtent, + imgMemSrc.get_handle(), {0, 0, 0}, desc, copyExtent); - // Extent to copy - sycl::range copyExtent = {desc.width / 2, desc.height / 2, desc.depth / 2}; + q.ext_oneapi_copy(dataIn.data(), {desc.width / 2, 0, 0}, hostExtent, + imgMemSrc.get_handle(), {desc.width / 2, 0, 0}, desc, + copyExtent); + + q.ext_oneapi_copy(dataIn.data(), {0, desc.height / 2, 0}, hostExtent, + imgMemSrc.get_handle(), {0, desc.height / 2, 0}, desc, + copyExtent); + + q.ext_oneapi_copy(dataIn.data(), {desc.width / 2, desc.height / 2, 0}, + hostExtent, imgMemSrc.get_handle(), + {desc.width / 2, desc.height / 2, 0}, desc, copyExtent); + + q.ext_oneapi_copy(dataIn.data(), {0, 0, desc.depth / 2}, hostExtent, + imgMemSrc.get_handle(), {0, 0, desc.depth / 2}, desc, + copyExtent); + + q.ext_oneapi_copy(dataIn.data(), {desc.width / 2, 0, desc.depth / 2}, + hostExtent, imgMemSrc.get_handle(), + {desc.width / 2, 0, desc.depth / 2}, desc, copyExtent); + + q.ext_oneapi_copy(dataIn.data(), {0, desc.height / 2, desc.depth / 2}, + hostExtent, imgMemSrc.get_handle(), + {0, desc.height / 2, desc.depth / 2}, desc, copyExtent); + + q.ext_oneapi_copy( + dataIn.data(), {desc.width / 2, desc.height / 2, desc.depth / 2}, + hostExtent, imgMemSrc.get_handle(), + {desc.width / 2, desc.height / 2, desc.depth / 2}, desc, copyExtent); + + q.wait_and_throw(); // Copy eight quadrants of square into output image q.ext_oneapi_copy(imgMemSrc.get_handle(), {0, 0, 0}, desc, @@ -64,8 +96,37 @@ void copy_image_mem_handle_to_image_mem_handle( q.wait_and_throw(); - // Copy out data to host - q.ext_oneapi_copy(imgMemDst.get_handle(), out.data(), desc); + // Copy device data back to host. + // Copy four quarters of device imgMemDst data to host out. + q.ext_oneapi_copy(imgMemDst.get_handle(), {0, 0, 0}, desc, out.data(), + {0, 0, 0}, hostExtent, copyExtent); + + q.ext_oneapi_copy(imgMemDst.get_handle(), {desc.width / 2, 0, 0}, desc, + out.data(), {desc.width / 2, 0, 0}, hostExtent, copyExtent); + + q.ext_oneapi_copy(imgMemDst.get_handle(), {0, desc.height / 2, 0}, desc, + out.data(), {0, desc.height / 2, 0}, hostExtent, + copyExtent); + + q.ext_oneapi_copy( + imgMemDst.get_handle(), {desc.width / 2, desc.height / 2, 0}, desc, + out.data(), {desc.width / 2, desc.height / 2, 0}, hostExtent, copyExtent); + + q.ext_oneapi_copy(imgMemDst.get_handle(), {0, 0, desc.depth / 2}, desc, + out.data(), {0, 0, desc.depth / 2}, hostExtent, copyExtent); + + q.ext_oneapi_copy(imgMemDst.get_handle(), {desc.width / 2, 0, desc.depth / 2}, + desc, out.data(), {desc.width / 2, 0, desc.depth / 2}, + hostExtent, copyExtent); + + q.ext_oneapi_copy( + imgMemDst.get_handle(), {0, desc.height / 2, desc.depth / 2}, desc, + out.data(), {0, desc.height / 2, desc.depth / 2}, hostExtent, copyExtent); + + q.ext_oneapi_copy( + imgMemDst.get_handle(), {desc.width / 2, desc.height / 2, desc.depth / 2}, + desc, out.data(), {desc.width / 2, desc.height / 2, desc.depth / 2}, + hostExtent, copyExtent); q.wait_and_throw(); } @@ -160,7 +221,5 @@ int main() { return 1; } - std::cout << "Tests passed\n"; - return 0; } From b376cf0408a461837828b7dad6ddc5a510a38b8c Mon Sep 17 00:00:00 2001 From: Przemek Malon Date: Tue, 29 Jul 2025 12:45:33 +0100 Subject: [PATCH 6/7] Remove commented code --- sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp b/sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp index 10d9378a38764..2bb3c98409249 100644 --- a/sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp +++ b/sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp @@ -277,9 +277,6 @@ void copy_usm_to_usm(const syclexp::image_descriptor &desc, q.wait_and_throw(); - // // Copy device data to host. - // q.ext_oneapi_copy(imgMemDst, out.data(), desc, pitchDst); - // Copy device data back to host. // Copy four quarters of device imgMemDst data to host out. q.ext_oneapi_copy(imgMemDst, {0, 0, 0}, out.data(), {0, 0, 0}, desc, pitchDst, From 77dffdee84e712af317c48189d4cf8d3be9196b1 Mon Sep 17 00:00:00 2001 From: Przemek Malon Date: Tue, 29 Jul 2025 12:48:09 +0100 Subject: [PATCH 7/7] Fix typo. Disable verbose print. --- .../experimental/sycl_ext_oneapi_bindless_images.asciidoc | 2 +- sycl/test-e2e/bindless_images/copies/copy_subregion_1D.cpp | 2 +- sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc index e22a6e918f311..8a124ad549c69 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc @@ -1336,7 +1336,7 @@ allocated within the same context and device of the `queue`. The USM memory must be accessible on the queue's device. The `ext_oneapi_copy` function variants that do not take offsets and extents -must ensure that the following conditions are met.: +must ensure that the following conditions are met: 1. The `Src` and `Dest` memory was allocated on the same device and context. diff --git a/sycl/test-e2e/bindless_images/copies/copy_subregion_1D.cpp b/sycl/test-e2e/bindless_images/copies/copy_subregion_1D.cpp index b3b836610d2e7..ac93ae5db9e67 100644 --- a/sycl/test-e2e/bindless_images/copies/copy_subregion_1D.cpp +++ b/sycl/test-e2e/bindless_images/copies/copy_subregion_1D.cpp @@ -12,7 +12,7 @@ #include // Uncomment to print additional test information. -#define VERBOSE_PRINT +// #define VERBOSE_PRINT namespace syclexp = sycl::ext::oneapi::experimental; diff --git a/sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp b/sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp index 2bb3c98409249..6d61a2a2a259a 100644 --- a/sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp +++ b/sycl/test-e2e/bindless_images/copies/copy_subregion_2D.cpp @@ -11,7 +11,7 @@ #include // Uncomment to print additional test information. -#define VERBOSE_PRINT +// #define VERBOSE_PRINT namespace syclexp = sycl::ext::oneapi::experimental;