diff --git a/SYCL/Reduction/reduction_aux_resources.cpp b/SYCL/Reduction/reduction_aux_resources.cpp
new file mode 100644
index 0000000000..192388c580
--- /dev/null
+++ b/SYCL/Reduction/reduction_aux_resources.cpp
@@ -0,0 +1,35 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
+// RUN: env SYCL_PI_TRACE=2 %CPU_RUN_PLACEHOLDER %t.out 2>&1 %CPU_CHECK_PLACEHOLDER --check-prefix=CHECK-POOL
+// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER --check-prefix=CHECK-POOL
+// RUN: env SYCL_PI_TRACE=2 SYCL_DISABLE_AUXILIARY_RESOURCE_POOL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1 %CPU_CHECK_PLACEHOLDER --check-prefix=CHECK-NO-POOL
+// RUN: env SYCL_PI_TRACE=2 SYCL_DISABLE_AUXILIARY_RESOURCE_POOL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER --check-prefix=CHECK-NO-POOL
+
+// Tests the auxiliary resource pool when consecutive reductions need
+// same-size resources.
+
+#include <sycl/sycl.hpp>
+
+int main(void) {
+  sycl::queue Q;
+  double *Result = sycl::malloc_host<double>(1, Q);
+  for (size_t I = 0; I < 20; ++I) {
+    sycl::nd_range<1> Range{2048, 32};
+    auto Red = sycl::reduction(Result, 0.0, sycl::plus<double>());
+    Q.parallel_for(Range, Red, [=](sycl::nd_item<1> NDI, auto &Sum) {}).wait();
+  }
+  sycl::free(Result, Q);
+  return 0;
+}
+
+// Each reduction needs 2 auxiliary device-memory resources:
+// 1. An integer counter.
+// 2. A buffer for intermediate reduction results.
+
+// With pooling we should be reusing resources, hence there should only be 2
+// allocations (excluding USM).
+// CHECK-POOL-COUNT-2: piMemBufferCreate
+// CHECK-POOL-NOT: piMemBufferCreate
+
+// Without pooling, each reduction will allocate its own device memory.
+// CHECK-NO-POOL-COUNT-40: piMemBufferCreate
+// CHECK-NO-POOL-NOT: piMemBufferCreate