diff --git a/SYCL/Reduction/reduction_aux_resources.cpp b/SYCL/Reduction/reduction_aux_resources.cpp
new file mode 100644
index 0000000000..192388c580
--- /dev/null
+++ b/SYCL/Reduction/reduction_aux_resources.cpp
@@ -0,0 +1,35 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
+// RUN: env SYCL_PI_TRACE=2 %CPU_RUN_PLACEHOLDER %t.out 2>&1 %CPU_CHECK_PLACEHOLDER --check-prefix=CHECK-POOL
+// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER --check-prefix=CHECK-POOL
+// RUN: env SYCL_PI_TRACE=2 SYCL_DISABLE_AUXILIARY_RESOURCE_POOL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1 %CPU_CHECK_PLACEHOLDER --check-prefix=CHECK-NO-POOL
+// RUN: env SYCL_PI_TRACE=2 SYCL_DISABLE_AUXILIARY_RESOURCE_POOL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER --check-prefix=CHECK-NO-POOL
+
+// Tests the auxiliary resource pool when consecutive reductions need
+// same-size resources.
+
+#include <sycl/sycl.hpp>
+
+int main(void) {
+  sycl::queue Q;
+  double *Result = sycl::malloc_host<double>(1, Q);
+  for (size_t I = 0; I < 20; ++I) {
+    sycl::nd_range<1> Range{2048, 32};
+    auto Red = sycl::reduction(Result, 0.0, sycl::plus<double>());
+    Q.parallel_for(Range, Red, [=](sycl::nd_item<1> NDI, auto &Sum) {}).wait();
+  }
+  sycl::free(Result, Q);
+  return 0;
+}
+
+// Each reduction needs 2 auxiliary device-memory resources:
+// 1. An integer counter.
+// 2. A buffer for intermediate reduction results.
+
+// With pooling we should be reusing resources, hence there should only be 2
+// allocations (excluding USM).
+// CHECK-POOL-COUNT-2: piMemBufferCreate
+// CHECK-POOL-NOT: piMemBufferCreate
+
+// Without pooling, each reduction will allocate its own device memory.
+// CHECK-NO-POOL-COUNT-40: piMemBufferCreate
+// CHECK-NO-POOL-NOT: piMemBufferCreate