JiabinYang
diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc
Lines changed: 0 additions & 32 deletions
Lines changed: 0 additions & 32 deletions
diff --git a/paddle/phi/kernels/gpu/size_kernel.cu b/paddle/phi/kernels/gpu/size_kernel.cu
Lines changed: 0 additions & 31 deletions
Lines changed: 0 additions & 31 deletions
diff --git a/paddle/phi/kernels/impl/size_kernel_impl.h b/paddle/phi/kernels/size_kernel.cc
paddle/phi/kernels/impl/size_kernel_impl.h renamed to paddle/phi/kernels/size_kernel.cc
Lines changed: 19 additions & 14 deletions
diff --git a/paddle/phi/kernels/size_kernel.h b/paddle/phi/kernels/size_kernel.h
Lines changed: 1 addition & 1 deletion
Lines changed: 1 addition & 1 deletion
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
Lines changed: 3 additions & 0 deletions
Lines changed: 3 additions & 0 deletions
@@ -12,28 +12,33 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#pragma once
+#include "paddle/phi/kernels/size_kernel.h"
 
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 
 namespace phi {
 
-template <typename T, typename Context>
+template <typename Context>  // dtype parameter T is dropped; only the context type remains
 void SizeKernel(const Context& ctx,
                 const DenseTensor& input,
                 DenseTensor* out) {
-  auto place = ctx.GetPlace();
-  auto out_data = ctx.template Alloc<int64_t>(out);
-  auto cpu_place = phi::CPUPlace();
-  if (place == cpu_place) {
-    out_data[0] = input.numel();
-  } else {
-    DenseTensor cpu_tensor;
-    cpu_tensor.Resize(out->dims());
-    auto cpu_data = ctx.template HostAlloc<int64_t>(&cpu_tensor);
-    cpu_data[0] = input.numel();
-    phi::Copy(ctx, cpu_tensor, place, false, out);
-  }
+  auto* out_data = ctx.template HostAlloc<int64_t>(out);  // single HostAlloc path for every Context; replaces the removed place check and phi::Copy
+  out_data[0] = input.numel();  // the one int64 written is the input tensor's element count
 }
 
 }  // namespace phi
+
+PD_REGISTER_GENERAL_KERNEL(  // registers the `size` kernel for the CPU backend
+    size, CPU, ALL_LAYOUT, phi::SizeKernel<phi::CPUContext>, ALL_DTYPE) {  // one CPUContext instantiation, registered with ALL_LAYOUT and ALL_DTYPE
+  kernel->OutputAt(0).SetDataType(phi::DataType::INT64);  // output 0 is declared INT64, matching the int64_t written by SizeKernel
+}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)  // GPU registration is compiled only for CUDA or HIP builds
+PD_REGISTER_GENERAL_KERNEL(  // registers the `size` kernel for the GPU backend
+    size, GPU, ALL_LAYOUT, phi::SizeKernel<phi::GPUContext>, ALL_DTYPE) {  // same SizeKernel template, instantiated with GPUContext
+  kernel->OutputAt(0)
+      .SetBackend(phi::Backend::CPU)  // output 0 is marked as CPU-backend; presumably because SizeKernel fills it via HostAlloc -- confirm
+      .SetDataType(phi::DataType::INT64);  // output 0 is declared INT64, as in the CPU registration
+}
+#endif
@@ -18,7 +18,7 @@
 
 namespace phi {
 
-template <typename T, typename Context>
+template <typename Context>  // declaration now takes only the context type, matching the definition in size_kernel.cc
 void SizeKernel(const Context& ctx, const DenseTensor& input, DenseTensor* out);
 
 }  // namespace phi
@@ -1140,6 +1140,9 @@ def all_gather_object(object_list, obj, group=None):
     ), "all_gather_object doesn't support static graph mode."
 
     tensor, len_of_tensor = _convert_object_to_tensor(obj)
+    if paddle.get_device() != "cpu":
+        len_of_tensor = len_of_tensor._copy_to(
+            paddle.framework._current_expected_place(), False)
 
     # gather len_of_tensor from all ranks
     list_len_of_tensor = []