
Commit c1f688c

fix bug of split
1 parent 5c8849f commit c1f688c

6 files changed: +161 additions, −203 deletions


paddle/fluid/operators/collective/c_embedding_op.cc

2 additions & 26 deletions

@@ -26,9 +26,6 @@ class CEmbeddingOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "CEmbeddingOp");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CEmbeddingOp");

-    // auto start_index = ctx->Attrs().Get<int64_t>("start_index");
-    // auto end_index = ctx->Attrs().Get<int64_t>("end_index");
-
     auto table_dims = ctx->GetInputDim("W");
     auto ids_dims = ctx->GetInputDim("Ids");
     int ids_rank = ids_dims.size();
@@ -42,15 +39,6 @@ class CEmbeddingOp : public framework::OperatorWithKernel {
                           "c_embedding's shape = [%s].",
                           table_dims.size(), table_dims));

-    // PADDLE_ENFORCE_EQ(
-    //     end_index - start_index, table_dims[0],
-    //     platform::errors::InvalidArgument(
-    //         "The value of end_index - start_index should be equal to table's
-    //         length."
-    //         "But received end_index - start_index = %d, "
-    //         "table's length = %d.",
-    //         end_index - start_index, table_dims[0]));
-
     auto output_dims = framework::vectorize(ids_dims);
     output_dims.push_back(table_dims[1]);
     ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
@@ -81,20 +69,9 @@ class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "The lookup results, which have the same type as W.");

     AddAttr<int64_t>("start_index",
-                     "(int64, default 0) "
-                     "If the value is 0, it makes no effect to lookup. "
-                     "Otherwise the given value indicates padding the output "
-                     "with zeros whenever lookup encounters it in Ids.")
+                     "(int64, default 0) The starting index, "
+                     "and out-of-bounds Ids will be set to 0.")
         .SetDefault(0);
-
-    // AddAttr<int64_t>("end_index",
-    //                  "(int64, default -1) "
-    //                  "If the value is -1, it makes no effect to lookup. "
-    //                  "Otherwise the given value indicates padding the output
-    //                  "
-    //                  "with zeros whenever lookup encounters it in Ids.")
-    //     .SetDefault(1);
-
     AddComment(R"DOC(
 c_embedding Operator.

@@ -153,7 +130,6 @@ class CEmbeddingOpGradVarTypeInference : public framework::VarTypeInference {
     VLOG(3) << "c_embedding_grad op " << framework::GradVarName("W")
             << " is set to LoDTensor";
     ctx->SetOutputType(out_var_name, framework::proto::VarType::LOD_TENSOR);
-    // }
     ctx->SetOutputDataType(out_var_name, ctx->GetInputDataType("W"));
   }
 };
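
To make the new start_index semantics concrete, here is a minimal NumPy sketch of the behaviour the updated attribute description implies: ids that fall outside this rank's table range produce zero rows. The function name, shapes, and values below are illustrative only; this is not the Paddle kernel.

import numpy as np

def c_embedding_reference(table, ids, start_index=0):
    """Rows for ids outside [start_index, start_index + len(table)) come back as zeros."""
    n, d = table.shape
    out = np.zeros(ids.shape + (d,), dtype=table.dtype)
    local = ids - start_index               # shift global ids into this shard's range
    in_range = (local >= 0) & (local < n)   # out-of-bounds ids keep their zero rows
    out[in_range] = table[local[in_range]]
    return out

# A 4-row shard that owns global ids 8..11; ids 7 and 12 fall outside it.
table = np.arange(8, dtype=np.float32).reshape(4, 2)
ids = np.array([[7, 8], [11, 12]])
print(c_embedding_reference(table, ids, start_index=8))  # rows for 7 and 12 are all zeros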

paddle/fluid/operators/collective/c_embedding_op.cu

0 additions & 4 deletions

@@ -82,8 +82,6 @@ class CEmbeddingCUDAKernel : public framework::OpKernel<T> {
     const auto &dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();
     const int64_t start_idx = context.Attr<int64_t>("start_index");
-    // const int64_t end_idx = context.Attr<int64_t>("end_index");
-
     size_t N = table_t->dims()[0];
     size_t D = table_t->dims()[1];
     size_t K = ids_t->numel();
@@ -118,8 +116,6 @@ class CEmbeddingGradCUDAKernel : public framework::OpKernel<T> {
     const auto &dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();
     const int64_t start_idx = context.Attr<int64_t>("start_index");
-    // const int64_t end_idx = context.Attr<int64_t>("end_index");
-
     auto ids_t = context.Input<LoDTensor>("Ids");
     auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
     auto d_table_t = context.Output<LoDTensor>(framework::GradVarName("W"));

python/paddle/distributed/collective.py

16 additions & 8 deletions

@@ -775,7 +775,7 @@ def _c_identity(tensor, group=None):
     return out


-def _c_concat(tensor, nranks, group=None):
+def _c_concat(tensor, group=None):
     """
     Return allgather of the tensor, mainly used with model parallel.

@@ -791,10 +791,14 @@ def _c_concat(tensor, nranks, group=None):
         return
     ring_id = 0 if group is None else group.id

+    global_rank = _get_global_env().rank
+    rank = global_rank if group is None else group.get_group_rank(global_rank)
+    nranks = _get_global_env().world_size if group is None else group.nranks
+
     if in_dygraph_mode():
         return core.ops.c_concat(tensor, 'ring_id', ring_id, 'use_calc_stream',
-                                 True, 'nranks', nranks, 'use_model_parallel',
-                                 True)
+                                 True, 'rank', rank, 'nranks', nranks,
+                                 'use_model_parallel', True)

     op_type = 'c_concat'
     helper = LayerHelper(op_type, **locals())
@@ -812,12 +816,13 @@ def _c_concat(tensor, nranks, group=None):
             'ring_id': ring_id,
             'use_calc_stream': True,
             'use_model_parallel': True,
-            'nranks': nranks
+            'nranks': nranks,
+            'rank': rank
         })
     return out


-def _c_split(tensor, rank, nranks, group=None):
+def _c_split(tensor, group=None):
     """
     Split tensor evenly among all members, mainly used with model parallel.

@@ -834,6 +839,10 @@ def _c_split(tensor, rank, nranks, group=None):
         return
     ring_id = 0 if group is None else group.id

+    global_rank = _get_global_env().rank
+    rank = global_rank if group is None else group.get_group_rank(global_rank)
+    nranks = _get_global_env().world_size if group is None else group.nranks
+
     if in_dygraph_mode():
         return core.ops.c_split(tensor, 'use_calc_stream', True, 'ring_id',
                                 ring_id, 'rank', rank, 'nranks', nranks,
@@ -884,11 +893,10 @@ def _mp_allreduce(tensor,


 def _c_embedding(x, weight, start_index=0, name=None):
-
     if in_dygraph_mode():
         return core.ops.c_embedding(weight, x, "start_index", start_index)
     else:
-        helper = LayerHelper('_c_embedding', **locals())
+        helper = LayerHelper('c_embedding', **locals())
         dtype = helper.input_dtype(input_param_name='weight')

         check_variable_and_dtype(x, 'input', ['int32', 'int64'], 'embedding')
@@ -1008,7 +1016,7 @@ def _parallel_linear(x,

     if axis == 0:
         if split_tensor:
-            x = _c_split(x, inner_rank, nranks, group=group)
+            x = _c_split(x, group=group)
         else:
             x = _c_identity(x, group=group)

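
A hedged usage sketch of the changed helpers: after this commit, _c_split and _c_concat take only the tensor and an optional group, deriving rank and nranks from the group (or the global environment) internally. The two-GPU launch command, script name, and tensor values below are assumptions for illustration; these remain private helpers, not public API.

# demo.py -- illustrative only.
# Assumed launch (2 GPUs): python -m paddle.distributed.launch --gpus=0,1 demo.py
import paddle
import paddle.distributed as dist
from paddle.distributed import collective

dist.init_parallel_env()

x = paddle.arange(8, dtype='float32').reshape([2, 4])

# No rank/nranks arguments any more: both are inferred from the (default) group.
local = collective._c_split(x)      # each rank keeps its slice of the last dim
full = collective._c_concat(local)  # allgather restores the original last dim
print(full.shape)                   # [2, 4] on every rank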

python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py

19 additions & 59 deletions

@@ -27,31 +27,6 @@
 # language models using model parallelism[J]. arXiv preprint arXiv:1909.08053, 2019. (https://arxiv.org/abs/1909.08053)


-class _EmbeddingInModelParallel(PyLayer):
-    @staticmethod
-    def forward(ctx, masked_input, weight, input_mask, name):
-        output_parallel = F.embedding(
-            masked_input,
-            weight=weight,
-            padding_idx=None,
-            sparse=False,
-            name=name)
-        # Mask the output embedding.
-        output_parallel[input_mask, :] = 0.0
-
-        ctx.save_for_backward(output_parallel, input_mask)
-
-        return output_parallel
-
-    @staticmethod
-    def backward(ctx, dout):
-        output_parallel, input_mask = ctx.saved_tensor()
-        paddle.autograd.backward(tensors=[output_parallel], grad_tensors=[dout])
-        output_parallel.grad[input_mask, :] = 0
-
-        return None, output_parallel.grad, None
-
-
 class VocabParallelEmbedding(Layer):
     def __init__(self,
                  num_embeddings,
@@ -76,9 +51,6 @@ def __init__(self,
         per_part_size = num_embeddings // self.world_size

         self.vocab_start_index = self.rank * per_part_size
-        self.vocab_end_index = self.vocab_start_index + per_part_size
-        # self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index
-
         self._dtype = self._helper.get_default_dtype()
         self._size = [per_part_size, embedding_dim]
         self._weight_attr = weight_attr
@@ -92,32 +64,25 @@ def __init__(self,
             is_bias=False)
         self.weight.is_distributed = True

-    def forward(self, input_):
+    def forward(self, x):
         if self.is_mp:
-            # Build the mask.
-            input_mask = paddle.logical_or((input_ < self.vocab_start_index),
-                                           (input_ >= self.vocab_end_index))
-            # Mask the input.
-            masked_input = input_.clone() - self.vocab_start_index
-            masked_input[input_mask] = 0
+            output_parallel = paddle.distributed.collective._c_embedding(
+                x,
+                self.weight,
+                start_index=self.vocab_start_index,
+                name=self._name)
+            output = paddle.distributed.collective._mp_allreduce(
+                output_parallel,
+                group=self.model_parallel_group,
+                use_calc_stream=True,
+                use_model_parallel=True)
         else:
-            masked_input = input_
-
-        output_parallel = F.embedding(
-            masked_input,
-            weight=self.weight,
-            padding_idx=None,
-            sparse=False,
-            name=self._name)
-        # Mask the output embedding.
-        if self.is_mp:
-            output_parallel[input_mask, :] = 0.0
-
-        output = paddle.distributed.collective._mp_allreduce(
-            output_parallel,
-            group=self.model_parallel_group,
-            use_calc_stream=True,
-            use_model_parallel=True)
+            output = F.embedding(
+                x,
+                weight=self.weight,
+                padding_idx=None,
+                sparse=False,
+                name=self._name)
         return output


@@ -188,9 +153,7 @@ def forward(self, x):

         if self.gather_output and self.is_mp:
             output = paddle.distributed.collective._c_concat(
-                output_parallel,
-                nranks=self.world_size,
-                group=self.model_parallel_group)
+                output_parallel, group=self.model_parallel_group)
         else:
             output = output_parallel
         return output
@@ -258,10 +221,7 @@ def forward(self, x):
         else:
             # split last dim
             input_parallel = paddle.distributed.collective._c_split(
-                x,
-                rank=self.rank,
-                nranks=self.world_size,
-                group=self.model_parallel_group)
+                x, group=self.model_parallel_group)

         output_parallel = F.linear(input_parallel, self.weight, name=self._name)

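
For intuition, here is a small NumPy-only sketch (no Paddle; the helper name and sizes are illustrative) of why the new VocabParallelEmbedding.forward is equivalent to a full embedding lookup: each rank performs a c_embedding-style lookup over its own vocab shard, contributing zero rows for ids it does not own, and summing the per-rank partials plays the role of _mp_allreduce.

import numpy as np

def shard_lookup(shard, ids, start_index):
    """c_embedding-style lookup on one shard: ids outside the shard give zero rows."""
    n, d = shard.shape
    out = np.zeros(ids.shape + (d,), dtype=shard.dtype)
    local = ids - start_index
    ok = (local >= 0) & (local < n)
    out[ok] = shard[local[ok]]
    return out

vocab, dim, world_size = 8, 3, 2
full_table = np.random.default_rng(0).standard_normal((vocab, dim)).astype('float32')
ids = np.array([1, 5, 7])

# Each rank owns a contiguous block of vocab // world_size rows,
# starting at rank * per_part (the layer's vocab_start_index).
per_part = vocab // world_size
partials = [
    shard_lookup(full_table[r * per_part:(r + 1) * per_part], ids, r * per_part)
    for r in range(world_size)
]

# Summing the partials (the job of _mp_allreduce) recovers the full lookup.
assert np.allclose(sum(partials), full_table[ids])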
