@@ -38,6 +38,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/hccl_helper.h"
 #endif
 
+// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
+// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
+// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
+// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
+
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 namespace m = paddle::operators::math;
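The launch comments added above assume each process reads its rank and device from plain environment variables (RANK_ID, DEVICE_ID). As a minimal, hypothetical sketch of that mechanism (the real wiring presumably lives in the PrepareUniqueId/Prepare helpers, which this hunk does not show; GetEnvAsInt is an illustrative name, not a Paddle API):

#include <cstdlib>

// Hypothetical helper: read an integer environment variable such as
// RANK_ID or DEVICE_ID, falling back to a default when it is unset.
static int GetEnvAsInt(const char* name, int fallback) {
  const char* v = std::getenv(name);
  return v ? std::atoi(v) : fallback;
}

// usage: int rank_id = GetEnvAsInt("RANK_ID", 0);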
@@ -52,10 +57,11 @@ DECLARE_string(selected_npus);
 template <typename T>
 void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
   std::string debugstring = "";
+  std::cout << preStr << ":" << std::endl << debugstring;
   for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(", ");
+    std::cout << ele << " ";
   }
-  VLOG(3) << preStr << ":" << std::endl << debugstring;
+  std::cout << std::endl;
 }
 
 void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
@@ -120,6 +126,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
   ctx.Wait();
 }
 
+template <typename T>
 void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
                          int iter) {
   // init
@@ -130,10 +137,11 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
   int num1 = 3;
   int num2 = 128;
 
-  std::vector<float> init;
+  std::vector<T> init;
   for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(1.0 + rank_id);
+    init.push_back(static_cast<T>(1.0 + rank_id));
   }
+  init[0] = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
   PrintDebugInfo("input data", init);
 
   auto place = ctx.GetPlace();
@@ -145,31 +153,33 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
   auto out = scope->Var("OutData");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
   tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<float>(place);  // allocate
+  tensor_out->mutable_data<T>(place);  // allocate
   ctx.Wait();
 
   // run
   f::AttributeMap attrs;
   attrs["tag"] = std::string("tagx_" + std::to_string(iter));
   attrs["ring_id"] = 0;
+  attrs["use_calc_stream"] = 1;
 
   auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}},
                                     {{"Out", {"OutData"}}}, attrs);
-
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 1; i++) {
     op->Run(*scope, place);
   }
   ctx.Wait();
 
-  std::vector<float> out_vec;
+  std::vector<T> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
   ctx.Wait();
 
   PrintDebugInfo("output data", out_vec);
 
+  float diff = static_cast<float>(out_vec[0]) - 65504;
+  EXPECT_TRUE(diff < 0.1 && diff > -0.1);
   EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 3.0);
+  for (uint32_t i = 1; i < 10; i++) {
+    EXPECT_EQ(out_vec[i], static_cast<paddle::platform::float16>(3.0));
   }
 }
 
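The new assertions encode an NPU-specific expectation: every rank poisons init[0] with quiet_NaN, yet out_vec[0] is required to land within 0.1 of 65504, the largest finite float16 value, rather than come back as NaN. A host-side IEEE float sum behaves differently; a minimal contrasting sketch, assuming nothing beyond standard C++:

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  // Mirror the test's inputs for element 0: each rank sets init[0] to NaN.
  float rank0 = std::numeric_limits<float>::quiet_NaN();
  float rank1 = std::numeric_limits<float>::quiet_NaN();
  float reduced = rank0 + rank1;  // element-wise allreduce(sum) on the host
  assert(std::isnan(reduced));    // IEEE addition propagates the NaN
  return 0;
}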
@@ -182,8 +192,7 @@ TEST(c_allreduce_sum, NPU) {
   // only support one device, if more than one device, use first default
   PrepareUniqueId(&scope, ctx, &hccl_id);
   Prepare(&scope, ctx, &hccl_id);
-  for (int i = 0; i < 1; i++) {
-    VLOG(2) << "iter num: " << i;
-    TestHCCLAllReduceOp(&scope, ctx, i);
-  }
+
+  TestHCCLAllReduceOp<paddle::platform::float16>(&scope, ctx, 1);
+  // TestHCCLAllReduceOp<float>(&scope, ctx, 0);
 }