diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 63d84ece4aa988..6e76d10910bed2 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -444,8 +444,6 @@ list(REMOVE_ITEM TEST_OPS test_feed_data_check_shape_type) list(REMOVE_ITEM TEST_OPS test_fetch_lod_tensor_array) list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_data_norm_op) -list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) -list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer_auto_growth) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_imperative_resnet) @@ -453,12 +451,7 @@ list(REMOVE_ITEM TEST_OPS test_imperative_resnet_sorted_gradient) list(REMOVE_ITEM TEST_OPS test_imperative_mnist_sorted_gradient) list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext) list(REMOVE_ITEM TEST_OPS test_imperative_mnist) -list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer) list(REMOVE_ITEM TEST_OPS test_layers) -list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_base_cpu) -list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_with_reduce_cpu) -list(REMOVE_ITEM TEST_OPS - test_parallel_executor_seresnext_with_fuse_all_reduce_cpu) list(REMOVE_ITEM TEST_OPS test_imperative_ocr_attention_model) list(REMOVE_ITEM TEST_OPS test_async_ssa_graph_executor_mnist) list(REMOVE_ITEM TEST_OPS test_install_check) @@ -466,7 +459,6 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_api) list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op) list(REMOVE_ITEM TEST_OPS test_basic_lstm_api) list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op) -list(REMOVE_ITEM TEST_OPS test_fuse_all_reduce_pass) list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass) list(REMOVE_ITEM TEST_OPS test_fuse_bn_add_act_pass) list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op) @@ -772,25 +764,12 @@ if(WITH_DISTRIBUTE) endif() endif() -py_test_modules(test_parallel_executor_transformer MODULES - test_parallel_executor_transformer) if(WIN32) - py_test_modules( - test_parallel_executor_transformer_auto_growth MODULES - test_parallel_executor_transformer_auto_growth ENVS - FLAGS_allocator_strategy=auto_growth CUDA_VISIBLE_DEVICES=0) - py_test_modules(test_fuse_all_reduce_pass MODULES test_fuse_all_reduce_pass - ENVS CUDA_VISIBLE_DEVICES=0) py_test_modules(test_feed_data_check_shape_type MODULES test_feed_data_check_shape_type ENVS CUDA_VISIBLE_DEVICES=0) py_test_modules(test_fetch_lod_tensor_array MODULES test_fetch_lod_tensor_array ENVS CUDA_VISIBLE_DEVICES=0) else() - py_test_modules( - test_parallel_executor_transformer_auto_growth MODULES - test_parallel_executor_transformer_auto_growth ENVS - FLAGS_allocator_strategy=auto_growth) - py_test_modules(test_fuse_all_reduce_pass MODULES test_fuse_all_reduce_pass) py_test_modules(test_feed_data_check_shape_type MODULES test_feed_data_check_shape_type) py_test_modules(test_fetch_lod_tensor_array MODULES @@ -815,38 +794,10 @@ py_test_modules( FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000) -# NOTE: These unittests will appear NaN steadily in windows CI. After analysis, -# it is found that windows CI will run all the training unittests with the ON_INFER option turned on, -# which will not appear in other CIs. The calculation behavior of some ops in inference mode is -# inconsistent with that in non-inference mode. -if(WITH_PYTHON) - py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES - test_parallel_executor_seresnext_base_cpu) - py_test_modules(test_parallel_executor_seresnext_with_reduce_cpu MODULES - test_parallel_executor_seresnext_with_reduce_cpu) - py_test_modules( - test_parallel_executor_seresnext_with_fuse_all_reduce_cpu MODULES - test_parallel_executor_seresnext_with_fuse_all_reduce_cpu) - set_tests_properties(test_parallel_executor_seresnext_base_cpu - PROPERTIES TIMEOUT 900) - set_tests_properties(test_parallel_executor_seresnext_base_cpu - PROPERTIES LABELS "RUN_TYPE=NIGHTLY") - set_tests_properties(test_parallel_executor_seresnext_with_reduce_cpu - PROPERTIES TIMEOUT 750) - set_tests_properties(test_parallel_executor_seresnext_with_reduce_cpu - PROPERTIES LABELS "RUN_TYPE=NIGHTLY") - set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu - PROPERTIES TIMEOUT 750) - set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu - PROPERTIES LABELS "RUN_TYPE=NIGHTLY") -endif() - if(NOT WIN32) # TODO: fix these unittests failure on Windows py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1) - py_test_modules(test_ir_memory_optimize_transformer MODULES - test_ir_memory_optimize_transformer) endif() if(WITH_HETERPS) @@ -871,11 +822,7 @@ set_tests_properties( test_data_norm_op test_dataloader_keep_order test_dataloader_unkeep_order test_buffer_shared_memory_reuse_pass PROPERTIES LABELS "RUN_TYPE=DIST") set_tests_properties( - test_sync_batch_norm_op - test_parallel_executor_seresnext_base_gpu - test_parallel_executor_seresnext_with_reduce_gpu - test_parallel_executor_seresnext_with_fuse_all_reduce_gpu - test_distributed_fused_lamb_op_with_clip + test_sync_batch_norm_op test_distributed_fused_lamb_op_with_clip test_distributed_fused_lamb_op_without_clip test_distributed_fused_lamb_op_with_gradient_merge PROPERTIES LABELS "RUN_TYPE=DIST") @@ -907,13 +854,6 @@ if(NOT WIN32) set_tests_properties(test_multiprocess_reader_exception PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_layers PROPERTIES TIMEOUT 120) - if(WITH_NV_JETSON) - set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT - 1200) - else() - set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT - 120) - endif() endif() if(WITH_DISTRIBUTE) @@ -950,7 +890,6 @@ set_tests_properties(test_cross_entropy_loss PROPERTIES TIMEOUT 180) set_tests_properties(test_gru_unit_op PROPERTIES TIMEOUT 120) set_tests_properties(test_activation_nn_grad PROPERTIES TIMEOUT 250) set_tests_properties(test_empty_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_parallel_executor_transformer PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_div_op PROPERTIES TIMEOUT 120) set_tests_properties(test_regularizer_api PROPERTIES TIMEOUT 150) set_tests_properties(test_multiclass_nms_op PROPERTIES TIMEOUT 120) @@ -963,7 +902,6 @@ set_tests_properties(test_add_reader_dependency PROPERTIES TIMEOUT 120) set_tests_properties(test_bilateral_slice_op PROPERTIES TIMEOUT 120) set_tests_properties(test_buffer_shared_memory_reuse_pass PROPERTIES TIMEOUT 120) -set_tests_properties(test_fuse_relu_depthwise_conv_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_fleet_util PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_transformer_sorted_gradient PROPERTIES TIMEOUT 120) @@ -1022,16 +960,12 @@ set_tests_properties(test_index_add_op PROPERTIES TIMEOUT 120) set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200) set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT 120) -set_tests_properties(test_parallel_executor_seresnext_with_reduce_gpu - PROPERTIES TIMEOUT 120) set_tests_properties(test_dropout_op PROPERTIES TIMEOUT 120) set_tests_properties(test_argsort_op PROPERTIES TIMEOUT 120) set_tests_properties(test_gather_nd_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nn_grad PROPERTIES TIMEOUT 180) set_tests_properties(test_elementwise_sub_op PROPERTIES TIMEOUT 120) set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_gpu - PROPERTIES TIMEOUT 120) set_tests_properties(test_distributed_fused_lamb_op_with_clip PROPERTIES TIMEOUT 240) set_tests_properties(test_distributed_fused_lamb_op_without_clip @@ -1041,8 +975,6 @@ set_tests_properties(test_distributed_fused_lamb_op_with_gradient_merge set_tests_properties(test_elementwise_min_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300) -set_tests_properties(test_parallel_executor_transformer_auto_growth - PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_add_op PROPERTIES TIMEOUT 200) if(NOT WITH_COVERAGE) set_tests_properties(test_weight_decay PROPERTIES TIMEOUT 120) @@ -1070,13 +1002,10 @@ set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 150) set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120) set_tests_properties(test_cond PROPERTIES TIMEOUT 240) set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250) -set_tests_properties(test_parallel_executor_seresnext_base_gpu - PROPERTIES TIMEOUT 120) set_tests_properties(test_norm_nn_grad PROPERTIES TIMEOUT 180) set_tests_properties(test_matrix_nms_op PROPERTIES TIMEOUT 120) set_tests_properties(test_generator_dataloader PROPERTIES TIMEOUT 120) set_tests_properties(test_partial_concat_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_fuse_optimizer_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_softmax_with_cross_entropy_op PROPERTIES TIMEOUT 220) set_tests_properties(test_reduce_op PROPERTIES TIMEOUT 500) set_tests_properties(test_adam_optimizer_fp32_fp64 PROPERTIES TIMEOUT 120) @@ -1102,7 +1031,6 @@ set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES TIMEOUT 120) set_tests_properties(test_conv2d_api PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_mul_op PROPERTIES TIMEOUT 120) set_tests_properties(test_cyclic_cifar_dataset PROPERTIES TIMEOUT 120) -set_tests_properties(test_fuse_all_reduce_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_multi_forward PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_ocr_attention_model PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_mnist PROPERTIES TIMEOUT 120) @@ -1284,15 +1212,6 @@ foreach(TEST_CINN_OP ${TEST_CINN_OPS}) endforeach() if(WITH_CINN AND WITH_TESTING) - set_tests_properties( - test_parallel_executor_run_cinn - PROPERTIES - LABELS - "RUN_TYPE=CINN" - ENVIRONMENT - FLAGS_allow_cinn_ops="conv2d;conv2d_grad;elementwise_add;elementwise_add_grad;relu;relu_grad;sum" - ) - set_tests_properties(test_tile_op PROPERTIES TIMEOUT 300) endif() diff --git a/test/legacy_test/parallel_executor_test_base.py b/test/legacy_test/parallel_executor_test_base.py deleted file mode 100644 index a74d72d77f1f53..00000000000000 --- a/test/legacy_test/parallel_executor_test_base.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import multiprocessing -import os -import sys -import time -import unittest - -import numpy as np -from feed_data_reader import FeedDataReader - -import paddle -from paddle import base -from paddle.base import compiler, core - -__all__ = ['TestParallelExecutorBase'] -DeviceType = core.DeviceType - - -class TestParallelExecutorBase(unittest.TestCase): - @classmethod - def check_network_convergence( - cls, - method, - use_device=DeviceType.CUDA, - iter=5, - batch_size=None, - feed_dict=None, - feed_data_reader=None, - get_data_from_feeder=None, - use_parallel_executor=True, - use_reduce=False, - use_ir_memory_optimize=False, - enable_inplace=True, - fuse_elewise_add_act_ops=False, - fuse_all_optimizer_ops=False, - fuse_all_reduce_ops=False, - fuse_relu_depthwise_conv=False, - optimizer=paddle.optimizer.Adam, - use_fast_executor=False, - enable_sequential_execution=False, - ): - def run_executor(exe, binary, feed, fetch_list): - if feed_data_reader is None: - res = exe.run(binary, feed=feed, fetch_list=fetch_list) - else: - res = exe.run( - binary, - feed=feed_data_reader.get_next(exe, binary), - fetch_list=fetch_list, - ) - return res - - if feed_data_reader is not None: - assert isinstance( - feed_data_reader, FeedDataReader - ), "feed_data_reader must be type of FeedDataReader" - - paddle.seed(0) - paddle.framework.random._manual_program_seed(0) - main = base.Program() - startup = base.Program() - - with base.program_guard(main, startup): - feed_dict, loss = cls.build_model( - feed_dict, get_data_from_feeder, main, method, optimizer - ) - - place = ( - base.CUDAPlace(0) - if use_device == DeviceType.CUDA - else base.XPUPlace(0) - if use_device == DeviceType.XPU - else base.CPUPlace() - ) - exe = base.Executor(place) - exe.run(startup) - - build_strategy = cls.set_strategy( - enable_inplace, - enable_sequential_execution, - fuse_all_optimizer_ops, - fuse_all_reduce_ops, - fuse_elewise_add_act_ops, - fuse_relu_depthwise_conv, - use_fast_executor, - use_ir_memory_optimize, - use_reduce, - use_device, - ) - - if use_parallel_executor: - binary = compiler.CompiledProgram( - main, - build_strategy=build_strategy, - ) - else: - binary = main - - if batch_size is not None: - batch_size *= ( - base.core.get_cuda_device_count() - if use_device == DeviceType.CUDA - else base.core.get_xpu_device_count() - if use_device == DeviceType.XPU - else int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - ) - - area_below_loss = 0 - begin = time.time() - (first_loss,) = run_executor( - exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name] - ) - area_below_loss += 0.5 * first_loss.mean() - for _ in range(iter): - mid_loss = run_executor( - exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name] - ) - area_below_loss += mid_loss[0].mean() - (last_loss,) = run_executor( - exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name] - ) - area_below_loss += 0.5 * last_loss.mean() - end = time.time() - - if batch_size is not None: - print( - "%.4f Instance per second" - % ((batch_size * iter + 2) / (end - begin)) - ) - - avg_last_loss_val = np.array(last_loss).mean() - avg_first_loss_val = np.array(first_loss).mean() - if math.isnan(float(avg_last_loss_val)) or math.isnan( - float(avg_first_loss_val) - ): - sys.exit("got NaN loss, training failed.") - - print(first_loss, last_loss, area_below_loss) - # self.assertGreater(first_loss[0], last_loss[0]) - return first_loss, last_loss, area_below_loss - - @classmethod - def check_pass_conflict( - cls, - method, - use_device=DeviceType.CUDA, - feed_dict=None, - get_data_from_feeder=None, - use_reduce=False, - use_ir_memory_optimize=True, - enable_inplace=True, - fuse_elewise_add_act_ops=False, - fuse_all_optimizer_ops=False, - fuse_all_reduce_ops=False, - fuse_relu_depthwise_conv=False, - optimizer=paddle.optimizer.Adam, - use_fast_executor=True, - enable_sequential_execution=False, - ): - main = base.Program() - startup = base.Program() - with base.program_guard(main, startup): - feed_dict, loss = cls.build_model( - feed_dict, get_data_from_feeder, main, method, optimizer - ) - - place = ( - base.CUDAPlace(0) - if use_device == DeviceType.CUDA - else base.XPUPlace(0) - if use_device == DeviceType.XPU - else base.CPUPlace() - ) - exe = base.Executor(place) - exe.run(startup) - - build_strategy = cls.set_strategy( - enable_inplace, - enable_sequential_execution, - fuse_all_optimizer_ops, - fuse_all_reduce_ops, - fuse_elewise_add_act_ops, - fuse_relu_depthwise_conv, - use_fast_executor, - use_ir_memory_optimize, - use_reduce, - use_device, - ) - - binary = compiler.CompiledProgram( - main, - build_strategy=build_strategy, - ) - - exe.run(binary, feed=feed_dict, fetch_list=[loss.name]) - - @classmethod - def set_strategy( - cls, - enable_inplace, - enable_sequential_execution, - fuse_all_optimizer_ops, - fuse_all_reduce_ops, - fuse_elewise_add_act_ops, - fuse_relu_depthwise_conv, - use_fast_executor, - use_ir_memory_optimize, - use_reduce, - use_device, - ): - build_strategy = base.BuildStrategy() - build_strategy.reduce_strategy = ( - base.BuildStrategy.ReduceStrategy.Reduce - if use_reduce - else base.BuildStrategy.ReduceStrategy.AllReduce - ) - build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops - build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv - build_strategy.fuse_all_optimizer_ops = fuse_all_optimizer_ops - build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops - build_strategy.memory_optimize = use_ir_memory_optimize - build_strategy.enable_inplace = enable_inplace - build_strategy.enable_sequential_execution = enable_sequential_execution - - if use_device == DeviceType.CUDA and core.is_compiled_with_cuda(): - build_strategy.remove_unnecessary_lock = True - if use_device == DeviceType.XPU and core.is_compiled_with_xpu(): - build_strategy.fuse_elewise_add_act_ops = False - build_strategy.fuse_relu_depthwise_conv = False - build_strategy.fuse_all_optimizer_ops = False - build_strategy.memory_optimize = False - build_strategy.enable_inplace = False - build_strategy.enable_sequential_execution = False - - return build_strategy - - @classmethod - def build_model( - cls, feed_dict, get_data_from_feeder, main, method, optimizer - ): - loss = method(use_feed=feed_dict is not None) - # NOTE(zjl): memory_optimize/inplace pass would not require - # that loss.persistable = True. - # We set loss.persistable = False here to verify our memory - # optimization strategies intentionally. - loss.persistable = False - if optimizer: - optimizer().minimize(loss) - - if get_data_from_feeder is not None: - assert feed_dict is None - feed_dict = get_data_from_feeder() - return feed_dict, loss diff --git a/test/legacy_test/seresnext_net.py b/test/legacy_test/seresnext_net.py index 357b5b7e226b19..ef19deebba3789 100644 --- a/test/legacy_test/seresnext_net.py +++ b/test/legacy_test/seresnext_net.py @@ -18,11 +18,12 @@ import os -from seresnext_test_base import DeviceType from simple_nets import init_data import paddle +DeviceType = base.core.DeviceType + os.environ['CPU_NUM'] = str(4) os.environ['FLAGS_cudnn_deterministic'] = str(1) diff --git a/test/legacy_test/seresnext_test_base.py b/test/legacy_test/seresnext_test_base.py deleted file mode 100644 index 73ad9c27c0196f..00000000000000 --- a/test/legacy_test/seresnext_test_base.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import seresnext_net -from parallel_executor_test_base import DeviceType, TestParallelExecutorBase - -from paddle.base import core - - -class TestResnetBase(TestParallelExecutorBase): - def _compare_result_with_origin_model( - self, check_func, use_device, delta2=1e-5, compare_separately=True - ): - if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): - return - - ( - func_1_first_loss, - func_1_last_loss, - func_1_loss_area, - ) = self.check_network_convergence( - seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_device), - iter=seresnext_net.iter(use_device), - batch_size=seresnext_net.batch_size(use_device), - use_device=use_device, - use_reduce=False, - optimizer=seresnext_net.optimizer, - ) - - func_2_first_loss, func_2_last_loss, func_2_loss_area = check_func( - seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_device), - iter=seresnext_net.iter(use_device), - batch_size=seresnext_net.batch_size(use_device), - use_device=use_device, - ) - - if compare_separately: - self.assertAlmostEqual( - func_1_first_loss, func_2_first_loss, delta=1e-5 - ) - self.assertAlmostEqual( - func_1_last_loss, func_2_last_loss, delta=delta2 - ) - else: - np.testing.assert_allclose( - func_1_loss_area, func_2_loss_area, rtol=delta2 - ) - self.assertAlmostEqual( - func_1_first_loss, func_2_first_loss, delta=1e-5 - ) - self.assertAlmostEqual( - func_1_last_loss, func_2_last_loss, delta=delta2 - ) diff --git a/test/legacy_test/test_fuse_all_reduce_pass.py b/test/legacy_test/test_fuse_all_reduce_pass.py deleted file mode 100644 index 0745844bda323e..00000000000000 --- a/test/legacy_test/test_fuse_all_reduce_pass.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import unittest -from functools import partial - -from fake_reader import fake_imdb_reader -from parallel_executor_test_base import DeviceType, TestParallelExecutorBase -from simple_nets import bow_net, fc_with_batchnorm, init_data, simple_fc_net - -import paddle -from paddle import base -from paddle.base import core - -paddle.enable_static() - - -class TestFuseAllReduceOpsBase(TestParallelExecutorBase): - @classmethod - def setUpClass(cls): - os.environ['CPU_NUM'] = str(4) - - def compare_fuse_all_reduce_ops( - self, - model, - use_device, - init_feed_dict=None, - get_data_from_feeder=None, - optimizer=None, - fuse_all_optimizer_ops=False, - ): - if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): - return - if use_device == DeviceType.XPU and not core.is_compiled_with_xpu(): - return - - feed_dict_data = None - if init_feed_dict is not None: - img, label = init_feed_dict() - feed_dict_data = {"image": img, "label": label} - - ( - not_fuse_op_first_loss, - not_fuse_op_last_loss, - _, - ) = self.check_network_convergence( - model, - feed_dict=feed_dict_data, - get_data_from_feeder=get_data_from_feeder, - use_device=use_device, - fuse_all_reduce_ops=False, - fuse_all_optimizer_ops=fuse_all_optimizer_ops, - optimizer=optimizer, - ) - ( - fuse_op_first_loss, - fuse_op_last_loss, - _, - ) = self.check_network_convergence( - model, - feed_dict=feed_dict_data, - get_data_from_feeder=get_data_from_feeder, - use_device=use_device, - fuse_all_reduce_ops=True, - fuse_all_optimizer_ops=fuse_all_optimizer_ops, - optimizer=optimizer, - ) - - self.assertAlmostEqual( - not_fuse_op_first_loss, fuse_op_first_loss, delta=1e-6 - ) - self.assertAlmostEqual( - not_fuse_op_last_loss, fuse_op_last_loss, delta=1e-6 - ) - - def optimizer(self, learning_rate=1e-3): - optimizer = paddle.optimizer.SGD( - learning_rate=learning_rate, - weight_decay=paddle.regularizer.L2Decay(1e-3), - ) - return optimizer - - -class TestFuseAllReduceOps(TestFuseAllReduceOpsBase): - def _decorate_compare_fused_all_reduce(self, model, use_device): - self.compare_fuse_all_reduce_ops( - model, - use_device, - init_feed_dict=init_data, - optimizer=self.optimizer, - fuse_all_optimizer_ops=True, - ) - - def test_simple_fc_with_fuse_all_reduce(self): - self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA) - self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.XPU) - self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU) - - def test_batchnorm_fc_with_fuse_all_reduce(self): - self._decorate_compare_fused_all_reduce( - fc_with_batchnorm, DeviceType.CUDA - ) - # TODO(wangxi): xpu batch_norm op only support dim = 4 - # self._decorate_compare_fused_all_reduce(fc_with_batchnorm, - # DeviceType.XPU) - self._decorate_compare_fused_all_reduce( - fc_with_batchnorm, DeviceType.CPU - ) - - -class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps): - def _decorate_compare_fused_all_reduce(self, model, use_device): - self.compare_fuse_all_reduce_ops( - model, - use_device, - init_feed_dict=init_data, - optimizer=self.optimizer, - fuse_all_optimizer_ops=True, - ) - - -class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase): - @classmethod - def setUpClass(cls): - os.environ['CPU_NUM'] = str(4) - cls.word_dict_len = 5147 - batch_size = 64 - reader = fake_imdb_reader(cls.word_dict_len, batch_size * 100) - reader = paddle.batch(reader, batch_size=batch_size)() - cls.train_data = next(reader) - - def get_data_from_feeder(self): - place = base.CPUPlace() - feeder = base.DataFeeder(feed_list=["words", "label"], place=place) - return feeder.feed(self.train_data) - - def _decorate_compare_fused_all_reduce(self, model, use_device): - self.compare_fuse_all_reduce_ops( - model, - use_device, - get_data_from_feeder=self.get_data_from_feeder, - optimizer=self.optimizer, - ) - - def test_simple_bow_net_with_fuse_all_reduce(self): - model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True) - self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA) - # TODO(wangxi): xpu sum op only support LodTensor for now - # self._decorate_compare_fused_all_reduce(model, DeviceType.XPU) - self._decorate_compare_fused_all_reduce(model, DeviceType.CPU) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_fuse_elewise_add_act_pass.py b/test/legacy_test/test_fuse_elewise_add_act_pass.py index b9237a14bd1085..2f61178920a107 100644 --- a/test/legacy_test/test_fuse_elewise_add_act_pass.py +++ b/test/legacy_test/test_fuse_elewise_add_act_pass.py @@ -12,86 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import unittest import numpy -from parallel_executor_test_base import DeviceType, TestParallelExecutorBase -from simple_nets import fc_with_batchnorm, init_data, simple_fc_net import paddle import paddle.nn.functional as F from paddle import base -from paddle.base import core - - -class TestMNIST(TestParallelExecutorBase): - @classmethod - def setUpClass(cls): - os.environ['CPU_NUM'] = str(4) - - def _compare_fuse_elewise_add_act_ops(self, model, use_device): - if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): - return - img, label = init_data() - - def _optimizer(learning_rate=1e-6): - optimizer = paddle.optimizer.SGD( - learning_rate=learning_rate, - weight_decay=paddle.regularizer.L2Decay(1e-6), - ) - return optimizer - - # NOTE(dzh): - # need to make it compatible with elewise fuse act - # FIXME (liuwei12) - # the new memory optimize strategy will crash this unittest - # add enable_inplace=False here to force pass the unittest - ( - not_fuse_op_first_loss, - not_fuse_op_last_loss, - _, - ) = self.check_network_convergence( - model, - feed_dict={"image": img, "label": label}, - use_device=use_device, - fuse_elewise_add_act_ops=False, - use_ir_memory_optimize=False, - enable_inplace=False, - optimizer=_optimizer, - ) - ( - fuse_op_first_loss, - fuse_op_last_loss, - _, - ) = self.check_network_convergence( - model, - feed_dict={"image": img, "label": label}, - use_device=use_device, - fuse_elewise_add_act_ops=True, - use_ir_memory_optimize=False, - enable_inplace=False, - optimizer=_optimizer, - ) - - self.assertAlmostEqual( - not_fuse_op_first_loss, fuse_op_first_loss, delta=1e-6 - ) - self.assertAlmostEqual( - not_fuse_op_last_loss, fuse_op_last_loss, delta=1e-6 - ) - - def test_simple_fc_with_fuse_op(self): - self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CUDA) - self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CPU) - - def test_batchnorm_fc_with_fuse_op(self): - self._compare_fuse_elewise_add_act_ops( - fc_with_batchnorm, DeviceType.CUDA - ) - self._compare_fuse_elewise_add_act_ops( - fc_with_batchnorm, DeviceType.CPU - ) class TestFuseActElewiseAddInplaceGradPass(unittest.TestCase): diff --git a/test/legacy_test/test_fuse_optimizer_pass.py b/test/legacy_test/test_fuse_optimizer_pass.py deleted file mode 100644 index 3fa7f3d999a615..00000000000000 --- a/test/legacy_test/test_fuse_optimizer_pass.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import unittest -from functools import partial - -from fake_reader import fake_imdb_reader -from parallel_executor_test_base import DeviceType, TestParallelExecutorBase -from simple_nets import bow_net, fc_with_batchnorm, init_data - -import paddle -from paddle import base -from paddle.base import core - - -class TestFuseOptimizationOps(TestParallelExecutorBase): - @classmethod - def setUpClass(cls): - os.environ['CPU_NUM'] = str(4) - - def _get_feed_dict(self): - img, label = init_data() - return {"image": img, "label": label} - - def _compare_fused_optimizer_ops( - self, - model, - use_device, - feed_dict=None, - get_data_from_feeder=None, - optimizer=paddle.optimizer.Adam, - ): - if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): - return - - ( - not_fuse_op_first_loss, - not_fuse_op_last_loss, - _, - ) = self.check_network_convergence( - model, - feed_dict=feed_dict, - get_data_from_feeder=get_data_from_feeder, - use_device=use_device, - fuse_all_optimizer_ops=False, - optimizer=optimizer, - ) - ( - fuse_op_first_loss, - fuse_op_last_loss, - _, - ) = self.check_network_convergence( - model, - feed_dict=feed_dict, - get_data_from_feeder=get_data_from_feeder, - use_device=use_device, - fuse_all_optimizer_ops=True, - optimizer=optimizer, - ) - - self.assertAlmostEqual( - not_fuse_op_first_loss, fuse_op_first_loss, delta=1e-6 - ) - self.assertAlmostEqual( - not_fuse_op_last_loss, fuse_op_last_loss, delta=1e-6 - ) - - def _decorate_compare_fused_optimizer_ops( - self, model, use_device, optimizer - ): - self._compare_fused_optimizer_ops( - model, - use_device, - feed_dict=self._get_feed_dict(), - optimizer=optimizer, - ) - - -class TestFuseAdamOps(TestFuseOptimizationOps): - def optimizer(self, learning_rate=1e-4): - return paddle.optimizer.Adam(learning_rate=learning_rate) - - def test_batchnorm_fc_with_fuse_op(self): - self._decorate_compare_fused_optimizer_ops( - fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer - ) - self._decorate_compare_fused_optimizer_ops( - fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer - ) - - -class TestFuseSGDOps(TestFuseAdamOps): - def optimizer(self, learning_rate=1e-3): - return paddle.optimizer.SGD(learning_rate=learning_rate) - - -class TestFuseMomentumOps(TestFuseAdamOps): - def optimizer(self, learning_rate=1e-3): - return paddle.optimizer.Momentum( - learning_rate=learning_rate, momentum=0.1 - ) - - -class TestSpareFuseAdamOps(TestFuseOptimizationOps): - @classmethod - def setUpClass(cls): - os.environ['CPU_NUM'] = str(4) - cls.word_dict_len = 5147 - batch_size = 64 - reader = fake_imdb_reader(cls.word_dict_len, batch_size * 100) - reader = paddle.batch(reader, batch_size=batch_size)() - cls.train_data = next(reader) - - def _get_data_from_feeder(self): - place = base.CPUPlace() - feeder = base.DataFeeder(feed_list=["words", "label"], place=place) - return feeder.feed(self.train_data) - - def _decorate_compare_fused_optimizer_ops( - self, model, use_device, optimizer - ): - self._compare_fused_optimizer_ops( - model, - use_device, - get_data_from_feeder=self._get_data_from_feeder, - optimizer=optimizer, - ) - - def optimizer(self, learning_rate=1e-4): - return paddle.optimizer.Adam(learning_rate=learning_rate) - - def test_simple_bow_net_with_fuse_op(self): - model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True) - self._decorate_compare_fused_optimizer_ops( - model, DeviceType.CUDA, optimizer=self.optimizer - ) - self._decorate_compare_fused_optimizer_ops( - model, DeviceType.CPU, optimizer=self.optimizer - ) - - -class TestSpareFuseSGDOps(TestSpareFuseAdamOps): - def optimizer(self, learning_rate=1e-3): - return paddle.optimizer.SGD(learning_rate=learning_rate) - - -class TestSpareFuseMomentumOps(TestSpareFuseAdamOps): - def optimizer(self, learning_rate=1e-3): - return paddle.optimizer.Momentum( - learning_rate=learning_rate, momentum=0.1 - ) - - -class TestPassConflictBase(TestFuseAdamOps): - def _compare_fused_optimizer_ops( - self, - model, - use_device, - feed_dict=None, - get_data_from_feeder=None, - optimizer=paddle.optimizer.Adam, - ): - if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): - return - - self.check_pass_conflict( - model, - feed_dict=feed_dict, - get_data_from_feeder=get_data_from_feeder, - use_device=use_device, - fuse_all_optimizer_ops=True, - optimizer=optimizer, - enable_sequential_execution=True, - ) - - -class TestFuseAdamOpsPassConflict(TestPassConflictBase): - def optimizer(self, learning_rate=1e-4): - return paddle.optimizer.Adam(learning_rate=learning_rate) - - def test_batchnorm_fc_with_fuse_op(self): - self._decorate_compare_fused_optimizer_ops( - fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer - ) - self._decorate_compare_fused_optimizer_ops( - fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer - ) - - -class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict): - def optimizer(self, learning_rate=1e-3): - return paddle.optimizer.SGD(learning_rate=learning_rate) - - -class TestFuseMomentumOpsPassConflict(TestFuseAdamOpsPassConflict): - def optimizer(self, learning_rate=1e-3): - return paddle.optimizer.Momentum( - learning_rate=learning_rate, momentum=0.1 - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_fuse_relu_depthwise_conv_pass.py b/test/legacy_test/test_fuse_relu_depthwise_conv_pass.py deleted file mode 100644 index 50392ac9744607..00000000000000 --- a/test/legacy_test/test_fuse_relu_depthwise_conv_pass.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from parallel_executor_test_base import DeviceType, TestParallelExecutorBase - -import paddle -import paddle.nn.functional as F -from paddle.base import core - - -def norm(*args, **kargs): - return paddle.static.nn.batch_norm(*args, **kargs) - - -def sep_conv(input, channel, stride, filter, dilation=1, act=None): - # with scope('depthwise'): - input = paddle.static.nn.conv2d( - input, - input.shape[1], - filter, - stride, - groups=input.shape[1], - padding=(filter // 2) * dilation, - dilation=dilation, - use_cudnn=False, - bias_attr=False, - ) - input = norm(input) - if act: - input = act(input) - # with scope('pointwise'): - input = paddle.static.nn.conv2d( - input, channel, 1, 1, groups=1, padding=0, bias_attr=False - ) - input = norm(input) - if act: - input = act(input) - return input - - -def simple_depthwise_net(use_feed): - assert use_feed - img = paddle.static.data(name='image', shape=[-1, 784], dtype='float32') - label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64') - hidden = paddle.reshape(img, (-1, 1, 28, 28)) - for _ in range(4): - hidden = sep_conv(hidden, channel=200, stride=2, filter=5) - hidden = F.relu(hidden) - prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax') - loss = paddle.nn.functional.cross_entropy( - input=prediction, label=label, reduction='none', use_softmax=False - ) - loss = paddle.mean(loss) - return loss - - -class TestMNIST(TestParallelExecutorBase): - def _init_data(self, random=True): - np.random.seed(5) - if random: - img = np.random.random(size=[32, 784]).astype(np.float32) - else: - img = np.ones(shape=[32, 784], dtype='float32') - label = np.ones(shape=[32, 1], dtype='int64') - return img, label - - def _compare(self, model, use_device, random_data=True, only_forward=False): - if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): - return - img, label = self._init_data(random_data) - - def _optimizer(learning_rate=1e-6): - optimizer = paddle.optimizer.SGD( - learning_rate=learning_rate, - weight_decay=paddle.regularizer.L2Decay(1e-6), - ) - return optimizer - - if only_forward: - _optimizer = None - - ( - fuse_op_first_loss, - fuse_op_last_loss, - _, - ) = self.check_network_convergence( - model, - feed_dict={"image": img, "label": label}, - use_device=use_device, - fuse_relu_depthwise_conv=True, - use_ir_memory_optimize=True, - optimizer=_optimizer, - ) - ( - not_fuse_op_first_loss, - not_fuse_op_last_loss, - _, - ) = self.check_network_convergence( - model, - feed_dict={"image": img, "label": label}, - use_device=use_device, - fuse_relu_depthwise_conv=False, - optimizer=_optimizer, - ) - - self.assertAlmostEqual( - not_fuse_op_first_loss, fuse_op_first_loss, delta=1e-6 - ) - self.assertAlmostEqual( - not_fuse_op_last_loss, fuse_op_last_loss, delta=1e-6 - ) - - def test_simple_depthwise_with_fuse_op(self): - self._compare(simple_depthwise_net, DeviceType.CUDA) - self._compare(simple_depthwise_net, DeviceType.CPU) - - def test_simple_depthwise_with_fuse_op_only_forward(self): - self._compare(simple_depthwise_net, DeviceType.CUDA, only_forward=True) - self._compare(simple_depthwise_net, DeviceType.CPU, only_forward=True) - - -if __name__ == '__main__': - paddle.enable_static() - unittest.main() diff --git a/test/legacy_test/test_ir_inplace_pass.py b/test/legacy_test/test_ir_inplace_pass.py deleted file mode 100644 index c5a5be1168f870..00000000000000 --- a/test/legacy_test/test_ir_inplace_pass.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import unittest - -import numpy as np -from parallel_executor_test_base import DeviceType, TestParallelExecutorBase - -import paddle -from paddle import base -from paddle.base import core - - -def fc_with_batchnorm(use_feed): - img = paddle.static.data(name='image', shape=[-1, 784], dtype='float32') - label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64') - - hidden = img - for _ in range(3): - hidden = paddle.static.nn.fc( - hidden, - size=200, - activation='tanh', - bias_attr=base.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=1.0) - ), - ) - - hidden = paddle.static.nn.batch_norm(input=hidden) - prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax') - loss = paddle.nn.functional.cross_entropy( - input=prediction, label=label, reduction='none', use_softmax=False - ) - loss = paddle.mean(loss) - return loss - - -class TestIrInplace(TestParallelExecutorBase): - @classmethod - def setUpClass(cls): - os.environ['CPU_NUM'] = str(4) - - def _fc_with_batchnorm(self, ir_memory_optimize, enable_inplace): - if not core.is_compiled_with_cuda(): - return - np.random.seed(5) - img = np.random.random(size=[32, 784]).astype(np.float32) - label = np.ones(shape=[32, 1], dtype='int64') - self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, "label": label}, - use_device=DeviceType.CUDA, - use_ir_memory_optimize=ir_memory_optimize, - enable_inplace=enable_inplace, - ) - - def test_fc_with_batchnorm(self, delta=1e-3): - loss00 = self._fc_with_batchnorm(False, False) - loss10 = self._fc_with_batchnorm(True, False) - loss01 = self._fc_with_batchnorm(False, True) - loss11 = self._fc_with_batchnorm(True, True) - self.assertAlmostEqual(loss00, loss10, delta=delta) - self.assertAlmostEqual(loss00, loss01, delta=delta) - self.assertAlmostEqual(loss00, loss11, delta=delta) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_ir_memory_optimize_pass.py b/test/legacy_test/test_ir_memory_optimize_pass.py deleted file mode 100644 index 6112d0aedd7ad5..00000000000000 --- a/test/legacy_test/test_ir_memory_optimize_pass.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from parallel_executor_test_base import DeviceType, TestParallelExecutorBase - -import paddle -from paddle.base import core - - -def _feed_data_helper(): - img = paddle.static.data(name='image', shape=[-1, 784], dtype='float32') - label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64') - return img, label - - -def simple_fc_net(use_feed): - assert use_feed - x, y = _feed_data_helper() - hidden_layer = 4 - for _ in range(hidden_layer): - x = paddle.static.nn.fc(x, size=20, activation='relu') - y_predict = paddle.static.nn.fc(x, size=10, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=y_predict, label=y, reduction='none', use_softmax=False - ) - avg_cost = paddle.mean(cost) - return avg_cost - - -def fc_with_inplace_net(use_feed): - assert use_feed - x, y = _feed_data_helper() - fc = paddle.static.nn.fc(x=x, size=20, activation='relu') - fc = paddle.static.nn.fc(x=fc, size=10, activation='relu') - reshape = paddle.reshape(x=fc, shape=[-1, 2, 5]) - reshape = paddle.reshape(x=reshape, shape=[-1, 5, 2]) - y_predict = paddle.static.nn.fc(x=reshape, size=10, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=y_predict, label=y, reduction='none', use_softmax=False - ) - avg_cost = paddle.mean(cost) - return avg_cost - - -class TestMNIST(TestParallelExecutorBase): - def _dummy_data(self): - np.random.seed(5) - img = np.random.random(size=[32, 784]).astype(np.float32) - label = np.ones(shape=[32, 1], dtype='int64') - return img, label - - def _compare_ir_memory_optimize(self, model, use_device): - if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): - return - - img, label = self._dummy_data() - first_loss0, last_loss0, _ = self.check_network_convergence( - model, - feed_dict={"image": img, "label": label}, - use_device=use_device, - use_ir_memory_optimize=False, - ) - first_loss1, last_loss1, _ = self.check_network_convergence( - model, - feed_dict={"image": img, "label": label}, - use_device=use_device, - use_ir_memory_optimize=True, - ) - - self.assertAlmostEqual(first_loss0, first_loss1, delta=1e-6) - self.assertAlmostEqual(last_loss0, last_loss1, delta=1e-6) - - def test_simple_fc_net(self): - self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CPU) - self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CUDA) - - def test_fc_with_reshape_net(self): - self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CPU) - self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CUDA) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_ir_memory_optimize_transformer.py b/test/legacy_test/test_ir_memory_optimize_transformer.py deleted file mode 100644 index b3dc82c12e6369..00000000000000 --- a/test/legacy_test/test_ir_memory_optimize_transformer.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import unittest - -from paddle.base import core - -os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" - -from parallel_executor_test_base import DeviceType, TestParallelExecutorBase -from test_parallel_executor_transformer import get_feed_data_reader, transformer - - -# NOTE(dzhwinter): test diferent strategy colisions. -# open the eager delete tensor strategy by default. -class TestTransformerWithIR(TestParallelExecutorBase): - def test_main(self): - if core.is_compiled_with_cuda(): - # check python transpiler - self.check_network_convergence( - transformer, - use_device=DeviceType.CUDA, - feed_data_reader=get_feed_data_reader(), - use_ir_memory_optimize=False, - iter=2, - ) - # check IR memory optimize - self.check_network_convergence( - transformer, - use_device=DeviceType.CUDA, - feed_data_reader=get_feed_data_reader(), - use_ir_memory_optimize=True, - iter=2, - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_mix_precision_all_reduce_fuse.py b/test/legacy_test/test_mix_precision_all_reduce_fuse.py deleted file mode 100644 index 6887b2d0de6318..00000000000000 --- a/test/legacy_test/test_mix_precision_all_reduce_fuse.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import nets -import numpy as np -from parallel_executor_test_base import DeviceType, TestParallelExecutorBase -from simple_nets import init_data - -import paddle -from paddle.base import core - -batch_size = 12 -img_shape = [1, 28, 28] - - -def loss_net(hidden, label): - prediction = paddle.static.nn.fc(x=hidden, size=10, activation='softmax') - loss = paddle.nn.functional.cross_entropy( - input=prediction, label=label, reduction='none', use_softmax=False - ) - avg_loss = paddle.mean(loss) - return avg_loss - - -def conv_net(use_feed): - img = paddle.static.data( - name='image', shape=[-1] + img_shape, dtype='float16' - ) - label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64') - - conv_pool_1 = nets.simple_img_conv_pool( - input=img, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu", - ) - conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1) - - conv_pool_1 = paddle.cast(conv_pool_1, np.float32) - conv_pool_2 = nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu", - ) - hidden = paddle.cast(conv_pool_2, np.float32) - return loss_net(hidden, label) - - -def _optimizer(learning_rate=1e-6): - optimizer = paddle.optimizer.SGD(learning_rate=learning_rate) - return optimizer - - -class TestResnet(TestParallelExecutorBase): - def check_model(self, use_device): - img, label = init_data( - batch_size=batch_size, img_shape=img_shape, label_range=9 - ) - img = np.float16(img) - feed_dict = {"image": img, "label": label} - - TestParallelExecutorBase.check_network_convergence( - conv_net, - feed_dict=feed_dict, - iter=10, - use_device=use_device, - fuse_all_reduce_ops=True, - optimizer=_optimizer, - ) - - def test_model(self): - if core.is_compiled_with_cuda(): - self.check_model(DeviceType.CUDA) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_parallel_executor_run_cinn.py b/test/legacy_test/test_parallel_executor_run_cinn.py deleted file mode 100644 index 2ca34842f0b906..00000000000000 --- a/test/legacy_test/test_parallel_executor_run_cinn.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import shutil -import tempfile -import unittest - -import numpy as np - -import paddle - -paddle.enable_static() - -logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO -) -logger = logging.getLogger("paddle_with_cinn") - - -def set_cinn_flag(val): - cinn_compiled = False - try: - paddle.set_flags({'FLAGS_use_cinn': val}) - cinn_compiled = True - except ValueError: - logger.warning("The used paddle is not compiled with CINN.") - return cinn_compiled - - -def reader(limit): - for _ in range(limit): - yield np.random.random([1, 28]).astype('float32'), np.random.randint( - 0, 2, size=[1] - ).astype('int64') - - -def rand_data(img, label, loop_num=10): - feed = [] - data = reader(loop_num) - for _ in range(loop_num): - d, l = next(data) - feed.append({img: d, label: l}) - return feed - - -def build_program(main_program, startup_program): - with paddle.static.program_guard(main_program, startup_program): - img = paddle.static.data(name='img', shape=[1, 28], dtype='float32') - param = paddle.create_parameter( - name="bias", - shape=[1, 28], - dtype="float32", - attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Assign( - np.random.rand(1, 28).astype(np.float32) - ) - ), - ) - label = paddle.static.data(name="label", shape=[1], dtype='int64') - - hidden = paddle.add(img, param) - prediction = paddle.nn.functional.relu(hidden) - - loss = paddle.nn.functional.cross_entropy(input=prediction, label=label) - avg_loss = paddle.mean(loss) - adam = paddle.optimizer.Adam(learning_rate=0.001) - adam.minimize(avg_loss) - return img, label, avg_loss - - -def train(dot_save_dir, prefix, seed=1234): - np.random.seed(seed) - paddle.seed(seed) - if paddle.is_compiled_with_cuda(): - paddle.set_flags({'FLAGS_cudnn_deterministic': 1}) - - startup_program = paddle.static.Program() - main_program = paddle.static.Program() - img, label, loss = build_program(main_program, startup_program) - - place = ( - paddle.CUDAPlace(0) - if paddle.is_compiled_with_cuda() - else paddle.CPUPlace() - ) - exe = paddle.static.Executor(place) - exe.run(startup_program) - - build_strategy = paddle.static.BuildStrategy() - build_strategy.debug_graphviz_path = os.path.join(dot_save_dir, prefix) - compiled_program = paddle.static.CompiledProgram( - main_program, build_strategy - ) - - iters = 100 - feed = rand_data(img.name, label.name, iters) - loss_values = [] - for step in range(iters): - loss_v = exe.run(compiled_program, feed=feed[step], fetch_list=[loss]) - loss_values.append(loss_v[0]) - return loss_values - - -@unittest.skipIf(not set_cinn_flag(True), "Paddle is not compiled with CINN.") -class TestParallelExecutorRunCinn(unittest.TestCase): - def setUp(self): - self.tmpdir = tempfile.mkdtemp(prefix="dots_") - - def tearDown(self): - shutil.rmtree(self.tmpdir) - - def test_run_with_cinn(self): - cinn_losses = np.array(train(self.tmpdir, "paddle")).flatten() - set_cinn_flag(False) - pd_losses = np.array(train(self.tmpdir, "cinn")).flatten() - np.testing.assert_allclose( - cinn_losses, pd_losses, rtol=1e-05, atol=1e-05 - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_parallel_executor_seresnext_base_cpu.py b/test/legacy_test/test_parallel_executor_seresnext_base_cpu.py deleted file mode 100644 index 7c9c9968c4a182..00000000000000 --- a/test/legacy_test/test_parallel_executor_seresnext_base_cpu.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -from functools import partial - -import seresnext_net -from seresnext_test_base import DeviceType, TestResnetBase - - -class TestResnetCPU(TestResnetBase): - def test_seresnext_with_learning_rate_decay(self): - # NOTE(zcd): This test is compare the result of use parallel_executor - # and executor, and the result of drop_out op and batch_norm op in - # this two executor have diff, so the two ops should be removed - # from the model. - check_func = partial( - self.check_network_convergence, - optimizer=seresnext_net.optimizer, - use_parallel_executor=False, - ) - self._compare_result_with_origin_model( - check_func, - use_device=DeviceType.CPU, - compare_separately=False, - delta2=1e-3, - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_parallel_executor_seresnext_base_gpu.py b/test/legacy_test/test_parallel_executor_seresnext_base_gpu.py deleted file mode 100644 index 75bd61f5c6c7d8..00000000000000 --- a/test/legacy_test/test_parallel_executor_seresnext_base_gpu.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -from functools import partial - -import seresnext_net -from seresnext_test_base import DeviceType, TestResnetBase - - -class TestResnetGPU(TestResnetBase): - def test_seresnext_with_learning_rate_decay(self): - # NOTE(zcd): This test is compare the result of use parallel_executor - # and executor, and the result of drop_out op and batch_norm op in - # this two executor have diff, so the two ops should be removed - # from the model. - check_func = partial( - self.check_network_convergence, - optimizer=seresnext_net.optimizer, - use_parallel_executor=False, - ) - self._compare_result_with_origin_model( - check_func, - use_device=DeviceType.CUDA, - delta2=1e-3, - compare_separately=False, - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py b/test/legacy_test/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py deleted file mode 100644 index 75d3d85e20e5b9..00000000000000 --- a/test/legacy_test/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle import base - -base.core._set_fuse_parameter_group_size(3) -base.core._set_fuse_parameter_memory_size(131072) - -import unittest -from functools import partial - -import seresnext_net -from seresnext_test_base import DeviceType, TestResnetBase - - -class TestResnetWithFuseAllReduceCPU(TestResnetBase): - def test_seresnext_with_fused_all_reduce(self): - # NOTE(zcd): In order to make the program faster, - # this unit test remove drop_out and batch_norm. - check_func = partial( - self.check_network_convergence, - optimizer=seresnext_net.optimizer, - fuse_all_reduce_ops=True, - ) - self._compare_result_with_origin_model( - check_func, use_device=DeviceType.CPU - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py b/test/legacy_test/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py deleted file mode 100644 index 752538efaa0597..00000000000000 --- a/test/legacy_test/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle import base - -base.core._set_fuse_parameter_group_size(3) -base.core._set_fuse_parameter_memory_size(131072) - -import unittest -from functools import partial - -import seresnext_net -from seresnext_test_base import DeviceType, TestResnetBase - - -class TestResnetWithFuseAllReduceGPU(TestResnetBase): - def test_seresnext_with_fused_all_reduce(self): - # NOTE(zcd): In order to make the program faster, - # this unit test remove drop_out and batch_norm. - check_func = partial( - self.check_network_convergence, - optimizer=seresnext_net.optimizer, - fuse_all_reduce_ops=True, - ) - self._compare_result_with_origin_model( - check_func, use_device=DeviceType.CUDA, delta2=1e-2 - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_parallel_executor_seresnext_with_reduce_cpu.py b/test/legacy_test/test_parallel_executor_seresnext_with_reduce_cpu.py deleted file mode 100644 index 9dead366227630..00000000000000 --- a/test/legacy_test/test_parallel_executor_seresnext_with_reduce_cpu.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import seresnext_net -from parallel_executor_test_base import DeviceType, TestParallelExecutorBase - -from paddle.base import core - - -class TestResnetWithReduceBase(TestParallelExecutorBase): - def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5): - if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): - return - - ( - all_reduce_first_loss, - all_reduce_last_loss, - _, - ) = self.check_network_convergence( - seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_device), - iter=seresnext_net.iter(use_device), - batch_size=seresnext_net.batch_size(use_device), - use_device=use_device, - use_reduce=False, - optimizer=seresnext_net.optimizer, - ) - reduce_first_loss, reduce_last_loss, _ = self.check_network_convergence( - seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_device), - iter=seresnext_net.iter(use_device), - batch_size=seresnext_net.batch_size(use_device), - use_device=use_device, - use_reduce=True, - optimizer=seresnext_net.optimizer, - ) - - self.assertAlmostEqual( - all_reduce_first_loss, reduce_first_loss, delta=1e-5 - ) - self.assertAlmostEqual( - all_reduce_last_loss, - reduce_last_loss, - delta=all_reduce_last_loss * delta2, - ) - - if not use_device: - return - - ( - all_reduce_first_loss_seq, - all_reduce_last_loss_seq, - _, - ) = self.check_network_convergence( - seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_device), - iter=seresnext_net.iter(use_device), - batch_size=seresnext_net.batch_size(use_device), - use_device=use_device, - use_reduce=False, - optimizer=seresnext_net.optimizer, - enable_sequential_execution=True, - ) - - ( - reduce_first_loss_seq, - reduce_last_loss_seq, - _, - ) = self.check_network_convergence( - seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_device), - iter=seresnext_net.iter(use_device), - batch_size=seresnext_net.batch_size(use_device), - use_device=use_device, - use_reduce=True, - optimizer=seresnext_net.optimizer, - enable_sequential_execution=True, - ) - - self.assertAlmostEqual( - all_reduce_first_loss, all_reduce_first_loss_seq, delta=1e-5 - ) - self.assertAlmostEqual( - all_reduce_last_loss, - all_reduce_last_loss_seq, - delta=all_reduce_last_loss * delta2, - ) - - self.assertAlmostEqual( - reduce_first_loss, reduce_first_loss_seq, delta=1e-5 - ) - self.assertAlmostEqual( - reduce_last_loss, - reduce_last_loss_seq, - delta=reduce_last_loss * delta2, - ) - - self.assertAlmostEqual( - all_reduce_first_loss_seq, reduce_first_loss_seq, delta=1e-5 - ) - self.assertAlmostEqual( - all_reduce_last_loss_seq, - reduce_last_loss_seq, - delta=all_reduce_last_loss_seq * delta2, - ) - - -class TestResnetWithReduceCPU(TestResnetWithReduceBase): - def test_seresnext_with_reduce(self): - self._compare_reduce_and_allreduce( - use_device=DeviceType.CPU, delta2=1e-3 - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_parallel_executor_seresnext_with_reduce_gpu.py b/test/legacy_test/test_parallel_executor_seresnext_with_reduce_gpu.py deleted file mode 100644 index 187f837e7e7b1e..00000000000000 --- a/test/legacy_test/test_parallel_executor_seresnext_with_reduce_gpu.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -from test_parallel_executor_seresnext_with_reduce_cpu import ( - DeviceType, - TestResnetWithReduceBase, -) - - -class TestResnetWithReduceGPU(TestResnetWithReduceBase): - def test_seresnext_with_reduce(self): - self._compare_reduce_and_allreduce( - use_device=DeviceType.CUDA, delta2=1e-2 - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_parallel_executor_transformer.py b/test/legacy_test/test_parallel_executor_transformer.py deleted file mode 100644 index d6bcf26c24bbd6..00000000000000 --- a/test/legacy_test/test_parallel_executor_transformer.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import unittest - -import numpy as np -import transformer_model -from feed_data_reader import FeedDataReader -from parallel_executor_test_base import DeviceType, TestParallelExecutorBase - -import paddle -from paddle.base import core -from paddle.dataset import wmt16 - -os.environ['CPU_NUM'] = str(4) - - -class ModelHyperParams: - # Dictionary size for source and target language. This model directly uses - # paddle.dataset.wmt16 in which , and token has - # already been added, but the token is not added. Transformer requires - # sequences in a mini-batch are padded to have the same length. A token is - # added into the original dictionary in paddle.dateset.wmt16. - - # size of source word dictionary. - src_vocab_size = 10000 - # index for token in source language. - src_pad_idx = src_vocab_size - - # size of target word dictionary - trg_vocab_size = 10000 - # index for token in target language. - trg_pad_idx = trg_vocab_size - - # position value corresponding to the token. - pos_pad_idx = 0 - - # max length of sequences. It should plus 1 to include position - # padding token for position encoding. - max_length = 50 - - # the dimension for word embeddings, which is also the last dimension of - # the input and output of multi-head attention, position-wise feed-forward - # networks, encoder and decoder. - - d_model = 512 - # size of the hidden layer in position-wise feed-forward networks. - d_inner_hid = 1024 - # the dimension that keys are projected to for dot-product attention. - d_key = 64 - # the dimension that values are projected to for dot-product attention. - d_value = 64 - # number of head used in multi-head attention. - n_head = 8 - # number of sub-layers to be stacked in the encoder and decoder. - # NOTE(zcd): the origin number of layer is 6, to make this unit test faster, - # we should reduce the layer number to 4. - n_layer = 4 - # dropout rate used by all dropout layers. - dropout = 0.1 - - -def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. Then, convert the numpy - data to tensors and return a dict mapping names to tensors. - """ - - def __pad_batch_data( - insts, - pad_idx, - is_target=False, - return_pos=True, - return_attn_bias=True, - return_max_len=True, - ): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. - """ - return_list = [] - max_len = max(len(inst) for inst in insts) - inst_data = np.array( - [inst + [pad_idx] * (max_len - len(inst)) for inst in insts] - ) - return_list += [inst_data.astype("int64").reshape([-1, 1])] - if return_pos: - inst_pos = np.array( - [ - [ - pos_i + 1 if w_i != pad_idx else 0 - for pos_i, w_i in enumerate(inst) - ] - for inst in inst_data - ] - ) - - return_list += [inst_pos.astype("int64").reshape([-1, 1])] - if return_attn_bias: - if is_target: - # This is used to avoid attention on paddings and subsequent - # words. - slf_attn_bias_data = np.ones( - (inst_data.shape[0], max_len, max_len) - ) - slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( - [-1, 1, max_len, max_len] - ) - slf_attn_bias_data = np.tile( - slf_attn_bias_data, [1, n_head, 1, 1] - ) * [-1e9] - else: - # This is used to avoid attention on paddings. - slf_attn_bias_data = np.array( - [ - [0] * len(inst) + [-1e9] * (max_len - len(inst)) - for inst in insts - ] - ) - slf_attn_bias_data = np.tile( - slf_attn_bias_data.reshape([-1, 1, 1, max_len]), - [1, n_head, max_len, 1], - ) - return_list += [slf_attn_bias_data.astype("float32")] - if return_max_len: - return_list += [max_len] - return return_list if len(return_list) > 1 else return_list[0] - - src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data( - [inst[0] for inst in insts], src_pad_idx, is_target=False - ) - trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data( - [inst[1] for inst in insts], trg_pad_idx, is_target=True - ) - trg_src_attn_bias = np.tile( - src_slf_attn_bias[:, :, ::src_max_len, :], [1, 1, trg_max_len, 1] - ).astype("float32") - lbl_word = __pad_batch_data( - [inst[2] for inst in insts], trg_pad_idx, False, False, False, False - ) - lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) - - return [ - src_word, - src_pos, - trg_word, - trg_pos, - src_slf_attn_bias, - trg_slf_attn_bias, - trg_src_attn_bias, - lbl_word, - lbl_weight, - ] - - -feed_data_reader = None - - -def transformer(use_feed): - assert not use_feed, "transformer doesn't support feed yet" - return transformer_model.transformer( - ModelHyperParams.src_vocab_size + 1, - ModelHyperParams.trg_vocab_size + 1, - ModelHyperParams.max_length + 1, - ModelHyperParams.n_layer, - ModelHyperParams.n_head, - ModelHyperParams.d_key, - ModelHyperParams.d_value, - ModelHyperParams.d_model, - ModelHyperParams.d_inner_hid, - ModelHyperParams.dropout, - ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, - ModelHyperParams.pos_pad_idx, - ) - - -def get_feed_data_reader(): - global feed_data_reader - if feed_data_reader is not None: - return feed_data_reader - - reader = paddle.batch( - wmt16.train( - ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size - ), - batch_size=transformer_model.batch_size, - ) - all_batch_tensors = [] - for batch in reader(): - tensors = [] - for tensor in prepare_batch_input( - batch, - ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, - ModelHyperParams.n_head, - ): - tensors.append(np.array(tensor)) - all_batch_tensors.append(tensors) - - def __reader__(): - yield from all_batch_tensors - - feed_data_reader = FeedDataReader( - feed_list=transformer_model.build_inputs( - ModelHyperParams.max_length + 1, ModelHyperParams.n_head - ), - reader=__reader__, - ) - - return feed_data_reader - - -class TestTransformer(TestParallelExecutorBase): - def test_main(self): - if core.is_compiled_with_cuda(): - self.check_network_convergence( - transformer, - use_device=DeviceType.CUDA, - feed_data_reader=get_feed_data_reader(), - ) - self.check_network_convergence( - transformer, - use_device=DeviceType.CUDA, - enable_sequential_execution=True, - feed_data_reader=get_feed_data_reader(), - ) - self.check_network_convergence( - transformer, - use_device=DeviceType.CPU, - iter=2, - feed_data_reader=get_feed_data_reader(), - ) - - -if __name__ == '__main__': - paddle.enable_static() - unittest.main() diff --git a/test/legacy_test/test_parallel_executor_transformer_auto_growth.py b/test/legacy_test/test_parallel_executor_transformer_auto_growth.py deleted file mode 100644 index 7f38de13af4cdf..00000000000000 --- a/test/legacy_test/test_parallel_executor_transformer_auto_growth.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_program_prune_backward.py b/test/legacy_test/test_program_prune_backward.py index 581635d5a68ada..36e3fb67c254e8 100755 --- a/test/legacy_test/test_program_prune_backward.py +++ b/test/legacy_test/test_program_prune_backward.py @@ -17,16 +17,213 @@ import numpy as np import seresnext_net +import transformer_model +from feed_data_reader import FeedDataReader from simple_nets import fc_with_batchnorm, init_data, simple_fc_net -from test_parallel_executor_transformer import ( - DeviceType, - get_feed_data_reader, - transformer, -) import paddle from paddle import base from paddle.base import core +from paddle.dataset import wmt16 + +DeviceType = core.DeviceType + + +class ModelHyperParams: + # Dictionary size for source and target language. This model directly uses + # paddle.dataset.wmt16 in which , and token has + # already been added, but the token is not added. Transformer requires + # sequences in a mini-batch are padded to have the same length. A token is + # added into the original dictionary in paddle.dateset.wmt16. + + # size of source word dictionary. + src_vocab_size = 10000 + # index for token in source language. + src_pad_idx = src_vocab_size + + # size of target word dictionary + trg_vocab_size = 10000 + # index for token in target language. + trg_pad_idx = trg_vocab_size + + # position value corresponding to the token. + pos_pad_idx = 0 + + # max length of sequences. It should plus 1 to include position + # padding token for position encoding. + max_length = 50 + + # the dimension for word embeddings, which is also the last dimension of + # the input and output of multi-head attention, position-wise feed-forward + # networks, encoder and decoder. + + d_model = 512 + # size of the hidden layer in position-wise feed-forward networks. + d_inner_hid = 1024 + # the dimension that keys are projected to for dot-product attention. + d_key = 64 + # the dimension that values are projected to for dot-product attention. + d_value = 64 + # number of head used in multi-head attention. + n_head = 8 + # number of sub-layers to be stacked in the encoder and decoder. + # NOTE(zcd): the origin number of layer is 6, to make this unit test faster, + # we should reduce the layer number to 4. + n_layer = 4 + # dropout rate used by all dropout layers. + dropout = 0.1 + + +def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. Then, convert the numpy + data to tensors and return a dict mapping names to tensors. + """ + + def __pad_batch_data( + insts, + pad_idx, + is_target=False, + return_pos=True, + return_attn_bias=True, + return_max_len=True, + ): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. + """ + return_list = [] + max_len = max(len(inst) for inst in insts) + inst_data = np.array( + [inst + [pad_idx] * (max_len - len(inst)) for inst in insts] + ) + return_list += [inst_data.astype("int64").reshape([-1, 1])] + if return_pos: + inst_pos = np.array( + [ + [ + pos_i + 1 if w_i != pad_idx else 0 + for pos_i, w_i in enumerate(inst) + ] + for inst in inst_data + ] + ) + + return_list += [inst_pos.astype("int64").reshape([-1, 1])] + if return_attn_bias: + if is_target: + # This is used to avoid attention on paddings and subsequent + # words. + slf_attn_bias_data = np.ones( + (inst_data.shape[0], max_len, max_len) + ) + slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( + [-1, 1, max_len, max_len] + ) + slf_attn_bias_data = np.tile( + slf_attn_bias_data, [1, n_head, 1, 1] + ) * [-1e9] + else: + # This is used to avoid attention on paddings. + slf_attn_bias_data = np.array( + [ + [0] * len(inst) + [-1e9] * (max_len - len(inst)) + for inst in insts + ] + ) + slf_attn_bias_data = np.tile( + slf_attn_bias_data.reshape([-1, 1, 1, max_len]), + [1, n_head, max_len, 1], + ) + return_list += [slf_attn_bias_data.astype("float32")] + if return_max_len: + return_list += [max_len] + return return_list if len(return_list) > 1 else return_list[0] + + src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, is_target=False + ) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, is_target=True + ) + trg_src_attn_bias = np.tile( + src_slf_attn_bias[:, :, ::src_max_len, :], [1, 1, trg_max_len, 1] + ).astype("float32") + lbl_word = __pad_batch_data( + [inst[2] for inst in insts], trg_pad_idx, False, False, False, False + ) + lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) + + return [ + src_word, + src_pos, + trg_word, + trg_pos, + src_slf_attn_bias, + trg_slf_attn_bias, + trg_src_attn_bias, + lbl_word, + lbl_weight, + ] + + +feed_data_reader = None + + +def transformer(use_feed): + assert not use_feed, "transformer doesn't support feed yet" + return transformer_model.transformer( + ModelHyperParams.src_vocab_size + 1, + ModelHyperParams.trg_vocab_size + 1, + ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, + ModelHyperParams.n_head, + ModelHyperParams.d_key, + ModelHyperParams.d_value, + ModelHyperParams.d_model, + ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout, + ModelHyperParams.src_pad_idx, + ModelHyperParams.trg_pad_idx, + ModelHyperParams.pos_pad_idx, + ) + + +def get_feed_data_reader(): + global feed_data_reader + if feed_data_reader is not None: + return feed_data_reader + + reader = paddle.batch( + wmt16.train( + ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size + ), + batch_size=transformer_model.batch_size, + ) + all_batch_tensors = [] + for batch in reader(): + tensors = [] + for tensor in prepare_batch_input( + batch, + ModelHyperParams.src_pad_idx, + ModelHyperParams.trg_pad_idx, + ModelHyperParams.n_head, + ): + tensors.append(np.array(tensor)) + all_batch_tensors.append(tensors) + + def __reader__(): + yield from all_batch_tensors + + feed_data_reader = FeedDataReader( + feed_list=transformer_model.build_inputs( + ModelHyperParams.max_length + 1, ModelHyperParams.n_head + ), + reader=__reader__, + ) + + return feed_data_reader def simple_fc_net_with_accuracy(use_feed): diff --git a/test/legacy_test/test_py_func_op.py b/test/legacy_test/test_py_func_op.py index 1706ad14d644d7..3fa249935406fc 100644 --- a/test/legacy_test/test_py_func_op.py +++ b/test/legacy_test/test_py_func_op.py @@ -19,7 +19,6 @@ import paddle from paddle import base -from paddle.base import compiler dev_cnt = 2 if base.core.is_compiled_with_cuda(): @@ -171,7 +170,7 @@ def reader(): ) -def test_main(use_cuda, use_py_func_op, use_parallel_executor): +def test_main(use_cuda, use_py_func_op): if use_cuda and not base.core.is_compiled_with_cuda(): return None @@ -197,12 +196,7 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor): exe.run(base.default_startup_program()) train_cp = base.default_main_program() - - if use_parallel_executor: - train_cp = compiler.CompiledProgram(base.default_main_program()) - fetch_list = [loss.name] - else: - fetch_list = [loss] + fetch_list = [loss] ret = [] for epoch_id in range(2): @@ -215,16 +209,11 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor): class TestPyFuncOpUseExecutor(unittest.TestCase): - def setUp(self): - self.use_parallel_executor = False - def test_loss_diff(self): for use_cuda in [True, False]: losses = [] for use_py_func_op in [True, False]: - L = test_main( - use_cuda, use_py_func_op, self.use_parallel_executor - ) + L = test_main(use_cuda, use_py_func_op) if L is not None: losses.append(L) @@ -233,10 +222,5 @@ def test_loss_diff(self): self.assertAlmostEqual(max_diff, 0, delta=1e-3) -class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor): - def setUp(self): - self.use_parallel_executor = True - - if __name__ == '__main__': unittest.main() diff --git a/test/standalone_executor/test_standalone_executor.py b/test/standalone_executor/test_standalone_executor.py index 6c510c77ca1f99..934558c170f513 100644 --- a/test/standalone_executor/test_standalone_executor.py +++ b/test/standalone_executor/test_standalone_executor.py @@ -70,9 +70,6 @@ def setUp(self): ) self.perf_path = './perfstat' - def test_parallel_executor_statistics(self): - self.run_with_statistics(executor='ParallelExecutor') - def test_executor_statistics(self): self.run_with_statistics(executor='Executor') @@ -88,13 +85,6 @@ def run_with_statistics(self, executor=None): # note: startup program is empty main_program, startup_program, fetch_list = build_program() - enable = True - if executor == 'ParallelExecutor': - main_program = paddle.base.compiler.CompiledProgram(main_program) - enable = False - elif executor == 'Executor': - enable = False - scope = paddle.static.Scope() with paddle.static.scope_guard(scope): exe = paddle.static.Executor(self.place) diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 999ae623e9e2d7..f8c236265ae27d 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -624,7 +624,6 @@ 'test_memory_analysis', 'test_matrix_rank_op', 'test_merged_momentum_op', - 'test_parallel_executor_run_cinn', 'test_parallel_dygraph_dataparallel_cpuonly', 'test_eigvals_op', 'test_sparse_attention_op', @@ -670,9 +669,7 @@ 'test_analyzer_int8_googlenet', 'test_analyzer_seq_pool1_compare_determine', 'save_quant2_model_ernie', - 'test_parallel_executor_seresnext_with_fuse_all_reduce_cpu', 'test_dataset_uci_housing', - 'test_parallel_executor_seresnext_base_cpu', 'test_dataset_download', 'test_quant_int8_mobilenetv1_mkldnn', 'test_crf_decoding_op', @@ -688,7 +685,6 @@ 'test_weight_quantization_mobilenetv1', 'test_concat_mkldnn_op', 'test_gaussian_random_mkldnn_op', - 'test_parallel_executor_seresnext_with_reduce_cpu', 'test_dataset_imikolov', 'test_analyzer_rnn1', 'test_conv2d_mkldnn_op', @@ -807,7 +803,6 @@ 'test_maximum_op', 'test_rnn_cell_api', 'device_code_test', - 'test_ir_inplace_pass', 'test_cos_sim_op', 'test_lite_tensor_utils', 'test_fit_a_line', @@ -890,7 +885,6 @@ 'test_scale_mkldnn_op', 'test_load_state_dict_from_old_format', 'test_lookup_table_v2_op', - 'test_mix_precision_all_reduce_fuse', 'test_spp_op', 'test_op_converter', 'test_mixed_vector', @@ -921,7 +915,6 @@ 'test_run_program_op', 'test_cuda_random_seed', 'test_linear_interp_op', - 'test_fuse_all_reduce_pass', 'tensor_util_test', 'test_median', 'test_nanmedian', @@ -1027,7 +1020,6 @@ 'test_gather_tree_op', 'test_elementwise_mul_op', 'test_cycle_gan', - 'test_parallel_executor_transformer_auto_growth', 'test_bitwise_op', 'test_uniform_random_op', 'trt_split_converter_test', @@ -1083,7 +1075,6 @@ 'test_imperative_layer_children', 'nccl_op_test', 'test_share_data_op', - 'test_ir_memory_optimize_transformer', 'test_math_op_patch', 'test_base_layer', 'test_dequantize_log_op', @@ -1101,7 +1092,6 @@ 'test_affine_channel_op', 'test_leaky_relu_grad_grad_functor', 'test_ctc_align', - 'test_fuse_relu_depthwise_conv_pass', 'test_complex_kron', 'test_imperative_skip_op', 'test_dgc_op', @@ -1253,7 +1243,6 @@ 'test_conv_elementwise_add2_act_fuse_pass', 'test_imperative_container_layerlist', 'test_dequantize_abs_max_op', - 'test_fuse_optimizer_pass', 'test_optimizer', 'test_dynamic_rnn_stop_gradient', 'test_raw_program_optimizer', @@ -1355,7 +1344,6 @@ 'test_gradient_accmulator', 'test_instance_norm_op_v2', 'test_mobile_net', - 'test_parallel_executor_transformer', 'test_tensor_scalar_type_promotion_dynamic', 'test_eager_deletion_delete_vars', 'test_asp_pruning_1d', @@ -1382,7 +1370,6 @@ 'test_tensorrt_engine', 'test_affine_grid_function', 'test_nonzero_api', - 'test_ir_memory_optimize_pass', 'test_reduce_mkldnn_op', 'test_bilinear_interp_op', 'test_cvm_op', @@ -1464,9 +1451,6 @@ 'test_save_inference_model', 'test_smooth_l1_loss', 'test_bilateral_slice_op', - 'test_parallel_executor_seresnext_base_gpu', - 'test_parallel_executor_seresnext_with_fuse_all_reduce_gpu', - 'test_parallel_executor_seresnext_with_reduce_gpu', 'test_data_norm_op', 'test_install_check', 'graph_node_test', @@ -2164,7 +2148,6 @@ 'test_analyzer_capi_exp_xpu', 'test_egr_task_autocodegen', 'test_static_save_load_bf16', - 'test_parallel_executor_run_cinn', 'test_egr_task_tensor_utils', 'test_egr_task_hook', 'test_egr_task_forward_autograd', @@ -2279,15 +2262,12 @@ 'test_fused_transformer_encoder_layer', 'test_eager_deletion_while_op', 'test_dataloader_unkeep_order', - 'test_parallel_executor_profiler', 'test_correlation', - 'test_ir_inplace_pass', 'test_moving_average_abs_max_scale_op', 'test_flatten_contiguous_range_op', 'test_transforms', 'test_sum_op', 'test_scatter_op', - 'test_mix_precision_all_reduce_fuse', 'test_tensorrt_engine_op', 'test_zeropad2d', 'test_isclose_op', @@ -2880,7 +2860,6 @@ 'test_user_defined_quantization', 'test_quantization_scale_pass', 'feed_forward_test', - 'test_fuse_optimizer_pass', 'test_standalone_executor', 'test_imperative_qat_user_defined', 'test_mkldnn_fc_act_fuse_pass', @@ -2888,7 +2867,6 @@ 'test_signal', 'test_fused_feedforward_op', 'test_weight_decay_extend', - 'test_fuse_relu_depthwise_conv_pass', 'test_diag_v2', 'test_tensordot', 'test_rnn_decode_api', @@ -2913,7 +2891,6 @@ 'test_multinomial_op', 'test_fused_elemwise_activation_op', 'test_profiler', - 'test_ir_memory_optimize_pass', 'test_callback_reduce_lr_on_plateau', 'test_paddle_save_load', 'test_stack_op', @@ -3055,10 +3032,8 @@ 'test_squeeze2_mkldnn_op', 'test_conv2d_transpose_bf16_mkldnn_op', 'test_slice_mkldnn_op', - 'test_parallel_executor_seresnext_base_cpu', 'test_stack_mkldnn_op', 'test_softplus_mkldnn_op', - 'test_parallel_executor_seresnext_with_reduce_cpu', 'test_nearest_interp_v2_mkldnn_op', 'test_fusion_lstm_mkldnn_op', 'test_fuse_resnet_unit', @@ -3066,7 +3041,6 @@ 'test_uniform_random_bf16_op', 'test_reshape_mkldnn_op', 'test_reduce_bf16_mkldnn_op', - 'test_parallel_executor_seresnext_with_fuse_all_reduce_cpu', 'test_nearest_interp_mkldnn_op', 'test_ir_graph_to_program_pass', 'test_fusion_lstm_int8_mkldnn_op', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index fe6c5814dbd8b7..9c5f73d7665c6e 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -217,9 +217,6 @@ 'test_functional_conv2d_transpose', 'test_functional_conv3d', 'test_functional_conv3d_transpose', - 'test_fuse_all_reduce_pass', - 'test_fuse_optimizer_pass', - 'test_fuse_relu_depthwise_conv_pass', 'test_fused_elemwise_activation_op', 'test_fused_emb_seq_pool_op', 'test_fused_embedding_fc_lstm_op', @@ -279,6 +276,7 @@ 'test_instance_norm_op_v2', 'test_inverse_op', 'test_io_save_load', + 'test_iou_similarity_op', 'test_ir_memory_optimize_pass', 'test_kldiv_loss_op', 'test_kron_op', @@ -503,14 +501,8 @@ 'test_transpiler_ops', 'test_communicator_sync', 'test_collective_optimizer', - 'test_parallel_executor_profiler', - 'test_parallel_executor_transformer', - 'test_parallel_executor_transformer_auto_growth', 'test_data_norm_op', 'test_fuse_bn_act_pass', - 'test_parallel_executor_seresnext_base_cpu', - 'test_parallel_executor_seresnext_with_reduce_cpu', - 'test_parallel_executor_seresnext_with_fuse_all_reduce_cpu', 'test_layers', 'test_sequence_concat', 'test_sequence_conv', @@ -612,12 +604,9 @@ 'test_fleet_metric', 'test_fused_bn_add_act', 'test_fused_multihead_matmul_op', - 'test_ir_inplace_pass', - 'test_mix_precision_all_reduce_fuse', 'test_rank_attention_op', 'test_fleet_base', 'test_fleet_meta_optimizer_base', - 'test_ir_memory_optimize_transformer', 'test_trt_fc_fuse_pass', 'test_trt_quant_conv2d_dequant_fuse_pass', 'test_trt_slice_plugin', @@ -640,9 +629,6 @@ 'test_trt_pad_op', 'test_trt_shuffle_channel_detect_pass', 'test_trt_subgraph_pass', - 'test_parallel_executor_seresnext_base_gpu', - 'test_parallel_executor_seresnext_with_fuse_all_reduce_gpu', - 'test_parallel_executor_seresnext_with_reduce_gpu', 'test_sync_batch_norm_op', 'test_multiprocess_dataloader_iterable_dataset_static', 'test_multiprocess_dataloader_static', diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index a11e3ad47724f7..29b71c4306ee8c 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -24,15 +24,10 @@ disable_wingpu_test="^test_model$|\ ^test_generator_dataloader$|\ ^test_parallel_dygraph_sync_batch_norm$|\ ^test_py_reader_using_executor$|\ -^test_parallel_executor_seresnext_base_gpu$|\ -^test_parallel_executor_seresnext_with_fuse_all_reduce_gpu$|\ -^test_parallel_executor_seresnext_with_reduce_gpu$|\ ^test_program_prune_backward$|\ ^test_decoupled_py_reader_data_check$|\ ^test_fleet_base_single$|\ ^test_multiprocess_dataloader_iterable_dataset_dynamic$|\ -^test_parallel_executor_feed_persistable_var$|\ -^test_parallel_executor_inference_feed_partial_data$|\ ^test_py_reader_combination$|\ ^test_py_reader_pin_memory$|\ ^test_py_reader_push_pop$|\ @@ -76,7 +71,6 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\ ^test_elementwise_add_mkldnn_op$|\ ^test_comp_high_grad$|\ ^test_multi_precision_fp16_train$|\ -^test_fuse_relu_depthwise_conv_pass$|\ ^test_imperative_skip_op$|\ ^test_qat$|\ ^test_standalone_cuda_graph_multi_stream$|\ @@ -209,7 +203,6 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\ ^test_argsort_op$|\ ^test_image_classification_fp16$|\ ^test_imperative_double_grad$|\ -^test_parallel_executor_transformer$|\ ^test_se_resnet$|\ ^test_standalone_executor_aot_choose_kernel$|\ ^test_imperative_qat_user_defined$|\ @@ -217,7 +210,6 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\ ^test_callback_reduce_lr_on_plateau$|\ ^test_callback_visualdl$|\ ^test_callback_wandb$|\ -^test_mix_precision_all_reduce_fuse$|\ ^test_user_defined_quantization$|\ ^test_quantization_scale_pass$|\ ^test_quantization_pass$|\ @@ -399,10 +391,7 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_model$|\ ^test_py_reader_combination$|\ ^test_py_reader_push_pop$|\ -^test_parallel_executor_feed_persistable_var$|\ -^test_parallel_executor_inference_feed_partial_data$|\ ^test_reader_reset$|\ -^test_parallel_executor_seresnext_base_gpu$|\ ^test_py_reader_pin_memory$|\ ^test_multiprocess_dataloader_iterable_dataset_dynamic$|\ ^test_multiprocess_dataloader_iterable_dataset_static$|\ @@ -432,8 +421,6 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_trt_convert_multihead_matmul$|\ ^test_trt_convert_prelu$|\ ^test_trt_fc_fuse_quant_dequant_pass$|\ -^test_parallel_executor_seresnext_with_fuse_all_reduce_gpu$|\ -^test_parallel_executor_seresnext_with_reduce_gpu$|\ ^test_api_impl$|\ ^test_tensordot$|\ ^disable_win_inference_test$|\