diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt
index 63d84ece4aa988..6e76d10910bed2 100644
--- a/test/legacy_test/CMakeLists.txt
+++ b/test/legacy_test/CMakeLists.txt
@@ -444,8 +444,6 @@ list(REMOVE_ITEM TEST_OPS test_feed_data_check_shape_type)
 list(REMOVE_ITEM TEST_OPS test_fetch_lod_tensor_array)
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_data_norm_op)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer_auto_growth)
 list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
 list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
@@ -453,12 +451,7 @@ list(REMOVE_ITEM TEST_OPS test_imperative_resnet_sorted_gradient)
 list(REMOVE_ITEM TEST_OPS test_imperative_mnist_sorted_gradient)
 list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext)
 list(REMOVE_ITEM TEST_OPS test_imperative_mnist)
-list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
 list(REMOVE_ITEM TEST_OPS test_layers)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_base_cpu)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_seresnext_with_reduce_cpu)
-list(REMOVE_ITEM TEST_OPS
-     test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
 list(REMOVE_ITEM TEST_OPS test_imperative_ocr_attention_model)
 list(REMOVE_ITEM TEST_OPS test_async_ssa_graph_executor_mnist)
 list(REMOVE_ITEM TEST_OPS test_install_check)
@@ -466,7 +459,6 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_api)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_api)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op)
-list(REMOVE_ITEM TEST_OPS test_fuse_all_reduce_pass)
 list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass)
 list(REMOVE_ITEM TEST_OPS test_fuse_bn_add_act_pass)
 list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
@@ -772,25 +764,12 @@ if(WITH_DISTRIBUTE)
   endif()
 endif()
 
-py_test_modules(test_parallel_executor_transformer MODULES
-                test_parallel_executor_transformer)
 if(WIN32)
-  py_test_modules(
-    test_parallel_executor_transformer_auto_growth MODULES
-    test_parallel_executor_transformer_auto_growth ENVS
-    FLAGS_allocator_strategy=auto_growth CUDA_VISIBLE_DEVICES=0)
-  py_test_modules(test_fuse_all_reduce_pass MODULES test_fuse_all_reduce_pass
-                  ENVS CUDA_VISIBLE_DEVICES=0)
   py_test_modules(test_feed_data_check_shape_type MODULES
                   test_feed_data_check_shape_type ENVS CUDA_VISIBLE_DEVICES=0)
   py_test_modules(test_fetch_lod_tensor_array MODULES
                   test_fetch_lod_tensor_array ENVS CUDA_VISIBLE_DEVICES=0)
 else()
-  py_test_modules(
-    test_parallel_executor_transformer_auto_growth MODULES
-    test_parallel_executor_transformer_auto_growth ENVS
-    FLAGS_allocator_strategy=auto_growth)
-  py_test_modules(test_fuse_all_reduce_pass MODULES test_fuse_all_reduce_pass)
   py_test_modules(test_feed_data_check_shape_type MODULES
                   test_feed_data_check_shape_type)
   py_test_modules(test_fetch_lod_tensor_array MODULES
@@ -815,38 +794,10 @@ py_test_modules(
   FLAGS_cudnn_batchnorm_spatial_persistent=1
   FLAGS_conv_workspace_size_limit=1000)
 
-# NOTE: These unittests will appear NaN steadily in windows CI. After analysis,
-# it is found that windows CI will run all the training unittests with the ON_INFER option turned on,
-# which will not appear in other CIs. The calculation behavior of some ops in inference mode is
-# inconsistent with that in non-inference mode.
-if(WITH_PYTHON)
-  py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES
-                  test_parallel_executor_seresnext_base_cpu)
-  py_test_modules(test_parallel_executor_seresnext_with_reduce_cpu MODULES
-                  test_parallel_executor_seresnext_with_reduce_cpu)
-  py_test_modules(
-    test_parallel_executor_seresnext_with_fuse_all_reduce_cpu MODULES
-    test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
-  set_tests_properties(test_parallel_executor_seresnext_base_cpu
-                       PROPERTIES TIMEOUT 900)
-  set_tests_properties(test_parallel_executor_seresnext_base_cpu
-                       PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
-  set_tests_properties(test_parallel_executor_seresnext_with_reduce_cpu
-                       PROPERTIES TIMEOUT 750)
-  set_tests_properties(test_parallel_executor_seresnext_with_reduce_cpu
-                       PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
-  set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu
-                       PROPERTIES TIMEOUT 750)
-  set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu
-                       PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
-endif()
-
 if(NOT WIN32)
   # TODO: fix these unittests failure on Windows
   py_test_modules(test_layers MODULES test_layers ENVS
                   FLAGS_cudnn_deterministic=1)
-  py_test_modules(test_ir_memory_optimize_transformer MODULES
-                  test_ir_memory_optimize_transformer)
 endif()
 
 if(WITH_HETERPS)
@@ -871,11 +822,7 @@ set_tests_properties(
   test_data_norm_op test_dataloader_keep_order test_dataloader_unkeep_order
   test_buffer_shared_memory_reuse_pass PROPERTIES LABELS "RUN_TYPE=DIST")
 set_tests_properties(
-  test_sync_batch_norm_op
-  test_parallel_executor_seresnext_base_gpu
-  test_parallel_executor_seresnext_with_reduce_gpu
-  test_parallel_executor_seresnext_with_fuse_all_reduce_gpu
-  test_distributed_fused_lamb_op_with_clip
+  test_sync_batch_norm_op test_distributed_fused_lamb_op_with_clip
   test_distributed_fused_lamb_op_without_clip
   test_distributed_fused_lamb_op_with_gradient_merge
   PROPERTIES LABELS "RUN_TYPE=DIST")
@@ -907,13 +854,6 @@ if(NOT WIN32)
   set_tests_properties(test_multiprocess_reader_exception
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
   set_tests_properties(test_layers PROPERTIES TIMEOUT 120)
-  if(WITH_NV_JETSON)
-    set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT
-                                                                        1200)
-  else()
-    set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT
-                                                                        120)
-  endif()
 endif()
 
 if(WITH_DISTRIBUTE)
@@ -950,7 +890,6 @@ set_tests_properties(test_cross_entropy_loss PROPERTIES TIMEOUT 180)
 set_tests_properties(test_gru_unit_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_activation_nn_grad PROPERTIES TIMEOUT 250)
 set_tests_properties(test_empty_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_parallel_executor_transformer PROPERTIES TIMEOUT 120)
 set_tests_properties(test_elementwise_div_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_regularizer_api PROPERTIES TIMEOUT 150)
 set_tests_properties(test_multiclass_nms_op PROPERTIES TIMEOUT 120)
@@ -963,7 +902,6 @@ set_tests_properties(test_add_reader_dependency PROPERTIES TIMEOUT 120)
 set_tests_properties(test_bilateral_slice_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_buffer_shared_memory_reuse_pass PROPERTIES TIMEOUT
                                                                      120)
-set_tests_properties(test_fuse_relu_depthwise_conv_pass PROPERTIES TIMEOUT 120)
 set_tests_properties(test_fleet_util PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_transformer_sorted_gradient
                      PROPERTIES TIMEOUT 120)
@@ -1022,16 +960,12 @@ set_tests_properties(test_index_add_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200)
 set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT
                                                                         120)
-set_tests_properties(test_parallel_executor_seresnext_with_reduce_gpu
-                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_dropout_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_argsort_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_gather_nd_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_nn_grad PROPERTIES TIMEOUT 180)
 set_tests_properties(test_elementwise_sub_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_gpu
-                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_distributed_fused_lamb_op_with_clip PROPERTIES TIMEOUT
                                                                          240)
 set_tests_properties(test_distributed_fused_lamb_op_without_clip
@@ -1041,8 +975,6 @@ set_tests_properties(test_distributed_fused_lamb_op_with_gradient_merge
 set_tests_properties(test_elementwise_min_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120)
 set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300)
-set_tests_properties(test_parallel_executor_transformer_auto_growth
-                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_elementwise_add_op PROPERTIES TIMEOUT 200)
 if(NOT WITH_COVERAGE)
   set_tests_properties(test_weight_decay PROPERTIES TIMEOUT 120)
@@ -1070,13 +1002,10 @@ set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 150)
 set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cond PROPERTIES TIMEOUT 240)
 set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250)
-set_tests_properties(test_parallel_executor_seresnext_base_gpu
-                     PROPERTIES TIMEOUT 120)
 set_tests_properties(test_norm_nn_grad PROPERTIES TIMEOUT 180)
 set_tests_properties(test_matrix_nms_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_generator_dataloader PROPERTIES TIMEOUT 120)
 set_tests_properties(test_partial_concat_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_fuse_optimizer_pass PROPERTIES TIMEOUT 120)
 set_tests_properties(test_softmax_with_cross_entropy_op PROPERTIES TIMEOUT 220)
 set_tests_properties(test_reduce_op PROPERTIES TIMEOUT 500)
 set_tests_properties(test_adam_optimizer_fp32_fp64 PROPERTIES TIMEOUT 120)
@@ -1102,7 +1031,6 @@ set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES TIMEOUT 120)
 set_tests_properties(test_conv2d_api PROPERTIES TIMEOUT 120)
 set_tests_properties(test_elementwise_mul_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cyclic_cifar_dataset PROPERTIES TIMEOUT 120)
-set_tests_properties(test_fuse_all_reduce_pass PROPERTIES TIMEOUT 120)
 set_tests_properties(test_dygraph_multi_forward PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_ocr_attention_model PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_mnist PROPERTIES TIMEOUT 120)
@@ -1284,15 +1212,6 @@ foreach(TEST_CINN_OP ${TEST_CINN_OPS})
 endforeach()
 
 if(WITH_CINN AND WITH_TESTING)
-  set_tests_properties(
-    test_parallel_executor_run_cinn
-    PROPERTIES
-      LABELS
-      "RUN_TYPE=CINN"
-      ENVIRONMENT
-      FLAGS_allow_cinn_ops="conv2d;conv2d_grad;elementwise_add;elementwise_add_grad;relu;relu_grad;sum"
-  )
-
   set_tests_properties(test_tile_op PROPERTIES TIMEOUT 300)
 endif()
 
diff --git a/test/legacy_test/parallel_executor_test_base.py b/test/legacy_test/parallel_executor_test_base.py
deleted file mode 100644
index a74d72d77f1f53..00000000000000
--- a/test/legacy_test/parallel_executor_test_base.py
+++ /dev/null
@@ -1,267 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import multiprocessing
-import os
-import sys
-import time
-import unittest
-
-import numpy as np
-from feed_data_reader import FeedDataReader
-
-import paddle
-from paddle import base
-from paddle.base import compiler, core
-
-__all__ = ['TestParallelExecutorBase']
-DeviceType = core.DeviceType
-
-
-class TestParallelExecutorBase(unittest.TestCase):
-    @classmethod
-    def check_network_convergence(
-        cls,
-        method,
-        use_device=DeviceType.CUDA,
-        iter=5,
-        batch_size=None,
-        feed_dict=None,
-        feed_data_reader=None,
-        get_data_from_feeder=None,
-        use_parallel_executor=True,
-        use_reduce=False,
-        use_ir_memory_optimize=False,
-        enable_inplace=True,
-        fuse_elewise_add_act_ops=False,
-        fuse_all_optimizer_ops=False,
-        fuse_all_reduce_ops=False,
-        fuse_relu_depthwise_conv=False,
-        optimizer=paddle.optimizer.Adam,
-        use_fast_executor=False,
-        enable_sequential_execution=False,
-    ):
-        def run_executor(exe, binary, feed, fetch_list):
-            if feed_data_reader is None:
-                res = exe.run(binary, feed=feed, fetch_list=fetch_list)
-            else:
-                res = exe.run(
-                    binary,
-                    feed=feed_data_reader.get_next(exe, binary),
-                    fetch_list=fetch_list,
-                )
-            return res
-
-        if feed_data_reader is not None:
-            assert isinstance(
-                feed_data_reader, FeedDataReader
-            ), "feed_data_reader must be type of FeedDataReader"
-
-        paddle.seed(0)
-        paddle.framework.random._manual_program_seed(0)
-        main = base.Program()
-        startup = base.Program()
-
-        with base.program_guard(main, startup):
-            feed_dict, loss = cls.build_model(
-                feed_dict, get_data_from_feeder, main, method, optimizer
-            )
-
-        place = (
-            base.CUDAPlace(0)
-            if use_device == DeviceType.CUDA
-            else base.XPUPlace(0)
-            if use_device == DeviceType.XPU
-            else base.CPUPlace()
-        )
-        exe = base.Executor(place)
-        exe.run(startup)
-
-        build_strategy = cls.set_strategy(
-            enable_inplace,
-            enable_sequential_execution,
-            fuse_all_optimizer_ops,
-            fuse_all_reduce_ops,
-            fuse_elewise_add_act_ops,
-            fuse_relu_depthwise_conv,
-            use_fast_executor,
-            use_ir_memory_optimize,
-            use_reduce,
-            use_device,
-        )
-
-        if use_parallel_executor:
-            binary = compiler.CompiledProgram(
-                main,
-                build_strategy=build_strategy,
-            )
-        else:
-            binary = main
-
-        if batch_size is not None:
-            batch_size *= (
-                base.core.get_cuda_device_count()
-                if use_device == DeviceType.CUDA
-                else base.core.get_xpu_device_count()
-                if use_device == DeviceType.XPU
-                else int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            )
-
-        area_below_loss = 0
-        begin = time.time()
-        (first_loss,) = run_executor(
-            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]
-        )
-        area_below_loss += 0.5 * first_loss.mean()
-        for _ in range(iter):
-            mid_loss = run_executor(
-                exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]
-            )
-            area_below_loss += mid_loss[0].mean()
-        (last_loss,) = run_executor(
-            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name]
-        )
-        area_below_loss += 0.5 * last_loss.mean()
-        end = time.time()
-
-        if batch_size is not None:
-            print(
-                "%.4f Instance per second"
-                % ((batch_size * iter + 2) / (end - begin))
-            )
-
-        avg_last_loss_val = np.array(last_loss).mean()
-        avg_first_loss_val = np.array(first_loss).mean()
-        if math.isnan(float(avg_last_loss_val)) or math.isnan(
-            float(avg_first_loss_val)
-        ):
-            sys.exit("got NaN loss, training failed.")
-
-        print(first_loss, last_loss, area_below_loss)
-        # self.assertGreater(first_loss[0], last_loss[0])
-        return first_loss, last_loss, area_below_loss
-
-    @classmethod
-    def check_pass_conflict(
-        cls,
-        method,
-        use_device=DeviceType.CUDA,
-        feed_dict=None,
-        get_data_from_feeder=None,
-        use_reduce=False,
-        use_ir_memory_optimize=True,
-        enable_inplace=True,
-        fuse_elewise_add_act_ops=False,
-        fuse_all_optimizer_ops=False,
-        fuse_all_reduce_ops=False,
-        fuse_relu_depthwise_conv=False,
-        optimizer=paddle.optimizer.Adam,
-        use_fast_executor=True,
-        enable_sequential_execution=False,
-    ):
-        main = base.Program()
-        startup = base.Program()
-        with base.program_guard(main, startup):
-            feed_dict, loss = cls.build_model(
-                feed_dict, get_data_from_feeder, main, method, optimizer
-            )
-
-        place = (
-            base.CUDAPlace(0)
-            if use_device == DeviceType.CUDA
-            else base.XPUPlace(0)
-            if use_device == DeviceType.XPU
-            else base.CPUPlace()
-        )
-        exe = base.Executor(place)
-        exe.run(startup)
-
-        build_strategy = cls.set_strategy(
-            enable_inplace,
-            enable_sequential_execution,
-            fuse_all_optimizer_ops,
-            fuse_all_reduce_ops,
-            fuse_elewise_add_act_ops,
-            fuse_relu_depthwise_conv,
-            use_fast_executor,
-            use_ir_memory_optimize,
-            use_reduce,
-            use_device,
-        )
-
-        binary = compiler.CompiledProgram(
-            main,
-            build_strategy=build_strategy,
-        )
-
-        exe.run(binary, feed=feed_dict, fetch_list=[loss.name])
-
-    @classmethod
-    def set_strategy(
-        cls,
-        enable_inplace,
-        enable_sequential_execution,
-        fuse_all_optimizer_ops,
-        fuse_all_reduce_ops,
-        fuse_elewise_add_act_ops,
-        fuse_relu_depthwise_conv,
-        use_fast_executor,
-        use_ir_memory_optimize,
-        use_reduce,
-        use_device,
-    ):
-        build_strategy = base.BuildStrategy()
-        build_strategy.reduce_strategy = (
-            base.BuildStrategy.ReduceStrategy.Reduce
-            if use_reduce
-            else base.BuildStrategy.ReduceStrategy.AllReduce
-        )
-        build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
-        build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
-        build_strategy.fuse_all_optimizer_ops = fuse_all_optimizer_ops
-        build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops
-        build_strategy.memory_optimize = use_ir_memory_optimize
-        build_strategy.enable_inplace = enable_inplace
-        build_strategy.enable_sequential_execution = enable_sequential_execution
-
-        if use_device == DeviceType.CUDA and core.is_compiled_with_cuda():
-            build_strategy.remove_unnecessary_lock = True
-        if use_device == DeviceType.XPU and core.is_compiled_with_xpu():
-            build_strategy.fuse_elewise_add_act_ops = False
-            build_strategy.fuse_relu_depthwise_conv = False
-            build_strategy.fuse_all_optimizer_ops = False
-            build_strategy.memory_optimize = False
-            build_strategy.enable_inplace = False
-            build_strategy.enable_sequential_execution = False
-
-        return build_strategy
-
-    @classmethod
-    def build_model(
-        cls, feed_dict, get_data_from_feeder, main, method, optimizer
-    ):
-        loss = method(use_feed=feed_dict is not None)
-        # NOTE(zjl): memory_optimize/inplace pass would not require
-        # that loss.persistable = True.
-        # We set loss.persistable = False here to verify our memory
-        # optimization strategies intentionally.
-        loss.persistable = False
-        if optimizer:
-            optimizer().minimize(loss)
-
-        if get_data_from_feeder is not None:
-            assert feed_dict is None
-            feed_dict = get_data_from_feeder()
-        return feed_dict, loss
diff --git a/test/legacy_test/seresnext_net.py b/test/legacy_test/seresnext_net.py
index 357b5b7e226b19..ef19deebba3789 100644
--- a/test/legacy_test/seresnext_net.py
+++ b/test/legacy_test/seresnext_net.py
@@ -18,11 +18,12 @@
 
 import os
 
-from seresnext_test_base import DeviceType
 from simple_nets import init_data
 
 import paddle
 
+DeviceType = base.core.DeviceType
+
 os.environ['CPU_NUM'] = str(4)
 os.environ['FLAGS_cudnn_deterministic'] = str(1)
 
diff --git a/test/legacy_test/seresnext_test_base.py b/test/legacy_test/seresnext_test_base.py
deleted file mode 100644
index 73ad9c27c0196f..00000000000000
--- a/test/legacy_test/seresnext_test_base.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import seresnext_net
-from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
-
-from paddle.base import core
-
-
-class TestResnetBase(TestParallelExecutorBase):
-    def _compare_result_with_origin_model(
-        self, check_func, use_device, delta2=1e-5, compare_separately=True
-    ):
-        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
-            return
-
-        (
-            func_1_first_loss,
-            func_1_last_loss,
-            func_1_loss_area,
-        ) = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_device),
-            iter=seresnext_net.iter(use_device),
-            batch_size=seresnext_net.batch_size(use_device),
-            use_device=use_device,
-            use_reduce=False,
-            optimizer=seresnext_net.optimizer,
-        )
-
-        func_2_first_loss, func_2_last_loss, func_2_loss_area = check_func(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_device),
-            iter=seresnext_net.iter(use_device),
-            batch_size=seresnext_net.batch_size(use_device),
-            use_device=use_device,
-        )
-
-        if compare_separately:
-            self.assertAlmostEqual(
-                func_1_first_loss, func_2_first_loss, delta=1e-5
-            )
-            self.assertAlmostEqual(
-                func_1_last_loss, func_2_last_loss, delta=delta2
-            )
-        else:
-            np.testing.assert_allclose(
-                func_1_loss_area, func_2_loss_area, rtol=delta2
-            )
-            self.assertAlmostEqual(
-                func_1_first_loss, func_2_first_loss, delta=1e-5
-            )
-            self.assertAlmostEqual(
-                func_1_last_loss, func_2_last_loss, delta=delta2
-            )
diff --git a/test/legacy_test/test_fuse_all_reduce_pass.py b/test/legacy_test/test_fuse_all_reduce_pass.py
deleted file mode 100644
index 0745844bda323e..00000000000000
--- a/test/legacy_test/test_fuse_all_reduce_pass.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-from functools import partial
-
-from fake_reader import fake_imdb_reader
-from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
-from simple_nets import bow_net, fc_with_batchnorm, init_data, simple_fc_net
-
-import paddle
-from paddle import base
-from paddle.base import core
-
-paddle.enable_static()
-
-
-class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    def compare_fuse_all_reduce_ops(
-        self,
-        model,
-        use_device,
-        init_feed_dict=None,
-        get_data_from_feeder=None,
-        optimizer=None,
-        fuse_all_optimizer_ops=False,
-    ):
-        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
-            return
-        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
-            return
-
-        feed_dict_data = None
-        if init_feed_dict is not None:
-            img, label = init_feed_dict()
-            feed_dict_data = {"image": img, "label": label}
-
-        (
-            not_fuse_op_first_loss,
-            not_fuse_op_last_loss,
-            _,
-        ) = self.check_network_convergence(
-            model,
-            feed_dict=feed_dict_data,
-            get_data_from_feeder=get_data_from_feeder,
-            use_device=use_device,
-            fuse_all_reduce_ops=False,
-            fuse_all_optimizer_ops=fuse_all_optimizer_ops,
-            optimizer=optimizer,
-        )
-        (
-            fuse_op_first_loss,
-            fuse_op_last_loss,
-            _,
-        ) = self.check_network_convergence(
-            model,
-            feed_dict=feed_dict_data,
-            get_data_from_feeder=get_data_from_feeder,
-            use_device=use_device,
-            fuse_all_reduce_ops=True,
-            fuse_all_optimizer_ops=fuse_all_optimizer_ops,
-            optimizer=optimizer,
-        )
-
-        self.assertAlmostEqual(
-            not_fuse_op_first_loss, fuse_op_first_loss, delta=1e-6
-        )
-        self.assertAlmostEqual(
-            not_fuse_op_last_loss, fuse_op_last_loss, delta=1e-6
-        )
-
-    def optimizer(self, learning_rate=1e-3):
-        optimizer = paddle.optimizer.SGD(
-            learning_rate=learning_rate,
-            weight_decay=paddle.regularizer.L2Decay(1e-3),
-        )
-        return optimizer
-
-
-class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
-    def _decorate_compare_fused_all_reduce(self, model, use_device):
-        self.compare_fuse_all_reduce_ops(
-            model,
-            use_device,
-            init_feed_dict=init_data,
-            optimizer=self.optimizer,
-            fuse_all_optimizer_ops=True,
-        )
-
-    def test_simple_fc_with_fuse_all_reduce(self):
-        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
-        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.XPU)
-        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)
-
-    def test_batchnorm_fc_with_fuse_all_reduce(self):
-        self._decorate_compare_fused_all_reduce(
-            fc_with_batchnorm, DeviceType.CUDA
-        )
-        # TODO(wangxi): xpu batch_norm op only support dim = 4
-        # self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
-        #                                         DeviceType.XPU)
-        self._decorate_compare_fused_all_reduce(
-            fc_with_batchnorm, DeviceType.CPU
-        )
-
-
-class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps):
-    def _decorate_compare_fused_all_reduce(self, model, use_device):
-        self.compare_fuse_all_reduce_ops(
-            model,
-            use_device,
-            init_feed_dict=init_data,
-            optimizer=self.optimizer,
-            fuse_all_optimizer_ops=True,
-        )
-
-
-class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-        cls.word_dict_len = 5147
-        batch_size = 64
-        reader = fake_imdb_reader(cls.word_dict_len, batch_size * 100)
-        reader = paddle.batch(reader, batch_size=batch_size)()
-        cls.train_data = next(reader)
-
-    def get_data_from_feeder(self):
-        place = base.CPUPlace()
-        feeder = base.DataFeeder(feed_list=["words", "label"], place=place)
-        return feeder.feed(self.train_data)
-
-    def _decorate_compare_fused_all_reduce(self, model, use_device):
-        self.compare_fuse_all_reduce_ops(
-            model,
-            use_device,
-            get_data_from_feeder=self.get_data_from_feeder,
-            optimizer=self.optimizer,
-        )
-
-    def test_simple_bow_net_with_fuse_all_reduce(self):
-        model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
-        self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
-        # TODO(wangxi): xpu sum op only support LodTensor for now
-        # self._decorate_compare_fused_all_reduce(model, DeviceType.XPU)
-        self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_fuse_elewise_add_act_pass.py b/test/legacy_test/test_fuse_elewise_add_act_pass.py
index b9237a14bd1085..2f61178920a107 100644
--- a/test/legacy_test/test_fuse_elewise_add_act_pass.py
+++ b/test/legacy_test/test_fuse_elewise_add_act_pass.py
@@ -12,86 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 import unittest
 
 import numpy
-from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
-from simple_nets import fc_with_batchnorm, init_data, simple_fc_net
 
 import paddle
 import paddle.nn.functional as F
 from paddle import base
-from paddle.base import core
-
-
-class TestMNIST(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    def _compare_fuse_elewise_add_act_ops(self, model, use_device):
-        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
-            return
-        img, label = init_data()
-
-        def _optimizer(learning_rate=1e-6):
-            optimizer = paddle.optimizer.SGD(
-                learning_rate=learning_rate,
-                weight_decay=paddle.regularizer.L2Decay(1e-6),
-            )
-            return optimizer
-
-        # NOTE(dzh):
-        # need to make it compatible with elewise fuse act
-        # FIXME (liuwei12)
-        # the new memory optimize strategy will crash this unittest
-        # add enable_inplace=False here to force pass the unittest
-        (
-            not_fuse_op_first_loss,
-            not_fuse_op_last_loss,
-            _,
-        ) = self.check_network_convergence(
-            model,
-            feed_dict={"image": img, "label": label},
-            use_device=use_device,
-            fuse_elewise_add_act_ops=False,
-            use_ir_memory_optimize=False,
-            enable_inplace=False,
-            optimizer=_optimizer,
-        )
-        (
-            fuse_op_first_loss,
-            fuse_op_last_loss,
-            _,
-        ) = self.check_network_convergence(
-            model,
-            feed_dict={"image": img, "label": label},
-            use_device=use_device,
-            fuse_elewise_add_act_ops=True,
-            use_ir_memory_optimize=False,
-            enable_inplace=False,
-            optimizer=_optimizer,
-        )
-
-        self.assertAlmostEqual(
-            not_fuse_op_first_loss, fuse_op_first_loss, delta=1e-6
-        )
-        self.assertAlmostEqual(
-            not_fuse_op_last_loss, fuse_op_last_loss, delta=1e-6
-        )
-
-    def test_simple_fc_with_fuse_op(self):
-        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CUDA)
-        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CPU)
-
-    def test_batchnorm_fc_with_fuse_op(self):
-        self._compare_fuse_elewise_add_act_ops(
-            fc_with_batchnorm, DeviceType.CUDA
-        )
-        self._compare_fuse_elewise_add_act_ops(
-            fc_with_batchnorm, DeviceType.CPU
-        )
 
 
 class TestFuseActElewiseAddInplaceGradPass(unittest.TestCase):
diff --git a/test/legacy_test/test_fuse_optimizer_pass.py b/test/legacy_test/test_fuse_optimizer_pass.py
deleted file mode 100644
index 3fa7f3d999a615..00000000000000
--- a/test/legacy_test/test_fuse_optimizer_pass.py
+++ /dev/null
@@ -1,215 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-from functools import partial
-
-from fake_reader import fake_imdb_reader
-from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
-from simple_nets import bow_net, fc_with_batchnorm, init_data
-
-import paddle
-from paddle import base
-from paddle.base import core
-
-
-class TestFuseOptimizationOps(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    def _get_feed_dict(self):
-        img, label = init_data()
-        return {"image": img, "label": label}
-
-    def _compare_fused_optimizer_ops(
-        self,
-        model,
-        use_device,
-        feed_dict=None,
-        get_data_from_feeder=None,
-        optimizer=paddle.optimizer.Adam,
-    ):
-        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
-            return
-
-        (
-            not_fuse_op_first_loss,
-            not_fuse_op_last_loss,
-            _,
-        ) = self.check_network_convergence(
-            model,
-            feed_dict=feed_dict,
-            get_data_from_feeder=get_data_from_feeder,
-            use_device=use_device,
-            fuse_all_optimizer_ops=False,
-            optimizer=optimizer,
-        )
-        (
-            fuse_op_first_loss,
-            fuse_op_last_loss,
-            _,
-        ) = self.check_network_convergence(
-            model,
-            feed_dict=feed_dict,
-            get_data_from_feeder=get_data_from_feeder,
-            use_device=use_device,
-            fuse_all_optimizer_ops=True,
-            optimizer=optimizer,
-        )
-
-        self.assertAlmostEqual(
-            not_fuse_op_first_loss, fuse_op_first_loss, delta=1e-6
-        )
-        self.assertAlmostEqual(
-            not_fuse_op_last_loss, fuse_op_last_loss, delta=1e-6
-        )
-
-    def _decorate_compare_fused_optimizer_ops(
-        self, model, use_device, optimizer
-    ):
-        self._compare_fused_optimizer_ops(
-            model,
-            use_device,
-            feed_dict=self._get_feed_dict(),
-            optimizer=optimizer,
-        )
-
-
-class TestFuseAdamOps(TestFuseOptimizationOps):
-    def optimizer(self, learning_rate=1e-4):
-        return paddle.optimizer.Adam(learning_rate=learning_rate)
-
-    def test_batchnorm_fc_with_fuse_op(self):
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer
-        )
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer
-        )
-
-
-class TestFuseSGDOps(TestFuseAdamOps):
-    def optimizer(self, learning_rate=1e-3):
-        return paddle.optimizer.SGD(learning_rate=learning_rate)
-
-
-class TestFuseMomentumOps(TestFuseAdamOps):
-    def optimizer(self, learning_rate=1e-3):
-        return paddle.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1
-        )
-
-
-class TestSpareFuseAdamOps(TestFuseOptimizationOps):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-        cls.word_dict_len = 5147
-        batch_size = 64
-        reader = fake_imdb_reader(cls.word_dict_len, batch_size * 100)
-        reader = paddle.batch(reader, batch_size=batch_size)()
-        cls.train_data = next(reader)
-
-    def _get_data_from_feeder(self):
-        place = base.CPUPlace()
-        feeder = base.DataFeeder(feed_list=["words", "label"], place=place)
-        return feeder.feed(self.train_data)
-
-    def _decorate_compare_fused_optimizer_ops(
-        self, model, use_device, optimizer
-    ):
-        self._compare_fused_optimizer_ops(
-            model,
-            use_device,
-            get_data_from_feeder=self._get_data_from_feeder,
-            optimizer=optimizer,
-        )
-
-    def optimizer(self, learning_rate=1e-4):
-        return paddle.optimizer.Adam(learning_rate=learning_rate)
-
-    def test_simple_bow_net_with_fuse_op(self):
-        model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
-        self._decorate_compare_fused_optimizer_ops(
-            model, DeviceType.CUDA, optimizer=self.optimizer
-        )
-        self._decorate_compare_fused_optimizer_ops(
-            model, DeviceType.CPU, optimizer=self.optimizer
-        )
-
-
-class TestSpareFuseSGDOps(TestSpareFuseAdamOps):
-    def optimizer(self, learning_rate=1e-3):
-        return paddle.optimizer.SGD(learning_rate=learning_rate)
-
-
-class TestSpareFuseMomentumOps(TestSpareFuseAdamOps):
-    def optimizer(self, learning_rate=1e-3):
-        return paddle.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1
-        )
-
-
-class TestPassConflictBase(TestFuseAdamOps):
-    def _compare_fused_optimizer_ops(
-        self,
-        model,
-        use_device,
-        feed_dict=None,
-        get_data_from_feeder=None,
-        optimizer=paddle.optimizer.Adam,
-    ):
-        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
-            return
-
-        self.check_pass_conflict(
-            model,
-            feed_dict=feed_dict,
-            get_data_from_feeder=get_data_from_feeder,
-            use_device=use_device,
-            fuse_all_optimizer_ops=True,
-            optimizer=optimizer,
-            enable_sequential_execution=True,
-        )
-
-
-class TestFuseAdamOpsPassConflict(TestPassConflictBase):
-    def optimizer(self, learning_rate=1e-4):
-        return paddle.optimizer.Adam(learning_rate=learning_rate)
-
-    def test_batchnorm_fc_with_fuse_op(self):
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer
-        )
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer
-        )
-
-
-class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
-    def optimizer(self, learning_rate=1e-3):
-        return paddle.optimizer.SGD(learning_rate=learning_rate)
-
-
-class TestFuseMomentumOpsPassConflict(TestFuseAdamOpsPassConflict):
-    def optimizer(self, learning_rate=1e-3):
-        return paddle.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1
-        )
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_fuse_relu_depthwise_conv_pass.py b/test/legacy_test/test_fuse_relu_depthwise_conv_pass.py
deleted file mode 100644
index 50392ac9744607..00000000000000
--- a/test/legacy_test/test_fuse_relu_depthwise_conv_pass.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
-
-import paddle
-import paddle.nn.functional as F
-from paddle.base import core
-
-
-def norm(*args, **kargs):
-    return paddle.static.nn.batch_norm(*args, **kargs)
-
-
-def sep_conv(input, channel, stride, filter, dilation=1, act=None):
-    # with scope('depthwise'):
-    input = paddle.static.nn.conv2d(
-        input,
-        input.shape[1],
-        filter,
-        stride,
-        groups=input.shape[1],
-        padding=(filter // 2) * dilation,
-        dilation=dilation,
-        use_cudnn=False,
-        bias_attr=False,
-    )
-    input = norm(input)
-    if act:
-        input = act(input)
-    # with scope('pointwise'):
-    input = paddle.static.nn.conv2d(
-        input, channel, 1, 1, groups=1, padding=0, bias_attr=False
-    )
-    input = norm(input)
-    if act:
-        input = act(input)
-    return input
-
-
-def simple_depthwise_net(use_feed):
-    assert use_feed
-    img = paddle.static.data(name='image', shape=[-1, 784], dtype='float32')
-    label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64')
-    hidden = paddle.reshape(img, (-1, 1, 28, 28))
-    for _ in range(4):
-        hidden = sep_conv(hidden, channel=200, stride=2, filter=5)
-        hidden = F.relu(hidden)
-    prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax')
-    loss = paddle.nn.functional.cross_entropy(
-        input=prediction, label=label, reduction='none', use_softmax=False
-    )
-    loss = paddle.mean(loss)
-    return loss
-
-
-class TestMNIST(TestParallelExecutorBase):
-    def _init_data(self, random=True):
-        np.random.seed(5)
-        if random:
-            img = np.random.random(size=[32, 784]).astype(np.float32)
-        else:
-            img = np.ones(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
-        return img, label
-
-    def _compare(self, model, use_device, random_data=True, only_forward=False):
-        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
-            return
-        img, label = self._init_data(random_data)
-
-        def _optimizer(learning_rate=1e-6):
-            optimizer = paddle.optimizer.SGD(
-                learning_rate=learning_rate,
-                weight_decay=paddle.regularizer.L2Decay(1e-6),
-            )
-            return optimizer
-
-        if only_forward:
-            _optimizer = None
-
-        (
-            fuse_op_first_loss,
-            fuse_op_last_loss,
-            _,
-        ) = self.check_network_convergence(
-            model,
-            feed_dict={"image": img, "label": label},
-            use_device=use_device,
-            fuse_relu_depthwise_conv=True,
-            use_ir_memory_optimize=True,
-            optimizer=_optimizer,
-        )
-        (
-            not_fuse_op_first_loss,
-            not_fuse_op_last_loss,
-            _,
-        ) = self.check_network_convergence(
-            model,
-            feed_dict={"image": img, "label": label},
-            use_device=use_device,
-            fuse_relu_depthwise_conv=False,
-            optimizer=_optimizer,
-        )
-
-        self.assertAlmostEqual(
-            not_fuse_op_first_loss, fuse_op_first_loss, delta=1e-6
-        )
-        self.assertAlmostEqual(
-            not_fuse_op_last_loss, fuse_op_last_loss, delta=1e-6
-        )
-
-    def test_simple_depthwise_with_fuse_op(self):
-        self._compare(simple_depthwise_net, DeviceType.CUDA)
-        self._compare(simple_depthwise_net, DeviceType.CPU)
-
-    def test_simple_depthwise_with_fuse_op_only_forward(self):
-        self._compare(simple_depthwise_net, DeviceType.CUDA, only_forward=True)
-        self._compare(simple_depthwise_net, DeviceType.CPU, only_forward=True)
-
-
-if __name__ == '__main__':
-    paddle.enable_static()
-    unittest.main()
diff --git a/test/legacy_test/test_ir_inplace_pass.py b/test/legacy_test/test_ir_inplace_pass.py
deleted file mode 100644
index c5a5be1168f870..00000000000000
--- a/test/legacy_test/test_ir_inplace_pass.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-
-import numpy as np
-from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
-
-import paddle
-from paddle import base
-from paddle.base import core
-
-
-def fc_with_batchnorm(use_feed):
-    img = paddle.static.data(name='image', shape=[-1, 784], dtype='float32')
-    label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64')
-
-    hidden = img
-    for _ in range(3):
-        hidden = paddle.static.nn.fc(
-            hidden,
-            size=200,
-            activation='tanh',
-            bias_attr=base.ParamAttr(
-                initializer=paddle.nn.initializer.Constant(value=1.0)
-            ),
-        )
-
-        hidden = paddle.static.nn.batch_norm(input=hidden)
-    prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax')
-    loss = paddle.nn.functional.cross_entropy(
-        input=prediction, label=label, reduction='none', use_softmax=False
-    )
-    loss = paddle.mean(loss)
-    return loss
-
-
-class TestIrInplace(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-
-    def _fc_with_batchnorm(self, ir_memory_optimize, enable_inplace):
-        if not core.is_compiled_with_cuda():
-            return
-        np.random.seed(5)
-        img = np.random.random(size=[32, 784]).astype(np.float32)
-        label = np.ones(shape=[32, 1], dtype='int64')
-        self.check_network_convergence(
-            fc_with_batchnorm,
-            feed_dict={"image": img, "label": label},
-            use_device=DeviceType.CUDA,
-            use_ir_memory_optimize=ir_memory_optimize,
-            enable_inplace=enable_inplace,
-        )
-
-    def test_fc_with_batchnorm(self, delta=1e-3):
-        loss00 = self._fc_with_batchnorm(False, False)
-        loss10 = self._fc_with_batchnorm(True, False)
-        loss01 = self._fc_with_batchnorm(False, True)
-        loss11 = self._fc_with_batchnorm(True, True)
-        self.assertAlmostEqual(loss00, loss10, delta=delta)
-        self.assertAlmostEqual(loss00, loss01, delta=delta)
-        self.assertAlmostEqual(loss00, loss11, delta=delta)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_ir_memory_optimize_pass.py b/test/legacy_test/test_ir_memory_optimize_pass.py
deleted file mode 100644
index 6112d0aedd7ad5..00000000000000
--- a/test/legacy_test/test_ir_memory_optimize_pass.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
-
-import paddle
-from paddle.base import core
-
-
-def _feed_data_helper():
-    img = paddle.static.data(name='image', shape=[-1, 784], dtype='float32')
-    label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64')
-    return img, label
-
-
-def simple_fc_net(use_feed):
-    assert use_feed
-    x, y = _feed_data_helper()
-    hidden_layer = 4
-    for _ in range(hidden_layer):
-        x = paddle.static.nn.fc(x, size=20, activation='relu')
-    y_predict = paddle.static.nn.fc(x, size=10, activation='softmax')
-    cost = paddle.nn.functional.cross_entropy(
-        input=y_predict, label=y, reduction='none', use_softmax=False
-    )
-    avg_cost = paddle.mean(cost)
-    return avg_cost
-
-
-def fc_with_inplace_net(use_feed):
-    assert use_feed
-    x, y = _feed_data_helper()
-    fc = paddle.static.nn.fc(x=x, size=20, activation='relu')
-    fc = paddle.static.nn.fc(x=fc, size=10, activation='relu')
-    reshape = paddle.reshape(x=fc, shape=[-1, 2, 5])
-    reshape = paddle.reshape(x=reshape, shape=[-1, 5, 2])
-    y_predict = paddle.static.nn.fc(x=reshape, size=10, activation='softmax')
-    cost = paddle.nn.functional.cross_entropy(
-        input=y_predict, label=y, reduction='none', use_softmax=False
-    )
-    avg_cost = paddle.mean(cost)
-    return avg_cost
-
-
-class TestMNIST(TestParallelExecutorBase):
-    def _dummy_data(self):
-        np.random.seed(5)
-        img = np.random.random(size=[32, 784]).astype(np.float32)
-        label = np.ones(shape=[32, 1], dtype='int64')
-        return img, label
-
-    def _compare_ir_memory_optimize(self, model, use_device):
-        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
-            return
-
-        img, label = self._dummy_data()
-        first_loss0, last_loss0, _ = self.check_network_convergence(
-            model,
-            feed_dict={"image": img, "label": label},
-            use_device=use_device,
-            use_ir_memory_optimize=False,
-        )
-        first_loss1, last_loss1, _ = self.check_network_convergence(
-            model,
-            feed_dict={"image": img, "label": label},
-            use_device=use_device,
-            use_ir_memory_optimize=True,
-        )
-
-        self.assertAlmostEqual(first_loss0, first_loss1, delta=1e-6)
-        self.assertAlmostEqual(last_loss0, last_loss1, delta=1e-6)
-
-    def test_simple_fc_net(self):
-        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CPU)
-        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CUDA)
-
-    def test_fc_with_reshape_net(self):
-        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CPU)
-        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CUDA)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_ir_memory_optimize_transformer.py b/test/legacy_test/test_ir_memory_optimize_transformer.py
deleted file mode 100644
index b3dc82c12e6369..00000000000000
--- a/test/legacy_test/test_ir_memory_optimize_transformer.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-
-from paddle.base import core
-
-os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
-
-from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
-from test_parallel_executor_transformer import get_feed_data_reader, transformer
-
-
-# NOTE(dzhwinter): test diferent strategy colisions.
-# open the eager delete tensor strategy by default.
-class TestTransformerWithIR(TestParallelExecutorBase):
-    def test_main(self):
-        if core.is_compiled_with_cuda():
-            # check python transpiler
-            self.check_network_convergence(
-                transformer,
-                use_device=DeviceType.CUDA,
-                feed_data_reader=get_feed_data_reader(),
-                use_ir_memory_optimize=False,
-                iter=2,
-            )
-            # check IR memory optimize
-            self.check_network_convergence(
-                transformer,
-                use_device=DeviceType.CUDA,
-                feed_data_reader=get_feed_data_reader(),
-                use_ir_memory_optimize=True,
-                iter=2,
-            )
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_mix_precision_all_reduce_fuse.py b/test/legacy_test/test_mix_precision_all_reduce_fuse.py
deleted file mode 100644
index 6887b2d0de6318..00000000000000
--- a/test/legacy_test/test_mix_precision_all_reduce_fuse.py
+++ /dev/null
@@ -1,95 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import nets
-import numpy as np
-from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
-from simple_nets import init_data
-
-import paddle
-from paddle.base import core
-
-batch_size = 12
-img_shape = [1, 28, 28]
-
-
-def loss_net(hidden, label):
-    prediction = paddle.static.nn.fc(x=hidden, size=10, activation='softmax')
-    loss = paddle.nn.functional.cross_entropy(
-        input=prediction, label=label, reduction='none', use_softmax=False
-    )
-    avg_loss = paddle.mean(loss)
-    return avg_loss
-
-
-def conv_net(use_feed):
-    img = paddle.static.data(
-        name='image', shape=[-1] + img_shape, dtype='float16'
-    )
-    label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64')
-
-    conv_pool_1 = nets.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu",
-    )
-    conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1)
-
-    conv_pool_1 = paddle.cast(conv_pool_1, np.float32)
-    conv_pool_2 = nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu",
-    )
-    hidden = paddle.cast(conv_pool_2, np.float32)
-    return loss_net(hidden, label)
-
-
-def _optimizer(learning_rate=1e-6):
-    optimizer = paddle.optimizer.SGD(learning_rate=learning_rate)
-    return optimizer
-
-
-class TestResnet(TestParallelExecutorBase):
-    def check_model(self, use_device):
-        img, label = init_data(
-            batch_size=batch_size, img_shape=img_shape, label_range=9
-        )
-        img = np.float16(img)
-        feed_dict = {"image": img, "label": label}
-
-        TestParallelExecutorBase.check_network_convergence(
-            conv_net,
-            feed_dict=feed_dict,
-            iter=10,
-            use_device=use_device,
-            fuse_all_reduce_ops=True,
-            optimizer=_optimizer,
-        )
-
-    def test_model(self):
-        if core.is_compiled_with_cuda():
-            self.check_model(DeviceType.CUDA)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_parallel_executor_run_cinn.py b/test/legacy_test/test_parallel_executor_run_cinn.py
deleted file mode 100644
index 2ca34842f0b906..00000000000000
--- a/test/legacy_test/test_parallel_executor_run_cinn.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import shutil
-import tempfile
-import unittest
-
-import numpy as np
-
-import paddle
-
-paddle.enable_static()
-
-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO
-)
-logger = logging.getLogger("paddle_with_cinn")
-
-
-def set_cinn_flag(val):
-    cinn_compiled = False
-    try:
-        paddle.set_flags({'FLAGS_use_cinn': val})
-        cinn_compiled = True
-    except ValueError:
-        logger.warning("The used paddle is not compiled with CINN.")
-    return cinn_compiled
-
-
-def reader(limit):
-    for _ in range(limit):
-        yield np.random.random([1, 28]).astype('float32'), np.random.randint(
-            0, 2, size=[1]
-        ).astype('int64')
-
-
-def rand_data(img, label, loop_num=10):
-    feed = []
-    data = reader(loop_num)
-    for _ in range(loop_num):
-        d, l = next(data)
-        feed.append({img: d, label: l})
-    return feed
-
-
-def build_program(main_program, startup_program):
-    with paddle.static.program_guard(main_program, startup_program):
-        img = paddle.static.data(name='img', shape=[1, 28], dtype='float32')
-        param = paddle.create_parameter(
-            name="bias",
-            shape=[1, 28],
-            dtype="float32",
-            attr=paddle.ParamAttr(
-                initializer=paddle.nn.initializer.Assign(
-                    np.random.rand(1, 28).astype(np.float32)
-                )
-            ),
-        )
-        label = paddle.static.data(name="label", shape=[1], dtype='int64')
-
-        hidden = paddle.add(img, param)
-        prediction = paddle.nn.functional.relu(hidden)
-
-        loss = paddle.nn.functional.cross_entropy(input=prediction, label=label)
-        avg_loss = paddle.mean(loss)
-        adam = paddle.optimizer.Adam(learning_rate=0.001)
-        adam.minimize(avg_loss)
-    return img, label, avg_loss
-
-
-def train(dot_save_dir, prefix, seed=1234):
-    np.random.seed(seed)
-    paddle.seed(seed)
-    if paddle.is_compiled_with_cuda():
-        paddle.set_flags({'FLAGS_cudnn_deterministic': 1})
-
-    startup_program = paddle.static.Program()
-    main_program = paddle.static.Program()
-    img, label, loss = build_program(main_program, startup_program)
-
-    place = (
-        paddle.CUDAPlace(0)
-        if paddle.is_compiled_with_cuda()
-        else paddle.CPUPlace()
-    )
-    exe = paddle.static.Executor(place)
-    exe.run(startup_program)
-
-    build_strategy = paddle.static.BuildStrategy()
-    build_strategy.debug_graphviz_path = os.path.join(dot_save_dir, prefix)
-    compiled_program = paddle.static.CompiledProgram(
-        main_program, build_strategy
-    )
-
-    iters = 100
-    feed = rand_data(img.name, label.name, iters)
-    loss_values = []
-    for step in range(iters):
-        loss_v = exe.run(compiled_program, feed=feed[step], fetch_list=[loss])
-        loss_values.append(loss_v[0])
-    return loss_values
-
-
-@unittest.skipIf(not set_cinn_flag(True), "Paddle is not compiled with CINN.")
-class TestParallelExecutorRunCinn(unittest.TestCase):
-    def setUp(self):
-        self.tmpdir = tempfile.mkdtemp(prefix="dots_")
-
-    def tearDown(self):
-        shutil.rmtree(self.tmpdir)
-
-    def test_run_with_cinn(self):
-        cinn_losses = np.array(train(self.tmpdir, "paddle")).flatten()
-        set_cinn_flag(False)
-        pd_losses = np.array(train(self.tmpdir, "cinn")).flatten()
-        np.testing.assert_allclose(
-            cinn_losses, pd_losses, rtol=1e-05, atol=1e-05
-        )
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_parallel_executor_seresnext_base_cpu.py b/test/legacy_test/test_parallel_executor_seresnext_base_cpu.py
deleted file mode 100644
index 7c9c9968c4a182..00000000000000
--- a/test/legacy_test/test_parallel_executor_seresnext_base_cpu.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-from functools import partial
-
-import seresnext_net
-from seresnext_test_base import DeviceType, TestResnetBase
-
-
-class TestResnetCPU(TestResnetBase):
-    def test_seresnext_with_learning_rate_decay(self):
-        # NOTE(zcd): This test is compare the result of use parallel_executor
-        # and executor, and the result of drop_out op and batch_norm op in
-        # this two executor have diff, so the two ops should be removed
-        # from the model.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            use_parallel_executor=False,
-        )
-        self._compare_result_with_origin_model(
-            check_func,
-            use_device=DeviceType.CPU,
-            compare_separately=False,
-            delta2=1e-3,
-        )
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_parallel_executor_seresnext_base_gpu.py b/test/legacy_test/test_parallel_executor_seresnext_base_gpu.py
deleted file mode 100644
index 75bd61f5c6c7d8..00000000000000
--- a/test/legacy_test/test_parallel_executor_seresnext_base_gpu.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-from functools import partial
-
-import seresnext_net
-from seresnext_test_base import DeviceType, TestResnetBase
-
-
-class TestResnetGPU(TestResnetBase):
-    def test_seresnext_with_learning_rate_decay(self):
-        # NOTE(zcd): This test is compare the result of use parallel_executor
-        # and executor, and the result of drop_out op and batch_norm op in
-        # this two executor have diff, so the two ops should be removed
-        # from the model.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            use_parallel_executor=False,
-        )
-        self._compare_result_with_origin_model(
-            check_func,
-            use_device=DeviceType.CUDA,
-            delta2=1e-3,
-            compare_separately=False,
-        )
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py b/test/legacy_test/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py
deleted file mode 100644
index 75d3d85e20e5b9..00000000000000
--- a/test/legacy_test/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle import base
-
-base.core._set_fuse_parameter_group_size(3)
-base.core._set_fuse_parameter_memory_size(131072)
-
-import unittest
-from functools import partial
-
-import seresnext_net
-from seresnext_test_base import DeviceType, TestResnetBase
-
-
-class TestResnetWithFuseAllReduceCPU(TestResnetBase):
-    def test_seresnext_with_fused_all_reduce(self):
-        # NOTE(zcd): In order to make the program faster,
-        # this unit test remove drop_out and batch_norm.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            fuse_all_reduce_ops=True,
-        )
-        self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.CPU
-        )
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py b/test/legacy_test/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
deleted file mode 100644
index 752538efaa0597..00000000000000
--- a/test/legacy_test/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle import base
-
-base.core._set_fuse_parameter_group_size(3)
-base.core._set_fuse_parameter_memory_size(131072)
-
-import unittest
-from functools import partial
-
-import seresnext_net
-from seresnext_test_base import DeviceType, TestResnetBase
-
-
-class TestResnetWithFuseAllReduceGPU(TestResnetBase):
-    def test_seresnext_with_fused_all_reduce(self):
-        # NOTE(zcd): In order to make the program faster,
-        # this unit test remove drop_out and batch_norm.
-        check_func = partial(
-            self.check_network_convergence,
-            optimizer=seresnext_net.optimizer,
-            fuse_all_reduce_ops=True,
-        )
-        self._compare_result_with_origin_model(
-            check_func, use_device=DeviceType.CUDA, delta2=1e-2
-        )
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_parallel_executor_seresnext_with_reduce_cpu.py b/test/legacy_test/test_parallel_executor_seresnext_with_reduce_cpu.py
deleted file mode 100644
index 9dead366227630..00000000000000
--- a/test/legacy_test/test_parallel_executor_seresnext_with_reduce_cpu.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import seresnext_net
-from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
-
-from paddle.base import core
-
-
-class TestResnetWithReduceBase(TestParallelExecutorBase):
-    def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5):
-        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
-            return
-
-        (
-            all_reduce_first_loss,
-            all_reduce_last_loss,
-            _,
-        ) = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_device),
-            iter=seresnext_net.iter(use_device),
-            batch_size=seresnext_net.batch_size(use_device),
-            use_device=use_device,
-            use_reduce=False,
-            optimizer=seresnext_net.optimizer,
-        )
-        reduce_first_loss, reduce_last_loss, _ = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_device),
-            iter=seresnext_net.iter(use_device),
-            batch_size=seresnext_net.batch_size(use_device),
-            use_device=use_device,
-            use_reduce=True,
-            optimizer=seresnext_net.optimizer,
-        )
-
-        self.assertAlmostEqual(
-            all_reduce_first_loss, reduce_first_loss, delta=1e-5
-        )
-        self.assertAlmostEqual(
-            all_reduce_last_loss,
-            reduce_last_loss,
-            delta=all_reduce_last_loss * delta2,
-        )
-
-        if not use_device:
-            return
-
-        (
-            all_reduce_first_loss_seq,
-            all_reduce_last_loss_seq,
-            _,
-        ) = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_device),
-            iter=seresnext_net.iter(use_device),
-            batch_size=seresnext_net.batch_size(use_device),
-            use_device=use_device,
-            use_reduce=False,
-            optimizer=seresnext_net.optimizer,
-            enable_sequential_execution=True,
-        )
-
-        (
-            reduce_first_loss_seq,
-            reduce_last_loss_seq,
-            _,
-        ) = self.check_network_convergence(
-            seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_device),
-            iter=seresnext_net.iter(use_device),
-            batch_size=seresnext_net.batch_size(use_device),
-            use_device=use_device,
-            use_reduce=True,
-            optimizer=seresnext_net.optimizer,
-            enable_sequential_execution=True,
-        )
-
-        self.assertAlmostEqual(
-            all_reduce_first_loss, all_reduce_first_loss_seq, delta=1e-5
-        )
-        self.assertAlmostEqual(
-            all_reduce_last_loss,
-            all_reduce_last_loss_seq,
-            delta=all_reduce_last_loss * delta2,
-        )
-
-        self.assertAlmostEqual(
-            reduce_first_loss, reduce_first_loss_seq, delta=1e-5
-        )
-        self.assertAlmostEqual(
-            reduce_last_loss,
-            reduce_last_loss_seq,
-            delta=reduce_last_loss * delta2,
-        )
-
-        self.assertAlmostEqual(
-            all_reduce_first_loss_seq, reduce_first_loss_seq, delta=1e-5
-        )
-        self.assertAlmostEqual(
-            all_reduce_last_loss_seq,
-            reduce_last_loss_seq,
-            delta=all_reduce_last_loss_seq * delta2,
-        )
-
-
-class TestResnetWithReduceCPU(TestResnetWithReduceBase):
-    def test_seresnext_with_reduce(self):
-        self._compare_reduce_and_allreduce(
-            use_device=DeviceType.CPU, delta2=1e-3
-        )
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_parallel_executor_seresnext_with_reduce_gpu.py b/test/legacy_test/test_parallel_executor_seresnext_with_reduce_gpu.py
deleted file mode 100644
index 187f837e7e7b1e..00000000000000
--- a/test/legacy_test/test_parallel_executor_seresnext_with_reduce_gpu.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from test_parallel_executor_seresnext_with_reduce_cpu import (
-    DeviceType,
-    TestResnetWithReduceBase,
-)
-
-
-class TestResnetWithReduceGPU(TestResnetWithReduceBase):
-    def test_seresnext_with_reduce(self):
-        self._compare_reduce_and_allreduce(
-            use_device=DeviceType.CUDA, delta2=1e-2
-        )
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_parallel_executor_transformer.py b/test/legacy_test/test_parallel_executor_transformer.py
deleted file mode 100644
index d6bcf26c24bbd6..00000000000000
--- a/test/legacy_test/test_parallel_executor_transformer.py
+++ /dev/null
@@ -1,251 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-
-import numpy as np
-import transformer_model
-from feed_data_reader import FeedDataReader
-from parallel_executor_test_base import DeviceType, TestParallelExecutorBase
-
-import paddle
-from paddle.base import core
-from paddle.dataset import wmt16
-
-os.environ['CPU_NUM'] = str(4)
-
-
-class ModelHyperParams:
-    # Dictionary size for source and target language. This model directly uses
-    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
-    # already been added, but the <pad> token is not added. Transformer requires
-    # sequences in a mini-batch are padded to have the same length. A <pad> token is
-    # added into the original dictionary in paddle.dateset.wmt16.
-
-    # size of source word dictionary.
-    src_vocab_size = 10000
-    # index for <pad> token in source language.
-    src_pad_idx = src_vocab_size
-
-    # size of target word dictionary
-    trg_vocab_size = 10000
-    # index for <pad> token in target language.
-    trg_pad_idx = trg_vocab_size
-
-    # position value corresponding to the <pad> token.
-    pos_pad_idx = 0
-
-    # max length of sequences. It should plus 1 to include position
-    # padding token for position encoding.
-    max_length = 50
-
-    # the dimension for word embeddings, which is also the last dimension of
-    # the input and output of multi-head attention, position-wise feed-forward
-    # networks, encoder and decoder.
-
-    d_model = 512
-    # size of the hidden layer in position-wise feed-forward networks.
-    d_inner_hid = 1024
-    # the dimension that keys are projected to for dot-product attention.
-    d_key = 64
-    # the dimension that values are projected to for dot-product attention.
-    d_value = 64
-    # number of head used in multi-head attention.
-    n_head = 8
-    # number of sub-layers to be stacked in the encoder and decoder.
-    # NOTE(zcd): the origin number of layer is 6, to make this unit test faster,
-    # we should reduce the layer number to 4.
-    n_layer = 4
-    # dropout rate used by all dropout layers.
-    dropout = 0.1
-
-
-def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
-    """
-    Pad the instances to the max sequence length in batch, and generate the
-    corresponding position data and attention bias. Then, convert the numpy
-    data to tensors and return a dict mapping names to tensors.
-    """
-
-    def __pad_batch_data(
-        insts,
-        pad_idx,
-        is_target=False,
-        return_pos=True,
-        return_attn_bias=True,
-        return_max_len=True,
-    ):
-        """
-        Pad the instances to the max sequence length in batch, and generate the
-        corresponding position data and attention bias.
-        """
-        return_list = []
-        max_len = max(len(inst) for inst in insts)
-        inst_data = np.array(
-            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]
-        )
-        return_list += [inst_data.astype("int64").reshape([-1, 1])]
-        if return_pos:
-            inst_pos = np.array(
-                [
-                    [
-                        pos_i + 1 if w_i != pad_idx else 0
-                        for pos_i, w_i in enumerate(inst)
-                    ]
-                    for inst in inst_data
-                ]
-            )
-
-            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
-        if return_attn_bias:
-            if is_target:
-                # This is used to avoid attention on paddings and subsequent
-                # words.
-                slf_attn_bias_data = np.ones(
-                    (inst_data.shape[0], max_len, max_len)
-                )
-                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
-                    [-1, 1, max_len, max_len]
-                )
-                slf_attn_bias_data = np.tile(
-                    slf_attn_bias_data, [1, n_head, 1, 1]
-                ) * [-1e9]
-            else:
-                # This is used to avoid attention on paddings.
-                slf_attn_bias_data = np.array(
-                    [
-                        [0] * len(inst) + [-1e9] * (max_len - len(inst))
-                        for inst in insts
-                    ]
-                )
-                slf_attn_bias_data = np.tile(
-                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
-                    [1, n_head, max_len, 1],
-                )
-            return_list += [slf_attn_bias_data.astype("float32")]
-        if return_max_len:
-            return_list += [max_len]
-        return return_list if len(return_list) > 1 else return_list[0]
-
-    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
-        [inst[0] for inst in insts], src_pad_idx, is_target=False
-    )
-    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
-        [inst[1] for inst in insts], trg_pad_idx, is_target=True
-    )
-    trg_src_attn_bias = np.tile(
-        src_slf_attn_bias[:, :, ::src_max_len, :], [1, 1, trg_max_len, 1]
-    ).astype("float32")
-    lbl_word = __pad_batch_data(
-        [inst[2] for inst in insts], trg_pad_idx, False, False, False, False
-    )
-    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
-
-    return [
-        src_word,
-        src_pos,
-        trg_word,
-        trg_pos,
-        src_slf_attn_bias,
-        trg_slf_attn_bias,
-        trg_src_attn_bias,
-        lbl_word,
-        lbl_weight,
-    ]
-
-
-feed_data_reader = None
-
-
-def transformer(use_feed):
-    assert not use_feed, "transformer doesn't support feed yet"
-    return transformer_model.transformer(
-        ModelHyperParams.src_vocab_size + 1,
-        ModelHyperParams.trg_vocab_size + 1,
-        ModelHyperParams.max_length + 1,
-        ModelHyperParams.n_layer,
-        ModelHyperParams.n_head,
-        ModelHyperParams.d_key,
-        ModelHyperParams.d_value,
-        ModelHyperParams.d_model,
-        ModelHyperParams.d_inner_hid,
-        ModelHyperParams.dropout,
-        ModelHyperParams.src_pad_idx,
-        ModelHyperParams.trg_pad_idx,
-        ModelHyperParams.pos_pad_idx,
-    )
-
-
-def get_feed_data_reader():
-    global feed_data_reader
-    if feed_data_reader is not None:
-        return feed_data_reader
-
-    reader = paddle.batch(
-        wmt16.train(
-            ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size
-        ),
-        batch_size=transformer_model.batch_size,
-    )
-    all_batch_tensors = []
-    for batch in reader():
-        tensors = []
-        for tensor in prepare_batch_input(
-            batch,
-            ModelHyperParams.src_pad_idx,
-            ModelHyperParams.trg_pad_idx,
-            ModelHyperParams.n_head,
-        ):
-            tensors.append(np.array(tensor))
-        all_batch_tensors.append(tensors)
-
-    def __reader__():
-        yield from all_batch_tensors
-
-    feed_data_reader = FeedDataReader(
-        feed_list=transformer_model.build_inputs(
-            ModelHyperParams.max_length + 1, ModelHyperParams.n_head
-        ),
-        reader=__reader__,
-    )
-
-    return feed_data_reader
-
-
-class TestTransformer(TestParallelExecutorBase):
-    def test_main(self):
-        if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                transformer,
-                use_device=DeviceType.CUDA,
-                feed_data_reader=get_feed_data_reader(),
-            )
-            self.check_network_convergence(
-                transformer,
-                use_device=DeviceType.CUDA,
-                enable_sequential_execution=True,
-                feed_data_reader=get_feed_data_reader(),
-            )
-        self.check_network_convergence(
-            transformer,
-            use_device=DeviceType.CPU,
-            iter=2,
-            feed_data_reader=get_feed_data_reader(),
-        )
-
-
-if __name__ == '__main__':
-    paddle.enable_static()
-    unittest.main()
diff --git a/test/legacy_test/test_parallel_executor_transformer_auto_growth.py b/test/legacy_test/test_parallel_executor_transformer_auto_growth.py
deleted file mode 100644
index 7f38de13af4cdf..00000000000000
--- a/test/legacy_test/test_parallel_executor_transformer_auto_growth.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/legacy_test/test_program_prune_backward.py b/test/legacy_test/test_program_prune_backward.py
index 581635d5a68ada..36e3fb67c254e8 100755
--- a/test/legacy_test/test_program_prune_backward.py
+++ b/test/legacy_test/test_program_prune_backward.py
@@ -17,16 +17,213 @@
 
 import numpy as np
 import seresnext_net
+import transformer_model
+from feed_data_reader import FeedDataReader
 from simple_nets import fc_with_batchnorm, init_data, simple_fc_net
-from test_parallel_executor_transformer import (
-    DeviceType,
-    get_feed_data_reader,
-    transformer,
-)
 
 import paddle
 from paddle import base
 from paddle.base import core
+from paddle.dataset import wmt16
+
+DeviceType = core.DeviceType
+
+
+class ModelHyperParams:
+    # Dictionary size for source and target language. This model directly uses
+    # paddle.dataset.wmt16, in which the <bos>, <eos> and <unk> tokens have
+    # already been added but the <pad> token has not. Transformer requires the
+    # sequences in a mini-batch to be padded to the same length, so a <pad> token
+    # is added to the original dictionary from paddle.dataset.wmt16.
+
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # index for <pad> token in source language.
+    src_pad_idx = src_vocab_size
+
+    # size of target word dictionary
+    trg_vocab_size = 10000
+    # index for <pad> token in target language.
+    trg_pad_idx = trg_vocab_size
+
+    # position value corresponding to the <pad> token.
+    pos_pad_idx = 0
+
+    # max length of sequences. Callers add 1 to it to account for the
+    # padding position used by the position encoding.
+    max_length = 50
+
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of head used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    # NOTE(zcd): the original number of layers is 6; to make this unit test
+    # faster, it is reduced to 4 here.
+    n_layer = 4
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias. Returns a list of numpy
+    arrays (words, positions, attention biases, labels and label weights).
+    """
+
+    def __pad_batch_data(
+        insts,
+        pad_idx,
+        is_target=False,
+        return_pos=True,
+        return_attn_bias=True,
+        return_max_len=True,
+    ):
+        """
+        Pad the instances to the max sequence length in batch, and generate the
+        corresponding position data and attention bias.
+        """
+        return_list = []
+        max_len = max(len(inst) for inst in insts)
+        inst_data = np.array(
+            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]
+        )
+        return_list += [inst_data.astype("int64").reshape([-1, 1])]
+        if return_pos:
+            inst_pos = np.array(
+                [
+                    [
+                        pos_i + 1 if w_i != pad_idx else 0
+                        for pos_i, w_i in enumerate(inst)
+                    ]
+                    for inst in inst_data
+                ]
+            )
+
+            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+        if return_attn_bias:
+            if is_target:
+                # This is used to avoid attention on paddings and subsequent
+                # words.
+                slf_attn_bias_data = np.ones(
+                    (inst_data.shape[0], max_len, max_len)
+                )
+                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
+                    [-1, 1, max_len, max_len]
+                )
+                slf_attn_bias_data = np.tile(
+                    slf_attn_bias_data, [1, n_head, 1, 1]
+                ) * [-1e9]
+            else:
+                # This is used to avoid attention on paddings.
+                slf_attn_bias_data = np.array(
+                    [
+                        [0] * len(inst) + [-1e9] * (max_len - len(inst))
+                        for inst in insts
+                    ]
+                )
+                slf_attn_bias_data = np.tile(
+                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
+                    [1, n_head, max_len, 1],
+                )
+            return_list += [slf_attn_bias_data.astype("float32")]
+        if return_max_len:
+            return_list += [max_len]
+        return return_list if len(return_list) > 1 else return_list[0]
+
+    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
+        [inst[0] for inst in insts], src_pad_idx, is_target=False
+    )
+    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
+        [inst[1] for inst in insts], trg_pad_idx, is_target=True
+    )
+    trg_src_attn_bias = np.tile(
+        src_slf_attn_bias[:, :, ::src_max_len, :], [1, 1, trg_max_len, 1]
+    ).astype("float32")
+    lbl_word = __pad_batch_data(
+        [inst[2] for inst in insts], trg_pad_idx, False, False, False, False
+    )
+    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
+
+    return [
+        src_word,
+        src_pos,
+        trg_word,
+        trg_pos,
+        src_slf_attn_bias,
+        trg_slf_attn_bias,
+        trg_src_attn_bias,
+        lbl_word,
+        lbl_weight,
+    ]
+
+
+feed_data_reader = None
+
+
+def transformer(use_feed):
+    assert not use_feed, "transformer doesn't support feed yet"
+    return transformer_model.transformer(
+        ModelHyperParams.src_vocab_size + 1,
+        ModelHyperParams.trg_vocab_size + 1,
+        ModelHyperParams.max_length + 1,
+        ModelHyperParams.n_layer,
+        ModelHyperParams.n_head,
+        ModelHyperParams.d_key,
+        ModelHyperParams.d_value,
+        ModelHyperParams.d_model,
+        ModelHyperParams.d_inner_hid,
+        ModelHyperParams.dropout,
+        ModelHyperParams.src_pad_idx,
+        ModelHyperParams.trg_pad_idx,
+        ModelHyperParams.pos_pad_idx,
+    )
+
+
+def get_feed_data_reader():
+    global feed_data_reader
+    if feed_data_reader is not None:
+        return feed_data_reader
+
+    reader = paddle.batch(
+        wmt16.train(
+            ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size
+        ),
+        batch_size=transformer_model.batch_size,
+    )
+    all_batch_tensors = []
+    for batch in reader():
+        tensors = []
+        for tensor in prepare_batch_input(
+            batch,
+            ModelHyperParams.src_pad_idx,
+            ModelHyperParams.trg_pad_idx,
+            ModelHyperParams.n_head,
+        ):
+            tensors.append(np.array(tensor))
+        all_batch_tensors.append(tensors)
+
+    def __reader__():
+        yield from all_batch_tensors
+
+    feed_data_reader = FeedDataReader(
+        feed_list=transformer_model.build_inputs(
+            ModelHyperParams.max_length + 1, ModelHyperParams.n_head
+        ),
+        reader=__reader__,
+    )
+
+    return feed_data_reader
 
 
 def simple_fc_net_with_accuracy(use_feed):
diff --git a/test/legacy_test/test_py_func_op.py b/test/legacy_test/test_py_func_op.py
index 1706ad14d644d7..3fa249935406fc 100644
--- a/test/legacy_test/test_py_func_op.py
+++ b/test/legacy_test/test_py_func_op.py
@@ -19,7 +19,6 @@
 
 import paddle
 from paddle import base
-from paddle.base import compiler
 
 dev_cnt = 2
 if base.core.is_compiled_with_cuda():
@@ -171,7 +170,7 @@ def reader():
         )
 
 
-def test_main(use_cuda, use_py_func_op, use_parallel_executor):
+def test_main(use_cuda, use_py_func_op):
     if use_cuda and not base.core.is_compiled_with_cuda():
         return None
 
@@ -197,12 +196,7 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor):
             exe.run(base.default_startup_program())
 
             train_cp = base.default_main_program()
-
-            if use_parallel_executor:
-                train_cp = compiler.CompiledProgram(base.default_main_program())
-                fetch_list = [loss.name]
-            else:
-                fetch_list = [loss]
+            fetch_list = [loss]
 
             ret = []
             for epoch_id in range(2):
@@ -215,16 +209,11 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor):
 
 
 class TestPyFuncOpUseExecutor(unittest.TestCase):
-    def setUp(self):
-        self.use_parallel_executor = False
-
     def test_loss_diff(self):
         for use_cuda in [True, False]:
             losses = []
             for use_py_func_op in [True, False]:
-                L = test_main(
-                    use_cuda, use_py_func_op, self.use_parallel_executor
-                )
+                L = test_main(use_cuda, use_py_func_op)
                 if L is not None:
                     losses.append(L)
 
@@ -233,10 +222,5 @@ def test_loss_diff(self):
                     self.assertAlmostEqual(max_diff, 0, delta=1e-3)
 
 
-class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor):
-    def setUp(self):
-        self.use_parallel_executor = True
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/standalone_executor/test_standalone_executor.py b/test/standalone_executor/test_standalone_executor.py
index 6c510c77ca1f99..934558c170f513 100644
--- a/test/standalone_executor/test_standalone_executor.py
+++ b/test/standalone_executor/test_standalone_executor.py
@@ -70,9 +70,6 @@ def setUp(self):
         )
         self.perf_path = './perfstat'
 
-    def test_parallel_executor_statistics(self):
-        self.run_with_statistics(executor='ParallelExecutor')
-
     def test_executor_statistics(self):
         self.run_with_statistics(executor='Executor')
 
@@ -88,13 +85,6 @@ def run_with_statistics(self, executor=None):
         # note: startup program is empty
         main_program, startup_program, fetch_list = build_program()
 
-        enable = True
-        if executor == 'ParallelExecutor':
-            main_program = paddle.base.compiler.CompiledProgram(main_program)
-            enable = False
-        elif executor == 'Executor':
-            enable = False
-
         scope = paddle.static.Scope()
         with paddle.static.scope_guard(scope):
             exe = paddle.static.Executor(self.place)
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index 999ae623e9e2d7..f8c236265ae27d 100755
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -624,7 +624,6 @@
     'test_memory_analysis',
     'test_matrix_rank_op',
     'test_merged_momentum_op',
-    'test_parallel_executor_run_cinn',
     'test_parallel_dygraph_dataparallel_cpuonly',
     'test_eigvals_op',
     'test_sparse_attention_op',
@@ -670,9 +669,7 @@
     'test_analyzer_int8_googlenet',
     'test_analyzer_seq_pool1_compare_determine',
     'save_quant2_model_ernie',
-    'test_parallel_executor_seresnext_with_fuse_all_reduce_cpu',
     'test_dataset_uci_housing',
-    'test_parallel_executor_seresnext_base_cpu',
     'test_dataset_download',
     'test_quant_int8_mobilenetv1_mkldnn',
     'test_crf_decoding_op',
@@ -688,7 +685,6 @@
     'test_weight_quantization_mobilenetv1',
     'test_concat_mkldnn_op',
     'test_gaussian_random_mkldnn_op',
-    'test_parallel_executor_seresnext_with_reduce_cpu',
     'test_dataset_imikolov',
     'test_analyzer_rnn1',
     'test_conv2d_mkldnn_op',
@@ -807,7 +803,6 @@
     'test_maximum_op',
     'test_rnn_cell_api',
     'device_code_test',
-    'test_ir_inplace_pass',
     'test_cos_sim_op',
     'test_lite_tensor_utils',
     'test_fit_a_line',
@@ -890,7 +885,6 @@
     'test_scale_mkldnn_op',
     'test_load_state_dict_from_old_format',
     'test_lookup_table_v2_op',
-    'test_mix_precision_all_reduce_fuse',
     'test_spp_op',
     'test_op_converter',
     'test_mixed_vector',
@@ -921,7 +915,6 @@
     'test_run_program_op',
     'test_cuda_random_seed',
     'test_linear_interp_op',
-    'test_fuse_all_reduce_pass',
     'tensor_util_test',
     'test_median',
     'test_nanmedian',
@@ -1027,7 +1020,6 @@
     'test_gather_tree_op',
     'test_elementwise_mul_op',
     'test_cycle_gan',
-    'test_parallel_executor_transformer_auto_growth',
     'test_bitwise_op',
     'test_uniform_random_op',
     'trt_split_converter_test',
@@ -1083,7 +1075,6 @@
     'test_imperative_layer_children',
     'nccl_op_test',
     'test_share_data_op',
-    'test_ir_memory_optimize_transformer',
     'test_math_op_patch',
     'test_base_layer',
     'test_dequantize_log_op',
@@ -1101,7 +1092,6 @@
     'test_affine_channel_op',
     'test_leaky_relu_grad_grad_functor',
     'test_ctc_align',
-    'test_fuse_relu_depthwise_conv_pass',
     'test_complex_kron',
     'test_imperative_skip_op',
     'test_dgc_op',
@@ -1253,7 +1243,6 @@
     'test_conv_elementwise_add2_act_fuse_pass',
     'test_imperative_container_layerlist',
     'test_dequantize_abs_max_op',
-    'test_fuse_optimizer_pass',
     'test_optimizer',
     'test_dynamic_rnn_stop_gradient',
     'test_raw_program_optimizer',
@@ -1355,7 +1344,6 @@
     'test_gradient_accmulator',
     'test_instance_norm_op_v2',
     'test_mobile_net',
-    'test_parallel_executor_transformer',
     'test_tensor_scalar_type_promotion_dynamic',
     'test_eager_deletion_delete_vars',
     'test_asp_pruning_1d',
@@ -1382,7 +1370,6 @@
     'test_tensorrt_engine',
     'test_affine_grid_function',
     'test_nonzero_api',
-    'test_ir_memory_optimize_pass',
     'test_reduce_mkldnn_op',
     'test_bilinear_interp_op',
     'test_cvm_op',
@@ -1464,9 +1451,6 @@
     'test_save_inference_model',
     'test_smooth_l1_loss',
     'test_bilateral_slice_op',
-    'test_parallel_executor_seresnext_base_gpu',
-    'test_parallel_executor_seresnext_with_fuse_all_reduce_gpu',
-    'test_parallel_executor_seresnext_with_reduce_gpu',
     'test_data_norm_op',
     'test_install_check',
     'graph_node_test',
@@ -2164,7 +2148,6 @@
     'test_analyzer_capi_exp_xpu',
     'test_egr_task_autocodegen',
     'test_static_save_load_bf16',
-    'test_parallel_executor_run_cinn',
     'test_egr_task_tensor_utils',
     'test_egr_task_hook',
     'test_egr_task_forward_autograd',
@@ -2279,15 +2262,12 @@
     'test_fused_transformer_encoder_layer',
     'test_eager_deletion_while_op',
     'test_dataloader_unkeep_order',
-    'test_parallel_executor_profiler',
     'test_correlation',
-    'test_ir_inplace_pass',
     'test_moving_average_abs_max_scale_op',
     'test_flatten_contiguous_range_op',
     'test_transforms',
     'test_sum_op',
     'test_scatter_op',
-    'test_mix_precision_all_reduce_fuse',
     'test_tensorrt_engine_op',
     'test_zeropad2d',
     'test_isclose_op',
@@ -2880,7 +2860,6 @@
     'test_user_defined_quantization',
     'test_quantization_scale_pass',
     'feed_forward_test',
-    'test_fuse_optimizer_pass',
     'test_standalone_executor',
     'test_imperative_qat_user_defined',
     'test_mkldnn_fc_act_fuse_pass',
@@ -2888,7 +2867,6 @@
     'test_signal',
     'test_fused_feedforward_op',
     'test_weight_decay_extend',
-    'test_fuse_relu_depthwise_conv_pass',
     'test_diag_v2',
     'test_tensordot',
     'test_rnn_decode_api',
@@ -2913,7 +2891,6 @@
     'test_multinomial_op',
     'test_fused_elemwise_activation_op',
     'test_profiler',
-    'test_ir_memory_optimize_pass',
     'test_callback_reduce_lr_on_plateau',
     'test_paddle_save_load',
     'test_stack_op',
@@ -3055,10 +3032,8 @@
     'test_squeeze2_mkldnn_op',
     'test_conv2d_transpose_bf16_mkldnn_op',
     'test_slice_mkldnn_op',
-    'test_parallel_executor_seresnext_base_cpu',
     'test_stack_mkldnn_op',
     'test_softplus_mkldnn_op',
-    'test_parallel_executor_seresnext_with_reduce_cpu',
     'test_nearest_interp_v2_mkldnn_op',
     'test_fusion_lstm_mkldnn_op',
     'test_fuse_resnet_unit',
@@ -3066,7 +3041,6 @@
     'test_uniform_random_bf16_op',
     'test_reshape_mkldnn_op',
     'test_reduce_bf16_mkldnn_op',
-    'test_parallel_executor_seresnext_with_fuse_all_reduce_cpu',
     'test_nearest_interp_mkldnn_op',
     'test_ir_graph_to_program_pass',
     'test_fusion_lstm_int8_mkldnn_op',
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index fe6c5814dbd8b7..9c5f73d7665c6e 100755
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -217,9 +217,6 @@
     'test_functional_conv2d_transpose',
     'test_functional_conv3d',
     'test_functional_conv3d_transpose',
-    'test_fuse_all_reduce_pass',
-    'test_fuse_optimizer_pass',
-    'test_fuse_relu_depthwise_conv_pass',
     'test_fused_elemwise_activation_op',
     'test_fused_emb_seq_pool_op',
     'test_fused_embedding_fc_lstm_op',
@@ -279,6 +276,7 @@
     'test_instance_norm_op_v2',
     'test_inverse_op',
     'test_io_save_load',
+    'test_iou_similarity_op',
     'test_ir_memory_optimize_pass',
     'test_kldiv_loss_op',
     'test_kron_op',
@@ -503,14 +501,8 @@
     'test_transpiler_ops',
     'test_communicator_sync',
     'test_collective_optimizer',
-    'test_parallel_executor_profiler',
-    'test_parallel_executor_transformer',
-    'test_parallel_executor_transformer_auto_growth',
     'test_data_norm_op',
     'test_fuse_bn_act_pass',
-    'test_parallel_executor_seresnext_base_cpu',
-    'test_parallel_executor_seresnext_with_reduce_cpu',
-    'test_parallel_executor_seresnext_with_fuse_all_reduce_cpu',
     'test_layers',
     'test_sequence_concat',
     'test_sequence_conv',
@@ -612,12 +604,9 @@
     'test_fleet_metric',
     'test_fused_bn_add_act',
     'test_fused_multihead_matmul_op',
-    'test_ir_inplace_pass',
-    'test_mix_precision_all_reduce_fuse',
     'test_rank_attention_op',
     'test_fleet_base',
     'test_fleet_meta_optimizer_base',
-    'test_ir_memory_optimize_transformer',
     'test_trt_fc_fuse_pass',
     'test_trt_quant_conv2d_dequant_fuse_pass',
     'test_trt_slice_plugin',
@@ -640,9 +629,6 @@
     'test_trt_pad_op',
     'test_trt_shuffle_channel_detect_pass',
     'test_trt_subgraph_pass',
-    'test_parallel_executor_seresnext_base_gpu',
-    'test_parallel_executor_seresnext_with_fuse_all_reduce_gpu',
-    'test_parallel_executor_seresnext_with_reduce_gpu',
     'test_sync_batch_norm_op',
     'test_multiprocess_dataloader_iterable_dataset_static',
     'test_multiprocess_dataloader_static',
diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh
index a11e3ad47724f7..29b71c4306ee8c 100644
--- a/tools/windows/run_unittests.sh
+++ b/tools/windows/run_unittests.sh
@@ -24,15 +24,10 @@ disable_wingpu_test="^test_model$|\
 ^test_generator_dataloader$|\
 ^test_parallel_dygraph_sync_batch_norm$|\
 ^test_py_reader_using_executor$|\
-^test_parallel_executor_seresnext_base_gpu$|\
-^test_parallel_executor_seresnext_with_fuse_all_reduce_gpu$|\
-^test_parallel_executor_seresnext_with_reduce_gpu$|\
 ^test_program_prune_backward$|\
 ^test_decoupled_py_reader_data_check$|\
 ^test_fleet_base_single$|\
 ^test_multiprocess_dataloader_iterable_dataset_dynamic$|\
-^test_parallel_executor_feed_persistable_var$|\
-^test_parallel_executor_inference_feed_partial_data$|\
 ^test_py_reader_combination$|\
 ^test_py_reader_pin_memory$|\
 ^test_py_reader_push_pop$|\
@@ -76,7 +71,6 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\
 ^test_elementwise_add_mkldnn_op$|\
 ^test_comp_high_grad$|\
 ^test_multi_precision_fp16_train$|\
-^test_fuse_relu_depthwise_conv_pass$|\
 ^test_imperative_skip_op$|\
 ^test_qat$|\
 ^test_standalone_cuda_graph_multi_stream$|\
@@ -209,7 +203,6 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\
 ^test_argsort_op$|\
 ^test_image_classification_fp16$|\
 ^test_imperative_double_grad$|\
-^test_parallel_executor_transformer$|\
 ^test_se_resnet$|\
 ^test_standalone_executor_aot_choose_kernel$|\
 ^test_imperative_qat_user_defined$|\
@@ -217,7 +210,6 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\
 ^test_callback_reduce_lr_on_plateau$|\
 ^test_callback_visualdl$|\
 ^test_callback_wandb$|\
-^test_mix_precision_all_reduce_fuse$|\
 ^test_user_defined_quantization$|\
 ^test_quantization_scale_pass$|\
 ^test_quantization_pass$|\
@@ -399,10 +391,7 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\
 ^test_model$|\
 ^test_py_reader_combination$|\
 ^test_py_reader_push_pop$|\
-^test_parallel_executor_feed_persistable_var$|\
-^test_parallel_executor_inference_feed_partial_data$|\
 ^test_reader_reset$|\
-^test_parallel_executor_seresnext_base_gpu$|\
 ^test_py_reader_pin_memory$|\
 ^test_multiprocess_dataloader_iterable_dataset_dynamic$|\
 ^test_multiprocess_dataloader_iterable_dataset_static$|\
@@ -432,8 +421,6 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\
 ^test_trt_convert_multihead_matmul$|\
 ^test_trt_convert_prelu$|\
 ^test_trt_fc_fuse_quant_dequant_pass$|\
-^test_parallel_executor_seresnext_with_fuse_all_reduce_gpu$|\
-^test_parallel_executor_seresnext_with_reduce_gpu$|\
 ^test_api_impl$|\
 ^test_tensordot$|\
 ^disable_win_inference_test$|\