diff --git a/CMakeLists.txt b/CMakeLists.txt index f30671bd3a87e8..28dc39920c6d07 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,13 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(system) +# Note(zhouwei): Ninja Generator will set CMAKE_BUILD_TYPE to Debug +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + project(paddle CXX C) # enable language CUDA @@ -213,12 +220,6 @@ if(NOT PY_VERSION) endif() set(PYBIND11_PYTHON_VERSION ${PY_VERSION}) -# CMAKE_BUILD_TYPE -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" - FORCE) -endif() # the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined. Default: OFF if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 7f2addb02d36dd..033b40622e2594 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -205,23 +205,16 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") if(WIN32) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"/wd4244 /wd4267 /wd4819 \"") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /bigobj") - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - # match the cl's _ITERATOR_DEBUG_LEVEL - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"-g -G -D_DEBUG\"") - if(MSVC_STATIC_CRT) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MTd") - else() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MDd") - endif() - elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"-DNDEBUG\"") - if(MSVC_STATIC_CRT) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MT") - else() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MD") - endif() - else() - message(FATAL "Windows only support Release or Debug build now. 
Please set visual studio build type to Release/Debug, x64 build.") + if(MSVC_STATIC_CRT) + set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -Xcompiler /MTd") + set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler /MT") + foreach(flag_var + CMAKE_CUDA_FLAGS CMAKE_CUDA_FLAGS_DEBUG CMAKE_CUDA_FLAGS_RELEASE + CMAKE_CUDA_FLAGS_MINSIZEREL CMAKE_CUDA_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "-MD") + string(REGEX REPLACE "-MD" "-MT" ${flag_var} "${${flag_var}}") + endif() + endforeach(flag_var) endif() endif() diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 4e0768fc10f03f..c37e28523f43c5 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -60,8 +60,8 @@ ExternalProject_Add( DEPENDS ${MKLDNN_DEPENDS} PREFIX ${MKLDNN_PREFIX_DIR} SOURCE_DIR ${MKLDNN_SOURCE_DIR} - BUILD_ALWAYS 1 - # UPDATE_COMMAND "" + UPDATE_COMMAND "" + #BUILD_ALWAYS 1 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index c591a9391dfa5d..b0ea338d20525d 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -100,9 +100,9 @@ else() "${WARPCTC_DOWNLOAD_CMD}" PREFIX ${WARPCTC_PREFIX_DIR} SOURCE_DIR ${WARPCTC_SOURCE_DIR} - #UPDATE_COMMAND "" + UPDATE_COMMAND "" PATCH_COMMAND "" - BUILD_ALWAYS 1 + #BUILD_ALWAYS 1 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} diff --git a/cmake/flags.cmake b/cmake/flags.cmake index a2ddad557c2956..94fd29b905009b 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -28,7 +28,12 @@ function(CheckCompilerCXX14Flag) endfunction() CheckCompilerCXX14Flag() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") +if(NOT WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") +else() + set(CMAKE_CXX_STANDARD 14) +endif() + # safe_set_flag # # Set a compile flag only if compiler is support diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 53dcde616b261d..cea65f17fbe836 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -92,7 +92,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) # including io directory for inference lib paddle_api.h include_directories("${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io") -if(NOT APPLE) +if(NOT APPLE AND NOT WIN32) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) if(WITH_PSLIB OR WITH_DISTRIBUTE) @@ -100,7 +100,7 @@ if(NOT APPLE) else() set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") endif() -endif(NOT APPLE) +endif() set_property(GLOBAL PROPERTY FLUID_MODULES "") # find all fluid modules is used for paddle fluid static library diff --git a/cmake/init.cmake b/cmake/init.cmake index 4bdcaeb4c5f3c0..0ebcdc8ceeebca 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -17,16 +17,30 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") + + if(WITH_GPU) + set(CMAKE_CUDA_FLAGS_DEBUG "-g") + set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") + set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG") + endif() else() - set(CMAKE_C_FLAGS_DEBUG "/Zi /DEBUG") - set(CMAKE_C_FLAGS_RELEASE "/O2 /DNDEBUG") - set(CMAKE_C_FLAGS_RELWITHDEBINFO "/O2 /DNDEBUG") - set(CMAKE_C_FLAGS_MINSIZEREL "/Os /DNDEBUG") + 
set(CMAKE_C_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1") + set(CMAKE_C_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG") + set(CMAKE_C_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG") - set(CMAKE_CXX_FLAGS_DEBUG "/Zi /DEBUG") - set(CMAKE_CXX_FLAGS_RELEASE "/O2 /DNDEBUG") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/O2 /DNDEBUG") - set(CMAKE_CXX_FLAGS_MINSIZEREL "/Os /DNDEBUG") + set(CMAKE_CXX_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1") + set(CMAKE_CXX_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG") + set(CMAKE_CXX_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG") + + if(WITH_GPU) + set(CMAKE_CUDA_FLAGS_DEBUG "-Xcompiler=\"-MDd -Zi -Ob0 -Od /RTC1\"") + set(CMAKE_CUDA_FLAGS_RELEASE "-Xcompiler=\"-MD -O2 -Ob2\" -DNDEBUG") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-Xcompiler=\"-MD -Zi -O2 -Ob1\" -DNDEBUG") + set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Xcompiler=\"-MD -O1 -Ob1\" -DNDEBUG") + endif() # It can specify CUDA compile flag manualy, # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous @@ -34,10 +48,3 @@ else() # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() - -if(WITH_GPU) - set(CMAKE_CUDA_FLAGS_DEBUG "-g") - set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG") - set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") - set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG") -endif() diff --git a/paddle/fluid/distributed/common/sparse_sharding_merge.h b/paddle/fluid/distributed/common/sparse_sharding_merge.h new file mode 100644 index 00000000000000..3f84b5c4b212e2 --- /dev/null +++ b/paddle/fluid/distributed/common/sparse_sharding_merge.h @@ -0,0 +1,311 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once
+#include <ThreadPool.h>
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include <sys/time.h>
+#include "boost/lexical_cast.hpp"
+#include "glog/logging.h"
+#include "paddle/fluid/distributed/common/utils.h"
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/dim.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/string/split.h"
+
+constexpr int FG = 256 * 1024 * 1024;
+constexpr int Q_SIZE = 10000;
+constexpr int BUCKET = 10;
+constexpr char XEOF[] = "EOF";
+
+using boost::lexical_cast;
+
+inline double GetCurrentUS() {
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+6 * time.tv_sec + time.tv_usec;
+}
+
+namespace paddle {
+namespace distributed {
+
+class ShardingMerge {
+ public:
+  ShardingMerge() {}
+  ~ShardingMerge() {}
+
+  void Merge(const std::vector<std::string> &inputs,
+             const std::vector<int64_t> &feasigns, const std::string &output,
+             const int embedding_dim) {
+    pool_.reset(new ::ThreadPool(inputs.size()));
+
+    std::vector<std::future<int>> tasks(inputs.size());
+    std::vector<std::vector<int64_t>> rows;
+    rows.resize(inputs.size());
+
+    auto begin = GetCurrentUS();
+    for (int x = 0; x < inputs.size(); ++x) {
+      tasks[x] = pool_->enqueue([this, x, &rows, &inputs, &feasigns]() -> int {
+        DeserializeRowsFromFile(inputs[x], feasigns[x], &rows[x]);
+        return 0;
+      });
+    }
+
+    for (size_t x = 0; x < tasks.size(); ++x) {
+      tasks[x].wait();
+    }
+
+    int64_t total_rows = 0;
+    for (auto x = 0; x < rows.size(); x++) {
+      total_rows += rows[x].size();
+    }
+
+    auto end = GetCurrentUS();
+
+    VLOG(0) << "got " << total_rows
+            << " feasign ids from sparse embedding using " << end - begin;
+
+    std::vector<int64_t> total_dims = {total_rows,
+                                       static_cast<int64_t>(embedding_dim)};
+
+    std::vector<std::vector<int64_t>> batch_buckets;
+    batch_buckets.resize(inputs.size());
+
+    for (int x = 0; x < rows.size(); ++x) {
+      batch_buckets[x] = bucket(rows[x].size(), BUCKET);
+    }
+
+    std::ofstream out(output, std::ios::binary);
+
+    begin = GetCurrentUS();
+    SerializeRowsToStream(out, rows, batch_buckets, total_rows);
+    end = GetCurrentUS();
+    VLOG(0) << "write rows to ostream using " << end - begin;
+
+    begin = GetCurrentUS();
+    SerializePreTensorToStream(out, total_dims);
+    end = GetCurrentUS();
+    VLOG(0) << "write pretensor to ostream using " << end - begin;
+
+    begin = GetCurrentUS();
+    SerializeValueToStream(out, inputs, batch_buckets, embedding_dim);
+    end = GetCurrentUS();
+    VLOG(0) << "write values to ostream using " << end - begin;
+  }
+
+ private:
+  void SerializeRowsToStream(std::ostream &os,
+                             const std::vector<std::vector<int64_t>> &rows,
+                             const std::vector<std::vector<int64_t>> &batch_buckets,
+                             int64_t total_rows) {
+    { // the 1st field, uint32_t version
+      constexpr uint32_t version = 0;
+      os.write(reinterpret_cast<const char *>(&version), sizeof(version));
+    }
+
+    {
+      // the 2nd field, rows information
+      os.write(reinterpret_cast<const char *>(&total_rows),
+               sizeof(total_rows));
+
+      for (int b = 0; b < BUCKET; ++b) {
+        for (int x = 0; x < batch_buckets.size(); ++x) {
+          auto begin = batch_buckets[x][b];
+          auto end = batch_buckets[x][b + 1];
+
+          if (end - begin == 0) continue;
+
+          os.write(reinterpret_cast<const char *>(rows[x].data() + begin),
+                   sizeof(int64_t) * (end - begin));
+        }
+      }
+
+      // the 3rd field, the height of SelectedRows
+      int64_t height = total_rows;
+      os.write(reinterpret_cast<const char *>(&height), sizeof(height));
+    }
+  }
+
+  void SerializePreTensorToStream(std::ostream &os,
+                                  const std::vector<int64_t> &dims) {
+    { // the 1st field, uint32_t version
+      constexpr uint32_t version = 0;
+      
os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + framework::proto::VarType::TensorDesc desc; + desc.set_data_type(framework::proto::VarType::FP32); + auto *pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + } + + void SerializeValueToVec(std::ifstream &in, const int batch, + const int embedding_dim, std::vector *out) { + auto queue = + std::make_shared>>(); + + auto read = [batch, &in, &queue]() { + std::string line; + std::vector columns; + std::vector values_str; + + int count = 0; + + while (std::getline(in, line)) { + ++count; + columns = string::Split(line, '\t'); + + if (columns.size() != 5) { + VLOG(0) << "unexpected line: " << line << ", skip it"; + continue; + } + + values_str = string::Split(columns[4], ','); + queue->Push(values_str); + + if (count >= batch) { + break; + } + } + queue->Push({}); + }; + + auto write = [embedding_dim, &out, &queue]() { + std::vector values_str; + std::string line; + + while (true) { + queue->Pop(&values_str); + + if (values_str.size() == 0) { + break; + } + + for (int x = 0; x < embedding_dim; ++x) { + float v = 0.0; + try { + v = lexical_cast(values_str[x]); + } catch (boost::bad_lexical_cast &e) { + VLOG(0) << " get unexpected line: " << line; + } + out->push_back(v); + } + } + }; + + std::thread p_read(read); + std::thread p_write(write); + p_read.join(); + p_write.join(); + } + + void SerializeVecToStream(std::ostream &out, + const std::vector &value) { + out.write(reinterpret_cast(value.data()), + static_cast(sizeof(float) * value.size())); + } + + void SerializeValueToStream( + std::ostream &out, const std::vector &ins, + const std::vector> &batch_buckets, + const int embedding_dim) { + std::vector> in_streams; + + for (int x = 0; x < ins.size(); ++x) { + in_streams.emplace_back(std::make_shared(ins[x])); + } + + std::vector> tasks(ins.size()); + + for (int b = 0; b < BUCKET; ++b) { + std::vector> values; + values.resize(tasks.size()); + + auto begin = GetCurrentUS(); + + for (int x = 0; x < tasks.size(); ++x) { + auto batch = batch_buckets[x][b + 1] - batch_buckets[x][b]; + values[x].clear(); + values[x].reserve(batch * embedding_dim); + } + + for (int x = 0; x < tasks.size(); ++x) { + tasks[x] = + pool_->enqueue([this, b, x, &out, &in_streams, &batch_buckets, + &values, embedding_dim]() -> int { + auto batch = batch_buckets[x][b + 1] - batch_buckets[x][b]; + if (batch == 0) return 0; + SerializeValueToVec(*(in_streams[x].get()), batch, embedding_dim, + &values[x]); + return 0; + }); + } + + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); + } + + auto end = GetCurrentUS(); + + auto begin1 = GetCurrentUS(); + for (size_t x = 0; x < tasks.size(); ++x) { + SerializeVecToStream(out, values[x]); + } + auto end1 = GetCurrentUS(); + + VLOG(0) << "serialize buckets " << b << " read using " << end - begin + << ", to oostream using " << end1 - begin1; + } + } + + void DeserializeRowsFromFile(const std::string &input_file, + const int64_t feasigns, + std::vector *rows) { + std::string line; + std::vector columns; + std::ifstream file(input_file); + + rows->reserve(feasigns); + + while (std::getline(file, line)) { + columns = string::Split(line, '\t'); + if (columns.size() != 5) { + VLOG(0) << "unexpected line: " << line << ", 
skip it"; + continue; + } + rows->push_back(std::stoull(columns[0])); + } + + VLOG(0) << "parse " << rows->size() << " embedding rows from " + << input_file; + } + + private: + std::unique_ptr<::ThreadPool> pool_; +}; +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/common/utils.h b/paddle/fluid/distributed/common/utils.h index f81f84b1e11751..2305001ad6f8f9 100644 --- a/paddle/fluid/distributed/common/utils.h +++ b/paddle/fluid/distributed/common/utils.h @@ -14,6 +14,8 @@ #pragma once +#include + #include #include #include @@ -83,5 +85,11 @@ std::string to_string(const std::vector& vec) { } return ss.str(); } + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; } -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index a4f672c2963a84..b667aec186f9e3 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -134,10 +134,23 @@ void ProcessALine(const std::vector& columns, const Meta& meta, } } -int64_t SaveToText(std::ostream* os, std::shared_ptr block, - const int mode) { - int64_t save_num = 0; +void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common, + const size_t shard_idx, const int64_t total) { + // save meta + std::stringstream stream; + stream << "param=" << common.table_name() << "\n"; + stream << "shard_id=" << shard_idx << "\n"; + stream << "row_names=" << paddle::string::join_strings(common.params(), ',') + << "\n"; + stream << "row_dims=" << paddle::string::join_strings(common.dims(), ',') + << "\n"; + stream << "count=" << total << "\n"; + os->write(stream.str().c_str(), sizeof(char) * stream.str().size()); +} +int64_t SaveValueToText(std::ostream* os, std::shared_ptr block, + std::shared_ptr<::ThreadPool> pool, const int mode) { + int64_t save_num = 0; for (auto& table : block->values_) { for (auto& value : table) { if (mode == SaveMode::delta && !value.second->need_save_) { @@ -334,16 +347,24 @@ int32_t CommonSparseTable::set_global_lr(float* lr) { int32_t CommonSparseTable::load(const std::string& path, const std::string& param) { + auto begin = GetCurrentUS(); rwlock_->WRLock(); - VLOG(3) << "sparse table load with " << path << " with meta " << param; LoadFromText(path, param, _shard_idx, _shard_num, task_pool_size_, &shard_values_); rwlock_->UNLock(); + auto end = GetCurrentUS(); + + auto varname = _config.common().table_name(); + VLOG(0) << "load " << varname << " with value: " << path + << " , meta: " << param + << " using: " << std::to_string((end - begin) / 1e+6) << " seconds"; + return 0; } int32_t CommonSparseTable::save(const std::string& dirname, const std::string& param) { + auto begin = GetCurrentUS(); rwlock_->WRLock(); int mode = std::stoi(param); VLOG(3) << "sparse table save: " << dirname << " mode: " << mode; @@ -356,36 +377,33 @@ int32_t CommonSparseTable::save(const std::string& dirname, VLOG(3) << "save " << varname << " in dir: " << var_store << " begin"; std::vector params(_config.common().params().begin(), _config.common().params().end()); + std::string shard_var_pre = string::Sprintf("%s.block%d", varname, _shard_idx); std::string value_ = string::Sprintf("%s/%s.txt", var_store, shard_var_pre); - std::unique_ptr value_out(new std::ofstream(value_)); + std::unique_ptr vs(new std::ofstream(value_)); int64_t total_ins = 0; 
for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { // save values - total_ins += SaveToText(value_out.get(), shard_values_[shard_id], mode); + auto shard_save_num = SaveValueToText(vs.get(), shard_values_[shard_id], + _shards_task_pool[shard_id], mode); + total_ins += shard_save_num; } - value_out->close(); + vs->close(); - // save meta - std::stringstream stream; - stream << "param=" << _config.common().table_name() << "\n"; - stream << "shard_id=" << _shard_idx << "\n"; - stream << "row_names=" - << paddle::string::join_strings(_config.common().params(), ',') - << "\n"; - stream << "row_dims=" - << paddle::string::join_strings(_config.common().dims(), ',') << "\n"; - stream << "count=" << total_ins << "\n"; std::string meta_ = string::Sprintf("%s/%s.meta", var_store, shard_var_pre); - std::unique_ptr meta_out(new std::ofstream(meta_)); - meta_out->write(stream.str().c_str(), sizeof(char) * stream.str().size()); - meta_out->close(); - VLOG(3) << "save " << varname << " in dir: " << var_store << " done"; + std::unique_ptr ms(new std::ofstream(meta_)); + SaveMetaToText(ms.get(), _config.common(), _shard_idx, total_ins); + ms->close(); + + auto end = GetCurrentUS(); rwlock_->UNLock(); + VLOG(0) << "save " << varname << " with path: " << value_ + << " using: " << std::to_string((end - begin) / 1e+6) << " seconds"; + return 0; } @@ -403,8 +421,6 @@ std::pair CommonSparseTable::print_table_stat() { } int32_t CommonSparseTable::pour() { - rwlock_->RDLock(); - std::vector values; std::vector keys; @@ -421,14 +437,11 @@ int32_t CommonSparseTable::pour() { _push_sparse(keys.data(), values.data(), pull_reservoir_.size()); pull_reservoir_.clear(); - rwlock_->UNLock(); return 0; } int32_t CommonSparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { - rwlock_->RDLock(); - auto shard_num = task_pool_size_; std::vector> tasks(shard_num); @@ -464,7 +477,6 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } @@ -507,7 +519,6 @@ int32_t CommonSparseTable::pull_sparse_ptr(char** pull_values, int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, const float* values, size_t num) { - rwlock_->RDLock(); std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -531,7 +542,6 @@ int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } @@ -569,7 +579,6 @@ int32_t CommonSparseTable::push_sparse(const uint64_t* keys, int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, const float** values, size_t num) { - rwlock_->RDLock(); std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -596,14 +605,11 @@ int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, const float* values, size_t num) { - rwlock_->RDLock(); - std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -635,14 +641,12 @@ int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } int32_t CommonSparseTable::flush() { return 0; } int32_t CommonSparseTable::shrink(const std::string& 
param) { - rwlock_->WRLock(); int threshold = std::stoi(param); VLOG(3) << "sparse table shrink: " << threshold; @@ -651,7 +655,6 @@ int32_t CommonSparseTable::shrink(const std::string& param) { VLOG(4) << shard_id << " " << task_pool_size_ << " begin shrink"; shard_values_[shard_id]->Shrink(threshold); } - rwlock_->UNLock(); return 0; } diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 81a1ff5eced2bb..55fc92c9b57859 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -36,7 +36,7 @@ class Table { Table() {} virtual ~Table() {} virtual int32_t initialize(const TableParameter &config, - const FsClientParameter &fs_config) final; + const FsClientParameter &fs_config); virtual int32_t pull_dense(float *values, size_t num) = 0; virtual int32_t push_dense(const float *values, size_t num) = 0; @@ -58,7 +58,9 @@ class Table { virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) = 0; virtual int32_t push_sparse(const uint64_t *keys, const float **values, - size_t num){}; + size_t num) { + return 0; + } virtual int32_t push_sparse_param(const uint64_t *keys, const float *values, size_t num) { return 0; @@ -108,7 +110,7 @@ class Table { virtual int32_t save(const std::string &path, const std::string &converter) = 0; - virtual int32_t set_shard(size_t shard_idx, size_t shard_num) final { + virtual int32_t set_shard(size_t shard_idx, size_t shard_num) { _shard_idx = shard_idx; _shard_num = shard_num; return initialize_shard(); @@ -123,7 +125,7 @@ class Table { protected: virtual int32_t initialize() = 0; - virtual int32_t initialize_accessor() final; + virtual int32_t initialize_accessor(); virtual int32_t initialize_shard() = 0; virtual std::string table_dir(const std::string &model_dir) { return paddle::string::format_string("%s/%03d/", model_dir.c_str(), diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index db2f9c9fc5fc55..8d1ae4926a8012 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(fleet) add_subdirectory(io) #ddim lib proto_library(framework_proto SRCS framework.proto) +proto_library(op_def_proto SRCS op_def.proto) proto_library(heter_service_proto SRCS heter_service.proto) proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 84369011476c77..db83cd55889c43 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -195,6 +195,9 @@ class DeviceWorker { virtual void SetReaderPlace(const paddle::platform::Place& place) { device_reader_->SetPlace(place); } + virtual void SetDeviceContext(platform::DeviceContext* dev_ctx) { + dev_ctx_ = dev_ctx; + } virtual Scope* GetThreadScope() { return thread_scope_; } DataFeed* device_reader_ = nullptr; @@ -221,6 +224,7 @@ class DeviceWorker { int dump_mode_ = 0; int dump_interval_ = 10000; ChannelWriter writer_; + platform::DeviceContext* dev_ctx_ = nullptr; }; class CPUWorkerBase : public DeviceWorker { @@ -266,9 +270,6 @@ class HogwildWorker : public CPUWorkerBase { HogwildWorkerParameter param_; std::vector skip_ops_; std::map stat_var_name_map_; -#ifdef PADDLE_WITH_HETERPS - platform::DeviceContext* dev_ctx_ = nullptr; -#endif }; class DownpourWorker : public HogwildWorker { @@ -622,7 +623,6 @@ 
class PSGPUWorker : public HogwildWorker { gpuStream_t copy_stream_; int batch_cnt_{0}; std::atomic done_cnt_{0}; - platform::DeviceContext* dev_ctx_ = nullptr; double total_time_; double read_time_; diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index b2d170888e28fc..0c66622ed7b9a6 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -39,9 +39,6 @@ void HogwildWorker::Initialize(const TrainerDesc &desc) { for (int i = 0; i < param_.stat_var_names_size(); ++i) { stat_var_name_map_[param_.stat_var_names(i)] = 1; } -#ifdef PADDLE_WITH_HETERPS - dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); -#endif } void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 01536fd36ff83b..7e7f1fed5ad58d 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -52,7 +52,7 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PA cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor) -cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) +cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass) cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper) diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h index ce7635bb35ce61..bc5fc2a16d3939 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.h +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" #include "paddle/fluid/framework/scope.h" namespace paddle { @@ -46,7 +46,7 @@ enum FuseOptions { FUSE_MKLDNN // fusing will be done with MKL-DNN }; -class FusePassBase : public Pass { +class FusePassBase : public OpCompatSensiblePass { public: void Init(const std::string& repr, Graph* graph) const; Scope* param_scope() const; diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc index f7312ca5555311..b056c3b07a2f65 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" - +#include "paddle/fluid/framework/op_info.h" namespace paddle { namespace framework { namespace ir { @@ -51,11 +51,33 @@ AttrCompat& AttrCompat::IsIntIn(const std::set& candidates) { } //! Todo: append the definition. 
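The definition appended in the next hunk turns IsLeftDefault() into a real check: an attribute passes only if its op is registered and the attribute equals the registered default. A gtest-style sketch of that contract, mirroring the compatOpAttribute test added further down (the bare-bones "fc" registration is the same trick that test uses, not production setup):

#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h"
#include "paddle/fluid/framework/op_info.h"

namespace paddle {
namespace framework {
namespace ir {

TEST(AttrCompat, IsLeftDefaultSketch) {
  // Register a minimal "fc" OpInfo whose checker records 1 as the
  // default of in_num_col_dims.
  OpInfo info;
  info.checker_ = new OpAttrChecker();
  info.checker_->AddAttrChecker<int>("in_num_col_dims").SetDefault(1);
  OpInfoMap::Instance().Insert("fc", info);

  OpCompat compat("fc");
  compat.AddAttr("in_num_col_dims").IsLeftDefault().End();

  OpDesc fc_op;
  fc_op.SetAttr("in_num_col_dims", 1);  // left at the registered default
  EXPECT_TRUE(compat.Judge(fc_op));

  fc_op.SetAttr("in_num_col_dims", 2);  // overridden, so rejected
  EXPECT_FALSE(compat.Judge(fc_op));
  delete info.checker_;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle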
-AttrCompat& AttrCompat::IsLeftDefault() { return *this; } +AttrCompat& AttrCompat::IsLeftDefault() { + const std::string& op_name = op_compat_->Name(); + if (!OpInfoMap::Instance().Has(op_name)) { + VLOG(3) << "Op (" << op_name << ") is not registered!"; + conditions_.emplace_back([](const Attribute& attr) { return false; }); + return *this; + } + const OpInfo& op_info = OpInfoMap::Instance().Get(op_name); + const AttributeMap attrs = op_info.Checker()->GetAttrsDefaultValuesMap(); + if (attrs.find(attr_name_) == attrs.end()) { + VLOG(3) << "Op (" << op_name << ") has no default attr:" << attr_name_; + conditions_.emplace_back([](const Attribute& attr) { return false; }); + } else { + Attribute default_attr = attrs.at(attr_name_); + conditions_.emplace_back([default_attr](const Attribute& attr) -> bool { + return attr == default_attr; + }); + } + return *this; +} bool AttrCompat::operator()(const OpDesc& op_desc) { + if (conditions_.empty()) { + return true; + } if (!op_desc.HasAttr(attr_name_)) { - return false; + return optional_; } const Attribute attr = op_desc.GetAttr(attr_name_); for (auto& func : conditions_) { @@ -65,6 +87,10 @@ bool AttrCompat::operator()(const OpDesc& op_desc) { } return true; } +AttrCompat& AttrCompat::IsOptional() { + optional_ = true; + return *this; +} AttrCompat& AttrCompat::IsBoolEQ(bool v) { conditions_.emplace_back([v](const Attribute& attr) -> bool { @@ -98,8 +124,12 @@ bool InputOrOutputCompat::operator()( } AttrCompat& OpCompat::AddAttr(const std::string& attr_name) { - attr_compats_.emplace_back(attr_name, this); - return attr_compats_.back(); + PADDLE_ENFORCE_EQ( + attr_compats_.find(attr_name), attr_compats_.end(), + platform::errors::InvalidArgument( + "The attrubute compat with the same name has been added")); + attr_compats_.emplace(attr_name, AttrCompat(attr_name, this)); + return attr_compats_.at(attr_name); } InputOrOutputCompat& OpCompat::AddInput(const std::string& name) { @@ -119,8 +149,19 @@ InputOrOutputCompat& OpCompat::AddOutput(const std::string& name) { } bool OpCompat::Judge(const OpDesc& op_desc) { + for (auto& attr_map : op_desc.GetAttrMap()) { + if (attr_compats_.find(attr_map.first) == attr_compats_.end()) { + if (!AttrCompat(attr_map.first, this).IsLeftDefault()(op_desc)) { + VLOG(3) << "The Attr(" << attr_map.first << ") of Op (" << op_name_ + << ") not reigistered in OpCompat, not equal to default value!"; + return false; + } + } + } for (auto& attr_compat : attr_compats_) { - if (!attr_compat(op_desc)) { + if (!attr_compat.second(op_desc)) { + VLOG(3) << " Check the Attr(" << attr_compat.first << ") of Op(" + << op_name_ << ") failed!"; return false; } } @@ -129,6 +170,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& input_desc : inputs_map) { if (input_compats_.find(input_desc.first) == input_compats_.end()) { if (!input_desc.second.empty()) { + VLOG(3) << "The Input (" << input_desc.first << ") of Operator (" + << op_name_ << ") not reigistered in OpCompat!"; return false; } } @@ -136,10 +179,14 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& input_val : input_compats_) { if (inputs_map.find(input_val.first) == inputs_map.end()) { if (!input_val.second.Optional()) { + VLOG(3) << "The No optional Input (" << input_val.first + << ") of Operator (" << op_name_ << ") not find in op_desc!"; return false; } } else { if (!input_val.second(inputs_map.at(input_val.first))) { + VLOG(3) << "The Input (" << input_val.first << ") of Operator (" + << op_name_ << ") compat check failed!"; return false; } } @@ 
-149,6 +196,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& output_desc : outputs_map) { if (output_compats_.find(output_desc.first) == output_compats_.end()) { if (!output_desc.second.empty()) { + VLOG(3) << "The Output (" << output_desc.first << ") of Operator (" + << op_name_ << ") not reigistered in OpCompat!"; return false; } } @@ -156,10 +205,14 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& output_val : output_compats_) { if (outputs_map.find(output_val.first) == outputs_map.end()) { if (!output_val.second.Optional()) { + VLOG(3) << "The No optional Output (" << output_val.first + << ") of Operator (" << op_name_ << ") not find in op_desc!"; return false; } } else { if (!output_val.second(outputs_map.at(output_val.first))) { + VLOG(3) << "The Output (" << output_val.first << ") of Operator (" + << op_name_ << ") compat check failed!"; return false; } } diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.h b/paddle/fluid/framework/ir/op_compat_sensible_pass.h index 6c0860549fbfee..3f2ea673d879b8 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.h +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.h @@ -29,7 +29,7 @@ class OpCompat; class AttrCompat { public: AttrCompat(const std::string& attr_name, OpCompat* op_compat) - : attr_name_(attr_name), op_compat_(op_compat) {} + : optional_(false), attr_name_(attr_name), op_compat_(op_compat) {} // @{ String-related methods //! Assert the attribute is an string in the `candidates` domain. @@ -70,12 +70,15 @@ class AttrCompat { //! Tell whether this attribute is left as default value. AttrCompat& IsLeftDefault(); + AttrCompat& IsOptional(); + //! Jump back to retrieve OpCompat instance. OpCompat& End() { return *op_compat_; } bool operator()(const OpDesc& op_desc); private: + bool optional_; std::string attr_name_; OpCompat* op_compat_; std::vector> conditions_; @@ -134,7 +137,7 @@ class OpCompat { private: std::string op_name_; - std::vector attr_compats_; + std::unordered_map attr_compats_; std::unordered_map input_compats_; std::unordered_map output_compats_; }; @@ -179,15 +182,6 @@ class OpCompat { * }; */ class OpCompatSensiblePass : public Pass { - public: - //! Access the subgraph and pattern. - void AccessSubgraph(const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (IsCompat(subgraph, g)) { - AccessSubgraphImpl(subgraph, g); - } - } - protected: /** * Developer should push the compatibility `teller` for each kind of Op in the @@ -197,12 +191,6 @@ class OpCompatSensiblePass : public Pass { */ OpCompat& AddOpCompat(OpCompat&& op_compat); - //! Modify the subgraph. - virtual bool AccessSubgraphImpl( - const GraphPatternDetector::subgraph_t& subgraph, Graph* g) const { - return true; - } - //! Tell the Op compability of a subgraph. bool IsCompat(const GraphPatternDetector::subgraph_t& subgraph, Graph* g) const { @@ -212,7 +200,7 @@ class OpCompatSensiblePass : public Pass { // Check the all the ops in the subgraph are contained in the // op_compat. 
for (auto& node_pair : subgraph) { - if (!node_pair.first->IsOp()) continue; + if (!node_pair.second->IsOp()) continue; auto op_type = node_pair.second->Op()->Type(); if (!op_compat_judgers_.count(op_type)) { return false; diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc index 3d0863a6d12d95..0878e4d9890d35 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" - #include "gtest/gtest.h" +#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" namespace paddle { @@ -23,7 +23,7 @@ namespace ir { TEST(OpCompatSensiblePass, compatOp) { auto lambda = [](const std::string& str) { return str == "tanh"; }; - OpCompat compat("FC"); + OpCompat compat("fc"); compat.AddAttr("in_num_col_dims") .IsIntIn({1, 2}) .IsNumLE(1) @@ -67,10 +67,75 @@ TEST(OpCompatSensiblePass, compatOp) { fc_op.SetInput("Bias", std::vector{"test_input_1"}); fc_op.SetOutput("Out", std::vector{"test_output"}); - EXPECT_STREQ(compat.Name().c_str(), "FC"); + EXPECT_STREQ(compat.Name().c_str(), "fc"); + EXPECT_FALSE(compat.Judge(fc_op)); +} + +TEST(OpCompatSensiblePass, compatOpAttribute) { + OpCompat compat("fc"); + + OpDesc fc_op; + + std::unordered_map attr_map; + attr_map["in_num_col_dims"] = 1; + fc_op.SetAttrMap(attr_map); + + OpInfo info; + info.checker_ = new OpAttrChecker(); + OpInfoMap::Instance().Insert("fc", info); + + EXPECT_FALSE(compat.Judge(fc_op)); + + info.checker_->AddAttrChecker("in_num_col_dims").SetDefault(1); + + EXPECT_TRUE(compat.Judge(fc_op)); + delete info.checker_; +} + +TEST(OpCompatSensiblePass, compatOpAttributeOptional) { + OpCompat compat("fc"); + compat.AddAttr("activation_type") + .IsOptional() + .IsStringIn({"tanh", "sigmoid"}); + OpDesc fc_op; EXPECT_TRUE(compat.Judge(fc_op)); } +TEST(OpCompatSensiblePass, compatOpInput) { + OpCompat compat("fc"); + + OpDesc fc_op; + fc_op.SetInput("Input", std::vector{"test_input"}); + + EXPECT_FALSE(compat.Judge(fc_op)); + + compat.AddInput("Input").IsTensor().End().AddInput("Bias").IsTensor().End(); + EXPECT_FALSE(compat.Judge(fc_op)); + + fc_op.SetInput("Bias", std::vector{"test_input", ""}); + EXPECT_FALSE(compat.Judge(fc_op)); +} + +TEST(OpCompatSensiblePass, compatOutput) { + OpCompat compat("fc"); + + OpDesc fc_op; + fc_op.SetOutput("Output", std::vector{"test_output"}); + + EXPECT_FALSE(compat.Judge(fc_op)); + + compat.AddOutput("Output") + .IsTensor() + .End() + .AddOutput("Output_2") + .IsTensor() + .End(); + EXPECT_FALSE(compat.Judge(fc_op)); + + fc_op.SetOutput("Output_2", std::vector{"test_output", ""}); + EXPECT_FALSE(compat.Judge(fc_op)); +} + class OpCompatSensiblePassTest : public OpCompatSensiblePass { public: OpCompatSensiblePassTest(); @@ -78,7 +143,7 @@ class OpCompatSensiblePassTest : public OpCompatSensiblePass { }; OpCompatSensiblePassTest::OpCompatSensiblePassTest() { - AddOpCompat(OpCompat("FC")) + AddOpCompat(OpCompat("fc")) .AddAttr("in_num_col_dims") .IsNumLE(1) .End() @@ -102,7 +167,7 @@ OpCompatSensiblePassTest::OpCompatSensiblePassTest() { TEST(OpCompatSensiblePass, IsCompat) { OpCompatSensiblePassTest test; OpDesc fc_op; - fc_op.SetType("FC"); + fc_op.SetType("fc"); std::unordered_map attr_map; attr_map["in_num_col_dims"] = 1; 
attr_map["activation_type"] = std::string("tanh"); @@ -114,18 +179,6 @@ TEST(OpCompatSensiblePass, IsCompat) { fc_op.SetOutput("Out", std::vector{"test_output"}); EXPECT_TRUE(test.TestIsCompat(fc_op)); - - ProgramDesc prog; - std::unique_ptr g(new Graph(prog)); - Node* o1 = g->CreateOpNode(&fc_op); - - GraphPatternDetector detector; - PDNode* op2 = - detector.mutable_pattern()->NewNode([](Node* x) { return true; }); - GraphPatternDetector::subgraph_t subgraph; - subgraph[op2] = o1; - - test.AccessSubgraph(subgraph, g.get()); } } // namespace ir diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 7afa76c3fbd23a..c0ccc196348a57 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -112,6 +112,8 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, #ifdef PADDLE_WITH_HETERPS workers_[i]->SetPlace(places_[i]); workers_[i]->SetReaderPlace(places_[i]); + workers_[i]->SetDeviceContext( + platform::DeviceContextPool::Instance().Get(places_[i])); #else workers_[i]->SetPlace(place); workers_[i]->SetReaderPlace(place); diff --git a/paddle/fluid/framework/op_def.proto b/paddle/fluid/framework/op_def.proto new file mode 100644 index 00000000000000..7c4b42b1344b8b --- /dev/null +++ b/paddle/fluid/framework/op_def.proto @@ -0,0 +1,43 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; + +import "framework.proto"; +package paddle.framework.proto; + +message OpDef { + + message VarDef { + required string name = 1; + + // For the type of input / output variables. 
+ reserved 2; + } + + message AttrDef { + required string name = 1; + required AttrType type = 2; + } + + message Desc { + repeated VarDef inputs = 1; + repeated VarDef outputs = 2; + repeated AttrDef attrs = 3; + } + + required string type = 1; + required Desc def = 2; + optional Desc extra = 3; +} diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 9244b9af0bbd6c..e6a0ecf4aececc 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -52,11 +52,6 @@ class ActivationOpConverter : public OpConverter { engine_->GetITensor(op_desc.Input("X")[0]); auto op_pair = ops.find(op_type_); - if (op_pair == ops.end()) { - PADDLE_THROW(platform::errors::Fatal( - "Wrong activation op type, the trt do not support the %s act type.", - op_type_)); - } nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, Activation, *const_cast(input_tensor), diff --git a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc index 813342c08483b7..eba67c3c098ca6 100644 --- a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc @@ -55,16 +55,6 @@ class AffineChannelOpConverter : public OpConverter { auto* bias_t = bias_v->GetMutable(); float* bias_ptr = engine_->GetWeightCPUData(bias_name, bias_t, false); - auto data_layout = framework::StringToDataLayout( - BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); - - PADDLE_ENFORCE_EQ( - data_layout, framework::DataLayout::kNCHW, - platform::errors::InvalidArgument( - "TensorRT affine channel converter can only convert NCHW format. " - "Other format should be run in fluid mode. 
Report a bug on github " - "issue if you see this line.")); - // tensorrt scalend layer only support spatial dims >= 2, // so nhwc is not availabe (spatial dims == 0) const int channel_axis = engine_->with_dynamic_shape(); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 47f5cc97d39cdf..df2400854414c3 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -25,10 +25,6 @@ static bool CheckDims(const nvinfer1::Dims& dims_x, return false; } for (int i = 0; i < dims_x.nbDims; i++) { - // conservative judgment - if (dims_x.d[i] == -1 || dims_y.d[i] == -1) { - return false; - } if (dims_x.d[i] != dims_y.d[i]) { return false; } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 54fc9492b7193e..9df3ec0445ad1c 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -143,6 +143,19 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; +// strides > 1 is only supported by trt7.0 above +#if !IS_TRT_VERSION_GE(7000) + if (desc.HasAttr("strides")) { + const std::vector strides = + BOOST_GET_CONST(std::vector, desc.GetAttr("strides")); + // there is no issue if strides.size() less than 2 + if (strides.size() > 1) { + for (size_t i = 0; i < strides.size(); i++) { + if (strides[i] > 1) return false; + } + } + } +#endif } if (op_type == "pool2d") { @@ -225,6 +238,20 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << desc.Output("Output").size() << " output."; return false; } + +// strides > 1 is only supported by trt7.0 above +#if !IS_TRT_VERSION_GE(7000) + if (desc.HasAttr("strides")) { + const std::vector strides = + BOOST_GET_CONST(std::vector, desc.GetAttr("strides")); + // there is no issue if strides.size() less than 2 + if (strides.size() > 1) { + for (size_t i = 0; i < strides.size(); i++) { + if (strides[i] > 1) return false; + } + } + } +#endif } if (op_type == "matmul") { diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 60479f806f3667..a5f075b8dc68c2 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -176,7 +176,7 @@ if(NOT APPLE AND WITH_MKLML) inference_analysis_api_test(test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc) inference_analysis_api_test(test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_statis_tester.cc) inference_analysis_api_test(test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_profile_tester.cc) - if(NOT WIN32) + if(NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set_tests_properties(test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_seq_pool1_fuse_compare_zero_copy PROPERTIES TIMEOUT 120) diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 7252ed72b20836..952e9ca329f102 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -27,6 +27,9 @@ class CastOpProtoMaker : public 
framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output tensor of cast op"); AddAttr("out_dtype", "output data type"); AddAttr("in_dtype", "input data type"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( Cast Operator. @@ -50,6 +53,7 @@ class CastOpGradMaker : public framework::SingleGradOpMaker { grad->SetOutput("Out", this->InputGrad("X")); grad->SetAttr("out_dtype", this->GetAttr("in_dtype")); grad->SetAttr("in_dtype", this->GetAttr("out_dtype")); + grad->SetAttr("use_mkldnn", this->GetAttr("use_mkldnn")); } }; @@ -77,6 +81,28 @@ class CastOp : public framework::OperatorWithKernel { if (platform::is_cuda_pinned_place(tensor_place)) { return framework::OpKernelType(tensor->type(), ctx.device_context()); } + +#ifdef PADDLE_WITH_MKLDNN + int in_dtype = ctx.Attr("in_dtype"); + int out_dtype = ctx.Attr("out_dtype"); + + auto MKLDNNSupportsCast = [&]() -> bool { + int dtype_fp32 = static_cast(framework::proto::VarType::FP32); + int dtype_bf16 = static_cast(framework::proto::VarType::BF16); + + if ((in_dtype != dtype_fp32 && in_dtype != dtype_bf16) || + (out_dtype != dtype_fp32 && out_dtype != dtype_bf16)) + return false; + + return true; + }; + + if (this->CanMKLDNNBeUsed(ctx, tensor->type()) && MKLDNNSupportsCast()) { + return framework::OpKernelType(tensor->type(), ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif return framework::OpKernelType(tensor->type(), tensor_place); } }; diff --git a/paddle/fluid/operators/compat/while.pbtxt b/paddle/fluid/operators/compat/while.pbtxt new file mode 100644 index 00000000000000..34435e1d9e5ff3 --- /dev/null +++ b/paddle/fluid/operators/compat/while.pbtxt @@ -0,0 +1,49 @@ +type: "while" +def { + inputs { + name: "X" + } + inputs { + name: "Condition" + } + outputs { + name: "Out" + } + outputs { + name: "StepScopes" + } + attrs { + name: "sub_block" + type: BLOCK + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "skip_eager_deletion_vars" + type: STRINGS + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index b551629169deed..67e2e3a1e96772 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -20,8 +20,8 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -struct complex128; -struct complex64; +template +struct complex; } // namespace platform } // namespace paddle @@ -135,9 +135,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, + paddle::platform::complex>, ops::ElementwiseAddKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_add_grad, ops::ElementwiseAddGradKernel, @@ -145,9 +145,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, + paddle::platform::complex>, ops::ElementwiseAddGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_add_grad_grad, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseAddDoubleGradKernel); + paddle::platform::complex>); // A specialization elementwise_add operator, used in gradient accumulation with // inplace addto. @@ -178,9 +178,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, + paddle::platform::complex>, ops::ElementwiseAddKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_add) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index a4b97301a2611b..37e5fa5a206577 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -141,8 +140,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel>, + ops::ElementwiseAddKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad, ops::ElementwiseAddGradKernel, @@ -150,8 +149,10 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel>, + ops::ElementwiseAddGradKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad_grad, ops::ElementwiseAddDoubleGradKernel, @@ -160,9 +161,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, + plat::complex>, ops::ElementwiseAddDoubleGradKernel); + plat::complex>); REGISTER_OP_CUDA_KERNEL( grad_add, ops::ElementwiseAddKernel, @@ -170,5 +171,5 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel>, + ops::ElementwiseAddKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 0252e6dfff5d75..9a899ec11b4c17 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ 
b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -17,8 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -135,9 +134,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, + paddle::platform::complex>, ops::ElementwiseDivKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, @@ -145,9 +144,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + paddle::platform::complex>, ops::ElementwiseDivGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_div_grad_grad, @@ -160,9 +159,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseDivDoubleGradKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_div) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 0cf9294c9de67f..b10ed57af901f0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -14,8 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -76,18 +75,21 @@ static __global__ void SimpleElemwiseDivGradCUDAKernel(const T* x, const T* y, } template <> -__global__ void SimpleElemwiseDivGradCUDAKernel( - const paddle::platform::complex64* x, const paddle::platform::complex64* y, - const paddle::platform::complex64* out, - const paddle::platform::complex64* dout, int64_t size, - paddle::platform::complex64* dx, paddle::platform::complex64* dy) { +__global__ void +SimpleElemwiseDivGradCUDAKernel>( + const paddle::platform::complex* x, + const paddle::platform::complex* y, + const paddle::platform::complex* out, + const paddle::platform::complex* dout, int64_t size, + paddle::platform::complex* dx, + paddle::platform::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - paddle::platform::complex64 o = dout[col]; - paddle::platform::complex64 y_conj(y[col].real, -y[col].imag); - paddle::platform::complex64 out_div_y_conj((out[col] / y[col]).real, - -(out[col] / y[col]).imag); + paddle::platform::complex o = dout[col]; + paddle::platform::complex y_conj(y[col].real, -y[col].imag); + paddle::platform::complex out_div_y_conj((out[col] / y[col]).real, + -(out[col] / y[col]).imag); dx[col] = o / y_conj; dy[col] = -o * out_div_y_conj; col += blockDim.x * gridDim.x; @@ -95,19 +97,21 @@ __global__ void SimpleElemwiseDivGradCUDAKernel( } template <> -__global__ void SimpleElemwiseDivGradCUDAKernel( - const paddle::platform::complex128* x, - const paddle::platform::complex128* y, - const paddle::platform::complex128* out, - const paddle::platform::complex128* 
dout, int64_t size, - paddle::platform::complex128* dx, paddle::platform::complex128* dy) { +__global__ void +SimpleElemwiseDivGradCUDAKernel>( + const paddle::platform::complex* x, + const paddle::platform::complex* y, + const paddle::platform::complex* out, + const paddle::platform::complex* dout, int64_t size, + paddle::platform::complex* dx, + paddle::platform::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - paddle::platform::complex128 o = dout[col]; - paddle::platform::complex128 y_conj(y[col].real, -y[col].imag); - paddle::platform::complex128 out_div_y_conj((out[col] / y[col]).real, - -(out[col] / y[col]).imag); + paddle::platform::complex o = dout[col]; + paddle::platform::complex y_conj(y[col].real, -y[col].imag); + paddle::platform::complex out_div_y_conj((out[col] / y[col]).real, + -(out[col] / y[col]).imag); dx[col] = o / y_conj; dy[col] = -o * out_div_y_conj; col += blockDim.x * gridDim.x; @@ -145,9 +149,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, + paddle::platform::complex>, ops::ElementwiseDivKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, @@ -157,9 +161,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + paddle::platform::complex>, ops::ElementwiseDivGradKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad_grad, ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseDivDoubleGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 0be8d934b17af7..a0b9633acb2e59 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -74,23 +74,13 @@ struct DivGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } }; -template <> -struct DivGradDX { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 y_conj(y.real, -y.imag); - return dout / y_conj; - } -}; - -template <> -struct DivGradDX { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 y_conj(y.real, -y.imag); +template +struct DivGradDX> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex y_conj(y.real, -y.imag); return dout / y_conj; } }; @@ -102,23 +92,13 @@ struct DivGradDY { } }; -template <> -struct DivGradDY { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 out_div_y_conj((out / y).real, -(out / y).imag); - return -dout * out_div_y_conj; - } -}; - -template <> -struct DivGradDY { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, 
paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 out_div_y_conj((out / y).real, +template +struct DivGradDY> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex out_div_y_conj((out / y).real, -(out / y).imag); return -dout * out_div_y_conj; } diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 6bf296f0e0b57a..0045f00ecc6c25 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -16,8 +16,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -134,9 +133,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, + paddle::platform::complex>, ops::ElementwiseMulKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradKernel, @@ -144,9 +143,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, + paddle::platform::complex>, ops::ElementwiseMulGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad_grad, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseMulDoubleGradKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_mul) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index e01b5eb5fb73d9..8fd4609c3aa850 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -14,8 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -76,31 +75,31 @@ static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y, } template <> -__global__ void SimpleElemwiseMulGradCUDAKernel( - const plat::complex64* x, const plat::complex64* y, - const plat::complex64* out, const plat::complex64* dout, int64_t size, - plat::complex64* dx, plat::complex64* dy) { +__global__ void SimpleElemwiseMulGradCUDAKernel>( + const plat::complex* x, const plat::complex* y, + const plat::complex* out, const plat::complex* dout, + int64_t size, plat::complex* dx, plat::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - plat::complex64 o = dout[col]; - dx[col] = plat::complex64(y[col].real, -y[col].imag) * o; - dy[col] = plat::complex64(x[col].real, -x[col].imag) * o; + plat::complex o = dout[col]; + dx[col] = plat::complex(y[col].real, -y[col].imag) * o; + dy[col] = plat::complex(x[col].real, -x[col].imag) * o; col += blockDim.x * gridDim.x; } } template <> -__global__ void SimpleElemwiseMulGradCUDAKernel( - const plat::complex128* x, const plat::complex128* y, - const plat::complex128* out, const plat::complex128* dout, int64_t size, - plat::complex128* dx, plat::complex128* dy) { +__global__ void SimpleElemwiseMulGradCUDAKernel>( + const plat::complex* x, const plat::complex* y, + const plat::complex* out, const plat::complex* dout, + int64_t size, plat::complex* dx, plat::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - plat::complex128 o = dout[col]; - dx[col] = plat::complex128(y[col].real, -y[col].imag) * o; - dy[col] = plat::complex128(x[col].real, -x[col].imag) * o; + plat::complex o = dout[col]; + dx[col] = plat::complex(y[col].real, -y[col].imag) * o; + dy[col] = plat::complex(x[col].real, -x[col].imag) * o; col += blockDim.x * gridDim.x; } } @@ -133,8 +132,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel>, + ops::ElementwiseMulKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradKernel, @@ -142,8 +141,10 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel>, + ops::ElementwiseMulGradKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad_grad, ops::ElementwiseMulDoubleGradKernel, @@ -152,6 +153,6 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + plat::complex>, ops::ElementwiseMulDoubleGradKernel); + plat::complex>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 46a00268e4134a..10e69491643c92 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -132,23 +132,13 @@ struct MulGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } }; -template <> -struct MulGradDX { - HOSTDEVICE 
paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template <> -struct MulGradDX { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 y_conj(y.real, -y.imag); +template +struct MulGradDX> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex y_conj(y.real, -y.imag); return dout * y_conj; } }; @@ -158,23 +148,13 @@ struct MulGradDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } }; -template <> -struct MulGradDY { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; - -template <> -struct MulGradDY { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 x_conj(x.real, -x.imag); +template +struct MulGradDY> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex x_conj(x.real, -x.imag); return dout * x_conj; } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 1951ed7f5da673..84aa189b89e909 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -20,8 +20,8 @@ limitations under the License. */ namespace paddle { namespace platform { -struct complex128; -struct complex64; +template +struct complex; } // namespace platform } // namespace paddle @@ -134,9 +134,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, + paddle::platform::complex>, ops::ElementwiseSubKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, @@ -144,9 +144,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + paddle::platform::complex>, ops::ElementwiseSubGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad_grad, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseSubDoubleGradKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_sub) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 192999fd2ac831..19cbbb7bf04287 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -14,8 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -103,9 +102,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, + paddle::platform::complex>, ops::ElementwiseSubKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, @@ -115,9 +114,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + paddle::platform::complex>, ops::ElementwiseSubGradKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad_grad, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseSubDoubleGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc new file mode 100644 index 00000000000000..9cfeace6bef99f --- /dev/null +++ b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class CastMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + int in_dtype = ctx.Attr("in_dtype"); + int out_dtype = ctx.Attr("out_dtype"); + + auto x_paddle_type = framework::proto::VarType::Type(in_dtype); + auto out_paddle_type = framework::proto::VarType::Type(out_dtype); + + mkldnn::memory::data_type x_type = + framework::ToMKLDNNDataType(x_paddle_type); + mkldnn::memory::data_type out_type = + framework::ToMKLDNNDataType(out_paddle_type); + + auto x_tz = framework::vectorize(x->dims()); + + std::string key = + platform::CreateKey(dev_ctx, x_tz, x->format(), x->format(), x_type); + platform::ReorderMKLDNNHandler reorder_handler( + x_tz, x_paddle_type, x_type, out_paddle_type, out_type, dev_ctx, + dev_ctx.GetEngine(), key); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->format(), platform::to_void_cast(x->data())); + auto reorder_dst_memory_p = + reorder_handler.AcquireDstMemory(out, x->format(), dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, + reorder_src_memory_p); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(cast, MKLDNN, paddle::platform::CPUPlace, + ops::CastMKLDNNKernel, + ops::CastMKLDNNKernel); diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index e9bc351de4d692..8262273b7ca7da 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -30,6 +30,7 @@ register_unity_group(cc bmm_op.cc bpr_loss_op.cc cast_op.cc + mkldnn/cast_mkldnn_op.cc cholesky_op.cc chunk_eval_op.cc clip_by_norm_op.cc diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index dde9531e591442..4095720f71eb71 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -16,8 +16,7 @@ limitations under the License. */ // NOTE(): support float16 to half in header file. 
#define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -82,28 +81,52 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, #endif } -// CUDA 9.0 have native compatible float16 shfl_down #if defined(PADDLE_WITH_HIP) template <> __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) { -#ifdef PADDLE_WITH_HIP return float16(__shfl_down(static_cast(val), static_cast(delta), width)); -#else - return float16( - __shfl_down(static_cast(val), static_cast(delta), width)); -#endif } + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( + unsigned mask, paddle::platform::complex val, int delta, int width) { + float real = __shfl_down(val.real, delta, width); + float imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex +CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, + int delta, int width) { + double real = __shfl_down(val.real, delta, width); + double imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + template <> __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, float16 val, int width) { -#ifdef PADDLE_WITH_HIP return float16(__shfl_xor(static_cast(val), width)); -#else - return float16(__shfl_xor(static_cast(val), width)); -#endif +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + float real = __shfl_xor(val.real, width); + float imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + double real = __shfl_xor(val.real, width); + double imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); } #else template <> @@ -115,25 +138,26 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, } template <> -__forceinline__ __device__ paddle::platform::complex64 CudaShuffleDownSync( - unsigned mask, paddle::platform::complex64 val, int delta, int width) { +__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( + unsigned mask, paddle::platform::complex val, int delta, int width) { float real = static_cast(__shfl_down_sync( mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( mask, static_cast(val.imag), static_cast(delta), width)); - return paddle::platform::complex64(real, imag); + return paddle::platform::complex(real, imag); } template <> -__forceinline__ __device__ paddle::platform::complex128 CudaShuffleDownSync( - unsigned mask, paddle::platform::complex128 val, int delta, int width) { +__forceinline__ __device__ paddle::platform::complex +CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, + int delta, int width) { double real = static_cast( __shfl_down_sync(mask, static_cast(val.real), static_cast(delta), width)); double imag = static_cast( __shfl_down_sync(mask, static_cast(val.imag), static_cast(delta), width)); - return paddle::platform::complex128(real, imag); + return paddle::platform::complex(real, imag); } template <> @@ 
-143,23 +167,23 @@ __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, } template <> -__forceinline__ __device__ paddle::platform::complex64 CudaShuffleXorSync( - unsigned mask, paddle::platform::complex64 val, int width) { +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { float real = static_cast( __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( __shfl_xor_sync(mask, static_cast(val.imag), width)); - return paddle::platform::complex64(real, imag); + return paddle::platform::complex(real, imag); } template <> -__forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync( - unsigned mask, paddle::platform::complex128 val, int width) { +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { double real = static_cast( __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( __shfl_xor_sync(mask, static_cast(val.imag), width)); - return paddle::platform::complex128(real, imag); + return paddle::platform::complex(real, imag); } #endif diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 5ff6f893a89531..d6563be48fe484 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -926,7 +926,23 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { : platform::MKLDNNHandler(dev_ctx, engine, base_key), dims_(dims), vtype_(vtype), - dtype_(dtype) {} + vtype_dst_(vtype), + dtype_(dtype), + dtype_dst_(dtype) {} + + ReorderMKLDNNHandler(std::vector& dims, // NOLINT + framework::proto::VarType::Type vtype, + mkldnn::memory::data_type dtype, + framework::proto::VarType::Type vtype_dst, + mkldnn::memory::data_type dtype_dst, + const platform::MKLDNNDeviceContext& dev_ctx, + mkldnn::engine engine, const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + dims_(dims), + vtype_(vtype), + vtype_dst_(vtype_dst), + dtype_(dtype), + dtype_dst_(dtype_dst) {} std::shared_ptr AcquireSrcMemory( const MKLDNNMemoryFormat& fmt, void* ptr) { @@ -940,15 +956,16 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { - auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_, fmt); - auto dst_data = output->mutable_data(place, vtype_, dst_md.get_size()); + auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); + auto dst_data = + output->mutable_data(place, vtype_dst_, dst_md.get_size()); mem_p = std::make_shared(dst_md, engine_, dst_data); dev_ctx_.SetBlob(local_key, mem_p); } else { // Even if memory object exists , we may be using it for diffrent tensor auto dst_data = - output->mutable_data(place, vtype_, mem_p->get_desc().get_size()); + output->mutable_data(place, vtype_dst_, mem_p->get_desc().get_size()); mem_p->set_data_handle(dst_data); } return mem_p; @@ -970,8 +987,8 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { private: std::vector dims_; - framework::proto::VarType::Type vtype_; - mkldnn::memory::data_type dtype_; + framework::proto::VarType::Type vtype_, vtype_dst_; + mkldnn::memory::data_type dtype_, dtype_dst_; }; template diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 91461aa26f341a..fa14ad4f63be08 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -28,6 +28,7 @@ limitations under the 
License. */ #include #include +#include "paddle/fluid/distributed/common/sparse_sharding_merge.h" #include "paddle/fluid/distributed/communicator_common.h" #include "paddle/fluid/distributed/fleet.h" #include "paddle/fluid/distributed/index_dataset/index_sampler.h" @@ -48,6 +49,7 @@ using paddle::distributed::GraphNode; using paddle::distributed::GraphPyServer; using paddle::distributed::GraphPyClient; using paddle::distributed::FeatureNode; +using paddle::distributed::ShardingMerge; namespace paddle { namespace pybind { @@ -85,6 +87,12 @@ void BindPSHost(py::module* m) { .def("to_string", &distributed::PSHost::to_string); } +void BindSparseShardingTools(py::module* m) { + py::class_(*m, "ShardingMerge") + .def(py::init<>()) + .def("merge", &ShardingMerge::Merge); +} + void BindCommunicatorContext(py::module* m) { py::class_(*m, "CommContext") .def( diff --git a/paddle/fluid/pybind/fleet_py.h b/paddle/fluid/pybind/fleet_py.h index 206a69f5a80197..4dc0f002ad3c1d 100644 --- a/paddle/fluid/pybind/fleet_py.h +++ b/paddle/fluid/pybind/fleet_py.h @@ -36,5 +36,6 @@ void BindIndexNode(py::module* m); void BindTreeIndex(py::module* m); void BindIndexWrapper(py::module* m); void BindIndexSampler(py::module* m); +void BindSparseShardingTools(py::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 560d8c892b09f9..6dd08e5dfa4bf2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3159,7 +3159,7 @@ All parameter, weight, gradient are variables in Paddle. BindTreeIndex(&m); BindIndexWrapper(&m); BindIndexSampler(&m); - + BindSparseShardingTools(&m); #endif } } // namespace pybind diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 69138a37f461ca..dd8146aa3a1147 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -18,7 +18,7 @@ rem Paddle CI Task On Windows Platform rem ================================================= @ECHO ON -setlocal +setlocal enabledelayedexpansion rem -------clean up environment----------- set work_dir=%cd% @@ -63,7 +63,7 @@ if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined ON_INFER set ON_INFER=ON if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON -if not defined WITH_TPCACHE set WITH_TPCACHE=ON +if not defined WITH_TPCACHE set WITH_TPCACHE=OFF if not defined WITH_CLCACHE set WITH_CLCACHE=OFF if not defined WITH_CACHE set WITH_CACHE=OFF if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF @@ -236,6 +236,8 @@ call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary set DISTUTILS_USE_SDK=1 rem Windows 10 Kit bin dir set PATH=C:\Program Files (x86)\Windows Kits\10\bin\10.0.17763.0\x64;%PATH% +rem Use 64-bit ToolSet to compile +set PreferredToolArchitecture=x64 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% @@ -263,12 +265,12 @@ rem ------initialize the python environment------ @ECHO ON set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH% -if %WITH_PYTHON% == "ON" ( +if "%WITH_PYTHON%" == "ON" ( where python where pip pip install wheel --user pip install -r %work_dir%\python\requirements.txt --user - if %ERRORLEVEL% NEQ 0 ( + if !ERRORLEVEL! NEQ 0 ( echo pip install requirements.txt failed! exit /b 7 ) @@ -329,14 +331,14 @@ if "%WITH_GPU%"=="ON" ( ) :cmake_impl -echo cmake .. 
-G %GENERATOR% -T host=x64 -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ -DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -T host=x64 -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ @@ -366,7 +368,7 @@ echo Build third_party the %build_times% time: if %GENERATOR% == "Ninja" ( ninja third_party ) else ( - MSBuild /m /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:quiet third_party.vcxproj + MSBuild /m /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:%LOG_LEVEL% third_party.vcxproj ) if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 @@ -412,10 +414,10 @@ if "%WITH_TESTING%"=="ON" ( echo Build Paddle the %build_times% time: if %GENERATOR% == "Ninja" ( - ninja -j %PARALLEL_PROJECT_COUNT% + ninja all ) else ( if "%WITH_CLCACHE%"=="OFF" ( - MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj + MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj ) else ( MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj ) @@ -644,7 +646,7 @@ echo git fetch upstream $BRANCH # develop is not fetched>> check_change_of_ echo fi>> check_change_of_unittest.sh echo git checkout -b origin_pr >> check_change_of_unittest.sh echo git checkout -f $BRANCH >> check_change_of_unittest.sh -echo cmake .. -G %GENERATOR% -T host=x64 -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +echo cmake .. 
-G %GENERATOR% -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^
-DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index d3df57fcf6b7d3..4f3a6f4768933d 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -977,6 +977,11 @@ def _parallel_linear(x,
                      group=None):
    """
    Parallel Linear
+
+    axis is the dimension of the linear layer's parameter to be split:
+    axis = 0: the row dimension
+    axis = 1: the column dimension
+
    """
    if group is not None and not group.is_member():
        return
@@ -1008,6 +1013,12 @@ def _parallel_linear(x,
    main_block = paddle.static.default_main_program().global_block()
    startup_block.vars[linear.weight.name].is_distributed = True
    main_block.vars[linear.weight.name].is_distributed = True
+    # set is_distributed for the split bias
+    # if a linear layer is split by row, each rank holds a complete bias and the biases should be identical across ranks.
+    # if a linear layer is split by column, the bias is also split across ranks, like its weight
+    if axis == 1 and linear._bias_attr != False:
+        startup_block.vars[linear.bias.name].is_distributed = True
+        main_block.vars[linear.bias.name].is_distributed = True
    if not gather_out:
        return linear_out
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
old mode 100755
new mode 100644
index f9cd623afef76a..0a989fe90f96a6
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -814,7 +814,7 @@ def sharding_configs(self):
                "sharding_segment_strategy": "segment_broadcast_MB",
                "segment_broadcast_MB": 32,
                "sharding_degree": 8,
-                "sharding_degree": 2,
+                "dp_degree": 2,
                "gradient_merge_acc_step": 4,
            }
        """
diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py
index d31fa549ad5623..f18b82eaecd76a 100644
--- a/python/paddle/distributed/fleet/runtime/the_one_ps.py
+++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py
@@ -847,8 +847,6 @@ def _init_server(self, dirname=None, var_names=None, **kwargs):
            dirname = os.path.normpath(dirname)
            pserver_id = self.role_maker._role_id()
-            import time
-            begin = time.time()
            for var_name in load_varnames:
                table_id = sparse_table_maps[var_name]
                path = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
@@ -856,9 +854,6 @@ def _init_server(self, dirname=None, var_names=None, **kwargs):
                meta = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
                                    "{}.block{}.meta".format(var_name, pserver_id))
                self._server.load_sparse(path, meta, table_id)
-            end = time.time()
-            print("init sparse variables: {} cost time: {}".format(load_varnames,
-                                                                   end - begin))

    def _run_server(self):
        if self.role_maker._is_heter_worker():
diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
index 30981f531289ae..0221a42e2a3e78 100644
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -32,6 +33,8 @@ from . import layers from .layers import * from . import optimizer +from . import sparsity +from .sparsity import * __all__ = [] __all__ += decoder.__all__ @@ -42,3 +45,4 @@ __all__ += ['mixed_precision'] __all__ += layers.__all__ __all__ += optimizer.__all__ +__all__ += sparsity.__all__ diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index f940f6a3143a09..2913d99ee6b217 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -145,6 +145,7 @@ def _update_list(self): 'sign', 'cast', 'fused_bn_add_activation', + 'c_identity', } # The set of ops that don't support fp16 calculation diff --git a/python/paddle/fluid/contrib/sparsity/__init__.py b/python/paddle/fluid/contrib/sparsity/__init__.py new file mode 100644 index 00000000000000..f78ea1b1c38b85 --- /dev/null +++ b/python/paddle/fluid/contrib/sparsity/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import utils +from .utils import * + +__all__ = utils.__all__ diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py new file mode 100644 index 00000000000000..f1108c327407ff --- /dev/null +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -0,0 +1,587 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities of Auto SParsity (ASP). +""" + +from __future__ import print_function + +import sys +import math +import collections +import numpy as np +from enum import Enum +from itertools import permutations +import threading + +__all__ = [ + 'density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', + 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', + 'MaskAlgo', 'CheckMethod' +] + + +class MaskAlgo(Enum): + r""" + A collection of all mask generating algorithms. 
+    There are currently three algorithms: `MASK_1D`, `MASK_2D_GREEDY` and `MASK_2D_BEST`.
+    """
+    MASK_1D = 'get_mask_1d'
+    MASK_2D_GREEDY = 'get_mask_2d_greedy'
+    MASK_2D_BEST = 'get_mask_2d_best'
+
+
+class CheckMethod(Enum):
+    r"""
+    A collection of all sparsity checking approaches.
+    There are currently two methods: `CHECK_1D` and `CHECK_2D`.
+    """
+    CHECK_1D = 'check_mask_1d'
+    CHECK_2D = 'check_mask_2d'
+
+    @staticmethod
+    def get_checking_method(mask_algo):
+        r"""
+        Get the sparsity checking method for a mask generating algorithm.
+
+        Args:
+            mask_algo (MaskAlgo): The mask generating algorithm.
+        Returns:
+            CheckMethod: The corresponding sparsity checking method.
+        Examples:
+            .. code-block:: python
+
+              import numpy as np
+              from paddle.fluid.contrib.sparsity import MaskAlgo, CheckMethod
+
+              CheckMethod.get_checking_method(MaskAlgo.MASK_1D)
+              # CheckMethod.CHECK_1D
+
+              CheckMethod.get_checking_method(MaskAlgo.MASK_2D_GREEDY)
+              # CheckMethod.CHECK_2D
+
+              CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST)
+              # CheckMethod.CHECK_2D
+        """
+        assert type(mask_algo) == MaskAlgo, \
+               "mask_algo should be MaskAlgo type"
+        if mask_algo == MaskAlgo.MASK_1D:
+            return CheckMethod.CHECK_1D
+        else:
+            return CheckMethod.CHECK_2D
+
+
+def density(x):
+    r"""
+    Return the density of the input tensor.
+
+    Args:
+        x (nparray): The input tensor.
+    Returns:
+        float: The density of :attr:`x`.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          x = np.array([[0, 1, 3, 0],
+                        [1, 1, 0, 1]])
+          sparsity.density(x) # 0.625
+    """
+    x_flattened = x.flatten()
+    return float(np.nonzero(x_flattened)[0].size) / x_flattened.size
+
+
+def reshape_1d(mat, m):
+    r"""
+    Reshape the input matrix to shape (-1, m).
+    If the second dimension of :attr:`mat` is not a multiple of :attr:`m`,
+    this function pads the remainder with 0 before reshaping.
+
+    .. math::
+
+        remainder = mat.shape[1] % m
+
+    Args:
+        mat (nparray): The input matrix.
+        m (int): The second dimension of the reshaped matrix.
+    Returns:
+        tuple: A pair of the reshaped and padded matrix and the shape of the padded matrix (before reshaping).
+    """
+    remainder = mat.shape[1] % m
+    if mat.shape[1] % m > 0:
+        mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder)))
+        mat_padded[:, :mat.shape[1]] = mat
+        shape = mat_padded.shape
+        return mat_padded.reshape(-1, m), shape
+    else:
+        return mat.reshape(-1, m), mat.shape
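+
+# Example: with m=4, a (1, 5) input is zero-padded to (1, 8) and viewed as
+# (2, 4), so reshape_1d(np.array([[1, 2, 3, 4, 5]]), 4) returns
+# (array([[1., 2., 3., 4.], [5., 0., 0., 0.]]), (1, 8)).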
+
+
+def check_mask_1d(mat, n, m):
+    r"""
+    Check if every row of the input matrix :attr:`mat` is in a 1D `n:m` sparse pattern.
+    This function pads the second dimension of :attr:`mat` with zeros
+    to be a multiple of :attr:`m` if necessary.
+
+    1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block.
+
+    Args:
+        mat (nparray): The input matrix.
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+    Returns:
+        bool: True if every row of :attr:`mat` is in a 1D n:m sparse pattern, else False.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          x = np.array([[0, 1, 3, 0],
+                        [1, 0, 0, 1]])
+          sparsity.check_mask_1d(x, 2, 4) # True
+
+          x = np.array([[0, 1, 5, 4],
+                        [1, 0, 0, 1]])
+          sparsity.check_mask_1d(x, 2, 4) # False
+
+          # x would be padded to shape (2, 8)
+          x = np.array([[0, 1, 0, 4, 6],
+                        [1, 0, 0, 1, 7]])
+          sparsity.check_mask_1d(x, 2, 4) # True
+    """
+    if len(mat.shape) <= 1:
+        mat_flattern, shape = reshape_1d(mat.reshape(1, mat.shape[0]), m)
+    else:
+        mat_flattern, shape = reshape_1d(mat, m)
+
+    for sub_mat in mat_flattern:
+        if np.nonzero(sub_mat)[0].size > (m - n):
+            return False
+    return True
+
+
+def get_mask_1d(mat, n, m):
+    r"""
+    Generate a 1D `n:m` sparse pattern mask of the input matrix :attr:`mat`
+    in the row direction. This function pads the second dimension of :attr:`mat`
+    with zeros to be a multiple of :attr:`m` before mask generation.
+
+    1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block.
+
+    Args:
+        mat (nparray): The input matrix.
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+    Returns:
+        nparray: The 1D `n:m` sparse mask of :attr:`mat`.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          mat = np.array([[0, 1, 5, 4],
+                          [2, 7, 3, 6]])
+          mask = sparsity.get_mask_1d(mat, 2, 4)
+          # nparray([[0, 0, 1, 1],
+          #          [0, 1, 0, 1]])
+          sparsity.check_mask_1d(mask, 2, 4) # True
+    """
+    mat_flattern, shape = reshape_1d(mat, m)
+
+    mask_flattern = np.ones_like(mat_flattern)
+    mask = np.ones_like(mat)
+    for i in range(mat_flattern.shape[0]):
+        sub_mat = mat_flattern[i]
+        min_order_indices = np.argsort(np.absolute(sub_mat))
+        mask_flattern[i, min_order_indices[:n].tolist()] = 0
+    mask_flattern = mask_flattern.reshape(shape)
+    mask[:, :] = mask_flattern[:, :mat.shape[1]]
+    return mask
+
+
+def reshape_2d(mat, m):
+    r"""
+    Reshape the input matrix to shape (-1, :math:`m \times m`).
+    If either dimension of :attr:`mat` is not a multiple of :attr:`m`,
+    this function pads the remainder with 0 before reshaping.
+
+    .. math::
+
+        remainder_0 = mat.shape[0] % m \\
+        remainder_1 = mat.shape[1] % m
+
+    Args:
+        mat (nparray): The input matrix.
+        m (int): The square root of the second dimension of the reshaped matrix.
+    Returns:
+        tuple: A pair of the reshaped and padded matrix and the shape of the padded matrix (before reshaping).
+    """
+    remainder_0 = mat.shape[0] % m
+    remainder_1 = mat.shape[1] % m
+
+    new_shape = (mat.shape[0] if remainder_0 == 0 \
+                 else mat.shape[0] + (m - remainder_0),
+                 mat.shape[1] if remainder_1 == 0 \
+                 else mat.shape[1] + (m - remainder_1))
+    mat_padded = np.zeros(new_shape)
+    mat_padded[:mat.shape[0], :mat.shape[1]] = mat
+
+    mat_flattern = np.empty(new_shape).reshape(-1, m * m)
+    curr_idx = 0
+    for row_start in range(0, mat_padded.shape[0], m):
+        row_end = row_start + m
+        for col_start in range(0, mat_padded.shape[1], m):
+            col_end = col_start + m
+            sub_mat = np.squeeze(mat_padded[row_start:row_end, \
+                                            col_start:col_end] \
+                                 .reshape(-1))
+            mat_flattern[curr_idx] = sub_mat
+            curr_idx += 1
+    return mat_flattern, mat_padded.shape
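+
+# Example: with m=4, a (5, 5) input is zero-padded to (8, 8) and returned as
+# four flattened 4x4 blocks, i.e. an array of shape (4, 16) plus the padded
+# shape (8, 8).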
+
+
+def check_mask_2d(mat, n, m):
+    r"""
+    Check if every :math:`m \times m` block of the input matrix :attr:`mat` is in a 2D `n:m` sparse pattern.
+    This function pads each dimension of :attr:`mat` with zeros to be a multiple of
+    :attr:`m` if necessary.
+
+    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block
+    under the constraint of at least :attr:`n` zeros for each row and column.
+
+    Args:
+        mat (nparray): The input matrix.
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+    Returns:
+        bool: True if every :math:`m \times m` block of the input matrix :attr:`mat` is in a 2D `n:m` sparse pattern, else False.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          x = np.array([[0, 8, 9, 0],
+                        [9, 0, 0, 10],
+                        [5, 0, 0, 6],
+                        [0, 4, 6, 0]])
+          sparsity.check_mask_2d(x, 2, 4) # True
+
+          x = np.array([[0, 8, 0, 9],
+                        [9, 0, 0, 10],
+                        [0, 5, 0, 6],
+                        [0, 4, 6, 0]])
+          sparsity.check_mask_2d(x, 2, 4) # False
+
+          # x would be padded to shape (8, 8)
+          x = np.array([[0, 8, 0, 9],
+                        [9, 0, 7, 0],
+                        [0, 5, 0, 6],
+                        [3, 0, 6, 0],
+                        [1, 1, 0, 1]])
+          sparsity.check_mask_2d(x, 2, 4) # True
+    """
+    mat_padded, shape = reshape_2d(mat, m)
+    for sub_mat in mat_padded:
+        sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0
+        if (np.sum(np.sum(sub_mask, axis=1) > (m-n)) != 0) and \
+           (np.sum(np.sum(sub_mask, axis=0) > (m-n)) != 0):
+            return False
+    return True
+
+
+def get_mask_2d_greedy(mat, n, m):
+    r"""
+    Greedily generate a 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`.
+    This function pads each dimension of :attr:`mat` with zeros to be a multiple of :attr:`m` before mask generation.
+
+    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block
+    under the constraint of at least :attr:`n` zeros for each row and column.
+    Greedy generation: for each :math:`m \times m` block, select the values to keep in descending order.
+
+    Args:
+        mat (nparray): The input matrix.
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+    Returns:
+        nparray: The 2D `n:m` sparse mask of :attr:`mat`.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          mat = np.array([[9, 8, 3, 7],
+                          [9, 2, 1, 10],
+                          [5, 1, 3, 6],
+                          [2, 4, 6, 1]])
+          mask = sparsity.get_mask_2d_greedy(mat, 2, 4)
+          # nparray([[1. 1. 0. 0.]
+          #          [1. 0. 0. 1.]
+          #          [0. 0. 1. 1.]
+          #          [0. 1. 1. 0.]])
+          sparsity.check_mask_2d(mask, 2, 4) # True
+    """
+    mat_padded, shape = reshape_2d(mat, m)
+    mask_padded = np.zeros_like(mat_padded).reshape(-1, m, m)
+
+    for idx in range(len(mat_padded)):
+        sub_mat = np.absolute(np.squeeze(mat_padded[idx]))
+        sub_mask = np.squeeze(mask_padded[idx])
+
+        min_order_1d_indices = np.argsort(sub_mat)
+        min_order_2d_indices = [(int(x / m), x % m)
+                                for x in min_order_1d_indices]
+        row_counter = collections.Counter()
+        col_counter = collections.Counter()
+
+        for i in range(len(min_order_1d_indices) - 1, -1, -1):
+            matrix_entry = min_order_2d_indices[i]
+            if (row_counter[matrix_entry[0]] == n) or \
+               (col_counter[matrix_entry[1]] == n):
+                continue
+
+            sub_mask[matrix_entry[0], matrix_entry[1]] = 1.0
+            row_counter[matrix_entry[0]] += 1
+            col_counter[matrix_entry[1]] += 1
+
+    mask = np.empty(shape)
+    curr_idx = 0
+    for row_start in range(0, shape[0], m):
+        row_end = row_start + m
+        for col_start in range(0, shape[1], m):
+            col_end = col_start + m
+            mask[row_start:row_end, col_start:col_end] = mask_padded[curr_idx]
+            curr_idx += 1
+    return mask[:mat.shape[0], :mat.shape[1]]
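+
+# Module-level cache of valid 2D patterns, keyed by '{m}_{n}'. The lock keeps
+# concurrent callers of compute_valid_2d_patterns from racing on the first
+# (expensive) pattern enumeration.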
+valid_2d_patterns_lock = threading.Lock()
+valid_2d_patterns = {}
+
+
+def compute_valid_2d_patterns(n, m):
+    r"""
+    Compute all valid 2D `n:m` sparse patterns.
+
+    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block
+    under the constraint of at least :attr:`n` zeros for each row and column.
+
+    Args:
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+    Returns:
+        nparray: All valid 2D `n:m` sparse patterns, also cached in a module-level dictionary under key *m_n* (string).
+    """
+    global valid_2d_patterns_lock
+    global valid_2d_patterns
+
+    valid_key = '{}_{}'.format(m, n)
+    if valid_key in valid_2d_patterns:
+        return valid_2d_patterns[valid_key]
+    else:
+        patterns = np.zeros(m)
+        patterns[:n] = 1
+        patterns = list(set(permutations(patterns.tolist())))
+        patterns = patterns + patterns
+        patterns = np.asarray(list(set(permutations(patterns, m))))
+
+        valid = ((patterns.sum(axis=1) <= n).sum(axis=1) == m
+                 ).nonzero()[0].reshape(-1)
+        valid_patterns = np.empty((valid.shape[0], m, m))
+        valid_patterns[:] = patterns[valid[:]]
+
+        valid_2d_patterns_lock.acquire()
+        valid_2d_patterns[valid_key] = valid_patterns
+        valid_2d_patterns_lock.release()
+
+        return valid_patterns
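+
+# For the default 2:4 case this enumerates the 90 binary 4x4 matrices whose
+# rows and columns each contain exactly two ones, cached under key '4_2'.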
+
+
+def get_mask_2d_best(mat, n, m):
+    r"""
+    Generate a 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`
+    that forms a sparse matrix with maximum L1 norm. This function pads each
+    dimension of :attr:`mat` with zeros to be a multiple of :attr:`m` before mask generation.
+
+    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block
+    under the constraint of at least :attr:`n` zeros for each row and column.
+
+    *Note*: The L1 norm of a sparse matrix from the `Best` API is greater than or equal to the one from `Greedy`.
+
+    Args:
+        mat (nparray): The input matrix.
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+    Returns:
+        nparray: The 2D `n:m` sparse mask of :attr:`mat`.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          mat = np.array([[2, 8, 9, 9],
+                          [9, 1, 3, 9],
+                          [5, 6, 3, 9],
+                          [2, 4, 6, 9]])
+          mask_greedy = sparsity.get_mask_2d_greedy(mat, 2, 4)
+          mask_best = sparsity.get_mask_2d_best(mat, 2, 4)
+          print("L1 norm of `greedy` sparse matrix", np.multiply(mat, mask_greedy).sum()) # 56
+          print("L1 norm of `best` sparse matrix", np.multiply(mat, mask_best).sum()) # 61
+    """
+    patterns = compute_valid_2d_patterns(n, m)
+
+    mat_flattern, shape = reshape_2d(mat, m)
+    mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m)
+    pmax = np.argmax(
+        np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T),
+        axis=1)
+
+    mask_flattern[:] = patterns[pmax[:]]
+    mask = np.empty(shape)
+
+    curr_idx = 0
+    for row_start in range(0, shape[0], m):
+        row_end = row_start + m
+        for col_start in range(0, shape[1], m):
+            col_end = col_start + m
+            mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx]
+            curr_idx += 1
+    return mask[:mat.shape[0], :mat.shape[1]]
+
+
+def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
+    r"""
+    Create an `n:m` sparse pattern mask of the input tensor via the function given by :attr:`func_name`.
+    Currently only tensors with dimension less than or equal to 4 are supported.
+
+    Args:
+        tensor (nparray): The input tensor.
+        func_name (MaskAlgo, optional): The function name to generate the sparse mask. Default is `MaskAlgo.MASK_1D`. For all options please refer to `MaskAlgo`.
+        n (int, optional): n of `n:m` sparse pattern. Default is 2.
+        m (int, optional): m of `n:m` sparse pattern. Default is 4.
+    Returns:
+        nparray: The `n:m` sparse mask of :attr:`tensor` generated by :attr:`func_name`.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          tensor = np.array([[2, 8, 9, 9],
+                             [9, 1, 3, 9],
+                             [5, 6, 3, 9],
+                             [2, 4, 6, 9]])
+          mask_1d = sparsity.create_mask(tensor, func_name=sparsity.MaskAlgo.MASK_1D)
+          # nparray([[0 0 1 1],
+          #          [1 0 0 1],
+          #          [0 1 0 1],
+          #          [0 0 1 1]])
+          mask_2d = sparsity.create_mask(tensor, func_name=sparsity.MaskAlgo.MASK_2D_BEST)
+          # nparray([[0 1 1 0],
+          #          [1 0 0 1],
+          #          [1 1 0 0],
+          #          [0 0 1 1]])
+    """
+    shape = tensor.shape
+    dtype = tensor.dtype
+    t = tensor.astype(float)
+
+    assert type(func_name) == MaskAlgo, \
+           "func_name argument of create_mask is only accepted as type MaskAlgo. " \
+           "But got {}".format(type(func_name))
+    func = getattr(sys.modules[__name__], func_name.value, None)
+    if len(shape) == 1:
+        t = t.reshape(1, shape[0])
+        mask = func(t, n=n, m=m)
+        return mask.reshape(shape).astype(dtype)
+    elif len(shape) == 2:
+        t = t.reshape(shape[0], shape[1])
+        mask = func(t, n=n, m=m)
+        return mask.reshape(shape).astype(dtype)
+    elif len(shape) == 3:
+        t = t.reshape(shape[0] * shape[1], shape[2])
+        mask = func(t, n=n, m=m)
+        return mask.reshape(shape).astype(dtype)
+    # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op
+    elif len(shape) == 4:
+        t = t.reshape(shape[0], shape[1] * shape[2] * shape[3])
+        mask = func(t, n=n, m=m)
+        return mask.reshape(shape).astype(dtype)
+    else:
+        assert False, "The dimension of the input tensor is not supported in create_mask. " \
+                      "Only dimension <= 4 is supported, but got {}".format(len(shape))
" \ + "But got {}".format(type(func_name)) + func = getattr(sys.modules[__name__], func_name.value, None) + if len(shape) == 1: + t = t.reshape(1, shape[0]) + return func(t, n=n, m=m) + elif len(shape) == 2: + t = t.reshape(shape[0], shape[1]) + return func(t, n=n, m=m) + elif len(shape) == 3: + t = t.reshape(shape[0] * shape[1], shape[2]) + return func(t, n=n, m=m) + # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op + elif len(shape) == 4: + t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) + return func(t, n=n, m=m) + else: + assert True, "The dimension of input tensor is not supported in check_sparsity, " \ + "Only dimension < 4 is supported but got {}".format(len(shape)) + + return False diff --git a/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py index cfe70cf2922392..815018dc4b2f4e 100644 --- a/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py +++ b/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py @@ -69,7 +69,7 @@ def get_model(self, main_prog, startup_program, rank): axis=1, num_partitions=2, weight_attr=param_attr, - bias_attr=False, ) + bias_attr=True, ) return [linear_out] diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py index 14d7e960f4a68c..912849ffbeb71c 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py @@ -113,7 +113,7 @@ def test_pp_model(self): print("loss: ", loss_a.numpy(), loss_b.numpy()) np.testing.assert_allclose( - loss_a.numpy(), loss_b.numpy(), rtol=1e-5) + loss_a.numpy(), loss_b.numpy(), rtol=5e-5) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py new file mode 100644 index 00000000000000..95de37fdc0251a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py @@ -0,0 +1,78 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestCastBF16ToFP32MKLDNNOp(OpTest): + def init_data(self): + self.out = np.random.random(size=[10, 10]).astype("float32") + self.x = convert_float_to_uint16(self.out) + + def setUp(self): + self.init_data() + self.inputs = {'X': self.x} + self.outputs = {'Out': self.out} + prepare_dtype = lambda x: int(core.VarDesc.VarType.BF16 if x.dtype != np.float32 else core.VarDesc.VarType.FP32) + self.attrs = { + 'in_dtype': prepare_dtype(self.x), + 'out_dtype': prepare_dtype(self.out), + 'use_mkldnn': True + } + self.op_type = 'cast' + + def test_check_output(self): + self.check_output(check_dygraph=False) + + def test_check_grad(self): + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + check_dygraph=False, + user_defined_grads=[self.inputs['X']], + user_defined_grad_outputs=[self.outputs['Out']]) + + +class TestCastFP32ToBF16MKLDNNOp(TestCastBF16ToFP32MKLDNNOp): + def init_data(self): + self.x = np.random.random(size=[2, 6]).astype("float32") + self.out = convert_float_to_uint16(self.x) + + +class TestCastBF16ToBF16MKLDNNOp(TestCastBF16ToFP32MKLDNNOp): + def init_data(self): + self.x = np.random.random(size=[6, 13]).astype("uint16") + self.out = self.x + + +class TestCastFP32ToFP32MKLDNNOp(TestCastBF16ToFP32MKLDNNOp): + def init_data(self): + self.x = np.random.random(size=[7, 15]).astype("float32") + self.out = self.x + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 3524d1e553d1bb..654723d8629900 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1191,8 +1191,12 @@ def find_actual(target_name, fetch_list): np.float32, np.float64 ]: actual_t = convert_uint16_to_float(actual_t) - atol = 0.03 + atol = max(atol, 0.03) + if expect_t.dtype == np.uint16 and actual_t.dtype == np.uint16: + expect_t = convert_uint16_to_float(expect_t) + actual_t = convert_uint16_to_float(actual_t) + atol = max(atol, 0.03) # NOTE(zhiqiu): np.allclose([], [1.]) returns True # see details: https://stackoverflow.com/questions/38331703/why-does-numpys-broadcasting-sometimes-allow-comparing-arrays-of-different-leng if expect_t.size == 0: @@ -1501,13 +1505,21 @@ def check_grad_with_place(self, # comparison of bf16 results will happen as fp32 # loop over list of grads and convert bf16 to fp32 - fp32_grads = [] + fp32_analytic_grads = [] for grad in analytic_grads: if grad.dtype == np.uint16: grad = convert_uint16_to_float(grad) max_relative_error = 0.03 - fp32_grads.append(grad) - analytic_grads = fp32_grads + fp32_analytic_grads.append(grad) + analytic_grads = fp32_analytic_grads + + fp32_numeric_grads = [] + for grad in numeric_grads: + if grad.dtype == np.uint16: + grad = convert_uint16_to_float(grad) + max_relative_error = 0.03 + fp32_numeric_grads.append(grad) + numeric_grads = fp32_numeric_grads self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check, max_relative_error, diff --git a/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py 
diff --git a/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py
index a62e3c05508a16..a24c0874482113 100644
--- a/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py
+++ b/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py
@@ -65,12 +65,12 @@ def get_model(self, main_prog, startup_program, rank):
         linear_out = paddle.distributed.split(
             data,
-            size=(1000, 8),
+            size=(1000, 16),
             operation='linear',
             axis=0,
             num_partitions=2,
             weight_attr=param_attr,
-            bias_attr=False, )
+            bias_attr=True, )
 
         return [linear_out]
diff --git a/python/paddle/fluid/tests/unittests/test_asp_utils.py b/python/paddle/fluid/tests/unittests/test_asp_utils.py
new file mode 100644
index 00000000000000..faffd477ae5661
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_asp_utils.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import threading, time
+import paddle
+from paddle.fluid.contrib import sparsity
+import numpy as np
+
+
+class TestASPUtils(unittest.TestCase):
+    def test_get_check_method(self):
+        self.assertEqual(
+            sparsity.CheckMethod.get_checking_method(sparsity.MaskAlgo.MASK_1D),
+            sparsity.CheckMethod.CHECK_1D)
+        self.assertEqual(
+            sparsity.CheckMethod.get_checking_method(
+                sparsity.MaskAlgo.MASK_2D_GREEDY),
+            sparsity.CheckMethod.CHECK_2D)
+        self.assertEqual(
+            sparsity.CheckMethod.get_checking_method(
+                sparsity.MaskAlgo.MASK_2D_BEST), sparsity.CheckMethod.CHECK_2D)
+
+    def test_density(self):
+        x = np.array([[1.0, 1.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0],
+                      [1.0, 0.0, 0.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0],
+                      [0.0, 1.0, 0.0, 0.0, 1.0]])
+        self.assertEqual(sparsity.density(x), 0.56)
+        x[:, 0] = 0.0
+        self.assertEqual(sparsity.density(x), 0.4)
+
+    def test_check_mask_1d(self):
+        x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0],
+                      [1.0, 1.0, 0.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0],
+                      [0.0, 1.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(sparsity.check_mask_1d(x, 2, 4))
+        self.assertFalse(sparsity.check_mask_1d(x, 3, 4))
+        self.assertTrue(sparsity.check_mask_1d(x, 2, 5))
+        self.assertFalse(sparsity.check_mask_1d(x, 3, 5))
+        self.assertTrue(sparsity.check_mask_1d(x, 3, 6))
+        self.assertFalse(sparsity.check_mask_1d(x, 4, 6))
+
+    def test_get_mask_1d(self):
+        for _ in range(10):
+            x = np.random.randint(10, size=(5, 5))
+            x = sparsity.get_mask_1d(x, 2, 4)
+            self.assertTrue(sparsity.check_mask_1d(x, 2, 4))
+
+            x = np.random.randn(5, 4)
+            x = sparsity.get_mask_1d(x, 2, 4)
+            self.assertTrue(sparsity.check_mask_1d(x, 2, 4))
+
+    def test_check_mask_2d(self):
+        x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 0.0, 0.0, 0.0],
+                      [0.0, 0.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 0.0],
+                      [0.0, 1.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+        self.assertFalse(sparsity.check_mask_2d(x, 3, 4))
+        self.assertTrue(sparsity.check_mask_2d(x, 2, 5))
+        self.assertFalse(sparsity.check_mask_2d(x, 3, 5))
+        self.assertTrue(sparsity.check_mask_2d(x, 3, 6))
+        self.assertFalse(sparsity.check_mask_2d(x, 4, 6))
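+
+    # As the assertions above exercise it, check_mask_1d(x, n, m) accepts a
+    # matrix when every group of m consecutive elements in a row contains at
+    # least n zeros; rows shorter than a multiple of m behave as if padded
+    # with zeros, e.g. [1, 0, 0, 1, 1] grouped with m=6 acts like
+    # [1, 0, 0, 1, 1, 0] with three zeros, so n=3 passes while n=4 fails.
+    # check_mask_2d applies the analogous requirement to the rows and columns
+    # of each m x m block.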
+
+    def test_get_mask_2d_greedy(self):
+        for _ in range(10):
+            x = np.random.randint(10, size=(5, 5))
+            x = sparsity.get_mask_2d_greedy(x, 2, 4)
+            self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+
+            x = np.random.randn(5, 4)
+            x = sparsity.get_mask_2d_greedy(x, 2, 4)
+            self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+
+    def test_get_mask_2d_best(self):
+        for _ in range(10):
+            x = np.random.randint(10, size=(5, 5))
+            x = sparsity.get_mask_2d_best(x, 2, 4)
+            self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+
+            x = np.random.randn(5, 4)
+            x = sparsity.get_mask_2d_best(x, 2, 4)
+            self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+
+    def test_threadsafe_valid_2d_patterns(self):
+        def get_reference(m=4, n=2):
+            from itertools import permutations
+
+            patterns = np.zeros(m)
+            patterns[:n] = 1
+            patterns = list(set(permutations(patterns.tolist())))
+            patterns = patterns + patterns
+            patterns = np.asarray(list(set(permutations(patterns, m))))
+
+            valid = ((patterns.sum(axis=1) <= n).sum(axis=1) == m
+                     ).nonzero()[0].reshape(-1)
+            valid_patterns = np.empty((valid.shape[0], m, m))
+            valid_patterns[:] = patterns[valid[:]]
+            return valid_patterns
+
+        for _ in range(4):
+            computing_thread = threading.Thread(
+                target=paddle.fluid.contrib.sparsity.utils.
+                compute_valid_2d_patterns,
+                args=(2, 4))
+            computing_thread.start()
+        time.sleep(3)
+        patterns_map = paddle.fluid.contrib.sparsity.utils.valid_2d_patterns
+        reference_patterns = get_reference()
+        reference_key = '4_2'
+
+        self.assertTrue(reference_key in patterns_map)
+        self.assertTrue(len(patterns_map) == 1)
+        self.assertTrue(
+            (reference_patterns == patterns_map[reference_key]).all())
+
+    def test_check_sparsity(self):
+        for _ in range(10):
+            x = np.random.randint(10, size=(5))
+            x_2d = x.reshape(1, x.shape[0])
+            self.__test_1D_2D_sparsity_checking_methods(x_2d)
+
+            x = np.random.randint(10, size=(5, 5))
+            x_2d = x
+            self.__test_1D_2D_sparsity_checking_methods(x_2d)
+
+            x = np.random.randint(10, size=(5, 5, 5))
+            x_2d = x.reshape(x.shape[0] * x.shape[1], x.shape[2])
+            self.__test_1D_2D_sparsity_checking_methods(x_2d)
+
+            x = np.random.randint(10, size=(5, 5, 5, 5))
+            x_2d = x.reshape(x.shape[0], x.shape[1] * x.shape[2] * x.shape[3])
+            self.__test_1D_2D_sparsity_checking_methods(x_2d)
+
+    def test_create_mask(self):
+        for _ in range(10):
+            x = np.random.randint(10, size=(5))
+            self.__test_1D_2D_sparse_mask_generation_methods(x)
+
+            x = np.random.randint(10, size=(5, 5))
+            self.__test_1D_2D_sparse_mask_generation_methods(x)
+
+            x = np.random.randint(10, size=(5, 5, 5))
+            self.__test_1D_2D_sparse_mask_generation_methods(x)
+
+            x = np.random.randint(10, size=(5, 5, 5, 5))
+            self.__test_1D_2D_sparse_mask_generation_methods(x)
+
+    def __test_1D_2D_sparsity_checking_methods(self, x_2d):
+        mask = sparsity.get_mask_1d(x_2d, 2, 4)
+        self.assertEqual(
+            sparsity.check_sparsity(
+                mask, func_name=sparsity.CheckMethod.CHECK_1D, n=2, m=4),
+            sparsity.check_mask_1d(mask, 2, 4))
+        mask = sparsity.get_mask_2d_best(x_2d, 2, 4)
+        self.assertEqual(
+            sparsity.check_sparsity(
+                mask, func_name=sparsity.CheckMethod.CHECK_2D, n=2, m=4),
+            sparsity.check_mask_2d(mask, 2, 4))
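+
+    # CheckMethod.get_checking_method (see test_get_check_method above) pairs
+    # each mask-generation algorithm with its verifier: MASK_1D with CHECK_1D,
+    # and both MASK_2D_GREEDY and MASK_2D_BEST with CHECK_2D. The helper below
+    # therefore hands create_mask a MaskAlgo and check_sparsity the matching
+    # CheckMethod.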
+
+    def __test_1D_2D_sparse_mask_generation_methods(self, x):
+        mask = sparsity.create_mask(
+            x, func_name=sparsity.MaskAlgo.MASK_1D, n=2, m=4)
+        self.assertTrue(
+            sparsity.check_sparsity(
+                mask, func_name=sparsity.CheckMethod.CHECK_1D, n=2, m=4))
+        mask = sparsity.create_mask(
+            x, func_name=sparsity.MaskAlgo.MASK_2D_GREEDY, n=2, m=4)
+        self.assertTrue(
+            sparsity.check_sparsity(
+                mask, func_name=sparsity.CheckMethod.CHECK_2D, n=2, m=4))
+        mask = sparsity.create_mask(
+            x, func_name=sparsity.MaskAlgo.MASK_2D_BEST, n=2, m=4)
+        self.assertTrue(
+            sparsity.check_sparsity(
+                mask, func_name=sparsity.CheckMethod.CHECK_2D, n=2, m=4))
diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
index e6693b676cf643..f0c042eb7e95b6 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -154,7 +154,10 @@ def _run_cluster(self, model_file, envs):
         #update environment
         env0.update(envs)
         env1.update(envs)
-        tr_cmd = "%s %s"
+        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
+            tr_cmd = "%s -m coverage run --branch -p %s"
+        else:
+            tr_cmd = "%s %s"
         tr0_cmd = tr_cmd % (self._python_interp, model_file)
         tr1_cmd = tr_cmd % (self._python_interp, model_file)
         tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w")
diff --git a/python/setup.py.in b/python/setup.py.in
index 0f2e97192c1df1..79c67182f9c791 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -185,6 +185,7 @@ packages=['paddle',
           'paddle.fluid.contrib.mixed_precision',
           'paddle.fluid.contrib.mixed_precision.bf16',
           'paddle.fluid.contrib.layers',
+          'paddle.fluid.contrib.sparsity',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details',
           'paddle.fluid.incubate',
diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py
index 470242da34ddd0..0df3b4914f5df9 100644
--- a/tools/get_pr_ut.py
+++ b/tools/get_pr_ut.py
@@ -233,9 +233,9 @@ def is_only_comment(self, f):
     def get_all_count(self):
         os.system(
-            "cd %s/build && ctest -N|grep 'Total Tests:' | awk -F ': ' '{print $2}' > testCount"
+            "cd %sbuild && ctest -N|grep 'Total Tests:' | awk -F ': ' '{print $2}' > testCount"
             % PADDLE_ROOT)
-        f = open("%s/build/testCount" % PADDLE_ROOT)
+        f = open("%sbuild/testCount" % PADDLE_ROOT)
         testCount = f.read()
         f.close()
         return int(testCount.strip())
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index cb0581d671006e..55b82084f6bc5a 100644
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -650,7 +650,6 @@
     'test_collective_wait',
     'test_collective_split_row_linear',
     'test_collective_split_embedding',
-    'test_custom_attrs_jit',
     'float16_gpu_test',
     'test_leaky_relu_grad_grad_functor',
     'test_complex_simplenet',
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index c5ea8891a21ee2..2c50c4bf9f6207 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -589,6 +589,7 @@
     'test_matmul_op_with_head',
     'test_var_conv_2d',
     'test_batch_norm_mkldnn_op',
+    'test_cast_mkldnn_op',
     'test_concat_int8_mkldnn_op',
     'test_concat_bf16_mkldnn_op',
     'test_concat_mkldnn_op',
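The get_pr_ut.py change above drops the hard-coded path separator, which is only safe if PADDLE_ROOT is guaranteed to end with one; if that invariant ever breaks, "%sbuild" silently produces a wrong path. Joining the pieces is the more defensive pattern. A minimal standalone sketch (not the PR's code; the '/paddle/' default is illustrative):

    import os

    PADDLE_ROOT = os.getenv('PADDLE_ROOT', '/paddle/')
    # os.path.join yields '/paddle/build' whether or not the root
    # carries a trailing separator.
    build_dir = os.path.join(PADDLE_ROOT, 'build')
    count_file = os.path.join(build_dir, 'testCount')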
diff --git a/tools/test_model_benchmark.sh b/tools/test_model_benchmark.sh
index 8f8026b0adcef7..98066d7beeaa77 100644
--- a/tools/test_model_benchmark.sh
+++ b/tools/test_model_benchmark.sh
@@ -24,11 +24,13 @@ function check_whl {
     mkdir -p /tmp/pr && mkdir -p /tmp/develop
 
     unzip -q build/python/dist/*.whl -d /tmp/pr
+    rm -f build/python/dist/*.whl && rm -f build/python/build/.timestamp
 
     git checkout .
     git checkout -b develop_base_pr upstream/$BRANCH
+    bash -x paddle/scripts/paddle_build.sh build
+    [ $? -ne 0 ] && echo "install paddle failed." && exit 1
     cd build
-    make -j `nproc`
     unzip -q python/dist/*.whl -d /tmp/develop
 
     sed -i '/version.py/d' /tmp/pr/*/RECORD
diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat
index 4a61a99c34fa24..603c9911a44f99 100644
--- a/tools/windows/build_compile_environment.bat
+++ b/tools/windows/build_compile_environment.bat
@@ -132,7 +132,7 @@ goto :eof
 :vs
 echo ">>>>>>>> step [4/7]: Visual Studio 2017 "
 cmd /C "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" > nul 2> nul || call :install_visual_studio
-goto :cuda10
+goto :cuda
 
 :install_visual_studio
 echo There is not Visual Studio in this PC, will install VS2017.
@@ -153,7 +153,7 @@ goto :eof
 :: ===== end step 4: Visual Studio 2017 =====
 
 :: ===== start step 5: CUDA 11 =====
-:cuda10
+:cuda
 echo ">>>>>>>> step [5/7]: CUDA 11.2"
 cmd /C nvcc --version 2> nul | findstr /C:"11.2" > nul 2> nul || call :install_cuda
 goto java-jre