Shixiaowei02 · Shixiaowei02 · May 9, 2019 · Apr 29, 2019 · Apr 29, 2019 · Apr 29, 2019
diff --git a/cmake/anakin_subgraph.cmake b/cmake/anakin_subgraph.cmake
@@ -25,8 +25,9 @@ endif()
 
 if(ANAKIN_FOUND)
     message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
+    include_directories(${ANAKIN_ROOT})  # search the Anakin root itself for headers
     include_directories(${ANAKIN_ROOT}/include)
-    include_directories(${ANAKIN_ROOT}/include/saber)
+    include_directories(${ANAKIN_ROOT}/saber)  # saber is searched under the root, no longer under include/
     link_directories(${ANAKIN_ROOT})
     add_definitions(-DPADDLE_WITH_ANAKIN)
 endif()
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
@@ -77,6 +77,7 @@ else(WIN32)
 ENDIF(WIN32)
 
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
+get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) # Directory part of the warp-ctc library path.
 INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
 INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.
 

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
@@ -455,21 +455,29 @@ void MultiSlotDataFeed::Init(
   all_slots_.resize(all_slot_num);
   all_slots_type_.resize(all_slot_num);
   use_slots_index_.resize(all_slot_num);
+  total_dims_without_inductive_.resize(all_slot_num);
+  inductive_shape_index_.resize(all_slot_num);
   use_slots_.clear();
   use_slots_is_dense_.clear();
   for (size_t i = 0; i < all_slot_num; ++i) {
     const auto& slot = multi_slot_desc.slots(i);
     all_slots_[i] = slot.name();
     all_slots_type_[i] = slot.type();
     use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;
+    total_dims_without_inductive_[i] = 1;
+    inductive_shape_index_[i] = -1;
     if (slot.is_used()) {
       use_slots_.push_back(all_slots_[i]);
       use_slots_is_dense_.push_back(slot.is_dense());
       std::vector<int> local_shape;
       if (slot.is_dense()) {
-        // for batch size holder if is_dense
-        if (slot.shape(0) > 0) {
-          local_shape.push_back(0);
+        for (size_t i = 0; i < slot.shape_size(); ++i) {
+          if (slot.shape(i) > 0) {
+            total_dims_without_inductive_[i] *= slot.shape(i);
+          }
+          if (slot.shape(i) == -1) {
+            inductive_shape_index_[i] = i;
+          }
         }
       }
       for (size_t i = 0; i < slot.shape_size(); ++i) {
@@ -762,7 +770,10 @@ void MultiSlotDataFeed::PutToFeedVec(
     LoD data_lod{offset};
     feed_vec_[i]->set_lod(data_lod);
     if (use_slots_is_dense_[i]) {
-      use_slots_shape_[i][0] = batch_size_;
+      if (inductive_shape_index_[i] != -1) {
+        use_slots_shape_[i][inductive_shape_index_[i]] =
+            total_instance / total_dims_without_inductive_[i];
+      }
       feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
     }
   }
@@ -785,6 +796,8 @@ void MultiSlotInMemoryDataFeed::Init(
   all_slots_.resize(all_slot_num);
   all_slots_type_.resize(all_slot_num);
   use_slots_index_.resize(all_slot_num);
+  // assign() instead of resize(): every slot must start with product 1 and
+  // "no inductive dim" (-1). Zero-filled entries would keep the product at 0
+  // and make the later division by total_dims_without_inductive_ divide by 0.
+  total_dims_without_inductive_.assign(all_slot_num, 1);
+  inductive_shape_index_.assign(all_slot_num, -1);
   use_slots_.clear();
   use_slots_is_dense_.clear();
   for (size_t i = 0; i < all_slot_num; ++i) {
@@ -797,8 +810,13 @@ void MultiSlotInMemoryDataFeed::Init(
       use_slots_is_dense_.push_back(slot.is_dense());
       std::vector<int> local_shape;
       if (slot.is_dense()) {
-        if (slot.shape(0) > 0) {
-          local_shape.push_back(0);
+        for (size_t i = 0; i < slot.shape_size(); ++i) {
+          if (slot.shape(i) > 0) {
+            total_dims_without_inductive_[i] *= slot.shape(i);
+          }
+          if (slot.shape(i) == -1) {
+            inductive_shape_index_[i] = i;
+          }
         }
       }
       for (size_t i = 0; i < slot.shape_size(); ++i) {
@@ -960,7 +978,10 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
     LoD data_lod{offset};
     feed_vec_[i]->set_lod(data_lod);
     if (use_slots_is_dense_[i]) {
-      use_slots_shape_[i][0] = batch_size_;
+      if (inductive_shape_index_[i] != -1) {
+        use_slots_shape_[i][inductive_shape_index_[i]] =
+            total_instance / total_dims_without_inductive_[i];
+      }
       feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
     }
   }

diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
@@ -143,6 +143,8 @@ class DataFeed {
   std::vector<std::string> all_slots_;
   std::vector<std::string> all_slots_type_;
   std::vector<std::vector<int>> use_slots_shape_;
+  std::vector<int> inductive_shape_index_;
+  std::vector<int> total_dims_without_inductive_;
   std::vector<int>
       use_slots_index_;  // -1: not used; >=0: the index of use_slots_
 

diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc
@@ -121,6 +121,16 @@ int64_t product(const DDim& ddim) {
   return ddim.apply_visitor(ProductVisitor());
 }
 
+bool contain_unknown_dim(const DDim& ddim) {
+  // True as soon as one extent is negative; false when every extent is >= 0
+  // (including the case where the loop body never runs).
+  bool found_negative = false;
+  for (int axis = 0; !found_negative && axis < ddim.size(); ++axis) {
+    found_negative = ddim[axis] < 0;
+  }
+
+  return found_negative;
+}
+
 DDim slice_ddim(const DDim& dim, int begin, int end) {
   PADDLE_ENFORCE(begin >= 0 && end <= dim.size(),
                  "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.",

diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h
@@ -182,6 +182,8 @@ std::vector<int> vectorize2int(const DDim& ddim);
 
 int64_t product(const DDim& ddim);
 
+bool contain_unknown_dim(const DDim& ddim);
+
 /**
  * \brief Slice a ddim
  *

diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
@@ -1,22 +1,12 @@
 cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
-cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base)
+
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
-
 cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
-cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
-cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
-
-cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)
-cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
-cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
-cc_library(fuse_momentum_op_pass SRCS fuse_momentum_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
-
-cc_library(record_skip_memory_opt_vars_pass SRCS record_skip_memory_opt_vars_pass.cc DEPS graph graph_helper)
 
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
 
@@ -27,7 +17,7 @@ if(WITH_DISTRIBUTE)
     endif()
 endif()
 
-set(all_reduce_deps all_reduce_op_handle)
+
 if(WITH_GPU)
     nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
             dynload_cuda variable_visitor)
@@ -37,7 +27,6 @@ if(WITH_GPU)
     if(WITH_DGC)
         nv_library(sparse_all_reduce_op_handle SRCS sparse_all_reduce_op_handle.cc DEPS op_handle_base scope 
             lod_tensor ddim memory dynload_cuda variable_visitor dgc all_reduce_op_handle)
-        set(all_reduce_deps sparse_all_reduce_op_handle)
     endif()
 
     if(WITH_DISTRIBUTE)
@@ -68,34 +57,12 @@ endif()
 
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 
-if(WITH_GPU)
-cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
-else()
-cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
-endif()
-
-cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
-cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
-cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
-cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
 cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
-cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
-cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass)
-cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
-
-cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
-cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
-
-cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle ${all_reduce_deps} reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
-
-cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)
 
 set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
 if (WITH_GPU)
   list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
 endif()
-cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
 
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
-Original file line number
+Diff line change
@@ Expand Up / @@ -182,6 +182,8 @@ std::vector<int> vectorize2int(const DDim& ddim); @@
     int64_t product(const DDim& ddim);
+    bool contain_unknown_dim(const DDim& ddim);
     /**
      * \brief Slice a ddim
      *
@@ Expand Down @@