@@ -136,9 +136,9 @@ def as_numpy(tensor, copy=False):
136136 numpy.ndarray
137137 """
138138 if isinstance (tensor , core .LoDTensorArray ):
139- return [as_numpy (t ) for t in tensor ]
139+ return [as_numpy (t , copy ) for t in tensor ]
140140 if isinstance (tensor , list ):
141- return [as_numpy (t ) for t in tensor ]
141+ return [as_numpy (t , copy ) for t in tensor ]
142142 assert isinstance (tensor , core .LoDTensor )
143143 lod = tensor .lod ()
144144 if len (lod ) > 0 :
@@ -383,6 +383,17 @@ def _to_str(var):
383383 return _to_str (var )
384384
385385
386+ def _is_enable_standalone_executor ():
387+ """
388+ Whether to use experimental executor `StandaloneExecutor`.
389+ """
390+ flag = False
391+ env_val = os .environ .get ('FLAGS_USE_STANDALONE_EXECUTOR' , None )
392+ if env_val in [1 , '1' , True , 'True' , 'true' ]:
393+ flag = True
394+ return flag
395+
396+
def _get_strong_program_cache_key(program, feed, fetch_list):
    """Build a cache key that is unique per Program object identity.

    Prefixing the program's ``id()`` onto the weaker feed/fetch key ensures
    two distinct Program instances never share a cache entry.
    """
    weak_key = _get_program_cache_key(feed, fetch_list)
    return "{}{}".format(id(program), weak_key)
388399
@@ -472,6 +483,121 @@ def handler(self, res_dict):
472483""" )
473484
474485
486+ class _StandaloneExecutor (object ):
487+ def __init__ (self , place , main_program ):
488+ self ._place = core .Place ()
489+ self ._place .set_place (place )
490+ self ._main_program = main_program
491+ self ._new_exe = self ._create_new_executor ()
492+
493+ def run (self , feed , fetch_list , return_numpy = True ):
494+ """
495+ Args:
496+ feed(list|dict): This parameter represents the input Tensors of the model.
497+ If it is single card training, the feed is dict type, and if it is multi-card
498+ training, the parameter feed can be dict or list of Tensors. If the
499+ parameter type is dict, the data in the feed will be split and sent to
500+ multiple devices (CPU/GPU), that is to say, the input data will be evenly
501+ sent to different devices, so you should make sure the number of samples of
502+ the current mini-batch must be greater than the number of places;
503+ if the parameter type is list, those data are copied directly to each device,
504+ so the length of this list should be equal to the number of places.
505+ The default is None.
506+ fetch_list(list): This parameter represents the Tensors that need to be returned
507+ after the model runs. The default is None.
508+ return_numpy(bool): This parameter indicates whether convert the fetched Tensors
509+ (the Tensor specified in the fetch list) to numpy.ndarray. if it is False,
510+ the type of the return value is a list of :code:`LoDTensor`. The default is True.
511+ """
512+ feed = self ._update_feed (feed )
513+ fetch_list = self ._check_fetch (fetch_list )
514+
515+ tensors = self ._new_exe .run (feed , fetch_list )._move_to_list ()
516+ if return_numpy :
517+ return as_numpy (tensors , copy = True )
518+ else :
519+ return tensors
520+
521+ def _create_new_executor (self ):
522+ # NOTE: It's a trick to set empty start_up program.
523+ startup_program = Program ()
524+ outer_scope = global_scope ()
525+ new_exe = core .StandaloneExecutor (self ._place , startup_program .desc ,
526+ self ._main_program .desc , outer_scope )
527+
528+ return new_exe
529+
530+ def _update_feed (self , feed ):
531+ """
532+ Update the feed dict, remove the feed item which is pruned in program.
533+
534+ Notes: This is a very low level API. Users should not use this API
535+ directly.
536+
537+ Args:
538+ feed(list|dict): feed dict or list.
539+
540+ Returns:
541+ feed:(list|dict) updated feed.
542+ """
543+ global_block = self ._main_program .global_block ()
544+ if feed is None :
545+ feed = {}
546+ elif isinstance (feed , dict ):
547+ for feed_name in list (feed .keys ()):
548+ if not global_block .has_var (feed_name ):
549+ feed .pop (feed_name )
550+ warnings .warn (
551+ "The variable %s is not found in program. It is not declared or is pruned."
552+ % feed_name )
553+ else :
554+ raise TypeError ("Only support feed with `dict`, but received {}" .
555+ format (type (feed ).__name__ ))
556+
557+ return feed
558+
559+ def _check_fetch (self , fetch_list ):
560+ if fetch_list is None :
561+ fetch_list = []
562+
563+ res = []
564+ for fetch_var in fetch_list :
565+ if isinstance (fetch_var , Variable ):
566+ fetch_var = fetch_var .name
567+ elif not isinstance (fetch_var , str ):
568+ raise TypeError (
569+ "Required fetch_var shall be str|Variable, but received {}" .
570+ format (type (fetch_var ).__name__ ))
571+
572+ res .append (fetch_var )
573+ return res
574+
575+
576+ class _ExecutorCache (object ):
577+ def __init__ (self , place ):
578+ # {Program : _StandaloneExecutor}
579+ self ._place = place
580+ self ._cached_executors = {}
581+
582+ def run (self , program , feed , fetch_list , return_numpy = True ):
583+ new_exe = self ._get_exe_from_cache (program )
584+ return new_exe .run (feed , fetch_list , return_numpy )
585+
586+ def _get_exe_from_cache (self , program ):
587+ """
588+ Return cached _StandaloneExecutor instance. If not found, create associated
589+ _StandaloneExecutor instance with given program and cache it.
590+ """
591+ assert isinstance (
592+ program , Program ), "Required type(Program), but received {}" .format (
593+ type (program ).__name__ )
594+ if program not in self ._cached_executors :
595+ new_exe = _StandaloneExecutor (self ._place , program )
596+ self ._cached_executors [program ] = new_exe
597+
598+ return self ._cached_executors [program ]
599+
600+
475601class Executor (object ):
476602 """
477603 :api_attr: Static Graph
@@ -568,6 +694,10 @@ def __init__(self, place=None):
568694 self ._auto_checkpoint_name = unique_name .generate (
569695 "__auto_checkpoint_executor__" )
570696
697+ # NOTE: Whether to use experimental executor `StandaloneExecutor`.
698+ self ._enable_interpreter_core = _is_enable_standalone_executor ()
699+ self ._executor_cache = _ExecutorCache (self .place )
700+
571701 def _get_scope_cache (self , program_cache_key ):
572702 return self .scope_caches .get (program_cache_key , None )
573703
@@ -1155,6 +1285,12 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name,
11551285 if scope is None :
11561286 scope = global_scope ()
11571287
1288+ # NOTE: This is an experimental feature. If `export FLAGS_USE_STANDALONE_EXECUTOR=1 `,
1289+ # use StandaloneExecutor to run the program.
1290+ if self ._enable_interpreter_core and not program ._is_start_up_program_ :
1291+ return self ._executor_cache .run (program , feed , fetch_list ,
1292+ return_numpy )
1293+
11581294 # use_prune can be overrided by putting optimize_ops in fetch_list
11591295 _origin_fetch_list = fetch_list
11601296 _origin_program = program
0 commit comments