From e516bdadde77983b75eb668423dd0ed75090dfc2 Mon Sep 17 00:00:00 2001
From: danleifeng
Date: Tue, 29 Sep 2020 07:47:21 +0000
Subject: [PATCH 1/2] fleet support non_distributed training in dygraph mode; test=develop

---
# Review notes (text between "---" and the diffstat is not part of the commit):
# - fleet_base.py: inside the in_dygraph_mode() branch of init(), return as
#   soon as worker_num() == 1, i.e. before the
#   parallel_helper._is_parallel_ctx_initialized() check is reached.
# - test_fleet_base.py: adds LinearNet (Linear(10, 10) -> Linear(10, 1)) and
#   TestFleetDygraphSingle. Its setUp sets PADDLE_TRAINERS_NUM=1 with a single
#   endpoint; the test runs two optimizer steps through
#   fleet.distributed_optimizer() / fleet.distributed_model().
# - NOTE(review): the new test code uses `nn`, but `import paddle.nn as nn` is
#   only added by PATCH 2/2 -- presumably this patch does not run on its own;
#   confirm before applying it in isolation.
# - NOTE(review): setUp writes os.environ and the test calls
#   paddle.disable_static(); no teardown is visible in this hunk -- check
#   whether later tests in the file depend on, or are disturbed by, that state.
# - NOTE(review): line breaks and indentation of this series were reconstructed
#   from a whitespace-collapsed copy; run `git apply --check` before use.
 .../distributed/fleet/base/fleet_base.py      |  2 +
 .../fluid/tests/unittests/test_fleet_base.py  | 40 +++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index 3fdd6e92483031..7eb3a5659654ab 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -187,6 +187,8 @@ def init(self, role_maker=None, is_collective=False):
 
         self.strategy_compiler = StrategyCompiler()
         if paddle.fluid.framework.in_dygraph_mode():
+            if self.worker_num() == 1:
+                return
             if parallel_helper._is_parallel_ctx_initialized():
                 warnings.warn(
                     "The dygraph parallel environment has been initialized.")
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py
index 4945c158025b7e..b54dd09cd713fa 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
@@ -170,6 +170,46 @@ def test_dygraph_method(self):
         final_strategy = fleet._final_strategy()
 
 
+class LinearNet(nn.Layer):
+    def __init__(self):
+        super(LinearNet, self).__init__()
+        self._linear1 = nn.Linear(10, 10)
+        self._linear2 = nn.Linear(10, 1)
+
+    def forward(self, x):
+        return self._linear2(self._linear1(x))
+
+
+class TestFleetDygraphSingle(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
+        os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+
+    def test_dygraph_single(self):
+        paddle.disable_static()
+        fleet.init(is_collective=True)
+
+        layer = LinearNet()
+        loss_fn = nn.MSELoss()
+        adam = paddle.optimizer.Adam(
+            learning_rate=0.001, parameters=layer.parameters())
+
+        adam = fleet.distributed_optimizer(adam)
+        dp_layer = fleet.distributed_model(layer)
+        for step in range(2):
+            inputs = paddle.randn([10, 10], 'float32')
+            outputs = dp_layer(inputs)
+            labels = paddle.randn([10, 1], 'float32')
+            loss = loss_fn(outputs, labels)
+            loss = dp_layer.scale_loss(loss)
+            loss.backward()
+            dp_layer.apply_collective_grads()
+            adam.step()
+            adam.clear_grad()
+
+
 class TestFleetBaseSingleRunCollective(unittest.TestCase):
     def setUp(self):
         os.environ.pop("PADDLE_TRAINER_ENDPOINTS")

From 4a9a2d5f853398cb561daed80d4ddc14ad8e35c6 Mon Sep 17 00:00:00 2001
From: danleifeng
Date: Tue, 29 Sep 2020 08:28:17 +0000
Subject: [PATCH 2/2] fleet support non_distributed training in dygraph mode; test=develop

---
# Review notes (text between "---" and the diffstat is not part of the commit):
# - Adds `import paddle.nn as nn` to test_fleet_base.py, next to the existing
#   paddle.fluid / numpy imports.
# - test_dygraph_single: removes the dp_layer.scale_loss(loss) and
#   dp_layer.apply_collective_grads() calls from the training loop, leaving
#   loss.backward() / adam.step() / adam.clear_grad().
#   NOTE(review): presumably the object returned by fleet.distributed_model()
#   does not provide these methods when only one worker is configured --
#   confirm against fleet_base.py.
 python/paddle/fluid/tests/unittests/test_fleet_base.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py
index b54dd09cd713fa..3d4b2e218f725a 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
@@ -18,6 +18,7 @@
 import paddle.distributed.fleet.base.role_maker as role_maker
 import os
 import paddle.fluid as fluid
+import paddle.nn as nn
 import numpy as np
 
 
@@ -203,9 +204,7 @@ def test_dygraph_single(self):
             outputs = dp_layer(inputs)
             labels = paddle.randn([10, 1], 'float32')
             loss = loss_fn(outputs, labels)
-            loss = dp_layer.scale_loss(loss)
             loss.backward()
-            dp_layer.apply_collective_grads()
             adam.step()
             adam.clear_grad()
 