Merged
Changes from 3 commits
32 changes: 26 additions & 6 deletions paddle/parameter/FirstOrderOptimizer.cpp
@@ -161,6 +161,7 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config,
size_t sparseId) const {
CHECK(sparseId == -1LU) << "Sparse update is not supported";

BaseMatrix& value = *vecs[PARAMETER_VALUE];
BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
@@ -265,6 +266,7 @@ void AdamParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config,
size_t sparseId) const {
CHECK(sparseId == -1UL) << "Sparse update is not supported";

real beta1_power = std::pow(beta1_, step_);
real beta2_power = std::pow(beta2_, step_);
real learningRate = config.learning_rate() * learningRate_;
@@ -303,18 +305,36 @@ void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
const ParameterConfig& config,
size_t sparseId) const {
real globalThreshold = optConfig_.gradient_clipping_threshold();
real localThreshold = config.gradient_clipping_threshold();

real threshold;
std::string field;
// Get the minimum of local and global threshold
// as the real threshold for clipping
if (globalThreshold > 0.0f && localThreshold > 0.0f) {
threshold =
globalThreshold < localThreshold ? globalThreshold : localThreshold;
field = globalThreshold < localThreshold ? "global" : "local";
} else if (globalThreshold > 0.0f) {
threshold = globalThreshold;
field = "global";
} else {
threshold = localThreshold;
field = "local";
}

real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax();
if (maxAbsGrad > config.gradient_clipping_threshold()) {
if (maxAbsGrad > threshold) {
if (FLAGS_log_clipping) {
real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() /
vecs[PARAMETER_GRADIENT]->getSize();
LOG(INFO) << "parameter=" << config.name() << " need clipping,"
<< " max grad=" << maxAbsGrad << " avg grad=" << avgAbsGrad;
LOG(INFO) << "parameter=" << config.name() << " need clipping by "
<< field << " threshold=" << threshold
<< ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad;
Contributor

The global and local handling here can be merged into one; there is no need to split it into two separate branches.

Contributor Author

This design is mainly so that the global and local thresholds can be set separately, giving users a flexible choice, and the logic stays clear. Most importantly, it avoids changing the interface of the update function, which would otherwise require modifications in many more places.

Contributor

But then this function does the computation twice, once for the global threshold and once for the local one.

Contributor Author

It indeed does, in the case where the local threshold is smaller than the global threshold.

}
vecs[PARAMETER_GRADIENT]->clip(-config.gradient_clipping_threshold(),
config.gradient_clipping_threshold());
vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold);
}

optimizer_->update(vecs, config, sparseId);
}

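A minimal Python sketch of the threshold-selection rule discussed in the review thread above; the function name is illustrative and not part of the patch. When both the global and the per-parameter thresholds are positive, the smaller one wins; otherwise whichever one is positive is used.

def effective_clipping_threshold(global_threshold, local_threshold):
    # Mirrors the selection logic in OptimizerWithGradientClipping::update above.
    if global_threshold > 0.0 and local_threshold > 0.0:
        return min(global_threshold, local_threshold)
    return global_threshold if global_threshold > 0.0 else local_threshold
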
3 changes: 2 additions & 1 deletion paddle/parameter/OptimizerWithRegularizer.cpp
@@ -131,7 +131,8 @@ ParameterOptimizer* OptimizerWithRegularizer::create(
bool inPserver) {
ParameterOptimizer* optimizer =
ParameterOptimizer::create(optConfig, inPserver);
if (paraConfig.gradient_clipping_threshold() > 0.0f &&
if ((optConfig.gradient_clipping_threshold() > 0.0f ||
paraConfig.gradient_clipping_threshold() > 0.0f) &&
!dynamic_cast<AddOptimizer*>(optimizer)) {
optimizer = new OptimizerWithGradientClipping(optConfig, optimizer);
}
2 changes: 2 additions & 0 deletions paddle/parameter/ParameterOptimizer.h
@@ -167,6 +167,7 @@ class ParameterOptimizer {
}
parameterTypes_.push_back(type);
}

real getLearningRate() const { return learningRate_; }

virtual void setNoDecay() { applyDecay_ = false; }
@@ -201,6 +202,7 @@ class ParameterOptimizer {
* so, if lr change in StartBatch, please assign to learningRate_
*/
real learningRate_;

std::unique_ptr<LearningRateScheduler> learningRateScheduler_;
int64_t pass_; // current training pass (starting from 0)
bool firstTime_;
3 changes: 3 additions & 0 deletions proto/TrainerConfig.proto
@@ -128,6 +128,9 @@ message OptimizationConfig {
// when async_lagged_grad_discard_ratio * num_gradient_servers commit passed,
// current async gradient will be discard silently.
optional double async_lagged_grad_discard_ratio = 37 [default = 1.5];

// global threshold for gradient clipping
optional double gradient_clipping_threshold = 38 [default = 0.0];
};

message TrainerConfig {
1 change: 1 addition & 0 deletions python/paddle/trainer/config_parser.py
@@ -3377,6 +3377,7 @@ def Import(config_file, local_args={}):
algorithm='async_sgd',
async_lagged_grad_discard_ratio=1.5,
learning_method='momentum',
gradient_clipping_threshold=None,
num_batches_per_send_parameter=None,
num_batches_per_get_parameter=None,
center_parameter_update_method=None,
3 changes: 2 additions & 1 deletion python/paddle/trainer_config_helpers/optimizers.py
@@ -408,7 +408,8 @@ def settings(batch_size,

args = [
'batch_size', 'learning_rate', 'learning_rate_decay_a',
'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args'
'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args',
'gradient_clipping_threshold'
]
kwargs = dict()
kwargs['algorithm'] = algorithm
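A minimal usage sketch of the new global threshold from a trainer config, assuming the usual trainer_config_helpers setup; the optimizer choice and numeric values are illustrative only.

from paddle.trainer_config_helpers import *

settings(
    batch_size=128,
    learning_rate=1e-3,
    learning_method=MomentumOptimizer(),
    # Global clipping threshold added by this patch; if a parameter also sets a
    # positive per-parameter threshold, the smaller of the two takes effect.
    gradient_clipping_threshold=25.0)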