diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc
index c01df4d6e236c7..6e1ec454b6bab3 100644
--- a/paddle/fluid/ir_adaptor/translator/op_translator.cc
+++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc
@@ -3113,6 +3113,25 @@ struct CEmbeddingOpTranscriber : public OpTranscriber {
   }
 };

+struct QuantizeLinearOpTranscriber : public OpTranscriber {
+  void HandleNonexistentAttribute(pir::IrContext* ctx,
+                                  pir::AttributeMap* attribute_map,
+                                  const OpAttributeInfo& info) override {
+    if (info.name == "round_type") {
+      (*attribute_map)[info.name] = pir::Int32Attribute::get(ctx, 0);
+    }
+    if (info.name == "is_test") {
+      (*attribute_map)[info.name] = pir::BoolAttribute::get(ctx, true);
+    }
+    if (info.name == "only_observer") {
+      (*attribute_map)[info.name] = pir::BoolAttribute::get(ctx, false);
+    }
+    if (info.name == "moving_rate") {
+      (*attribute_map)[info.name] = pir::FloatAttribute::get(ctx, 0.9);
+    }
+  }
+};
+
 OpTranslator::OpTranslator() {
   pir::IrContext* ctx = pir::IrContext::Instance();
   ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
@@ -3185,6 +3204,8 @@ OpTranslator::OpTranslator() {
   special_handlers["elementwise_mod_grad"] = ElementwiseGradTranscriber();
   special_handlers["elementwise_floordiv_grad"] = ElementwiseGradTranscriber();
   special_handlers["c_embedding"] = CEmbeddingOpTranscriber();
+  special_handlers["quantize_linear"] = QuantizeLinearOpTranscriber();
+  special_handlers["dequantize_linear"] = QuantizeLinearOpTranscriber();
 }

 }  // namespace translator
diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
index b141f1ecfa8792..54b56a2e3c8875 100644
--- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -108,6 +108,12 @@
     'lrn',
     'multi_gru',
     'matmul_with_flatten',
+    'moving_average_abs_max_scale',
+    'moving_average_abs_max_scale_',
+    'quantize_linear',
+    'quantize_linear_',
+    'dequantize_linear',
+    'dequantize_linear_',
 ]

 NO_NEED_GEN_STATIC_ONLY_APIS = [
diff --git a/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.cc b/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.cc
index 44183751b8ca1d..54692375248809 100644
--- a/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.cc
@@ -28,6 +28,10 @@ KernelKeyTuple UniqueOpParseKernelKey(pir::Operation* op) {
   return {dtype, backend};
 }

+KernelKeyTuple SaveCombineOpParseKernelKey(pir::Operation* op) {
+  return {phi::DataType::FLOAT32, phi::Backend::UNDEFINED};
+}
+
 }  // namespace paddle::dialect

 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ParseKernelKeyInterface)
diff --git a/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h b/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h
index 2d101dbd310d5e..7913893fdb7d7a 100644
--- a/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h
+++ b/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h
@@ -57,6 +57,8 @@ class ParseKernelKeyInterface

 // Register the ParseKernelKeyInterface for unique op.
 KernelKeyTuple UniqueOpParseKernelKey(pir::Operation *op);

+KernelKeyTuple SaveCombineOpParseKernelKey(pir::Operation *op);
+
 }  // namespace dialect
 }  // namespace paddle
diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
index f1e20326d59dee..594130926d5695 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
@@ -366,6 +366,19 @@
     data_type : x
   backward : depthwise_conv2d_transpose_grad

+- op : dequantize_linear
+  args : (Tensor x, Tensor scale, Tensor zero_point, Tensor in_accum, Tensor in_state, int quant_axis = 0, int bit_length = 8, int round_type = 0, bool is_test = true, bool only_observer = false, float moving_rate=0.9f)
+  output : Tensor(y), Tensor(out_scale), Tensor(out_accum), Tensor(out_state)
+  infer_meta :
+    func : QuantizeLinearInferMeta
+    param : [x, scale, in_accum, in_state, quant_axis]
+  kernel :
+    func : quantize_linear
+    param : [x, scale, zero_point, in_accum, in_state, quant_axis, bit_length, round_type, is_test, only_observer, moving_rate]
+    data_type : x
+  optional : in_accum, in_state, out_scale, out_accum, out_state
+  inplace : (scale -> out_scale, in_accum -> out_accum, in_state -> out_state)
+
 - op : disable_check_model_nan_inf
   args: (Tensor x, int flag = 0)
   output: Tensor(out)
@@ -1083,6 +1096,19 @@
     data_type : out_grad_in
   inplace: (out_grad_in -> out_grad_out)

+- op : quantize_linear
+  args : (Tensor x, Tensor scale, Tensor zero_point, Tensor in_accum, Tensor in_state, int quant_axis = 0, int bit_length = 8, int round_type = 0, bool is_test = true, bool only_observer = false, float moving_rate=0.9f)
+  output : Tensor(y), Tensor(out_scale), Tensor(out_accum), Tensor(out_state)
+  infer_meta :
+    func : QuantizeLinearInferMeta
+    param : [x, scale, in_accum, in_state, quant_axis]
+  kernel :
+    func : quantize_linear
+    param : [x, scale, zero_point, in_accum, in_state, quant_axis, bit_length, round_type, is_test, only_observer, moving_rate]
+    data_type : x
+  optional : in_accum, in_state, out_scale, out_accum, out_state
+  inplace : (scale -> out_scale, in_accum -> out_accum, in_state -> out_state)
+
 - op : randint
   args : (int low, int high, IntArray shape, DataType dtype=DataType::INT64, Place place={})
   output : Tensor(out)
@@ -1215,6 +1241,7 @@
     func: save_combine_tensor
     param: [x, file_path, overwrite, save_as_fp16, save_to_memory]
   optional : out
+  interfaces : paddle::dialect::ParseKernelKeyInterface

 - op : seed
   args : (int seed, bool deterministic, str rng_name, bool force_cpu)
@@ -1635,6 +1662,18 @@
     func: match_matrix_tensor
   backward: match_matrix_tensor_grad

+- op: moving_average_abs_max_scale
+  args: (Tensor x, Tensor in_accum, Tensor in_state, float moving_rate=0.9f, bool is_test=false)
+  output: Tensor(out), Tensor(out_scale), Tensor(out_state), Tensor(out_accum)
+  infer_meta:
+    func: MovingAverageAbsMaxScaleInferMeta
+    param: [x, in_accum, in_state]
+  kernel:
+    func: moving_average_abs_max_scale
+    param: [x, in_accum, in_state, moving_rate, is_test]
+  optional : in_accum, in_state, out, out_state, out_accum
+  inplace : (in_accum -> out_accum), (in_state -> out_state)
+
 - op: nce
   args: (Tensor input, Tensor label, Tensor weight, Tensor bias, Tensor sample_weight, Tensor custom_dist_probs, Tensor custom_dist_alias, Tensor custom_dist_alias_probs, int num_total_classes, int[] custom_neg_classes={}, int num_neg_samples=10, int sampler=0, int seed=0, bool is_sparse=false, bool remote_prefetch=false, bool is_test=false)
   output: Tensor(cost), Tensor(sample_logits), Tensor(sample_labels)
diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc
index b4bad427567b70..296dec7d51940e 100644
--- a/paddle/fluid/pir/dialect/operator/utils/utils.cc
+++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc
@@ -72,6 +72,12 @@ const std::unordered_set<std::string> LegacyOpList = {
     NceGradOp::name(),
     LrnOp::name(),
     LrnGradOp::name(),
+    MovingAverageAbsMaxScaleOp::name(),
+    MovingAverageAbsMaxScale_Op::name(),
+    QuantizeLinearOp::name(),
+    QuantizeLinear_Op::name(),
+    DequantizeLinearOp::name(),
+    DequantizeLinear_Op::name(),
 #ifdef PADDLE_WITH_DNNL
     paddle::onednn::dialect::LrnOp::name(),
     paddle::onednn::dialect::LrnGradOp::name(),
diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 53e0cea953b879..1a3f86753fa7e9 100755
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -771,6 +771,10 @@
     {scale : Scale, shift : Shift}

 - op : dequantize_linear
+  inputs :
+    {x : X, scale : Scale, zero_point : ZeroPoint, in_accum : InAccum, in_state : InState}
+  outputs :
+    {y : Y, out_scale : OutScale, out_accum : OutAccum, out_state : OutState}
   extra :
     attrs : [float moving_rate = 0.9]

@@ -2197,6 +2201,12 @@
   outputs :
     {param_out : ParamOut, velocity_out : VelocityOut, master_param_out : MasterParamOut}

+- op : moving_average_abs_max_scale
+  inputs :
+    {x : X, in_accum : InAccum, in_state : InState}
+  outputs :
+    {out : Out, out_scale : OutScale, out_state : OutState, out_accum : OutAccum}
+
 - op : multi_dot
   backward : multi_dot_grad
   inputs :
@@ -2546,6 +2556,10 @@
     {scale : Scale, shift : Shift, include_self: Include_self}

 - op : quantize_linear
+  inputs :
+    {x : X, scale : Scale, zero_point : ZeroPoint, in_accum : InAccum, in_state : InState}
+  outputs :
+    {y : Y, out_scale : OutScale, out_accum : OutAccum, out_state : OutState}
   extra :
     attrs : [float moving_rate = 0.9]

diff --git a/paddle/phi/core/kernel_context.cc b/paddle/phi/core/kernel_context.cc
index 07b05ccd2c760c..4a6888fad7d86a 100644
--- a/paddle/phi/core/kernel_context.cc
+++ b/paddle/phi/core/kernel_context.cc
@@ -119,7 +119,7 @@ const AttrType& KernelContext::AttrAt(size_t idx) const {
     return paddle::get<AttrType>(attrs_.at(idx));
   } catch (paddle::bad_variant_access const& ex) {
     PADDLE_THROW(phi::errors::InvalidArgument(
-        "Attribute cast error in Op Kernel Context."));
+        "Attribute %d cast error in Op Kernel Context.", idx));
   }
 }

diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index b7a5dd51de9015..69214508ef3f92 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -3497,6 +3497,32 @@ void PsroiPoolInferMeta(const MetaTensor& x,
   out->set_dtype(x.dtype());
 }

+void QuantizeLinearInferMeta(const MetaTensor& x,
+                             const MetaTensor& scale,
+                             const MetaTensor& in_accum,
+                             const MetaTensor& in_state,
+                             int quant_axis,
+                             MetaTensor* y,
+                             MetaTensor* out_scale,
+                             MetaTensor* out_accum,
+                             MetaTensor* out_state) {
+  y->set_dims(x.dims());
+  y->share_lod(x);
+  if (out_scale) {
+    if (quant_axis < 0) {
+      out_scale->set_dims(scale.dims());
+    } else {
+      out_scale->set_dims({x.dims()[quant_axis]});
+    }
+  }
+  if (out_accum) {
+    out_accum->set_dims(in_accum.dims());
+  }
+  if (out_state) {
+    out_state->set_dims(in_state.dims());
+  }
+}
+
 void RmsNormInferMeta(const MetaTensor& x,
                       const MetaTensor& bias,
                       const MetaTensor& residual,
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index 0774189dd8d4fc..3d9b2539267e7e 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -636,6 +636,16 @@ void PsroiPoolInferMeta(const MetaTensor& x,
                         float spatial_scale,
                         MetaTensor* out);

+void QuantizeLinearInferMeta(const MetaTensor& x,
+                             const MetaTensor& scale,
+                             const MetaTensor& in_accum,
+                             const MetaTensor& in_state,
+                             int quant_axis,
+                             MetaTensor* y,
+                             MetaTensor* out_scale,
+                             MetaTensor* out_accum,
+                             MetaTensor* out_state);
+
 void RmsNormInferMeta(const MetaTensor& x,
                       const MetaTensor& bias,
                       const MetaTensor& residual,
diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc
index b728c33abf2e2d..edd03e6b07513f 100644
--- a/paddle/phi/infermeta/ternary.cc
+++ b/paddle/phi/infermeta/ternary.cc
@@ -896,6 +896,26 @@ void MultiClassNMSInferMeta(const MetaTensor& bboxes,
   nms_rois_num->set_dtype(DataType::INT32);
 }

+void MovingAverageAbsMaxScaleInferMeta(const MetaTensor& x,
+                                       const MetaTensor& in_accum,
+                                       const MetaTensor& in_state,
+                                       MetaTensor* out,
+                                       MetaTensor* out_scale,
+                                       MetaTensor* out_state,
+                                       MetaTensor* out_accum) {
+  if (out) {
+    out->set_dims(x.dims());
+    out->share_lod(x);
+    out_scale->set_dims({1});
+  }
+  if (out_state) {
+    out_state->set_dims(in_state.dims());
+  }
+  if (out_accum) {
+    out_accum->set_dims(in_accum.dims());
+  }
+}
+
 void NllLossRawInferMeta(const MetaTensor& input,
                          const MetaTensor& label,
                          const MetaTensor& weight,
diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h
index 7ffdc3d272069f..d12378fe3a92c1 100644
--- a/paddle/phi/infermeta/ternary.h
+++ b/paddle/phi/infermeta/ternary.h
@@ -140,6 +140,14 @@ void MatchMatrixTensorInferMeta(const MetaTensor& x,
                                 MetaTensor* tmp,
                                 MetaConfig config = MetaConfig());

+void MovingAverageAbsMaxScaleInferMeta(const MetaTensor& x,
+                                       const MetaTensor& in_accum,
+                                       const MetaTensor& in_state,
+                                       MetaTensor* out,
+                                       MetaTensor* out_scale,
+                                       MetaTensor* out_state,
+                                       MetaTensor* out_accum);
+
 void MultiClassNMSInferMeta(const MetaTensor& bboxes,
                             const MetaTensor& scores,
                             const MetaTensor& rois_num,
diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py
index 87497184392e69..401737bb13ac68 100644
--- a/python/paddle/distributed/auto_parallel/static/engine.py
+++ b/python/paddle/distributed/auto_parallel/static/engine.py
@@ -257,6 +257,11 @@ def __init__(
             paddle.framework.set_flags({'FLAGS_new_executor_sequential_run': 1})
             paddle.framework.set_flags({'FLAGS_new_executor_static_build': 1})

+        if auto_utils.use_new_executor():
+            is_pir_mode = os.environ.get("FLAGS_enable_pir_in_executor", None)
+            if is_pir_mode is None:
+                paddle.framework.set_flags({'FLAGS_enable_pir_in_executor': 1})
+
         self.enable_job_schedule_profiler = False

         # get dist input spec from shard dataloader
diff --git a/test/auto_parallel/engine_api.py b/test/auto_parallel/engine_api.py
index cc921d41a74a9d..7edcb9a9823cdf 100644
--- a/test/auto_parallel/engine_api.py
+++ b/test/auto_parallel/engine_api.py
@@ -234,155 +234,6 @@ def train_low_level():
     engine.load(model_filename)
     temp_dir.cleanup()

-    # Build dataloader from generator
-    # train
-    train_dataset = MyDataset(batch_num * batch_size)
-    train_dataloader = engine.dataloader_from_generator(
-        train_dataset, batch_size=batch_size, mode="train"
-    )
-    engine.prepare(mode="train")
-    for data in train_dataloader:
-        outs = engine.run(data, feed=feed_dict, mode="train")
-
-    # eval
-    engine.to_mode("eval")
-    eval_dataset2 = MyDataset(batch_size)
-    eval_dataloader = engine.dataloader_from_generator(
-        eval_dataset2, batch_size=batch_size
-    )
-    engine.prepare()
-    for data in eval_dataloader:
-        outs = engine.run(data, feed=feed_dict)
-
-    # predict
-    test_dataset = MyDataset(batch_size)
-    predict_dataloader = engine.dataloader_from_generator(
-        test_dataset, batch_size=batch_size, mode="predict"
-    )
-    engine.prepare(mode="predict")
-    for data in predict_dataloader:
-        outs = engine.run(data, feed=feed_dict, mode="predict")
-
-    # save
-    temp_dir = tempfile.TemporaryDirectory()
-    model_filename = os.path.join(temp_dir.name, 'mlp')
-    engine.save(model_filename, training=True)
-    engine.load(model_filename)
-    temp_dir.cleanup()
-
-
-def train_builtin_data_vars():
-    paddle.distributed.auto_parallel.static.dist_context.set_default_distributed_context(
-        None
-    )
-    mlp = MLPLayer(
-        hidden_size=hidden_size,
-        intermediate_size=4 * hidden_size,
-        dropout_ratio=0.1,
-        initializer_range=0.02,
-    )
-    loss = paddle.nn.CrossEntropyLoss()
-    optimizer = paddle.optimizer.Adam(
-        learning_rate=0.00001,
-        beta1=0.9,
-        beta2=0.999,
-        epsilon=1e-08,
-        grad_clip=None,
-    )
-    metric = paddle.metric.Accuracy()
-
-    strategy = auto.Strategy()
-    strategy.auto_mode = "semi"
-
-    engine = auto.Engine(mlp, loss, optimizer, metric, strategy=strategy)
-
-    # train
-    engine.to_mode("train")
-
-    input_spec = static.InputSpec([batch_size, image_size], 'float32', 'input')
-    label_spec = static.InputSpec([batch_size, 1], 'int64', 'label')
-    engine.prepare(inputs_spec=[input_spec], labels_spec=[label_spec])
-
-    with static.program_guard(engine.main_program, engine.startup_program):
-        feed_list = engine.inputs + engine.labels
-        print(feed_list)
-        loader = paddle.base.io.DataLoader.from_generator(
-            feed_list=feed_list, capacity=4 * batch_size, iterable=False
-        )
-
-        places = static.cuda_places()
-        loader.set_batch_generator(batch_generator_creator(), places=places)
-
-    for _ in range(epoch_num):
-        loader.start()  # call DataLoader.start() before each epoch starts
-        try:
-            while True:
-                engine.run()
-        except paddle.base.core.EOFException:
-            loader.reset()  # call DataLoader.reset() after catching EOFException
-
-
-def train_non_builtin_data_vars():
-    paddle.distributed.auto_parallel.static.dist_context.set_default_distributed_context(
-        None
-    )
-    main_program = static.Program()
-    startup_program = static.Program()
-    with static.program_guard(
-        main_program, startup_program
-    ), utils.unique_name.guard():
-        input = static.data(
-            name="input", shape=[batch_size, image_size], dtype='float32'
-        )
-        label = static.data(name="label", shape=[batch_size, 1], dtype='int64')
-
-        loader = paddle.base.io.DataLoader.from_generator(
-            feed_list=[input, label], capacity=4 * batch_size, iterable=False
-        )
-        places = static.cuda_places()
-        loader.set_batch_generator(batch_generator_creator(), places=places)
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02,
-        )
-        loss = paddle.nn.CrossEntropyLoss()
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=0.00001,
-            beta1=0.9,
-            beta2=0.999,
-            epsilon=1e-08,
-            grad_clip=None,
-        )
-        metric = paddle.metric.Accuracy()
-        predict = mlp(input)
-        loss_var = loss(predict, label)
-
-    strategy = auto.Strategy()
-    strategy.auto_mode = "semi"
-
-    engine = auto.Engine(
-        loss=loss_var, optimizer=optimizer, metrics=metric, strategy=strategy
-    )
-
-    # train
-    engine.to_mode("train")
-    engine.prepare(
-        inputs=[input],
-        labels=[label],
-        main_program=main_program,
-        startup_program=startup_program,
-    )
-    for _ in range(epoch_num):
-        loader.start()  # call DataLoader.start() before each epoch starts
-        try:
-            while True:
-                engine.run()
-        except paddle.base.core.EOFException:
-            loader.reset()  # call DataLoader.reset() after catching EOFException
-

 def get_cost():
     paddle.distributed.auto_parallel.static.dist_context.set_default_distributed_context(
@@ -522,8 +373,6 @@ def get_cost_by_spec():
     train_high_level(fetch=True)
    train_high_level(fetch=False)
     train_low_level()
-    train_builtin_data_vars()
-    train_non_builtin_data_vars()
     get_cost()
     get_cost_by_default_program()
     get_cost_by_spec()
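Not part of the patch itself, but a minimal smoke-test sketch of the path this change enables: it builds a legacy (ProgramDesc) program containing a quantize_linear op and translates it to PIR, which is what the new QuantizeLinearOpTranscriber and the ops.yaml/op_compat.yaml entries above are meant to cover. The paddle.pir.translate_to_pir entry point, the attribute set accepted by append_op, and the shapes used here are assumptions, not something this diff verifies.

# Hypothetical smoke test (not part of this patch): build a legacy ProgramDesc
# program containing a quantize_linear op, then translate it to PIR so the new
# transcriber and op definitions are exercised.
import paddle

paddle.enable_static()

main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(main_program, startup_program):
    block = main_program.global_block()
    x = paddle.static.data(name='x', shape=[4, 16], dtype='float32')
    scale = paddle.static.data(name='scale', shape=[1], dtype='float32')
    zero_point = paddle.static.data(name='zero_point', shape=[1], dtype='float32')
    y = block.create_var(name='y', dtype='float32')
    # Legacy op with its ProgramDesc names (X/Scale/ZeroPoint/Y, see the
    # op_compat.yaml mapping above). round_type/is_test/only_observer/
    # moving_rate are omitted here; if the serialized op lacks them, the
    # transcriber's HandleNonexistentAttribute fallbacks fill them in.
    block.append_op(
        type='quantize_linear',
        inputs={'X': x, 'Scale': scale, 'ZeroPoint': zero_point},
        outputs={'Y': y},
        attrs={'quant_axis': -1, 'bit_length': 8},
    )

# Assumed entry point for ProgramDesc -> PIR translation; the name may differ
# across Paddle versions (e.g. translate_to_new_ir in older releases).
pir_program = paddle.pir.translate_to_pir(main_program.desc)
print(pir_program)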