@@ -390,6 +390,7 @@ namespace wrap {
390390 __macro(miopenCreateOpBatchNormBackward) \
391391 __macro(miopenCompileFusionPlan) \
392392 __macro(miopenFusionPlanGetOp) \
393+ __macro(miopenFusionPlanGetWorkSpaceSize) \
393394 __macro(miopenCreateOperatorArgs) \
394395 __macro(miopenSetOpArgsConvForward) \
395396 __macro(miopenSetOpArgsBiasForward) \
@@ -555,6 +556,10 @@ namespace wrap {
555556STREAM_EXECUTOR_MIOPEN_WRAP (miopenSetTensorDescriptorV2)
556557#endif
557558
559+ #if (TF_ROCM_VERSION >= 70000)
560+ STREAM_EXECUTOR_MIOPEN_WRAP (miopenExecuteFusionPlan_v2)
561+ #endif
562+
558563MIOPEN_DNN_ROUTINE_EACH (STREAM_EXECUTOR_MIOPEN_WRAP)
559564
560565#undef MIOPEN_DNN_ROUTINE_EACH
@@ -1294,21 +1299,6 @@ class ScopedFusionPlanBase {
12941299 }
12951300 }
12961301
1297- miopenStatus_t Execute (miopenTensorDescriptor_t input_descriptor,
1298- const void * input_data,
1299- miopenTensorDescriptor_t output_descriptor,
1300- void * output_data) {
1301- auto status = wrap::miopenExecuteFusionPlan (
1302- miopen_handle_, fusion_plan_, input_descriptor, input_data,
1303- output_descriptor, output_data, fusion_args_);
1304- if (status != miopenStatusSuccess) {
1305- LOG (FATAL) << " call to miopenExecuteFusionPlan failed: "
1306- << ToString (status);
1307- }
1308-
1309- return status;
1310- }
1311-
13121302 bool CompilationSucceeded () { return fusion_plan_compiled_; }
13131303
13141304 miopenStatus_t SetConvolutionArgs (const int op_idx, const float * alpha,
@@ -4857,7 +4847,7 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
48574847 return MakeAlgorithmDesc ().ToString ();
48584848 }
48594849
4860- uint64_t GetWorkspaceSize () const override { return 0 ; }
4850+ uint64_t GetWorkspaceSize () const override { return workspace_size_ ; }
48614851
48624852 absl::StatusOr<dnn::AlgorithmDesc> ToAlgorithmDesc () const override {
48634853 return MakeAlgorithmDesc ();
@@ -4888,10 +4878,18 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
48884878 }
48894879
48904880 miopenStatus_t status;
4881+ #if (TF_ROCM_VERSION >= 70000)
4882+ status = wrap::miopenExecuteFusionPlan_v2 (
4883+ miopen.handle (), fusion_plan_.fusion_plan_ , input_nd_.handle (),
4884+ input_data.opaque (), output_nd_.handle (), output_data.opaque (),
4885+ fusion_plan_.fusion_args_ , scratch_memory.opaque (),
4886+ scratch_memory.size ());
4887+ #else
48914888 status = wrap::miopenExecuteFusionPlan (
48924889 miopen.handle (), fusion_plan_.fusion_plan_ , input_nd_.handle (),
48934890 input_data.opaque (), output_nd_.handle (), output_data.opaque (),
48944891 fusion_plan_.fusion_args_ );
4892+ #endif
48954893
48964894 if (status != miopenStatusSuccess) {
48974895 LOG (ERROR) << " Failed to enqueue fused convolution on stream: "
@@ -4911,12 +4909,12 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
49114909
49124910 public:
49134911 static absl::StatusOr<std::unique_ptr<const dnn::FusedConvRunner>> Create (
4914- StreamExecutor* parent, Stream* stream, MIOpenAccess* miopen ,
4912+ StreamExecutor* parent, Stream* stream, MIOpenAccess* miopen_ ,
49154913 const dnn::AlgorithmDesc& algo, dnn::DataType input_type,
49164914 dnn::DataType bias_type, double leakyrelu_alpha, BatchDescriptor input_nd,
49174915 BatchDescriptor output_nd, FilterDescriptor filter,
49184916 BatchDescriptor bias_nd, ConvolutionDescriptor conv,
4919- dnn::ActivationMode activation) {
4917+ dnn::ActivationMode activation, std::optional< uint64_t > workspace_size ) {
49204918 TF_ASSIGN_OR_RETURN (
49214919 auto input_nd_,
49224920 scope (input_nd, ToMIOpenDataType (input_type, input_nd.layout ())));
@@ -4933,20 +4931,35 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
49334931 auto activation_desc,
49344932 ScopedActivationDescriptor::Create (activation, leakyrelu_alpha));
49354933
4934+ auto miopen = miopen_->GetHandle (parent, stream);
4935+
49364936 TF_ASSIGN_OR_RETURN (
49374937 auto fusion_plan,
49384938 ScopedFusionPlanConvolutionBiasActivation::Create (
4939- miopen-> GetHandle (parent, stream) .handle (), input_nd_.handle (),
4939+ miopen.handle (), input_nd_.handle (),
49404940 filter_.handle (), conv_.handle (), bias_nd_.handle (),
49414941 activation_desc));
49424942
49434943 if (!fusion_plan.CompilationSucceeded ()) {
49444944 return absl::InternalError (" No algorithms found" );
49454945 }
49464946
4947- auto obj = new RocmFusedConvRunner (parent, stream, miopen, input_nd_,
4948- output_nd_, filter_, bias_nd_, conv_,
4949- activation_desc, fusion_plan);
4947+ uint64_t workspace_size_ = workspace_size.value_or (0 );
4948+
4949+ if (!workspace_size.has_value ()) {
4950+ auto status = wrap::miopenFusionPlanGetWorkSpaceSize (
4951+ miopen.handle (), fusion_plan.fusion_plan_ , &workspace_size_,
4952+ miopenConvolutionFwdAlgoImplicitGEMM);
4953+ if (status != miopenStatusSuccess) {
4954+ return absl::InternalError (
4955+ " call to miopenFusionPlanGetWorkSpaceSize failed: " +
4956+ stream_executor::gpu::ToString (status));
4957+ }
4958+ }
4959+
4960+ auto obj = new RocmFusedConvRunner (
4961+ parent, stream, miopen_, input_nd_, output_nd_, filter_, bias_nd_,
4962+ conv_, activation_desc, fusion_plan, workspace_size_);
49504963
49514964 return std::unique_ptr<const dnn::FusedConvRunner>(obj);
49524965 }
@@ -4959,7 +4972,8 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
49594972 ScopedTensorDescriptor& bias_nd,
49604973 ScopedConvolutionDescriptor& conv,
49614974 ScopedActivationDescriptor& activation_desc,
4962- ScopedFusionPlanConvolutionBiasActivation& fusion_plan)
4975+ ScopedFusionPlanConvolutionBiasActivation& fusion_plan,
4976+ uint64_t workspace_size)
49634977 : parent_(parent),
49644978 miopen_ (miopen),
49654979 input_nd_(std::move(input_nd)),
@@ -4968,15 +4982,16 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
49684982 bias_nd_(std::move(bias_nd)),
49694983 conv_(std::move(conv)),
49704984 activation_desc_(std::move(activation_desc)),
4971- fusion_plan_(std::move(fusion_plan)) {}
4985+ fusion_plan_(std::move(fusion_plan)),
4986+ workspace_size_(workspace_size) {}
49724987
49734988 // Internal form of ToAlgorithmDesc without the StatusOr.
49744989 dnn::AlgorithmDesc MakeAlgorithmDesc () const {
49754990 dnn::AlgorithmProto algorithm;
49764991 algorithm.set_algo_id (0 );
49774992 algorithm.set_math_type (dnn::AlgorithmProto::TENSOR_OP_MATH);
49784993 algorithm.set_is_cudnn_frontend (true );
4979- algorithm.mutable_workspace_size ()->set_value (0 );
4994+ algorithm.mutable_workspace_size ()->set_value (workspace_size_ );
49804995 return dnn::AlgorithmDesc{algorithm};
49814996 }
49824997
@@ -4990,6 +5005,7 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
49905005 ScopedConvolutionDescriptor conv_;
49915006 mutable ScopedActivationDescriptor activation_desc_;
49925007 mutable ScopedFusionPlanConvolutionBiasActivation fusion_plan_;
5008+ uint64_t workspace_size_;
49935009};
49945010
49955011absl::StatusOr<std::unique_ptr<const dnn::FusedConvRunner>>
@@ -5017,7 +5033,8 @@ MIOpenSupport::FusedConvolveRunnerFromDesc(
50175033 return RocmFusedConvRunner::Create (
50185034 parent_, stream, miopen_.get (), algorithm_desc, input_type, bias_type,
50195035 leakyrelu_alpha, input_descriptor, output_descriptor, filter_descriptor,
5020- bias_descriptor, convolution_descriptor, activation_mode);
5036+ bias_descriptor, convolution_descriptor, activation_mode,
5037+ algorithm_desc.workspace_size ());
50215038}
50225039
50235040absl::Status MIOpenSupport::GetFusedConvolveRunners (
@@ -5039,7 +5056,12 @@ absl::Status MIOpenSupport::GetFusedConvolveRunners(
50395056 side_input_scale, leakyrelu_alpha, input_descriptor, filter_descriptor,
50405057 bias_descriptor, output_descriptor, convolution_descriptor,
50415058 activation_mode);
5059+ // No way to invoke fused convs with workspace prior to rocm 7
5060+ #if (TF_ROCM_VERSION >= 70000)
50425061 if (runner_or.ok ()) {
5062+ #else
5063+ if (runner_or.ok () && runner_or.value ()->GetWorkspaceSize () == 0 ) {
5064+ #endif
50435065 out_exec_plans->push_back (std::move (runner_or).value ());
50445066 }
50455067
0 commit comments