Skip to content

Commit 882fcff

Browse files
[ROCm] Fix ConvolutionHloTest.TestFusedConv3D on rocm7 (#376)
1 parent 72cb133 commit 882fcff

File tree

2 files changed

+48
-28
lines changed

2 files changed

+48
-28
lines changed

build_tools/rocm/run_xla.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,6 @@ bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
105105
--action_env=TF_ROCM_AMDGPU_TARGETS=${GPU_NAME} \
106106
--action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
107107
--run_under=//build_tools/ci:parallel_gpu_execute \
108-
--test_env=MIOPEN_FIND_ENFORCE=5 \
109-
--test_env=MIOPEN_FIND_MODE=1 \
110108
--test_filter=-$(IFS=: ; echo "${EXCLUDED_TESTS[*]}") \
111109
"${SANITIZER_ARGS[@]}" \
112110
"$@"

xla/stream_executor/rocm/rocm_dnn.cc

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,7 @@ namespace wrap {
390390
__macro(miopenCreateOpBatchNormBackward) \
391391
__macro(miopenCompileFusionPlan) \
392392
__macro(miopenFusionPlanGetOp) \
393+
__macro(miopenFusionPlanGetWorkSpaceSize) \
393394
__macro(miopenCreateOperatorArgs) \
394395
__macro(miopenSetOpArgsConvForward) \
395396
__macro(miopenSetOpArgsBiasForward) \
@@ -555,6 +556,10 @@ namespace wrap {
555556
STREAM_EXECUTOR_MIOPEN_WRAP(miopenSetTensorDescriptorV2)
556557
#endif
557558

559+
#if (TF_ROCM_VERSION >= 70000)
560+
STREAM_EXECUTOR_MIOPEN_WRAP(miopenExecuteFusionPlan_v2)
561+
#endif
562+
558563
MIOPEN_DNN_ROUTINE_EACH(STREAM_EXECUTOR_MIOPEN_WRAP)
559564

560565
#undef MIOPEN_DNN_ROUTINE_EACH
@@ -1294,21 +1299,6 @@ class ScopedFusionPlanBase {
12941299
}
12951300
}
12961301

1297-
miopenStatus_t Execute(miopenTensorDescriptor_t input_descriptor,
1298-
const void* input_data,
1299-
miopenTensorDescriptor_t output_descriptor,
1300-
void* output_data) {
1301-
auto status = wrap::miopenExecuteFusionPlan(
1302-
miopen_handle_, fusion_plan_, input_descriptor, input_data,
1303-
output_descriptor, output_data, fusion_args_);
1304-
if (status != miopenStatusSuccess) {
1305-
LOG(FATAL) << "call to miopenExecuteFusionPlan failed: "
1306-
<< ToString(status);
1307-
}
1308-
1309-
return status;
1310-
}
1311-
13121302
bool CompilationSucceeded() { return fusion_plan_compiled_; }
13131303

13141304
miopenStatus_t SetConvolutionArgs(const int op_idx, const float* alpha,
@@ -4857,7 +4847,7 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
48574847
return MakeAlgorithmDesc().ToString();
48584848
}
48594849

4860-
uint64_t GetWorkspaceSize() const override { return 0; }
4850+
uint64_t GetWorkspaceSize() const override { return workspace_size_; }
48614851

48624852
absl::StatusOr<dnn::AlgorithmDesc> ToAlgorithmDesc() const override {
48634853
return MakeAlgorithmDesc();
@@ -4888,10 +4878,18 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
48884878
}
48894879

48904880
miopenStatus_t status;
4881+
#if (TF_ROCM_VERSION >= 70000)
4882+
status = wrap::miopenExecuteFusionPlan_v2(
4883+
miopen.handle(), fusion_plan_.fusion_plan_, input_nd_.handle(),
4884+
input_data.opaque(), output_nd_.handle(), output_data.opaque(),
4885+
fusion_plan_.fusion_args_, scratch_memory.opaque(),
4886+
scratch_memory.size());
4887+
#else
48914888
status = wrap::miopenExecuteFusionPlan(
48924889
miopen.handle(), fusion_plan_.fusion_plan_, input_nd_.handle(),
48934890
input_data.opaque(), output_nd_.handle(), output_data.opaque(),
48944891
fusion_plan_.fusion_args_);
4892+
#endif
48954893

48964894
if (status != miopenStatusSuccess) {
48974895
LOG(ERROR) << "Failed to enqueue fused convolution on stream: "
@@ -4911,12 +4909,12 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
49114909

49124910
public:
49134911
static absl::StatusOr<std::unique_ptr<const dnn::FusedConvRunner>> Create(
4914-
StreamExecutor* parent, Stream* stream, MIOpenAccess* miopen,
4912+
StreamExecutor* parent, Stream* stream, MIOpenAccess* miopen_,
49154913
const dnn::AlgorithmDesc& algo, dnn::DataType input_type,
49164914
dnn::DataType bias_type, double leakyrelu_alpha, BatchDescriptor input_nd,
49174915
BatchDescriptor output_nd, FilterDescriptor filter,
49184916
BatchDescriptor bias_nd, ConvolutionDescriptor conv,
4919-
dnn::ActivationMode activation) {
4917+
dnn::ActivationMode activation, std::optional<uint64_t> workspace_size) {
49204918
TF_ASSIGN_OR_RETURN(
49214919
auto input_nd_,
49224920
scope(input_nd, ToMIOpenDataType(input_type, input_nd.layout())));
@@ -4933,20 +4931,35 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
49334931
auto activation_desc,
49344932
ScopedActivationDescriptor::Create(activation, leakyrelu_alpha));
49354933

4934+
auto miopen = miopen_->GetHandle(parent, stream);
4935+
49364936
TF_ASSIGN_OR_RETURN(
49374937
auto fusion_plan,
49384938
ScopedFusionPlanConvolutionBiasActivation::Create(
4939-
miopen->GetHandle(parent, stream).handle(), input_nd_.handle(),
4939+
miopen.handle(), input_nd_.handle(),
49404940
filter_.handle(), conv_.handle(), bias_nd_.handle(),
49414941
activation_desc));
49424942

49434943
if (!fusion_plan.CompilationSucceeded()) {
49444944
return absl::InternalError("No algorithms found");
49454945
}
49464946

4947-
auto obj = new RocmFusedConvRunner(parent, stream, miopen, input_nd_,
4948-
output_nd_, filter_, bias_nd_, conv_,
4949-
activation_desc, fusion_plan);
4947+
uint64_t workspace_size_ = workspace_size.value_or(0);
4948+
4949+
if (!workspace_size.has_value()) {
4950+
auto status = wrap::miopenFusionPlanGetWorkSpaceSize(
4951+
miopen.handle(), fusion_plan.fusion_plan_, &workspace_size_,
4952+
miopenConvolutionFwdAlgoImplicitGEMM);
4953+
if (status != miopenStatusSuccess) {
4954+
return absl::InternalError(
4955+
"call to miopenFusionPlanGetWorkSpaceSize failed: " +
4956+
stream_executor::gpu::ToString(status));
4957+
}
4958+
}
4959+
4960+
auto obj = new RocmFusedConvRunner(
4961+
parent, stream, miopen_, input_nd_, output_nd_, filter_, bias_nd_,
4962+
conv_, activation_desc, fusion_plan, workspace_size_);
49504963

49514964
return std::unique_ptr<const dnn::FusedConvRunner>(obj);
49524965
}
@@ -4959,7 +4972,8 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
49594972
ScopedTensorDescriptor& bias_nd,
49604973
ScopedConvolutionDescriptor& conv,
49614974
ScopedActivationDescriptor& activation_desc,
4962-
ScopedFusionPlanConvolutionBiasActivation& fusion_plan)
4975+
ScopedFusionPlanConvolutionBiasActivation& fusion_plan,
4976+
uint64_t workspace_size)
49634977
: parent_(parent),
49644978
miopen_(miopen),
49654979
input_nd_(std::move(input_nd)),
@@ -4968,15 +4982,16 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
49684982
bias_nd_(std::move(bias_nd)),
49694983
conv_(std::move(conv)),
49704984
activation_desc_(std::move(activation_desc)),
4971-
fusion_plan_(std::move(fusion_plan)) {}
4985+
fusion_plan_(std::move(fusion_plan)),
4986+
workspace_size_(workspace_size) {}
49724987

49734988
// Internal form of ToAlgorithmDesc without the StatusOr.
49744989
dnn::AlgorithmDesc MakeAlgorithmDesc() const {
49754990
dnn::AlgorithmProto algorithm;
49764991
algorithm.set_algo_id(0);
49774992
algorithm.set_math_type(dnn::AlgorithmProto::TENSOR_OP_MATH);
49784993
algorithm.set_is_cudnn_frontend(true);
4979-
algorithm.mutable_workspace_size()->set_value(0);
4994+
algorithm.mutable_workspace_size()->set_value(workspace_size_);
49804995
return dnn::AlgorithmDesc{algorithm};
49814996
}
49824997

@@ -4990,6 +5005,7 @@ class RocmFusedConvRunner : public dnn::FusedConvRunner {
49905005
ScopedConvolutionDescriptor conv_;
49915006
mutable ScopedActivationDescriptor activation_desc_;
49925007
mutable ScopedFusionPlanConvolutionBiasActivation fusion_plan_;
5008+
uint64_t workspace_size_;
49935009
};
49945010

49955011
absl::StatusOr<std::unique_ptr<const dnn::FusedConvRunner>>
@@ -5017,7 +5033,8 @@ MIOpenSupport::FusedConvolveRunnerFromDesc(
50175033
return RocmFusedConvRunner::Create(
50185034
parent_, stream, miopen_.get(), algorithm_desc, input_type, bias_type,
50195035
leakyrelu_alpha, input_descriptor, output_descriptor, filter_descriptor,
5020-
bias_descriptor, convolution_descriptor, activation_mode);
5036+
bias_descriptor, convolution_descriptor, activation_mode,
5037+
algorithm_desc.workspace_size());
50215038
}
50225039

50235040
absl::Status MIOpenSupport::GetFusedConvolveRunners(
@@ -5039,7 +5056,12 @@ absl::Status MIOpenSupport::GetFusedConvolveRunners(
50395056
side_input_scale, leakyrelu_alpha, input_descriptor, filter_descriptor,
50405057
bias_descriptor, output_descriptor, convolution_descriptor,
50415058
activation_mode);
5059+
// No way to invoke fused convs with workspace prior to rocm 7
5060+
#if (TF_ROCM_VERSION >= 70000)
50425061
if (runner_or.ok()) {
5062+
#else
5063+
if (runner_or.ok() && runner_or.value()->GetWorkspaceSize() == 0) {
5064+
#endif
50435065
out_exec_plans->push_back(std::move(runner_or).value());
50445066
}
50455067

0 commit comments

Comments
 (0)