@@ -95,7 +95,7 @@ KERNEL (permute_f_y_axes)(
__attribute__((opencl_unroll_hint(TILE_SIZE)))
for (int i = 0; i < TILE_SIZE; ++i) {
const int x_idx = x_begin + i;
const int f = f_begin + j;
const int f = (f_begin + j) % INPUT0_FEATURE_NUM;
const int y_idx = y_begin + bf_local;
const uint output_offset = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx) - get_sub_group_local_id();
DT_OUTPUT_BLOCK_WRITE(output, output_offset, transpose_buf[j][bf_local][i]);
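
A standalone sketch of the wraparound this guard provides, assuming the tiled feature range can be padded up to a multiple of TILE_SIZE so that f_begin + j steps past INPUT0_FEATURE_NUM (all values below are illustrative, not taken from the kernel):

    // Illustrative C++ sketch of the modulo guard (assumed values, not from the kernel).
    #include <cstdio>

    int main() {
        const int INPUT0_FEATURE_NUM = 6;  // real feature count (assumption)
        const int TILE_SIZE = 4;           // tile extent along f (assumption)
        const int f_begin = 4;             // second tile overruns the feature count
        for (int j = 0; j < TILE_SIZE; ++j) {
            const int f = (f_begin + j) % INPUT0_FEATURE_NUM;  // 4, 5, 0, 1 instead of 4, 5, 6, 7
            std::printf("j=%d -> f=%d\n", j, f);               // f never reaches INPUT0_FEATURE_NUM
        }
        return 0;
    }
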
@@ -149,7 +149,7 @@ KERNEL (permute_f_y_axes)(
__attribute__((opencl_unroll_hint(J_TIMES)))
for (int j = 0; j < J_TIMES; ++j) {
const int j_vec = j * VEC_SIZE;
const int f = f_begin + j_vec;
const int f = (f_begin + j_vec) % INPUT0_FEATURE_NUM;
const int y_idx = y_begin + bf_local;
const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx);
WRITE_VEC(READ_VEC(0, &transpose_buf[bf_local][j_vec]), 0, &output[output_idx]);
@@ -171,7 +171,7 @@ KERNEL (permute_f_y_axes)(
}
__attribute__((opencl_unroll_hint(TILE_SIZE)))
for (int j = 0; j < TILE_SIZE; ++j) {
const int f = f_begin + j;
const int f = (f_begin + j) % INPUT0_FEATURE_NUM;
const int y_idx = y_begin + bf_local;
const uint output_offset = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx) - get_sub_group_local_id();
DT_OUTPUT_BLOCK_WRITE(output, output_offset, transpose_buf[j][bf_local]);
@@ -24,10 +24,7 @@ size_t GetDivisor(const size_t input_size) {
return input_size % i == 0;
};
auto result = std::find_if(begin(v), end(v), is_divided);
if (result != end(v)) {
return *result;
}
return 1;
return *result;
}
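
For context, a self-contained sketch of how GetDivisor presumably behaves after dropping the end(v) fallback — assuming, as the removal implies, that the candidate list v ends with 1, so std::find_if always finds a divisor and dereferencing the iterator is safe:

    // Hypothetical reconstruction; the real candidate list is defined outside this hunk.
    #include <algorithm>
    #include <vector>

    size_t GetDivisorSketch(const size_t input_size) {
        const std::vector<size_t> v = {16, 8, 4, 2, 1};  // assumed candidates, with 1 last
        auto is_divided = [input_size](size_t i) { return input_size % i == 0; };
        auto result = std::find_if(begin(v), end(v), is_divided);  // cannot be end(v) if 1 is present
        return *result;
    }
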

bool IsSimpleMemCopyOperation(const permute_params& params) {
@@ -134,7 +131,7 @@ JitConstants PermuteKernel_f_y_axes::GetJitConstants(const permute_params& param
const size_t tile_width = GetTileWidth(params);
const size_t vector_size = std::min(tile_width, static_cast<size_t>(4));
const size_t tile_size = GetTileSize(params);
const size_t j_times = tile_size / vector_size;
const size_t j_times = IsSimpleMemCopyOperation(params) ? tile_width / vector_size : tile_size / vector_size;
const size_t feature_block_size = GetFeatureBlockSize(params);
jit.AddConstant(MakeJitConstant("BLOCK_SIZE", tile_width));
jit.AddConstant(MakeJitConstant("VEC_SIZE", vector_size));
@@ -200,7 +197,7 @@ bool PermuteKernel_f_y_axes::Validate(const Params& p) const {

const auto is_swapping_f_with_y = [](const std::vector<uint16_t>& order) {
// Target transform: Swap feature with y
// OV order: 0 2 1 3 => bfyx -> byfx
// IE order: 0 2 1 3 => bfyx -> byfx
// cldnn order: 0 3 2 1 => bfxy -> byxf
if (order.size() != 4) {
return false;
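
A hypothetical completion of the order check described by the comments above (the rest of the lambda body sits outside this hunk); it only tests the two documented patterns:

    // Sketch under the stated assumption; not the actual hidden body.
    #include <cstdint>
    #include <vector>

    bool is_swapping_f_with_y_sketch(const std::vector<uint16_t>& order) {
        if (order.size() != 4) {
            return false;
        }
        const std::vector<uint16_t> ov_order{0, 2, 1, 3};     // bfyx -> byfx
        const std::vector<uint16_t> cldnn_order{0, 3, 2, 1};  // bfxy -> byxf
        return order == ov_order || order == cldnn_order;
    }
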
@@ -244,5 +241,4 @@ bool PermuteKernel_f_y_axes::Validate(const Params& p) const {
KernelsPriority PermuteKernel_f_y_axes::GetKernelsPriority(const Params& /*params*/) const {
return FORCE_PRIORITY_3;
}

} // namespace kernel_selector
@@ -16,7 +16,7 @@ permute_kernel_selector::permute_kernel_selector() {
Attach<PermuteKernel_tile_8x8_4x4>();
Attach<PermuteKernel_tile_8x8_4x4_fsv>();
Attach<PermuteKernel_bfzyx_to_bfyxz>();
// Attach<PermuteKernel_f_y_axes>();
Attach<PermuteKernel_f_y_axes>();
}

KernelsData permute_kernel_selector::GetBestKernels(const Params& params) const {
96 changes: 79 additions & 17 deletions src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp
@@ -2248,18 +2248,38 @@ TEST_P(permute_tile_fsv_5d, i64_cached) {

class permute_f_y_axes_tile: public TiledPermuteTest {};

// Test cases are disabled because permute_f_y_axes_tile kernel itself is disabled for accuracy issue
// INSTANTIATE_TEST_SUITE_P(smoke_permute_f_y_axes_tile,
// permute_f_y_axes_tile,
// ::testing::ValuesIn(std::vector<TiledPermuteParam>{
// {{1, 4, 8, 1}, format::bfyx}, // permute_f_y_axes
// {{1, 64, 32, 1}, format::bfyx}, // permute_f_y_axes
// {{1, 32, 256, 512}, format::b_fs_yx_fsv32}, // THREE_DIM_TRANSPOSE
// {{1, 32, 256, 512}, format::bfyx}, // PERMUTE_SIMPLE_MEM_COPY
// {{1, 256, 256, 1}, format::b_fs_yx_fsv32}, // permute_f_y_axes
// {{1, 32, 16, 4}, format::b_fs_yx_fsv16}, // THREE_DIM_TRANSPOSE
// }),
// TiledPermuteTest::PrintToStringParamName);
INSTANTIATE_TEST_SUITE_P(smoke_permute_f_y_axes_tile,
permute_f_y_axes_tile,
::testing::ValuesIn(std::vector<TiledPermuteParam>{
{{1, 4, 8, 1}, format::bfyx}, // permute_f_y_axes
{{1, 64, 32, 1}, format::bfyx}, // permute_f_y_axes
{{1, 32, 256, 512}, format::b_fs_yx_fsv32}, // THREE_DIM_TRANSPOSE
{{1, 32, 256, 512}, format::bfyx}, // PERMUTE_SIMPLE_MEM_COPY
{{1, 256, 256, 1}, format::b_fs_yx_fsv32}, // permute_f_y_axes
{{1, 32, 16, 4}, format::b_fs_yx_fsv16}, // THREE_DIM_TRANSPOSE
// 4 batch version
{{4, 4, 4, 1}, format::bfyx}, // permute_f_y_axes
// 32 batch version
{{32, 4, 8, 1}, format::bfyx}, // permute_f_y_axes
{{32, 64, 32, 1}, format::bfyx}, // permute_f_y_axes
{{32, 128, 196, 1}, format::bfyx}, // permute_f_y_axes
{{32, 196, 4, 16}, format::bfyx}, // permute_f_y_axes
{{32, 196, 4, 32}, format::bfyx}, // permute_f_y_axes
{{32, 4, 196, 32}, format::bfyx}, // permute_f_y_axes
{{32, 196, 128, 1}, format::bfyx}, // permute_f_y_axes
{{32, 196, 8, 16}, format::bfyx}, // permute_f_y_axes
{{16, 32, 128, 512}, format::b_fs_yx_fsv32}, // THREE_DIM_TRANSPOSE
{{16, 32, 128, 512}, format::bfyx}, // PERMUTE_SIMPLE_MEM_COPY
{{32, 256, 256, 1}, format::b_fs_yx_fsv32}, // permute_f_y_axes
{{32, 32, 16, 4}, format::b_fs_yx_fsv16}, // THREE_DIM_TRANSPOSE
{{32, 16, 16, 16}, format::bfyx},
{{32, 16, 8, 16}, format::bfyx},
{{32, 16, 16, 64}, format::bfyx},
{{32, 16, 8, 32}, format::bfyx},
{{32, 8, 16, 32}, format::bfyx},
{{32, 196, 8, 64}, format::bfyx}, // permute_f_y_axes
}),
TiledPermuteTest::PrintToStringParamName);

TEST_P(permute_f_y_axes_tile, combined) {
auto p = GetParam();
@@ -2273,14 +2293,53 @@ TEST_P(permute_f_y_axes_tile, combined) {

struct TiledPerformancePermuteTest : TiledPermuteTest
{
static double get_exectime(const std::map<cldnn::primitive_id, cldnn::network_output>& outputs,
const std::string& primitive_id)
{
using namespace std::chrono;
std::shared_ptr<event> e = outputs.at(primitive_id).get_event();
e->wait(); // should ensure execution completion, if not segfault will occur
double avg_time = 0.0;
auto intervals = e->get_profiling_info();
for (const auto& q : intervals)
{
if (q.stage != instrumentation::profiling_stage::executing) {
continue;
}
avg_time = duration_cast<duration<double, microseconds::period>>(q.value->value()).count();
break;
}
return avg_time;
}

static void print_all_perf(std::map<primitive_id, network_output> outputs)
{
std::cout << "Print last run time" << std::endl;
using namespace std::chrono;
for( const auto &n : outputs ) {
std::shared_ptr<event> e = n.second.get_event();
auto intervals = e->get_profiling_info();
double time = 0.0;
for (const auto& q : intervals)
{
if (q.stage != instrumentation::profiling_stage::executing) {
continue;
}
time = duration_cast<duration<double, microseconds::period>>(q.value->value()).count();
break;
}
std::cout << n.first << ":" << time << std::endl;
}
std::cout << std::endl;
}

template<data_types Data_Type>
void execute_perf_test(const std::vector<cldnn::tensor::value_type>& sizes, cldnn::format format_fsv,
const std::string & kernel_name, std::vector<uint16_t> permute_order)
{
auto& engine = get_test_engine();
// convert ov::float16 to ov::float16
using type_ = typename ov::element_type_traits<Data_Type>::value_type;
using type = typename std::conditional<std::is_same<type_, ov::float16>::value, ov::float16, type_>::type;
// convert half_t to FLOAT16
using type = typename ov::element_type_traits<Data_Type>::value_type;

std::vector<cldnn::tensor::value_type> internal_sizes(sizes);
std::swap(internal_sizes.at(2), internal_sizes.back());
@@ -2342,11 +2401,11 @@ struct TiledPerformancePermuteTest : TiledPermuteTest
double exectime_opt = 0.f;
for (int i = 0; i < r; ++i) {
output_permute_opt = network_tile.execute();
auto t_opt = get_profiling_exectime(output_permute_opt, "output");
auto t_opt = get_exectime(output_permute_opt, "output");
exectime_opt += t_opt;

output_permute_ref = network_ref.execute();
auto t_ref = get_profiling_exectime(output_permute_ref, "output");
auto t_ref = get_exectime(output_permute_ref, "output");
exectime_ref += t_ref;
}
exectime_ref /= r;
@@ -2365,14 +2424,17 @@ struct TiledPerformancePermuteTest : TiledPermuteTest
<< frm_str << " " << input_type << " " << exectime_opt << std::endl;

}

};


// No need to run performance tests on CI
TEST_P(TiledPerformancePermuteTest, DISABLED_f32) {
auto p = GetParam();
execute_perf_test<cldnn::data_types::f32>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
}


INSTANTIATE_TEST_SUITE_P(, TiledPerformancePermuteTest,
::testing::ValuesIn(std::vector<TiledPermuteParam> {
// b_fs_zy_fsv16