diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl index 3f3e0a3f8c931a..b65ea6f2adae14 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl @@ -95,7 +95,7 @@ KERNEL (permute_f_y_axes)( __attribute__((opencl_unroll_hint(TILE_SIZE))) for (int i = 0; i < TILE_SIZE; ++i) { const int x_idx = x_begin + i; - const int f = f_begin + j; + const int f = (f_begin + j) % INPUT0_FEATURE_NUM; const int y_idx = y_begin + bf_local; const uint output_offset = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx) - get_sub_group_local_id(); DT_OUTPUT_BLOCK_WRITE(output, output_offset, transpose_buf[j][bf_local][i]); @@ -149,7 +149,7 @@ KERNEL (permute_f_y_axes)( __attribute__((opencl_unroll_hint(J_TIMES))) for (int j = 0; j < J_TIMES; ++j) { const int j_vec = j * VEC_SIZE; - const int f = f_begin + j_vec; + const int f = (f_begin + j_vec) % INPUT0_FEATURE_NUM;; const int y_idx = y_begin + bf_local; const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx); WRITE_VEC(READ_VEC(0, &transpose_buf[bf_local][j_vec]), 0, &output[output_idx]); @@ -171,7 +171,7 @@ KERNEL (permute_f_y_axes)( } __attribute__((opencl_unroll_hint(TILE_SIZE))) for (int j = 0; j < TILE_SIZE; ++j) { - const int f = f_begin + j; + const int f = (f_begin + j) % INPUT0_FEATURE_NUM; const int y_idx = y_begin + bf_local; const uint output_offset = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx) - get_sub_group_local_id(); DT_OUTPUT_BLOCK_WRITE(output, output_offset, transpose_buf[j][bf_local]); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp index e602985ae6d70b..f6d1fde5da1545 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp @@ -24,10 +24,7 @@ size_t GetDivisor(const size_t input_size) { return input_size % i == 0; }; auto result = std::find_if(begin(v), end(v), is_divided); - if (result != end(v)) { - return *result; - } - return 1; + return *result; } bool IsSimpleMemCopyOperation(const permute_params& params) { @@ -134,7 +131,7 @@ JitConstants PermuteKernel_f_y_axes::GetJitConstants(const permute_params& param const size_t tile_width = GetTileWidth(params); const size_t vector_size = std::min(tile_width, static_cast(4)); const size_t tile_size = GetTileSize(params); - const size_t j_times = tile_size / vector_size; + const size_t j_times = IsSimpleMemCopyOperation(params) ? tile_width / vector_size : tile_size / vector_size; const size_t feature_block_size = GetFeatureBlockSize(params); jit.AddConstant(MakeJitConstant("BLOCK_SIZE", tile_width)); jit.AddConstant(MakeJitConstant("VEC_SIZE", vector_size)); @@ -200,7 +197,7 @@ bool PermuteKernel_f_y_axes::Validate(const Params& p) const { const auto is_swapping_f_with_y = [](const std::vector& order) { // Target transform: Swap feature with y - // OV order: 0 2 1 3 => bfyx -> byfx + // IE order: 0 2 1 3 => bfyx -> byfx // cldnn order: 0 3 2 1 => bfxy -> byxf if (order.size() != 4) { return false; @@ -244,5 +241,4 @@ bool PermuteKernel_f_y_axes::Validate(const Params& p) const { KernelsPriority PermuteKernel_f_y_axes::GetKernelsPriority(const Params& /*params*/) const { return FORCE_PRIORITY_3; } - } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_selector.cpp index dd9f61823dbb8d..ad5e55d1655e55 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_selector.cpp @@ -16,7 +16,7 @@ permute_kernel_selector::permute_kernel_selector() { Attach(); Attach(); Attach(); - // Attach(); + Attach(); } KernelsData permute_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp index 267a237c855e60..64944aaebb492a 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp @@ -2248,18 +2248,38 @@ TEST_P(permute_tile_fsv_5d, i64_cached) { class permute_f_y_axes_tile: public TiledPermuteTest {}; -// Test cases are disabled because permute_f_y_axes_tile kernel itself is disabled for accuracy issue -// INSTANTIATE_TEST_SUITE_P(smoke_permute_f_y_axes_tile, -// permute_f_y_axes_tile, -// ::testing::ValuesIn(std::vector{ -// {{1, 4, 8, 1}, format::bfyx}, // permute_f_y_axes -// {{1, 64, 32, 1}, format::bfyx}, // permute_f_y_axes -// {{1, 32, 256, 512}, format::b_fs_yx_fsv32}, // THREE_DIM_TRANSPOSE -// {{1, 32, 256, 512}, format::bfyx}, // PERMUTE_SIMPLE_MEM_COPY -// {{1, 256, 256, 1}, format::b_fs_yx_fsv32}, // permute_f_y_axes -// {{1, 32, 16, 4}, format::b_fs_yx_fsv16}, // THREE_DIM_TRANSPOSE -// }), -// TiledPermuteTest::PrintToStringParamName); +INSTANTIATE_TEST_SUITE_P(smoke_permute_f_y_axes_tile, + permute_f_y_axes_tile, + ::testing::ValuesIn(std::vector{ + {{1, 4, 8, 1}, format::bfyx}, // permute_f_y_axes + {{1, 64, 32, 1}, format::bfyx}, // permute_f_y_axes + {{1, 32, 256, 512}, format::b_fs_yx_fsv32}, // THREE_DIM_TRANSPOSE + {{1, 32, 256, 512}, format::bfyx}, // PERMUTE_SIMPLE_MEM_COPY + {{1, 256, 256, 1}, format::b_fs_yx_fsv32}, // permute_f_y_axes + {{1, 32, 16, 4}, format::b_fs_yx_fsv16}, // THREE_DIM_TRANSPOSE + //4 batch version + {{4, 4, 4, 1}, format::bfyx}, // permute_f_y_axes + //32 batch version + {{32, 4, 8, 1}, format::bfyx}, // permute_f_y_axes + {{32, 64, 32, 1}, format::bfyx}, // permute_f_y_axes + {{32, 128, 196, 1}, format::bfyx}, // permute_f_y_axes + {{32, 196, 4, 16}, format::bfyx}, // permute_f_y_axes + {{32, 196, 4, 32}, format::bfyx}, // permute_f_y_axes + {{32, 4, 196, 32}, format::bfyx}, // permute_f_y_axes + {{32, 196, 128, 1}, format::bfyx}, // permute_f_y_axes + {{32, 196, 8, 16}, format::bfyx}, // permute_f_y_axes + {{16, 32, 128, 512}, format::b_fs_yx_fsv32}, // THREE_DIM_TRANSPOSE + {{16, 32, 128, 512}, format::bfyx}, // PERMUTE_SIMPLE_MEM_COPY + {{32, 256, 256, 1}, format::b_fs_yx_fsv32}, // permute_f_y_axes + {{32, 32, 16, 4}, format::b_fs_yx_fsv16}, // THREE_DIM_TRANSPOSE + {{32, 16, 16, 16}, format::bfyx}, + {{32, 16, 8, 16}, format::bfyx}, + {{32, 16, 16, 64}, format::bfyx}, + {{32, 16, 8, 32}, format::bfyx}, + {{32, 8, 16, 32}, format::bfyx}, + {{32, 196, 8, 64}, format::bfyx}, // permute_f_y_axes + }), + TiledPermuteTest::PrintToStringParamName); TEST_P(permute_f_y_axes_tile, combined) { auto p = GetParam(); @@ -2273,14 +2293,53 @@ TEST_P(permute_f_y_axes_tile, combined) { struct TiledPerformancePermuteTest : TiledPermuteTest { + static double get_exectime(const std::map& outputs, + const std::string& primitive_id) + { + using namespace std::chrono; + std::shared_ptr e = outputs.at(primitive_id).get_event(); + e->wait(); // should ensure execution completion, if not segfault will occur + double avg_time = 0.0; + auto intervals = e->get_profiling_info(); + for (const auto& q : intervals) + { + if (q.stage != instrumentation::profiling_stage::executing) { + continue; + } + avg_time = duration_cast>(q.value->value()).count(); + break; + } + return avg_time; + } + + static void print_all_perf(std::map outputs) + { + std::cout << "Print last run time" << std::endl; + using namespace std::chrono; + for( const auto &n : outputs ) { + std::shared_ptr e = n.second.get_event(); + auto intervals = e->get_profiling_info(); + double time = 0.0; + for (const auto& q : intervals) + { + if (q.stage == instrumentation::profiling_stage::executing) { + continue; + } + time = duration_cast>(q.value->value()).count(); + break; + } + std::cout << n.first << ":" << time << std::endl; + } + std::cout << std::endl; + } + template void execute_perf_test(const std::vector& sizes, cldnn::format format_fsv, const std::string & kernel_name, std::vector permute_order) { auto& engine = get_test_engine(); - // convert ov::float16 to ov::float16 - using type_ = typename ov::element_type_traits::value_type; - using type = typename std::conditional::value, ov::float16, type_>::type; + // convert half_t to FLOAT16 + using type = typename ov::element_type_traits::value_type; std::vector internal_sizes(sizes); std::swap(internal_sizes.at(2), internal_sizes.back()); @@ -2342,11 +2401,11 @@ struct TiledPerformancePermuteTest : TiledPermuteTest double exectime_opt = 0.f; for (int i = 0; i < r; ++i) { output_permute_opt = network_tile.execute(); - auto t_opt = get_profiling_exectime(output_permute_opt, "output"); + auto t_opt = get_exectime(output_permute_opt, "output"); exectime_opt += t_opt; output_permute_ref = network_ref.execute(); - auto t_ref = get_profiling_exectime(output_permute_ref, "output"); + auto t_ref = get_exectime(output_permute_ref, "output"); exectime_ref += t_ref; } exectime_ref /= r; @@ -2365,14 +2424,17 @@ struct TiledPerformancePermuteTest : TiledPermuteTest << frm_str << " " << input_type << " " << exectime_opt << std::endl; } + }; + // No need to run performance tests on CI TEST_P(TiledPerformancePermuteTest, DISABLED_f32) { auto p = GetParam(); execute_perf_test(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3}); } + INSTANTIATE_TEST_SUITE_P(, TiledPerformancePermuteTest, ::testing::ValuesIn(std::vector { // b_fs_zy_fsv16