@@ -95,7 +95,7 @@ KERNEL (permute_f_y_axes)(
__attribute__((opencl_unroll_hint(TILE_SIZE)))
for (int i = 0; i < TILE_SIZE; ++i) {
const int x_idx = x_begin + i;
const int f = f_begin + j;
const int f = (f_begin + j) % INPUT0_FEATURE_NUM;
const int y_idx = y_begin + bf_local;
const uint output_offset = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx) - get_sub_group_local_id();
DT_OUTPUT_BLOCK_WRITE(output, output_offset, transpose_buf[j][bf_local][i]);
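
A standalone sketch of the wraparound this guard provides, assuming the tiled feature range can be padded up to a multiple of TILE_SIZE so that f_begin + j steps past INPUT0_FEATURE_NUM (all values below are illustrative, not taken from the kernel):

    // Illustrative C++ sketch of the modulo guard (assumed values, not from the kernel).
    #include <cstdio>

    int main() {
        const int INPUT0_FEATURE_NUM = 6;  // real feature count (assumption)
        const int TILE_SIZE = 4;           // tile extent along f (assumption)
        const int f_begin = 4;             // second tile overruns the feature count
        for (int j = 0; j < TILE_SIZE; ++j) {
            const int f = (f_begin + j) % INPUT0_FEATURE_NUM;  // 4, 5, 0, 1 instead of 4, 5, 6, 7
            std::printf("j=%d -> f=%d\n", j, f);               // f never reaches INPUT0_FEATURE_NUM
        }
        return 0;
    }
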
@@ -149,7 +149,7 @@ KERNEL (permute_f_y_axes)(
__attribute__((opencl_unroll_hint(J_TIMES)))
for (int j = 0; j < J_TIMES; ++j) {
const int j_vec = j * VEC_SIZE;
const int f = f_begin + j_vec;
const int f = (f_begin + j_vec) % INPUT0_FEATURE_NUM;
const int y_idx = y_begin + bf_local;
const int output_idx = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx);
WRITE_VEC(READ_VEC(0, &transpose_buf[bf_local][j_vec]), 0, &output[output_idx]);
@@ -171,7 +171,7 @@ KERNEL (permute_f_y_axes)(
}
__attribute__((opencl_unroll_hint(TILE_SIZE)))
for (int j = 0; j < TILE_SIZE; ++j) {
const int f = f_begin + j;
const int f = (f_begin + j) % INPUT0_FEATURE_NUM;
const int y_idx = y_begin + bf_local;
const uint output_offset = OUTPUT_GET_INDEX(b_idx, y_idx, f, x_idx) - get_sub_group_local_id();
DT_OUTPUT_BLOCK_WRITE(output, output_offset, transpose_buf[j][bf_local]);
@@ -24,10 +24,7 @@ size_t GetDivisor(const size_t input_size) {
return input_size % i == 0;
};
auto result = std::find_if(begin(v), end(v), is_divided);
if (result != end(v)) {
return *result;
}
return 1;
return *result;
}
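
For context, a self-contained sketch of how GetDivisor presumably behaves after dropping the end(v) fallback — assuming, as the removal implies, that the candidate list v ends with 1, so std::find_if always finds a divisor and dereferencing the iterator is safe:

    // Hypothetical reconstruction; the real candidate list is defined outside this hunk.
    #include <algorithm>
    #include <vector>

    size_t GetDivisorSketch(const size_t input_size) {
        const std::vector<size_t> v = {16, 8, 4, 2, 1};  // assumed candidates, with 1 last
        auto is_divided = [input_size](size_t i) { return input_size % i == 0; };
        auto result = std::find_if(begin(v), end(v), is_divided);  // cannot be end(v) if 1 is present
        return *result;
    }
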

bool IsSimpleMemCopyOperation(const permute_params& params) {
@@ -134,7 +131,7 @@ JitConstants PermuteKernel_f_y_axes::GetJitConstants(const permute_params& param
const size_t tile_width = GetTileWidth(params);
const size_t vector_size = std::min(tile_width, static_cast<size_t>(4));
const size_t tile_size = GetTileSize(params);
const size_t j_times = tile_size / vector_size;
const size_t j_times = IsSimpleMemCopyOperation(params) ? tile_width / vector_size : tile_size / vector_size;
const size_t feature_block_size = GetFeatureBlockSize(params);
jit.AddConstant(MakeJitConstant("BLOCK_SIZE", tile_width));
jit.AddConstant(MakeJitConstant("VEC_SIZE", vector_size));
@@ -200,7 +197,7 @@ bool PermuteKernel_f_y_axes::Validate(const Params& p) const {

const auto is_swapping_f_with_y = [](const std::vector<uint16_t>& order) {
// Target transform: Swap feature with y
// OV order: 0 2 1 3 => bfyx -> byfx
// IE order: 0 2 1 3 => bfyx -> byfx
// cldnn order: 0 3 2 1 => bfxy -> byxf
if (order.size() != 4) {
return false;
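
A hypothetical completion of the order check described by the comments above (the rest of the lambda body sits outside this hunk); it only tests the two documented patterns:

    // Sketch under the stated assumption; not the actual hidden body.
    #include <cstdint>
    #include <vector>

    bool is_swapping_f_with_y_sketch(const std::vector<uint16_t>& order) {
        if (order.size() != 4) {
            return false;
        }
        const std::vector<uint16_t> ov_order{0, 2, 1, 3};     // bfyx -> byfx
        const std::vector<uint16_t> cldnn_order{0, 3, 2, 1};  // bfxy -> byxf
        return order == ov_order || order == cldnn_order;
    }
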
@@ -244,5 +241,4 @@ bool PermuteKernel_f_y_axes::Validate(const Params& p) const {
KernelsPriority PermuteKernel_f_y_axes::GetKernelsPriority(const Params& /*params*/) const {
return FORCE_PRIORITY_3;
}

} // namespace kernel_selector
@@ -16,7 +16,7 @@ permute_kernel_selector::permute_kernel_selector() {
Attach<PermuteKernel_tile_8x8_4x4>();
Attach<PermuteKernel_tile_8x8_4x4_fsv>();
Attach<PermuteKernel_bfzyx_to_bfyxz>();
// Attach<PermuteKernel_f_y_axes>();
Attach<PermuteKernel_f_y_axes>();
}

KernelsData permute_kernel_selector::GetBestKernels(const Params& params) const {
96 changes: 79 additions & 17 deletions src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp
@@ -2248,18 +2248,38 @@ TEST_P(permute_tile_fsv_5d, i64_cached) {

class permute_f_y_axes_tile: public TiledPermuteTest {};

// Test cases are disabled because permute_f_y_axes_tile kernel itself is disabled for accuracy issue
// INSTANTIATE_TEST_SUITE_P(smoke_permute_f_y_axes_tile,
// permute_f_y_axes_tile,
// ::testing::ValuesIn(std::vector<TiledPermuteParam>{
// {{1, 4, 8, 1}, format::bfyx}, // permute_f_y_axes
// {{1, 64, 32, 1}, format::bfyx}, // permute_f_y_axes
// {{1, 32, 256, 512}, format::b_fs_yx_fsv32}, // THREE_DIM_TRANSPOSE
// {{1, 32, 256, 512}, format::bfyx}, // PERMUTE_SIMPLE_MEM_COPY
// {{1, 256, 256, 1}, format::b_fs_yx_fsv32}, // permute_f_y_axes
// {{1, 32, 16, 4}, format::b_fs_yx_fsv16}, // THREE_DIM_TRANSPOSE
// }),
// TiledPermuteTest::PrintToStringParamName);
INSTANTIATE_TEST_SUITE_P(smoke_permute_f_y_axes_tile,
permute_f_y_axes_tile,
::testing::ValuesIn(std::vector<TiledPermuteParam>{
{{1, 4, 8, 1}, format::bfyx}, // permute_f_y_axes
{{1, 64, 32, 1}, format::bfyx}, // permute_f_y_axes
{{1, 32, 256, 512}, format::b_fs_yx_fsv32}, // THREE_DIM_TRANSPOSE
{{1, 32, 256, 512}, format::bfyx}, // PERMUTE_SIMPLE_MEM_COPY
{{1, 256, 256, 1}, format::b_fs_yx_fsv32}, // permute_f_y_axes
{{1, 32, 16, 4}, format::b_fs_yx_fsv16}, // THREE_DIM_TRANSPOSE
// 4 batch version
{{4, 4, 4, 1}, format::bfyx}, // permute_f_y_axes
// 32 batch version
{{32, 4, 8, 1}, format::bfyx}, // permute_f_y_axes
{{32, 64, 32, 1}, format::bfyx}, // permute_f_y_axes
{{32, 128, 196, 1}, format::bfyx}, // permute_f_y_axes
{{32, 196, 4, 16}, format::bfyx}, // permute_f_y_axes
{{32, 196, 4, 32}, format::bfyx}, // permute_f_y_axes
{{32, 4, 196, 32}, format::bfyx}, // permute_f_y_axes
{{32, 196, 128, 1}, format::bfyx}, // permute_f_y_axes
{{32, 196, 8, 16}, format::bfyx}, // permute_f_y_axes
{{16, 32, 128, 512}, format::b_fs_yx_fsv32}, // THREE_DIM_TRANSPOSE
{{16, 32, 128, 512}, format::bfyx}, // PERMUTE_SIMPLE_MEM_COPY
{{32, 256, 256, 1}, format::b_fs_yx_fsv32}, // permute_f_y_axes
{{32, 32, 16, 4}, format::b_fs_yx_fsv16}, // THREE_DIM_TRANSPOSE
{{32, 16, 16, 16}, format::bfyx},
{{32, 16, 8, 16}, format::bfyx},
{{32, 16, 16, 64}, format::bfyx},
{{32, 16, 8, 32}, format::bfyx},
{{32, 8, 16, 32}, format::bfyx},
{{32, 196, 8, 64}, format::bfyx}, // permute_f_y_axes
}),
TiledPermuteTest::PrintToStringParamName);

TEST_P(permute_f_y_axes_tile, combined) {
auto p = GetParam();
@@ -2273,14 +2293,53 @@ TEST_P(permute_f_y_axes_tile, combined) {

struct TiledPerformancePermuteTest : TiledPermuteTest
{
static double get_exectime(const std::map<cldnn::primitive_id, cldnn::network_output>& outputs,
const std::string& primitive_id)
{
using namespace std::chrono;
std::shared_ptr<event> e = outputs.at(primitive_id).get_event();
e->wait(); // should ensure execution completion, if not segfault will occur
double avg_time = 0.0;
auto intervals = e->get_profiling_info();
for (const auto& q : intervals)
{
if (q.stage != instrumentation::profiling_stage::executing) {
continue;
}
avg_time = duration_cast<duration<double, microseconds::period>>(q.value->value()).count();
break;
}
return avg_time;
}

static void print_all_perf(std::map<primitive_id, network_output> outputs)
{
std::cout << "Print last run time" << std::endl;
using namespace std::chrono;
for( const auto &n : outputs ) {
std::shared_ptr<event> e = n.second.get_event();
auto intervals = e->get_profiling_info();
double time = 0.0;
for (const auto& q : intervals)
{
if (q.stage != instrumentation::profiling_stage::executing) {
continue;
}
time = duration_cast<duration<double, microseconds::period>>(q.value->value()).count();
break;
}
std::cout << n.first << ":" << time << std::endl;
}
std::cout << std::endl;
}

template<data_types Data_Type>
void execute_perf_test(const std::vector<cldnn::tensor::value_type>& sizes, cldnn::format format_fsv,
const std::string & kernel_name, std::vector<uint16_t> permute_order)
{
auto& engine = get_test_engine();
// convert ov::float16 to ov::float16
using type_ = typename ov::element_type_traits<Data_Type>::value_type;
using type = typename std::conditional<std::is_same<type_, ov::float16>::value, ov::float16, type_>::type;
// convert half_t to FLOAT16
using type = typename ov::element_type_traits<Data_Type>::value_type;

std::vector<cldnn::tensor::value_type> internal_sizes(sizes);
std::swap(internal_sizes.at(2), internal_sizes.back());
@@ -2342,11 +2401,11 @@ struct TiledPerformancePermuteTest : TiledPermuteTest
double exectime_opt = 0.f;
for (int i = 0; i < r; ++i) {
output_permute_opt = network_tile.execute();
auto t_opt = get_profiling_exectime(output_permute_opt, "output");
auto t_opt = get_exectime(output_permute_opt, "output");
exectime_opt += t_opt;

output_permute_ref = network_ref.execute();
auto t_ref = get_profiling_exectime(output_permute_ref, "output");
auto t_ref = get_exectime(output_permute_ref, "output");
exectime_ref += t_ref;
}
exectime_ref /= r;
@@ -2365,14 +2424,17 @@ struct TiledPerformancePermuteTest : TiledPermuteTest
<< frm_str << " " << input_type << " " << exectime_opt << std::endl;

}

};


// No need to run performance tests on CI
TEST_P(TiledPerformancePermuteTest, DISABLED_f32) {
auto p = GetParam();
execute_perf_test<cldnn::data_types::f32>(p.sizes, p.format_fsv, "permute_f_y_axes", {0, 2, 1, 3});
}


INSTANTIATE_TEST_SUITE_P(, TiledPerformancePermuteTest,
::testing::ValuesIn(std::vector<TiledPermuteParam> {
// b_fs_zy_fsv16