diff --git a/exercises/kernel-matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array.cpp index 793fa300a8..8ebc04de51 100644 --- a/exercises/kernel-matrix-transpose-local-array.cpp +++ b/exercises/kernel-matrix-transpose-local-array.cpp @@ -154,11 +154,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int tx = 0; tx < TILE_DIM; ++tx) { for (int ty = 0; ty < TILE_DIM; ++ty) { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + // Transpose tile offset + int col_t = by * TILE_DIM + tx; // Matrix column index + int row_t = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { + if (row_t < N_c && col_t < N_r) { - Atview(col, row) = Tile[ty][tx]; + Atview(row_t, col_t) = Tile[tx][ty]; } } } diff --git a/exercises/kernel-matrix-transpose-local-array_solution.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp index bf61bca57b..bcc2a9d02c 100644 --- a/exercises/kernel-matrix-transpose-local-array_solution.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -154,12 +154,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int tx = 0; tx < TILE_DIM; ++tx) { for (int ty = 0; ty < TILE_DIM; ++ty) { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + // Transpose tile offset + int col_t = by * TILE_DIM + tx; // Matrix column index + int row_t = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { - Atview(col, row) = Tile[ty][tx]; + if (row_t < N_c && col_t < N_r) { + Atview(row_t, col_t) = Tile[tx][ty]; } } } @@ -399,8 +400,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, + 
RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_direct_unchecked, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_direct_unchecked, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. The entries in the RAJA::ParamList @@ -431,10 +432,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<1> > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::CudaSyncThreads + > > > > @@ -494,8 +492,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // These loops iterate over the number of // tiles needed to carry out the transpose // - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_direct_unchecked, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_direct_unchecked, // This statement will initalize local array memory inside a // kernel. The cpu_tile_mem policy specifies that memory should be // allocated on the stack. 
The entries in the RAJA::ParamList @@ -526,10 +524,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<1> > - >, - // Synchronize threads to ensure all reads - // from the local array are complete - RAJA::statement::HipSyncThreads + > > > > @@ -556,6 +551,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) ); CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost); + CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif diff --git a/exercises/launch-matrix-transpose-local-array.cpp b/exercises/launch-matrix-transpose-local-array.cpp index dda01643e0..2663f4356a 100644 --- a/exercises/launch-matrix-transpose-local-array.cpp +++ b/exercises/launch-matrix-transpose-local-array.cpp @@ -145,18 +145,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (2) Inner loops to write array data into output array tile // - // Note: loop order is swapped from above so that output matrix - // data access is stride-1. 
// for (int tx = 0; tx < TILE_DIM; ++tx) { for (int ty = 0; ty < TILE_DIM; ++ty) { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + // Transpose tile offset + int col_t = by * TILE_DIM + tx; // Matrix column index + int row_t = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { - Atview(col, row) = Tile[ty][tx]; + if (row_t < N_c && col_t < N_r) { + Atview(row_t, col_t) = Tile[tx][ty]; } } } @@ -195,10 +194,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// input matrix into the RAJA_TEAM_SHARED memory array /// - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -244,18 +243,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - Tile_Array[ty][tx] = Aview(row, col); + Tile_Array[ty][tx] = Aview(row, col); }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -307,10 +306,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + 
ctx.teamSync(); - Atview(col, row) = Tile_Array[ty][tx]; + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { + + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -379,10 +380,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + ctx.teamSync(); + + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - d_Atview(col, row) = Tile_Array[ty][tx]; + d_Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp index d57dc5df53..2fcf8d5770 100644 --- a/exercises/launch-matrix-transpose-local-array_solution.cpp +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -145,18 +145,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // (2) Inner loops to write array data into output array tile // - // Note: loop order is swapped from above so that output matrix - // data access is stride-1. 
// - for (int tx = 0; tx < TILE_DIM; ++tx) { - for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { - int col = bx * TILE_DIM + tx; // Matrix column index - int row = by * TILE_DIM + ty; // Matrix row index + // Transpose tile offset + int col_t = by * TILE_DIM + tx; // Matrix column index + int row_t = bx * TILE_DIM + ty; // Matrix row index // Bounds check - if (row < N_r && col < N_c) { - Atview(col, row) = Tile[ty][tx]; + if (row_t < N_c && col_t < N_r) { + Atview(row_t, col_t) = Tile[tx][ty]; } } } @@ -182,9 +181,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), + [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), + [&] (RAJA::TypedRangeSegment const &col_tile) { RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; @@ -196,10 +198,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -232,9 +234,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, 
N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), + [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), + [&] (RAJA::TypedRangeSegment const &col_tile) { RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; @@ -246,10 +250,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - Atview(col, row) = Tile_Array[ty][tx]; + Atview(row_t, col_t) = Tile_Array[tx][ty]; }); }); @@ -274,8 +278,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); - using cuda_teams_y = RAJA::LoopPolicy; - using cuda_teams_x = RAJA::LoopPolicy; + using cuda_teams_y = RAJA::LoopPolicy; + using cuda_teams_x = RAJA::LoopPolicy; using cuda_threads_y = RAJA::LoopPolicy; using cuda_threads_x = RAJA::LoopPolicy; @@ -285,12 +289,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::launch( RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), + RAJA::Threads(c_block_sz, r_block_sz)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), + [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, 
RAJA::TypedRangeSegment(0, N_c), + [&] (RAJA::TypedRangeSegment const &col_tile) { RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; @@ -302,16 +308,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + ctx.teamSync(); - Atview(col, row) = Tile_Array[ty][tx]; + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - }); - }); + Atview(row_t, col_t) = Tile_Array[tx][ty]; + + }); + }); - }); - }); + }); + }); }); @@ -346,8 +354,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); - using hip_teams_y = RAJA::LoopPolicy; - using hip_teams_x = RAJA::LoopPolicy; + using hip_teams_y = RAJA::LoopPolicy; + using hip_teams_x = RAJA::LoopPolicy; using hip_threads_y = RAJA::LoopPolicy; using hip_threads_x = RAJA::LoopPolicy; @@ -357,12 +365,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), - RAJA::Threads(c_block_sz, r_block_sz)), + RAJA::Threads(c_block_sz, r_block_sz)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), + [&] (RAJA::TypedRangeSegment const &row_tile) { - RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), + [&] (RAJA::TypedRangeSegment const &col_tile) { RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; @@ -374,20 +384,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); }); - 
RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { - RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + ctx.teamSync(); + + RAJA::loop_icount(ctx, col_tile, [&] (int row_t, int ty) { + RAJA::loop_icount(ctx, row_tile, [&] (int col_t, int tx) { - d_Atview(col, row) = Tile_Array[ty][tx]; + d_Atview(row_t, col_t) = Tile_Array[tx][ty]; - }); - }); + }); + }); - }); - }); + }); + }); }); CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost); + CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif