Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions exercises/kernel-matrix-transpose-local-array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
for (int tx = 0; tx < TILE_DIM; ++tx) {
for (int ty = 0; ty < TILE_DIM; ++ty) {

int col = bx * TILE_DIM + tx; // Matrix column index
int row = by * TILE_DIM + ty; // Matrix row index
// Transpose tile offset
int col_t = by * TILE_DIM + tx; // Matrix column index
int row_t = bx * TILE_DIM + ty; // Matrix row index

// Bounds check
if (row < N_r && col < N_c) {
if (row_t < N_c && col_t < N_r) {
Atview(col, row) = Tile[ty][tx];
}
}
Expand Down
28 changes: 12 additions & 16 deletions exercises/kernel-matrix-transpose-local-array_solution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
for (int tx = 0; tx < TILE_DIM; ++tx) {
for (int ty = 0; ty < TILE_DIM; ++ty) {

int col = bx * TILE_DIM + tx; // Matrix column index
int row = by * TILE_DIM + ty; // Matrix row index
// Transpose tile offset
int col_t = by * TILE_DIM + tx; // Matrix column index
int row_t = bx * TILE_DIM + ty; // Matrix row index

// Bounds check
if (row < N_r && col < N_c) {
Atview(col, row) = Tile[ty][tx];
if (row_t < N_c && col_t < N_r) {
Atview(row_t, col_t) = Tile[tx][ty];
}
}
}
Expand Down Expand Up @@ -399,8 +400,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
// These loops iterate over the number of
// tiles needed to carry out the transpose
//
RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_y_loop,
RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_x_loop,
RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_y_direct_unchecked,
RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_x_direct_unchecked,
// This statement will initialize local array memory inside a
// kernel. The cpu_tile_mem policy specifies that memory should be
// allocated on the stack. The entries in the RAJA::ParamList
Expand Down Expand Up @@ -431,10 +432,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct,
RAJA::statement::Lambda<1>
>
>,
// Synchronize threads to ensure all reads
// from the local array are complete
RAJA::statement::CudaSyncThreads
>
>
>
>
Expand Down Expand Up @@ -494,8 +492,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
// These loops iterate over the number of
// tiles needed to carry out the transpose
//
RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_y_loop,
RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_x_loop,
RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_y_direct_unchecked,
RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_x_direct_unchecked,
// This statement will initialize local array memory inside a
// kernel. The cpu_tile_mem policy specifies that memory should be
// allocated on the stack. The entries in the RAJA::ParamList
Expand Down Expand Up @@ -526,10 +524,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct,
RAJA::statement::Lambda<1>
>
>,
// Synchronize threads to ensure all reads
// from the local array are complete
RAJA::statement::HipSyncThreads
>
>
>
>
Expand All @@ -556,6 +551,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
);

CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost);
CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize);
checkResult<int>(Atview, N_c, N_r);
// printResult<int>(Atview, N_c, N_r);
#endif
Expand Down
45 changes: 24 additions & 21 deletions exercises/launch-matrix-transpose-local-array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,18 +145,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
//
// (2) Inner loops to write array data into output array tile
//
// Note: loop order is swapped from above so that output matrix
// data access is stride-1.
//
for (int tx = 0; tx < TILE_DIM; ++tx) {
for (int ty = 0; ty < TILE_DIM; ++ty) {

int col = bx * TILE_DIM + tx; // Matrix column index
int row = by * TILE_DIM + ty; // Matrix row index
// Transpose tile offset
int col_t = by * TILE_DIM + tx; // Matrix column index
int row_t = bx * TILE_DIM + ty; // Matrix row index

// Bounds check
if (row < N_r && col < N_c) {
Atview(col, row) = Tile[ty][tx];
if (row_t < N_c && col_t < N_r) {
Atview(row_t, col_t) = Tile[tx][ty];
}
}
}
Expand Down Expand Up @@ -195,10 +194,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
/// input matrix into the RAJA_TEAM_SHARED memory array
///

RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int row_t, int ty) {
RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int col_t, int tx) {

Atview(col, row) = Tile_Array[ty][tx];
Atview(row_t, col_t) = Tile_Array[tx][ty];

});
});
Expand Down Expand Up @@ -244,18 +243,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {

Tile_Array[ty][tx] = Aview(row, col);
Tile_Array[ty][tx] = Aview(row, col);

});
});

RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int row_t, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int col_t, int tx) {

Atview(col, row) = Tile_Array[ty][tx];
Atview(row_t, col_t) = Tile_Array[tx][ty];

});
});
Expand Down Expand Up @@ -307,10 +306,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<cuda_threads_x>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<cuda_threads_y>(ctx, row_tile, [&] (int row, int ty) {
ctx.teamSync();

Atview(col, row) = Tile_Array[ty][tx];
RAJA::loop_icount<cuda_threads_y>(ctx, col_tile, [&] (int row_t, int ty) {
RAJA::loop_icount<cuda_threads_x>(ctx, row_tile, [&] (int col_t, int tx) {

Atview(row_t, col_t) = Tile_Array[tx][ty];

});
});
Expand Down Expand Up @@ -379,10 +380,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<hip_threads_x>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<hip_threads_y>(ctx, row_tile, [&] (int row, int ty) {
ctx.teamSync();

RAJA::loop_icount<hip_threads_y>(ctx, col_tile, [&] (int row_t, int ty) {
RAJA::loop_icount<hip_threads_x>(ctx, row_tile, [&] (int col_t, int tx) {

d_Atview(col, row) = Tile_Array[ty][tx];
d_Atview(row_t, col_t) = Tile_Array[tx][ty];

});
});
Expand Down
97 changes: 55 additions & 42 deletions exercises/launch-matrix-transpose-local-array_solution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,18 +145,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
//
// (2) Inner loops to write array data into output array tile
//
// Note: loop order is swapped from above so that output matrix
// data access is stride-1.
//
for (int tx = 0; tx < TILE_DIM; ++tx) {
for (int ty = 0; ty < TILE_DIM; ++ty) {
for (int ty = 0; ty < TILE_DIM; ++ty) {
for (int tx = 0; tx < TILE_DIM; ++tx) {

int col = bx * TILE_DIM + tx; // Matrix column index
int row = by * TILE_DIM + ty; // Matrix row index
// Transpose tile offset
int col_t = by * TILE_DIM + tx; // Matrix column index
int row_t = bx * TILE_DIM + ty; // Matrix row index

// Bounds check
if (row < N_r && col < N_c) {
Atview(col, row) = Tile[ty][tx];
if (row_t < N_c && col_t < N_r) {
Atview(row_t, col_t) = Tile[tx][ty];
}
}
}
Expand All @@ -182,9 +181,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
[&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
[&] (RAJA::TypedRangeSegment<int> const &col_tile) {

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

Expand All @@ -196,10 +198,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int row_t, int ty) {
RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int col_t, int tx) {

Atview(col, row) = Tile_Array[ty][tx];
Atview(row_t, col_t) = Tile_Array[tx][ty];

});
});
Expand Down Expand Up @@ -232,9 +234,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::tile<omp_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
RAJA::tile<omp_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
[&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<loop_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
RAJA::tile<loop_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
[&] (RAJA::TypedRangeSegment<int> const &col_tile) {

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

Expand All @@ -246,10 +250,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int row_t, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int col_t, int tx) {

Atview(col, row) = Tile_Array[ty][tx];
Atview(row_t, col_t) = Tile_Array[tx][ty];

});
});
Expand All @@ -274,8 +278,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz);
const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz);

using cuda_teams_y = RAJA::LoopPolicy<RAJA::cuda_block_y_direct>;
using cuda_teams_x = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using cuda_teams_y = RAJA::LoopPolicy<RAJA::cuda_block_y_direct_unchecked>;
using cuda_teams_x = RAJA::LoopPolicy<RAJA::cuda_block_x_direct_unchecked>;

using cuda_threads_y = RAJA::LoopPolicy<RAJA::cuda_thread_y_direct>;
using cuda_threads_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
Expand All @@ -285,12 +289,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

RAJA::launch<cuda_launch_policy>(
RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r),
RAJA::Threads(c_block_sz, r_block_sz)),
RAJA::Threads(c_block_sz, r_block_sz)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::tile<cuda_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
RAJA::tile<cuda_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
[&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<cuda_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
RAJA::tile<cuda_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
[&] (RAJA::TypedRangeSegment<int> const &col_tile) {

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

Expand All @@ -302,16 +308,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<cuda_threads_x>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<cuda_threads_y>(ctx, row_tile, [&] (int row, int ty) {
ctx.teamSync();

Atview(col, row) = Tile_Array[ty][tx];
RAJA::loop_icount<cuda_threads_y>(ctx, col_tile, [&] (int row_t, int ty) {
RAJA::loop_icount<cuda_threads_x>(ctx, row_tile, [&] (int col_t, int tx) {

});
});
Atview(row_t, col_t) = Tile_Array[tx][ty];

});
});

});
});
});
});

});

Expand Down Expand Up @@ -346,8 +354,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz);
const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz);

using hip_teams_y = RAJA::LoopPolicy<RAJA::hip_block_y_direct>;
using hip_teams_x = RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
using hip_teams_y = RAJA::LoopPolicy<RAJA::hip_block_y_direct_unchecked>;
using hip_teams_x = RAJA::LoopPolicy<RAJA::hip_block_x_direct_unchecked>;

using hip_threads_y = RAJA::LoopPolicy<RAJA::hip_thread_y_direct>;
using hip_threads_x = RAJA::LoopPolicy<RAJA::hip_thread_x_direct>;
Expand All @@ -357,12 +365,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

RAJA::launch<hip_launch_policy>
(RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r),
RAJA::Threads(c_block_sz, r_block_sz)),
RAJA::Threads(c_block_sz, r_block_sz)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::tile<hip_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
RAJA::tile<hip_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
[&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<hip_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
RAJA::tile<hip_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
[&] (RAJA::TypedRangeSegment<int> const &col_tile) {

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

Expand All @@ -374,20 +384,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<hip_threads_x>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<hip_threads_y>(ctx, row_tile, [&] (int row, int ty) {
ctx.teamSync();

RAJA::loop_icount<hip_threads_y>(ctx, col_tile, [&] (int row_t, int ty) {
RAJA::loop_icount<hip_threads_x>(ctx, row_tile, [&] (int col_t, int tx) {

d_Atview(col, row) = Tile_Array[ty][tx];
d_Atview(row_t, col_t) = Tile_Array[tx][ty];

});
});
});
});

});
});
});
});

});

CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost);
CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize);
checkResult<int>(Atview, N_c, N_r);
// printResult<int>(Atview, N_c, N_r);
#endif
Expand Down
Loading