diff --git a/examples/03_stencil/CMakeLists.txt b/examples/03_stencil/CMakeLists.txt index 4239059..a1362a4 100644 --- a/examples/03_stencil/CMakeLists.txt +++ b/examples/03_stencil/CMakeLists.txt @@ -17,6 +17,9 @@ hpx_setup_target(stencil_serial) add_executable(stencil_parallel_0 stencil_parallel_0.cpp) hpx_setup_target(stencil_parallel_0) +add_executable(stencil_parallel_vector stencil_parallel_vector.cpp) +hpx_setup_target(stencil_parallel_vector) + add_executable(stencil_parallel_1 stencil_parallel_1.cpp) hpx_setup_target(stencil_parallel_1) @@ -33,6 +36,7 @@ hpx_setup_target(stencil_parallel_4) if (TARGET tutorial) add_hpx_pseudo_dependencies(tutorial.stencil stencil_serial) add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_0) + add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_parallel_vector) add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_1) add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_2) add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_3) diff --git a/examples/03_stencil/README.md b/examples/03_stencil/README.md new file mode 100644 index 0000000..2191d8c --- /dev/null +++ b/examples/03_stencil/README.md @@ -0,0 +1,8 @@ +# Optimized 2D stencil code series + +This example section takes you through an approach to write a serial stencil +and scale it to an optimized distributed 2D stencil. + +To be able to build this directory, you are required to have +[HPX](https://github.com/STEllAR-GROUP/hpx) and +[NSIMD](https://github.com/agenium-scale/nsimd) installed. diff --git a/examples/03_stencil/include/grid.hpp b/examples/03_stencil/include/grid.hpp new file mode 100644 index 0000000..8b9643d --- /dev/null +++ b/examples/03_stencil/include/grid.hpp @@ -0,0 +1,181 @@ +// Copyright (c) 2020 Nikunj Gupta +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef GRID +#define GRID + +#include + +#include +#include +#include + +// Define some commonly used packs +using vdouble = nsimd::pack; +using vfloat = nsimd::pack; + + +// Meta templates to extract type of a pack +template +struct get_type +{ + using type = _Ty; +}; + +template +struct get_type> +{ + using type = _Ty; +}; + + +// Our grid data structure encapsulates the stencil +// and provide with api functions to use in for loops +template +class Grid +{ +public: + using container_t = Container; + using type = typename container_t::value_type; + using alloc_t = typename container_t::allocator_type; + using iterator_t = typename container_t::iterator; + + // Conform rule of 5 + Grid() = default; + Grid(const Grid&) = default; + Grid(Grid&&) = default; + Grid& operator=(const Grid&) = default; + Grid& operator=(Grid&&) = default; + + Grid(std::size_t Nx, std::size_t Ny, type value, alloc_t alloc) + : + nx_(Nx), ny_(Ny), grid_(container_t(Nx * Ny, value, alloc)) + {} + + // Define useful iterators + iterator_t begin() + { + return grid_.begin(); + } + + iterator_t end() + { + return grid_.end(); + } + + // Returns size of a row + std::size_t row_size() + { + return nx_; + } + + // Returns number of columns + std::size_t column_size() + { + return ny_; + } + + // Indexing function + type& in(std::size_t x, std::size_t y) + { + return grid_[y*nx_ + x]; + } + +private: + container_t grid_; + std::size_t nx_; + std::size_t ny_; +}; + + +// Template specializations to allow for correct shuffle and print operations +template < + typename Container, + typename _Tx = typename Container::value_type, + typename _Ty = typename get_type<_Tx>::type + > +struct helper +{ + static void shuffle(Grid& grid, std::size_t ny) + { + std::size_t len = std::size_t(nsimd::len(_Tx())); + std::vector<_Ty> vect(len+2, 0.0); + + // Left shift the first simd register + nsimd::storeu(&vect[1], grid.in(1, ny)); + grid.in(0, ny) = nsimd::loadu<_Tx>(&vect[0]); + + // Right shift the last simd register + nsimd::storeu(&vect[1], grid.in(grid.row_size() - 2, ny)); + grid.in(grid.row_size() - 1, ny) = nsimd::loadu<_Tx>(&vect[2]); + } + + static void print(Grid& grid) + { + const std::size_t nx = grid.row_size(); + const std::size_t ny = grid.column_size(); + + std::size_t len = std::size_t(nsimd::len(_Tx())); + + const std::size_t nx_actual = (nx - 2) * len + 2; + std::vector<_Ty> vect(len, 0.0); + + std::vector<_Ty> line(nx_actual); + + std::cout << "{ "; + for (const auto& elem: line) + std::cout << 1.0 << " , "; + std::cout << "}" << std::endl; + + for (std::size_t y = 1; y < ny-1; ++y) + { + for (std::size_t x = 1; x < nx - 1; ++x) + { + nsimd::storeu(&vect[0], grid.in(x, y)); + for (std::size_t i = 0; i < len; ++i) + { + line[x + (nx-2)*i] = vect[i]; + } + } + line[0] = 0.0; + line[nx_actual - 1] = 0.0; + + std::cout << "{ "; + for (const auto& elem: line) + std::cout << elem << " , "; + std::cout << "}" << std::endl; + } + + std::cout << "{ "; + for (const auto& elem: line) + std::cout << 1.0 << " , "; + std::cout << "}" << std::endl; + } +}; + +template +struct helper +{ + static void shuffle(Grid& grid, std::size_t ny) {} + + static void print(Grid& grid) + { + const std::size_t nx = grid.row_size(); + const std::size_t ny = grid.column_size(); + + for (std::size_t y = 0; y < ny; ++y) + { + std::cout << "{ "; + for (std::size_t x = 0; x < nx; ++x) + { + std::cout << grid.in(x, y) << " , "; + } + std::cout << "}" << std::endl; + } + } +}; + + +#endif diff --git a/examples/03_stencil/include/stencil.hpp b/examples/03_stencil/include/stencil.hpp new file mode 100644 index 0000000..32fd644 --- /dev/null +++ b/examples/03_stencil/include/stencil.hpp @@ -0,0 +1,57 @@ +// Copyright (c) 2016 Thomas Heller +// Copyright (c) 2020 Nikunj Gupta +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef STENCIL +#define STENCIL + +#include +#include +#include + +#include "grid.hpp" + +template +using array_t = std::array, 2>; + +template +void init(array_t& U, std::size_t Nx, std::size_t Ny) +{ + // Initialize the top row with 1 + std::fill(U[0].begin(), U[0].begin() + Nx, 1.0); + std::fill(U[1].begin(), U[1].begin() + Nx, 1.0); + + // Initialize bottom row with 1 + std::fill(U[0].end() - Nx, U[0].end(), 1.0); + std::fill(U[1].end() - Nx, U[1].end(), 1.0); +} + +template +void stencil_update(array_t& U, const std::size_t ny, + const std::size_t len, const std::size_t t) +{ + Grid& curr = U[t % 2]; + Grid& next = U[(t + 1) % 2]; + + std::size_t row_length = curr.row_size(); + + #pragma unroll + for (std::size_t nx = 1; nx < row_length - 1; ++nx) + { + // Stencil operation + next.in(nx, ny) = + (curr.in(nx-1, ny) + curr.in(nx+1, ny) + + curr.in(nx, ny-1) + curr.in(nx, ny+1)) * 0.25f; + } + + // Maintain the halo for next computation in case of simd + if (std::is_same::type>>::value) + { + helper::shuffle(next, ny); + } +} + +#endif diff --git a/examples/03_stencil/stencil_parallel_vector.cpp b/examples/03_stencil/stencil_parallel_vector.cpp new file mode 100644 index 0000000..821bdd1 --- /dev/null +++ b/examples/03_stencil/stencil_parallel_vector.cpp @@ -0,0 +1,122 @@ +// Copyright (c) 2016 Thomas Heller +// Copyright (c) 2020 Nikunj Gupta +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Depends on NSIMD +#include + +#include "include/stencil.hpp" + +template +void stencil_main(const std::size_t& Nx, const std::size_t& Ny, + const std::size_t& steps, const std::size_t& chunk_size) +{ + using allocator_type = hpx::compute::host::block_allocator<_Tx>; + using executor_type = hpx::compute::host::block_executor<>; + using data_type = hpx::compute::vector<_Tx, allocator_type>; + + std::array, 2> U; + + auto numa_domains = hpx::compute::host::numa_domains(); + allocator_type alloc(numa_domains); + + // Get register length + // for float, length = 1 + // for simd register, length = 2/4/8/.. + std::size_t len = 1; + if (std::is_same<_Tx, + nsimd::pack::type>>::value) + len = static_cast(nsimd::len(_Tx())); + + // Create augmented matrix + std::size_t Nx_aug = Nx/len + 2; + std::size_t Ny_aug = Ny + 2; + + U[0] = Grid(Nx_aug, Ny_aug, 0.0, alloc); + U[1] = Grid(Nx_aug, Ny_aug, 0.0, alloc); + + // Initialize the stencil + init(U, Nx_aug, Ny_aug); + + // Range of Ny for stencil operation + auto range = boost::irange(static_cast(1), Ny_aug - 1); + + // Define HPX executor + executor_type executor(numa_domains); + auto policy = hpx::parallel::execution::par.on(executor); + + hpx::util::high_resolution_timer t; + using hpx::parallel::execution::auto_chunk_size; + + for (std::size_t t = 0; t < steps; ++t) + { + hpx::parallel::for_each( + policy, + std::begin(range), std::end(range), + [&U, len, t] (std::size_t i) + { + stencil_update(U, i, len, t); + }); + } + double elapsed = t.elapsed(); + + std::cout << "Working with type: " << + boost::typeindex::type_id<_Tx>().pretty_name() << std::endl; + + std::cout << "Time elapsed: " << elapsed << std::endl; + + double mlups = (Nx * Ny * steps) / 1e6 / elapsed; + std::cout << "MLUPS: " << mlups << "\n\n"; +} + +int hpx_main(hpx::program_options::variables_map& vm) +{ + // Get commandline arguments + std::size_t Nx = vm["Nx"].as(); + std::size_t Ny = vm["Ny"].as(); + std::size_t steps = vm["steps"].as(); + std::size_t chunk_size = vm["chunk-size"].as(); + + stencil_main(Nx, Ny, steps, chunk_size); + stencil_main(Nx, Ny, steps, chunk_size); + stencil_main(Nx, Ny, steps, chunk_size); + stencil_main(Nx, Ny, steps, chunk_size); + + return hpx::finalize(); +} + +int main(int argc, char* argv[]) +{ + using namespace hpx::program_options; + + options_description commandline; + commandline.add_options() + ("Nx", value()->default_value(1024), + "Elements in the x direction") + ("Ny", value()->default_value(1024), + "Elements in the y direction") + ("steps", value()->default_value(100), + "Number of steps to apply the stencil") + ("chunk-size", value()->default_value(2000), + "Number of row upgrades in a task") + ; + + // Initialize the runtime + return hpx::init(commandline, argc, argv); +}