From 37cc6bb15641764b10f3f9b75bcad75c5d940703 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 14 Sep 2023 16:33:12 -0700 Subject: [PATCH 01/20] fft initial stuff --- .clang-format | 4 +- apps/CMakeLists.txt | 3 + apps/fft/CMakeLists.txt | 40 +++++++++ apps/fft/fft-serial-1d.cpp | 31 +++++++ apps/fft/fft.hpp | 176 +++++++++++++++++++++++++++++++++++++ log-gcc.txt | 40 +++++++++ log.txt | 76 ++++++++++++++++ 7 files changed, 368 insertions(+), 2 deletions(-) create mode 100644 apps/fft/CMakeLists.txt create mode 100644 apps/fft/fft-serial-1d.cpp create mode 100644 apps/fft/fft.hpp create mode 100644 log-gcc.txt create mode 100644 log.txt diff --git a/.clang-format b/.clang-format index 335a74a..dd3a3c5 100644 --- a/.clang-format +++ b/.clang-format @@ -42,7 +42,7 @@ BreakBeforeBinaryOperators: None BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeColon BreakInheritanceList: BeforeColon -ColumnLimit: 80 +ColumnLimit: 120 CompactNamespaces: false ContinuationIndentWidth: 4 Cpp11BracedListStyle: true @@ -52,7 +52,7 @@ FixNamespaceComments: true IncludeBlocks: Preserve IndentCaseLabels: true IndentPPDirectives: None -IndentWidth: 2 +IndentWidth: 4 KeepEmptyLinesAtTheStartOfBlocks: true MaxEmptyLinesToKeep: 1 NamespaceIndentation: None diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 7fc4bfd..daba52b 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -21,3 +21,6 @@ add_subdirectory(mdspan-stdpar) message(STATUS "Adding 1d_stencil_stdpar...") add_subdirectory(1d_stencil) + +message(STATUS "Adding fft...") +add_subdirectory(fft) diff --git a/apps/fft/CMakeLists.txt b/apps/fft/CMakeLists.txt new file mode 100644 index 0000000..e957d46 --- /dev/null +++ b/apps/fft/CMakeLists.txt @@ -0,0 +1,40 @@ +project(fft LANGUAGES CXX) + +file(GLOB CPP_SOURCES "*.cpp") + +foreach(source_file ${CPP_SOURCES}) + if(NOT STDPAR STREQUAL "gpu") + if("${source_file}" MATCHES ".*gpu.*scheduler.*" OR "${source_file}" + MATCHES ".*cuda.*") + message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}") + continue() + endif() + endif() + + # get the file name without an extension + get_filename_component(exec_name ${source_file} NAME_WE) + + # add an executable with the same name as the source file + add_executable(${exec_name} ${_EXCLUDE} ${source_file}) + + # add dependency on argparse + add_dependencies(${exec_name} argparse magic_enum) + + set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX + LINKER_LANGUAGE CXX) + target_include_directories( + ${exec_name} + PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include + ${ARGPARSE_INCLUDE_DIR} ${MAGICENUM_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + + target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec) + + set_target_properties( + ${exec_name} + PROPERTIES CXX_STANDARD ${CXX_STANDARD} + CXX_EXTENSIONS NO + INSTALL_RPATH_USE_LINK_PATH ON) + + # installation + install(TARGETS ${exec_name} DESTINATION ${CMAKE_INSTALL_BINDIR}) +endforeach() diff --git a/apps/fft/fft-serial-1d.cpp b/apps/fft/fft-serial-1d.cpp new file mode 100644 index 0000000..70456e2 --- /dev/null +++ b/apps/fft/fft-serial-1d.cpp @@ -0,0 +1,31 @@ +/* + * MIT License + * + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * commons for the fft codes + */ + +#include "fft.hpp" \ No newline at end of file diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp new file mode 100644 index 0000000..7d5da98 --- /dev/null +++ b/apps/fft/fft.hpp @@ -0,0 +1,176 @@ +/* + * MIT License + * + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/* + * commons for the fft codes + */ + +#pragma once + +#include +#include + +#include "argparse/argparse.hpp" +#include "commons.hpp" + +using namespace std::complex_literals; + +// data type +using Real_t = double; +using data_t = std::complex; + +// number of dimensions +constexpr int dims = 1; + +// 1D view +using view_1d = std::extents; + +// 2D view +using view_2d = std::extents; + +// 3D view +using view_3d = std::extents; + +enum class fft_type { fftw, cufft }; +enum class sig_type { square, sinusoid, sawtooth, triangle, sinc, box }; + +using sig_type_t = sig_type; + +// parameters +struct fft_params_t : public argparse::Args { + sig_type_t& sig = kwarg("sig", "input signal type(square, sinusoid, sawtooth, triangle, box)").set_default(signal_type::box); + int& len = kwarg("n,N", "N-point FFT").set_default(1<<20); + bool& print_fft = flag("p,print", "print Fourier transformed signal"); + +#if defined(USE_OMP) + int& nthreads = kwarg("nthreads", "number of threads").set_default(1); +#endif // USE_OMP + + bool& help = flag("h, help", "print help"); + bool& print_time = flag("t,time", "print transform time"); +}; + +void printSignal(data_t* sig, int N) { + std::cout << std::fixed << std::setprecision(1); + + for (int i = 0; i < N; ++i) + std::cout << sig[i] << " "; + + std::cout << std::endl; +} + +class signal +{ +public: + + signal() + { + this->N = 1e3; + t.resize(this->N); + y.resize(this->N); + dt = 1.0 / this->N; + } + + signal(int _N) + { + if (_N <= 0) + { + std::cerr << "FATAL: N must be greater than 0. exiting.." << std::endl; + exit(1); + } + this->N = _N; + t.resize(this->N); + y.resize(this->N); + dt = 1.0 / this->N; + } + + signal(int N, sig_type type=sig_type::box) + { + if (N <= 0) + { + std::cerr << "FATAL: N must be greater than 0. exiting.." << std::endl; + exit(1); + } + + this->N = N; + t.resize(N); + y.resize(N); + dt = 1.0 / N; + signalGenerator(N, type); + } + + void signalGenerator(int N, sig_type type=sig_type::box) + { + int interval = 1/N; + std::vector t(N); + + switch (type) { + case sig_type::square: + for (int i = 0; i < N; ++i) + y[i] = (i < N / 4 || i > 3 * N/4) ? 1.0 : -1.0; + break; + case sig_type::sinusoid: + for (int i = 0; i < N; ++i) + y[i] = std::sin(2.0 * M_PI * i / N); + break; + case sig_type::sawtooth: + for (int i = 0; i < N; ++i) + y[i] = 2.0 * (i / N) - 1.0; + break; + case sig_type::triangle: + for (int i = 0; i < N; ++i) + y[i] = 2.0 * std::abs(2.0 * (i / N) - 1.0) - 1.0; + break; + case sig_type::sinc: + y[0] = 1.0; + for (int i = 1; i < N; ++i) + y[i] = std::sin(2.0 * M_PI * i / N) / (2.0 * M_PI * i / N); + break; + case sig_type::box: + for (int i = 0; i < N; ++i) + y[i] = (i < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; + break; + default: + std::cerr << "FATAL: Unknown signal type. exiting.." 
<< std::endl; + exit(1); + } + } + + ~signal() + { + y.clear(); + t.clear(); + } + +private: + int N; + Real_t dt; + // time axis + std::vector t; + // y(t) axis + std::vector y; +}; \ No newline at end of file diff --git a/log-gcc.txt b/log-gcc.txt new file mode 100644 index 0000000..6d41374 --- /dev/null +++ b/log-gcc.txt @@ -0,0 +1,40 @@ ++ cd /global/homes/m/mhaseeb/repos/nvstdpar/build-gcc/apps/heat-equation ++ ./heat-equation-mdspan -s=50 -n=30000 --time ++ tee gcc-md.txt +Time: 155095 ms ++ T=(128 64 32 16 8 4 2 1) ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=128 ++ tee gcc-omp-128.txt +Time: 15310.8 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=64 ++ tee gcc-omp-64.txt +Time: 15362.4 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=32 ++ tee gcc-omp-32.txt +Time: 15631.2 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=16 ++ tee gcc-omp-16.txt +Time: 18824.7 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=8 ++ tee gcc-omp-8.txt +Time: 30255 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=4 ++ tee gcc-omp-4.txt +Time: 56973.2 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=2 ++ tee gcc-omp-2.txt +Time: 117583 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=1 ++ tee gcc-omp-1.txt +Time: 231557 ms ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee gcc-stdpar-1.txt +Time: 15924.2 ms \ No newline at end of file diff --git a/log.txt b/log.txt new file mode 100644 index 0000000..ed41625 --- /dev/null +++ b/log.txt @@ -0,0 +1,76 @@ ++ cd /global/homes/m/mhaseeb/repos/nvstdpar/build/apps/heat-equation ++ ./heat-equation-mdspan -s=50 -n=30000 --time ++ tee md.txt +Time: 72373.3 ms ++ T=(1 2 4 8 16 32 64) ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=1 ++ OMP_NUM_THREADS=1 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-1.txt +Time: 704823 ms ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=2 ++ OMP_NUM_THREADS=2 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-2.txt +Time: 352537 ms ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=4 ++ OMP_NUM_THREADS=4 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-4.txt +Time: 179607 ms ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=8 ++ OMP_NUM_THREADS=8 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-8.txt +Time: 91341.8 ms ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=16 ++ OMP_NUM_THREADS=16 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-16.txt +Time: 45602.9 ms ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=32 ++ OMP_NUM_THREADS=32 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-32.txt +Time: 24956.7 ms ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=64 ++ OMP_NUM_THREADS=64 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-64.txt +Time: 12437.9 ms ++ unset OMP_NUM_THREADS ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=1 ++ tee omp-1.txt +Time: 258170 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=2 ++ tee omp-2.txt +Time: 129542 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=4 ++ tee omp-4.txt +Time: 65776.1 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=8 ++ tee omp-8.txt +Time: 32570 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time 
--nthreads=16 ++ tee omp-16.txt +Time: 16814.6 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=32 ++ tee omp-32.txt +Time: 11322.6 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=64 ++ tee omp-64.txt +Time: 15135.6 ms From da68278dcddfefcb14903c78c32bf775dcbe7650 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 27 Sep 2023 13:56:24 -0700 Subject: [PATCH 02/20] adding magic_enum for argparse --- .gitmodules | 3 +++ apps/fft/fft-serial-1d.cpp | 9 ++++++++- apps/fft/fft.hpp | 7 ++++--- externals/CMakeLists.txt | 7 +++++++ externals/magic_enum | 1 + 5 files changed, 23 insertions(+), 4 deletions(-) create mode 160000 externals/magic_enum diff --git a/.gitmodules b/.gitmodules index 2bb4aed..b8f3f6d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "externals/argparse"] path = externals/argparse url = https://github.com/mhaseeb123/argparse +[submodule "externals/magic_enum"] + path = externals/magic_enum + url = https://github.com/mhaseeb123/magic_enum diff --git a/apps/fft/fft-serial-1d.cpp b/apps/fft/fft-serial-1d.cpp index 70456e2..6c73237 100644 --- a/apps/fft/fft-serial-1d.cpp +++ b/apps/fft/fft-serial-1d.cpp @@ -28,4 +28,11 @@ * commons for the fft codes */ -#include "fft.hpp" \ No newline at end of file +#include "fft.hpp" + +// +// simulation +// +int main(int argc, char* argv[]) { + return 0; +} diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 7d5da98..bc411d5 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -62,8 +62,9 @@ using sig_type_t = sig_type; // parameters struct fft_params_t : public argparse::Args { - sig_type_t& sig = kwarg("sig", "input signal type(square, sinusoid, sawtooth, triangle, box)").set_default(signal_type::box); - int& len = kwarg("n,N", "N-point FFT").set_default(1<<20); + sig_type_t& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(sig_type_t::box); + int& freq = kwarg("f,freq", "Signal frequency").set_default(1000); + int& len = kwarg("n,N", "N-point FFT").set_default(1<<16); bool& print_fft = flag("p,print", "print Fourier transformed signal"); #if defined(USE_OMP) @@ -152,7 +153,7 @@ class signal break; case sig_type::box: for (int i = 0; i < N; ++i) - y[i] = (i < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; + y[i] = (i < N / 4 || i > 3 * N / 4) ? 1.0 : 0.0; break; default: std::cerr << "FATAL: Unknown signal type. exiting.." 
<< std::endl; diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 8c6216c..b6e9828 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -156,6 +156,9 @@ add_subdirectory(mdspan) message(STATUS "Adding externals/argparse...") add_subdirectory(argparse) +message(STATUS "Adding externals/magic_enum...") +add_subdirectory(magic_enum) + set(MDSPAN_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/mdspan/include CACHE PATH "mdspan include directory") @@ -163,3 +166,7 @@ set(MDSPAN_INCLUDE_DIR set(ARGPARSE_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/argparse/include CACHE PATH "argparse include directory") + +set(MAGICENUM_INCLUDE_DIR + ${CMAKE_CURRENT_SOURCE_DIR}/magic_enum/include + CACHE PATH "magic_enum include directory") \ No newline at end of file diff --git a/externals/magic_enum b/externals/magic_enum new file mode 160000 index 0000000..b291b0c --- /dev/null +++ b/externals/magic_enum @@ -0,0 +1 @@ +Subproject commit b291b0ce5a76e808e05fc0141154e963407372da From 814fd74afb854e5a99f7cb655fc3e53c0f392d18 Mon Sep 17 00:00:00 2001 From: Chuanqiu He <49005493+hcq9102@users.noreply.github.com> Date: Fri, 22 Sep 2023 00:59:57 -0500 Subject: [PATCH 03/20] choleskey code serial/mdspan/stadpar (#26) * choleskey serial and choleskey_stdpar --- apps/CMakeLists.txt | 9 +++ apps/choleskey/CMakeLists.txt | 13 ++++ apps/choleskey/choleskey_serial.cpp | 101 +++++++++++++++++++++++++ apps/choleskey/choleskey_stdpar.cpp | 113 ++++++++++++++++++++++++++++ apps/choleskey/matrixutil.hpp | 41 ++++++++++ 5 files changed, 277 insertions(+) create mode 100644 apps/choleskey/CMakeLists.txt create mode 100644 apps/choleskey/choleskey_serial.cpp create mode 100644 apps/choleskey/choleskey_stdpar.cpp create mode 100644 apps/choleskey/matrixutil.hpp diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index daba52b..1bf106a 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -22,5 +22,14 @@ add_subdirectory(mdspan-stdpar) message(STATUS "Adding 1d_stencil_stdpar...") add_subdirectory(1d_stencil) +# ----------------------------------------------------------------------------------------# +# Add choleskey demo +# ----------------------------------------------------------------------------------------# +message(STATUS "Adding choleskey example...") +add_subdirectory(choleskey) + +# ----------------------------------------------------------------------------------------# +# Add fft demo +# ----------------------------------------------------------------------------------------# message(STATUS "Adding fft...") add_subdirectory(fft) diff --git a/apps/choleskey/CMakeLists.txt b/apps/choleskey/CMakeLists.txt new file mode 100644 index 0000000..bfa1a7a --- /dev/null +++ b/apps/choleskey/CMakeLists.txt @@ -0,0 +1,13 @@ +project(choleskey_stdpar LANGUAGES CXX) + +add_executable(choleskey_serial choleskey_serial.cpp) +target_include_directories( + choleskey_serial + PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include + ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + +add_executable(choleskey_stdpar choleskey_stdpar.cpp) +target_include_directories( + choleskey_stdpar + PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include + ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) diff --git a/apps/choleskey/choleskey_serial.cpp b/apps/choleskey/choleskey_serial.cpp new file mode 100644 index 0000000..ea181d0 --- /dev/null +++ b/apps/choleskey/choleskey_serial.cpp @@ -0,0 +1,101 @@ +// Cholesky Decomposition: mdspan +#include +#include +#include +#include 
"argparse/argparse.hpp" +#include "commons.hpp" +#include "matrixutil.hpp" + +using namespace std; + +struct solver { + + using view_2d = std::extents; + + typedef std::mdspan matrix_ms_t; + + template + matrix_ms_t Cholesky_Decomposition(std::vector& vec, int n) { + std::vector lower(n * n, 0); + + auto matrix_ms = + std::mdspan(vec.data(), n, n); + auto lower_ms = + std::mdspan(lower.data(), n, n); + + // Decomposing a matrix into Lower Triangular + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; + + if (j == i) { + // summation for diagonals + for (int k = 0; k < j; k++) + sum += pow(lower_ms(j, k), 2); + lower_ms(j, j) = sqrt(matrix_ms(i, j) - sum); + } else { + // Evaluating L(i, j) using L(j, j) + for (int k = 0; k < j; k++) + sum += (lower_ms(i, k) * lower_ms(j, k)); + lower_ms(i, j) = (matrix_ms(i, j) - sum) / lower_ms(j, j); + } + } + } + return lower_ms; + } +}; + +/////////////////////////////////////////////////////////////////////////////// +int benchmark(args_params_t const& args) { + + std::uint64_t nd = args.nd; // Number of matrix dimension. + + std::vector inputMatrix = generate_pascal_matrix(nd); + + // Create the solverobject + solver solve; + // Measure execution time. + Timer timer; + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix(i, j) << "\t"; + cout << "\t"; + + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix(j, i) << "\t"; + cout << endl; + } + } + + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } + + return 0; +} + +// Driver Code for testing +int main(int argc, char* argv[]) { + + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + benchmark(args); + + return 0; +} diff --git a/apps/choleskey/choleskey_stdpar.cpp b/apps/choleskey/choleskey_stdpar.cpp new file mode 100644 index 0000000..abbe02d --- /dev/null +++ b/apps/choleskey/choleskey_stdpar.cpp @@ -0,0 +1,113 @@ +// Cholesky Decomposition: stdpar +#include "argparse/argparse.hpp" +#include "commons.hpp" + +#include +#include +#include +#include +#include +#include + +#include "matrixutil.hpp" + +using namespace std; + +struct solver { + + using view_2d = std::extents; + + template + std::vector> Cholesky_Decomposition(std::vector& vec, + int n) { + std::vector> lower(n, std::vector(n, 0)); + + auto matrix_ms = + std::mdspan(vec.data(), n, n); + + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; + + // Decomposing a matrix into Lower Triangular + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; + + if (j == i) // summation for diagonals + { + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), + lower[j].cbegin() + j, 0, std::plus{}, + [=](int val) { return val * val; }); + + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); + + } else { // Evaluating L(i, j) using L(j, j) + + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), + lower[j].cbegin() + j, lower[i].cbegin(), + 0, std::plus<>(), multiplier_lambda); + + lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; + } + } + } + return lower; + } +}; + +/////////////////////////////////////////////////////////////////////////////// +int benchmark(args_params_t const& args) { + + std::uint64_t nd = args.nd; // Number of matrix dimension. + + std::vector inputMatrix = generate_pascal_matrix(nd); + + // Create the solver object + solver solve; + // Measure execution time. + Timer timer; + + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; + + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; + } + } + + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } + + return 0; +} + +// Driver Code for testing +int main(int argc, char* argv[]) { + + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + benchmark(args); + + return 0; +} diff --git a/apps/choleskey/matrixutil.hpp b/apps/choleskey/matrixutil.hpp new file mode 100644 index 0000000..46206c3 --- /dev/null +++ b/apps/choleskey/matrixutil.hpp @@ -0,0 +1,41 @@ +#pragma once + +#include +#include + +// generate positive definition matrix +template +using Matrix = std::vector>; + +template +std::vector generate_pascal_matrix(const int n) { + Matrix matrix(n, std::vector(n, static_cast(0))); + + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + if (i == 0 || j == 0) { + matrix[i][j] = static_cast(1); + } else { + matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; + } + } + } + + std::vector flattenedVector; + for (const auto& row : matrix) { + flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); + } + return std::move(flattenedVector); +} + +// parameters define +struct args_params_t : public argparse::Args { + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(true); + std::uint64_t& nd = + kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)") + .set_default(10); + + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); +}; From 3630384363207bd76bb275e7e2fe6f8372a11cd3 Mon Sep 17 00:00:00 2001 From: Chuanqiu He Date: Tue, 26 Sep 2023 09:05:33 -0700 Subject: [PATCH 04/20] sender_choleskey_sync_wait_issue --- apps/choleskey/CMakeLists.txt | 8 ++ apps/choleskey/choleskey_stdpar_snd.cpp | 142 ++++++++++++++++++++++++ apps/choleskey/matrixutil.hpp | 2 +- 3 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 apps/choleskey/choleskey_stdpar_snd.cpp diff --git a/apps/choleskey/CMakeLists.txt b/apps/choleskey/CMakeLists.txt index bfa1a7a..a5eae85 100644 --- a/apps/choleskey/CMakeLists.txt +++ b/apps/choleskey/CMakeLists.txt @@ -7,7 +7,15 @@ target_include_directories( ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) add_executable(choleskey_stdpar choleskey_stdpar.cpp) +target_link_libraries(choleskey_stdpar stdexec) target_include_directories( choleskey_stdpar PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + +add_executable(choleskey_stdpar_snd choleskey_stdpar_snd.cpp) +target_link_libraries(choleskey_stdpar_snd stdexec) +target_include_directories( + choleskey_stdpar_snd + PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include + ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp new file mode 100644 index 0000000..36a64fd --- /dev/null +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -0,0 +1,142 @@ +// Cholesky Decomposition: stdpar-->sender +#include "argparse/argparse.hpp" +#include "commons.hpp" + +#include +#include +#include +#include +#include +#include +#include "exec/static_thread_pool.hpp" + +#include "matrixutil.hpp" +using namespace stdexec; +using stdexec::sync_wait; + +using namespace std; + +struct solver { + + using view_2d = std::extents; + + template + std::vector> Cholesky_Decomposition(std::vector& vec, + int n) { + + // test here first, scheduler from a thread pool + exec::static_thread_pool pool(n); + stdexec::scheduler 
auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + std::vector> lower(n, std::vector(n, 0)); + + auto matrix_ms = + std::mdspan(vec.data(), n, n); + + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; + + int np = 4; // default number of parallel sec, will be an option + + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; + + if (j == i) // summation for diagonals + { + auto send1 = + just(std::move(sum)) | + bulk(np, + [&](int piece) { + int start = piece * (n / 2 + 1) / np; + int size = (n / 2 + 1) / np; // partition size + int remaining = (n / 2 + 1) % np; + size += (piece == np - 1) ? remaining : 0; + + sum = std::transform_reduce( + std::execution::par, counting_iterator(start), + counting_iterator(start) + size, 0, std ::plus{}, + [=](int val) { return val * val; }); + }) | + then([&](auto sum) { return sum; }); + + //auto sum1 = sync_wait(send1).value(); + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); + + } else { // Evaluating L(i, j) using L(j, j) + + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), + lower[j].cbegin() + j, lower[i].cbegin(), + 0, std::plus<>(), multiplier_lambda); + + lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; + } + } + } + return lower; + } +}; + +/////////////////////////////////////////////////////////////////////////////// +int benchmark(args_params_t const& args) { + + std::uint64_t nd = args.nd; // Number of matrix dimension. + std::uint64_t np = args.np; // Number of partitions. + + std::vector inputMatrix = generate_pascal_matrix(nd); + + // Create the solver object + solver solve; + + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + // Measure execution time. + Timer timer; + + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; + + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; + } + } + + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } + + return 0; +} + +// Driver Code for testing +int main(int argc, char* argv[]) { + + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + benchmark(args); + + return 0; +} diff --git a/apps/choleskey/matrixutil.hpp b/apps/choleskey/matrixutil.hpp index 46206c3..44f0468 100644 --- a/apps/choleskey/matrixutil.hpp +++ b/apps/choleskey/matrixutil.hpp @@ -35,7 +35,7 @@ struct args_params_t : public argparse::Args { std::uint64_t& nd = kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)") .set_default(10); - + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); bool& help = flag("h, help", "print help"); bool& time = kwarg("t, time", "print time").set_default(true); }; From 8f99758d6d8df8f03bae1bc2d14b843fd8a16196 Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Tue, 26 Sep 2023 11:13:48 -0700 Subject: [PATCH 05/20] fix partition&iterator --- apps/choleskey/choleskey_stdpar_snd.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp index 36a64fd..5429445 100644 --- a/apps/choleskey/choleskey_stdpar_snd.cpp +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -46,23 +46,24 @@ struct solver { if (j == i) // summation for diagonals { - auto send1 = - just(std::move(sum)) | + sender auto send1 = + begin | bulk(np, [&](int piece) { - int start = piece * (n / 2 + 1) / np; - int size = (n / 2 + 1) / np; // partition size - int remaining = (n / 2 + 1) % np; + int start = piece * (j + 1) / np; + int size = (j + 1) / np; // partition size + int remaining = (j + 1) % np; size += (piece == np - 1) ? 
remaining : 0; sum = std::transform_reduce( - std::execution::par, counting_iterator(start), - counting_iterator(start) + size, 0, std ::plus{}, - [=](int val) { return val * val; }); + std::execution::par, + counting_iterator(lower[j][start]), + counting_iterator(lower[j][start]) + size, 0, + std ::plus{}, [=](int val) { return val * val; }); }) | then([&](auto sum) { return sum; }); - //auto sum1 = sync_wait(send1).value(); + auto [sum1] = sync_wait(std::move(send1)).value(); lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); } else { // Evaluating L(i, j) using L(j, j) From 7361b2859f925cb8d798bcefd06ab28cdd4c96dc Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Tue, 26 Sep 2023 14:42:16 -0700 Subject: [PATCH 06/20] last two columns has issue --- apps/choleskey/choleskey_stdpar_snd.cpp | 59 ++++++++++++++----------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp index 5429445..d60d7d8 100644 --- a/apps/choleskey/choleskey_stdpar_snd.cpp +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -1,18 +1,17 @@ // Cholesky Decomposition: stdpar-->sender -#include "argparse/argparse.hpp" -#include "commons.hpp" - #include +#include #include #include #include #include #include +#include "argparse/argparse.hpp" +#include "commons.hpp" #include "exec/static_thread_pool.hpp" #include "matrixutil.hpp" -using namespace stdexec; -using stdexec::sync_wait; +// using namespace stdexec; using namespace std; @@ -38,7 +37,7 @@ struct solver { return a * b; }; - int np = 4; // default number of parallel sec, will be an option + int np = 3; // default number of parallel sec, will be an option for (int i = 0; i < matrix_ms.extent(0); i++) { for (int j = 0; j <= i; j++) { @@ -46,28 +45,35 @@ struct solver { if (j == i) // summation for diagonals { - sender auto send1 = - begin | - bulk(np, - [&](int piece) { - int start = piece * (j + 1) / np; - int size = (j + 1) / np; // partition size - int remaining = (j + 1) % np; - size += (piece == np - 1) ? 
remaining : 0; - - sum = std::transform_reduce( - std::execution::par, - counting_iterator(lower[j][start]), - counting_iterator(lower[j][start]) + size, 0, - std ::plus{}, [=](int val) { return val * val; }); - }) | - then([&](auto sum) { return sum; }); - - auto [sum1] = sync_wait(std::move(send1)).value(); - lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); + std::vector sum_vec(np + 1); - } else { // Evaluating L(i, j) using L(j, j) + std::cout << "j = " << j << std::endl; + std::size_t const size = ((j + 1) + (np - 1)) / np; // partition size + stdexec::sender auto send1 = + stdexec::bulk( + begin, np, + [&](int piece) { + std::cout << "pcs = " << piece << std::endl; + int start = piece * size; + int end = std::min(j, (int)((piece + 1) * size)); + + sum_vec[piece] = std::transform_reduce( + std::execution::par, counting_iterator(lower[j][start]), + counting_iterator(lower[j][end]), 0, std ::plus{}, + [=](int val) { return val * val; }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), + sum_vec.end()); + }); + + auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); + + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); + + } else { // Evaluating L(i, j) using L(j, j) + // TODO sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, lower[i].cbegin(), 0, std::plus<>(), multiplier_lambda); @@ -76,6 +82,7 @@ struct solver { } } } + return lower; } }; From c8b96ddfdd272101ee3fe22351ba0a7d942238b5 Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Sat, 30 Sep 2023 14:40:12 -0700 Subject: [PATCH 07/20] choleskey_decomposition_sender_correct --- apps/choleskey/CMakeLists.txt | 1 + apps/choleskey/choleskey_stdpar_snd.cpp | 156 ++++++++++++++++-------- 2 files changed, 109 insertions(+), 48 deletions(-) diff --git a/apps/choleskey/CMakeLists.txt b/apps/choleskey/CMakeLists.txt index a5eae85..be4be0a 100644 --- a/apps/choleskey/CMakeLists.txt +++ b/apps/choleskey/CMakeLists.txt @@ -19,3 +19,4 @@ target_include_directories( choleskey_stdpar_snd PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + \ No newline at end of file diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp index d60d7d8..4fa7d79 100644 --- a/apps/choleskey/choleskey_stdpar_snd.cpp +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -1,4 +1,32 @@ -// Cholesky Decomposition: stdpar-->sender +/* + * MIT License + * + * Copyright (c) 2023 Chuanqiu He + * Copyright (c) 2023 Weile Wei + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +// +// This example provides a stdexec(senders/receivers) implementation for choleskey decomposition code. #include #include #include @@ -20,11 +48,11 @@ struct solver { using view_2d = std::extents; template - std::vector> Cholesky_Decomposition(std::vector& vec, - int n) { + std::vector> Cholesky_Decomposition(std::vector& vec, int n, + int np) { // test here first, scheduler from a thread pool - exec::static_thread_pool pool(n); + exec::static_thread_pool pool(np); stdexec::scheduler auto sch = pool.get_scheduler(); stdexec::sender auto begin = stdexec::schedule(sch); @@ -37,52 +65,88 @@ struct solver { return a * b; }; - int np = 3; // default number of parallel sec, will be an option - for (int i = 0; i < matrix_ms.extent(0); i++) { for (int j = 0; j <= i; j++) { - T sum = 0; + // avoid over parallelize + if (j == 0) { + np = 1; + } else if (j > 0 && np > j) { + np = j; + } if (j == i) // summation for diagonals { - std::vector sum_vec(np + 1); - - std::cout << "j = " << j << std::endl; - - std::size_t const size = ((j + 1) + (np - 1)) / np; // partition size - stdexec::sender auto send1 = - stdexec::bulk( - begin, np, - [&](int piece) { - std::cout << "pcs = " << piece << std::endl; - int start = piece * size; - int end = std::min(j, (int)((piece + 1) * size)); - - sum_vec[piece] = std::transform_reduce( - std::execution::par, counting_iterator(lower[j][start]), - counting_iterator(lower[j][end]), 0, std ::plus{}, - [=](int val) { return val * val; }); - }) | - stdexec::then([&sum_vec]() { - return std::reduce(std::execution::par, sum_vec.begin(), - sum_vec.end()); - }); - - auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); - - lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); - - } else { // Evaluating L(i, j) using L(j, j) - // TODO - sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), - lower[j].cbegin() + j, lower[i].cbegin(), - 0, std::plus<>(), multiplier_lambda); - - lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; + + if (i == 0 && j == 0) { + lower[j][j] = std::sqrt(matrix_ms(i, j)); + } else { + + std::vector sum_vec(np); // sub res for each piece + int size = j; // there are j elements need to be calculated(power) + + stdexec::sender auto send1 = + stdexec::bulk(begin, np, + [&](int piece) { + int start = piece * size / np; + int chunk_size = size / np; + int remaining = size % np; + chunk_size += (piece == np - 1) ? 
remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, + counting_iterator(start), + counting_iterator(start + chunk_size), 0, + std ::plus{}, [=](int val) { + return lower[j][val] * lower[j][val]; + }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), + sum_vec.end()); + }); + + auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); + + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); + } + + } else { + // Evaluating L(i, j) using L(j, j) + + if (j == 0) { + lower[i][j] = (matrix_ms(i, j)) / lower[j][j]; + } else { + + std::vector sum_vec(np); // sub res for each piece + int size_nondiag = j; + + stdexec::sender auto send2 = + stdexec::bulk( + begin, np, + [&](int piece) { + int start = piece * size_nondiag / np; + int chunk_size = size_nondiag / np; + int remaining = size_nondiag % np; + chunk_size += (piece == np - 1) ? remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, counting_iterator(start), + counting_iterator(start + chunk_size), 0, + std ::plus{}, + [=](int k) { return lower[j][k] * lower[i][k]; }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), + sum_vec.end()); + }); + + auto [sum2] = stdexec::sync_wait(std::move(send2)).value(); + + lower[i][j] = (matrix_ms(i, j) - sum2) / lower[j][j]; + } } } } - return lower; } }; @@ -91,22 +155,18 @@ struct solver { int benchmark(args_params_t const& args) { std::uint64_t nd = args.nd; // Number of matrix dimension. - std::uint64_t np = args.np; // Number of partitions. + std::uint64_t np = args.np; // Number of parallel partitions. std::vector inputMatrix = generate_pascal_matrix(nd); // Create the solver object solver solve; - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); - stdexec::sender auto begin = stdexec::schedule(sch); - // Measure execution time. 
Timer timer; // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np); // Print the final results if (args.results) { From f2b93ff6f280e7ce401d7baea5edf13603752454 Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Sat, 30 Sep 2023 15:05:30 -0700 Subject: [PATCH 08/20] format --- apps/choleskey/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/choleskey/CMakeLists.txt b/apps/choleskey/CMakeLists.txt index be4be0a..a5eae85 100644 --- a/apps/choleskey/CMakeLists.txt +++ b/apps/choleskey/CMakeLists.txt @@ -19,4 +19,3 @@ target_include_directories( choleskey_stdpar_snd PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) - \ No newline at end of file From f60a5b570095b94223e07d7481711bf3f333b38e Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Sat, 30 Sep 2023 14:56:15 -0700 Subject: [PATCH 09/20] add copyright --- apps/choleskey/choleskey_serial.cpp | 31 ++++++++++++++++++++++++++++- apps/choleskey/choleskey_stdpar.cpp | 31 ++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/apps/choleskey/choleskey_serial.cpp b/apps/choleskey/choleskey_serial.cpp index ea181d0..2da34e2 100644 --- a/apps/choleskey/choleskey_serial.cpp +++ b/apps/choleskey/choleskey_serial.cpp @@ -1,4 +1,33 @@ -// Cholesky Decomposition: mdspan +/* + * MIT License + * + * Copyright (c) 2023 Chuanqiu He + * Copyright (c) 2023 Weile Wei + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +// +// This example provides a sserial(mdspan) implementation for choleskey decomposition code. + #include #include #include diff --git a/apps/choleskey/choleskey_stdpar.cpp b/apps/choleskey/choleskey_stdpar.cpp index abbe02d..2b19b7d 100644 --- a/apps/choleskey/choleskey_stdpar.cpp +++ b/apps/choleskey/choleskey_stdpar.cpp @@ -1,4 +1,33 @@ -// Cholesky Decomposition: stdpar +/* + * MIT License + * + * Copyright (c) 2023 Chuanqiu He + * Copyright (c) 2023 Weile Wei + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +// +// This example provides a stdpar implementation for choleskey decomposition code. + #include "argparse/argparse.hpp" #include "commons.hpp" From 4f66c7ad28aa4b72309724b25340cd4501311ba4 Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Sat, 30 Sep 2023 15:00:49 -0700 Subject: [PATCH 10/20] fix typo --- apps/choleskey/choleskey_serial.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/choleskey/choleskey_serial.cpp b/apps/choleskey/choleskey_serial.cpp index 2da34e2..5c82498 100644 --- a/apps/choleskey/choleskey_serial.cpp +++ b/apps/choleskey/choleskey_serial.cpp @@ -26,7 +26,7 @@ * SOFTWARE. */ // -// This example provides a sserial(mdspan) implementation for choleskey decomposition code. +// This example provides a serial(mdspan) implementation for choleskey decomposition code. 
#include #include From 738a04de24c5f4e9d31fa13f8fe1f9b3ba5a172c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:02:15 -0700 Subject: [PATCH 11/20] adding fft app --- CMakeLists.txt | 5 +- apps/comm-study/comm-study-no-senders.cpp | 2 +- apps/fft/fft-serial.cpp | 138 +++++++++++++++++ apps/fft/fft.hpp | 172 +++++++++++++--------- 4 files changed, 245 insertions(+), 72 deletions(-) create mode 100644 apps/fft/fft-serial.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 558c8b6..2c6ee0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,7 +74,7 @@ set(GCC_EXPECTED_VERSION 11.2) if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS GCC_EXPECTED_VERSION) message( FATAL_ERROR - "GCC: GCB requires GCC v${GCC_EXPECTED_VERSION} or higher to build but found v${CMAKE_CXX_COMPILER_VERSION}" + "GCC: nvstdpar requires GCC v${GCC_EXPECTED_VERSION} or higher to build but found v${CMAKE_CXX_COMPILER_VERSION}" ) endif() @@ -84,10 +84,11 @@ endif() set(CXX_STANDARD_REQUIRED ON) # required minimum CXX standard -set(CMAKE_CXX_STANDARD_REQUIRED 20) +set(CMAKE_CXX_STANDARD_REQUIRED 23) if(NOT CXX_STANDARD OR (CXX_STANDARD LESS ${CMAKE_CXX_STANDARD_REQUIRED})) set(CXX_STANDARD ${CMAKE_CXX_STANDARD_REQUIRED}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${CXX_STANDARD}") message(STATUS "Setting CXX_STANDARD to ${CMAKE_CXX_STANDARD_REQUIRED}") endif() diff --git a/apps/comm-study/comm-study-no-senders.cpp b/apps/comm-study/comm-study-no-senders.cpp index 1377745..1550094 100644 --- a/apps/comm-study/comm-study-no-senders.cpp +++ b/apps/comm-study/comm-study-no-senders.cpp @@ -74,7 +74,7 @@ auto work(P& A, P& B, P& Y, int N) { // get sum(Y) - one last memcpy (not USM) D2H sum += - std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus()); + std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), [](T &val){return val * val;}); return sum / N; } diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp new file mode 100644 index 0000000..b174b5a --- /dev/null +++ b/apps/fft/fft-serial.cpp @@ -0,0 +1,138 @@ +/* + * MIT License + * + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/* + * commons for the fft codes + */ + +#include "fft.hpp" + +// +// simulation +// +int main(int argc, char* argv[]) +{ + // parse params + fft_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) + { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int N = args.N; + sig_type_t sig_type = args.sig; + int freq = args.freq; + bool print_sig = args.print_sig; + bool print_time = args.print_time; + + // x[n] signal + //std::vector test_sig{2,1,-1,5,0,3,0,-4}; + //N = test_sig.size(); + + Timer timer; + + sig_t x_n(N, sig_type); + + if (!isPowOf2(N)) + { + N = ceilPowOf2(N); + std::cout << "log_2(N) != integer. Padding zeros for N = " << N << std::endl; + + x_n.resize(N); + } + + sig_t y_n(x_n); + + if (print_sig) + { + std::cout << std::endl << "x[n] = "; + x_n.printSignal(); + std::cout << std::endl; + } + + // niterations + int niters = ilog2(N); + + std::function fft = [&](data_t *x, int lN, const int N) + { + int stride = N/lN; + + if (lN == 2) + { + auto x_0 = x[0] + x[1]* WNk(N, 0); + x[1] = x[0] - x[1]* WNk(N, 0); + x[0] = x_0; + return; + } + + // vectors for left and right + std::vector e(lN/2); + std::vector o(lN/2); + + // copy data into vectors + for (auto k = 0; k < lN/2; k++) + { + e[k] = x[2*k]; + o[k] = x[2*k+1]; + } + + // compute N/2 pt FFT on even + fft(e.data(), lN/2, N); + + // compute N/2 pt FFT on odd + fft(o.data(), lN/2, N); + + // combine even and odd FFTs + for (int k = 0; k < lN/2; k++) + { + x[k] = e[k] + o[k] * WNk(N, k * stride); + x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride); + } + + return; + }; + + // fft radix-2 algorithm with senders + fft(y_n.data(), N, N); + + if (print_sig) + { + std::cout << "X[k] = "; + y_n.printSignal(); + std::cout << std::endl; + } + + auto elapsed = timer.stop(); + + if (print_time) + std::cout << "Elapsed Time: " << elapsed << " ms" << std::endl; + + return 0; +} \ No newline at end of file diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index bc411d5..37f9c29 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -30,130 +30,138 @@ #pragma once -#include +#include #include +#include +#include +#include +#include +#include #include "argparse/argparse.hpp" #include "commons.hpp" +namespace ex = stdexec; using namespace std::complex_literals; // data type using Real_t = double; using data_t = std::complex; -// number of dimensions -constexpr int dims = 1; - -// 1D view -using view_1d = std::extents; - -// 2D view -using view_2d = std::extents; - -// 3D view -using view_3d = std::extents; - -enum class fft_type { fftw, cufft }; enum class sig_type { square, sinusoid, sawtooth, triangle, sinc, box }; - using sig_type_t = sig_type; +// fft radix +constexpr int radix = 2; + // parameters struct fft_params_t : public argparse::Args { sig_type_t& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(sig_type_t::box); - int& freq = kwarg("f,freq", "Signal frequency").set_default(1000); - int& len = kwarg("n,N", "N-point FFT").set_default(1<<16); - bool& print_fft = flag("p,print", "print Fourier transformed signal"); + int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); + int& N = kwarg("N", "N-point FFT").set_default(1024); + bool& print_sig = flag("p,print", "print x[n] and X(k)"); #if defined(USE_OMP) int& nthreads = kwarg("nthreads", "number of threads").set_default(1); #endif // USE_OMP bool& help = flag("h, help", "print help"); - bool& print_time = flag("t,time", "print transform time"); + bool& 
print_time = flag("t,time", "print fft time"); }; -void printSignal(data_t* sig, int N) { - std::cout << std::fixed << std::setprecision(1); +inline bool isPowOf2(long long int x) { + return !(x == 0) && !(x & (x - 1)); +} - for (int i = 0; i < N; ++i) - std::cout << sig[i] << " "; +template +void printVec(T &vec, int len) +{ + std::cout << "[ "; + for (int i = 0; i < len; i++) + std::cout << vec[i] << " "; - std::cout << std::endl; + std::cout << "]" << std::endl; +} + +inline std::complex WNk(int N, int k) +{ + return std::complex(exp(-2*M_PI*1/N*k*1i)); +} + +inline int ceilPowOf2(unsigned int v) +{ + return static_cast(std::bit_ceil(v)); +} + +inline int ilog2(uint32_t x) +{ + return static_cast(log2(x)); } class signal { public: - signal() + signal() = default; + signal(int N) { - this->N = 1e3; - t.resize(this->N); - y.resize(this->N); - dt = 1.0 / this->N; - } - - signal(int _N) - { - if (_N <= 0) + if (N <= 0) { std::cerr << "FATAL: N must be greater than 0. exiting.." << std::endl; exit(1); } - this->N = _N; - t.resize(this->N); - y.resize(this->N); - dt = 1.0 / this->N; + y.reserve(ceilPowOf2(N)); } - signal(int N, sig_type type=sig_type::box) + signal(signal &rhs) + { + y = rhs.y; + } + signal(std::vector &in) + { + y = std::move(in); + } + + signal(int N, sig_type type) { if (N <= 0) { std::cerr << "FATAL: N must be greater than 0. exiting.." << std::endl; exit(1); } - - this->N = N; - t.resize(N); - y.resize(N); - dt = 1.0 / N; - signalGenerator(N, type); + y.reserve(ceilPowOf2(N)); + signalGenerator(type); } - void signalGenerator(int N, sig_type type=sig_type::box) + void signalGenerator(sig_type type=sig_type::box) { - int interval = 1/N; - std::vector t(N); + int N = y.size(); switch (type) { case sig_type::square: - for (int i = 0; i < N; ++i) - y[i] = (i < N / 4 || i > 3 * N/4) ? 1.0 : -1.0; + for (int n = 0; n < N; ++n) + y[n] = (n < N / 4 || n > 3 * N/4) ? 1.0 : -1.0; break; case sig_type::sinusoid: - for (int i = 0; i < N; ++i) - y[i] = std::sin(2.0 * M_PI * i / N); + for (int n = 0; n < N; ++n) + y[n] = std::sin(2.0 * M_PI * n / N); break; case sig_type::sawtooth: - for (int i = 0; i < N; ++i) - y[i] = 2.0 * (i / N) - 1.0; + for (int n = 0; n < N; ++n) + y[n] = 2.0 * (n / N) - 1.0; break; case sig_type::triangle: - for (int i = 0; i < N; ++i) - y[i] = 2.0 * std::abs(2.0 * (i / N) - 1.0) - 1.0; + for (int n = 0; n < N; ++n) + y[n] = 2.0 * std::abs(2.0 * (n / N) - 1.0) - 1.0; break; case sig_type::sinc: y[0] = 1.0; - for (int i = 1; i < N; ++i) - y[i] = std::sin(2.0 * M_PI * i / N) / (2.0 * M_PI * i / N); + for (int n = 1; n < N; ++n) + y[n] = std::sin(2.0 * M_PI * n / N) / (2.0 * M_PI * n / N); break; case sig_type::box: - for (int i = 0; i < N; ++i) - y[i] = (i < N / 4 || i > 3 * N / 4) ? 1.0 : 0.0; + for (int n = 0; n < N; ++n) + y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; break; default: std::cerr << "FATAL: Unknown signal type. exiting.." 
<< std::endl; @@ -164,14 +172,40 @@ class signal ~signal() { y.clear(); - t.clear(); + } + + data_t *data() { return y.data(); } + int len() { return y.size(); } + + void resize(int N) + { + if (N != y.size()) + y.resize(N, 0); + } + + data_t &operator[](int n) + { + return y[n]; + } + + data_t &operator()(int n) + { + return y[n]; + } + + void printSignal() { + std::cout << std::fixed << std::setprecision(2); + + std::cout << "[ "; + for (auto &el : y) + std::cout << el << " "; + + std::cout << "]" << std::endl; } private: - int N; - Real_t dt; - // time axis - std::vector t; - // y(t) axis - std::vector y; -}; \ No newline at end of file + // y[n] + std::vector y; +}; + +using sig_t = signal; From 1f9d69a8d65c88d0fa6c071edee8db4f6a443a6d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:04:53 -0700 Subject: [PATCH 12/20] Removed argparse --- .gitmodules | 3 --- externals/argparse | 1 - 2 files changed, 4 deletions(-) delete mode 160000 externals/argparse diff --git a/.gitmodules b/.gitmodules index b8f3f6d..6af8544 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,6 @@ [submodule "externals/mdspan"] path = externals/mdspan url = https://github.com/kokkos/mdspan -[submodule "externals/argparse"] - path = externals/argparse - url = https://github.com/mhaseeb123/argparse [submodule "externals/magic_enum"] path = externals/magic_enum url = https://github.com/mhaseeb123/magic_enum diff --git a/externals/argparse b/externals/argparse deleted file mode 160000 index dee5935..0000000 --- a/externals/argparse +++ /dev/null @@ -1 +0,0 @@ -Subproject commit dee59359be9a2a023ceb59384c735b4e711cc18d From 25c83bdf69d00fae5f2a01285309d076684b8dd1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:07:14 -0700 Subject: [PATCH 13/20] readding new argparse --- .gitmodules | 3 +++ argparse | 1 + externals/argparse | 1 + 3 files changed, 5 insertions(+) create mode 160000 argparse create mode 160000 externals/argparse diff --git a/.gitmodules b/.gitmodules index 6af8544..e07fd62 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "externals/magic_enum"] path = externals/magic_enum url = https://github.com/mhaseeb123/magic_enum +[submodule "externals/argparse"] + path = externals/argparse + url = https://github.com/mhaseeb123/argparse diff --git a/argparse b/argparse new file mode 160000 index 0000000..9770626 --- /dev/null +++ b/argparse @@ -0,0 +1 @@ +Subproject commit 9770626123d491bc9d27851a150da20fc47fc994 diff --git a/externals/argparse b/externals/argparse new file mode 160000 index 0000000..9770626 --- /dev/null +++ b/externals/argparse @@ -0,0 +1 @@ +Subproject commit 9770626123d491bc9d27851a150da20fc47fc994 From a34c277239884c0f44ddaefde2032b8c10a86317 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:08:01 -0700 Subject: [PATCH 14/20] updating submodules --- externals/magic_enum | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/externals/magic_enum b/externals/magic_enum index b291b0c..d67973d 160000 --- a/externals/magic_enum +++ b/externals/magic_enum @@ -1 +1 @@ -Subproject commit b291b0ce5a76e808e05fc0141154e963407372da +Subproject commit d67973d1181ff986ba63c756b47cc854f4d51d32 From fd560d2f793fab8c2c9c725d0814fb167bbe130c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:09:19 -0700 Subject: [PATCH 15/20] removing stale fft file --- apps/fft/fft-serial-1d.cpp | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 
apps/fft/fft-serial-1d.cpp diff --git a/apps/fft/fft-serial-1d.cpp b/apps/fft/fft-serial-1d.cpp deleted file mode 100644 index 6c73237..0000000 --- a/apps/fft/fft-serial-1d.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2023 The Regents of the University of California, - * through Lawrence Berkeley National Laboratory (subject to receipt of any - * required approvals from the U.S. Dept. of Energy).All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * commons for the fft codes - */ - -#include "fft.hpp" - -// -// simulation -// -int main(int argc, char* argv[]) { - return 0; -} From 94813eb9836d5c16af04942e2859a8dd21151a0d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:45:18 -0700 Subject: [PATCH 16/20] minor debugging --- apps/fft/fft.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 37f9c29..80a7446 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -107,10 +107,11 @@ class signal { if (N <= 0) { - std::cerr << "FATAL: N must be greater than 0. exiting.." << std::endl; + std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; exit(1); } y.reserve(ceilPowOf2(N)); + y.resize(N); } signal(signal &rhs) @@ -126,10 +127,11 @@ class signal { if (N <= 0) { - std::cerr << "FATAL: N must be greater than 0. exiting.." << std::endl; + std::cerr << "FATAL: N must be > 0. exiting.." 
<< std::endl; exit(1); } y.reserve(ceilPowOf2(N)); + y.resize(N); signalGenerator(type); } From 62e87d33d67f416751167cef1113147394a61b46 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:45:21 -0700 Subject: [PATCH 17/20] minor bug fix --- externals/mdspan | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/externals/mdspan b/externals/mdspan index 124b860..f840358 160000 --- a/externals/mdspan +++ b/externals/mdspan @@ -1 +1 @@ -Subproject commit 124b860f458e5c06c9b96d7510dc35b7acdd642b +Subproject commit f84035865a92241a5163d8d0e5100aea037892ca From 2761f772913e40b34f1b23532f4b0c75c57cf16b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:49:34 -0700 Subject: [PATCH 18/20] clang-format --- apps/1d_stencil/stencil_cuda.cpp | 175 +++++------ apps/1d_stencil/stencil_serial.cpp | 190 ++++++----- apps/1d_stencil/stencil_snd_gpu_m.cpp | 201 ++++++------ apps/1d_stencil/stencil_snd_gpu_s.cpp | 202 ++++++------ apps/1d_stencil/stencil_stdpar.cpp | 193 ++++++------ apps/1d_stencil/stencil_stdpar_snd.cpp | 254 +++++++-------- apps/1d_stencil/stencil_stdpar_snd_iter.cpp | 213 ++++++------- apps/choleskey/choleskey_serial.cpp | 140 ++++----- apps/choleskey/choleskey_stdpar.cpp | 124 ++++---- apps/choleskey/choleskey_stdpar_snd.cpp | 269 ++++++++-------- apps/choleskey/matrixutil.hpp | 41 ++- apps/comm-study/comm-study-no-senders.cpp | 108 +++---- apps/comm-study/comm-study.cpp | 163 +++++----- apps/fft/fft-serial.cpp | 47 ++- apps/fft/fft.hpp | 215 ++++++------- apps/heat-equation/heat-equation-cuda.cpp | 294 +++++++++-------- .../heat-equation-gpu-scheduler.cpp | 244 +++++++-------- apps/heat-equation/heat-equation-mdspan.cpp | 209 ++++++------- .../heat-equation-multigpu-scheduler.cpp | 248 +++++++-------- apps/heat-equation/heat-equation-omp.cpp | 182 +++++------ .../heat-equation-stdpar-senders.cpp | 296 +++++++++--------- apps/heat-equation/heat-equation-stdpar.cpp | 212 ++++++------- apps/heat-equation/heat-equation.hpp | 61 ++-- apps/mdspan-stdpar/mdspan-stdpar.cpp | 79 +++-- include/commons.hpp | 57 ++-- include/counting_iterator.hpp | 164 +++++----- 26 files changed, 2176 insertions(+), 2405 deletions(-) diff --git a/apps/1d_stencil/stencil_cuda.cpp b/apps/1d_stencil/stencil_cuda.cpp index 2c87bac..3436893 100644 --- a/apps/1d_stencil/stencil_cuda.cpp +++ b/apps/1d_stencil/stencil_cuda.cpp @@ -7,20 +7,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", 
"Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -32,96 +28,95 @@ constexpr double dx = 1.; // grid spacing // Our operator __device__ double heat(double left, double middle, double right) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } __global__ void heat_equation(double* current, double* next, std::size_t size) { - std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; + std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) { - std::size_t left = (i == 0) ? size - 1 : i - 1; - std::size_t right = (i == size - 1) ? 0 : i + 1; - next[i] = heat(current[left], current[i], current[right]); - } + if (i < size) { + std::size_t left = (i == 0) ? size - 1 : i - 1; + std::size_t right = (i == size - 1) ? 0 : i + 1; + next[i] = heat(current[left], current[i], current[right]); + } } int benchmark(args_params_t const& args) { - // Parameters (for simplicity, some are hardcoded) - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. - std::size_t size = np * nx; - - double* h_current = nullptr; - double* h_next = nullptr; - - // Measure execution time. - Timer timer; - - // Memory allocation - if (args.results) { - h_current = new double[size]; - h_next = new double[size]; - } - - double* d_current; - double* d_next; - cudaMalloc(&d_current, size * sizeof(double)); - cudaMalloc(&d_next, size * sizeof(double)); - thrust::sequence(thrust::device, d_current, d_current + size, 0); - thrust::sequence(thrust::device, d_next, d_next + size, 0); - - // CUDA kernel execution parameters - const int threadsPerBlock = std::min(1024, (int)size); - const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock; - - // Actual time step loop - for (std::size_t t = 0; t < nt; ++t) { - heat_equation<<>>(d_current, d_next, size); - std::swap(d_current, d_next); - } - cudaDeviceSynchronize(); - auto time = timer.stop(); - - if (args.results) { - // Copy result back to host - cudaMemcpy(h_current, d_current, size * sizeof(double), - cudaMemcpyDeviceToHost); - - // Print results - for (std::size_t i = 0; i < np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j < nx; ++j) { - std::cout << h_current[i * nx + j] << " "; - } - std::cout << "}\n"; + // Parameters (for simplicity, some are hardcoded) + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + std::size_t size = np * nx; + + double* h_current = nullptr; + double* h_next = nullptr; + + // Measure execution time. 
+ Timer timer; + + // Memory allocation + if (args.results) { + h_current = new double[size]; + h_next = new double[size]; } - // Cleanup - delete[] h_current; - delete[] h_next; - } - cudaFree(d_current); - cudaFree(d_next); + double* d_current; + double* d_next; + cudaMalloc(&d_current, size * sizeof(double)); + cudaMalloc(&d_next, size * sizeof(double)); + thrust::sequence(thrust::device, d_current, d_current + size, 0); + thrust::sequence(thrust::device, d_next, d_next + size, 0); + + // CUDA kernel execution parameters + const int threadsPerBlock = std::min(1024, (int)size); + const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock; + + // Actual time step loop + for (std::size_t t = 0; t < nt; ++t) { + heat_equation<<>>(d_current, d_next, size); + std::swap(d_current, d_next); + } + cudaDeviceSynchronize(); + auto time = timer.stop(); + + if (args.results) { + // Copy result back to host + cudaMemcpy(h_current, d_current, size * sizeof(double), cudaMemcpyDeviceToHost); + + // Print results + for (std::size_t i = 0; i < np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j < nx; ++j) { + std::cout << h_current[i * nx + j] << " "; + } + std::cout << "}\n"; + } + // Cleanup + delete[] h_current; + delete[] h_next; + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + cudaFree(d_current); + cudaFree(d_next); + + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_serial.cpp b/apps/1d_stencil/stencil_serial.cpp index fce1d5d..f4a7180 100644 --- a/apps/1d_stencil/stencil_serial.cpp +++ b/apps/1d_stencil/stencil_serial.cpp @@ -32,20 +32,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x 
dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -58,107 +54,107 @@ double dx = 1.; // grid spacing /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; + // Our partition type + typedef double partition; - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; - void init_value(auto& data, std::size_t np, std::size_t nx) { - for (std::size_t i = 0; i != np * nx; ++i) { - data[i] = double(i); + void init_value(auto& data, std::size_t np, std::size_t nx) { + for (std::size_t i = 0; i != np * nx; ++i) { + data[i] = double(i); + } } - } - // Our operator - double heat(double left, double middle, double right, const double k = ::k, - const double dt = ::dt, const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; + // Our operator + double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, + const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); - - return id + dir; - } - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - auto current = space(current_ptr, size); - auto next = space(next_ptr, size); - - init_value(current, np, nx); - - // Actual time step loop - for (std::size_t t = 0; t != nt; ++t) { - for (std::size_t i = 0; i < np * nx; ++i) { - auto left = idx(i, -1, size); - auto right = idx(i, +1, size); - next[i] = heat(current[left], current[i], current[right], k, dt, dx); - } - std::swap(current, next); + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } + + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; } - return current; - } + // do all the work on 'nx' data points for 'nt' time steps + space do_work(std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + auto current = space(current_ptr, size); + auto next = space(next_ptr, size); + + init_value(current, np, nx); + + // Actual time step loop + for (std::size_t t = 0; t != nt; ++t) { + for (std::size_t i = 0; i < np * nx; ++i) { + auto left = idx(i, -1, size); + auto right = idx(i, +1, size); + next[i] = heat(current[left], current[i], current[right], k, dt, dx); + } + std::swap(current, next); + } + + return current; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. 
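For reference, the update applied by stepper::heat above is the explicit 1-D heat stencil with periodic wrap-around. A minimal standalone sketch of the same update, with the grid size, step count, and initial values chosen arbitrarily here rather than taken from the argument parser:

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main() {
    constexpr double k = 0.5, dt = 1.0, dx = 1.0;   // the app's default coefficients
    const std::size_t size = 16;                    // arbitrary grid size for this sketch
    std::vector<double> current(size), next(size);
    for (std::size_t i = 0; i < size; ++i) current[i] = static_cast<double>(i);

    for (int t = 0; t < 45; ++t) {                  // 45 steps, matching the default nt
        for (std::size_t i = 0; i < size; ++i) {
            const std::size_t left  = (i == 0) ? size - 1 : i - 1;    // periodic wrap
            const std::size_t right = (i == size - 1) ? 0 : i + 1;
            next[i] = current[i] + (k * dt / (dx * dx)) *
                                   (current[left] - 2.0 * current[i] + current[right]);
        }
        std::swap(current, next);
    }

    for (double v : current) std::cout << v << ' ';
    std::cout << '\n';
}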
- std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. - - // Create the stepper object - stepper step; - - // Measure execution time. - Timer timer; - - // Execute nt time steps on nx grid points. - auto solution = step.do_work(np, nx, nt); - auto time = timer.stop(); - - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + + // Create the stepper object + stepper step; + + // Measure execution time. + Timer timer; + + // Execute nt time steps on nx grid points. + auto solution = step.do_work(np, nx, nt); + auto time = timer.stop(); + + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_snd_gpu_m.cpp b/apps/1d_stencil/stencil_snd_gpu_m.cpp index 83d3e16..0c385d9 100644 --- a/apps/1d_stencil/stencil_snd_gpu_m.cpp +++ b/apps/1d_stencil/stencil_snd_gpu_m.cpp @@ -40,20 +40,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: 
false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -64,125 +60,118 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref< - stdexec::completion_signatures>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; + // Our partition type + typedef double partition; - // Our data for one time step - typedef thrust::device_vector space; + // Our data for one time step + typedef thrust::device_vector space; - // Our operator - double heat(double left, double middle, double right, const double k = ::k, - const double dt = ::dt, const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; + // Our operator + double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, + const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); - - return id + dir; - } - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, - std::size_t nt) { - std::size_t size = np * nx; - thrust::device_vector current_vec(size); - thrust::device_vector next_vec(size); - - auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); - auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); - - stdexec::sender auto init = - stdexec::transfer_just(sch, current_ptr, nx) | - stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { - current_ptr[i] = (double)i; - }); - stdexec::sync_wait(std::move(init)); - - for (std::size_t t = 0; t != nt; ++t) { - auto sender = stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, - dx, np, nx) | - stdexec::bulk(np * nx, [&](int i, auto current_ptr, - auto next_ptr, auto k, auto dt, - auto dx, auto np, auto nx) { - auto left = idx(i, -1, np * nx); - auto right = idx(i, +1, np * nx); - next_ptr[i] = heat(current_ptr[left], current_ptr[i], - current_ptr[right], k, dt, dx); - }); - stdexec::sync_wait(std::move(sender)); - std::swap(current_ptr, next_ptr); + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } + + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; } - if (nt % 2 == 0) { - return current_vec; + // do all the work on 'nx' data points for 'nt' time steps + space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + thrust::device_vector current_vec(size); + thrust::device_vector next_vec(size); + + auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); + auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); + + stdexec::sender auto init = + stdexec::transfer_just(sch, current_ptr, nx) | + stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { current_ptr[i] = (double)i; }); + 
stdexec::sync_wait(std::move(init)); + + for (std::size_t t = 0; t != nt; ++t) { + auto sender = stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, dx, np, nx) | + stdexec::bulk(np * nx, [&](int i, auto current_ptr, auto next_ptr, auto k, auto dt, auto dx, + auto np, auto nx) { + auto left = idx(i, -1, np * nx); + auto right = idx(i, +1, np * nx); + next_ptr[i] = heat(current_ptr[left], current_ptr[i], current_ptr[right], k, dt, dx); + }); + stdexec::sync_wait(std::move(sender)); + std::swap(current_ptr, next_ptr); + } + + if (nt % 2 == 0) { + return current_vec; + } + return next_vec; } - return next_vec; - } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. - // Create the stepper object - stepper step; + // Create the stepper object + stepper step; - nvexec::multi_gpu_stream_context stream_context{}; - stdexec::scheduler auto sch = stream_context.get_scheduler(); + nvexec::multi_gpu_stream_context stream_context{}; + stdexec::scheduler auto sch = stream_context.get_scheduler(); - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - // Execute nt time steps on nx grid points. - stepper::space solution = step.do_work(sch, np, nx, nt); + // Execute nt time steps on nx grid points. + stepper::space solution = step.do_work(sch, np, nx, nt); - auto time = timer.stop(); + auto time = timer.stop(); - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_snd_gpu_s.cpp b/apps/1d_stencil/stencil_snd_gpu_s.cpp index 58fc06c..8144c52 100644 --- a/apps/1d_stencil/stencil_snd_gpu_s.cpp +++ b/apps/1d_stencil/stencil_snd_gpu_s.cpp @@ -40,20 +40,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -64,126 +60,118 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref< - stdexec::completion_signatures>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; + // Our partition type + typedef double partition; - // Our data for one time step - typedef thrust::device_vector space; + // Our data for one time step + typedef thrust::device_vector space; - // Our operator - double heat(double left, double middle, double right, const double k = ::k, - const double dt = ::dt, const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; + // Our operator + double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, + const double dx = ::dx) { + return 
middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); - - return id + dir; - } - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, - std::size_t nt) { - std::size_t size = np * nx; - thrust::device_vector current_vec(size); - thrust::device_vector next_vec(size); - - auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); - auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); - - stdexec::sender auto init = - stdexec::transfer_just(sch, current_ptr, nx) | - stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { - current_ptr[i] = (double)i; - }); - stdexec::sync_wait(std::move(init)); - - for (std::size_t t = 0; t != nt; ++t) { - auto sender = - stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, dx, np, nx, - size) | - stdexec::bulk(np * nx, - [&](int i, auto& current_ptr, auto& next_ptr, auto k, - auto dt, auto dx, auto np, auto nx, auto size) { - std::size_t left = (i == 0) ? size - 1 : i - 1; - std::size_t right = (i == size - 1) ? 0 : i + 1; - next_ptr[i] = heat(current_ptr[left], current_ptr[i], - current_ptr[right], k, dt, dx); - }); - stdexec::sync_wait(std::move(sender)); - std::swap(current_ptr, next_ptr); + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } + + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; } - if (nt % 2 == 0) { - return current_vec; + // do all the work on 'nx' data points for 'nt' time steps + space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + thrust::device_vector current_vec(size); + thrust::device_vector next_vec(size); + + auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); + auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); + + stdexec::sender auto init = + stdexec::transfer_just(sch, current_ptr, nx) | + stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { current_ptr[i] = (double)i; }); + stdexec::sync_wait(std::move(init)); + + for (std::size_t t = 0; t != nt; ++t) { + auto sender = stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, dx, np, nx, size) | + stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto& next_ptr, auto k, auto dt, auto dx, + auto np, auto nx, auto size) { + std::size_t left = (i == 0) ? size - 1 : i - 1; + std::size_t right = (i == size - 1) ? 0 : i + 1; + next_ptr[i] = heat(current_ptr[left], current_ptr[i], current_ptr[right], k, dt, dx); + }); + stdexec::sync_wait(std::move(sender)); + std::swap(current_ptr, next_ptr); + } + + if (nt % 2 == 0) { + return current_vec; + } + return next_vec; } - return next_vec; - } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. 
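Each do_work variant above follows the same sender pipeline: stdexec::transfer_just moves the raw pointers onto the chosen scheduler, stdexec::bulk runs the stencil body once per index, and stdexec::sync_wait blocks until that step completes. A minimal sketch of the pattern on the CPU thread pool used elsewhere in this repo (the include paths follow the stdexec repository layout and may differ between versions):

#include <exec/static_thread_pool.hpp>
#include <stdexec/execution.hpp>

#include <cstdio>
#include <utility>
#include <vector>

int main() {
    exec::static_thread_pool pool(4);
    stdexec::scheduler auto sch = pool.get_scheduler();

    std::vector<double> data(8, 0.0);
    double* ptr = data.data();

    // transfer_just carries ptr onto the scheduler; bulk invokes the lambda
    // once per index with that value; sync_wait drives the pipeline to completion.
    stdexec::sender auto work =
        stdexec::transfer_just(sch, ptr) |
        stdexec::bulk(data.size(), [](auto i, double* p) { p[i] = static_cast<double>(i); });
    stdexec::sync_wait(std::move(work));

    for (double v : data) std::printf("%g ", v);
    std::printf("\n");
}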
- // Create the stepper object - stepper step; + // Create the stepper object + stepper step; - nvexec::stream_context stream_ctx{}; - stdexec::scheduler auto sch = stream_ctx.get_scheduler(); + nvexec::stream_context stream_ctx{}; + stdexec::scheduler auto sch = stream_ctx.get_scheduler(); - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - // Execute nt time steps on nx grid points. - stepper::space solution = step.do_work(sch, np, nx, nt); + // Execute nt time steps on nx grid points. + stepper::space solution = step.do_work(sch, np, nx, nt); - auto time = timer.stop(); + auto time = timer.stop(); - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_stdpar.cpp b/apps/1d_stencil/stencil_stdpar.cpp index c1e780c..e424620 100644 --- a/apps/1d_stencil/stencil_stdpar.cpp +++ b/apps/1d_stencil/stencil_stdpar.cpp @@ -34,20 +34,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = 
kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -60,105 +56,104 @@ double dx = 1.; // grid spacing /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; - - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; - - // Our operator - double heat(double left, double middle, double right, const double k = ::k, - const double dt = ::dt, const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } + // Our partition type + typedef double partition; + + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; - if (id == size - 1 && dir == +1) { - return (std::size_t)0; + // Our operator + double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, + const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } - assert(id < size); - - return id + dir; - } - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - - auto current = space(current_ptr, size); - auto next = space(next_ptr, size); - // parallel init - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=](std::size_t i) { current_ptr[i] = (double)i; }); - - // Actual time step loop - for (std::size_t t = 0; t != nt; ++t) { - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=, k = k, dt = dt, dx = dx](int32_t i) { - auto left = idx(i, -1, size); - auto right = idx(i, +1, size); - next[i] = heat(current[left], current[i], - current[right], k, dt, dx); - }); - std::swap(current, next); + + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } + + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; } - return current; - } + // do all the work on 'nx' data points for 'nt' time steps + space do_work(std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + + auto current = space(current_ptr, size); + auto next = space(next_ptr, size); + // parallel init + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=](std::size_t i) { current_ptr[i] = (double)i; }); + + // Actual time step loop + for (std::size_t t = 0; t != nt; ++t) { + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=, k = k, dt = dt, dx = dx](int32_t i) { + auto left = idx(i, -1, size); + auto right = idx(i, +1, size); + next[i] = heat(current[left], current[i], current[right], k, dt, dx); + }); + std::swap(current, next); + } + + return current; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. 
- - // Create the stepper object - stepper step; - - // Measure execution time. - Timer timer; - - // Execute nt time steps on nx grid points. - auto solution = step.do_work(np, nx, nt); - auto time = timer.stop(); - - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + + // Create the stepper object + stepper step; + + // Measure execution time. + Timer timer; + + // Execute nt time steps on nx grid points. + auto solution = step.do_work(np, nx, nt); + auto time = timer.stop(); + + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_stdpar_snd.cpp b/apps/1d_stencil/stencil_stdpar_snd.cpp index 30cfca8..6a08a63 100644 --- a/apps/1d_stencil/stencil_stdpar_snd.cpp +++ b/apps/1d_stencil/stencil_stdpar_snd.cpp @@ -37,20 +37,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print 
time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -61,144 +57,134 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref< - stdexec::completion_signatures>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; - - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; - - using any_space_sender = - any_sender_of; - - // Our operator - double heat(double left, double middle, double right, const double k = ::k, - const double dt = ::dt, const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } + // Our partition type + typedef double partition; + + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; + + using any_space_sender = + any_sender_of; - if (id == size - 1 && dir == +1) { - return (std::size_t)0; + // Our operator + double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, + const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } - assert(id < size); - - return id + dir; - } - - partition* current_ptr = nullptr; - partition* next_ptr = nullptr; - space current; - space next; - - // do all the work on 'nx' data points for 'nt' time steps - auto do_work(std::size_t np, std::size_t nx, std::size_t nt) - -> any_space_sender { - if (nt == 0) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - current = space(current_ptr, size); - next = space(next_ptr, size); - - // parallel init - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=](std::size_t i) { current_ptr[i] = (double)i; }); - - return stdexec::just(current); + + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } + + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; } - return stdexec::just(nt - 1) | - stdexec::let_value([=](std::size_t nt_updated) { - return do_work(np, nx, nt_updated); - }) | - stdexec::bulk(np, - [&, k = k, dt = dt, dx = dx, nx = nx, np = np]( - std::size_t i, auto const& current) { - std::for_each_n( - std::execution::par, counting_iterator(0), nx, - [=, next = next](std::size_t j) { - std::size_t id = i * nx + j; - auto left = idx(id, -1, np * nx); - auto right = idx(id, +1, np * nx); - next[id] = heat(current[left], current[id], - current[right], k, dt, dx); - }); - }) | - stdexec::then([&](auto current) { - // TODO: return next? 
- std::swap(current, next); - return current; - }); - } + partition* current_ptr = nullptr; + partition* next_ptr = nullptr; + space current; + space next; + + // do all the work on 'nx' data points for 'nt' time steps + auto do_work(std::size_t np, std::size_t nx, std::size_t nt) -> any_space_sender { + if (nt == 0) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + current = space(current_ptr, size); + next = space(next_ptr, size); + + // parallel init + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=](std::size_t i) { current_ptr[i] = (double)i; }); + + return stdexec::just(current); + } + + return stdexec::just(nt - 1) | + stdexec::let_value([=](std::size_t nt_updated) { return do_work(np, nx, nt_updated); }) | + stdexec::bulk(np, + [&, k = k, dt = dt, dx = dx, nx = nx, np = np](std::size_t i, auto const& current) { + std::for_each_n( + std::execution::par, counting_iterator(0), nx, [=, next = next](std::size_t j) { + std::size_t id = i * nx + j; + auto left = idx(id, -1, np * nx); + auto right = idx(id, +1, np * nx); + next[id] = heat(current[left], current[id], current[right], k, dt, dx); + }); + }) | + stdexec::then([&](auto current) { + // TODO: return next? + std::swap(current, next); + return current; + }); + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. - - // Create the stepper object - stepper step; - - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); - stdexec::sender auto begin = stdexec::schedule(sch); - - // Measure execution time. - Timer timer; - - // Execute nt time steps on nx grid points. - stdexec::sender auto sender = - begin | stdexec::then([=]() { return nt; }) | - stdexec::let_value( - [=, &step](std::uint64_t nt) { return step.do_work(np, nx, nt); }); - - auto [solution] = stdexec::sync_wait(std::move(sender)).value(); - - auto time = timer.stop(); - - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + + // Create the stepper object + stepper step; + + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + // Measure execution time. + Timer timer; + + // Execute nt time steps on nx grid points. + stdexec::sender auto sender = begin | stdexec::then([=]() { return nt; }) | + stdexec::let_value([=, &step](std::uint64_t nt) { return step.do_work(np, nx, nt); }); + + auto [solution] = stdexec::sync_wait(std::move(sender)).value(); + + auto time = timer.stop(); + + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_stdpar_snd_iter.cpp b/apps/1d_stencil/stencil_stdpar_snd_iter.cpp index 0c1280a..0cbffc9 100644 --- a/apps/1d_stencil/stencil_stdpar_snd_iter.cpp +++ b/apps/1d_stencil/stencil_stdpar_snd_iter.cpp @@ -37,20 +37,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -61,127 +57,122 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref< - stdexec::completion_signatures>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; - - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; - - // Our operator - double heat(double left, double middle, double right, const double k = ::k, - const double dt = ::dt, const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } + // Our partition type + typedef double partition; + + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; - if (id == size - 1 && dir == +1) { - return (std::size_t)0; + // Our operator 
+ double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, + const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } - assert(id < size); - - return id + dir; - } - - partition* current_ptr = nullptr; - partition* next_ptr = nullptr; - space current; - space next; - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, - std::size_t nt) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - - auto current = space(current_ptr, size); - auto next = space(next_ptr, size); - // parallel init - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=](std::size_t i) { current(i) = (double)i; }); - - // Actual time step loop - for (std::size_t t = 0; t != nt; ++t) { - auto sender = - stdexec::transfer_just(sch, current, next, k, dt, dx, np, nx) | - stdexec::bulk(np, [&](int i, auto& current, auto& next, auto k, - auto dt, auto dx, auto np, auto nx) { - std::for_each_n(std::execution::par, counting_iterator(0), nx, - [=](std::size_t j) { - std::size_t id = i * nx + j; - auto left = idx(id, -1, np * nx); - auto right = idx(id, +1, np * nx); - next(id) = heat(current(left), current(id), - current(right), k, dt, dx); - }); - }); - stdexec::sync_wait(std::move(sender)); - std::swap(current, next); + + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } + + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; } - return current; - } + partition* current_ptr = nullptr; + partition* next_ptr = nullptr; + space current; + space next; + + // do all the work on 'nx' data points for 'nt' time steps + space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + + auto current = space(current_ptr, size); + auto next = space(next_ptr, size); + // parallel init + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=](std::size_t i) { current(i) = (double)i; }); + + // Actual time step loop + for (std::size_t t = 0; t != nt; ++t) { + auto sender = + stdexec::transfer_just(sch, current, next, k, dt, dx, np, nx) | + stdexec::bulk(np, [&](int i, auto& current, auto& next, auto k, auto dt, auto dx, auto np, auto nx) { + std::for_each_n(std::execution::par, counting_iterator(0), nx, [=](std::size_t j) { + std::size_t id = i * nx + j; + auto left = idx(id, -1, np * nx); + auto right = idx(id, +1, np * nx); + next(id) = heat(current(left), current(id), current(right), k, dt, dx); + }); + }); + stdexec::sync_wait(std::move(sender)); + std::swap(current, next); + } + + return current; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. 
- // Create the stepper object - stepper step; + // Create the stepper object + stepper step; - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - stepper::space solution = step.do_work(sch, np, nx, nt); + stepper::space solution = step.do_work(sch, np, nx, nt); - auto time = timer.stop(); + auto time = timer.stop(); - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/choleskey_serial.cpp b/apps/choleskey/choleskey_serial.cpp index 5c82498..88d6824 100644 --- a/apps/choleskey/choleskey_serial.cpp +++ b/apps/choleskey/choleskey_serial.cpp @@ -39,92 +39,90 @@ using namespace std; struct solver { - using view_2d = std::extents; - - typedef std::mdspan matrix_ms_t; - - template - matrix_ms_t Cholesky_Decomposition(std::vector& vec, int n) { - std::vector lower(n * n, 0); - - auto matrix_ms = - std::mdspan(vec.data(), n, n); - auto lower_ms = - std::mdspan(lower.data(), n, n); - - // Decomposing a matrix into Lower Triangular - for (int i = 0; i < matrix_ms.extent(0); i++) { - for (int j = 0; j <= i; j++) { - T sum = 0; - - if (j == i) { - // summation for diagonals - for (int k = 0; k < j; k++) - sum += pow(lower_ms(j, k), 2); - lower_ms(j, j) = sqrt(matrix_ms(i, j) - sum); - } else { - // Evaluating L(i, j) using L(j, j) - for (int k = 0; k < j; k++) - sum += (lower_ms(i, k) * lower_ms(j, k)); - lower_ms(i, j) = (matrix_ms(i, j) - sum) / lower_ms(j, j); + using view_2d = std::extents; + + typedef std::mdspan matrix_ms_t; + + template + matrix_ms_t Cholesky_Decomposition(std::vector& vec, int n) { + std::vector lower(n * n, 0); + + auto matrix_ms = std::mdspan(vec.data(), n, n); + auto lower_ms = std::mdspan(lower.data(), n, n); + + // Decomposing a matrix into Lower Triangular + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; + + if (j == i) { + // summation for diagonals + for (int k = 0; k < j; k++) + sum += pow(lower_ms(j, k), 2); + lower_ms(j, j) = sqrt(matrix_ms(i, j) - sum); + } else { + // Evaluating L(i, j) using L(j, j) + for (int k = 0; k < j; k++) + sum += (lower_ms(i, k) * lower_ms(j, k)); + lower_ms(i, j) = (matrix_ms(i, j) - sum) / lower_ms(j, j); + } + } } - } + return lower_ms; } - return lower_ms; - } }; 
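The loop nest above implements the standard Cholesky recurrence: L(j,j) = sqrt(A(j,j) - sum_{k<j} L(j,k)^2), and for i > j, L(i,j) = (A(i,j) - sum_{k<j} L(i,k)*L(j,k)) / L(j,j). A self-contained sketch of the same recurrence on a small hand-picked symmetric positive-definite matrix (illustrative values, not the app's Pascal matrix), with a check of L*L^T against the input:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n = 3;
    // Row-major symmetric positive-definite input; its exact factor is
    // L = [[2,0,0],[6,1,0],[-8,5,3]].
    const std::vector<double> A = {  4.0,  12.0, -16.0,
                                    12.0,  37.0, -43.0,
                                   -16.0, -43.0,  98.0};
    std::vector<double> L(n * n, 0.0);

    for (int i = 0; i < n; ++i) {
        for (int j = 0; j <= i; ++j) {
            double sum = 0.0;
            for (int k = 0; k < j; ++k)
                sum += L[i * n + k] * L[j * n + k];   // equals the sum of squares when i == j
            if (i == j)
                L[j * n + j] = std::sqrt(A[j * n + j] - sum);
            else
                L[i * n + j] = (A[i * n + j] - sum) / L[j * n + j];
        }
    }

    // Verify the factorization: max |(L * L^T - A)(i,j)| should be ~0.
    double max_err = 0.0;
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j) {
            double v = 0.0;
            for (int k = 0; k < n; ++k)
                v += L[i * n + k] * L[j * n + k];
            max_err = std::max(max_err, std::fabs(v - A[i * n + j]));
        }
    std::printf("max |L*L^T - A| = %g\n", max_err);
}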
/////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t nd = args.nd; // Number of matrix dimension. - - std::vector inputMatrix = generate_pascal_matrix(nd); - - // Create the solverobject - solver solve; - // Measure execution time. - Timer timer; - // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); - - // Print the final results - if (args.results) { - // Displaying Lower Triangular and its Transpose - cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; - for (int i = 0; i < nd; i++) { - // Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix(i, j) << "\t"; - cout << "\t"; - - // Transpose of Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix(j, i) << "\t"; - cout << endl; + std::uint64_t nd = args.nd; // Number of matrix dimension. + + std::vector inputMatrix = generate_pascal_matrix(nd); + + // Create the solverobject + solver solve; + // Measure execution time. + Timer timer; + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix(i, j) << "\t"; + cout << "\t"; + + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix(j, i) << "\t"; + cout << endl; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } // Driver Code for testing int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/choleskey_stdpar.cpp b/apps/choleskey/choleskey_stdpar.cpp index 2b19b7d..33c6b87 100644 --- a/apps/choleskey/choleskey_stdpar.cpp +++ b/apps/choleskey/choleskey_stdpar.cpp @@ -44,99 +44,95 @@ using namespace std; struct solver { - using view_2d = std::extents; + using view_2d = std::extents; - template - std::vector> Cholesky_Decomposition(std::vector& vec, - int n) { - std::vector> lower(n, std::vector(n, 0)); + template + std::vector> Cholesky_Decomposition(std::vector& vec, int n) { + std::vector> lower(n, std::vector(n, 0)); - auto matrix_ms = - std::mdspan(vec.data(), n, n); + auto matrix_ms = std::mdspan(vec.data(), n, n); - auto multiplier_lambda = [=](auto a, auto b) { - return a * b; - }; + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; - // Decomposing a matrix into Lower Triangular - for (int i = 0; i < matrix_ms.extent(0); i++) { - for (int j = 0; j <= i; j++) { - T sum = 0; + // Decomposing a matrix into Lower Triangular + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; - if (j == i) // summation for diagonals - { - sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), - lower[j].cbegin() + j, 0, std::plus{}, - [=](int val) { return val * val; }); + if (j == i) // summation for diagonals + { + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, 0, + std::plus{}, [=](int val) { return val * val; }); - lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); - } else { // Evaluating L(i, j) using L(j, j) + } else { // Evaluating L(i, j) using L(j, j) - sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), - lower[j].cbegin() + j, lower[i].cbegin(), - 0, std::plus<>(), multiplier_lambda); + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, + lower[i].cbegin(), 0, std::plus<>(), multiplier_lambda); - lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; + lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; + } + } } - } + return lower; } - return lower; - } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t nd = args.nd; // Number of matrix dimension. + std::uint64_t nd = args.nd; // Number of matrix dimension. - std::vector inputMatrix = generate_pascal_matrix(nd); + std::vector inputMatrix = generate_pascal_matrix(nd); - // Create the solver object - solver solve; - // Measure execution time. - Timer timer; + // Create the solver object + solver solve; + // Measure execution time. 
+ Timer timer; - // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); - // Print the final results - if (args.results) { - // Displaying Lower Triangular and its Transpose - cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; - for (int i = 0; i < nd; i++) { - // Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[i][j] << "\t"; - cout << "\t"; + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; - // Transpose of Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[j][i] << "\t"; - cout << endl; + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } // Driver Code for testing int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp index 4fa7d79..0a02682 100644 --- a/apps/choleskey/choleskey_stdpar_snd.cpp +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -45,166 +45,157 @@ using namespace std; struct solver { - using view_2d = std::extents; - - template - std::vector> Cholesky_Decomposition(std::vector& vec, int n, - int np) { - - // test here first, scheduler from a thread pool - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); - stdexec::sender auto begin = stdexec::schedule(sch); - - std::vector> lower(n, std::vector(n, 0)); - - auto matrix_ms = - std::mdspan(vec.data(), n, n); - - auto multiplier_lambda = [=](auto a, auto b) { - return a * b; - }; - - for (int i = 0; i < matrix_ms.extent(0); i++) { - for (int j = 0; j <= i; j++) { - // avoid over parallelize - if (j == 0) { - np = 1; - } else if (j > 0 && np > j) { - np = j; - } - - if (j == i) // summation for diagonals - { - - if (i == 0 && j == 0) { - lower[j][j] = std::sqrt(matrix_ms(i, j)); - } else { - - std::vector sum_vec(np); // sub res for each piece - int size = j; // there are j elements need to be calculated(power) - - stdexec::sender auto send1 = - stdexec::bulk(begin, np, - [&](int piece) { - int start = piece * size / np; - int chunk_size = size / np; - int remaining = size % np; - chunk_size += (piece == np - 1) ? 
remaining : 0; - - sum_vec[piece] = std::transform_reduce( - std::execution::par, - counting_iterator(start), - counting_iterator(start + chunk_size), 0, - std ::plus{}, [=](int val) { - return lower[j][val] * lower[j][val]; - }); - }) | - stdexec::then([&sum_vec]() { - return std::reduce(std::execution::par, sum_vec.begin(), - sum_vec.end()); - }); - - auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); - - lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); - } - - } else { - // Evaluating L(i, j) using L(j, j) - - if (j == 0) { - lower[i][j] = (matrix_ms(i, j)) / lower[j][j]; - } else { - - std::vector sum_vec(np); // sub res for each piece - int size_nondiag = j; - - stdexec::sender auto send2 = - stdexec::bulk( - begin, np, - [&](int piece) { - int start = piece * size_nondiag / np; - int chunk_size = size_nondiag / np; - int remaining = size_nondiag % np; - chunk_size += (piece == np - 1) ? remaining : 0; - - sum_vec[piece] = std::transform_reduce( - std::execution::par, counting_iterator(start), - counting_iterator(start + chunk_size), 0, - std ::plus{}, - [=](int k) { return lower[j][k] * lower[i][k]; }); - }) | - stdexec::then([&sum_vec]() { - return std::reduce(std::execution::par, sum_vec.begin(), - sum_vec.end()); - }); - - auto [sum2] = stdexec::sync_wait(std::move(send2)).value(); - - lower[i][j] = (matrix_ms(i, j) - sum2) / lower[j][j]; - } + using view_2d = std::extents; + + template + std::vector> Cholesky_Decomposition(std::vector& vec, int n, int np) { + + // test here first, scheduler from a thread pool + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + std::vector> lower(n, std::vector(n, 0)); + + auto matrix_ms = std::mdspan(vec.data(), n, n); + + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; + + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + // avoid over parallelize + if (j == 0) { + np = 1; + } else if (j > 0 && np > j) { + np = j; + } + + if (j == i) // summation for diagonals + { + + if (i == 0 && j == 0) { + lower[j][j] = std::sqrt(matrix_ms(i, j)); + } else { + + std::vector sum_vec(np); // sub res for each piece + int size = j; // there are j elements need to be calculated(power) + + stdexec::sender auto send1 = + stdexec::bulk(begin, np, + [&](int piece) { + int start = piece * size / np; + int chunk_size = size / np; + int remaining = size % np; + chunk_size += (piece == np - 1) ? remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, counting_iterator(start), + counting_iterator(start + chunk_size), 0, std ::plus{}, + [=](int val) { return lower[j][val] * lower[j][val]; }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), sum_vec.end()); + }); + + auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); + + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); + } + + } else { + // Evaluating L(i, j) using L(j, j) + + if (j == 0) { + lower[i][j] = (matrix_ms(i, j)) / lower[j][j]; + } else { + + std::vector sum_vec(np); // sub res for each piece + int size_nondiag = j; + + stdexec::sender auto send2 = + stdexec::bulk(begin, np, + [&](int piece) { + int start = piece * size_nondiag / np; + int chunk_size = size_nondiag / np; + int remaining = size_nondiag % np; + chunk_size += (piece == np - 1) ? 
remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, counting_iterator(start), + counting_iterator(start + chunk_size), 0, std ::plus{}, + [=](int k) { return lower[j][k] * lower[i][k]; }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), sum_vec.end()); + }); + + auto [sum2] = stdexec::sync_wait(std::move(send2)).value(); + + lower[i][j] = (matrix_ms(i, j) - sum2) / lower[j][j]; + } + } + } } - } + return lower; } - return lower; - } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t nd = args.nd; // Number of matrix dimension. - std::uint64_t np = args.np; // Number of parallel partitions. + std::uint64_t nd = args.nd; // Number of matrix dimension. + std::uint64_t np = args.np; // Number of parallel partitions. - std::vector inputMatrix = generate_pascal_matrix(nd); + std::vector inputMatrix = generate_pascal_matrix(nd); - // Create the solver object - solver solve; + // Create the solver object + solver solve; - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np); + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np); - // Print the final results - if (args.results) { - // Displaying Lower Triangular and its Transpose - cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; - for (int i = 0; i < nd; i++) { - // Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[i][j] << "\t"; - cout << "\t"; + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; - // Transpose of Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[j][i] << "\t"; - cout << endl; + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } // Driver Code for testing int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/matrixutil.hpp b/apps/choleskey/matrixutil.hpp index 44f0468..8b08fb1 100644 --- a/apps/choleskey/matrixutil.hpp +++ b/apps/choleskey/matrixutil.hpp @@ -9,33 +9,30 @@ using Matrix = std::vector>; template std::vector generate_pascal_matrix(const int n) { - Matrix matrix(n, std::vector(n, static_cast(0))); + Matrix matrix(n, std::vector(n, static_cast(0))); - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - if (i == 0 || j == 0) { - matrix[i][j] = static_cast(1); - } else { - matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; - } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + if (i == 0 || j == 0) { + matrix[i][j] = static_cast(1); + } else { + matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; + } + } } - } - std::vector flattenedVector; - for (const auto& row : matrix) { - flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); - } - return std::move(flattenedVector); + std::vector flattenedVector; + for (const auto& row : matrix) { + flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); + } + return std::move(flattenedVector); } // parameters define struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(true); - std::uint64_t& nd = - kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)") - .set_default(10); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(true); + std::uint64_t& nd = kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)").set_default(10); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; diff --git a/apps/comm-study/comm-study-no-senders.cpp b/apps/comm-study/comm-study-no-senders.cpp index 1550094..87fa74b 100644 --- a/apps/comm-study/comm-study-no-senders.cpp +++ b/apps/comm-study/comm-study-no-senders.cpp @@ -37,87 +37,79 @@ using time_point_t = std::chrono::system_clock::time_point; // must take in the pointers/vectors by reference template auto work(P& A, P& B, P& Y, int N) { - // init A and B separately - will it cause an H2D copy? - std::for_each(std::execution::par_unseq, &A[0], &A[N], - [&](T& ai) { ai = cos(M_PI / 4); }); + // init A and B separately - will it cause an H2D copy? + std::for_each(std::execution::par_unseq, &A[0], &A[N], [&](T& ai) { ai = cos(M_PI / 4); }); - T sum = 0.0; + T sum = 0.0; - for (int i = 0; i < N / 3; i++) { - // read only or read-write operations - sum += A[i] / N; + for (int i = 0; i < N / 3; i++) { + // read only or read-write operations + sum += A[i] / N; - // this line if commented should not result in an H2D after this but it - // does. 
- // A[i] = sin(M_PI/4); - } + // this line if commented should not result in an H2D after this but it + // does. + // A[i] = sin(M_PI/4); + } - std::cout << std::endl; + std::cout << std::endl; - // will it cause an H2D here? - std::for_each(std::execution::par_unseq, &B[0], &B[N], - [&](T& bi) { bi = sin(M_PI / 6); }); + // will it cause an H2D here? + std::for_each(std::execution::par_unseq, &B[0], &B[N], [&](T& bi) { bi = sin(M_PI / 6); }); - // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) + // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) - std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &A[N / 2], - [&](T& ai, T& bi) { return ai + bi; }); - std::transform( - std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &Y[0], - [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); + std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &A[N / 2], + [&](T& ai, T& bi) { return ai + bi; }); + std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &Y[0], + [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); - // should trigger a D2H copy of N/5 elements - for (int i = 0; i < N / 3; i++) - sum += Y[i] / N; + // should trigger a D2H copy of N/5 elements + for (int i = 0; i < N / 3; i++) + sum += Y[i] / N; - std::cout << std::endl; + std::cout << std::endl; - // get sum(Y) - one last memcpy (not USM) D2H - sum += - std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), [](T &val){return val * val;}); + // get sum(Y) - one last memcpy (not USM) D2H + sum += std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), + [](T& val) { return val * val; }); - return sum / N; + return sum / N; } int main(int argc, char* argv[]) { - constexpr int N = 1e9; - time_point_t mark = std::chrono::system_clock::now(); - auto es = - std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - T sum = 0; + constexpr int N = 1e9; + time_point_t mark = std::chrono::system_clock::now(); + auto es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); + T sum = 0; #if 1 // 0 if only want to run with pointers - std::vector A(N); - std::vector B(N); - std::vector Y(N); + std::vector A(N); + std::vector B(N); + std::vector Y(N); - mark = std::chrono::system_clock::now(); - sum = work(A, B, Y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; + mark = std::chrono::system_clock::now(); + sum = work(A, B, Y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); + std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; #endif #if 1 // 0 if only want to run with vectors - // allocate memory - where is this allocated? - T* a = new T[N]; - T* b = new T[N]; - T* y = new T[N]; - - sum = 0; - mark = std::chrono::system_clock::now(); - sum = work(a, b, y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl - << std::endl; + // allocate memory - where is this allocated? 
+ T* a = new T[N]; + T* b = new T[N]; + T* y = new T[N]; + + sum = 0; + mark = std::chrono::system_clock::now(); + sum = work(a, b, y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); + std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl << std::endl; #endif - // do not use scientific notation - std::cout << std::fixed << "sum: " << sum << "\n"; + // do not use scientific notation + std::cout << std::fixed << "sum: " << sum << "\n"; - return 0; + return 0; } \ No newline at end of file diff --git a/apps/comm-study/comm-study.cpp b/apps/comm-study/comm-study.cpp index 7629ce0..99abcfc 100644 --- a/apps/comm-study/comm-study.cpp +++ b/apps/comm-study/comm-study.cpp @@ -37,106 +37,91 @@ using time_point_t = std::chrono::system_clock::time_point; // must take in the pointers/vectors by reference template auto work(P& A, P& B, P& Y, int N) { - T sum = 0.0; - - // init A and B separately - will it cause an H2D copy? - sender auto s1 = then(just(), - [&] { - std::for_each(std::execution::par_unseq, &A[0], &A[N], - [&](T& ai) { ai = cos(M_PI / 4); }); - }) - // trigger a D2H here - | then([&] { - for (int i = 0; i < N / 3; i++) { - // read only or read-write operations - sum += A[i] / N; - - // this line if commented should not result in an H2D - // after this but it does. - // A[i] = sin(M_PI/4); - } - std::cout << std::endl; - }); - - // will it cause an H2D here? - sender auto s2 = then(just(), [&] { - std::for_each(std::execution::par_unseq, &B[0], &B[N], - [&](T& bi) { bi = sin(M_PI / 6); }); - }); - - // will s1 and s2 execute in parallel or not? - sync_wait(when_all(std::move(s1), std::move(s2))); - - // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) - sender auto s3 = - then(just(), - [&] { - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], - &A[0], [&](T& ai, T& bi) { return ai + bi; }); - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], - &Y[0], [&](T& ai, T& bi) { - return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); - }); - }) - // should trigger a D2H copy of N/3 elements - | then([&] { - for (int i = 0; i < N / 3; i++) - sum += Y[i] / N; - - std::cout << std::endl; - }) - // get sum(Y) - wonder if there is another H2D as we only read it in the - // last step - | then([&] { - return std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, - std::plus()); - }); - - auto [val] = sync_wait(s3).value(); - - return sum += val; + T sum = 0.0; + + // init A and B separately - will it cause an H2D copy? + sender auto s1 = + then(just(), + [&] { std::for_each(std::execution::par_unseq, &A[0], &A[N], [&](T& ai) { ai = cos(M_PI / 4); }); }) + // trigger a D2H here + | then([&] { + for (int i = 0; i < N / 3; i++) { + // read only or read-write operations + sum += A[i] / N; + + // this line if commented should not result in an H2D + // after this but it does. + // A[i] = sin(M_PI/4); + } + std::cout << std::endl; + }); + + // will it cause an H2D here? + sender auto s2 = then( + just(), [&] { std::for_each(std::execution::par_unseq, &B[0], &B[N], [&](T& bi) { bi = sin(M_PI / 6); }); }); + + // will s1 and s2 execute in parallel or not? 
+ sync_wait(when_all(std::move(s1), std::move(s2))); + + // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) + sender auto s3 = then(just(), + [&] { + std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], &A[0], + [&](T& ai, T& bi) { return ai + bi; }); + std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], &Y[0], + [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); + }) + // should trigger a D2H copy of N/3 elements + | then([&] { + for (int i = 0; i < N / 3; i++) + sum += Y[i] / N; + + std::cout << std::endl; + }) + // get sum(Y) - wonder if there is another H2D as we only read it in the + // last step + | then([&] { return std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus()); }); + + auto [val] = sync_wait(s3).value(); + + return sum += val; } int main(int argc, char* argv[]) { - constexpr int N = 1e9; - time_point_t mark = std::chrono::system_clock::now(); - auto es = - std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - T sum = 0.0; + constexpr int N = 1e9; + time_point_t mark = std::chrono::system_clock::now(); + auto es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); + T sum = 0.0; #if 1 // 0 if only arrays - std::vector A(N); - std::vector B(N); - std::vector Y(N); + std::vector A(N); + std::vector B(N); + std::vector Y(N); - mark = std::chrono::system_clock::now(); - sum = work(A, B, Y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; + mark = std::chrono::system_clock::now(); + sum = work(A, B, Y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); + std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; - std::cout << fixed << "sum: " << sum << "\n"; + std::cout << fixed << "sum: " << sum << "\n"; #endif #if 1 // 0 if only vectors - // allocate memory - can we just allocate it on device only? - T* a = new T[N]; - T* b = new T[N]; - T* y = new T[N]; - - sum = 0; - mark = std::chrono::system_clock::now(); - sum = work(a, b, y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl - << std::endl; - - // do not use scientific notation - std::cout << fixed << "sum: " << sum << "\n"; + // allocate memory - can we just allocate it on device only? + T* a = new T[N]; + T* b = new T[N]; + T* y = new T[N]; + + sum = 0; + mark = std::chrono::system_clock::now(); + sum = work(a, b, y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); + std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl << std::endl; + + // do not use scientific notation + std::cout << fixed << "sum: " << sum << "\n"; #endif - return 0; + return 0; } \ No newline at end of file diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp index b174b5a..02bbd7b 100644 --- a/apps/fft/fft-serial.cpp +++ b/apps/fft/fft-serial.cpp @@ -33,14 +33,12 @@ // // simulation // -int main(int argc, char* argv[]) -{ +int main(int argc, char* argv[]) { // parse params fft_params_t args = argparse::parse(argc, argv); // see if help wanted - if (args.help) - { + if (args.help) { args.print(); // prints all variables return 0; } @@ -60,8 +58,7 @@ int main(int argc, char* argv[]) sig_t x_n(N, sig_type); - if (!isPowOf2(N)) - { + if (!isPowOf2(N)) { N = ceilPowOf2(N); std::cout << "log_2(N) != integer. 
Padding zeros for N = " << N << std::endl; @@ -70,8 +67,7 @@ int main(int argc, char* argv[]) sig_t y_n(x_n); - if (print_sig) - { + if (print_sig) { std::cout << std::endl << "x[n] = "; x_n.printSignal(); std::cout << std::endl; @@ -80,40 +76,36 @@ int main(int argc, char* argv[]) // niterations int niters = ilog2(N); - std::function fft = [&](data_t *x, int lN, const int N) - { - int stride = N/lN; + std::function fft = [&](data_t* x, int lN, const int N) { + int stride = N / lN; - if (lN == 2) - { - auto x_0 = x[0] + x[1]* WNk(N, 0); - x[1] = x[0] - x[1]* WNk(N, 0); + if (lN == 2) { + auto x_0 = x[0] + x[1] * WNk(N, 0); + x[1] = x[0] - x[1] * WNk(N, 0); x[0] = x_0; return; } // vectors for left and right - std::vector e(lN/2); - std::vector o(lN/2); + std::vector e(lN / 2); + std::vector o(lN / 2); // copy data into vectors - for (auto k = 0; k < lN/2; k++) - { - e[k] = x[2*k]; - o[k] = x[2*k+1]; + for (auto k = 0; k < lN / 2; k++) { + e[k] = x[2 * k]; + o[k] = x[2 * k + 1]; } // compute N/2 pt FFT on even - fft(e.data(), lN/2, N); + fft(e.data(), lN / 2, N); // compute N/2 pt FFT on odd - fft(o.data(), lN/2, N); + fft(o.data(), lN / 2, N); // combine even and odd FFTs - for (int k = 0; k < lN/2; k++) - { + for (int k = 0; k < lN / 2; k++) { x[k] = e[k] + o[k] * WNk(N, k * stride); - x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride); + x[k + lN / 2] = e[k] - o[k] * WNk(N, k * stride); } return; @@ -122,8 +114,7 @@ int main(int argc, char* argv[]) // fft radix-2 algorithm with senders fft(y_n.data(), N, N); - if (print_sig) - { + if (print_sig) { std::cout << "X[k] = "; y_n.printSignal(); std::cout << std::endl; diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 80a7446..56354f8 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -32,10 +32,10 @@ #include #include -#include -#include #include #include +#include +#include #include #include "argparse/argparse.hpp" @@ -56,158 +56,135 @@ constexpr int radix = 2; // parameters struct fft_params_t : public argparse::Args { - sig_type_t& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(sig_type_t::box); - int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); - int& N = kwarg("N", "N-point FFT").set_default(1024); - bool& print_sig = flag("p,print", "print x[n] and X(k)"); + sig_type_t& sig = + kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(sig_type_t::box); + int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); + int& N = kwarg("N", "N-point FFT").set_default(1024); + bool& print_sig = flag("p,print", "print x[n] and X(k)"); #if defined(USE_OMP) - int& nthreads = kwarg("nthreads", "number of threads").set_default(1); + int& nthreads = kwarg("nthreads", "number of threads").set_default(1); #endif // USE_OMP - bool& help = flag("h, help", "print help"); - bool& print_time = flag("t,time", "print fft time"); + bool& help = flag("h, help", "print help"); + bool& print_time = flag("t,time", "print fft time"); }; inline bool isPowOf2(long long int x) { - return !(x == 0) && !(x & (x - 1)); + return !(x == 0) && !(x & (x - 1)); } template -void printVec(T &vec, int len) -{ +void printVec(T& vec, int len) { std::cout << "[ "; for (int i = 0; i < len; i++) - std::cout << vec[i] << " "; + std::cout << vec[i] << " "; std::cout << "]" << std::endl; } -inline std::complex WNk(int N, int k) -{ - return std::complex(exp(-2*M_PI*1/N*k*1i)); +inline std::complex WNk(int N, int k) { + return std::complex(exp(-2 * M_PI * 1 / N * k * 1i)); } 
-inline int ceilPowOf2(unsigned int v) -{ - return static_cast(std::bit_ceil(v)); +inline int ceilPowOf2(unsigned int v) { + return static_cast(std::bit_ceil(v)); } -inline int ilog2(uint32_t x) -{ +inline int ilog2(uint32_t x) { return static_cast(log2(x)); } -class signal -{ -public: - - signal() = default; - signal(int N) - { - if (N <= 0) - { - std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; - exit(1); +class signal { + public: + signal() = default; + + signal(int N) { + if (N <= 0) { + std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; + exit(1); + } + y.reserve(ceilPowOf2(N)); + y.resize(N); } - y.reserve(ceilPowOf2(N)); - y.resize(N); - } - - signal(signal &rhs) - { - y = rhs.y; - } - signal(std::vector &in) - { - y = std::move(in); - } - - signal(int N, sig_type type) - { - if (N <= 0) - { - std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; - exit(1); + + signal(signal& rhs) { y = rhs.y; } + + signal(std::vector& in) { y = std::move(in); } + + signal(int N, sig_type type) { + if (N <= 0) { + std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; + exit(1); + } + y.reserve(ceilPowOf2(N)); + y.resize(N); + signalGenerator(type); } - y.reserve(ceilPowOf2(N)); - y.resize(N); - signalGenerator(type); - } - - void signalGenerator(sig_type type=sig_type::box) - { - int N = y.size(); - - switch (type) { - case sig_type::square: - for (int n = 0; n < N; ++n) - y[n] = (n < N / 4 || n > 3 * N/4) ? 1.0 : -1.0; - break; - case sig_type::sinusoid: - for (int n = 0; n < N; ++n) - y[n] = std::sin(2.0 * M_PI * n / N); - break; - case sig_type::sawtooth: - for (int n = 0; n < N; ++n) - y[n] = 2.0 * (n / N) - 1.0; - break; - case sig_type::triangle: - for (int n = 0; n < N; ++n) - y[n] = 2.0 * std::abs(2.0 * (n / N) - 1.0) - 1.0; - break; - case sig_type::sinc: - y[0] = 1.0; - for (int n = 1; n < N; ++n) - y[n] = std::sin(2.0 * M_PI * n / N) / (2.0 * M_PI * n / N); - break; - case sig_type::box: - for (int n = 0; n < N; ++n) - y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; - break; - default: - std::cerr << "FATAL: Unknown signal type. exiting.." << std::endl; - exit(1); + + void signalGenerator(sig_type type = sig_type::box) { + int N = y.size(); + + switch (type) { + case sig_type::square: + for (int n = 0; n < N; ++n) + y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : -1.0; + break; + case sig_type::sinusoid: + for (int n = 0; n < N; ++n) + y[n] = std::sin(2.0 * M_PI * n / N); + break; + case sig_type::sawtooth: + for (int n = 0; n < N; ++n) + y[n] = 2.0 * (n / N) - 1.0; + break; + case sig_type::triangle: + for (int n = 0; n < N; ++n) + y[n] = 2.0 * std::abs(2.0 * (n / N) - 1.0) - 1.0; + break; + case sig_type::sinc: + y[0] = 1.0; + for (int n = 1; n < N; ++n) + y[n] = std::sin(2.0 * M_PI * n / N) / (2.0 * M_PI * n / N); + break; + case sig_type::box: + for (int n = 0; n < N; ++n) + y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; + break; + default: + std::cerr << "FATAL: Unknown signal type. exiting.." 
<< std::endl; + exit(1); + } } - } - ~signal() - { - y.clear(); - } + ~signal() { y.clear(); } - data_t *data() { return y.data(); } - int len() { return y.size(); } + data_t* data() { return y.data(); } - void resize(int N) - { - if (N != y.size()) - y.resize(N, 0); - } + int len() { return y.size(); } - data_t &operator[](int n) - { - return y[n]; - } + void resize(int N) { + if (N != y.size()) + y.resize(N, 0); + } - data_t &operator()(int n) - { - return y[n]; - } + data_t& operator[](int n) { return y[n]; } - void printSignal() { - std::cout << std::fixed << std::setprecision(2); + data_t& operator()(int n) { return y[n]; } - std::cout << "[ "; - for (auto &el : y) - std::cout << el << " "; + void printSignal() { + std::cout << std::fixed << std::setprecision(2); - std::cout << "]" << std::endl; - } + std::cout << "[ "; + for (auto& el : y) + std::cout << el << " "; + + std::cout << "]" << std::endl; + } -private: - // y[n] - std::vector y; + private: + // y[n] + std::vector y; }; using sig_t = signal; diff --git a/apps/heat-equation/heat-equation-cuda.cpp b/apps/heat-equation/heat-equation-cuda.cpp index 3ea2988..b8cca1b 100644 --- a/apps/heat-equation/heat-equation-cuda.cpp +++ b/apps/heat-equation/heat-equation-cuda.cpp @@ -41,15 +41,14 @@ __constant__ Real_t dx[2]; // error checking function template -static inline void check(T result, const char* const file, const int line, - bool is_fatal = true) { - if (result != cudaSuccess) { - std::cerr << "CUDA error at " << file << ":" << line << std::endl; - std::cerr << cudaGetErrorString(result) << std::endl; - - if (is_fatal) - exit(result); - } +static inline void check(T result, const char* const file, const int line, bool is_fatal = true) { + if (result != cudaSuccess) { + std::cerr << "CUDA error at " << file << ":" << line << std::endl; + std::cerr << cudaGetErrorString(result) << std::endl; + + if (is_fatal) + exit(result); + } } // @@ -57,24 +56,24 @@ static inline void check(T result, const char* const file, const int line, // template __global__ void initialize(T* phi, int ncells, int ghost_cells) { - int ind = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int gsize = ncells * ncells; + int ind = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + int gsize = ncells * ncells; - for (; ind < gsize; ind += blockDim.x * gridDim.x) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); + for (; ind < gsize; ind += blockDim.x * gridDim.x) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); - // phi(x,y) = 1 + exp(-r^2) - phi[(i)*phi_old_extent + j] = 1 + exp(-r2); - } + // phi(x,y) = 1 + exp(-r^2) + phi[(i)*phi_old_extent + j] = 1 + exp(-r2); + } } // @@ -82,57 +81,52 @@ __global__ void initialize(T* phi, int ncells, int ghost_cells) { // template __global__ void fillBoundary(T* phi_old, int ncells, int ghost_cells) { - int pos = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int len = phi_old_extent; + int pos = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + 
int len = phi_old_extent; - for (; pos < phi_old_extent - nghosts; pos += blockDim.x * gridDim.x) { - int i = pos + ghost_cells; + for (; pos < phi_old_extent - nghosts; pos += blockDim.x * gridDim.x) { + int i = pos + ghost_cells; - // fill boundary cells in phi_old - phi_old[i] = phi_old[i + (ghost_cells * len)]; + // fill boundary cells in phi_old + phi_old[i] = phi_old[i + (ghost_cells * len)]; - phi_old[i + (len * (len - ghost_cells))] = - phi_old[i + (len * (len - ghost_cells - 1))]; + phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; - phi_old[i * len] = phi_old[(ghost_cells * len) + i]; + phi_old[i * len] = phi_old[(ghost_cells * len) + i]; - phi_old[(len - ghost_cells) + (len * i)] = - phi_old[(len - ghost_cells - 1) + (len * i)]; - } + phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; + } } // // jacobi 2d stencil kernel // template -__global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, - Real_t dt) { - int pos = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int gsize = ncells * ncells; - - for (; pos < gsize; pos += blockDim.x * gridDim.x) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new[(i - 1) * ncells + j - 1] = - phi_old[(i)*phi_old_extent + j] + - alpha * dt * - - ((phi_old[(i + 1) * phi_old_extent + j] - - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i - 1) * phi_old_extent + j]) / - (dx[0] * dx[0]) + - - (phi_old[(i)*phi_old_extent + j + 1] - - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i)*phi_old_extent + j - 1]) / - (dx[1] * dx[1])); - } +__global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, Real_t dt) { + int pos = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + int gsize = ncells * ncells; + + for (; pos < gsize; pos += blockDim.x * gridDim.x) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new[(i - 1) * ncells + j - 1] = + phi_old[(i)*phi_old_extent + j] + + alpha * dt * + + ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i - 1) * phi_old_extent + j]) / + (dx[0] * dx[0]) + + + (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i)*phi_old_extent + j - 1]) / + (dx[1] * dx[1])); + } } // @@ -140,127 +134,121 @@ __global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, // template __global__ void parallelCopy(T* phi_old, T* phi_new, int ncells) { - int pos = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int gsize = ncells * ncells; - - for (; pos < gsize; pos += blockDim.x * gridDim.x) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; - } + int pos = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + int gsize = ncells * ncells; + + for (; pos < gsize; pos += blockDim.x * gridDim.x) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; + } } // // main simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); - // see 
if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; - // init simulation time - Real_t time = 0.0; + // init simulation time + Real_t time = 0.0; - // initialize dx, dy, dz - Real_t h_dx[dims]; - for (int i = 0; i < dims; ++i) - h_dx[i] = 1.0 / (ncells - 1); + // initialize dx, dy, dz + Real_t h_dx[dims]; + for (int i = 0; i < dims; ++i) + h_dx[i] = 1.0 / (ncells - 1); - cudaErrorCheck(cudaMemcpyToSymbol(dx, h_dx, sizeof(Real_t) * dims)); + cudaErrorCheck(cudaMemcpyToSymbol(dx, h_dx, sizeof(Real_t) * dims)); - // grid size - int gsize = ncells * ncells; + // grid size + int gsize = ncells * ncells; - // host memory for printing - Real_t* h_phi = nullptr; + // host memory for printing + Real_t* h_phi = nullptr; - // simulation setup (2D) - Real_t* phi_old = nullptr; - Real_t* phi_new = nullptr; + // simulation setup (2D) + Real_t* phi_old = nullptr; + Real_t* phi_new = nullptr; - cudaErrorCheck(cudaMalloc( - &phi_old, sizeof(Real_t) * ((ncells + nghosts) * (ncells + nghosts)))); - cudaErrorCheck(cudaMalloc(&phi_new, sizeof(Real_t) * ((ncells) * (ncells)))); + cudaErrorCheck(cudaMalloc(&phi_old, sizeof(Real_t) * ((ncells + nghosts) * (ncells + nghosts)))); + cudaErrorCheck(cudaMalloc(&phi_new, sizeof(Real_t) * ((ncells) * (ncells)))); - // setup grid - int blockSize = std::min(1024, gsize); // let's do at most 1024 threads. - int nBlocks = (gsize + blockSize - 1) / blockSize; + // setup grid + int blockSize = std::min(1024, gsize); // let's do at most 1024 threads. + int nBlocks = (gsize + blockSize - 1) / blockSize; - Timer timer; + Timer timer; - // initialize grid - initialize<<>>(phi_old, ncells, ghost_cells); + // initialize grid + initialize<<>>(phi_old, ncells, ghost_cells); - cudaErrorCheck(cudaDeviceSynchronize()); + cudaErrorCheck(cudaDeviceSynchronize()); - // print initial grid if needed - if (args.print_grid) { - // copy initial grid to host - h_phi = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - cudaErrorCheck( - cudaMemcpy(h_phi, phi_old, - sizeof(Real_t) * (ncells + nghosts) * (ncells + nghosts), - cudaMemcpyDeviceToHost)); + // print initial grid if needed + if (args.print_grid) { + // copy initial grid to host + h_phi = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + cudaErrorCheck(cudaMemcpy(h_phi, phi_old, sizeof(Real_t) * (ncells + nghosts) * (ncells + nghosts), + cudaMemcpyDeviceToHost)); - printGrid(h_phi, ncells + nghosts); - } + printGrid(h_phi, ncells + nghosts); + } - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static int fBblock = - std::min(1024, ncells); // let's do at most 1024 threads. - static int fBnBlocks = - (ncells + fBblock - 1) / fBblock; // fillBoundary blocks + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static int fBblock = std::min(1024, ncells); // let's do at most 1024 threads. 
+ static int fBnBlocks = (ncells + fBblock - 1) / fBblock; // fillBoundary blocks - // fillboundary - fillBoundary<<>>(phi_old, ncells, ghost_cells); + // fillboundary + fillBoundary<<>>(phi_old, ncells, ghost_cells); - // jacobi - jacobi<<>>(phi_old, phi_new, ncells, alpha, dt); + // jacobi + jacobi<<>>(phi_old, phi_new, ncells, alpha, dt); - // parallelCopy - parallelCopy<<>>(phi_old, phi_new, ncells); + // parallelCopy + parallelCopy<<>>(phi_old, phi_new, ncells); - cudaErrorCheck(cudaDeviceSynchronize()); + cudaErrorCheck(cudaDeviceSynchronize()); - // update time - time += dt; - } + // update time + time += dt; + } - auto elapsed = timer.stop(); + auto elapsed = timer.stop(); - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - // print final grid if needed - if (args.print_grid) { - cudaErrorCheck(cudaMemcpy(h_phi, phi_new, sizeof(Real_t) * gsize, - cudaMemcpyDeviceToHost)); - printGrid(h_phi, ncells); + // print final grid if needed + if (args.print_grid) { + cudaErrorCheck(cudaMemcpy(h_phi, phi_new, sizeof(Real_t) * gsize, cudaMemcpyDeviceToHost)); + printGrid(h_phi, ncells); - // free host memory - delete[] h_phi; - h_phi = nullptr; - } + // free host memory + delete[] h_phi; + h_phi = nullptr; + } - // free device memory - cudaErrorCheck(cudaFree(phi_old)); - cudaErrorCheck(cudaFree(phi_new)); + // free device memory + cudaErrorCheck(cudaFree(phi_old)); + cudaErrorCheck(cudaFree(phi_new)); - return 0; + return 0; } diff --git a/apps/heat-equation/heat-equation-gpu-scheduler.cpp b/apps/heat-equation/heat-equation-gpu-scheduler.cpp index b294235..2b9590d 100644 --- a/apps/heat-equation/heat-equation-gpu-scheduler.cpp +++ b/apps/heat-equation/heat-equation-gpu-scheduler.cpp @@ -44,138 +44,132 @@ using namespace nvexec; // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // init simulation time - Real_t time = 0.0; - - // initialize dx, dy, dz - thrust::universal_vector dx(dims); - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - thrust::universal_vector grid_old((ncells + nghosts) * - (ncells + nghosts)); - thrust::universal_vector grid_new(ncells * ncells); - - /* Real_t *grid_old = new Real_t[(ncells+nghosts) * (ncells+nghosts)]; + // parse params + heat_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // init simulation time + Real_t time = 0.0; + + // initialize dx, dy, dz + thrust::universal_vector dx(dims); + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + thrust::universal_vector grid_old((ncells + nghosts) * (ncells + nghosts)); + thrust::universal_vector grid_new(ncells * ncells); + + /* Real_t *grid_old = new 
Real_t[(ncells+nghosts) * (ncells+nghosts)]; Real_t *grid_new = new Real_t[(ncells) * (ncells)];*/ - // initialize grid - auto phi_old = thrust::raw_pointer_cast(grid_old.data()); - auto phi_new = thrust::raw_pointer_cast(grid_new.data()); - - Timer timer; - - // scheduler from gpu - nvexec::stream_context stream_ctx{}; - auto gpu = stream_ctx.get_scheduler(); - - auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), - thrust::raw_pointer_cast(dx.data()) + dx.size()}; - auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; - auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; - auto phi_old_extent = ncells + nghosts; - - int gsize = ncells * ncells; - auto heat_eq_init = ex::transfer_just(gpu, dx_span, phi_old_span) | - ex::bulk(gsize, [=](int pos, auto ds, auto phi) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - Real_t x = pos(i, ghost_cells, ds[0]); - Real_t y = pos(j, ghost_cells, ds[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi[(i)*phi_old_extent + j] = 1 + exp(-r2); - }); - - ex::sync_wait(std::move(heat_eq_init)); - if (args.print_grid) - printGrid(phi_old, ncells + nghosts); - - auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static auto evolve = - tx | - ex::bulk(phi_old_extent - nghosts, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = pos + ghost_cells; - int len = phi_old_extent; - // fill boundary cells in old_phi - phi_old[i] = phi_old[i + (ghost_cells * len)]; - phi_old[i + (len * (len - ghost_cells))] = - phi_old[i + (len * (len - ghost_cells - 1))]; - phi_old[i * len] = phi_old[(ghost_cells * len) + i]; - phi_old[(len - ghost_cells) + (len * i)] = - phi_old[(len - ghost_cells - 1) + (len * i)]; - }) | - ex::bulk(gsize, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new[(i - 1) * ncells + j - 1] = - phi_old[(i)*phi_old_extent + j] + - alpha * dt * - ((phi_old[(i + 1) * phi_old_extent + j] - - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i - 1) * phi_old_extent + j]) / - (ds[0] * ds[0]) + - (phi_old[(i)*phi_old_extent + j + 1] - - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i)*phi_old_extent + j - 1]) / - (ds[1] * ds[1])); - }) | - ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; - }); + // initialize grid + auto phi_old = thrust::raw_pointer_cast(grid_old.data()); + auto phi_new = thrust::raw_pointer_cast(grid_new.data()); - ex::sync_wait(std::move(evolve)); + Timer timer; - // update the simulation time - time += dt; - } + // scheduler from gpu + nvexec::stream_context stream_ctx{}; + auto gpu = stream_ctx.get_scheduler(); - auto elapsed = timer.stop(); + auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), thrust::raw_pointer_cast(dx.data()) + dx.size()}; + auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; + auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; + auto phi_old_extent = ncells + nghosts; - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } + int gsize = ncells * ncells; + auto heat_eq_init = + ex::transfer_just(gpu, dx_span, phi_old_span) | ex::bulk(gsize, [=](int pos, auto 
ds, auto phi) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); - auto finalize = ex::then(ex::just(), [&]() { - if (args.print_grid) - // print the final grid - printGrid(phi_new, ncells); - }); + Real_t x = pos(i, ghost_cells, ds[0]); + Real_t y = pos(j, ghost_cells, ds[1]); - // end the simulation - ex::sync_wait(std::move(finalize)); + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); - return 0; + // phi(x,y) = 1 + exp(-r^2) + phi[(i)*phi_old_extent + j] = 1 + exp(-r2); + }); + + ex::sync_wait(std::move(heat_eq_init)); + if (args.print_grid) + printGrid(phi_old, ncells + nghosts); + + auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static auto evolve = + tx | + ex::bulk(phi_old_extent - nghosts, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = pos + ghost_cells; + int len = phi_old_extent; + // fill boundary cells in old_phi + phi_old[i] = phi_old[i + (ghost_cells * len)]; + phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; + phi_old[i * len] = phi_old[(ghost_cells * len) + i]; + phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; + }) | + ex::bulk(gsize, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new[(i - 1) * ncells + j - 1] = + phi_old[(i)*phi_old_extent + j] + + alpha * dt * + ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i - 1) * phi_old_extent + j]) / + (ds[0] * ds[0]) + + (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i)*phi_old_extent + j - 1]) / + (ds[1] * ds[1])); + }) | + ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; + }); + + ex::sync_wait(std::move(evolve)); + + // update the simulation time + time += dt; + } + + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + auto finalize = ex::then(ex::just(), [&]() { + if (args.print_grid) + // print the final grid + printGrid(phi_new, ncells); + }); + + // end the simulation + ex::sync_wait(std::move(finalize)); + + return 0; } \ No newline at end of file diff --git a/apps/heat-equation/heat-equation-mdspan.cpp b/apps/heat-equation/heat-equation-mdspan.cpp index 1ae243b..f38b9ed 100644 --- a/apps/heat-equation/heat-equation-mdspan.cpp +++ b/apps/heat-equation/heat-equation-mdspan.cpp @@ -33,128 +33,121 @@ // fill boundary cells template void fill2Dboundaries_mdspan(T* grid, int len, int ghost_cells = 1) { - auto row_view = std::mdspan(grid, len, len); - - for (auto j = ghost_cells; j < row_view.extent(1) - ghost_cells; ++j) { - row_view(0, j) = row_view(ghost_cells, j); - row_view(row_view.extent(0) - ghost_cells, j) = - row_view(row_view.extent(0) - ghost_cells - 1, j); - } - - auto col_view = - std::mdspan(grid, len, len); - - for (auto i = 1; i < col_view.extent(1) - 1; ++i) { - col_view(0, i) = col_view(ghost_cells, i); - col_view(col_view.extent(0) - 1, i) = - col_view(col_view.extent(0) - ghost_cells - 1, i); - } + auto row_view = std::mdspan(grid, len, len); + + for (auto j = ghost_cells; j < row_view.extent(1) - ghost_cells; ++j) { + row_view(0, j) = row_view(ghost_cells, j); + 
row_view(row_view.extent(0) - ghost_cells, j) = row_view(row_view.extent(0) - ghost_cells - 1, j); + } + + auto col_view = std::mdspan(grid, len, len); + + for (auto i = 1; i < col_view.extent(1) - 1; ++i) { + col_view(0, i) = col_view(ghost_cells, i); + col_view(col_view.extent(0) - 1, i) = col_view(col_view.extent(0) - ghost_cells - 1, i); + } } // // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - - auto phi_old = std::mdspan( - grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = - std::mdspan(grid_new, ncells, ncells); - - Timer timer; - - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] - for (int i = 1; i < phi_old.extent(0) - 1; ++i) { - for (int j = 1; j < phi_old.extent(1) - 1; ++j) { - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - } - } - - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); - - // init simulation time - Real_t time = 0.0; - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - // fill boundary cells in old_phi - fill2Dboundaries_mdspan(grid_old, ncells + nghosts, ghost_cells); - - // update phi_new - for (auto i = 1; i < phi_old.extent(0) - 1; i++) { - for (auto j = 1; j < phi_old.extent(1) - 1; j++) { - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + - alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / - (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / - (dx[1] * dx[1])); - } + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; } - // update the simulation time - time += dt; + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - // parallel copy phi_new to phi_old - for (auto i = 1; i < phi_old.extent(0) - 1; i++) - for (auto j = 1; j < phi_old.extent(1) - 1; j++) - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - } + auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = std::mdspan(grid_new, ncells, ncells); - auto elapsed = timer.stop(); + Timer timer; - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " 
ms" << std::endl; - } + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + for (int i = 1; i < phi_old.extent(0) - 1; ++i) { + for (int j = 1; j < phi_old.extent(1) - 1; ++j) { + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); - // delete all memory - delete[] grid_old; - delete[] grid_new; + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + } + } + + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); + + // init simulation time + Real_t time = 0.0; + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + // fill boundary cells in old_phi + fill2Dboundaries_mdspan(grid_old, ncells + nghosts, ghost_cells); + + // update phi_new + for (auto i = 1; i < phi_old.extent(0) - 1; i++) { + for (auto j = 1; j < phi_old.extent(1) - 1; j++) { + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); + } + } + + // update the simulation time + time += dt; + + // parallel copy phi_new to phi_old + for (auto i = 1; i < phi_old.extent(0) - 1; i++) + for (auto j = 1; j < phi_old.extent(1) - 1; j++) + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + } - grid_old = nullptr; - grid_new = nullptr; + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - return 0; + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); + + // delete all memory + delete[] grid_old; + delete[] grid_new; + + grid_old = nullptr; + grid_new = nullptr; + + return 0; } diff --git a/apps/heat-equation/heat-equation-multigpu-scheduler.cpp b/apps/heat-equation/heat-equation-multigpu-scheduler.cpp index efcc9e5..d8e79b3 100644 --- a/apps/heat-equation/heat-equation-multigpu-scheduler.cpp +++ b/apps/heat-equation/heat-equation-multigpu-scheduler.cpp @@ -44,135 +44,129 @@ using namespace nvexec; // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // init simulation time - Real_t time = 0.0; - - // initialize dx, dy, dz - thrust::universal_vector dx(dims); - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - thrust::universal_vector grid_old((ncells + nghosts) * - (ncells + nghosts)); - thrust::universal_vector grid_new(ncells * ncells); - - // initialize grid - auto phi_old = thrust::raw_pointer_cast(grid_old.data()); - auto phi_new = thrust::raw_pointer_cast(grid_new.data()); - - Timer timer; - - // scheduler from gpu - nvexec::multi_gpu_stream_context stream_ctx{}; - auto gpu = stream_ctx.get_scheduler(); - - auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), - thrust::raw_pointer_cast(dx.data()) + dx.size()}; - auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; - auto phi_new_span = 
std::span{phi_new, phi_new + grid_new.size()}; - auto phi_old_extent = ncells + nghosts; - - int gsize = ncells * ncells; - auto heat_eq_init = ex::transfer_just(gpu, dx_span, phi_old_span) | - ex::bulk(gsize, [=](int pos, auto ds, auto phi) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - Real_t x = pos(i, ghost_cells, ds[0]); - Real_t y = pos(j, ghost_cells, ds[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi[(i)*phi_old_extent + j] = 1 + exp(-r2); - }); - - ex::sync_wait(std::move(heat_eq_init)); - if (args.print_grid) - printGrid(phi_old, ncells + nghosts); - - auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static auto evolve = - tx | - ex::bulk(phi_old_extent - nghosts, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = pos + ghost_cells; - int len = phi_old_extent; - // fill boundary cells in old_phi - phi_old[i] = phi_old[i + (ghost_cells * len)]; - phi_old[i + (len * (len - ghost_cells))] = - phi_old[i + (len * (len - ghost_cells - 1))]; - phi_old[i * len] = phi_old[(ghost_cells * len) + i]; - phi_old[(len - ghost_cells) + (len * i)] = - phi_old[(len - ghost_cells - 1) + (len * i)]; - }) | - ex::bulk(gsize, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new[(i - 1) * ncells + j - 1] = - phi_old[(i)*phi_old_extent + j] + - alpha * dt * - ((phi_old[(i + 1) * phi_old_extent + j] - - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i - 1) * phi_old_extent + j]) / - (ds[0] * ds[0]) + - (phi_old[(i)*phi_old_extent + j + 1] - - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i)*phi_old_extent + j - 1]) / - (ds[1] * ds[1])); - }) | - ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; + // parse params + heat_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // init simulation time + Real_t time = 0.0; + + // initialize dx, dy, dz + thrust::universal_vector dx(dims); + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + thrust::universal_vector grid_old((ncells + nghosts) * (ncells + nghosts)); + thrust::universal_vector grid_new(ncells * ncells); + + // initialize grid + auto phi_old = thrust::raw_pointer_cast(grid_old.data()); + auto phi_new = thrust::raw_pointer_cast(grid_new.data()); + + Timer timer; + + // scheduler from gpu + nvexec::multi_gpu_stream_context stream_ctx{}; + auto gpu = stream_ctx.get_scheduler(); + + auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), thrust::raw_pointer_cast(dx.data()) + dx.size()}; + auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; + auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; + auto phi_old_extent = ncells + nghosts; + + int gsize = ncells * ncells; + auto heat_eq_init = + ex::transfer_just(gpu, dx_span, phi_old_span) | ex::bulk(gsize, [=](int pos, auto ds, auto phi) { + int i = 1 + (pos / 
ncells); + int j = 1 + (pos % ncells); + + Real_t x = pos(i, ghost_cells, ds[0]); + Real_t y = pos(j, ghost_cells, ds[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi[(i)*phi_old_extent + j] = 1 + exp(-r2); }); - ex::sync_wait(std::move(evolve)); - - // update the simulation time - time += dt; - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - auto finalize = ex::then(ex::just(), [&]() { + ex::sync_wait(std::move(heat_eq_init)); if (args.print_grid) - // print the final grid - printGrid(phi_new, ncells); - }); + printGrid(phi_old, ncells + nghosts); + + auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static auto evolve = + tx | + ex::bulk(phi_old_extent - nghosts, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = pos + ghost_cells; + int len = phi_old_extent; + // fill boundary cells in old_phi + phi_old[i] = phi_old[i + (ghost_cells * len)]; + phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; + phi_old[i * len] = phi_old[(ghost_cells * len) + i]; + phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; + }) | + ex::bulk(gsize, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new[(i - 1) * ncells + j - 1] = + phi_old[(i)*phi_old_extent + j] + + alpha * dt * + ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i - 1) * phi_old_extent + j]) / + (ds[0] * ds[0]) + + (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i)*phi_old_extent + j - 1]) / + (ds[1] * ds[1])); + }) | + ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; + }); + + ex::sync_wait(std::move(evolve)); + + // update the simulation time + time += dt; + } + + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + auto finalize = ex::then(ex::just(), [&]() { + if (args.print_grid) + // print the final grid + printGrid(phi_new, ncells); + }); + + // end the simulation + ex::sync_wait(std::move(finalize)); - // end the simulation - ex::sync_wait(std::move(finalize)); - - return 0; + return 0; } \ No newline at end of file diff --git a/apps/heat-equation/heat-equation-omp.cpp b/apps/heat-equation/heat-equation-omp.cpp index 6af69b0..ebf89e2 100644 --- a/apps/heat-equation/heat-equation-omp.cpp +++ b/apps/heat-equation/heat-equation-omp.cpp @@ -33,134 +33,126 @@ // fill boundary cells OpenMP template -void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, - int ghost_cells = 1) { +void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, int ghost_cells = 1) { #pragma omp parallel for num_threads(nthreads) - for (int i = ghost_cells; i < len - ghost_cells; i++) { - grid[i] = grid[i + (ghost_cells * len)]; - grid[i + (len * (len - ghost_cells))] = - grid[i + (len * (len - ghost_cells - 1))]; - - grid[i * len] = grid[(ghost_cells * len) + i]; - grid[(len - ghost_cells) + (len * i)] = - grid[(len - ghost_cells - 1) + (len * i)]; - } + for (int i = ghost_cells; i < len - 
ghost_cells; i++) { + grid[i] = grid[i + (ghost_cells * len)]; + grid[i + (len * (len - ghost_cells))] = grid[i + (len * (len - ghost_cells - 1))]; + + grid[i * len] = grid[(ghost_cells * len) + i]; + grid[(len - ghost_cells) + (len * i)] = grid[(len - ghost_cells - 1) + (len * i)]; + } } // // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - int nthreads = args.nthreads; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + int nthreads = args.nthreads; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - auto phi_old = std::mdspan( - grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = - std::mdspan(grid_new, ncells, ncells); + auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = std::mdspan(grid_new, ncells, ncells); - int gsize = ncells * ncells; + int gsize = ncells * ncells; - Timer timer; + Timer timer; - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] #pragma omp parallel for num_threads(nthreads) - for (int pos = 0; pos < gsize; pos++) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); + for (int pos = 0; pos < gsize; pos++) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - } + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + } - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); - // init simulation time - Real_t time = 0.0; + // init simulation time + Real_t time = 0.0; - // evolve the system - for (auto step = 0; step < nsteps; step++) { - // fill boundary cells in old_phi - fill2Dboundaries_omp(grid_old, ncells + nghosts, ghost_cells, nthreads); + // evolve the system + for (auto step = 0; step < nsteps; step++) { + // fill boundary cells in old_phi + 
fill2Dboundaries_omp(grid_old, ncells + nghosts, ghost_cells, nthreads); #pragma omp parallel for num_threads(nthreads) - for (int pos = 0; pos < gsize; pos++) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + - alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / - (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / - (dx[1] * dx[1])); - } + for (int pos = 0; pos < gsize; pos++) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); + } - // update the simulation time - time += dt; + // update the simulation time + time += dt; - // parallel copy phi_new to phi_old + // parallel copy phi_new to phi_old #pragma omp parallel for num_threads(nthreads) - for (int pos = 0; pos < gsize; pos++) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); + for (int pos = 0; pos < gsize; pos++) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + } } - } - auto elapsed = timer.stop(); + auto elapsed = timer.stop(); - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); - // delete all memory - delete[] grid_old; - delete[] grid_new; + // delete all memory + delete[] grid_old; + delete[] grid_new; - grid_old = nullptr; - grid_new = nullptr; + grid_old = nullptr; + grid_new = nullptr; - return 0; + return 0; } diff --git a/apps/heat-equation/heat-equation-stdpar-senders.cpp b/apps/heat-equation/heat-equation-stdpar-senders.cpp index f83b113..5209f37 100644 --- a/apps/heat-equation/heat-equation-stdpar-senders.cpp +++ b/apps/heat-equation/heat-equation-stdpar-senders.cpp @@ -45,166 +45,156 @@ using stdexec::sync_wait; // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - // number of parallel tiles - int ntiles = args.ntiles; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // init simulation time - Real_t time = 0.0; - - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - - auto phi_old = std::mdspan( - grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = - std::mdspan(grid_new, ncells, ncells); - - Timer timer; - - // scheduler from a thread pool - exec::static_thread_pool ctx{ntiles}; - - scheduler auto sch = ctx.get_scheduler(); - sender auto begin = schedule(sch); - - 
// initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] - sender auto heat_eq_init = - bulk(begin, ntiles, - [&](int tile) { - int start = tile * (ncells * ncells) / ntiles; - int size = (ncells * ncells) / ntiles; - int remaining = (ncells * ncells) % ntiles; - size += (tile == ntiles - 1) ? remaining : 0; - - std::for_each_n(std::execution::par_unseq, - counting_iterator(start), size, [=](int pos) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - }); - }) | - then([&]() { - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); - }); - - // start the simulation - sync_wait(std::move(heat_eq_init)); - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static sender auto evolve = - then(begin, - [&]() { - // fill boundary cells in old_phi - fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); - }) | - bulk(ntiles, + // parse params + heat_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + // number of parallel tiles + int ntiles = args.ntiles; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // init simulation time + Real_t time = 0.0; + + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + + auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = std::mdspan(grid_new, ncells, ncells); + + Timer timer; + + // scheduler from a thread pool + exec::static_thread_pool ctx{ntiles}; + + scheduler auto sch = ctx.get_scheduler(); + sender auto begin = schedule(sch); + + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + sender auto heat_eq_init = + bulk(begin, ntiles, [&](int tile) { - int start = tile * (ncells * ncells) / ntiles; - int size = (ncells * ncells) / ntiles; - int remaining = (ncells * ncells) % ntiles; - size += (tile == ntiles - 1) ? remaining : 0; - - // update phi_new with stencil - std::for_each_n( - std::execution::par_unseq, counting_iterator(start), size, - [=](int pos) { + int start = tile * (ncells * ncells) / ntiles; + int size = (ncells * ncells) / ntiles; + int remaining = (ncells * ncells) % ntiles; + size += (tile == ntiles - 1) ? remaining : 0; + + std::for_each_n(std::execution::par_unseq, counting_iterator(start), size, [=](int pos) { int i = 1 + (pos / ncells); int j = 1 + (pos % ncells); - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + - alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + - phi_old(i - 1, j)) / - (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + - phi_old(i, j - 1)) / - (dx[1] * dx[1])); - }); - }) | - bulk(ntiles, - [&](int tile) { - int start = tile * (ncells * ncells) / ntiles; - int size = (ncells * ncells) / ntiles; - int remaining = (ncells * ncells) % ntiles; - size += (tile == ntiles - 1) ? 
remaining : 0; - - // parallel copy phi_new to phi_old - std::for_each_n(std::execution::par_unseq, - counting_iterator(start), size, [=](int pos) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - }); + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + }); }) | then([&]() { - // update the simulation time - time += dt; + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); }); - sync_wait(std::move(evolve)); - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - sender auto finalize = then(just(), - [&]() { - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); - }) | - then([&]() { - // delete all memory - delete[] grid_old; - delete[] grid_new; + // start the simulation + sync_wait(std::move(heat_eq_init)); + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static sender auto evolve = + then(begin, + [&]() { + // fill boundary cells in old_phi + fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); + }) | + bulk(ntiles, + [&](int tile) { + int start = tile * (ncells * ncells) / ntiles; + int size = (ncells * ncells) / ntiles; + int remaining = (ncells * ncells) % ntiles; + size += (tile == ntiles - 1) ? remaining : 0; + + // update phi_new with stencil + std::for_each_n(std::execution::par_unseq, counting_iterator(start), size, [=](int pos) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); + }); + }) | + bulk(ntiles, + [&](int tile) { + int start = tile * (ncells * ncells) / ntiles; + int size = (ncells * ncells) / ntiles; + int remaining = (ncells * ncells) % ntiles; + size += (tile == ntiles - 1) ? 
remaining : 0; + + // parallel copy phi_new to phi_old + std::for_each_n(std::execution::par_unseq, counting_iterator(start), size, [=](int pos) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + }); + }) | + then([&]() { + // update the simulation time + time += dt; + }); + + sync_wait(std::move(evolve)); + } + + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + sender auto finalize = then(just(), + [&]() { + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); + }) | + then([&]() { + // delete all memory + delete[] grid_old; + delete[] grid_new; + + grid_old = nullptr; + grid_new = nullptr; + }); + + // start the simulation + sync_wait(std::move(finalize)); - grid_old = nullptr; - grid_new = nullptr; - }); - - // start the simulation - sync_wait(std::move(finalize)); - - return 0; + return 0; } diff --git a/apps/heat-equation/heat-equation-stdpar.cpp b/apps/heat-equation/heat-equation-stdpar.cpp index b20fb68..164c482 100644 --- a/apps/heat-equation/heat-equation-stdpar.cpp +++ b/apps/heat-equation/heat-equation-stdpar.cpp @@ -34,117 +34,107 @@ // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + + auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = std::mdspan(grid_new, ncells, ncells); + + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + + Timer timer; + + std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); + + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + }); + + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); + + // init simulation time + Real_t time = 0.0; + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + // fill boundary cells in old_phi + fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); + + // update phi_new with stencil + std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); + + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); + }); + + // update the simulation time + time += dt; + + // parallel copy phi_new to phi_old + 
std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); + + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + }); + } + + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); + + // delete all memory + delete[] grid_old; + delete[] grid_new; + + grid_old = nullptr; + grid_new = nullptr; - // see if help wanted - if (args.help) { - args.print(); // prints all variables return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - - auto phi_old = std::mdspan( - grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = - std::mdspan(grid_new, ncells, ncells); - - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] - - Timer timer; - - std::for_each_n(std::execution::par_unseq, counting_iterator(0), - ncells * ncells, [=](int ind) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); - - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - }); - - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); - - // init simulation time - Real_t time = 0.0; - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - // fill boundary cells in old_phi - fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); - - // update phi_new with stencil - std::for_each_n(std::execution::par_unseq, counting_iterator(0), - ncells * ncells, [=](int ind) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); - - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + - alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + - phi_old(i - 1, j)) / - (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + - phi_old(i, j - 1)) / - (dx[1] * dx[1])); - }); - - // update the simulation time - time += dt; - - // parallel copy phi_new to phi_old - std::for_each_n(std::execution::par_unseq, counting_iterator(0), - ncells * ncells, [=](int ind) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); - - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - }); - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); - - // delete all memory - delete[] grid_old; - delete[] grid_new; - - grid_old = nullptr; - grid_new = nullptr; - - return 0; } diff --git a/apps/heat-equation/heat-equation.hpp b/apps/heat-equation/heat-equation.hpp index a226a45..94bf8b6 100644 --- a/apps/heat-equation/heat-equation.hpp +++ b/apps/heat-equation/heat-equation.hpp @@ -49,62 +49,57 @@ constexpr int nghosts = ghost_cells * dims; using view_2d = 
std::extents; // 3D view -using view_3d = std::extents; +using view_3d = std::extents; // macros to get x and y positions from indices #define pos(i, ghosts, dx) -0.5 + dx*(i - ghosts) // parameters struct heat_params_t : public argparse::Args { - int& ncells = kwarg("n,ncells", "number of cells on each side of the domain") - .set_default(32); - int& nsteps = kwarg("s,nsteps", "total steps in simulation").set_default(100); + int& ncells = kwarg("n,ncells", "number of cells on each side of the domain").set_default(32); + int& nsteps = kwarg("s,nsteps", "total steps in simulation").set_default(100); #if defined(HEQ_OMP) - int& nthreads = kwarg("nthreads", "number of threads").set_default(1); + int& nthreads = kwarg("nthreads", "number of threads").set_default(1); #endif // HEQ_OMP - Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f); - Real_t& dt = kwarg("t,dt", "time step").set_default(5.0e-5f); - bool& help = flag("h, help", "print help"); - bool& print_grid = flag("p,print", "print grids at step 0 and step n"); - bool& print_time = flag("time", "print simulation time"); + Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f); + Real_t& dt = kwarg("t,dt", "time step").set_default(5.0e-5f); + bool& help = flag("h, help", "print help"); + bool& print_grid = flag("p,print", "print grids at step 0 and step n"); + bool& print_time = flag("time", "print simulation time"); #if defined(TILING) - int& ntiles = kwarg("ntiles", "number of parallel tiles").set_default(4); + int& ntiles = kwarg("ntiles", "number of parallel tiles").set_default(4); #endif // TILING \ // future use if needed \ // int &max_grid_size = kwarg("g, max_grid_size", "size of each box (or - // grid)").set_default(32); bool &verbose = kwarg("v, verbose", "verbose - // mode").set_default(false); int &plot_int = kwarg("p, plot_int", "how often - // to write a plotfile").set_default(-1); + // grid)").set_default(32); bool &verbose = kwarg("v, verbose", "verbose + // mode").set_default(false); int &plot_int = kwarg("p, plot_int", "how often + // to write a plotfile").set_default(-1); }; template void printGrid(T* grid, int len) { - auto view = std::mdspan(grid, len, len); - std::cout << "Grid: " << std::endl; - std::cout << std::fixed << std::showpoint; - std::cout << std::setprecision(2); + auto view = std::mdspan(grid, len, len); + std::cout << "Grid: " << std::endl; + std::cout << std::fixed << std::showpoint; + std::cout << std::setprecision(2); - for (auto j = 0; j < view.extent(1); ++j) { - for (auto i = 0; i < view.extent(0); ++i) { - std::cout << view(i, j) << ", "; + for (auto j = 0; j < view.extent(1); ++j) { + for (auto i = 0; i < view.extent(0); ++i) { + std::cout << view(i, j) << ", "; + } + std::cout << std::endl; } std::cout << std::endl; - } - std::cout << std::endl; } // fill boundary cells template void fill2Dboundaries(T* grid, int len, int ghost_cells = 1) { - std::for_each_n(std::execution::par_unseq, counting_iterator(ghost_cells), - len - nghosts, [=](auto i) { - grid[i] = grid[i + (ghost_cells * len)]; - grid[i + (len * (len - ghost_cells))] = - grid[i + (len * (len - ghost_cells - 1))]; + std::for_each_n(std::execution::par_unseq, counting_iterator(ghost_cells), len - nghosts, [=](auto i) { + grid[i] = grid[i + (ghost_cells * len)]; + grid[i + (len * (len - ghost_cells))] = grid[i + (len * (len - ghost_cells - 1))]; - grid[i * len] = grid[(ghost_cells * len) + i]; - grid[(len - ghost_cells) + (len * i)] = - grid[(len - ghost_cells - 1) + (len * i)]; - }); + grid[i * 
len] = grid[(ghost_cells * len) + i]; + grid[(len - ghost_cells) + (len * i)] = grid[(len - ghost_cells - 1) + (len * i)]; + }); } \ No newline at end of file diff --git a/apps/mdspan-stdpar/mdspan-stdpar.cpp b/apps/mdspan-stdpar/mdspan-stdpar.cpp index 9c92a87..92dbdb9 100644 --- a/apps/mdspan-stdpar/mdspan-stdpar.cpp +++ b/apps/mdspan-stdpar/mdspan-stdpar.cpp @@ -30,58 +30,51 @@ using data_type = int; // 2D view -using extents_type = - std::extents; +using extents_type = std::extents; // 3D view (fix the first dimension to 2) -using extents_type2 = - std::extents; +using extents_type2 = std::extents; int main() { - constexpr int N = 1e9; - std::vector v(N); + constexpr int N = 1e9; + std::vector v(N); - // View data as contiguous memory representing 2 rows of 6 ints each - auto ms2 = std::mdspan(v.data(), - N / 2, 2); - // View the same data as a 3D array 2 (fixed above) x 3 x 2 - auto ms3 = std::mdspan(v.data(), - N / 4, 2); + // View data as contiguous memory representing 2 rows of 6 ints each + auto ms2 = std::mdspan(v.data(), N / 2, 2); + // View the same data as a 3D array 2 (fixed above) x 3 x 2 + auto ms3 = std::mdspan(v.data(), N / 4, 2); - // auto dim2 = [=](int i){int i1 = i/ms2.extent(1); int i2 = i%ms2.extent(1); - // return std::make_tuple(i1, i2);}; auto dim3 = [=](int i){int i1 = - // i/(ms3.extent(1)*ms3.extent(2)); int i2 = (i/ms3.extent(2))%ms3.extent(1); - // int i3 = i%ms3.extent(2); return std::make_tuple(i1, i2, i3);}; + // auto dim2 = [=](int i){int i1 = i/ms2.extent(1); int i2 = i%ms2.extent(1); + // return std::make_tuple(i1, i2);}; auto dim3 = [=](int i){int i1 = + // i/(ms3.extent(1)*ms3.extent(2)); int i2 = (i/ms3.extent(2))%ms3.extent(1); + // int i3 = i%ms3.extent(2); return std::make_tuple(i1, i2, i3);}; - std::for_each(std::execution::par_unseq, ms2.data_handle(), - ms2.data_handle() + ms2.size(), [=](int& i) { - auto global_idx = std::distance(ms2.data_handle(), &i); - dim2(global_idx, ms2); - // auto [i1, i2] = dim2(global_idx); - ms2(ii, ij) = global_idx; - }); + std::for_each(std::execution::par_unseq, ms2.data_handle(), ms2.data_handle() + ms2.size(), [=](int& i) { + auto global_idx = std::distance(ms2.data_handle(), &i); + dim2(global_idx, ms2); + // auto [i1, i2] = dim2(global_idx); + ms2(ii, ij) = global_idx; + }); - std::cout << std::endl << std::endl; + std::cout << std::endl << std::endl; - std::for_each(std::execution::par_unseq, ms2.data_handle(), - ms2.data_handle() + ms2.size(), [=](int& i) { - auto global_idx = std::distance(ms2.data_handle(), &i); - dim3(global_idx, ms3); - // auto [i1, i2, i3] = dim3(global_idx); - ms3(ii, ij, ik) = 1000 + global_idx; - }); + std::for_each(std::execution::par_unseq, ms2.data_handle(), ms2.data_handle() + ms2.size(), [=](int& i) { + auto global_idx = std::distance(ms2.data_handle(), &i); + dim3(global_idx, ms3); + // auto [i1, i2, i3] = dim3(global_idx); + ms3(ii, ij, ik) = 1000 + global_idx; + }); - // read subset of data using 3D view - for (size_t i = 0; i < ms3.extent(0); i++) { - for (size_t j = 0; j < 10; j++) { - for (size_t k = 0; k < ms3.extent(2); k++) { - assert(ms3(i, j, k) == 1000 + i * ms3.extent(1) * ms3.extent(2) + - j * ms3.extent(2) + k); - std::cout << ms3(i, j, k) << " "; - } - std::cout << std::endl; + // read subset of data using 3D view + for (size_t i = 0; i < ms3.extent(0); i++) { + for (size_t j = 0; j < 10; j++) { + for (size_t k = 0; k < ms3.extent(2); k++) { + assert(ms3(i, j, k) == 1000 + i * ms3.extent(1) * ms3.extent(2) + j * ms3.extent(2) + k); + std::cout << ms3(i, j, k) 
<< " "; + } + std::cout << std::endl; + } + std::cout << std::endl; } - std::cout << std::endl; - } - std::cout << ms3(0, 0, 1) << "\n"; + std::cout << ms3(0, 0, 1) << "\n"; } \ No newline at end of file diff --git a/include/commons.hpp b/include/commons.hpp index c043a20..cfacfa1 100644 --- a/include/commons.hpp +++ b/include/commons.hpp @@ -48,43 +48,38 @@ #include "counting_iterator.hpp" // get mdpsan 2d indices from 1d index -#define dim2(x, ms) \ - int ii = x / ms.extent(1); \ - int ij = x % ms.extent(1); +#define dim2(x, ms) \ + int ii = x / ms.extent(1); \ + int ij = x % ms.extent(1); // get mdspan 3d indices from 1d index -#define dim3(x, ms) \ - int ii = x / (ms3.extent(1) * ms.extent(2)); \ - int ij = (x / ms.extent(2)) % ms.extent(1); \ - int ik = x % ms.extent(2) +#define dim3(x, ms) \ + int ii = x / (ms3.extent(1) * ms.extent(2)); \ + int ij = (x / ms.extent(2)) % ms.extent(1); \ + int ik = x % ms.extent(2) class Timer { - public: - Timer() { start(); } + public: + Timer() { start(); } - ~Timer() { stop(); } + ~Timer() { stop(); } - void start() { start_time_point = std::chrono::high_resolution_clock::now(); } + void start() { start_time_point = std::chrono::high_resolution_clock::now(); } - double stop() { - end_time_point = std::chrono::high_resolution_clock::now(); - return duration(); - } + double stop() { + end_time_point = std::chrono::high_resolution_clock::now(); + return duration(); + } - double duration() { - auto start = std::chrono::time_point_cast( - start_time_point) - .time_since_epoch() - .count(); - auto end = - std::chrono::time_point_cast(end_time_point) - .time_since_epoch() - .count(); - auto duration = end - start; - double ms = duration * 0.001; - return ms; - } + double duration() { + auto start = + std::chrono::time_point_cast(start_time_point).time_since_epoch().count(); + auto end = std::chrono::time_point_cast(end_time_point).time_since_epoch().count(); + auto duration = end - start; + double ms = duration * 0.001; + return ms; + } - private: - std::chrono::time_point start_time_point; - std::chrono::time_point end_time_point; + private: + std::chrono::time_point start_time_point; + std::chrono::time_point end_time_point; }; diff --git a/include/counting_iterator.hpp b/include/counting_iterator.hpp index aae6a85..09d0fa2 100644 --- a/include/counting_iterator.hpp +++ b/include/counting_iterator.hpp @@ -36,96 +36,76 @@ using Index_t = int32_t; struct counting_iterator { - private: - using self = counting_iterator; - - public: - using value_type = Index_t; - using difference_type = typename std::make_signed::type; - using pointer = Index_t*; - using reference = Index_t&; - using iterator_category = std::random_access_iterator_tag; - - counting_iterator() : value(0) {} - - explicit counting_iterator(value_type v) : value(v) {} - - value_type operator*() const { return value; } - - value_type operator[](difference_type n) const { return value + n; } - - self& operator++() { - ++value; - return *this; - } - - self operator++(int) { - self result{value}; - ++value; - return result; - } - - self& operator--() { - --value; - return *this; - } - - self operator--(int) { - self result{value}; - --value; - return result; - } - - self& operator+=(difference_type n) { - value += n; - return *this; - } - - self& operator-=(difference_type n) { - value -= n; - return *this; - } - - friend self operator+(self const& i, difference_type n) { - return self(i.value + n); - } - - friend self operator+(difference_type n, self const& i) { - return self(i.value + n); - } - 
- friend difference_type operator-(self const& x, self const& y) { - return x.value - y.value; - } - - friend self operator-(self const& i, difference_type n) { - return self(i.value - n); - } - - friend bool operator==(self const& x, self const& y) { - return x.value == y.value; - } - - friend bool operator!=(self const& x, self const& y) { - return x.value != y.value; - } - - friend bool operator<(self const& x, self const& y) { - return x.value < y.value; - } - - friend bool operator<=(self const& x, self const& y) { - return x.value <= y.value; - } - - friend bool operator>(self const& x, self const& y) { - return x.value > y.value; - } - - friend bool operator>=(self const& x, self const& y) { - return x.value >= y.value; - } - - private: - value_type value; + private: + using self = counting_iterator; + + public: + using value_type = Index_t; + using difference_type = typename std::make_signed::type; + using pointer = Index_t*; + using reference = Index_t&; + using iterator_category = std::random_access_iterator_tag; + + counting_iterator() : value(0) {} + + explicit counting_iterator(value_type v) : value(v) {} + + value_type operator*() const { return value; } + + value_type operator[](difference_type n) const { return value + n; } + + self& operator++() { + ++value; + return *this; + } + + self operator++(int) { + self result{value}; + ++value; + return result; + } + + self& operator--() { + --value; + return *this; + } + + self operator--(int) { + self result{value}; + --value; + return result; + } + + self& operator+=(difference_type n) { + value += n; + return *this; + } + + self& operator-=(difference_type n) { + value -= n; + return *this; + } + + friend self operator+(self const& i, difference_type n) { return self(i.value + n); } + + friend self operator+(difference_type n, self const& i) { return self(i.value + n); } + + friend difference_type operator-(self const& x, self const& y) { return x.value - y.value; } + + friend self operator-(self const& i, difference_type n) { return self(i.value - n); } + + friend bool operator==(self const& x, self const& y) { return x.value == y.value; } + + friend bool operator!=(self const& x, self const& y) { return x.value != y.value; } + + friend bool operator<(self const& x, self const& y) { return x.value < y.value; } + + friend bool operator<=(self const& x, self const& y) { return x.value <= y.value; } + + friend bool operator>(self const& x, self const& y) { return x.value > y.value; } + + friend bool operator>=(self const& x, self const& y) { return x.value >= y.value; } + + private: + value_type value; }; \ No newline at end of file From e01a2b19d71ccfcde22d286afcb64ec83556a2d1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 01:07:26 -0700 Subject: [PATCH 19/20] Revert "clang-format" This reverts commit 2761f772913e40b34f1b23532f4b0c75c57cf16b. 
--- apps/1d_stencil/stencil_cuda.cpp | 175 ++++++----- apps/1d_stencil/stencil_serial.cpp | 190 +++++------ apps/1d_stencil/stencil_snd_gpu_m.cpp | 201 ++++++------ apps/1d_stencil/stencil_snd_gpu_s.cpp | 202 ++++++------ apps/1d_stencil/stencil_stdpar.cpp | 193 ++++++------ apps/1d_stencil/stencil_stdpar_snd.cpp | 254 ++++++++------- apps/1d_stencil/stencil_stdpar_snd_iter.cpp | 213 +++++++------ apps/choleskey/choleskey_serial.cpp | 140 +++++---- apps/choleskey/choleskey_stdpar.cpp | 124 ++++---- apps/choleskey/choleskey_stdpar_snd.cpp | 269 ++++++++-------- apps/choleskey/matrixutil.hpp | 41 +-- apps/comm-study/comm-study-no-senders.cpp | 108 ++++--- apps/comm-study/comm-study.cpp | 163 +++++----- apps/fft/fft-serial.cpp | 47 +-- apps/fft/fft.hpp | 215 +++++++------ apps/heat-equation/heat-equation-cuda.cpp | 294 ++++++++--------- .../heat-equation-gpu-scheduler.cpp | 244 ++++++++------- apps/heat-equation/heat-equation-mdspan.cpp | 209 +++++++------ .../heat-equation-multigpu-scheduler.cpp | 248 ++++++++------- apps/heat-equation/heat-equation-omp.cpp | 182 ++++++----- .../heat-equation-stdpar-senders.cpp | 296 +++++++++--------- apps/heat-equation/heat-equation-stdpar.cpp | 212 +++++++------ apps/heat-equation/heat-equation.hpp | 61 ++-- apps/mdspan-stdpar/mdspan-stdpar.cpp | 79 ++--- include/commons.hpp | 57 ++-- include/counting_iterator.hpp | 164 +++++----- 26 files changed, 2405 insertions(+), 2176 deletions(-) diff --git a/apps/1d_stencil/stencil_cuda.cpp b/apps/1d_stencil/stencil_cuda.cpp index 3436893..2c87bac 100644 --- a/apps/1d_stencil/stencil_cuda.cpp +++ b/apps/1d_stencil/stencil_cuda.cpp @@ -7,16 +7,20 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(false); - std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(false); + std::uint64_t& nx = + kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = + kwarg("no-header", "Do not print csv header row (default: false)") + .set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -28,95 +32,96 @@ constexpr double dx = 1.; // grid spacing // Our operator __device__ double heat(double left, double middle, double right) { - return middle + (k * dt / (dx * dx)) * (left - 2 * 
middle + right); + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } __global__ void heat_equation(double* current, double* next, std::size_t size) { - std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; + std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) { - std::size_t left = (i == 0) ? size - 1 : i - 1; - std::size_t right = (i == size - 1) ? 0 : i + 1; - next[i] = heat(current[left], current[i], current[right]); - } + if (i < size) { + std::size_t left = (i == 0) ? size - 1 : i - 1; + std::size_t right = (i == size - 1) ? 0 : i + 1; + next[i] = heat(current[left], current[i], current[right]); + } } int benchmark(args_params_t const& args) { - // Parameters (for simplicity, some are hardcoded) - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. - std::size_t size = np * nx; - - double* h_current = nullptr; - double* h_next = nullptr; - - // Measure execution time. - Timer timer; - - // Memory allocation - if (args.results) { - h_current = new double[size]; - h_next = new double[size]; + // Parameters (for simplicity, some are hardcoded) + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + std::size_t size = np * nx; + + double* h_current = nullptr; + double* h_next = nullptr; + + // Measure execution time. + Timer timer; + + // Memory allocation + if (args.results) { + h_current = new double[size]; + h_next = new double[size]; + } + + double* d_current; + double* d_next; + cudaMalloc(&d_current, size * sizeof(double)); + cudaMalloc(&d_next, size * sizeof(double)); + thrust::sequence(thrust::device, d_current, d_current + size, 0); + thrust::sequence(thrust::device, d_next, d_next + size, 0); + + // CUDA kernel execution parameters + const int threadsPerBlock = std::min(1024, (int)size); + const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock; + + // Actual time step loop + for (std::size_t t = 0; t < nt; ++t) { + heat_equation<<>>(d_current, d_next, size); + std::swap(d_current, d_next); + } + cudaDeviceSynchronize(); + auto time = timer.stop(); + + if (args.results) { + // Copy result back to host + cudaMemcpy(h_current, d_current, size * sizeof(double), + cudaMemcpyDeviceToHost); + + // Print results + for (std::size_t i = 0; i < np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j < nx; ++j) { + std::cout << h_current[i * nx + j] << " "; + } + std::cout << "}\n"; } + // Cleanup + delete[] h_current; + delete[] h_next; + } - double* d_current; - double* d_next; - cudaMalloc(&d_current, size * sizeof(double)); - cudaMalloc(&d_next, size * sizeof(double)); - thrust::sequence(thrust::device, d_current, d_current + size, 0); - thrust::sequence(thrust::device, d_next, d_next + size, 0); - - // CUDA kernel execution parameters - const int threadsPerBlock = std::min(1024, (int)size); - const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock; - - // Actual time step loop - for (std::size_t t = 0; t < nt; ++t) { - heat_equation<<>>(d_current, d_next, size); - std::swap(d_current, d_next); - } - cudaDeviceSynchronize(); - auto time = timer.stop(); - - if (args.results) { - // Copy result back to host - cudaMemcpy(h_current, d_current, size * sizeof(double), cudaMemcpyDeviceToHost); - - // Print results - for (std::size_t i = 0; i < np; ++i) { - std::cout << "U[" << i << "] = {"; - for 
(std::size_t j = 0; j < nx; ++j) { - std::cout << h_current[i * nx + j] << " "; - } - std::cout << "}\n"; - } - // Cleanup - delete[] h_current; - delete[] h_next; - } + cudaFree(d_current); + cudaFree(d_next); - cudaFree(d_current); - cudaFree(d_next); - - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_serial.cpp b/apps/1d_stencil/stencil_serial.cpp index f4a7180..fce1d5d 100644 --- a/apps/1d_stencil/stencil_serial.cpp +++ b/apps/1d_stencil/stencil_serial.cpp @@ -32,16 +32,20 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(false); - std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(false); + std::uint64_t& nx = + kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = + kwarg("no-header", "Do not print csv header row (default: false)") + .set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -54,107 +58,107 @@ double dx = 1.; // grid spacing /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; + // Our partition type + typedef double partition; - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; - void init_value(auto& data, std::size_t np, std::size_t nx) { - for (std::size_t i = 0; i != np * nx; ++i) { - data[i] = double(i); - } + void init_value(auto& data, std::size_t np, std::size_t nx) { + for (std::size_t i = 0; i != np * nx; ++i) { + data[i] = double(i); } + } - // Our operator - 
double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, - const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } - - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); + // Our operator + double heat(double left, double middle, double right, const double k = ::k, + const double dt = ::dt, const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + } - return id + dir; + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; } - // do all the work on 'nx' data points for 'nt' time steps - space do_work(std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - auto current = space(current_ptr, size); - auto next = space(next_ptr, size); - - init_value(current, np, nx); - - // Actual time step loop - for (std::size_t t = 0; t != nt; ++t) { - for (std::size_t i = 0; i < np * nx; ++i) { - auto left = idx(i, -1, size); - auto right = idx(i, +1, size); - next[i] = heat(current[left], current[i], current[right], k, dt, dx); - } - std::swap(current, next); - } - - return current; + if (id == size - 1 && dir == +1) { + return (std::size_t)0; } + assert(id < size); + + return id + dir; + } + + // do all the work on 'nx' data points for 'nt' time steps + space do_work(std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + auto current = space(current_ptr, size); + auto next = space(next_ptr, size); + + init_value(current, np, nx); + + // Actual time step loop + for (std::size_t t = 0; t != nt; ++t) { + for (std::size_t i = 0; i < np * nx; ++i) { + auto left = idx(i, -1, size); + auto right = idx(i, +1, size); + next[i] = heat(current[left], current[i], current[right], k, dt, dx); + } + std::swap(current, next); + } + + return current; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. - - // Create the stepper object - stepper step; - - // Measure execution time. - Timer timer; - - // Execute nt time steps on nx grid points. - auto solution = step.do_work(np, nx, nt); - auto time = timer.stop(); - - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; - } + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + + // Create the stepper object + stepper step; + + // Measure execution time. + Timer timer; + + // Execute nt time steps on nx grid points. 
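   // do_work() seeds current[i] = i, then for nt steps applies the explicit update
   // next[i] = current[i] + k*dt/(dx*dx) * (current[i-1] - 2*current[i] + current[i+1]),
   // with idx() wrapping both ends (periodic boundary) and the buffers swapped after each step.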
+ auto solution = step.do_work(np, nx, nt);
+ auto time = timer.stop();
+
+ // Print the final solution
+ if (args.results) {
+ for (std::size_t i = 0; i != np; ++i) {
+ std::cout << "U[" << i << "] = {";
+ for (std::size_t j = 0; j != nx; ++j) {
+ std::cout << solution[i * nx + j] << " ";
+ }
+ std::cout << "}\n";
 }
+ }
- if (args.time) {
- std::cout << "Duration: " << time << " ms."
- << "\n";
- }
+ if (args.time) {
+ std::cout << "Duration: " << time << " ms."
+ << "\n";
+ }
- return 0;
+ return 0;
 }
 int main(int argc, char* argv[]) {
- // parse params
- args_params_t args = argparse::parse(argc, argv);
- // see if help wanted
- if (args.help) {
- args.print(); // prints all variables
- return 0;
- }
+ // parse params
+ args_params_t args = argparse::parse(argc, argv);
+ // see if help wanted
+ if (args.help) {
+ args.print(); // prints all variables
+ return 0;
+ }
- benchmark(args);
+ benchmark(args);
- return 0;
+ return 0;
 }
diff --git a/apps/1d_stencil/stencil_snd_gpu_m.cpp b/apps/1d_stencil/stencil_snd_gpu_m.cpp
index 0c385d9..83d3e16 100644
--- a/apps/1d_stencil/stencil_snd_gpu_m.cpp
+++ b/apps/1d_stencil/stencil_snd_gpu_m.cpp
@@ -40,16 +40,20 @@
 // parameters
 struct args_params_t : public argparse::Args {
- bool& results = kwarg("results", "print generated results (default: false)").set_default(false);
- std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10);
- std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
- std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10);
- bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);
- double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
- double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
- bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false);
- bool& help = flag("h, help", "print help");
- bool& time = kwarg("t, time", "print time").set_default(true);
+ bool& results = kwarg("results", "print generated results (default: false)")
+ .set_default(false);
+ std::uint64_t& nx =
+ kwarg("nx", "Local x dimension (of each partition)").set_default(10);
+ std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
+ std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10);
+ bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);
+ double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
+ double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
+ bool& no_header =
+ kwarg("no-header", "Do not print csv header row (default: false)")
+ .set_default(false);
+ bool& help = flag("h, help", "print help");
+ bool& time = kwarg("t, time", "print time").set_default(true);
 };
 ///////////////////////////////////////////////////////////////////////////////
@@ -60,118 +64,125 @@ double dt = 1.; // time step
 double dx = 1.; // grid spacing
 template
-using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>;
+using any_sender_of = typename exec::any_receiver_ref<
+ stdexec::completion_signatures>::template any_sender<>;
 ///////////////////////////////////////////////////////////////////////////////
 //[stepper_1
 struct stepper {
- // Our partition type
- typedef double partition;
+ // Our partition type
+ typedef double partition;
- // Our data for one time step
- typedef thrust::device_vector space;
+ // Our data for one time step
+ typedef thrust::device_vector space;
- // Our operator
- double
heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, - const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } + // Our operator + double heat(double left, double middle, double right, const double k = ::k, + const double dt = ::dt, const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + } - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } - return id + dir; + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; + } + + // do all the work on 'nx' data points for 'nt' time steps + space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, + std::size_t nt) { + std::size_t size = np * nx; + thrust::device_vector current_vec(size); + thrust::device_vector next_vec(size); + + auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); + auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); + + stdexec::sender auto init = + stdexec::transfer_just(sch, current_ptr, nx) | + stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { + current_ptr[i] = (double)i; + }); + stdexec::sync_wait(std::move(init)); + + for (std::size_t t = 0; t != nt; ++t) { + auto sender = stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, + dx, np, nx) | + stdexec::bulk(np * nx, [&](int i, auto current_ptr, + auto next_ptr, auto k, auto dt, + auto dx, auto np, auto nx) { + auto left = idx(i, -1, np * nx); + auto right = idx(i, +1, np * nx); + next_ptr[i] = heat(current_ptr[left], current_ptr[i], + current_ptr[right], k, dt, dx); + }); + stdexec::sync_wait(std::move(sender)); + std::swap(current_ptr, next_ptr); } - // do all the work on 'nx' data points for 'nt' time steps - space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - thrust::device_vector current_vec(size); - thrust::device_vector next_vec(size); - - auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); - auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); - - stdexec::sender auto init = - stdexec::transfer_just(sch, current_ptr, nx) | - stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { current_ptr[i] = (double)i; }); - stdexec::sync_wait(std::move(init)); - - for (std::size_t t = 0; t != nt; ++t) { - auto sender = stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, dx, np, nx) | - stdexec::bulk(np * nx, [&](int i, auto current_ptr, auto next_ptr, auto k, auto dt, auto dx, - auto np, auto nx) { - auto left = idx(i, -1, np * nx); - auto right = idx(i, +1, np * nx); - next_ptr[i] = heat(current_ptr[left], current_ptr[i], current_ptr[right], k, dt, dx); - }); - stdexec::sync_wait(std::move(sender)); - std::swap(current_ptr, next_ptr); - } - - if (nt % 2 == 0) { - return current_vec; - } - return next_vec; + if (nt % 2 == 0) { + return current_vec; } + return next_vec; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. 
+ std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. - // Create the stepper object - stepper step; + // Create the stepper object + stepper step; - nvexec::multi_gpu_stream_context stream_context{}; - stdexec::scheduler auto sch = stream_context.get_scheduler(); + nvexec::multi_gpu_stream_context stream_context{}; + stdexec::scheduler auto sch = stream_context.get_scheduler(); - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - // Execute nt time steps on nx grid points. - stepper::space solution = step.do_work(sch, np, nx, nt); + // Execute nt time steps on nx grid points. + stepper::space solution = step.do_work(sch, np, nx, nt); - auto time = timer.stop(); + auto time = timer.stop(); - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; - } + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_snd_gpu_s.cpp b/apps/1d_stencil/stencil_snd_gpu_s.cpp index 8144c52..58fc06c 100644 --- a/apps/1d_stencil/stencil_snd_gpu_s.cpp +++ b/apps/1d_stencil/stencil_snd_gpu_s.cpp @@ -40,16 +40,20 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(false); - std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(false); + std::uint64_t& nx = + kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", 
"Local x dimension").set_default(1.0); + bool& no_header = + kwarg("no-header", "Do not print csv header row (default: false)") + .set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -60,118 +64,126 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref< + stdexec::completion_signatures>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; + // Our partition type + typedef double partition; - // Our data for one time step - typedef thrust::device_vector space; + // Our data for one time step + typedef thrust::device_vector space; - // Our operator - double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, - const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } + // Our operator + double heat(double left, double middle, double right, const double k = ::k, + const double dt = ::dt, const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + } - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } - return id + dir; + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; + } + + // do all the work on 'nx' data points for 'nt' time steps + space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, + std::size_t nt) { + std::size_t size = np * nx; + thrust::device_vector current_vec(size); + thrust::device_vector next_vec(size); + + auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); + auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); + + stdexec::sender auto init = + stdexec::transfer_just(sch, current_ptr, nx) | + stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { + current_ptr[i] = (double)i; + }); + stdexec::sync_wait(std::move(init)); + + for (std::size_t t = 0; t != nt; ++t) { + auto sender = + stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, dx, np, nx, + size) | + stdexec::bulk(np * nx, + [&](int i, auto& current_ptr, auto& next_ptr, auto k, + auto dt, auto dx, auto np, auto nx, auto size) { + std::size_t left = (i == 0) ? size - 1 : i - 1; + std::size_t right = (i == size - 1) ? 
0 : i + 1; + next_ptr[i] = heat(current_ptr[left], current_ptr[i], + current_ptr[right], k, dt, dx); + }); + stdexec::sync_wait(std::move(sender)); + std::swap(current_ptr, next_ptr); } - // do all the work on 'nx' data points for 'nt' time steps - space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - thrust::device_vector current_vec(size); - thrust::device_vector next_vec(size); - - auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); - auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); - - stdexec::sender auto init = - stdexec::transfer_just(sch, current_ptr, nx) | - stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { current_ptr[i] = (double)i; }); - stdexec::sync_wait(std::move(init)); - - for (std::size_t t = 0; t != nt; ++t) { - auto sender = stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, dx, np, nx, size) | - stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto& next_ptr, auto k, auto dt, auto dx, - auto np, auto nx, auto size) { - std::size_t left = (i == 0) ? size - 1 : i - 1; - std::size_t right = (i == size - 1) ? 0 : i + 1; - next_ptr[i] = heat(current_ptr[left], current_ptr[i], current_ptr[right], k, dt, dx); - }); - stdexec::sync_wait(std::move(sender)); - std::swap(current_ptr, next_ptr); - } - - if (nt % 2 == 0) { - return current_vec; - } - return next_vec; + if (nt % 2 == 0) { + return current_vec; } + return next_vec; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. - // Create the stepper object - stepper step; + // Create the stepper object + stepper step; - nvexec::stream_context stream_ctx{}; - stdexec::scheduler auto sch = stream_ctx.get_scheduler(); + nvexec::stream_context stream_ctx{}; + stdexec::scheduler auto sch = stream_ctx.get_scheduler(); - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - // Execute nt time steps on nx grid points. - stepper::space solution = step.do_work(sch, np, nx, nt); + // Execute nt time steps on nx grid points. + stepper::space solution = step.do_work(sch, np, nx, nt); - auto time = timer.stop(); + auto time = timer.stop(); - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; - } + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_stdpar.cpp b/apps/1d_stencil/stencil_stdpar.cpp index e424620..c1e780c 100644 --- a/apps/1d_stencil/stencil_stdpar.cpp +++ b/apps/1d_stencil/stencil_stdpar.cpp @@ -34,16 +34,20 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(false); - std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(false); + std::uint64_t& nx = + kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = + kwarg("no-header", "Do not print csv header row (default: false)") + .set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -56,104 +60,105 @@ double dx = 1.; // grid spacing /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; - - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; - - // Our operator - double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, - const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + // Our partition type + typedef double partition; + + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; + + // Our operator + double heat(double left, double middle, double right, const double k = ::k, + const double dt = ::dt, const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + } + + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; } - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } - - if 
(id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); - - return id + dir; + if (id == size - 1 && dir == +1) { + return (std::size_t)0; } - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - - auto current = space(current_ptr, size); - auto next = space(next_ptr, size); - // parallel init - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=](std::size_t i) { current_ptr[i] = (double)i; }); - - // Actual time step loop - for (std::size_t t = 0; t != nt; ++t) { - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=, k = k, dt = dt, dx = dx](int32_t i) { - auto left = idx(i, -1, size); - auto right = idx(i, +1, size); - next[i] = heat(current[left], current[i], current[right], k, dt, dx); - }); - std::swap(current, next); - } - - return current; + assert(id < size); + + return id + dir; + } + + // do all the work on 'nx' data points for 'nt' time steps + space do_work(std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + + auto current = space(current_ptr, size); + auto next = space(next_ptr, size); + // parallel init + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=](std::size_t i) { current_ptr[i] = (double)i; }); + + // Actual time step loop + for (std::size_t t = 0; t != nt; ++t) { + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=, k = k, dt = dt, dx = dx](int32_t i) { + auto left = idx(i, -1, size); + auto right = idx(i, +1, size); + next[i] = heat(current[left], current[i], + current[right], k, dt, dx); + }); + std::swap(current, next); } + + return current; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. - - // Create the stepper object - stepper step; - - // Measure execution time. - Timer timer; - - // Execute nt time steps on nx grid points. - auto solution = step.do_work(np, nx, nt); - auto time = timer.stop(); - - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; - } + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + + // Create the stepper object + stepper step; + + // Measure execution time. + Timer timer; + + // Execute nt time steps on nx grid points. + auto solution = step.do_work(np, nx, nt); + auto time = timer.stop(); + + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_stdpar_snd.cpp b/apps/1d_stencil/stencil_stdpar_snd.cpp index 6a08a63..30cfca8 100644 --- a/apps/1d_stencil/stencil_stdpar_snd.cpp +++ b/apps/1d_stencil/stencil_stdpar_snd.cpp @@ -37,16 +37,20 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(false); - std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(false); + std::uint64_t& nx = + kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = + kwarg("no-header", "Do not print csv header row (default: false)") + .set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -57,134 +61,144 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref< + stdexec::completion_signatures>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; - - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; - - using any_space_sender = - any_sender_of; - - // Our operator - double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, - const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + // Our partition type + typedef double partition; + + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; + + using any_space_sender = + any_sender_of; + + // Our operator + double heat(double left, double middle, double right, const double k = ::k, + const double dt = ::dt, const double 
dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + } + + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; } - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } - - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); - - return id + dir; + if (id == size - 1 && dir == +1) { + return (std::size_t)0; } - - partition* current_ptr = nullptr; - partition* next_ptr = nullptr; - space current; - space next; - - // do all the work on 'nx' data points for 'nt' time steps - auto do_work(std::size_t np, std::size_t nx, std::size_t nt) -> any_space_sender { - if (nt == 0) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - current = space(current_ptr, size); - next = space(next_ptr, size); - - // parallel init - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=](std::size_t i) { current_ptr[i] = (double)i; }); - - return stdexec::just(current); - } - - return stdexec::just(nt - 1) | - stdexec::let_value([=](std::size_t nt_updated) { return do_work(np, nx, nt_updated); }) | - stdexec::bulk(np, - [&, k = k, dt = dt, dx = dx, nx = nx, np = np](std::size_t i, auto const& current) { - std::for_each_n( - std::execution::par, counting_iterator(0), nx, [=, next = next](std::size_t j) { - std::size_t id = i * nx + j; - auto left = idx(id, -1, np * nx); - auto right = idx(id, +1, np * nx); - next[id] = heat(current[left], current[id], current[right], k, dt, dx); - }); - }) | - stdexec::then([&](auto current) { - // TODO: return next? - std::swap(current, next); - return current; - }); + assert(id < size); + + return id + dir; + } + + partition* current_ptr = nullptr; + partition* next_ptr = nullptr; + space current; + space next; + + // do all the work on 'nx' data points for 'nt' time steps + auto do_work(std::size_t np, std::size_t nx, std::size_t nt) + -> any_space_sender { + if (nt == 0) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + current = space(current_ptr, size); + next = space(next_ptr, size); + + // parallel init + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=](std::size_t i) { current_ptr[i] = (double)i; }); + + return stdexec::just(current); } + + return stdexec::just(nt - 1) | + stdexec::let_value([=](std::size_t nt_updated) { + return do_work(np, nx, nt_updated); + }) | + stdexec::bulk(np, + [&, k = k, dt = dt, dx = dx, nx = nx, np = np]( + std::size_t i, auto const& current) { + std::for_each_n( + std::execution::par, counting_iterator(0), nx, + [=, next = next](std::size_t j) { + std::size_t id = i * nx + j; + auto left = idx(id, -1, np * nx); + auto right = idx(id, +1, np * nx); + next[id] = heat(current[left], current[id], + current[right], k, dt, dx); + }); + }) | + stdexec::then([&](auto current) { + // TODO: return next? + std::swap(current, next); + return current; + }); + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. 
- - // Create the stepper object - stepper step; - - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); - stdexec::sender auto begin = stdexec::schedule(sch); - - // Measure execution time. - Timer timer; - - // Execute nt time steps on nx grid points. - stdexec::sender auto sender = begin | stdexec::then([=]() { return nt; }) | - stdexec::let_value([=, &step](std::uint64_t nt) { return step.do_work(np, nx, nt); }); - - auto [solution] = stdexec::sync_wait(std::move(sender)).value(); - - auto time = timer.stop(); - - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; - } + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + + // Create the stepper object + stepper step; + + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + // Measure execution time. + Timer timer; + + // Execute nt time steps on nx grid points. + stdexec::sender auto sender = + begin | stdexec::then([=]() { return nt; }) | + stdexec::let_value( + [=, &step](std::uint64_t nt) { return step.do_work(np, nx, nt); }); + + auto [solution] = stdexec::sync_wait(std::move(sender)).value(); + + auto time = timer.stop(); + + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_stdpar_snd_iter.cpp b/apps/1d_stencil/stencil_stdpar_snd_iter.cpp index 0cbffc9..0c1280a 100644 --- a/apps/1d_stencil/stencil_stdpar_snd_iter.cpp +++ b/apps/1d_stencil/stencil_stdpar_snd_iter.cpp @@ -37,16 +37,20 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(false); - std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(false); + std::uint64_t& nx = + kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = + kwarg("no-header", "Do not print csv header row (default: false)") + .set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -57,122 +61,127 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref< + stdexec::completion_signatures>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; - - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; - - // Our operator - double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, - const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + // Our partition type + typedef double partition; + + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; + + // Our operator + double heat(double left, double middle, double right, const double k = ::k, + const double dt = ::dt, const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle 
+ right); + } + + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; } - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } - - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); - - return id + dir; + if (id == size - 1 && dir == +1) { + return (std::size_t)0; } - - partition* current_ptr = nullptr; - partition* next_ptr = nullptr; - space current; - space next; - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - - auto current = space(current_ptr, size); - auto next = space(next_ptr, size); - // parallel init - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=](std::size_t i) { current(i) = (double)i; }); - - // Actual time step loop - for (std::size_t t = 0; t != nt; ++t) { - auto sender = - stdexec::transfer_just(sch, current, next, k, dt, dx, np, nx) | - stdexec::bulk(np, [&](int i, auto& current, auto& next, auto k, auto dt, auto dx, auto np, auto nx) { - std::for_each_n(std::execution::par, counting_iterator(0), nx, [=](std::size_t j) { - std::size_t id = i * nx + j; - auto left = idx(id, -1, np * nx); - auto right = idx(id, +1, np * nx); - next(id) = heat(current(left), current(id), current(right), k, dt, dx); - }); - }); - stdexec::sync_wait(std::move(sender)); - std::swap(current, next); - } - - return current; + assert(id < size); + + return id + dir; + } + + partition* current_ptr = nullptr; + partition* next_ptr = nullptr; + space current; + space next; + + // do all the work on 'nx' data points for 'nt' time steps + space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, + std::size_t nt) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + + auto current = space(current_ptr, size); + auto next = space(next_ptr, size); + // parallel init + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=](std::size_t i) { current(i) = (double)i; }); + + // Actual time step loop + for (std::size_t t = 0; t != nt; ++t) { + auto sender = + stdexec::transfer_just(sch, current, next, k, dt, dx, np, nx) | + stdexec::bulk(np, [&](int i, auto& current, auto& next, auto k, + auto dt, auto dx, auto np, auto nx) { + std::for_each_n(std::execution::par, counting_iterator(0), nx, + [=](std::size_t j) { + std::size_t id = i * nx + j; + auto left = idx(id, -1, np * nx); + auto right = idx(id, +1, np * nx); + next(id) = heat(current(left), current(id), + current(right), k, dt, dx); + }); + }); + stdexec::sync_wait(std::move(sender)); + std::swap(current, next); } + + return current; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. 
- // Create the stepper object - stepper step; + // Create the stepper object + stepper step; - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - stepper::space solution = step.do_work(sch, np, nx, nt); + stepper::space solution = step.do_work(sch, np, nx, nt); - auto time = timer.stop(); + auto time = timer.stop(); - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; - } + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/choleskey_serial.cpp b/apps/choleskey/choleskey_serial.cpp index 88d6824..5c82498 100644 --- a/apps/choleskey/choleskey_serial.cpp +++ b/apps/choleskey/choleskey_serial.cpp @@ -39,90 +39,92 @@ using namespace std; struct solver { - using view_2d = std::extents; - - typedef std::mdspan matrix_ms_t; - - template - matrix_ms_t Cholesky_Decomposition(std::vector& vec, int n) { - std::vector lower(n * n, 0); - - auto matrix_ms = std::mdspan(vec.data(), n, n); - auto lower_ms = std::mdspan(lower.data(), n, n); - - // Decomposing a matrix into Lower Triangular - for (int i = 0; i < matrix_ms.extent(0); i++) { - for (int j = 0; j <= i; j++) { - T sum = 0; - - if (j == i) { - // summation for diagonals - for (int k = 0; k < j; k++) - sum += pow(lower_ms(j, k), 2); - lower_ms(j, j) = sqrt(matrix_ms(i, j) - sum); - } else { - // Evaluating L(i, j) using L(j, j) - for (int k = 0; k < j; k++) - sum += (lower_ms(i, k) * lower_ms(j, k)); - lower_ms(i, j) = (matrix_ms(i, j) - sum) / lower_ms(j, j); - } - } + using view_2d = std::extents; + + typedef std::mdspan matrix_ms_t; + + template + matrix_ms_t Cholesky_Decomposition(std::vector& vec, int n) { + std::vector lower(n * n, 0); + + auto matrix_ms = + std::mdspan(vec.data(), n, n); + auto lower_ms = + std::mdspan(lower.data(), n, n); + + // Decomposing a matrix into Lower Triangular + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; + + if (j == i) { + // summation for diagonals + for (int k = 0; k < j; k++) + sum += pow(lower_ms(j, k), 2); + lower_ms(j, j) = sqrt(matrix_ms(i, j) - sum); + } else { + // Evaluating L(i, j) using L(j, j) + for (int k = 0; k < j; k++) + sum += (lower_ms(i, k) * lower_ms(j, k)); + lower_ms(i, j) = (matrix_ms(i, j) - sum) / lower_ms(j, j); } - return lower_ms; + } } + return lower_ms; + } }; 
/////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t nd = args.nd; // Number of matrix dimension. - - std::vector inputMatrix = generate_pascal_matrix(nd); - - // Create the solverobject - solver solve; - // Measure execution time. - Timer timer; - // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); - - // Print the final results - if (args.results) { - // Displaying Lower Triangular and its Transpose - cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; - for (int i = 0; i < nd; i++) { - // Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix(i, j) << "\t"; - cout << "\t"; - - // Transpose of Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix(j, i) << "\t"; - cout << endl; - } + std::uint64_t nd = args.nd; // Number of matrix dimension. + + std::vector inputMatrix = generate_pascal_matrix(nd); + + // Create the solverobject + solver solve; + // Measure execution time. + Timer timer; + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix(i, j) << "\t"; + cout << "\t"; + + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix(j, i) << "\t"; + cout << endl; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } // Driver Code for testing int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/choleskey_stdpar.cpp b/apps/choleskey/choleskey_stdpar.cpp index 33c6b87..2b19b7d 100644 --- a/apps/choleskey/choleskey_stdpar.cpp +++ b/apps/choleskey/choleskey_stdpar.cpp @@ -44,95 +44,99 @@ using namespace std; struct solver { - using view_2d = std::extents; + using view_2d = std::extents; - template - std::vector> Cholesky_Decomposition(std::vector& vec, int n) { - std::vector> lower(n, std::vector(n, 0)); + template + std::vector> Cholesky_Decomposition(std::vector& vec, + int n) { + std::vector> lower(n, std::vector(n, 0)); - auto matrix_ms = std::mdspan(vec.data(), n, n); + auto matrix_ms = + std::mdspan(vec.data(), n, n); - auto multiplier_lambda = [=](auto a, auto b) { - return a * b; - }; + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; - // Decomposing a matrix into Lower Triangular - for (int i = 0; i < matrix_ms.extent(0); i++) { - for (int j = 0; j <= i; j++) { - T sum = 0; + // Decomposing a matrix into Lower Triangular + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; - if (j == i) // summation for diagonals - { - sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, 0, - std::plus{}, [=](int val) { return val * val; }); + if (j == i) // summation for diagonals + { + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), + lower[j].cbegin() + j, 0, std::plus{}, + [=](int val) { return val * val; }); - lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); - } else { // Evaluating L(i, j) using L(j, j) + } else { // Evaluating L(i, j) using L(j, j) - sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, - lower[i].cbegin(), 0, std::plus<>(), multiplier_lambda); + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), + lower[j].cbegin() + j, lower[i].cbegin(), + 0, std::plus<>(), multiplier_lambda); - lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; - } - } + lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; } - return lower; + } } + return lower; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t nd = args.nd; // Number of matrix dimension. + std::uint64_t nd = args.nd; // Number of matrix dimension. - std::vector inputMatrix = generate_pascal_matrix(nd); + std::vector inputMatrix = generate_pascal_matrix(nd); - // Create the solver object - solver solve; - // Measure execution time. - Timer timer; + // Create the solver object + solver solve; + // Measure execution time. 
+ Timer timer; - // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); - // Print the final results - if (args.results) { - // Displaying Lower Triangular and its Transpose - cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; - for (int i = 0; i < nd; i++) { - // Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[i][j] << "\t"; - cout << "\t"; + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; - // Transpose of Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[j][i] << "\t"; - cout << endl; - } + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } // Driver Code for testing int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp index 0a02682..4fa7d79 100644 --- a/apps/choleskey/choleskey_stdpar_snd.cpp +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -45,157 +45,166 @@ using namespace std; struct solver { - using view_2d = std::extents; - - template - std::vector> Cholesky_Decomposition(std::vector& vec, int n, int np) { - - // test here first, scheduler from a thread pool - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); - stdexec::sender auto begin = stdexec::schedule(sch); - - std::vector> lower(n, std::vector(n, 0)); - - auto matrix_ms = std::mdspan(vec.data(), n, n); - - auto multiplier_lambda = [=](auto a, auto b) { - return a * b; - }; - - for (int i = 0; i < matrix_ms.extent(0); i++) { - for (int j = 0; j <= i; j++) { - // avoid over parallelize - if (j == 0) { - np = 1; - } else if (j > 0 && np > j) { - np = j; - } - - if (j == i) // summation for diagonals - { - - if (i == 0 && j == 0) { - lower[j][j] = std::sqrt(matrix_ms(i, j)); - } else { - - std::vector sum_vec(np); // sub res for each piece - int size = j; // there are j elements need to be calculated(power) - - stdexec::sender auto send1 = - stdexec::bulk(begin, np, - [&](int piece) { - int start = piece * size / np; - int chunk_size = size / np; - int remaining = size % np; - chunk_size += (piece == np - 1) ? 
remaining : 0; - - sum_vec[piece] = std::transform_reduce( - std::execution::par, counting_iterator(start), - counting_iterator(start + chunk_size), 0, std ::plus{}, - [=](int val) { return lower[j][val] * lower[j][val]; }); - }) | - stdexec::then([&sum_vec]() { - return std::reduce(std::execution::par, sum_vec.begin(), sum_vec.end()); - }); - - auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); - - lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); - } - - } else { - // Evaluating L(i, j) using L(j, j) - - if (j == 0) { - lower[i][j] = (matrix_ms(i, j)) / lower[j][j]; - } else { - - std::vector sum_vec(np); // sub res for each piece - int size_nondiag = j; - - stdexec::sender auto send2 = - stdexec::bulk(begin, np, - [&](int piece) { - int start = piece * size_nondiag / np; - int chunk_size = size_nondiag / np; - int remaining = size_nondiag % np; - chunk_size += (piece == np - 1) ? remaining : 0; - - sum_vec[piece] = std::transform_reduce( - std::execution::par, counting_iterator(start), - counting_iterator(start + chunk_size), 0, std ::plus{}, - [=](int k) { return lower[j][k] * lower[i][k]; }); - }) | - stdexec::then([&sum_vec]() { - return std::reduce(std::execution::par, sum_vec.begin(), sum_vec.end()); - }); - - auto [sum2] = stdexec::sync_wait(std::move(send2)).value(); - - lower[i][j] = (matrix_ms(i, j) - sum2) / lower[j][j]; - } - } - } + using view_2d = std::extents; + + template + std::vector> Cholesky_Decomposition(std::vector& vec, int n, + int np) { + + // test here first, scheduler from a thread pool + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + std::vector> lower(n, std::vector(n, 0)); + + auto matrix_ms = + std::mdspan(vec.data(), n, n); + + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; + + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + // avoid over parallelize + if (j == 0) { + np = 1; + } else if (j > 0 && np > j) { + np = j; + } + + if (j == i) // summation for diagonals + { + + if (i == 0 && j == 0) { + lower[j][j] = std::sqrt(matrix_ms(i, j)); + } else { + + std::vector sum_vec(np); // sub res for each piece + int size = j; // there are j elements need to be calculated(power) + + stdexec::sender auto send1 = + stdexec::bulk(begin, np, + [&](int piece) { + int start = piece * size / np; + int chunk_size = size / np; + int remaining = size % np; + chunk_size += (piece == np - 1) ? remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, + counting_iterator(start), + counting_iterator(start + chunk_size), 0, + std ::plus{}, [=](int val) { + return lower[j][val] * lower[j][val]; + }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), + sum_vec.end()); + }); + + auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); + + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); + } + + } else { + // Evaluating L(i, j) using L(j, j) + + if (j == 0) { + lower[i][j] = (matrix_ms(i, j)) / lower[j][j]; + } else { + + std::vector sum_vec(np); // sub res for each piece + int size_nondiag = j; + + stdexec::sender auto send2 = + stdexec::bulk( + begin, np, + [&](int piece) { + int start = piece * size_nondiag / np; + int chunk_size = size_nondiag / np; + int remaining = size_nondiag % np; + chunk_size += (piece == np - 1) ? 
remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, counting_iterator(start), + counting_iterator(start + chunk_size), 0, + std ::plus{}, + [=](int k) { return lower[j][k] * lower[i][k]; }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), + sum_vec.end()); + }); + + auto [sum2] = stdexec::sync_wait(std::move(send2)).value(); + + lower[i][j] = (matrix_ms(i, j) - sum2) / lower[j][j]; + } } - return lower; + } } + return lower; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t nd = args.nd; // Number of matrix dimension. - std::uint64_t np = args.np; // Number of parallel partitions. + std::uint64_t nd = args.nd; // Number of matrix dimension. + std::uint64_t np = args.np; // Number of parallel partitions. - std::vector inputMatrix = generate_pascal_matrix(nd); + std::vector inputMatrix = generate_pascal_matrix(nd); - // Create the solver object - solver solve; + // Create the solver object + solver solve; - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np); + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np); - // Print the final results - if (args.results) { - // Displaying Lower Triangular and its Transpose - cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; - for (int i = 0; i < nd; i++) { - // Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[i][j] << "\t"; - cout << "\t"; + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; - // Transpose of Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[j][i] << "\t"; - cout << endl; - } + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } // Driver Code for testing int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/matrixutil.hpp b/apps/choleskey/matrixutil.hpp index 8b08fb1..44f0468 100644 --- a/apps/choleskey/matrixutil.hpp +++ b/apps/choleskey/matrixutil.hpp @@ -9,30 +9,33 @@ using Matrix = std::vector>; template std::vector generate_pascal_matrix(const int n) { - Matrix matrix(n, std::vector(n, static_cast(0))); + Matrix matrix(n, std::vector(n, static_cast(0))); - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - if (i == 0 || j == 0) { - matrix[i][j] = static_cast(1); - } else { - matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; - } - } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + if (i == 0 || j == 0) { + matrix[i][j] = static_cast(1); + } else { + matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; + } } + } - std::vector flattenedVector; - for (const auto& row : matrix) { - flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); - } - return std::move(flattenedVector); + std::vector flattenedVector; + for (const auto& row : matrix) { + flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); + } + return std::move(flattenedVector); } // parameters define struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(true); - std::uint64_t& nd = kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)").set_default(10); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(true); + std::uint64_t& nd = + kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)") + .set_default(10); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; diff --git a/apps/comm-study/comm-study-no-senders.cpp b/apps/comm-study/comm-study-no-senders.cpp index 87fa74b..1550094 100644 --- a/apps/comm-study/comm-study-no-senders.cpp +++ b/apps/comm-study/comm-study-no-senders.cpp @@ -37,79 +37,87 @@ using time_point_t = std::chrono::system_clock::time_point; // must take in the pointers/vectors by reference template auto work(P& A, P& B, P& Y, int N) { - // init A and B separately - will it cause an H2D copy? - std::for_each(std::execution::par_unseq, &A[0], &A[N], [&](T& ai) { ai = cos(M_PI / 4); }); + // init A and B separately - will it cause an H2D copy? + std::for_each(std::execution::par_unseq, &A[0], &A[N], + [&](T& ai) { ai = cos(M_PI / 4); }); - T sum = 0.0; + T sum = 0.0; - for (int i = 0; i < N / 3; i++) { - // read only or read-write operations - sum += A[i] / N; + for (int i = 0; i < N / 3; i++) { + // read only or read-write operations + sum += A[i] / N; - // this line if commented should not result in an H2D after this but it - // does. 
- // A[i] = sin(M_PI/4); - } + // this line if commented should not result in an H2D after this but it + // does. + // A[i] = sin(M_PI/4); + } - std::cout << std::endl; + std::cout << std::endl; - // will it cause an H2D here? - std::for_each(std::execution::par_unseq, &B[0], &B[N], [&](T& bi) { bi = sin(M_PI / 6); }); + // will it cause an H2D here? + std::for_each(std::execution::par_unseq, &B[0], &B[N], + [&](T& bi) { bi = sin(M_PI / 6); }); - // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) + // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) - std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &A[N / 2], - [&](T& ai, T& bi) { return ai + bi; }); - std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &Y[0], - [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); + std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &A[N / 2], + [&](T& ai, T& bi) { return ai + bi; }); + std::transform( + std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &Y[0], + [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); - // should trigger a D2H copy of N/5 elements - for (int i = 0; i < N / 3; i++) - sum += Y[i] / N; + // should trigger a D2H copy of N/5 elements + for (int i = 0; i < N / 3; i++) + sum += Y[i] / N; - std::cout << std::endl; + std::cout << std::endl; - // get sum(Y) - one last memcpy (not USM) D2H - sum += std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), - [](T& val) { return val * val; }); + // get sum(Y) - one last memcpy (not USM) D2H + sum += + std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), [](T &val){return val * val;}); - return sum / N; + return sum / N; } int main(int argc, char* argv[]) { - constexpr int N = 1e9; - time_point_t mark = std::chrono::system_clock::now(); - auto es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); - T sum = 0; + constexpr int N = 1e9; + time_point_t mark = std::chrono::system_clock::now(); + auto es = + std::chrono::duration(std::chrono::system_clock::now() - mark) + .count(); + T sum = 0; #if 1 // 0 if only want to run with pointers - std::vector A(N); - std::vector B(N); - std::vector Y(N); + std::vector A(N); + std::vector B(N); + std::vector Y(N); - mark = std::chrono::system_clock::now(); - sum = work(A, B, Y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); - std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; + mark = std::chrono::system_clock::now(); + sum = work(A, B, Y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark) + .count(); + std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; #endif #if 1 // 0 if only want to run with vectors - // allocate memory - where is this allocated? - T* a = new T[N]; - T* b = new T[N]; - T* y = new T[N]; - - sum = 0; - mark = std::chrono::system_clock::now(); - sum = work(a, b, y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); - std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl << std::endl; + // allocate memory - where is this allocated? 
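The comment above asks where these raw new T[N] allocations live. My understanding, stated as an assumption rather than a claim about this patch, is that nvc++ with -stdpar=gpu places heap allocations in CUDA managed (unified) memory so the parallel algorithms can touch them from the device; below is a hedged sketch of the explicit equivalent, with an illustrative helper name.

#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative sketch, not part of the patch: explicit managed allocation,
// roughly what -stdpar=gpu is assumed to arrange implicitly for new/malloc.
template <typename T>
T* alloc_managed(std::size_t n) {
    T* p = nullptr;
    if (cudaMallocManaged(&p, n * sizeof(T)) != cudaSuccess) {
        std::fprintf(stderr, "cudaMallocManaged failed\n");
        return nullptr;
    }
    return p;  // visible to host and device; release with cudaFree(p)
}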
+ T* a = new T[N]; + T* b = new T[N]; + T* y = new T[N]; + + sum = 0; + mark = std::chrono::system_clock::now(); + sum = work(a, b, y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark) + .count(); + std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl + << std::endl; #endif - // do not use scientific notation - std::cout << std::fixed << "sum: " << sum << "\n"; + // do not use scientific notation + std::cout << std::fixed << "sum: " << sum << "\n"; - return 0; + return 0; } \ No newline at end of file diff --git a/apps/comm-study/comm-study.cpp b/apps/comm-study/comm-study.cpp index 99abcfc..7629ce0 100644 --- a/apps/comm-study/comm-study.cpp +++ b/apps/comm-study/comm-study.cpp @@ -37,91 +37,106 @@ using time_point_t = std::chrono::system_clock::time_point; // must take in the pointers/vectors by reference template auto work(P& A, P& B, P& Y, int N) { - T sum = 0.0; - - // init A and B separately - will it cause an H2D copy? - sender auto s1 = - then(just(), - [&] { std::for_each(std::execution::par_unseq, &A[0], &A[N], [&](T& ai) { ai = cos(M_PI / 4); }); }) - // trigger a D2H here - | then([&] { - for (int i = 0; i < N / 3; i++) { - // read only or read-write operations - sum += A[i] / N; - - // this line if commented should not result in an H2D - // after this but it does. - // A[i] = sin(M_PI/4); - } - std::cout << std::endl; - }); - - // will it cause an H2D here? - sender auto s2 = then( - just(), [&] { std::for_each(std::execution::par_unseq, &B[0], &B[N], [&](T& bi) { bi = sin(M_PI / 6); }); }); - - // will s1 and s2 execute in parallel or not? - sync_wait(when_all(std::move(s1), std::move(s2))); - - // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) - sender auto s3 = then(just(), - [&] { - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], &A[0], - [&](T& ai, T& bi) { return ai + bi; }); - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], &Y[0], - [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); - }) - // should trigger a D2H copy of N/3 elements - | then([&] { - for (int i = 0; i < N / 3; i++) - sum += Y[i] / N; - - std::cout << std::endl; - }) - // get sum(Y) - wonder if there is another H2D as we only read it in the - // last step - | then([&] { return std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus()); }); - - auto [val] = sync_wait(s3).value(); - - return sum += val; + T sum = 0.0; + + // init A and B separately - will it cause an H2D copy? + sender auto s1 = then(just(), + [&] { + std::for_each(std::execution::par_unseq, &A[0], &A[N], + [&](T& ai) { ai = cos(M_PI / 4); }); + }) + // trigger a D2H here + | then([&] { + for (int i = 0; i < N / 3; i++) { + // read only or read-write operations + sum += A[i] / N; + + // this line if commented should not result in an H2D + // after this but it does. + // A[i] = sin(M_PI/4); + } + std::cout << std::endl; + }); + + // will it cause an H2D here? + sender auto s2 = then(just(), [&] { + std::for_each(std::execution::par_unseq, &B[0], &B[N], + [&](T& bi) { bi = sin(M_PI / 6); }); + }); + + // will s1 and s2 execute in parallel or not? 
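The comment above asks whether s1 and s2 execute in parallel. Since both are rooted in just() with no scheduler, my expectation, offered as an assumption, is that they are started one after the other by the thread calling sync_wait; the sketch below shows one way to give each branch its own worker, reusing the exec::static_thread_pool and stdexec algorithms that already appear elsewhere in this patch (lambda bodies elided).

#include <exec/static_thread_pool.hpp>
#include <stdexec/execution.hpp>
#include <utility>

// Illustrative sketch, not part of the patch.
int main() {
    exec::static_thread_pool pool(2);
    stdexec::scheduler auto sch = pool.get_scheduler();

    stdexec::sender auto init_a = stdexec::schedule(sch) | stdexec::then([] { /* fill A */ });
    stdexec::sender auto init_b = stdexec::schedule(sch) | stdexec::then([] { /* fill B */ });

    // when_all starts both branches; with two pool workers they can overlap.
    stdexec::sync_wait(stdexec::when_all(std::move(init_a), std::move(init_b)));
    return 0;
}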
+ sync_wait(when_all(std::move(s1), std::move(s2))); + + // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) + sender auto s3 = + then(just(), + [&] { + std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], + &A[0], [&](T& ai, T& bi) { return ai + bi; }); + std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], + &Y[0], [&](T& ai, T& bi) { + return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); + }); + }) + // should trigger a D2H copy of N/3 elements + | then([&] { + for (int i = 0; i < N / 3; i++) + sum += Y[i] / N; + + std::cout << std::endl; + }) + // get sum(Y) - wonder if there is another H2D as we only read it in the + // last step + | then([&] { + return std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, + std::plus()); + }); + + auto [val] = sync_wait(s3).value(); + + return sum += val; } int main(int argc, char* argv[]) { - constexpr int N = 1e9; - time_point_t mark = std::chrono::system_clock::now(); - auto es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); - T sum = 0.0; + constexpr int N = 1e9; + time_point_t mark = std::chrono::system_clock::now(); + auto es = + std::chrono::duration(std::chrono::system_clock::now() - mark) + .count(); + T sum = 0.0; #if 1 // 0 if only arrays - std::vector A(N); - std::vector B(N); - std::vector Y(N); + std::vector A(N); + std::vector B(N); + std::vector Y(N); - mark = std::chrono::system_clock::now(); - sum = work(A, B, Y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); - std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; + mark = std::chrono::system_clock::now(); + sum = work(A, B, Y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark) + .count(); + std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; - std::cout << fixed << "sum: " << sum << "\n"; + std::cout << fixed << "sum: " << sum << "\n"; #endif #if 1 // 0 if only vectors - // allocate memory - can we just allocate it on device only? - T* a = new T[N]; - T* b = new T[N]; - T* y = new T[N]; - - sum = 0; - mark = std::chrono::system_clock::now(); - sum = work(a, b, y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); - std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl << std::endl; - - // do not use scientific notation - std::cout << fixed << "sum: " << sum << "\n"; + // allocate memory - can we just allocate it on device only? + T* a = new T[N]; + T* b = new T[N]; + T* y = new T[N]; + + sum = 0; + mark = std::chrono::system_clock::now(); + sum = work(a, b, y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark) + .count(); + std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl + << std::endl; + + // do not use scientific notation + std::cout << fixed << "sum: " << sum << "\n"; #endif - return 0; + return 0; } \ No newline at end of file diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp index 02bbd7b..b174b5a 100644 --- a/apps/fft/fft-serial.cpp +++ b/apps/fft/fft-serial.cpp @@ -33,12 +33,14 @@ // // simulation // -int main(int argc, char* argv[]) { +int main(int argc, char* argv[]) +{ // parse params fft_params_t args = argparse::parse(argc, argv); // see if help wanted - if (args.help) { + if (args.help) + { args.print(); // prints all variables return 0; } @@ -58,7 +60,8 @@ int main(int argc, char* argv[]) { sig_t x_n(N, sig_type); - if (!isPowOf2(N)) { + if (!isPowOf2(N)) + { N = ceilPowOf2(N); std::cout << "log_2(N) != integer. 
Padding zeros for N = " << N << std::endl; @@ -67,7 +70,8 @@ int main(int argc, char* argv[]) { sig_t y_n(x_n); - if (print_sig) { + if (print_sig) + { std::cout << std::endl << "x[n] = "; x_n.printSignal(); std::cout << std::endl; @@ -76,36 +80,40 @@ int main(int argc, char* argv[]) { // niterations int niters = ilog2(N); - std::function fft = [&](data_t* x, int lN, const int N) { - int stride = N / lN; + std::function fft = [&](data_t *x, int lN, const int N) + { + int stride = N/lN; - if (lN == 2) { - auto x_0 = x[0] + x[1] * WNk(N, 0); - x[1] = x[0] - x[1] * WNk(N, 0); + if (lN == 2) + { + auto x_0 = x[0] + x[1]* WNk(N, 0); + x[1] = x[0] - x[1]* WNk(N, 0); x[0] = x_0; return; } // vectors for left and right - std::vector e(lN / 2); - std::vector o(lN / 2); + std::vector e(lN/2); + std::vector o(lN/2); // copy data into vectors - for (auto k = 0; k < lN / 2; k++) { - e[k] = x[2 * k]; - o[k] = x[2 * k + 1]; + for (auto k = 0; k < lN/2; k++) + { + e[k] = x[2*k]; + o[k] = x[2*k+1]; } // compute N/2 pt FFT on even - fft(e.data(), lN / 2, N); + fft(e.data(), lN/2, N); // compute N/2 pt FFT on odd - fft(o.data(), lN / 2, N); + fft(o.data(), lN/2, N); // combine even and odd FFTs - for (int k = 0; k < lN / 2; k++) { + for (int k = 0; k < lN/2; k++) + { x[k] = e[k] + o[k] * WNk(N, k * stride); - x[k + lN / 2] = e[k] - o[k] * WNk(N, k * stride); + x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride); } return; @@ -114,7 +122,8 @@ int main(int argc, char* argv[]) { // fft radix-2 algorithm with senders fft(y_n.data(), N, N); - if (print_sig) { + if (print_sig) + { std::cout << "X[k] = "; y_n.printSignal(); std::cout << std::endl; diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 56354f8..80a7446 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -32,10 +32,10 @@ #include #include +#include +#include #include #include -#include -#include #include #include "argparse/argparse.hpp" @@ -56,135 +56,158 @@ constexpr int radix = 2; // parameters struct fft_params_t : public argparse::Args { - sig_type_t& sig = - kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(sig_type_t::box); - int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); - int& N = kwarg("N", "N-point FFT").set_default(1024); - bool& print_sig = flag("p,print", "print x[n] and X(k)"); + sig_type_t& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(sig_type_t::box); + int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); + int& N = kwarg("N", "N-point FFT").set_default(1024); + bool& print_sig = flag("p,print", "print x[n] and X(k)"); #if defined(USE_OMP) - int& nthreads = kwarg("nthreads", "number of threads").set_default(1); + int& nthreads = kwarg("nthreads", "number of threads").set_default(1); #endif // USE_OMP - bool& help = flag("h, help", "print help"); - bool& print_time = flag("t,time", "print fft time"); + bool& help = flag("h, help", "print help"); + bool& print_time = flag("t,time", "print fft time"); }; inline bool isPowOf2(long long int x) { - return !(x == 0) && !(x & (x - 1)); + return !(x == 0) && !(x & (x - 1)); } template -void printVec(T& vec, int len) { +void printVec(T &vec, int len) +{ std::cout << "[ "; for (int i = 0; i < len; i++) - std::cout << vec[i] << " "; + std::cout << vec[i] << " "; std::cout << "]" << std::endl; } -inline std::complex WNk(int N, int k) { - return std::complex(exp(-2 * M_PI * 1 / N * k * 1i)); +inline std::complex WNk(int N, int k) +{ + return std::complex(exp(-2*M_PI*1/N*k*1i)); } 
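The radix-2 recursion above splits the signal into even and odd halves and recombines them with the twiddle factor W_N^k = exp(-2*pi*i*k/N). Below is a minimal sketch of that factor and of one butterfly, mirroring the combine loop in fft-serial.cpp; names are illustrative and the sketch is not part of the patch.

#include <cmath>
#include <complex>

// W_N^k = e^{-2*pi*i*k/N}, via Euler's formula.
inline std::complex<double> twiddle(int N, int k) {
    const double theta = -2.0 * M_PI * k / static_cast<double>(N);
    return {std::cos(theta), std::sin(theta)};
}

// One radix-2 butterfly: even/odd half-FFT outputs e and o are combined into
// output bins k and k + N/2, as in the "combine even and odd FFTs" loop.
inline void butterfly(std::complex<double>& xk, std::complex<double>& xk_half,
                      std::complex<double> e, std::complex<double> o, int N, int k) {
    const std::complex<double> w = twiddle(N, k);
    xk      = e + o * w;
    xk_half = e - o * w;
}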
-inline int ceilPowOf2(unsigned int v) { - return static_cast(std::bit_ceil(v)); +inline int ceilPowOf2(unsigned int v) +{ + return static_cast(std::bit_ceil(v)); } -inline int ilog2(uint32_t x) { +inline int ilog2(uint32_t x) +{ return static_cast(log2(x)); } -class signal { - public: - signal() = default; - - signal(int N) { - if (N <= 0) { - std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; - exit(1); - } - y.reserve(ceilPowOf2(N)); - y.resize(N); +class signal +{ +public: + + signal() = default; + signal(int N) + { + if (N <= 0) + { + std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; + exit(1); } - - signal(signal& rhs) { y = rhs.y; } - - signal(std::vector& in) { y = std::move(in); } - - signal(int N, sig_type type) { - if (N <= 0) { - std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; - exit(1); - } - y.reserve(ceilPowOf2(N)); - y.resize(N); - signalGenerator(type); + y.reserve(ceilPowOf2(N)); + y.resize(N); + } + + signal(signal &rhs) + { + y = rhs.y; + } + signal(std::vector &in) + { + y = std::move(in); + } + + signal(int N, sig_type type) + { + if (N <= 0) + { + std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; + exit(1); } - - void signalGenerator(sig_type type = sig_type::box) { - int N = y.size(); - - switch (type) { - case sig_type::square: - for (int n = 0; n < N; ++n) - y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : -1.0; - break; - case sig_type::sinusoid: - for (int n = 0; n < N; ++n) - y[n] = std::sin(2.0 * M_PI * n / N); - break; - case sig_type::sawtooth: - for (int n = 0; n < N; ++n) - y[n] = 2.0 * (n / N) - 1.0; - break; - case sig_type::triangle: - for (int n = 0; n < N; ++n) - y[n] = 2.0 * std::abs(2.0 * (n / N) - 1.0) - 1.0; - break; - case sig_type::sinc: - y[0] = 1.0; - for (int n = 1; n < N; ++n) - y[n] = std::sin(2.0 * M_PI * n / N) / (2.0 * M_PI * n / N); - break; - case sig_type::box: - for (int n = 0; n < N; ++n) - y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; - break; - default: - std::cerr << "FATAL: Unknown signal type. exiting.." << std::endl; - exit(1); - } + y.reserve(ceilPowOf2(N)); + y.resize(N); + signalGenerator(type); + } + + void signalGenerator(sig_type type=sig_type::box) + { + int N = y.size(); + + switch (type) { + case sig_type::square: + for (int n = 0; n < N; ++n) + y[n] = (n < N / 4 || n > 3 * N/4) ? 1.0 : -1.0; + break; + case sig_type::sinusoid: + for (int n = 0; n < N; ++n) + y[n] = std::sin(2.0 * M_PI * n / N); + break; + case sig_type::sawtooth: + for (int n = 0; n < N; ++n) + y[n] = 2.0 * (n / N) - 1.0; + break; + case sig_type::triangle: + for (int n = 0; n < N; ++n) + y[n] = 2.0 * std::abs(2.0 * (n / N) - 1.0) - 1.0; + break; + case sig_type::sinc: + y[0] = 1.0; + for (int n = 1; n < N; ++n) + y[n] = std::sin(2.0 * M_PI * n / N) / (2.0 * M_PI * n / N); + break; + case sig_type::box: + for (int n = 0; n < N; ++n) + y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; + break; + default: + std::cerr << "FATAL: Unknown signal type. exiting.." 
<< std::endl; + exit(1); } + } - ~signal() { y.clear(); } - - data_t* data() { return y.data(); } + ~signal() + { + y.clear(); + } - int len() { return y.size(); } + data_t *data() { return y.data(); } + int len() { return y.size(); } - void resize(int N) { - if (N != y.size()) - y.resize(N, 0); - } + void resize(int N) + { + if (N != y.size()) + y.resize(N, 0); + } - data_t& operator[](int n) { return y[n]; } + data_t &operator[](int n) + { + return y[n]; + } - data_t& operator()(int n) { return y[n]; } + data_t &operator()(int n) + { + return y[n]; + } - void printSignal() { - std::cout << std::fixed << std::setprecision(2); + void printSignal() { + std::cout << std::fixed << std::setprecision(2); - std::cout << "[ "; - for (auto& el : y) - std::cout << el << " "; + std::cout << "[ "; + for (auto &el : y) + std::cout << el << " "; - std::cout << "]" << std::endl; - } + std::cout << "]" << std::endl; + } - private: - // y[n] - std::vector y; +private: + // y[n] + std::vector y; }; using sig_t = signal; diff --git a/apps/heat-equation/heat-equation-cuda.cpp b/apps/heat-equation/heat-equation-cuda.cpp index b8cca1b..3ea2988 100644 --- a/apps/heat-equation/heat-equation-cuda.cpp +++ b/apps/heat-equation/heat-equation-cuda.cpp @@ -41,14 +41,15 @@ __constant__ Real_t dx[2]; // error checking function template -static inline void check(T result, const char* const file, const int line, bool is_fatal = true) { - if (result != cudaSuccess) { - std::cerr << "CUDA error at " << file << ":" << line << std::endl; - std::cerr << cudaGetErrorString(result) << std::endl; - - if (is_fatal) - exit(result); - } +static inline void check(T result, const char* const file, const int line, + bool is_fatal = true) { + if (result != cudaSuccess) { + std::cerr << "CUDA error at " << file << ":" << line << std::endl; + std::cerr << cudaGetErrorString(result) << std::endl; + + if (is_fatal) + exit(result); + } } // @@ -56,24 +57,24 @@ static inline void check(T result, const char* const file, const int line, bool // template __global__ void initialize(T* phi, int ncells, int ghost_cells) { - int ind = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int gsize = ncells * ncells; + int ind = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + int gsize = ncells * ncells; - for (; ind < gsize; ind += blockDim.x * gridDim.x) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); + for (; ind < gsize; ind += blockDim.x * gridDim.x) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); - // phi(x,y) = 1 + exp(-r^2) - phi[(i)*phi_old_extent + j] = 1 + exp(-r2); - } + // phi(x,y) = 1 + exp(-r^2) + phi[(i)*phi_old_extent + j] = 1 + exp(-r2); + } } // @@ -81,52 +82,57 @@ __global__ void initialize(T* phi, int ncells, int ghost_cells) { // template __global__ void fillBoundary(T* phi_old, int ncells, int ghost_cells) { - int pos = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int len = phi_old_extent; + int pos = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + 
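All of the kernels in this file walk their index space with a grid-stride loop, so any block/grid configuration covers the whole range. Below is a standalone sketch of the pattern with an illustrative kernel name; it is not part of the patch.

// Each thread starts at its global index and strides by the total number of
// threads in the grid until [0, n) is exhausted.
template <typename T>
__global__ void scale(T* x, int n, T s) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x)
        x[i] *= s;
}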
int len = phi_old_extent; - for (; pos < phi_old_extent - nghosts; pos += blockDim.x * gridDim.x) { - int i = pos + ghost_cells; + for (; pos < phi_old_extent - nghosts; pos += blockDim.x * gridDim.x) { + int i = pos + ghost_cells; - // fill boundary cells in phi_old - phi_old[i] = phi_old[i + (ghost_cells * len)]; + // fill boundary cells in phi_old + phi_old[i] = phi_old[i + (ghost_cells * len)]; - phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; + phi_old[i + (len * (len - ghost_cells))] = + phi_old[i + (len * (len - ghost_cells - 1))]; - phi_old[i * len] = phi_old[(ghost_cells * len) + i]; + phi_old[i * len] = phi_old[(ghost_cells * len) + i]; - phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; - } + phi_old[(len - ghost_cells) + (len * i)] = + phi_old[(len - ghost_cells - 1) + (len * i)]; + } } // // jacobi 2d stencil kernel // template -__global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, Real_t dt) { - int pos = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int gsize = ncells * ncells; - - for (; pos < gsize; pos += blockDim.x * gridDim.x) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new[(i - 1) * ncells + j - 1] = - phi_old[(i)*phi_old_extent + j] + - alpha * dt * - - ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i - 1) * phi_old_extent + j]) / - (dx[0] * dx[0]) + - - (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i)*phi_old_extent + j - 1]) / - (dx[1] * dx[1])); - } +__global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, + Real_t dt) { + int pos = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + int gsize = ncells * ncells; + + for (; pos < gsize; pos += blockDim.x * gridDim.x) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new[(i - 1) * ncells + j - 1] = + phi_old[(i)*phi_old_extent + j] + + alpha * dt * + + ((phi_old[(i + 1) * phi_old_extent + j] - + 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i - 1) * phi_old_extent + j]) / + (dx[0] * dx[0]) + + + (phi_old[(i)*phi_old_extent + j + 1] - + 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i)*phi_old_extent + j - 1]) / + (dx[1] * dx[1])); + } } // @@ -134,121 +140,127 @@ __global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, Real_t // template __global__ void parallelCopy(T* phi_old, T* phi_new, int ncells) { - int pos = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int gsize = ncells * ncells; - - for (; pos < gsize; pos += blockDim.x * gridDim.x) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; - } + int pos = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + int gsize = ncells * ncells; + + for (; pos < gsize; pos += blockDim.x * gridDim.x) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; + } } // // main simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); - 
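The jacobi kernel above advances the 2D heat equation with an explicit forward-Euler step, phi_new = phi_old + alpha * dt * Laplacian(phi_old), using a 5-point stencil. Below is a per-cell sketch of that update; the helper name is illustrative and the sketch is not part of the patch.

// c = centre value, e/w/n/s = east/west/north/south neighbours,
// hx/hy = grid spacings (dx[0] and dx[1] in the kernels above).
inline double jacobi_update(double c, double e, double w, double n, double s,
                            double alpha, double dt, double hx, double hy) {
    const double lap = (e - 2.0 * c + w) / (hx * hx) +
                       (n - 2.0 * c + s) / (hy * hy);
    return c + alpha * dt * lap;  // one explicit time step of dphi/dt = alpha * Laplacian(phi)
}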
// see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; - // init simulation time - Real_t time = 0.0; + // init simulation time + Real_t time = 0.0; - // initialize dx, dy, dz - Real_t h_dx[dims]; - for (int i = 0; i < dims; ++i) - h_dx[i] = 1.0 / (ncells - 1); + // initialize dx, dy, dz + Real_t h_dx[dims]; + for (int i = 0; i < dims; ++i) + h_dx[i] = 1.0 / (ncells - 1); - cudaErrorCheck(cudaMemcpyToSymbol(dx, h_dx, sizeof(Real_t) * dims)); + cudaErrorCheck(cudaMemcpyToSymbol(dx, h_dx, sizeof(Real_t) * dims)); - // grid size - int gsize = ncells * ncells; + // grid size + int gsize = ncells * ncells; - // host memory for printing - Real_t* h_phi = nullptr; + // host memory for printing + Real_t* h_phi = nullptr; - // simulation setup (2D) - Real_t* phi_old = nullptr; - Real_t* phi_new = nullptr; + // simulation setup (2D) + Real_t* phi_old = nullptr; + Real_t* phi_new = nullptr; - cudaErrorCheck(cudaMalloc(&phi_old, sizeof(Real_t) * ((ncells + nghosts) * (ncells + nghosts)))); - cudaErrorCheck(cudaMalloc(&phi_new, sizeof(Real_t) * ((ncells) * (ncells)))); + cudaErrorCheck(cudaMalloc( + &phi_old, sizeof(Real_t) * ((ncells + nghosts) * (ncells + nghosts)))); + cudaErrorCheck(cudaMalloc(&phi_new, sizeof(Real_t) * ((ncells) * (ncells)))); - // setup grid - int blockSize = std::min(1024, gsize); // let's do at most 1024 threads. - int nBlocks = (gsize + blockSize - 1) / blockSize; + // setup grid + int blockSize = std::min(1024, gsize); // let's do at most 1024 threads. + int nBlocks = (gsize + blockSize - 1) / blockSize; - Timer timer; + Timer timer; - // initialize grid - initialize<<>>(phi_old, ncells, ghost_cells); + // initialize grid + initialize<<>>(phi_old, ncells, ghost_cells); - cudaErrorCheck(cudaDeviceSynchronize()); + cudaErrorCheck(cudaDeviceSynchronize()); - // print initial grid if needed - if (args.print_grid) { - // copy initial grid to host - h_phi = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - cudaErrorCheck(cudaMemcpy(h_phi, phi_old, sizeof(Real_t) * (ncells + nghosts) * (ncells + nghosts), - cudaMemcpyDeviceToHost)); + // print initial grid if needed + if (args.print_grid) { + // copy initial grid to host + h_phi = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + cudaErrorCheck( + cudaMemcpy(h_phi, phi_old, + sizeof(Real_t) * (ncells + nghosts) * (ncells + nghosts), + cudaMemcpyDeviceToHost)); - printGrid(h_phi, ncells + nghosts); - } + printGrid(h_phi, ncells + nghosts); + } - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static int fBblock = std::min(1024, ncells); // let's do at most 1024 threads. - static int fBnBlocks = (ncells + fBblock - 1) / fBblock; // fillBoundary blocks + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static int fBblock = + std::min(1024, ncells); // let's do at most 1024 threads. 
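The launch configurations in this loop follow the usual ceiling-division rule: at most 1024 threads per block, and enough blocks that nBlocks * blockSize covers the work. A one-line sketch of that arithmetic, with an illustrative name, not part of the patch:

// ceil(a / b) for positive integers, e.g. nBlocks = ceil_div(ncells, fBblock).
constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }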
+ static int fBnBlocks = + (ncells + fBblock - 1) / fBblock; // fillBoundary blocks - // fillboundary - fillBoundary<<>>(phi_old, ncells, ghost_cells); + // fillboundary + fillBoundary<<>>(phi_old, ncells, ghost_cells); - // jacobi - jacobi<<>>(phi_old, phi_new, ncells, alpha, dt); + // jacobi + jacobi<<>>(phi_old, phi_new, ncells, alpha, dt); - // parallelCopy - parallelCopy<<>>(phi_old, phi_new, ncells); + // parallelCopy + parallelCopy<<>>(phi_old, phi_new, ncells); - cudaErrorCheck(cudaDeviceSynchronize()); + cudaErrorCheck(cudaDeviceSynchronize()); - // update time - time += dt; - } + // update time + time += dt; + } - auto elapsed = timer.stop(); + auto elapsed = timer.stop(); - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - // print final grid if needed - if (args.print_grid) { - cudaErrorCheck(cudaMemcpy(h_phi, phi_new, sizeof(Real_t) * gsize, cudaMemcpyDeviceToHost)); - printGrid(h_phi, ncells); + // print final grid if needed + if (args.print_grid) { + cudaErrorCheck(cudaMemcpy(h_phi, phi_new, sizeof(Real_t) * gsize, + cudaMemcpyDeviceToHost)); + printGrid(h_phi, ncells); - // free host memory - delete[] h_phi; - h_phi = nullptr; - } + // free host memory + delete[] h_phi; + h_phi = nullptr; + } - // free device memory - cudaErrorCheck(cudaFree(phi_old)); - cudaErrorCheck(cudaFree(phi_new)); + // free device memory + cudaErrorCheck(cudaFree(phi_old)); + cudaErrorCheck(cudaFree(phi_new)); - return 0; + return 0; } diff --git a/apps/heat-equation/heat-equation-gpu-scheduler.cpp b/apps/heat-equation/heat-equation-gpu-scheduler.cpp index 2b9590d..b294235 100644 --- a/apps/heat-equation/heat-equation-gpu-scheduler.cpp +++ b/apps/heat-equation/heat-equation-gpu-scheduler.cpp @@ -44,132 +44,138 @@ using namespace nvexec; // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // init simulation time - Real_t time = 0.0; - - // initialize dx, dy, dz - thrust::universal_vector dx(dims); - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - thrust::universal_vector grid_old((ncells + nghosts) * (ncells + nghosts)); - thrust::universal_vector grid_new(ncells * ncells); - - /* Real_t *grid_old = new Real_t[(ncells+nghosts) * (ncells+nghosts)]; - Real_t *grid_new = new Real_t[(ncells) * (ncells)];*/ - - // initialize grid - auto phi_old = thrust::raw_pointer_cast(grid_old.data()); - auto phi_new = thrust::raw_pointer_cast(grid_new.data()); - - Timer timer; + // parse params + heat_params_t args = argparse::parse(argc, argv); - // scheduler from gpu - nvexec::stream_context stream_ctx{}; - auto gpu = stream_ctx.get_scheduler(); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // init simulation time + Real_t time = 0.0; + 
+ // initialize dx, dy, dz + thrust::universal_vector dx(dims); + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + thrust::universal_vector grid_old((ncells + nghosts) * + (ncells + nghosts)); + thrust::universal_vector grid_new(ncells * ncells); + + /* Real_t *grid_old = new Real_t[(ncells+nghosts) * (ncells+nghosts)]; + Real_t *grid_new = new Real_t[(ncells) * (ncells)];*/ - auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), thrust::raw_pointer_cast(dx.data()) + dx.size()}; - auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; - auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; - auto phi_old_extent = ncells + nghosts; + // initialize grid + auto phi_old = thrust::raw_pointer_cast(grid_old.data()); + auto phi_new = thrust::raw_pointer_cast(grid_new.data()); + + Timer timer; + + // scheduler from gpu + nvexec::stream_context stream_ctx{}; + auto gpu = stream_ctx.get_scheduler(); + + auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), + thrust::raw_pointer_cast(dx.data()) + dx.size()}; + auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; + auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; + auto phi_old_extent = ncells + nghosts; + + int gsize = ncells * ncells; + auto heat_eq_init = ex::transfer_just(gpu, dx_span, phi_old_span) | + ex::bulk(gsize, [=](int pos, auto ds, auto phi) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + Real_t x = pos(i, ghost_cells, ds[0]); + Real_t y = pos(j, ghost_cells, ds[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi[(i)*phi_old_extent + j] = 1 + exp(-r2); + }); + + ex::sync_wait(std::move(heat_eq_init)); + if (args.print_grid) + printGrid(phi_old, ncells + nghosts); + + auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static auto evolve = + tx | + ex::bulk(phi_old_extent - nghosts, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = pos + ghost_cells; + int len = phi_old_extent; + // fill boundary cells in old_phi + phi_old[i] = phi_old[i + (ghost_cells * len)]; + phi_old[i + (len * (len - ghost_cells))] = + phi_old[i + (len * (len - ghost_cells - 1))]; + phi_old[i * len] = phi_old[(ghost_cells * len) + i]; + phi_old[(len - ghost_cells) + (len * i)] = + phi_old[(len - ghost_cells - 1) + (len * i)]; + }) | + ex::bulk(gsize, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new[(i - 1) * ncells + j - 1] = + phi_old[(i)*phi_old_extent + j] + + alpha * dt * + ((phi_old[(i + 1) * phi_old_extent + j] - + 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i - 1) * phi_old_extent + j]) / + (ds[0] * ds[0]) + + (phi_old[(i)*phi_old_extent + j + 1] - + 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i)*phi_old_extent + j - 1]) / + (ds[1] * ds[1])); + }) | + ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; + }); - int gsize = ncells * ncells; - auto heat_eq_init = - ex::transfer_just(gpu, dx_span, phi_old_span) | ex::bulk(gsize, [=](int pos, auto ds, auto phi) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); + ex::sync_wait(std::move(evolve)); - Real_t x = pos(i, ghost_cells, ds[0]); - Real_t 
y = pos(j, ghost_cells, ds[1]); + // update the simulation time + time += dt; + } - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); + auto elapsed = timer.stop(); - // phi(x,y) = 1 + exp(-r^2) - phi[(i)*phi_old_extent + j] = 1 + exp(-r2); - }); + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - ex::sync_wait(std::move(heat_eq_init)); + auto finalize = ex::then(ex::just(), [&]() { if (args.print_grid) - printGrid(phi_old, ncells + nghosts); - - auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static auto evolve = - tx | - ex::bulk(phi_old_extent - nghosts, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = pos + ghost_cells; - int len = phi_old_extent; - // fill boundary cells in old_phi - phi_old[i] = phi_old[i + (ghost_cells * len)]; - phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; - phi_old[i * len] = phi_old[(ghost_cells * len) + i]; - phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; - }) | - ex::bulk(gsize, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new[(i - 1) * ncells + j - 1] = - phi_old[(i)*phi_old_extent + j] + - alpha * dt * - ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i - 1) * phi_old_extent + j]) / - (ds[0] * ds[0]) + - (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i)*phi_old_extent + j - 1]) / - (ds[1] * ds[1])); - }) | - ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; - }); - - ex::sync_wait(std::move(evolve)); - - // update the simulation time - time += dt; - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - auto finalize = ex::then(ex::just(), [&]() { - if (args.print_grid) - // print the final grid - printGrid(phi_new, ncells); - }); - - // end the simulation - ex::sync_wait(std::move(finalize)); + // print the final grid + printGrid(phi_new, ncells); + }); - return 0; + // end the simulation + ex::sync_wait(std::move(finalize)); + + return 0; } \ No newline at end of file diff --git a/apps/heat-equation/heat-equation-mdspan.cpp b/apps/heat-equation/heat-equation-mdspan.cpp index f38b9ed..1ae243b 100644 --- a/apps/heat-equation/heat-equation-mdspan.cpp +++ b/apps/heat-equation/heat-equation-mdspan.cpp @@ -33,121 +33,128 @@ // fill boundary cells template void fill2Dboundaries_mdspan(T* grid, int len, int ghost_cells = 1) { - auto row_view = std::mdspan(grid, len, len); - - for (auto j = ghost_cells; j < row_view.extent(1) - ghost_cells; ++j) { - row_view(0, j) = row_view(ghost_cells, j); - row_view(row_view.extent(0) - ghost_cells, j) = row_view(row_view.extent(0) - ghost_cells - 1, j); - } - - auto col_view = std::mdspan(grid, len, len); - - for (auto i = 1; i < col_view.extent(1) - 1; ++i) { - col_view(0, i) = col_view(ghost_cells, i); - col_view(col_view.extent(0) - 1, i) = col_view(col_view.extent(0) - ghost_cells - 1, i); - } + auto row_view = std::mdspan(grid, len, len); + + for (auto j = ghost_cells; j < row_view.extent(1) - ghost_cells; ++j) { + row_view(0, j) = 
row_view(ghost_cells, j); + row_view(row_view.extent(0) - ghost_cells, j) = + row_view(row_view.extent(0) - ghost_cells - 1, j); + } + + auto col_view = + std::mdspan(grid, len, len); + + for (auto i = 1; i < col_view.extent(1) - 1; ++i) { + col_view(0, i) = col_view(ghost_cells, i); + col_view(col_view.extent(0) - 1, i) = + col_view(col_view.extent(0) - ghost_cells - 1, i); + } } // // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + + auto phi_old = std::mdspan( + grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = + std::mdspan(grid_new, ncells, ncells); + + Timer timer; + + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + for (int i = 1; i < phi_old.extent(0) - 1; ++i) { + for (int j = 1; j < phi_old.extent(1) - 1; ++j) { + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - - auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = std::mdspan(grid_new, ncells, ncells); - - Timer timer; - - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] - for (int i = 1; i < phi_old.extent(0) - 1; ++i) { - for (int j = 1; j < phi_old.extent(1) - 1; ++j) { - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - } + } + + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); + + // init simulation time + Real_t time = 0.0; + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + // fill boundary cells in old_phi + fill2Dboundaries_mdspan(grid_old, ncells + nghosts, ghost_cells); + + // update phi_new + for (auto i = 1; i < phi_old.extent(0) - 1; i++) { + for (auto j = 1; j < phi_old.extent(1) - 1; j++) { + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / + (dx[0] * dx[0]) + + (phi_old(i, j + 
1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / + (dx[1] * dx[1])); + } } - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); - - // init simulation time - Real_t time = 0.0; - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - // fill boundary cells in old_phi - fill2Dboundaries_mdspan(grid_old, ncells + nghosts, ghost_cells); - - // update phi_new - for (auto i = 1; i < phi_old.extent(0) - 1; i++) { - for (auto j = 1; j < phi_old.extent(1) - 1; j++) { - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + - alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); - } - } - - // update the simulation time - time += dt; - - // parallel copy phi_new to phi_old - for (auto i = 1; i < phi_old.extent(0) - 1; i++) - for (auto j = 1; j < phi_old.extent(1) - 1; j++) - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - } + // update the simulation time + time += dt; - auto elapsed = timer.stop(); + // parallel copy phi_new to phi_old + for (auto i = 1; i < phi_old.extent(0) - 1; i++) + for (auto j = 1; j < phi_old.extent(1) - 1; j++) + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + } - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } + auto elapsed = timer.stop(); - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - // delete all memory - delete[] grid_old; - delete[] grid_new; + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); - grid_old = nullptr; - grid_new = nullptr; + // delete all memory + delete[] grid_old; + delete[] grid_new; - return 0; + grid_old = nullptr; + grid_new = nullptr; + + return 0; } diff --git a/apps/heat-equation/heat-equation-multigpu-scheduler.cpp b/apps/heat-equation/heat-equation-multigpu-scheduler.cpp index d8e79b3..efcc9e5 100644 --- a/apps/heat-equation/heat-equation-multigpu-scheduler.cpp +++ b/apps/heat-equation/heat-equation-multigpu-scheduler.cpp @@ -44,129 +44,135 @@ using namespace nvexec; // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // init simulation time - Real_t time = 0.0; - - // initialize dx, dy, dz - thrust::universal_vector dx(dims); - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - thrust::universal_vector grid_old((ncells + nghosts) * (ncells + nghosts)); - thrust::universal_vector grid_new(ncells * ncells); - - // initialize grid - auto phi_old = thrust::raw_pointer_cast(grid_old.data()); - auto phi_new = thrust::raw_pointer_cast(grid_new.data()); - - Timer timer; - - // scheduler from gpu - nvexec::multi_gpu_stream_context stream_ctx{}; - auto gpu = stream_ctx.get_scheduler(); - - auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), thrust::raw_pointer_cast(dx.data()) + dx.size()}; - auto phi_old_span = std::span{phi_old, phi_old + 
grid_old.size()}; - auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; - auto phi_old_extent = ncells + nghosts; - - int gsize = ncells * ncells; - auto heat_eq_init = - ex::transfer_just(gpu, dx_span, phi_old_span) | ex::bulk(gsize, [=](int pos, auto ds, auto phi) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - Real_t x = pos(i, ghost_cells, ds[0]); - Real_t y = pos(j, ghost_cells, ds[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi[(i)*phi_old_extent + j] = 1 + exp(-r2); + // parse params + heat_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // init simulation time + Real_t time = 0.0; + + // initialize dx, dy, dz + thrust::universal_vector dx(dims); + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + thrust::universal_vector grid_old((ncells + nghosts) * + (ncells + nghosts)); + thrust::universal_vector grid_new(ncells * ncells); + + // initialize grid + auto phi_old = thrust::raw_pointer_cast(grid_old.data()); + auto phi_new = thrust::raw_pointer_cast(grid_new.data()); + + Timer timer; + + // scheduler from gpu + nvexec::multi_gpu_stream_context stream_ctx{}; + auto gpu = stream_ctx.get_scheduler(); + + auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), + thrust::raw_pointer_cast(dx.data()) + dx.size()}; + auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; + auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; + auto phi_old_extent = ncells + nghosts; + + int gsize = ncells * ncells; + auto heat_eq_init = ex::transfer_just(gpu, dx_span, phi_old_span) | + ex::bulk(gsize, [=](int pos, auto ds, auto phi) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + Real_t x = pos(i, ghost_cells, ds[0]); + Real_t y = pos(j, ghost_cells, ds[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi[(i)*phi_old_extent + j] = 1 + exp(-r2); + }); + + ex::sync_wait(std::move(heat_eq_init)); + if (args.print_grid) + printGrid(phi_old, ncells + nghosts); + + auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static auto evolve = + tx | + ex::bulk(phi_old_extent - nghosts, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = pos + ghost_cells; + int len = phi_old_extent; + // fill boundary cells in old_phi + phi_old[i] = phi_old[i + (ghost_cells * len)]; + phi_old[i + (len * (len - ghost_cells))] = + phi_old[i + (len * (len - ghost_cells - 1))]; + phi_old[i * len] = phi_old[(ghost_cells * len) + i]; + phi_old[(len - ghost_cells) + (len * i)] = + phi_old[(len - ghost_cells - 1) + (len * i)]; + }) | + ex::bulk(gsize, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new[(i - 1) * ncells + j - 1] = + phi_old[(i)*phi_old_extent + j] + + alpha * dt * + ((phi_old[(i + 1) * phi_old_extent + j] - + 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i - 1) * phi_old_extent + j]) / + (ds[0] * ds[0]) + + (phi_old[(i)*phi_old_extent + j + 1] 
- + 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i)*phi_old_extent + j - 1]) / + (ds[1] * ds[1])); + }) | + ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; }); - ex::sync_wait(std::move(heat_eq_init)); + ex::sync_wait(std::move(evolve)); + + // update the simulation time + time += dt; + } + + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + auto finalize = ex::then(ex::just(), [&]() { if (args.print_grid) - printGrid(phi_old, ncells + nghosts); - - auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static auto evolve = - tx | - ex::bulk(phi_old_extent - nghosts, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = pos + ghost_cells; - int len = phi_old_extent; - // fill boundary cells in old_phi - phi_old[i] = phi_old[i + (ghost_cells * len)]; - phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; - phi_old[i * len] = phi_old[(ghost_cells * len) + i]; - phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; - }) | - ex::bulk(gsize, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new[(i - 1) * ncells + j - 1] = - phi_old[(i)*phi_old_extent + j] + - alpha * dt * - ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i - 1) * phi_old_extent + j]) / - (ds[0] * ds[0]) + - (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i)*phi_old_extent + j - 1]) / - (ds[1] * ds[1])); - }) | - ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; - }); - - ex::sync_wait(std::move(evolve)); - - // update the simulation time - time += dt; - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - auto finalize = ex::then(ex::just(), [&]() { - if (args.print_grid) - // print the final grid - printGrid(phi_new, ncells); - }); - - // end the simulation - ex::sync_wait(std::move(finalize)); + // print the final grid + printGrid(phi_new, ncells); + }); - return 0; + // end the simulation + ex::sync_wait(std::move(finalize)); + + return 0; } \ No newline at end of file diff --git a/apps/heat-equation/heat-equation-omp.cpp b/apps/heat-equation/heat-equation-omp.cpp index ebf89e2..6af69b0 100644 --- a/apps/heat-equation/heat-equation-omp.cpp +++ b/apps/heat-equation/heat-equation-omp.cpp @@ -33,126 +33,134 @@ // fill boundary cells OpenMP template -void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, int ghost_cells = 1) { +void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, + int ghost_cells = 1) { #pragma omp parallel for num_threads(nthreads) - for (int i = ghost_cells; i < len - ghost_cells; i++) { - grid[i] = grid[i + (ghost_cells * len)]; - grid[i + (len * (len - ghost_cells))] = grid[i + (len * (len - ghost_cells - 1))]; - - grid[i * len] = grid[(ghost_cells * len) + i]; - grid[(len - ghost_cells) + (len * i)] = grid[(len - ghost_cells - 1) + (len * i)]; - } + for (int i = 
ghost_cells; i < len - ghost_cells; i++) { + grid[i] = grid[i + (ghost_cells * len)]; + grid[i + (len * (len - ghost_cells))] = + grid[i + (len * (len - ghost_cells - 1))]; + + grid[i * len] = grid[(ghost_cells * len) + i]; + grid[(len - ghost_cells) + (len * i)] = + grid[(len - ghost_cells - 1) + (len * i)]; + } } // // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - int nthreads = args.nthreads; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + int nthreads = args.nthreads; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = std::mdspan(grid_new, ncells, ncells); + auto phi_old = std::mdspan( + grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = + std::mdspan(grid_new, ncells, ncells); - int gsize = ncells * ncells; + int gsize = ncells * ncells; - Timer timer; + Timer timer; - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] #pragma omp parallel for num_threads(nthreads) - for (int pos = 0; pos < gsize; pos++) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); + for (int pos = 0; pos < gsize; pos++) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - } + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + } - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); - // init simulation time - Real_t time = 0.0; + // init simulation time + Real_t time = 0.0; - // evolve the system - for (auto step = 0; step < nsteps; step++) { - // fill boundary cells in old_phi - fill2Dboundaries_omp(grid_old, ncells + nghosts, ghost_cells, nthreads); + // evolve the system + for (auto step = 0; step < nsteps; step++) { + // fill boundary cells in 
old_phi + fill2Dboundaries_omp(grid_old, ncells + nghosts, ghost_cells, nthreads); #pragma omp parallel for num_threads(nthreads) - for (int pos = 0; pos < gsize; pos++) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); - } + for (int pos = 0; pos < gsize; pos++) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / + (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / + (dx[1] * dx[1])); + } - // update the simulation time - time += dt; + // update the simulation time + time += dt; - // parallel copy phi_new to phi_old + // parallel copy phi_new to phi_old #pragma omp parallel for num_threads(nthreads) - for (int pos = 0; pos < gsize; pos++) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); + for (int pos = 0; pos < gsize; pos++) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - } + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); } + } - auto elapsed = timer.stop(); + auto elapsed = timer.stop(); - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); - // delete all memory - delete[] grid_old; - delete[] grid_new; + // delete all memory + delete[] grid_old; + delete[] grid_new; - grid_old = nullptr; - grid_new = nullptr; + grid_old = nullptr; + grid_new = nullptr; - return 0; + return 0; } diff --git a/apps/heat-equation/heat-equation-stdpar-senders.cpp b/apps/heat-equation/heat-equation-stdpar-senders.cpp index 5209f37..f83b113 100644 --- a/apps/heat-equation/heat-equation-stdpar-senders.cpp +++ b/apps/heat-equation/heat-equation-stdpar-senders.cpp @@ -45,156 +45,166 @@ using stdexec::sync_wait; // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - // number of parallel tiles - int ntiles = args.ntiles; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // init simulation time - Real_t time = 0.0; - - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - - auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = std::mdspan(grid_new, ncells, ncells); - - Timer timer; - - // scheduler from a thread pool - exec::static_thread_pool ctx{ntiles}; - - scheduler auto sch = ctx.get_scheduler(); - sender auto begin = 
schedule(sch); - - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] - sender auto heat_eq_init = - bulk(begin, ntiles, - [&](int tile) { - int start = tile * (ncells * ncells) / ntiles; - int size = (ncells * ncells) / ntiles; - int remaining = (ncells * ncells) % ntiles; - size += (tile == ntiles - 1) ? remaining : 0; + // parse params + heat_params_t args = argparse::parse(argc, argv); - std::for_each_n(std::execution::par_unseq, counting_iterator(start), size, [=](int pos) { + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + // number of parallel tiles + int ntiles = args.ntiles; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // init simulation time + Real_t time = 0.0; + + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + + auto phi_old = std::mdspan( + grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = + std::mdspan(grid_new, ncells, ncells); + + Timer timer; + + // scheduler from a thread pool + exec::static_thread_pool ctx{ntiles}; + + scheduler auto sch = ctx.get_scheduler(); + sender auto begin = schedule(sch); + + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + sender auto heat_eq_init = + bulk(begin, ntiles, + [&](int tile) { + int start = tile * (ncells * ncells) / ntiles; + int size = (ncells * ncells) / ntiles; + int remaining = (ncells * ncells) % ntiles; + size += (tile == ntiles - 1) ? remaining : 0; + + std::for_each_n(std::execution::par_unseq, + counting_iterator(start), size, [=](int pos) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + }); + }) | + then([&]() { + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); + }); + + // start the simulation + sync_wait(std::move(heat_eq_init)); + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static sender auto evolve = + then(begin, + [&]() { + // fill boundary cells in old_phi + fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); + }) | + bulk(ntiles, + [&](int tile) { + int start = tile * (ncells * ncells) / ntiles; + int size = (ncells * ncells) / ntiles; + int remaining = (ncells * ncells) % ntiles; + size += (tile == ntiles - 1) ? 
remaining : 0; + + // update phi_new with stencil + std::for_each_n( + std::execution::par_unseq, counting_iterator(start), size, + [=](int pos) { int i = 1 + (pos / ncells); int j = 1 + (pos % ncells); - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - }); + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + + phi_old(i - 1, j)) / + (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + + phi_old(i, j - 1)) / + (dx[1] * dx[1])); + }); + }) | + bulk(ntiles, + [&](int tile) { + int start = tile * (ncells * ncells) / ntiles; + int size = (ncells * ncells) / ntiles; + int remaining = (ncells * ncells) % ntiles; + size += (tile == ntiles - 1) ? remaining : 0; + + // parallel copy phi_new to phi_old + std::for_each_n(std::execution::par_unseq, + counting_iterator(start), size, [=](int pos) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + }); }) | then([&]() { - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); + // update the simulation time + time += dt; }); - // start the simulation - sync_wait(std::move(heat_eq_init)); - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static sender auto evolve = - then(begin, - [&]() { - // fill boundary cells in old_phi - fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); - }) | - bulk(ntiles, - [&](int tile) { - int start = tile * (ncells * ncells) / ntiles; - int size = (ncells * ncells) / ntiles; - int remaining = (ncells * ncells) % ntiles; - size += (tile == ntiles - 1) ? remaining : 0; - - // update phi_new with stencil - std::for_each_n(std::execution::par_unseq, counting_iterator(start), size, [=](int pos) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + - alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); - }); - }) | - bulk(ntiles, - [&](int tile) { - int start = tile * (ncells * ncells) / ntiles; - int size = (ncells * ncells) / ntiles; - int remaining = (ncells * ncells) % ntiles; - size += (tile == ntiles - 1) ? 
remaining : 0; - - // parallel copy phi_new to phi_old - std::for_each_n(std::execution::par_unseq, counting_iterator(start), size, [=](int pos) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - }); - }) | - then([&]() { - // update the simulation time - time += dt; - }); - - sync_wait(std::move(evolve)); - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - sender auto finalize = then(just(), - [&]() { - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); - }) | - then([&]() { - // delete all memory - delete[] grid_old; - delete[] grid_new; - - grid_old = nullptr; - grid_new = nullptr; - }); - - // start the simulation - sync_wait(std::move(finalize)); + sync_wait(std::move(evolve)); + } - return 0; + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + sender auto finalize = then(just(), + [&]() { + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); + }) | + then([&]() { + // delete all memory + delete[] grid_old; + delete[] grid_new; + + grid_old = nullptr; + grid_new = nullptr; + }); + + // start the simulation + sync_wait(std::move(finalize)); + + return 0; } diff --git a/apps/heat-equation/heat-equation-stdpar.cpp b/apps/heat-equation/heat-equation-stdpar.cpp index 164c482..b20fb68 100644 --- a/apps/heat-equation/heat-equation-stdpar.cpp +++ b/apps/heat-equation/heat-equation-stdpar.cpp @@ -34,107 +34,117 @@ // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - - auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = std::mdspan(grid_new, ncells, ncells); - - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] - - Timer timer; - - std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); - - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - }); - - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); - - // init simulation time - Real_t time = 0.0; - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - // fill boundary cells in old_phi - fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); - - // update phi_new with stencil - std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); - - // Jacobi iteration - phi_new(i - 
1, j - 1) = - phi_old(i, j) + alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); - }); - - // update the simulation time - time += dt; - - // parallel copy phi_new to phi_old - std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); - - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - }); - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); - - // delete all memory - delete[] grid_old; - delete[] grid_new; - - grid_old = nullptr; - grid_new = nullptr; + // parse params + heat_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + + auto phi_old = std::mdspan( + grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = + std::mdspan(grid_new, ncells, ncells); + + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + + Timer timer; + + std::for_each_n(std::execution::par_unseq, counting_iterator(0), + ncells * ncells, [=](int ind) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); + + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + }); + + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); + + // init simulation time + Real_t time = 0.0; + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + // fill boundary cells in old_phi + fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); + + // update phi_new with stencil + std::for_each_n(std::execution::par_unseq, counting_iterator(0), + ncells * ncells, [=](int ind) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); + + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + + phi_old(i - 1, j)) / + (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + + phi_old(i, j - 1)) / + (dx[1] * dx[1])); + }); + + // update the simulation time + time += dt; + + // parallel copy phi_new to phi_old + std::for_each_n(std::execution::par_unseq, counting_iterator(0), + ncells * ncells, [=](int ind) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); + + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + }); + } + + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); + + // delete all memory + delete[] 
grid_old; + delete[] grid_new; + + grid_old = nullptr; + grid_new = nullptr; + + return 0; } diff --git a/apps/heat-equation/heat-equation.hpp b/apps/heat-equation/heat-equation.hpp index 94bf8b6..a226a45 100644 --- a/apps/heat-equation/heat-equation.hpp +++ b/apps/heat-equation/heat-equation.hpp @@ -49,57 +49,62 @@ constexpr int nghosts = ghost_cells * dims; using view_2d = std::extents; // 3D view -using view_3d = std::extents; +using view_3d = std::extents; // macros to get x and y positions from indices #define pos(i, ghosts, dx) -0.5 + dx*(i - ghosts) // parameters struct heat_params_t : public argparse::Args { - int& ncells = kwarg("n,ncells", "number of cells on each side of the domain").set_default(32); - int& nsteps = kwarg("s,nsteps", "total steps in simulation").set_default(100); + int& ncells = kwarg("n,ncells", "number of cells on each side of the domain") + .set_default(32); + int& nsteps = kwarg("s,nsteps", "total steps in simulation").set_default(100); #if defined(HEQ_OMP) - int& nthreads = kwarg("nthreads", "number of threads").set_default(1); + int& nthreads = kwarg("nthreads", "number of threads").set_default(1); #endif // HEQ_OMP - Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f); - Real_t& dt = kwarg("t,dt", "time step").set_default(5.0e-5f); - bool& help = flag("h, help", "print help"); - bool& print_grid = flag("p,print", "print grids at step 0 and step n"); - bool& print_time = flag("time", "print simulation time"); + Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f); + Real_t& dt = kwarg("t,dt", "time step").set_default(5.0e-5f); + bool& help = flag("h, help", "print help"); + bool& print_grid = flag("p,print", "print grids at step 0 and step n"); + bool& print_time = flag("time", "print simulation time"); #if defined(TILING) - int& ntiles = kwarg("ntiles", "number of parallel tiles").set_default(4); + int& ntiles = kwarg("ntiles", "number of parallel tiles").set_default(4); #endif // TILING \ // future use if needed \ // int &max_grid_size = kwarg("g, max_grid_size", "size of each box (or - // grid)").set_default(32); bool &verbose = kwarg("v, verbose", "verbose - // mode").set_default(false); int &plot_int = kwarg("p, plot_int", "how often - // to write a plotfile").set_default(-1); + // grid)").set_default(32); bool &verbose = kwarg("v, verbose", "verbose + // mode").set_default(false); int &plot_int = kwarg("p, plot_int", "how often + // to write a plotfile").set_default(-1); }; template void printGrid(T* grid, int len) { - auto view = std::mdspan(grid, len, len); - std::cout << "Grid: " << std::endl; - std::cout << std::fixed << std::showpoint; - std::cout << std::setprecision(2); + auto view = std::mdspan(grid, len, len); + std::cout << "Grid: " << std::endl; + std::cout << std::fixed << std::showpoint; + std::cout << std::setprecision(2); - for (auto j = 0; j < view.extent(1); ++j) { - for (auto i = 0; i < view.extent(0); ++i) { - std::cout << view(i, j) << ", "; - } - std::cout << std::endl; + for (auto j = 0; j < view.extent(1); ++j) { + for (auto i = 0; i < view.extent(0); ++i) { + std::cout << view(i, j) << ", "; } std::cout << std::endl; + } + std::cout << std::endl; } // fill boundary cells template void fill2Dboundaries(T* grid, int len, int ghost_cells = 1) { - std::for_each_n(std::execution::par_unseq, counting_iterator(ghost_cells), len - nghosts, [=](auto i) { - grid[i] = grid[i + (ghost_cells * len)]; - grid[i + (len * (len - ghost_cells))] = grid[i + (len * (len - ghost_cells - 1))]; + 
std::for_each_n(std::execution::par_unseq, counting_iterator(ghost_cells), + len - nghosts, [=](auto i) { + grid[i] = grid[i + (ghost_cells * len)]; + grid[i + (len * (len - ghost_cells))] = + grid[i + (len * (len - ghost_cells - 1))]; - grid[i * len] = grid[(ghost_cells * len) + i]; - grid[(len - ghost_cells) + (len * i)] = grid[(len - ghost_cells - 1) + (len * i)]; - }); + grid[i * len] = grid[(ghost_cells * len) + i]; + grid[(len - ghost_cells) + (len * i)] = + grid[(len - ghost_cells - 1) + (len * i)]; + }); } \ No newline at end of file diff --git a/apps/mdspan-stdpar/mdspan-stdpar.cpp b/apps/mdspan-stdpar/mdspan-stdpar.cpp index 92dbdb9..9c92a87 100644 --- a/apps/mdspan-stdpar/mdspan-stdpar.cpp +++ b/apps/mdspan-stdpar/mdspan-stdpar.cpp @@ -30,51 +30,58 @@ using data_type = int; // 2D view -using extents_type = std::extents; +using extents_type = + std::extents; // 3D view (fix the first dimension to 2) -using extents_type2 = std::extents; +using extents_type2 = + std::extents; int main() { - constexpr int N = 1e9; - std::vector v(N); + constexpr int N = 1e9; + std::vector v(N); - // View data as contiguous memory representing 2 rows of 6 ints each - auto ms2 = std::mdspan(v.data(), N / 2, 2); - // View the same data as a 3D array 2 (fixed above) x 3 x 2 - auto ms3 = std::mdspan(v.data(), N / 4, 2); + // View data as contiguous memory representing 2 rows of 6 ints each + auto ms2 = std::mdspan(v.data(), + N / 2, 2); + // View the same data as a 3D array 2 (fixed above) x 3 x 2 + auto ms3 = std::mdspan(v.data(), + N / 4, 2); - // auto dim2 = [=](int i){int i1 = i/ms2.extent(1); int i2 = i%ms2.extent(1); - // return std::make_tuple(i1, i2);}; auto dim3 = [=](int i){int i1 = - // i/(ms3.extent(1)*ms3.extent(2)); int i2 = (i/ms3.extent(2))%ms3.extent(1); - // int i3 = i%ms3.extent(2); return std::make_tuple(i1, i2, i3);}; + // auto dim2 = [=](int i){int i1 = i/ms2.extent(1); int i2 = i%ms2.extent(1); + // return std::make_tuple(i1, i2);}; auto dim3 = [=](int i){int i1 = + // i/(ms3.extent(1)*ms3.extent(2)); int i2 = (i/ms3.extent(2))%ms3.extent(1); + // int i3 = i%ms3.extent(2); return std::make_tuple(i1, i2, i3);}; - std::for_each(std::execution::par_unseq, ms2.data_handle(), ms2.data_handle() + ms2.size(), [=](int& i) { - auto global_idx = std::distance(ms2.data_handle(), &i); - dim2(global_idx, ms2); - // auto [i1, i2] = dim2(global_idx); - ms2(ii, ij) = global_idx; - }); + std::for_each(std::execution::par_unseq, ms2.data_handle(), + ms2.data_handle() + ms2.size(), [=](int& i) { + auto global_idx = std::distance(ms2.data_handle(), &i); + dim2(global_idx, ms2); + // auto [i1, i2] = dim2(global_idx); + ms2(ii, ij) = global_idx; + }); - std::cout << std::endl << std::endl; + std::cout << std::endl << std::endl; - std::for_each(std::execution::par_unseq, ms2.data_handle(), ms2.data_handle() + ms2.size(), [=](int& i) { - auto global_idx = std::distance(ms2.data_handle(), &i); - dim3(global_idx, ms3); - // auto [i1, i2, i3] = dim3(global_idx); - ms3(ii, ij, ik) = 1000 + global_idx; - }); + std::for_each(std::execution::par_unseq, ms2.data_handle(), + ms2.data_handle() + ms2.size(), [=](int& i) { + auto global_idx = std::distance(ms2.data_handle(), &i); + dim3(global_idx, ms3); + // auto [i1, i2, i3] = dim3(global_idx); + ms3(ii, ij, ik) = 1000 + global_idx; + }); - // read subset of data using 3D view - for (size_t i = 0; i < ms3.extent(0); i++) { - for (size_t j = 0; j < 10; j++) { - for (size_t k = 0; k < ms3.extent(2); k++) { - assert(ms3(i, j, k) == 1000 + i * ms3.extent(1) * 
ms3.extent(2) + j * ms3.extent(2) + k); - std::cout << ms3(i, j, k) << " "; - } - std::cout << std::endl; - } - std::cout << std::endl; + // read subset of data using 3D view + for (size_t i = 0; i < ms3.extent(0); i++) { + for (size_t j = 0; j < 10; j++) { + for (size_t k = 0; k < ms3.extent(2); k++) { + assert(ms3(i, j, k) == 1000 + i * ms3.extent(1) * ms3.extent(2) + + j * ms3.extent(2) + k); + std::cout << ms3(i, j, k) << " "; + } + std::cout << std::endl; } + std::cout << std::endl; + } - std::cout << ms3(0, 0, 1) << "\n"; + std::cout << ms3(0, 0, 1) << "\n"; } \ No newline at end of file diff --git a/include/commons.hpp b/include/commons.hpp index cfacfa1..c043a20 100644 --- a/include/commons.hpp +++ b/include/commons.hpp @@ -48,38 +48,43 @@ #include "counting_iterator.hpp" // get mdpsan 2d indices from 1d index -#define dim2(x, ms) \ - int ii = x / ms.extent(1); \ - int ij = x % ms.extent(1); +#define dim2(x, ms) \ + int ii = x / ms.extent(1); \ + int ij = x % ms.extent(1); // get mdspan 3d indices from 1d index -#define dim3(x, ms) \ - int ii = x / (ms3.extent(1) * ms.extent(2)); \ - int ij = (x / ms.extent(2)) % ms.extent(1); \ - int ik = x % ms.extent(2) +#define dim3(x, ms) \ + int ii = x / (ms3.extent(1) * ms.extent(2)); \ + int ij = (x / ms.extent(2)) % ms.extent(1); \ + int ik = x % ms.extent(2) class Timer { - public: - Timer() { start(); } + public: + Timer() { start(); } - ~Timer() { stop(); } + ~Timer() { stop(); } - void start() { start_time_point = std::chrono::high_resolution_clock::now(); } + void start() { start_time_point = std::chrono::high_resolution_clock::now(); } - double stop() { - end_time_point = std::chrono::high_resolution_clock::now(); - return duration(); - } + double stop() { + end_time_point = std::chrono::high_resolution_clock::now(); + return duration(); + } - double duration() { - auto start = - std::chrono::time_point_cast(start_time_point).time_since_epoch().count(); - auto end = std::chrono::time_point_cast(end_time_point).time_since_epoch().count(); - auto duration = end - start; - double ms = duration * 0.001; - return ms; - } + double duration() { + auto start = std::chrono::time_point_cast( + start_time_point) + .time_since_epoch() + .count(); + auto end = + std::chrono::time_point_cast(end_time_point) + .time_since_epoch() + .count(); + auto duration = end - start; + double ms = duration * 0.001; + return ms; + } - private: - std::chrono::time_point start_time_point; - std::chrono::time_point end_time_point; + private: + std::chrono::time_point start_time_point; + std::chrono::time_point end_time_point; }; diff --git a/include/counting_iterator.hpp b/include/counting_iterator.hpp index 09d0fa2..aae6a85 100644 --- a/include/counting_iterator.hpp +++ b/include/counting_iterator.hpp @@ -36,76 +36,96 @@ using Index_t = int32_t; struct counting_iterator { - private: - using self = counting_iterator; - - public: - using value_type = Index_t; - using difference_type = typename std::make_signed::type; - using pointer = Index_t*; - using reference = Index_t&; - using iterator_category = std::random_access_iterator_tag; - - counting_iterator() : value(0) {} - - explicit counting_iterator(value_type v) : value(v) {} - - value_type operator*() const { return value; } - - value_type operator[](difference_type n) const { return value + n; } - - self& operator++() { - ++value; - return *this; - } - - self operator++(int) { - self result{value}; - ++value; - return result; - } - - self& operator--() { - --value; - return *this; - } - - self operator--(int) 
{ - self result{value}; - --value; - return result; - } - - self& operator+=(difference_type n) { - value += n; - return *this; - } - - self& operator-=(difference_type n) { - value -= n; - return *this; - } - - friend self operator+(self const& i, difference_type n) { return self(i.value + n); } - - friend self operator+(difference_type n, self const& i) { return self(i.value + n); } - - friend difference_type operator-(self const& x, self const& y) { return x.value - y.value; } - - friend self operator-(self const& i, difference_type n) { return self(i.value - n); } - - friend bool operator==(self const& x, self const& y) { return x.value == y.value; } - - friend bool operator!=(self const& x, self const& y) { return x.value != y.value; } - - friend bool operator<(self const& x, self const& y) { return x.value < y.value; } - - friend bool operator<=(self const& x, self const& y) { return x.value <= y.value; } - - friend bool operator>(self const& x, self const& y) { return x.value > y.value; } - - friend bool operator>=(self const& x, self const& y) { return x.value >= y.value; } - - private: - value_type value; + private: + using self = counting_iterator; + + public: + using value_type = Index_t; + using difference_type = typename std::make_signed::type; + using pointer = Index_t*; + using reference = Index_t&; + using iterator_category = std::random_access_iterator_tag; + + counting_iterator() : value(0) {} + + explicit counting_iterator(value_type v) : value(v) {} + + value_type operator*() const { return value; } + + value_type operator[](difference_type n) const { return value + n; } + + self& operator++() { + ++value; + return *this; + } + + self operator++(int) { + self result{value}; + ++value; + return result; + } + + self& operator--() { + --value; + return *this; + } + + self operator--(int) { + self result{value}; + --value; + return result; + } + + self& operator+=(difference_type n) { + value += n; + return *this; + } + + self& operator-=(difference_type n) { + value -= n; + return *this; + } + + friend self operator+(self const& i, difference_type n) { + return self(i.value + n); + } + + friend self operator+(difference_type n, self const& i) { + return self(i.value + n); + } + + friend difference_type operator-(self const& x, self const& y) { + return x.value - y.value; + } + + friend self operator-(self const& i, difference_type n) { + return self(i.value - n); + } + + friend bool operator==(self const& x, self const& y) { + return x.value == y.value; + } + + friend bool operator!=(self const& x, self const& y) { + return x.value != y.value; + } + + friend bool operator<(self const& x, self const& y) { + return x.value < y.value; + } + + friend bool operator<=(self const& x, self const& y) { + return x.value <= y.value; + } + + friend bool operator>(self const& x, self const& y) { + return x.value > y.value; + } + + friend bool operator>=(self const& x, self const& y) { + return x.value >= y.value; + } + + private: + value_type value; }; \ No newline at end of file From 003698871603bc9c99bd7d28655e6c7e6ee26cf2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 01:33:17 -0700 Subject: [PATCH 20/20] removing stale log files --- apps/fft/fft-serial.cpp | 2 +- log-gcc.txt | 40 ---------------------- log.txt | 76 ----------------------------------------- 3 files changed, 1 insertion(+), 117 deletions(-) delete mode 100644 log-gcc.txt delete mode 100644 log.txt diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp index b174b5a..21d66f6 100644 --- 
a/apps/fft/fft-serial.cpp +++ b/apps/fft/fft-serial.cpp @@ -135,4 +135,4 @@ int main(int argc, char* argv[]) std::cout << "Elapsed Time: " << elapsed << " ms" << std::endl; return 0; -} \ No newline at end of file +} diff --git a/log-gcc.txt b/log-gcc.txt deleted file mode 100644 index 6d41374..0000000 --- a/log-gcc.txt +++ /dev/null @@ -1,40 +0,0 @@ -+ cd /global/homes/m/mhaseeb/repos/nvstdpar/build-gcc/apps/heat-equation -+ ./heat-equation-mdspan -s=50 -n=30000 --time -+ tee gcc-md.txt -Time: 155095 ms -+ T=(128 64 32 16 8 4 2 1) -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=128 -+ tee gcc-omp-128.txt -Time: 15310.8 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=64 -+ tee gcc-omp-64.txt -Time: 15362.4 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=32 -+ tee gcc-omp-32.txt -Time: 15631.2 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=16 -+ tee gcc-omp-16.txt -Time: 18824.7 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=8 -+ tee gcc-omp-8.txt -Time: 30255 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=4 -+ tee gcc-omp-4.txt -Time: 56973.2 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=2 -+ tee gcc-omp-2.txt -Time: 117583 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=1 -+ tee gcc-omp-1.txt -Time: 231557 ms -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee gcc-stdpar-1.txt -Time: 15924.2 ms \ No newline at end of file diff --git a/log.txt b/log.txt deleted file mode 100644 index ed41625..0000000 --- a/log.txt +++ /dev/null @@ -1,76 +0,0 @@ -+ cd /global/homes/m/mhaseeb/repos/nvstdpar/build/apps/heat-equation -+ ./heat-equation-mdspan -s=50 -n=30000 --time -+ tee md.txt -Time: 72373.3 ms -+ T=(1 2 4 8 16 32 64) -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=1 -+ OMP_NUM_THREADS=1 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-1.txt -Time: 704823 ms -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=2 -+ OMP_NUM_THREADS=2 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-2.txt -Time: 352537 ms -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=4 -+ OMP_NUM_THREADS=4 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-4.txt -Time: 179607 ms -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=8 -+ OMP_NUM_THREADS=8 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-8.txt -Time: 91341.8 ms -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=16 -+ OMP_NUM_THREADS=16 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-16.txt -Time: 45602.9 ms -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=32 -+ OMP_NUM_THREADS=32 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-32.txt -Time: 24956.7 ms -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=64 -+ OMP_NUM_THREADS=64 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-64.txt -Time: 12437.9 ms -+ unset OMP_NUM_THREADS -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=1 -+ tee omp-1.txt -Time: 258170 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=2 -+ tee omp-2.txt -Time: 129542 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=4 -+ tee omp-4.txt -Time: 65776.1 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=8 -+ tee omp-8.txt -Time: 32570 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 
-n=30000 --time --nthreads=16 -+ tee omp-16.txt -Time: 16814.6 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=32 -+ tee omp-32.txt -Time: 11322.6 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=64 -+ tee omp-64.txt -Time: 15135.6 ms