From 37cc6bb15641764b10f3f9b75bcad75c5d940703 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 14 Sep 2023 16:33:12 -0700 Subject: [PATCH 01/20] fft initial stuff --- .clang-format | 4 +- apps/CMakeLists.txt | 3 + apps/fft/CMakeLists.txt | 40 +++++++++ apps/fft/fft-serial-1d.cpp | 31 +++++++ apps/fft/fft.hpp | 176 +++++++++++++++++++++++++++++++++++++ log-gcc.txt | 40 +++++++++ log.txt | 76 ++++++++++++++++ 7 files changed, 368 insertions(+), 2 deletions(-) create mode 100644 apps/fft/CMakeLists.txt create mode 100644 apps/fft/fft-serial-1d.cpp create mode 100644 apps/fft/fft.hpp create mode 100644 log-gcc.txt create mode 100644 log.txt diff --git a/.clang-format b/.clang-format index 335a74a..dd3a3c5 100644 --- a/.clang-format +++ b/.clang-format @@ -42,7 +42,7 @@ BreakBeforeBinaryOperators: None BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeColon BreakInheritanceList: BeforeColon -ColumnLimit: 80 +ColumnLimit: 120 CompactNamespaces: false ContinuationIndentWidth: 4 Cpp11BracedListStyle: true @@ -52,7 +52,7 @@ FixNamespaceComments: true IncludeBlocks: Preserve IndentCaseLabels: true IndentPPDirectives: None -IndentWidth: 2 +IndentWidth: 4 KeepEmptyLinesAtTheStartOfBlocks: true MaxEmptyLinesToKeep: 1 NamespaceIndentation: None diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 7fc4bfd..daba52b 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -21,3 +21,6 @@ add_subdirectory(mdspan-stdpar) message(STATUS "Adding 1d_stencil_stdpar...") add_subdirectory(1d_stencil) + +message(STATUS "Adding fft...") +add_subdirectory(fft) diff --git a/apps/fft/CMakeLists.txt b/apps/fft/CMakeLists.txt new file mode 100644 index 0000000..e957d46 --- /dev/null +++ b/apps/fft/CMakeLists.txt @@ -0,0 +1,40 @@ +project(fft LANGUAGES CXX) + +file(GLOB CPP_SOURCES "*.cpp") + +foreach(source_file ${CPP_SOURCES}) + if(NOT STDPAR STREQUAL "gpu") + if("${source_file}" MATCHES ".*gpu.*scheduler.*" OR "${source_file}" + MATCHES ".*cuda.*") + message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}") + continue() + endif() + endif() + + # get the file name without an extension + get_filename_component(exec_name ${source_file} NAME_WE) + + # add an executable with the same name as the source file + add_executable(${exec_name} ${_EXCLUDE} ${source_file}) + + # add dependency on argparse + add_dependencies(${exec_name} argparse magic_enum) + + set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX + LINKER_LANGUAGE CXX) + target_include_directories( + ${exec_name} + PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include + ${ARGPARSE_INCLUDE_DIR} ${MAGICENUM_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + + target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec) + + set_target_properties( + ${exec_name} + PROPERTIES CXX_STANDARD ${CXX_STANDARD} + CXX_EXTENSIONS NO + INSTALL_RPATH_USE_LINK_PATH ON) + + # installation + install(TARGETS ${exec_name} DESTINATION ${CMAKE_INSTALL_BINDIR}) +endforeach() diff --git a/apps/fft/fft-serial-1d.cpp b/apps/fft/fft-serial-1d.cpp new file mode 100644 index 0000000..70456e2 --- /dev/null +++ b/apps/fft/fft-serial-1d.cpp @@ -0,0 +1,31 @@ +/* + * MIT License + * + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * commons for the fft codes + */ + +#include "fft.hpp" \ No newline at end of file diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp new file mode 100644 index 0000000..7d5da98 --- /dev/null +++ b/apps/fft/fft.hpp @@ -0,0 +1,176 @@ +/* + * MIT License + * + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/* + * commons for the fft codes + */ + +#pragma once + +#include +#include + +#include "argparse/argparse.hpp" +#include "commons.hpp" + +using namespace std::complex_literals; + +// data type +using Real_t = double; +using data_t = std::complex; + +// number of dimensions +constexpr int dims = 1; + +// 1D view +using view_1d = std::extents; + +// 2D view +using view_2d = std::extents; + +// 3D view +using view_3d = std::extents; + +enum class fft_type { fftw, cufft }; +enum class sig_type { square, sinusoid, sawtooth, triangle, sinc, box }; + +using sig_type_t = sig_type; + +// parameters +struct fft_params_t : public argparse::Args { + sig_type_t& sig = kwarg("sig", "input signal type(square, sinusoid, sawtooth, triangle, box)").set_default(signal_type::box); + int& len = kwarg("n,N", "N-point FFT").set_default(1<<20); + bool& print_fft = flag("p,print", "print Fourier transformed signal"); + +#if defined(USE_OMP) + int& nthreads = kwarg("nthreads", "number of threads").set_default(1); +#endif // USE_OMP + + bool& help = flag("h, help", "print help"); + bool& print_time = flag("t,time", "print transform time"); +}; + +void printSignal(data_t* sig, int N) { + std::cout << std::fixed << std::setprecision(1); + + for (int i = 0; i < N; ++i) + std::cout << sig[i] << " "; + + std::cout << std::endl; +} + +class signal +{ +public: + + signal() + { + this->N = 1e3; + t.resize(this->N); + y.resize(this->N); + dt = 1.0 / this->N; + } + + signal(int _N) + { + if (_N <= 0) + { + std::cerr << "FATAL: N must be greater than 0. exiting.." << std::endl; + exit(1); + } + this->N = _N; + t.resize(this->N); + y.resize(this->N); + dt = 1.0 / this->N; + } + + signal(int N, sig_type type=sig_type::box) + { + if (N <= 0) + { + std::cerr << "FATAL: N must be greater than 0. exiting.." << std::endl; + exit(1); + } + + this->N = N; + t.resize(N); + y.resize(N); + dt = 1.0 / N; + signalGenerator(N, type); + } + + void signalGenerator(int N, sig_type type=sig_type::box) + { + int interval = 1/N; + std::vector t(N); + + switch (type) { + case sig_type::square: + for (int i = 0; i < N; ++i) + y[i] = (i < N / 4 || i > 3 * N/4) ? 1.0 : -1.0; + break; + case sig_type::sinusoid: + for (int i = 0; i < N; ++i) + y[i] = std::sin(2.0 * M_PI * i / N); + break; + case sig_type::sawtooth: + for (int i = 0; i < N; ++i) + y[i] = 2.0 * (i / N) - 1.0; + break; + case sig_type::triangle: + for (int i = 0; i < N; ++i) + y[i] = 2.0 * std::abs(2.0 * (i / N) - 1.0) - 1.0; + break; + case sig_type::sinc: + y[0] = 1.0; + for (int i = 1; i < N; ++i) + y[i] = std::sin(2.0 * M_PI * i / N) / (2.0 * M_PI * i / N); + break; + case sig_type::box: + for (int i = 0; i < N; ++i) + y[i] = (i < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; + break; + default: + std::cerr << "FATAL: Unknown signal type. exiting.." 
<< std::endl; + exit(1); + } + } + + ~signal() + { + y.clear(); + t.clear(); + } + +private: + int N; + Real_t dt; + // time axis + std::vector t; + // y(t) axis + std::vector y; +}; \ No newline at end of file diff --git a/log-gcc.txt b/log-gcc.txt new file mode 100644 index 0000000..6d41374 --- /dev/null +++ b/log-gcc.txt @@ -0,0 +1,40 @@ ++ cd /global/homes/m/mhaseeb/repos/nvstdpar/build-gcc/apps/heat-equation ++ ./heat-equation-mdspan -s=50 -n=30000 --time ++ tee gcc-md.txt +Time: 155095 ms ++ T=(128 64 32 16 8 4 2 1) ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=128 ++ tee gcc-omp-128.txt +Time: 15310.8 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=64 ++ tee gcc-omp-64.txt +Time: 15362.4 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=32 ++ tee gcc-omp-32.txt +Time: 15631.2 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=16 ++ tee gcc-omp-16.txt +Time: 18824.7 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=8 ++ tee gcc-omp-8.txt +Time: 30255 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=4 ++ tee gcc-omp-4.txt +Time: 56973.2 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=2 ++ tee gcc-omp-2.txt +Time: 117583 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=1 ++ tee gcc-omp-1.txt +Time: 231557 ms ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee gcc-stdpar-1.txt +Time: 15924.2 ms \ No newline at end of file diff --git a/log.txt b/log.txt new file mode 100644 index 0000000..ed41625 --- /dev/null +++ b/log.txt @@ -0,0 +1,76 @@ ++ cd /global/homes/m/mhaseeb/repos/nvstdpar/build/apps/heat-equation ++ ./heat-equation-mdspan -s=50 -n=30000 --time ++ tee md.txt +Time: 72373.3 ms ++ T=(1 2 4 8 16 32 64) ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=1 ++ OMP_NUM_THREADS=1 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-1.txt +Time: 704823 ms ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=2 ++ OMP_NUM_THREADS=2 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-2.txt +Time: 352537 ms ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=4 ++ OMP_NUM_THREADS=4 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-4.txt +Time: 179607 ms ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=8 ++ OMP_NUM_THREADS=8 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-8.txt +Time: 91341.8 ms ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=16 ++ OMP_NUM_THREADS=16 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-16.txt +Time: 45602.9 ms ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=32 ++ OMP_NUM_THREADS=32 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-32.txt +Time: 24956.7 ms ++ for i in "${T[@]}" ++ export OMP_NUM_THREADS=64 ++ OMP_NUM_THREADS=64 ++ ./heat-equation-stdpar -s=50 -n=30000 --time ++ tee stdpar-64.txt +Time: 12437.9 ms ++ unset OMP_NUM_THREADS ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=1 ++ tee omp-1.txt +Time: 258170 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=2 ++ tee omp-2.txt +Time: 129542 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=4 ++ tee omp-4.txt +Time: 65776.1 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=8 ++ tee omp-8.txt +Time: 32570 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time 
--nthreads=16 ++ tee omp-16.txt +Time: 16814.6 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=32 ++ tee omp-32.txt +Time: 11322.6 ms ++ for i in "${T[@]}" ++ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=64 ++ tee omp-64.txt +Time: 15135.6 ms From da68278dcddfefcb14903c78c32bf775dcbe7650 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 27 Sep 2023 13:56:24 -0700 Subject: [PATCH 02/20] adding magic_enum for argparse --- .gitmodules | 3 +++ apps/fft/fft-serial-1d.cpp | 9 ++++++++- apps/fft/fft.hpp | 7 ++++--- externals/CMakeLists.txt | 7 +++++++ externals/magic_enum | 1 + 5 files changed, 23 insertions(+), 4 deletions(-) create mode 160000 externals/magic_enum diff --git a/.gitmodules b/.gitmodules index 2bb4aed..b8f3f6d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "externals/argparse"] path = externals/argparse url = https://github.com/mhaseeb123/argparse +[submodule "externals/magic_enum"] + path = externals/magic_enum + url = https://github.com/mhaseeb123/magic_enum diff --git a/apps/fft/fft-serial-1d.cpp b/apps/fft/fft-serial-1d.cpp index 70456e2..6c73237 100644 --- a/apps/fft/fft-serial-1d.cpp +++ b/apps/fft/fft-serial-1d.cpp @@ -28,4 +28,11 @@ * commons for the fft codes */ -#include "fft.hpp" \ No newline at end of file +#include "fft.hpp" + +// +// simulation +// +int main(int argc, char* argv[]) { + return 0; +} diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 7d5da98..bc411d5 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -62,8 +62,9 @@ using sig_type_t = sig_type; // parameters struct fft_params_t : public argparse::Args { - sig_type_t& sig = kwarg("sig", "input signal type(square, sinusoid, sawtooth, triangle, box)").set_default(signal_type::box); - int& len = kwarg("n,N", "N-point FFT").set_default(1<<20); + sig_type_t& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(sig_type_t::box); + int& freq = kwarg("f,freq", "Signal frequency").set_default(1000); + int& len = kwarg("n,N", "N-point FFT").set_default(1<<16); bool& print_fft = flag("p,print", "print Fourier transformed signal"); #if defined(USE_OMP) @@ -152,7 +153,7 @@ class signal break; case sig_type::box: for (int i = 0; i < N; ++i) - y[i] = (i < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; + y[i] = (i < N / 4 || i > 3 * N / 4) ? 1.0 : 0.0; break; default: std::cerr << "FATAL: Unknown signal type. exiting.." 
<< std::endl; diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 8c6216c..b6e9828 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -156,6 +156,9 @@ add_subdirectory(mdspan) message(STATUS "Adding externals/argparse...") add_subdirectory(argparse) +message(STATUS "Adding externals/magic_enum...") +add_subdirectory(magic_enum) + set(MDSPAN_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/mdspan/include CACHE PATH "mdspan include directory") @@ -163,3 +166,7 @@ set(MDSPAN_INCLUDE_DIR set(ARGPARSE_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/argparse/include CACHE PATH "argparse include directory") + +set(MAGICENUM_INCLUDE_DIR + ${CMAKE_CURRENT_SOURCE_DIR}/magic_enum/include + CACHE PATH "magic_enum include directory") \ No newline at end of file diff --git a/externals/magic_enum b/externals/magic_enum new file mode 160000 index 0000000..b291b0c --- /dev/null +++ b/externals/magic_enum @@ -0,0 +1 @@ +Subproject commit b291b0ce5a76e808e05fc0141154e963407372da From 814fd74afb854e5a99f7cb655fc3e53c0f392d18 Mon Sep 17 00:00:00 2001 From: Chuanqiu He <49005493+hcq9102@users.noreply.github.com> Date: Fri, 22 Sep 2023 00:59:57 -0500 Subject: [PATCH 03/20] choleskey code serial/mdspan/stadpar (#26) * choleskey serial and choleskey_stdpar --- apps/CMakeLists.txt | 9 +++ apps/choleskey/CMakeLists.txt | 13 ++++ apps/choleskey/choleskey_serial.cpp | 101 +++++++++++++++++++++++++ apps/choleskey/choleskey_stdpar.cpp | 113 ++++++++++++++++++++++++++++ apps/choleskey/matrixutil.hpp | 41 ++++++++++ 5 files changed, 277 insertions(+) create mode 100644 apps/choleskey/CMakeLists.txt create mode 100644 apps/choleskey/choleskey_serial.cpp create mode 100644 apps/choleskey/choleskey_stdpar.cpp create mode 100644 apps/choleskey/matrixutil.hpp diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index daba52b..1bf106a 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -22,5 +22,14 @@ add_subdirectory(mdspan-stdpar) message(STATUS "Adding 1d_stencil_stdpar...") add_subdirectory(1d_stencil) +# ----------------------------------------------------------------------------------------# +# Add choleskey demo +# ----------------------------------------------------------------------------------------# +message(STATUS "Adding choleskey example...") +add_subdirectory(choleskey) + +# ----------------------------------------------------------------------------------------# +# Add fft demo +# ----------------------------------------------------------------------------------------# message(STATUS "Adding fft...") add_subdirectory(fft) diff --git a/apps/choleskey/CMakeLists.txt b/apps/choleskey/CMakeLists.txt new file mode 100644 index 0000000..bfa1a7a --- /dev/null +++ b/apps/choleskey/CMakeLists.txt @@ -0,0 +1,13 @@ +project(choleskey_stdpar LANGUAGES CXX) + +add_executable(choleskey_serial choleskey_serial.cpp) +target_include_directories( + choleskey_serial + PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include + ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + +add_executable(choleskey_stdpar choleskey_stdpar.cpp) +target_include_directories( + choleskey_stdpar + PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include + ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) diff --git a/apps/choleskey/choleskey_serial.cpp b/apps/choleskey/choleskey_serial.cpp new file mode 100644 index 0000000..ea181d0 --- /dev/null +++ b/apps/choleskey/choleskey_serial.cpp @@ -0,0 +1,101 @@ +// Cholesky Decomposition: mdspan +#include +#include +#include +#include 
"argparse/argparse.hpp" +#include "commons.hpp" +#include "matrixutil.hpp" + +using namespace std; + +struct solver { + + using view_2d = std::extents; + + typedef std::mdspan matrix_ms_t; + + template + matrix_ms_t Cholesky_Decomposition(std::vector& vec, int n) { + std::vector lower(n * n, 0); + + auto matrix_ms = + std::mdspan(vec.data(), n, n); + auto lower_ms = + std::mdspan(lower.data(), n, n); + + // Decomposing a matrix into Lower Triangular + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; + + if (j == i) { + // summation for diagonals + for (int k = 0; k < j; k++) + sum += pow(lower_ms(j, k), 2); + lower_ms(j, j) = sqrt(matrix_ms(i, j) - sum); + } else { + // Evaluating L(i, j) using L(j, j) + for (int k = 0; k < j; k++) + sum += (lower_ms(i, k) * lower_ms(j, k)); + lower_ms(i, j) = (matrix_ms(i, j) - sum) / lower_ms(j, j); + } + } + } + return lower_ms; + } +}; + +/////////////////////////////////////////////////////////////////////////////// +int benchmark(args_params_t const& args) { + + std::uint64_t nd = args.nd; // Number of matrix dimension. + + std::vector inputMatrix = generate_pascal_matrix(nd); + + // Create the solverobject + solver solve; + // Measure execution time. + Timer timer; + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix(i, j) << "\t"; + cout << "\t"; + + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix(j, i) << "\t"; + cout << endl; + } + } + + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } + + return 0; +} + +// Driver Code for testing +int main(int argc, char* argv[]) { + + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + benchmark(args); + + return 0; +} diff --git a/apps/choleskey/choleskey_stdpar.cpp b/apps/choleskey/choleskey_stdpar.cpp new file mode 100644 index 0000000..abbe02d --- /dev/null +++ b/apps/choleskey/choleskey_stdpar.cpp @@ -0,0 +1,113 @@ +// Cholesky Decomposition: stdpar +#include "argparse/argparse.hpp" +#include "commons.hpp" + +#include +#include +#include +#include +#include +#include + +#include "matrixutil.hpp" + +using namespace std; + +struct solver { + + using view_2d = std::extents; + + template + std::vector> Cholesky_Decomposition(std::vector& vec, + int n) { + std::vector> lower(n, std::vector(n, 0)); + + auto matrix_ms = + std::mdspan(vec.data(), n, n); + + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; + + // Decomposing a matrix into Lower Triangular + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; + + if (j == i) // summation for diagonals + { + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), + lower[j].cbegin() + j, 0, std::plus{}, + [=](int val) { return val * val; }); + + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); + + } else { // Evaluating L(i, j) using L(j, j) + + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), + lower[j].cbegin() + j, lower[i].cbegin(), + 0, std::plus<>(), multiplier_lambda); + + lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; + } + } + } + return lower; + } +}; + +/////////////////////////////////////////////////////////////////////////////// +int benchmark(args_params_t const& args) { + + std::uint64_t nd = args.nd; // Number of matrix dimension. + + std::vector inputMatrix = generate_pascal_matrix(nd); + + // Create the solver object + solver solve; + // Measure execution time. + Timer timer; + + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; + + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; + } + } + + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } + + return 0; +} + +// Driver Code for testing +int main(int argc, char* argv[]) { + + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + benchmark(args); + + return 0; +} diff --git a/apps/choleskey/matrixutil.hpp b/apps/choleskey/matrixutil.hpp new file mode 100644 index 0000000..46206c3 --- /dev/null +++ b/apps/choleskey/matrixutil.hpp @@ -0,0 +1,41 @@ +#pragma once + +#include +#include + +// generate positive definition matrix +template +using Matrix = std::vector>; + +template +std::vector generate_pascal_matrix(const int n) { + Matrix matrix(n, std::vector(n, static_cast(0))); + + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + if (i == 0 || j == 0) { + matrix[i][j] = static_cast(1); + } else { + matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; + } + } + } + + std::vector flattenedVector; + for (const auto& row : matrix) { + flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); + } + return std::move(flattenedVector); +} + +// parameters define +struct args_params_t : public argparse::Args { + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(true); + std::uint64_t& nd = + kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)") + .set_default(10); + + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); +}; From 3630384363207bd76bb275e7e2fe6f8372a11cd3 Mon Sep 17 00:00:00 2001 From: Chuanqiu He Date: Tue, 26 Sep 2023 09:05:33 -0700 Subject: [PATCH 04/20] sender_choleskey_sync_wait_issue --- apps/choleskey/CMakeLists.txt | 8 ++ apps/choleskey/choleskey_stdpar_snd.cpp | 142 ++++++++++++++++++++++++ apps/choleskey/matrixutil.hpp | 2 +- 3 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 apps/choleskey/choleskey_stdpar_snd.cpp diff --git a/apps/choleskey/CMakeLists.txt b/apps/choleskey/CMakeLists.txt index bfa1a7a..a5eae85 100644 --- a/apps/choleskey/CMakeLists.txt +++ b/apps/choleskey/CMakeLists.txt @@ -7,7 +7,15 @@ target_include_directories( ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) add_executable(choleskey_stdpar choleskey_stdpar.cpp) +target_link_libraries(choleskey_stdpar stdexec) target_include_directories( choleskey_stdpar PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + +add_executable(choleskey_stdpar_snd choleskey_stdpar_snd.cpp) +target_link_libraries(choleskey_stdpar_snd stdexec) +target_include_directories( + choleskey_stdpar_snd + PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include + ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp new file mode 100644 index 0000000..36a64fd --- /dev/null +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -0,0 +1,142 @@ +// Cholesky Decomposition: stdpar-->sender +#include "argparse/argparse.hpp" +#include "commons.hpp" + +#include +#include +#include +#include +#include +#include +#include "exec/static_thread_pool.hpp" + +#include "matrixutil.hpp" +using namespace stdexec; +using stdexec::sync_wait; + +using namespace std; + +struct solver { + + using view_2d = std::extents; + + template + std::vector> Cholesky_Decomposition(std::vector& vec, + int n) { + + // test here first, scheduler from a thread pool + exec::static_thread_pool pool(n); + stdexec::scheduler 
auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + std::vector> lower(n, std::vector(n, 0)); + + auto matrix_ms = + std::mdspan(vec.data(), n, n); + + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; + + int np = 4; // default number of parallel sec, will be an option + + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; + + if (j == i) // summation for diagonals + { + auto send1 = + just(std::move(sum)) | + bulk(np, + [&](int piece) { + int start = piece * (n / 2 + 1) / np; + int size = (n / 2 + 1) / np; // partition size + int remaining = (n / 2 + 1) % np; + size += (piece == np - 1) ? remaining : 0; + + sum = std::transform_reduce( + std::execution::par, counting_iterator(start), + counting_iterator(start) + size, 0, std ::plus{}, + [=](int val) { return val * val; }); + }) | + then([&](auto sum) { return sum; }); + + //auto sum1 = sync_wait(send1).value(); + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); + + } else { // Evaluating L(i, j) using L(j, j) + + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), + lower[j].cbegin() + j, lower[i].cbegin(), + 0, std::plus<>(), multiplier_lambda); + + lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; + } + } + } + return lower; + } +}; + +/////////////////////////////////////////////////////////////////////////////// +int benchmark(args_params_t const& args) { + + std::uint64_t nd = args.nd; // Number of matrix dimension. + std::uint64_t np = args.np; // Number of partitions. + + std::vector inputMatrix = generate_pascal_matrix(nd); + + // Create the solver object + solver solve; + + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + // Measure execution time. + Timer timer; + + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; + + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; + } + } + + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } + + return 0; +} + +// Driver Code for testing +int main(int argc, char* argv[]) { + + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + benchmark(args); + + return 0; +} diff --git a/apps/choleskey/matrixutil.hpp b/apps/choleskey/matrixutil.hpp index 46206c3..44f0468 100644 --- a/apps/choleskey/matrixutil.hpp +++ b/apps/choleskey/matrixutil.hpp @@ -35,7 +35,7 @@ struct args_params_t : public argparse::Args { std::uint64_t& nd = kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)") .set_default(10); - + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); bool& help = flag("h, help", "print help"); bool& time = kwarg("t, time", "print time").set_default(true); }; From 8f99758d6d8df8f03bae1bc2d14b843fd8a16196 Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Tue, 26 Sep 2023 11:13:48 -0700 Subject: [PATCH 05/20] fix partition&iterator --- apps/choleskey/choleskey_stdpar_snd.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp index 36a64fd..5429445 100644 --- a/apps/choleskey/choleskey_stdpar_snd.cpp +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -46,23 +46,24 @@ struct solver { if (j == i) // summation for diagonals { - auto send1 = - just(std::move(sum)) | + sender auto send1 = + begin | bulk(np, [&](int piece) { - int start = piece * (n / 2 + 1) / np; - int size = (n / 2 + 1) / np; // partition size - int remaining = (n / 2 + 1) % np; + int start = piece * (j + 1) / np; + int size = (j + 1) / np; // partition size + int remaining = (j + 1) % np; size += (piece == np - 1) ? 
remaining : 0; sum = std::transform_reduce( - std::execution::par, counting_iterator(start), - counting_iterator(start) + size, 0, std ::plus{}, - [=](int val) { return val * val; }); + std::execution::par, + counting_iterator(lower[j][start]), + counting_iterator(lower[j][start]) + size, 0, + std ::plus{}, [=](int val) { return val * val; }); }) | then([&](auto sum) { return sum; }); - //auto sum1 = sync_wait(send1).value(); + auto [sum1] = sync_wait(std::move(send1)).value(); lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); } else { // Evaluating L(i, j) using L(j, j) From 7361b2859f925cb8d798bcefd06ab28cdd4c96dc Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Tue, 26 Sep 2023 14:42:16 -0700 Subject: [PATCH 06/20] last two columns has issue --- apps/choleskey/choleskey_stdpar_snd.cpp | 59 ++++++++++++++----------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp index 5429445..d60d7d8 100644 --- a/apps/choleskey/choleskey_stdpar_snd.cpp +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -1,18 +1,17 @@ // Cholesky Decomposition: stdpar-->sender -#include "argparse/argparse.hpp" -#include "commons.hpp" - #include +#include #include #include #include #include #include +#include "argparse/argparse.hpp" +#include "commons.hpp" #include "exec/static_thread_pool.hpp" #include "matrixutil.hpp" -using namespace stdexec; -using stdexec::sync_wait; +// using namespace stdexec; using namespace std; @@ -38,7 +37,7 @@ struct solver { return a * b; }; - int np = 4; // default number of parallel sec, will be an option + int np = 3; // default number of parallel sec, will be an option for (int i = 0; i < matrix_ms.extent(0); i++) { for (int j = 0; j <= i; j++) { @@ -46,28 +45,35 @@ struct solver { if (j == i) // summation for diagonals { - sender auto send1 = - begin | - bulk(np, - [&](int piece) { - int start = piece * (j + 1) / np; - int size = (j + 1) / np; // partition size - int remaining = (j + 1) % np; - size += (piece == np - 1) ? 
remaining : 0; - - sum = std::transform_reduce( - std::execution::par, - counting_iterator(lower[j][start]), - counting_iterator(lower[j][start]) + size, 0, - std ::plus{}, [=](int val) { return val * val; }); - }) | - then([&](auto sum) { return sum; }); - - auto [sum1] = sync_wait(std::move(send1)).value(); - lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); + std::vector sum_vec(np + 1); - } else { // Evaluating L(i, j) using L(j, j) + std::cout << "j = " << j << std::endl; + std::size_t const size = ((j + 1) + (np - 1)) / np; // partition size + stdexec::sender auto send1 = + stdexec::bulk( + begin, np, + [&](int piece) { + std::cout << "pcs = " << piece << std::endl; + int start = piece * size; + int end = std::min(j, (int)((piece + 1) * size)); + + sum_vec[piece] = std::transform_reduce( + std::execution::par, counting_iterator(lower[j][start]), + counting_iterator(lower[j][end]), 0, std ::plus{}, + [=](int val) { return val * val; }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), + sum_vec.end()); + }); + + auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); + + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); + + } else { // Evaluating L(i, j) using L(j, j) + // TODO sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, lower[i].cbegin(), 0, std::plus<>(), multiplier_lambda); @@ -76,6 +82,7 @@ struct solver { } } } + return lower; } }; From c8b96ddfdd272101ee3fe22351ba0a7d942238b5 Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Sat, 30 Sep 2023 14:40:12 -0700 Subject: [PATCH 07/20] choleskey_decomposition_sender_correct --- apps/choleskey/CMakeLists.txt | 1 + apps/choleskey/choleskey_stdpar_snd.cpp | 156 ++++++++++++++++-------- 2 files changed, 109 insertions(+), 48 deletions(-) diff --git a/apps/choleskey/CMakeLists.txt b/apps/choleskey/CMakeLists.txt index a5eae85..be4be0a 100644 --- a/apps/choleskey/CMakeLists.txt +++ b/apps/choleskey/CMakeLists.txt @@ -19,3 +19,4 @@ target_include_directories( choleskey_stdpar_snd PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + \ No newline at end of file diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp index d60d7d8..4fa7d79 100644 --- a/apps/choleskey/choleskey_stdpar_snd.cpp +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -1,4 +1,32 @@ -// Cholesky Decomposition: stdpar-->sender +/* + * MIT License + * + * Copyright (c) 2023 Chuanqiu He + * Copyright (c) 2023 Weile Wei + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +// +// This example provides a stdexec(senders/receivers) implementation for choleskey decomposition code. #include #include #include @@ -20,11 +48,11 @@ struct solver { using view_2d = std::extents; template - std::vector> Cholesky_Decomposition(std::vector& vec, - int n) { + std::vector> Cholesky_Decomposition(std::vector& vec, int n, + int np) { // test here first, scheduler from a thread pool - exec::static_thread_pool pool(n); + exec::static_thread_pool pool(np); stdexec::scheduler auto sch = pool.get_scheduler(); stdexec::sender auto begin = stdexec::schedule(sch); @@ -37,52 +65,88 @@ struct solver { return a * b; }; - int np = 3; // default number of parallel sec, will be an option - for (int i = 0; i < matrix_ms.extent(0); i++) { for (int j = 0; j <= i; j++) { - T sum = 0; + // avoid over parallelize + if (j == 0) { + np = 1; + } else if (j > 0 && np > j) { + np = j; + } if (j == i) // summation for diagonals { - std::vector sum_vec(np + 1); - - std::cout << "j = " << j << std::endl; - - std::size_t const size = ((j + 1) + (np - 1)) / np; // partition size - stdexec::sender auto send1 = - stdexec::bulk( - begin, np, - [&](int piece) { - std::cout << "pcs = " << piece << std::endl; - int start = piece * size; - int end = std::min(j, (int)((piece + 1) * size)); - - sum_vec[piece] = std::transform_reduce( - std::execution::par, counting_iterator(lower[j][start]), - counting_iterator(lower[j][end]), 0, std ::plus{}, - [=](int val) { return val * val; }); - }) | - stdexec::then([&sum_vec]() { - return std::reduce(std::execution::par, sum_vec.begin(), - sum_vec.end()); - }); - - auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); - - lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); - - } else { // Evaluating L(i, j) using L(j, j) - // TODO - sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), - lower[j].cbegin() + j, lower[i].cbegin(), - 0, std::plus<>(), multiplier_lambda); - - lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; + + if (i == 0 && j == 0) { + lower[j][j] = std::sqrt(matrix_ms(i, j)); + } else { + + std::vector sum_vec(np); // sub res for each piece + int size = j; // there are j elements need to be calculated(power) + + stdexec::sender auto send1 = + stdexec::bulk(begin, np, + [&](int piece) { + int start = piece * size / np; + int chunk_size = size / np; + int remaining = size % np; + chunk_size += (piece == np - 1) ? 
remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, + counting_iterator(start), + counting_iterator(start + chunk_size), 0, + std ::plus{}, [=](int val) { + return lower[j][val] * lower[j][val]; + }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), + sum_vec.end()); + }); + + auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); + + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); + } + + } else { + // Evaluating L(i, j) using L(j, j) + + if (j == 0) { + lower[i][j] = (matrix_ms(i, j)) / lower[j][j]; + } else { + + std::vector sum_vec(np); // sub res for each piece + int size_nondiag = j; + + stdexec::sender auto send2 = + stdexec::bulk( + begin, np, + [&](int piece) { + int start = piece * size_nondiag / np; + int chunk_size = size_nondiag / np; + int remaining = size_nondiag % np; + chunk_size += (piece == np - 1) ? remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, counting_iterator(start), + counting_iterator(start + chunk_size), 0, + std ::plus{}, + [=](int k) { return lower[j][k] * lower[i][k]; }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), + sum_vec.end()); + }); + + auto [sum2] = stdexec::sync_wait(std::move(send2)).value(); + + lower[i][j] = (matrix_ms(i, j) - sum2) / lower[j][j]; + } } } } - return lower; } }; @@ -91,22 +155,18 @@ struct solver { int benchmark(args_params_t const& args) { std::uint64_t nd = args.nd; // Number of matrix dimension. - std::uint64_t np = args.np; // Number of partitions. + std::uint64_t np = args.np; // Number of parallel partitions. std::vector inputMatrix = generate_pascal_matrix(nd); // Create the solver object solver solve; - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); - stdexec::sender auto begin = stdexec::schedule(sch); - // Measure execution time. 
Timer timer; // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np); // Print the final results if (args.results) { From f2b93ff6f280e7ce401d7baea5edf13603752454 Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Sat, 30 Sep 2023 15:05:30 -0700 Subject: [PATCH 08/20] format --- apps/choleskey/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/choleskey/CMakeLists.txt b/apps/choleskey/CMakeLists.txt index be4be0a..a5eae85 100644 --- a/apps/choleskey/CMakeLists.txt +++ b/apps/choleskey/CMakeLists.txt @@ -19,4 +19,3 @@ target_include_directories( choleskey_stdpar_snd PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) - \ No newline at end of file From f60a5b570095b94223e07d7481711bf3f333b38e Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Sat, 30 Sep 2023 14:56:15 -0700 Subject: [PATCH 09/20] add copyright --- apps/choleskey/choleskey_serial.cpp | 31 ++++++++++++++++++++++++++++- apps/choleskey/choleskey_stdpar.cpp | 31 ++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/apps/choleskey/choleskey_serial.cpp b/apps/choleskey/choleskey_serial.cpp index ea181d0..2da34e2 100644 --- a/apps/choleskey/choleskey_serial.cpp +++ b/apps/choleskey/choleskey_serial.cpp @@ -1,4 +1,33 @@ -// Cholesky Decomposition: mdspan +/* + * MIT License + * + * Copyright (c) 2023 Chuanqiu He + * Copyright (c) 2023 Weile Wei + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +// +// This example provides a sserial(mdspan) implementation for choleskey decomposition code. + #include #include #include diff --git a/apps/choleskey/choleskey_stdpar.cpp b/apps/choleskey/choleskey_stdpar.cpp index abbe02d..2b19b7d 100644 --- a/apps/choleskey/choleskey_stdpar.cpp +++ b/apps/choleskey/choleskey_stdpar.cpp @@ -1,4 +1,33 @@ -// Cholesky Decomposition: stdpar +/* + * MIT License + * + * Copyright (c) 2023 Chuanqiu He + * Copyright (c) 2023 Weile Wei + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +// +// This example provides a stdpar implementation for choleskey decomposition code. + #include "argparse/argparse.hpp" #include "commons.hpp" From 4f66c7ad28aa4b72309724b25340cd4501311ba4 Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Sat, 30 Sep 2023 15:00:49 -0700 Subject: [PATCH 10/20] fix typo --- apps/choleskey/choleskey_serial.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/choleskey/choleskey_serial.cpp b/apps/choleskey/choleskey_serial.cpp index 2da34e2..5c82498 100644 --- a/apps/choleskey/choleskey_serial.cpp +++ b/apps/choleskey/choleskey_serial.cpp @@ -26,7 +26,7 @@ * SOFTWARE. */ // -// This example provides a sserial(mdspan) implementation for choleskey decomposition code. +// This example provides a serial(mdspan) implementation for choleskey decomposition code. 
#include #include From 738a04de24c5f4e9d31fa13f8fe1f9b3ba5a172c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:02:15 -0700 Subject: [PATCH 11/20] adding fft app --- CMakeLists.txt | 5 +- apps/comm-study/comm-study-no-senders.cpp | 2 +- apps/fft/fft-serial.cpp | 138 +++++++++++++++++ apps/fft/fft.hpp | 172 +++++++++++++--------- 4 files changed, 245 insertions(+), 72 deletions(-) create mode 100644 apps/fft/fft-serial.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 558c8b6..2c6ee0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,7 +74,7 @@ set(GCC_EXPECTED_VERSION 11.2) if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS GCC_EXPECTED_VERSION) message( FATAL_ERROR - "GCC: GCB requires GCC v${GCC_EXPECTED_VERSION} or higher to build but found v${CMAKE_CXX_COMPILER_VERSION}" + "GCC: nvstdpar requires GCC v${GCC_EXPECTED_VERSION} or higher to build but found v${CMAKE_CXX_COMPILER_VERSION}" ) endif() @@ -84,10 +84,11 @@ endif() set(CXX_STANDARD_REQUIRED ON) # required minimum CXX standard -set(CMAKE_CXX_STANDARD_REQUIRED 20) +set(CMAKE_CXX_STANDARD_REQUIRED 23) if(NOT CXX_STANDARD OR (CXX_STANDARD LESS ${CMAKE_CXX_STANDARD_REQUIRED})) set(CXX_STANDARD ${CMAKE_CXX_STANDARD_REQUIRED}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${CXX_STANDARD}") message(STATUS "Setting CXX_STANDARD to ${CMAKE_CXX_STANDARD_REQUIRED}") endif() diff --git a/apps/comm-study/comm-study-no-senders.cpp b/apps/comm-study/comm-study-no-senders.cpp index 1377745..1550094 100644 --- a/apps/comm-study/comm-study-no-senders.cpp +++ b/apps/comm-study/comm-study-no-senders.cpp @@ -74,7 +74,7 @@ auto work(P& A, P& B, P& Y, int N) { // get sum(Y) - one last memcpy (not USM) D2H sum += - std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus()); + std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), [](T &val){return val * val;}); return sum / N; } diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp new file mode 100644 index 0000000..b174b5a --- /dev/null +++ b/apps/fft/fft-serial.cpp @@ -0,0 +1,138 @@ +/* + * MIT License + * + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/* + * commons for the fft codes + */ + +#include "fft.hpp" + +// +// simulation +// +int main(int argc, char* argv[]) +{ + // parse params + fft_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) + { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int N = args.N; + sig_type_t sig_type = args.sig; + int freq = args.freq; + bool print_sig = args.print_sig; + bool print_time = args.print_time; + + // x[n] signal + //std::vector test_sig{2,1,-1,5,0,3,0,-4}; + //N = test_sig.size(); + + Timer timer; + + sig_t x_n(N, sig_type); + + if (!isPowOf2(N)) + { + N = ceilPowOf2(N); + std::cout << "log_2(N) != integer. Padding zeros for N = " << N << std::endl; + + x_n.resize(N); + } + + sig_t y_n(x_n); + + if (print_sig) + { + std::cout << std::endl << "x[n] = "; + x_n.printSignal(); + std::cout << std::endl; + } + + // niterations + int niters = ilog2(N); + + std::function fft = [&](data_t *x, int lN, const int N) + { + int stride = N/lN; + + if (lN == 2) + { + auto x_0 = x[0] + x[1]* WNk(N, 0); + x[1] = x[0] - x[1]* WNk(N, 0); + x[0] = x_0; + return; + } + + // vectors for left and right + std::vector e(lN/2); + std::vector o(lN/2); + + // copy data into vectors + for (auto k = 0; k < lN/2; k++) + { + e[k] = x[2*k]; + o[k] = x[2*k+1]; + } + + // compute N/2 pt FFT on even + fft(e.data(), lN/2, N); + + // compute N/2 pt FFT on odd + fft(o.data(), lN/2, N); + + // combine even and odd FFTs + for (int k = 0; k < lN/2; k++) + { + x[k] = e[k] + o[k] * WNk(N, k * stride); + x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride); + } + + return; + }; + + // fft radix-2 algorithm with senders + fft(y_n.data(), N, N); + + if (print_sig) + { + std::cout << "X[k] = "; + y_n.printSignal(); + std::cout << std::endl; + } + + auto elapsed = timer.stop(); + + if (print_time) + std::cout << "Elapsed Time: " << elapsed << " ms" << std::endl; + + return 0; +} \ No newline at end of file diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index bc411d5..37f9c29 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -30,130 +30,138 @@ #pragma once -#include +#include #include +#include +#include +#include +#include +#include #include "argparse/argparse.hpp" #include "commons.hpp" +namespace ex = stdexec; using namespace std::complex_literals; // data type using Real_t = double; using data_t = std::complex; -// number of dimensions -constexpr int dims = 1; - -// 1D view -using view_1d = std::extents; - -// 2D view -using view_2d = std::extents; - -// 3D view -using view_3d = std::extents; - -enum class fft_type { fftw, cufft }; enum class sig_type { square, sinusoid, sawtooth, triangle, sinc, box }; - using sig_type_t = sig_type; +// fft radix +constexpr int radix = 2; + // parameters struct fft_params_t : public argparse::Args { sig_type_t& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(sig_type_t::box); - int& freq = kwarg("f,freq", "Signal frequency").set_default(1000); - int& len = kwarg("n,N", "N-point FFT").set_default(1<<16); - bool& print_fft = flag("p,print", "print Fourier transformed signal"); + int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); + int& N = kwarg("N", "N-point FFT").set_default(1024); + bool& print_sig = flag("p,print", "print x[n] and X(k)"); #if defined(USE_OMP) int& nthreads = kwarg("nthreads", "number of threads").set_default(1); #endif // USE_OMP bool& help = flag("h, help", "print help"); - bool& print_time = flag("t,time", "print transform time"); + bool& 
print_time = flag("t,time", "print fft time"); }; -void printSignal(data_t* sig, int N) { - std::cout << std::fixed << std::setprecision(1); +inline bool isPowOf2(long long int x) { + return !(x == 0) && !(x & (x - 1)); +} - for (int i = 0; i < N; ++i) - std::cout << sig[i] << " "; +template +void printVec(T &vec, int len) +{ + std::cout << "[ "; + for (int i = 0; i < len; i++) + std::cout << vec[i] << " "; - std::cout << std::endl; + std::cout << "]" << std::endl; +} + +inline std::complex WNk(int N, int k) +{ + return std::complex(exp(-2*M_PI*1/N*k*1i)); +} + +inline int ceilPowOf2(unsigned int v) +{ + return static_cast(std::bit_ceil(v)); +} + +inline int ilog2(uint32_t x) +{ + return static_cast(log2(x)); } class signal { public: - signal() + signal() = default; + signal(int N) { - this->N = 1e3; - t.resize(this->N); - y.resize(this->N); - dt = 1.0 / this->N; - } - - signal(int _N) - { - if (_N <= 0) + if (N <= 0) { std::cerr << "FATAL: N must be greater than 0. exiting.." << std::endl; exit(1); } - this->N = _N; - t.resize(this->N); - y.resize(this->N); - dt = 1.0 / this->N; + y.reserve(ceilPowOf2(N)); } - signal(int N, sig_type type=sig_type::box) + signal(signal &rhs) + { + y = rhs.y; + } + signal(std::vector &in) + { + y = std::move(in); + } + + signal(int N, sig_type type) { if (N <= 0) { std::cerr << "FATAL: N must be greater than 0. exiting.." << std::endl; exit(1); } - - this->N = N; - t.resize(N); - y.resize(N); - dt = 1.0 / N; - signalGenerator(N, type); + y.reserve(ceilPowOf2(N)); + signalGenerator(type); } - void signalGenerator(int N, sig_type type=sig_type::box) + void signalGenerator(sig_type type=sig_type::box) { - int interval = 1/N; - std::vector t(N); + int N = y.size(); switch (type) { case sig_type::square: - for (int i = 0; i < N; ++i) - y[i] = (i < N / 4 || i > 3 * N/4) ? 1.0 : -1.0; + for (int n = 0; n < N; ++n) + y[n] = (n < N / 4 || n > 3 * N/4) ? 1.0 : -1.0; break; case sig_type::sinusoid: - for (int i = 0; i < N; ++i) - y[i] = std::sin(2.0 * M_PI * i / N); + for (int n = 0; n < N; ++n) + y[n] = std::sin(2.0 * M_PI * n / N); break; case sig_type::sawtooth: - for (int i = 0; i < N; ++i) - y[i] = 2.0 * (i / N) - 1.0; + for (int n = 0; n < N; ++n) + y[n] = 2.0 * (n / N) - 1.0; break; case sig_type::triangle: - for (int i = 0; i < N; ++i) - y[i] = 2.0 * std::abs(2.0 * (i / N) - 1.0) - 1.0; + for (int n = 0; n < N; ++n) + y[n] = 2.0 * std::abs(2.0 * (n / N) - 1.0) - 1.0; break; case sig_type::sinc: y[0] = 1.0; - for (int i = 1; i < N; ++i) - y[i] = std::sin(2.0 * M_PI * i / N) / (2.0 * M_PI * i / N); + for (int n = 1; n < N; ++n) + y[n] = std::sin(2.0 * M_PI * n / N) / (2.0 * M_PI * n / N); break; case sig_type::box: - for (int i = 0; i < N; ++i) - y[i] = (i < N / 4 || i > 3 * N / 4) ? 1.0 : 0.0; + for (int n = 0; n < N; ++n) + y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; break; default: std::cerr << "FATAL: Unknown signal type. exiting.." 
<< std::endl; @@ -164,14 +172,40 @@ class signal ~signal() { y.clear(); - t.clear(); + } + + data_t *data() { return y.data(); } + int len() { return y.size(); } + + void resize(int N) + { + if (N != y.size()) + y.resize(N, 0); + } + + data_t &operator[](int n) + { + return y[n]; + } + + data_t &operator()(int n) + { + return y[n]; + } + + void printSignal() { + std::cout << std::fixed << std::setprecision(2); + + std::cout << "[ "; + for (auto &el : y) + std::cout << el << " "; + + std::cout << "]" << std::endl; } private: - int N; - Real_t dt; - // time axis - std::vector t; - // y(t) axis - std::vector y; -}; \ No newline at end of file + // y[n] + std::vector y; +}; + +using sig_t = signal; From 1f9d69a8d65c88d0fa6c071edee8db4f6a443a6d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:04:53 -0700 Subject: [PATCH 12/20] Removed argparse --- .gitmodules | 3 --- externals/argparse | 1 - 2 files changed, 4 deletions(-) delete mode 160000 externals/argparse diff --git a/.gitmodules b/.gitmodules index b8f3f6d..6af8544 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,6 @@ [submodule "externals/mdspan"] path = externals/mdspan url = https://github.com/kokkos/mdspan -[submodule "externals/argparse"] - path = externals/argparse - url = https://github.com/mhaseeb123/argparse [submodule "externals/magic_enum"] path = externals/magic_enum url = https://github.com/mhaseeb123/magic_enum diff --git a/externals/argparse b/externals/argparse deleted file mode 160000 index dee5935..0000000 --- a/externals/argparse +++ /dev/null @@ -1 +0,0 @@ -Subproject commit dee59359be9a2a023ceb59384c735b4e711cc18d From 25c83bdf69d00fae5f2a01285309d076684b8dd1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:07:14 -0700 Subject: [PATCH 13/20] readding new argparse --- .gitmodules | 3 +++ argparse | 1 + externals/argparse | 1 + 3 files changed, 5 insertions(+) create mode 160000 argparse create mode 160000 externals/argparse diff --git a/.gitmodules b/.gitmodules index 6af8544..e07fd62 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "externals/magic_enum"] path = externals/magic_enum url = https://github.com/mhaseeb123/magic_enum +[submodule "externals/argparse"] + path = externals/argparse + url = https://github.com/mhaseeb123/argparse diff --git a/argparse b/argparse new file mode 160000 index 0000000..9770626 --- /dev/null +++ b/argparse @@ -0,0 +1 @@ +Subproject commit 9770626123d491bc9d27851a150da20fc47fc994 diff --git a/externals/argparse b/externals/argparse new file mode 160000 index 0000000..9770626 --- /dev/null +++ b/externals/argparse @@ -0,0 +1 @@ +Subproject commit 9770626123d491bc9d27851a150da20fc47fc994 From a34c277239884c0f44ddaefde2032b8c10a86317 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:08:01 -0700 Subject: [PATCH 14/20] updating submodules --- externals/magic_enum | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/externals/magic_enum b/externals/magic_enum index b291b0c..d67973d 160000 --- a/externals/magic_enum +++ b/externals/magic_enum @@ -1 +1 @@ -Subproject commit b291b0ce5a76e808e05fc0141154e963407372da +Subproject commit d67973d1181ff986ba63c756b47cc854f4d51d32 From fd560d2f793fab8c2c9c725d0814fb167bbe130c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:09:19 -0700 Subject: [PATCH 15/20] removing stale fft file --- apps/fft/fft-serial-1d.cpp | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 
apps/fft/fft-serial-1d.cpp diff --git a/apps/fft/fft-serial-1d.cpp b/apps/fft/fft-serial-1d.cpp deleted file mode 100644 index 6c73237..0000000 --- a/apps/fft/fft-serial-1d.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2023 The Regents of the University of California, - * through Lawrence Berkeley National Laboratory (subject to receipt of any - * required approvals from the U.S. Dept. of Energy).All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * commons for the fft codes - */ - -#include "fft.hpp" - -// -// simulation -// -int main(int argc, char* argv[]) { - return 0; -} From 94813eb9836d5c16af04942e2859a8dd21151a0d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:45:18 -0700 Subject: [PATCH 16/20] minor debugging --- apps/fft/fft.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 37f9c29..80a7446 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -107,10 +107,11 @@ class signal { if (N <= 0) { - std::cerr << "FATAL: N must be greater than 0. exiting.." << std::endl; + std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; exit(1); } y.reserve(ceilPowOf2(N)); + y.resize(N); } signal(signal &rhs) @@ -126,10 +127,11 @@ class signal { if (N <= 0) { - std::cerr << "FATAL: N must be greater than 0. exiting.." << std::endl; + std::cerr << "FATAL: N must be > 0. exiting.." 
<< std::endl; exit(1); } y.reserve(ceilPowOf2(N)); + y.resize(N); signalGenerator(type); } From 62e87d33d67f416751167cef1113147394a61b46 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:45:21 -0700 Subject: [PATCH 17/20] minor bug fix --- externals/mdspan | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/externals/mdspan b/externals/mdspan index 124b860..f840358 160000 --- a/externals/mdspan +++ b/externals/mdspan @@ -1 +1 @@ -Subproject commit 124b860f458e5c06c9b96d7510dc35b7acdd642b +Subproject commit f84035865a92241a5163d8d0e5100aea037892ca From 2761f772913e40b34f1b23532f4b0c75c57cf16b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 00:49:34 -0700 Subject: [PATCH 18/20] clang-format --- apps/1d_stencil/stencil_cuda.cpp | 175 +++++------ apps/1d_stencil/stencil_serial.cpp | 190 ++++++----- apps/1d_stencil/stencil_snd_gpu_m.cpp | 201 ++++++------ apps/1d_stencil/stencil_snd_gpu_s.cpp | 202 ++++++------ apps/1d_stencil/stencil_stdpar.cpp | 193 ++++++------ apps/1d_stencil/stencil_stdpar_snd.cpp | 254 +++++++-------- apps/1d_stencil/stencil_stdpar_snd_iter.cpp | 213 ++++++------- apps/choleskey/choleskey_serial.cpp | 140 ++++----- apps/choleskey/choleskey_stdpar.cpp | 124 ++++---- apps/choleskey/choleskey_stdpar_snd.cpp | 269 ++++++++-------- apps/choleskey/matrixutil.hpp | 41 ++- apps/comm-study/comm-study-no-senders.cpp | 108 +++---- apps/comm-study/comm-study.cpp | 163 +++++----- apps/fft/fft-serial.cpp | 47 ++- apps/fft/fft.hpp | 215 ++++++------- apps/heat-equation/heat-equation-cuda.cpp | 294 +++++++++-------- .../heat-equation-gpu-scheduler.cpp | 244 +++++++-------- apps/heat-equation/heat-equation-mdspan.cpp | 209 ++++++------- .../heat-equation-multigpu-scheduler.cpp | 248 +++++++-------- apps/heat-equation/heat-equation-omp.cpp | 182 +++++------ .../heat-equation-stdpar-senders.cpp | 296 +++++++++--------- apps/heat-equation/heat-equation-stdpar.cpp | 212 ++++++------- apps/heat-equation/heat-equation.hpp | 61 ++-- apps/mdspan-stdpar/mdspan-stdpar.cpp | 79 +++-- include/commons.hpp | 57 ++-- include/counting_iterator.hpp | 164 +++++----- 26 files changed, 2176 insertions(+), 2405 deletions(-) diff --git a/apps/1d_stencil/stencil_cuda.cpp b/apps/1d_stencil/stencil_cuda.cpp index 2c87bac..3436893 100644 --- a/apps/1d_stencil/stencil_cuda.cpp +++ b/apps/1d_stencil/stencil_cuda.cpp @@ -7,20 +7,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", 
"Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -32,96 +28,95 @@ constexpr double dx = 1.; // grid spacing // Our operator __device__ double heat(double left, double middle, double right) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } __global__ void heat_equation(double* current, double* next, std::size_t size) { - std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; + std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) { - std::size_t left = (i == 0) ? size - 1 : i - 1; - std::size_t right = (i == size - 1) ? 0 : i + 1; - next[i] = heat(current[left], current[i], current[right]); - } + if (i < size) { + std::size_t left = (i == 0) ? size - 1 : i - 1; + std::size_t right = (i == size - 1) ? 0 : i + 1; + next[i] = heat(current[left], current[i], current[right]); + } } int benchmark(args_params_t const& args) { - // Parameters (for simplicity, some are hardcoded) - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. - std::size_t size = np * nx; - - double* h_current = nullptr; - double* h_next = nullptr; - - // Measure execution time. - Timer timer; - - // Memory allocation - if (args.results) { - h_current = new double[size]; - h_next = new double[size]; - } - - double* d_current; - double* d_next; - cudaMalloc(&d_current, size * sizeof(double)); - cudaMalloc(&d_next, size * sizeof(double)); - thrust::sequence(thrust::device, d_current, d_current + size, 0); - thrust::sequence(thrust::device, d_next, d_next + size, 0); - - // CUDA kernel execution parameters - const int threadsPerBlock = std::min(1024, (int)size); - const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock; - - // Actual time step loop - for (std::size_t t = 0; t < nt; ++t) { - heat_equation<<>>(d_current, d_next, size); - std::swap(d_current, d_next); - } - cudaDeviceSynchronize(); - auto time = timer.stop(); - - if (args.results) { - // Copy result back to host - cudaMemcpy(h_current, d_current, size * sizeof(double), - cudaMemcpyDeviceToHost); - - // Print results - for (std::size_t i = 0; i < np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j < nx; ++j) { - std::cout << h_current[i * nx + j] << " "; - } - std::cout << "}\n"; + // Parameters (for simplicity, some are hardcoded) + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + std::size_t size = np * nx; + + double* h_current = nullptr; + double* h_next = nullptr; + + // Measure execution time. 
+ Timer timer; + + // Memory allocation + if (args.results) { + h_current = new double[size]; + h_next = new double[size]; } - // Cleanup - delete[] h_current; - delete[] h_next; - } - cudaFree(d_current); - cudaFree(d_next); + double* d_current; + double* d_next; + cudaMalloc(&d_current, size * sizeof(double)); + cudaMalloc(&d_next, size * sizeof(double)); + thrust::sequence(thrust::device, d_current, d_current + size, 0); + thrust::sequence(thrust::device, d_next, d_next + size, 0); + + // CUDA kernel execution parameters + const int threadsPerBlock = std::min(1024, (int)size); + const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock; + + // Actual time step loop + for (std::size_t t = 0; t < nt; ++t) { + heat_equation<<>>(d_current, d_next, size); + std::swap(d_current, d_next); + } + cudaDeviceSynchronize(); + auto time = timer.stop(); + + if (args.results) { + // Copy result back to host + cudaMemcpy(h_current, d_current, size * sizeof(double), cudaMemcpyDeviceToHost); + + // Print results + for (std::size_t i = 0; i < np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j < nx; ++j) { + std::cout << h_current[i * nx + j] << " "; + } + std::cout << "}\n"; + } + // Cleanup + delete[] h_current; + delete[] h_next; + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + cudaFree(d_current); + cudaFree(d_next); + + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_serial.cpp b/apps/1d_stencil/stencil_serial.cpp index fce1d5d..f4a7180 100644 --- a/apps/1d_stencil/stencil_serial.cpp +++ b/apps/1d_stencil/stencil_serial.cpp @@ -32,20 +32,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x 
dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -58,107 +54,107 @@ double dx = 1.; // grid spacing /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; + // Our partition type + typedef double partition; - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; - void init_value(auto& data, std::size_t np, std::size_t nx) { - for (std::size_t i = 0; i != np * nx; ++i) { - data[i] = double(i); + void init_value(auto& data, std::size_t np, std::size_t nx) { + for (std::size_t i = 0; i != np * nx; ++i) { + data[i] = double(i); + } } - } - // Our operator - double heat(double left, double middle, double right, const double k = ::k, - const double dt = ::dt, const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; + // Our operator + double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, + const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); - - return id + dir; - } - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - auto current = space(current_ptr, size); - auto next = space(next_ptr, size); - - init_value(current, np, nx); - - // Actual time step loop - for (std::size_t t = 0; t != nt; ++t) { - for (std::size_t i = 0; i < np * nx; ++i) { - auto left = idx(i, -1, size); - auto right = idx(i, +1, size); - next[i] = heat(current[left], current[i], current[right], k, dt, dx); - } - std::swap(current, next); + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } + + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; } - return current; - } + // do all the work on 'nx' data points for 'nt' time steps + space do_work(std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + auto current = space(current_ptr, size); + auto next = space(next_ptr, size); + + init_value(current, np, nx); + + // Actual time step loop + for (std::size_t t = 0; t != nt; ++t) { + for (std::size_t i = 0; i < np * nx; ++i) { + auto left = idx(i, -1, size); + auto right = idx(i, +1, size); + next[i] = heat(current[left], current[i], current[right], k, dt, dx); + } + std::swap(current, next); + } + + return current; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. 
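For reference, the update applied by stepper::heat above is the explicit 1-D heat stencil with periodic wrap-around. A minimal standalone sketch of the same update, with the grid size, step count, and initial values chosen arbitrarily here rather than taken from the argument parser:

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main() {
    constexpr double k = 0.5, dt = 1.0, dx = 1.0;   // the app's default coefficients
    const std::size_t size = 16;                    // arbitrary grid size for this sketch
    std::vector<double> current(size), next(size);
    for (std::size_t i = 0; i < size; ++i) current[i] = static_cast<double>(i);

    for (int t = 0; t < 45; ++t) {                  // 45 steps, matching the default nt
        for (std::size_t i = 0; i < size; ++i) {
            const std::size_t left  = (i == 0) ? size - 1 : i - 1;    // periodic wrap
            const std::size_t right = (i == size - 1) ? 0 : i + 1;
            next[i] = current[i] + (k * dt / (dx * dx)) *
                                   (current[left] - 2.0 * current[i] + current[right]);
        }
        std::swap(current, next);
    }

    for (double v : current) std::cout << v << ' ';
    std::cout << '\n';
}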
- std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. - - // Create the stepper object - stepper step; - - // Measure execution time. - Timer timer; - - // Execute nt time steps on nx grid points. - auto solution = step.do_work(np, nx, nt); - auto time = timer.stop(); - - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + + // Create the stepper object + stepper step; + + // Measure execution time. + Timer timer; + + // Execute nt time steps on nx grid points. + auto solution = step.do_work(np, nx, nt); + auto time = timer.stop(); + + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_snd_gpu_m.cpp b/apps/1d_stencil/stencil_snd_gpu_m.cpp index 83d3e16..0c385d9 100644 --- a/apps/1d_stencil/stencil_snd_gpu_m.cpp +++ b/apps/1d_stencil/stencil_snd_gpu_m.cpp @@ -40,20 +40,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: 
false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -64,125 +60,118 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref< - stdexec::completion_signatures>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; + // Our partition type + typedef double partition; - // Our data for one time step - typedef thrust::device_vector space; + // Our data for one time step + typedef thrust::device_vector space; - // Our operator - double heat(double left, double middle, double right, const double k = ::k, - const double dt = ::dt, const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; + // Our operator + double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, + const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); - - return id + dir; - } - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, - std::size_t nt) { - std::size_t size = np * nx; - thrust::device_vector current_vec(size); - thrust::device_vector next_vec(size); - - auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); - auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); - - stdexec::sender auto init = - stdexec::transfer_just(sch, current_ptr, nx) | - stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { - current_ptr[i] = (double)i; - }); - stdexec::sync_wait(std::move(init)); - - for (std::size_t t = 0; t != nt; ++t) { - auto sender = stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, - dx, np, nx) | - stdexec::bulk(np * nx, [&](int i, auto current_ptr, - auto next_ptr, auto k, auto dt, - auto dx, auto np, auto nx) { - auto left = idx(i, -1, np * nx); - auto right = idx(i, +1, np * nx); - next_ptr[i] = heat(current_ptr[left], current_ptr[i], - current_ptr[right], k, dt, dx); - }); - stdexec::sync_wait(std::move(sender)); - std::swap(current_ptr, next_ptr); + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } + + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; } - if (nt % 2 == 0) { - return current_vec; + // do all the work on 'nx' data points for 'nt' time steps + space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + thrust::device_vector current_vec(size); + thrust::device_vector next_vec(size); + + auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); + auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); + + stdexec::sender auto init = + stdexec::transfer_just(sch, current_ptr, nx) | + stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { current_ptr[i] = (double)i; }); + 
stdexec::sync_wait(std::move(init)); + + for (std::size_t t = 0; t != nt; ++t) { + auto sender = stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, dx, np, nx) | + stdexec::bulk(np * nx, [&](int i, auto current_ptr, auto next_ptr, auto k, auto dt, auto dx, + auto np, auto nx) { + auto left = idx(i, -1, np * nx); + auto right = idx(i, +1, np * nx); + next_ptr[i] = heat(current_ptr[left], current_ptr[i], current_ptr[right], k, dt, dx); + }); + stdexec::sync_wait(std::move(sender)); + std::swap(current_ptr, next_ptr); + } + + if (nt % 2 == 0) { + return current_vec; + } + return next_vec; } - return next_vec; - } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. - // Create the stepper object - stepper step; + // Create the stepper object + stepper step; - nvexec::multi_gpu_stream_context stream_context{}; - stdexec::scheduler auto sch = stream_context.get_scheduler(); + nvexec::multi_gpu_stream_context stream_context{}; + stdexec::scheduler auto sch = stream_context.get_scheduler(); - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - // Execute nt time steps on nx grid points. - stepper::space solution = step.do_work(sch, np, nx, nt); + // Execute nt time steps on nx grid points. + stepper::space solution = step.do_work(sch, np, nx, nt); - auto time = timer.stop(); + auto time = timer.stop(); - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_snd_gpu_s.cpp b/apps/1d_stencil/stencil_snd_gpu_s.cpp index 58fc06c..8144c52 100644 --- a/apps/1d_stencil/stencil_snd_gpu_s.cpp +++ b/apps/1d_stencil/stencil_snd_gpu_s.cpp @@ -40,20 +40,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -64,126 +60,118 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref< - stdexec::completion_signatures>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; + // Our partition type + typedef double partition; - // Our data for one time step - typedef thrust::device_vector space; + // Our data for one time step + typedef thrust::device_vector space; - // Our operator - double heat(double left, double middle, double right, const double k = ::k, - const double dt = ::dt, const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; + // Our operator + double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, + const double dx = ::dx) { + return 
middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); - - return id + dir; - } - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, - std::size_t nt) { - std::size_t size = np * nx; - thrust::device_vector current_vec(size); - thrust::device_vector next_vec(size); - - auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); - auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); - - stdexec::sender auto init = - stdexec::transfer_just(sch, current_ptr, nx) | - stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { - current_ptr[i] = (double)i; - }); - stdexec::sync_wait(std::move(init)); - - for (std::size_t t = 0; t != nt; ++t) { - auto sender = - stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, dx, np, nx, - size) | - stdexec::bulk(np * nx, - [&](int i, auto& current_ptr, auto& next_ptr, auto k, - auto dt, auto dx, auto np, auto nx, auto size) { - std::size_t left = (i == 0) ? size - 1 : i - 1; - std::size_t right = (i == size - 1) ? 0 : i + 1; - next_ptr[i] = heat(current_ptr[left], current_ptr[i], - current_ptr[right], k, dt, dx); - }); - stdexec::sync_wait(std::move(sender)); - std::swap(current_ptr, next_ptr); + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } + + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; } - if (nt % 2 == 0) { - return current_vec; + // do all the work on 'nx' data points for 'nt' time steps + space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + thrust::device_vector current_vec(size); + thrust::device_vector next_vec(size); + + auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); + auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); + + stdexec::sender auto init = + stdexec::transfer_just(sch, current_ptr, nx) | + stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { current_ptr[i] = (double)i; }); + stdexec::sync_wait(std::move(init)); + + for (std::size_t t = 0; t != nt; ++t) { + auto sender = stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, dx, np, nx, size) | + stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto& next_ptr, auto k, auto dt, auto dx, + auto np, auto nx, auto size) { + std::size_t left = (i == 0) ? size - 1 : i - 1; + std::size_t right = (i == size - 1) ? 0 : i + 1; + next_ptr[i] = heat(current_ptr[left], current_ptr[i], current_ptr[right], k, dt, dx); + }); + stdexec::sync_wait(std::move(sender)); + std::swap(current_ptr, next_ptr); + } + + if (nt % 2 == 0) { + return current_vec; + } + return next_vec; } - return next_vec; - } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. 
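Each do_work variant above follows the same sender pipeline: stdexec::transfer_just moves the raw pointers onto the chosen scheduler, stdexec::bulk runs the stencil body once per index, and stdexec::sync_wait blocks until that step completes. A minimal sketch of the pattern on the CPU thread pool used elsewhere in this repo (the include paths follow the stdexec repository layout and may differ between versions):

#include <exec/static_thread_pool.hpp>
#include <stdexec/execution.hpp>

#include <cstdio>
#include <utility>
#include <vector>

int main() {
    exec::static_thread_pool pool(4);
    stdexec::scheduler auto sch = pool.get_scheduler();

    std::vector<double> data(8, 0.0);
    double* ptr = data.data();

    // transfer_just carries ptr onto the scheduler; bulk invokes the lambda
    // once per index with that value; sync_wait drives the pipeline to completion.
    stdexec::sender auto work =
        stdexec::transfer_just(sch, ptr) |
        stdexec::bulk(data.size(), [](auto i, double* p) { p[i] = static_cast<double>(i); });
    stdexec::sync_wait(std::move(work));

    for (double v : data) std::printf("%g ", v);
    std::printf("\n");
}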
- // Create the stepper object - stepper step; + // Create the stepper object + stepper step; - nvexec::stream_context stream_ctx{}; - stdexec::scheduler auto sch = stream_ctx.get_scheduler(); + nvexec::stream_context stream_ctx{}; + stdexec::scheduler auto sch = stream_ctx.get_scheduler(); - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - // Execute nt time steps on nx grid points. - stepper::space solution = step.do_work(sch, np, nx, nt); + // Execute nt time steps on nx grid points. + stepper::space solution = step.do_work(sch, np, nx, nt); - auto time = timer.stop(); + auto time = timer.stop(); - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_stdpar.cpp b/apps/1d_stencil/stencil_stdpar.cpp index c1e780c..e424620 100644 --- a/apps/1d_stencil/stencil_stdpar.cpp +++ b/apps/1d_stencil/stencil_stdpar.cpp @@ -34,20 +34,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = 
kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -60,105 +56,104 @@ double dx = 1.; // grid spacing /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; - - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; - - // Our operator - double heat(double left, double middle, double right, const double k = ::k, - const double dt = ::dt, const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } + // Our partition type + typedef double partition; + + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; - if (id == size - 1 && dir == +1) { - return (std::size_t)0; + // Our operator + double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, + const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } - assert(id < size); - - return id + dir; - } - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - - auto current = space(current_ptr, size); - auto next = space(next_ptr, size); - // parallel init - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=](std::size_t i) { current_ptr[i] = (double)i; }); - - // Actual time step loop - for (std::size_t t = 0; t != nt; ++t) { - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=, k = k, dt = dt, dx = dx](int32_t i) { - auto left = idx(i, -1, size); - auto right = idx(i, +1, size); - next[i] = heat(current[left], current[i], - current[right], k, dt, dx); - }); - std::swap(current, next); + + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } + + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; } - return current; - } + // do all the work on 'nx' data points for 'nt' time steps + space do_work(std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + + auto current = space(current_ptr, size); + auto next = space(next_ptr, size); + // parallel init + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=](std::size_t i) { current_ptr[i] = (double)i; }); + + // Actual time step loop + for (std::size_t t = 0; t != nt; ++t) { + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=, k = k, dt = dt, dx = dx](int32_t i) { + auto left = idx(i, -1, size); + auto right = idx(i, +1, size); + next[i] = heat(current[left], current[i], current[right], k, dt, dx); + }); + std::swap(current, next); + } + + return current; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. 
- - // Create the stepper object - stepper step; - - // Measure execution time. - Timer timer; - - // Execute nt time steps on nx grid points. - auto solution = step.do_work(np, nx, nt); - auto time = timer.stop(); - - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + + // Create the stepper object + stepper step; + + // Measure execution time. + Timer timer; + + // Execute nt time steps on nx grid points. + auto solution = step.do_work(np, nx, nt); + auto time = timer.stop(); + + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_stdpar_snd.cpp b/apps/1d_stencil/stencil_stdpar_snd.cpp index 30cfca8..6a08a63 100644 --- a/apps/1d_stencil/stencil_stdpar_snd.cpp +++ b/apps/1d_stencil/stencil_stdpar_snd.cpp @@ -37,20 +37,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print 
time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -61,144 +57,134 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref< - stdexec::completion_signatures>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; - - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; - - using any_space_sender = - any_sender_of; - - // Our operator - double heat(double left, double middle, double right, const double k = ::k, - const double dt = ::dt, const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } + // Our partition type + typedef double partition; + + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; + + using any_space_sender = + any_sender_of; - if (id == size - 1 && dir == +1) { - return (std::size_t)0; + // Our operator + double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, + const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } - assert(id < size); - - return id + dir; - } - - partition* current_ptr = nullptr; - partition* next_ptr = nullptr; - space current; - space next; - - // do all the work on 'nx' data points for 'nt' time steps - auto do_work(std::size_t np, std::size_t nx, std::size_t nt) - -> any_space_sender { - if (nt == 0) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - current = space(current_ptr, size); - next = space(next_ptr, size); - - // parallel init - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=](std::size_t i) { current_ptr[i] = (double)i; }); - - return stdexec::just(current); + + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } + + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; } - return stdexec::just(nt - 1) | - stdexec::let_value([=](std::size_t nt_updated) { - return do_work(np, nx, nt_updated); - }) | - stdexec::bulk(np, - [&, k = k, dt = dt, dx = dx, nx = nx, np = np]( - std::size_t i, auto const& current) { - std::for_each_n( - std::execution::par, counting_iterator(0), nx, - [=, next = next](std::size_t j) { - std::size_t id = i * nx + j; - auto left = idx(id, -1, np * nx); - auto right = idx(id, +1, np * nx); - next[id] = heat(current[left], current[id], - current[right], k, dt, dx); - }); - }) | - stdexec::then([&](auto current) { - // TODO: return next? 
- std::swap(current, next); - return current; - }); - } + partition* current_ptr = nullptr; + partition* next_ptr = nullptr; + space current; + space next; + + // do all the work on 'nx' data points for 'nt' time steps + auto do_work(std::size_t np, std::size_t nx, std::size_t nt) -> any_space_sender { + if (nt == 0) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + current = space(current_ptr, size); + next = space(next_ptr, size); + + // parallel init + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=](std::size_t i) { current_ptr[i] = (double)i; }); + + return stdexec::just(current); + } + + return stdexec::just(nt - 1) | + stdexec::let_value([=](std::size_t nt_updated) { return do_work(np, nx, nt_updated); }) | + stdexec::bulk(np, + [&, k = k, dt = dt, dx = dx, nx = nx, np = np](std::size_t i, auto const& current) { + std::for_each_n( + std::execution::par, counting_iterator(0), nx, [=, next = next](std::size_t j) { + std::size_t id = i * nx + j; + auto left = idx(id, -1, np * nx); + auto right = idx(id, +1, np * nx); + next[id] = heat(current[left], current[id], current[right], k, dt, dx); + }); + }) | + stdexec::then([&](auto current) { + // TODO: return next? + std::swap(current, next); + return current; + }); + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. - - // Create the stepper object - stepper step; - - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); - stdexec::sender auto begin = stdexec::schedule(sch); - - // Measure execution time. - Timer timer; - - // Execute nt time steps on nx grid points. - stdexec::sender auto sender = - begin | stdexec::then([=]() { return nt; }) | - stdexec::let_value( - [=, &step](std::uint64_t nt) { return step.do_work(np, nx, nt); }); - - auto [solution] = stdexec::sync_wait(std::move(sender)).value(); - - auto time = timer.stop(); - - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + + // Create the stepper object + stepper step; + + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + // Measure execution time. + Timer timer; + + // Execute nt time steps on nx grid points. + stdexec::sender auto sender = begin | stdexec::then([=]() { return nt; }) | + stdexec::let_value([=, &step](std::uint64_t nt) { return step.do_work(np, nx, nt); }); + + auto [solution] = stdexec::sync_wait(std::move(sender)).value(); + + auto time = timer.stop(); + + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_stdpar_snd_iter.cpp b/apps/1d_stencil/stencil_stdpar_snd_iter.cpp index 0c1280a..0cbffc9 100644 --- a/apps/1d_stencil/stencil_stdpar_snd_iter.cpp +++ b/apps/1d_stencil/stencil_stdpar_snd_iter.cpp @@ -37,20 +37,16 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(false); - std::uint64_t& nx = - kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = - kwarg("no-header", "Do not print csv header row (default: false)") - .set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(false); + std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -61,127 +57,122 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref< - stdexec::completion_signatures>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; - - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; - - // Our operator - double heat(double left, double middle, double right, const double k = ::k, - const double dt = ::dt, const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } + // Our partition type + typedef double partition; + + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; - if (id == size - 1 && dir == +1) { - return (std::size_t)0; + // Our operator 
+ double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, + const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } - assert(id < size); - - return id + dir; - } - - partition* current_ptr = nullptr; - partition* next_ptr = nullptr; - space current; - space next; - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, - std::size_t nt) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - - auto current = space(current_ptr, size); - auto next = space(next_ptr, size); - // parallel init - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=](std::size_t i) { current(i) = (double)i; }); - - // Actual time step loop - for (std::size_t t = 0; t != nt; ++t) { - auto sender = - stdexec::transfer_just(sch, current, next, k, dt, dx, np, nx) | - stdexec::bulk(np, [&](int i, auto& current, auto& next, auto k, - auto dt, auto dx, auto np, auto nx) { - std::for_each_n(std::execution::par, counting_iterator(0), nx, - [=](std::size_t j) { - std::size_t id = i * nx + j; - auto left = idx(id, -1, np * nx); - auto right = idx(id, +1, np * nx); - next(id) = heat(current(left), current(id), - current(right), k, dt, dx); - }); - }); - stdexec::sync_wait(std::move(sender)); - std::swap(current, next); + + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } + + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; } - return current; - } + partition* current_ptr = nullptr; + partition* next_ptr = nullptr; + space current; + space next; + + // do all the work on 'nx' data points for 'nt' time steps + space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + + auto current = space(current_ptr, size); + auto next = space(next_ptr, size); + // parallel init + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=](std::size_t i) { current(i) = (double)i; }); + + // Actual time step loop + for (std::size_t t = 0; t != nt; ++t) { + auto sender = + stdexec::transfer_just(sch, current, next, k, dt, dx, np, nx) | + stdexec::bulk(np, [&](int i, auto& current, auto& next, auto k, auto dt, auto dx, auto np, auto nx) { + std::for_each_n(std::execution::par, counting_iterator(0), nx, [=](std::size_t j) { + std::size_t id = i * nx + j; + auto left = idx(id, -1, np * nx); + auto right = idx(id, +1, np * nx); + next(id) = heat(current(left), current(id), current(right), k, dt, dx); + }); + }); + stdexec::sync_wait(std::move(sender)); + std::swap(current, next); + } + + return current; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. 
- // Create the stepper object - stepper step; + // Create the stepper object + stepper step; - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - stepper::space solution = step.do_work(sch, np, nx, nt); + stepper::space solution = step.do_work(sch, np, nx, nt); - auto time = timer.stop(); + auto time = timer.stop(); - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/choleskey_serial.cpp b/apps/choleskey/choleskey_serial.cpp index 5c82498..88d6824 100644 --- a/apps/choleskey/choleskey_serial.cpp +++ b/apps/choleskey/choleskey_serial.cpp @@ -39,92 +39,90 @@ using namespace std; struct solver { - using view_2d = std::extents; - - typedef std::mdspan matrix_ms_t; - - template - matrix_ms_t Cholesky_Decomposition(std::vector& vec, int n) { - std::vector lower(n * n, 0); - - auto matrix_ms = - std::mdspan(vec.data(), n, n); - auto lower_ms = - std::mdspan(lower.data(), n, n); - - // Decomposing a matrix into Lower Triangular - for (int i = 0; i < matrix_ms.extent(0); i++) { - for (int j = 0; j <= i; j++) { - T sum = 0; - - if (j == i) { - // summation for diagonals - for (int k = 0; k < j; k++) - sum += pow(lower_ms(j, k), 2); - lower_ms(j, j) = sqrt(matrix_ms(i, j) - sum); - } else { - // Evaluating L(i, j) using L(j, j) - for (int k = 0; k < j; k++) - sum += (lower_ms(i, k) * lower_ms(j, k)); - lower_ms(i, j) = (matrix_ms(i, j) - sum) / lower_ms(j, j); + using view_2d = std::extents; + + typedef std::mdspan matrix_ms_t; + + template + matrix_ms_t Cholesky_Decomposition(std::vector& vec, int n) { + std::vector lower(n * n, 0); + + auto matrix_ms = std::mdspan(vec.data(), n, n); + auto lower_ms = std::mdspan(lower.data(), n, n); + + // Decomposing a matrix into Lower Triangular + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; + + if (j == i) { + // summation for diagonals + for (int k = 0; k < j; k++) + sum += pow(lower_ms(j, k), 2); + lower_ms(j, j) = sqrt(matrix_ms(i, j) - sum); + } else { + // Evaluating L(i, j) using L(j, j) + for (int k = 0; k < j; k++) + sum += (lower_ms(i, k) * lower_ms(j, k)); + lower_ms(i, j) = (matrix_ms(i, j) - sum) / lower_ms(j, j); + } + } } - } + return lower_ms; } - return lower_ms; - } }; 
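The loop nest above implements the standard Cholesky recurrence: L(j,j) = sqrt(A(j,j) - sum_{k<j} L(j,k)^2), and for i > j, L(i,j) = (A(i,j) - sum_{k<j} L(i,k)*L(j,k)) / L(j,j). A self-contained sketch of the same recurrence on a small hand-picked symmetric positive-definite matrix (illustrative values, not the app's Pascal matrix), with a check of L*L^T against the input:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n = 3;
    // Row-major symmetric positive-definite input; its exact factor is
    // L = [[2,0,0],[6,1,0],[-8,5,3]].
    const std::vector<double> A = {  4.0,  12.0, -16.0,
                                    12.0,  37.0, -43.0,
                                   -16.0, -43.0,  98.0};
    std::vector<double> L(n * n, 0.0);

    for (int i = 0; i < n; ++i) {
        for (int j = 0; j <= i; ++j) {
            double sum = 0.0;
            for (int k = 0; k < j; ++k)
                sum += L[i * n + k] * L[j * n + k];   // equals the sum of squares when i == j
            if (i == j)
                L[j * n + j] = std::sqrt(A[j * n + j] - sum);
            else
                L[i * n + j] = (A[i * n + j] - sum) / L[j * n + j];
        }
    }

    // Verify the factorization: max |(L * L^T - A)(i,j)| should be ~0.
    double max_err = 0.0;
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j) {
            double v = 0.0;
            for (int k = 0; k < n; ++k)
                v += L[i * n + k] * L[j * n + k];
            max_err = std::max(max_err, std::fabs(v - A[i * n + j]));
        }
    std::printf("max |L*L^T - A| = %g\n", max_err);
}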
/////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t nd = args.nd; // Number of matrix dimension. - - std::vector inputMatrix = generate_pascal_matrix(nd); - - // Create the solverobject - solver solve; - // Measure execution time. - Timer timer; - // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); - - // Print the final results - if (args.results) { - // Displaying Lower Triangular and its Transpose - cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; - for (int i = 0; i < nd; i++) { - // Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix(i, j) << "\t"; - cout << "\t"; - - // Transpose of Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix(j, i) << "\t"; - cout << endl; + std::uint64_t nd = args.nd; // Number of matrix dimension. + + std::vector inputMatrix = generate_pascal_matrix(nd); + + // Create the solverobject + solver solve; + // Measure execution time. + Timer timer; + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix(i, j) << "\t"; + cout << "\t"; + + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix(j, i) << "\t"; + cout << endl; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } // Driver Code for testing int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/choleskey_stdpar.cpp b/apps/choleskey/choleskey_stdpar.cpp index 2b19b7d..33c6b87 100644 --- a/apps/choleskey/choleskey_stdpar.cpp +++ b/apps/choleskey/choleskey_stdpar.cpp @@ -44,99 +44,95 @@ using namespace std; struct solver { - using view_2d = std::extents; + using view_2d = std::extents; - template - std::vector> Cholesky_Decomposition(std::vector& vec, - int n) { - std::vector> lower(n, std::vector(n, 0)); + template + std::vector> Cholesky_Decomposition(std::vector& vec, int n) { + std::vector> lower(n, std::vector(n, 0)); - auto matrix_ms = - std::mdspan(vec.data(), n, n); + auto matrix_ms = std::mdspan(vec.data(), n, n); - auto multiplier_lambda = [=](auto a, auto b) { - return a * b; - }; + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; - // Decomposing a matrix into Lower Triangular - for (int i = 0; i < matrix_ms.extent(0); i++) { - for (int j = 0; j <= i; j++) { - T sum = 0; + // Decomposing a matrix into Lower Triangular + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; - if (j == i) // summation for diagonals - { - sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), - lower[j].cbegin() + j, 0, std::plus{}, - [=](int val) { return val * val; }); + if (j == i) // summation for diagonals + { + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, 0, + std::plus{}, [=](int val) { return val * val; }); - lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); - } else { // Evaluating L(i, j) using L(j, j) + } else { // Evaluating L(i, j) using L(j, j) - sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), - lower[j].cbegin() + j, lower[i].cbegin(), - 0, std::plus<>(), multiplier_lambda); + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, + lower[i].cbegin(), 0, std::plus<>(), multiplier_lambda); - lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; + lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; + } + } } - } + return lower; } - return lower; - } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t nd = args.nd; // Number of matrix dimension. + std::uint64_t nd = args.nd; // Number of matrix dimension. - std::vector inputMatrix = generate_pascal_matrix(nd); + std::vector inputMatrix = generate_pascal_matrix(nd); - // Create the solver object - solver solve; - // Measure execution time. - Timer timer; + // Create the solver object + solver solve; + // Measure execution time. 
+ Timer timer; - // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); - // Print the final results - if (args.results) { - // Displaying Lower Triangular and its Transpose - cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; - for (int i = 0; i < nd; i++) { - // Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[i][j] << "\t"; - cout << "\t"; + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; - // Transpose of Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[j][i] << "\t"; - cout << endl; + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } // Driver Code for testing int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp index 4fa7d79..0a02682 100644 --- a/apps/choleskey/choleskey_stdpar_snd.cpp +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -45,166 +45,157 @@ using namespace std; struct solver { - using view_2d = std::extents; - - template - std::vector> Cholesky_Decomposition(std::vector& vec, int n, - int np) { - - // test here first, scheduler from a thread pool - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); - stdexec::sender auto begin = stdexec::schedule(sch); - - std::vector> lower(n, std::vector(n, 0)); - - auto matrix_ms = - std::mdspan(vec.data(), n, n); - - auto multiplier_lambda = [=](auto a, auto b) { - return a * b; - }; - - for (int i = 0; i < matrix_ms.extent(0); i++) { - for (int j = 0; j <= i; j++) { - // avoid over parallelize - if (j == 0) { - np = 1; - } else if (j > 0 && np > j) { - np = j; - } - - if (j == i) // summation for diagonals - { - - if (i == 0 && j == 0) { - lower[j][j] = std::sqrt(matrix_ms(i, j)); - } else { - - std::vector sum_vec(np); // sub res for each piece - int size = j; // there are j elements need to be calculated(power) - - stdexec::sender auto send1 = - stdexec::bulk(begin, np, - [&](int piece) { - int start = piece * size / np; - int chunk_size = size / np; - int remaining = size % np; - chunk_size += (piece == np - 1) ? 
remaining : 0; - - sum_vec[piece] = std::transform_reduce( - std::execution::par, - counting_iterator(start), - counting_iterator(start + chunk_size), 0, - std ::plus{}, [=](int val) { - return lower[j][val] * lower[j][val]; - }); - }) | - stdexec::then([&sum_vec]() { - return std::reduce(std::execution::par, sum_vec.begin(), - sum_vec.end()); - }); - - auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); - - lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); - } - - } else { - // Evaluating L(i, j) using L(j, j) - - if (j == 0) { - lower[i][j] = (matrix_ms(i, j)) / lower[j][j]; - } else { - - std::vector sum_vec(np); // sub res for each piece - int size_nondiag = j; - - stdexec::sender auto send2 = - stdexec::bulk( - begin, np, - [&](int piece) { - int start = piece * size_nondiag / np; - int chunk_size = size_nondiag / np; - int remaining = size_nondiag % np; - chunk_size += (piece == np - 1) ? remaining : 0; - - sum_vec[piece] = std::transform_reduce( - std::execution::par, counting_iterator(start), - counting_iterator(start + chunk_size), 0, - std ::plus{}, - [=](int k) { return lower[j][k] * lower[i][k]; }); - }) | - stdexec::then([&sum_vec]() { - return std::reduce(std::execution::par, sum_vec.begin(), - sum_vec.end()); - }); - - auto [sum2] = stdexec::sync_wait(std::move(send2)).value(); - - lower[i][j] = (matrix_ms(i, j) - sum2) / lower[j][j]; - } + using view_2d = std::extents; + + template + std::vector> Cholesky_Decomposition(std::vector& vec, int n, int np) { + + // test here first, scheduler from a thread pool + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + std::vector> lower(n, std::vector(n, 0)); + + auto matrix_ms = std::mdspan(vec.data(), n, n); + + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; + + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + // avoid over parallelize + if (j == 0) { + np = 1; + } else if (j > 0 && np > j) { + np = j; + } + + if (j == i) // summation for diagonals + { + + if (i == 0 && j == 0) { + lower[j][j] = std::sqrt(matrix_ms(i, j)); + } else { + + std::vector sum_vec(np); // sub res for each piece + int size = j; // there are j elements need to be calculated(power) + + stdexec::sender auto send1 = + stdexec::bulk(begin, np, + [&](int piece) { + int start = piece * size / np; + int chunk_size = size / np; + int remaining = size % np; + chunk_size += (piece == np - 1) ? remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, counting_iterator(start), + counting_iterator(start + chunk_size), 0, std ::plus{}, + [=](int val) { return lower[j][val] * lower[j][val]; }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), sum_vec.end()); + }); + + auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); + + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); + } + + } else { + // Evaluating L(i, j) using L(j, j) + + if (j == 0) { + lower[i][j] = (matrix_ms(i, j)) / lower[j][j]; + } else { + + std::vector sum_vec(np); // sub res for each piece + int size_nondiag = j; + + stdexec::sender auto send2 = + stdexec::bulk(begin, np, + [&](int piece) { + int start = piece * size_nondiag / np; + int chunk_size = size_nondiag / np; + int remaining = size_nondiag % np; + chunk_size += (piece == np - 1) ? 
remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, counting_iterator(start), + counting_iterator(start + chunk_size), 0, std ::plus{}, + [=](int k) { return lower[j][k] * lower[i][k]; }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), sum_vec.end()); + }); + + auto [sum2] = stdexec::sync_wait(std::move(send2)).value(); + + lower[i][j] = (matrix_ms(i, j) - sum2) / lower[j][j]; + } + } + } } - } + return lower; } - return lower; - } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t nd = args.nd; // Number of matrix dimension. - std::uint64_t np = args.np; // Number of parallel partitions. + std::uint64_t nd = args.nd; // Number of matrix dimension. + std::uint64_t np = args.np; // Number of parallel partitions. - std::vector inputMatrix = generate_pascal_matrix(nd); + std::vector inputMatrix = generate_pascal_matrix(nd); - // Create the solver object - solver solve; + // Create the solver object + solver solve; - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np); + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np); - // Print the final results - if (args.results) { - // Displaying Lower Triangular and its Transpose - cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; - for (int i = 0; i < nd; i++) { - // Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[i][j] << "\t"; - cout << "\t"; + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; - // Transpose of Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[j][i] << "\t"; - cout << endl; + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; + } } - } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } // Driver Code for testing int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/matrixutil.hpp b/apps/choleskey/matrixutil.hpp index 44f0468..8b08fb1 100644 --- a/apps/choleskey/matrixutil.hpp +++ b/apps/choleskey/matrixutil.hpp @@ -9,33 +9,30 @@ using Matrix = std::vector>; template std::vector generate_pascal_matrix(const int n) { - Matrix matrix(n, std::vector(n, static_cast(0))); + Matrix matrix(n, std::vector(n, static_cast(0))); - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - if (i == 0 || j == 0) { - matrix[i][j] = static_cast(1); - } else { - matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; - } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + if (i == 0 || j == 0) { + matrix[i][j] = static_cast(1); + } else { + matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; + } + } } - } - std::vector flattenedVector; - for (const auto& row : matrix) { - flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); - } - return std::move(flattenedVector); + std::vector flattenedVector; + for (const auto& row : matrix) { + flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); + } + return std::move(flattenedVector); } // parameters define struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(true); - std::uint64_t& nd = - kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)") - .set_default(10); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)").set_default(true); + std::uint64_t& nd = kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)").set_default(10); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; diff --git a/apps/comm-study/comm-study-no-senders.cpp b/apps/comm-study/comm-study-no-senders.cpp index 1550094..87fa74b 100644 --- a/apps/comm-study/comm-study-no-senders.cpp +++ b/apps/comm-study/comm-study-no-senders.cpp @@ -37,87 +37,79 @@ using time_point_t = std::chrono::system_clock::time_point; // must take in the pointers/vectors by reference template auto work(P& A, P& B, P& Y, int N) { - // init A and B separately - will it cause an H2D copy? - std::for_each(std::execution::par_unseq, &A[0], &A[N], - [&](T& ai) { ai = cos(M_PI / 4); }); + // init A and B separately - will it cause an H2D copy? + std::for_each(std::execution::par_unseq, &A[0], &A[N], [&](T& ai) { ai = cos(M_PI / 4); }); - T sum = 0.0; + T sum = 0.0; - for (int i = 0; i < N / 3; i++) { - // read only or read-write operations - sum += A[i] / N; + for (int i = 0; i < N / 3; i++) { + // read only or read-write operations + sum += A[i] / N; - // this line if commented should not result in an H2D after this but it - // does. 
- // A[i] = sin(M_PI/4); - } + // this line if commented should not result in an H2D after this but it + // does. + // A[i] = sin(M_PI/4); + } - std::cout << std::endl; + std::cout << std::endl; - // will it cause an H2D here? - std::for_each(std::execution::par_unseq, &B[0], &B[N], - [&](T& bi) { bi = sin(M_PI / 6); }); + // will it cause an H2D here? + std::for_each(std::execution::par_unseq, &B[0], &B[N], [&](T& bi) { bi = sin(M_PI / 6); }); - // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) + // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) - std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &A[N / 2], - [&](T& ai, T& bi) { return ai + bi; }); - std::transform( - std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &Y[0], - [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); + std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &A[N / 2], + [&](T& ai, T& bi) { return ai + bi; }); + std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &Y[0], + [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); - // should trigger a D2H copy of N/5 elements - for (int i = 0; i < N / 3; i++) - sum += Y[i] / N; + // should trigger a D2H copy of N/5 elements + for (int i = 0; i < N / 3; i++) + sum += Y[i] / N; - std::cout << std::endl; + std::cout << std::endl; - // get sum(Y) - one last memcpy (not USM) D2H - sum += - std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), [](T &val){return val * val;}); + // get sum(Y) - one last memcpy (not USM) D2H + sum += std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), + [](T& val) { return val * val; }); - return sum / N; + return sum / N; } int main(int argc, char* argv[]) { - constexpr int N = 1e9; - time_point_t mark = std::chrono::system_clock::now(); - auto es = - std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - T sum = 0; + constexpr int N = 1e9; + time_point_t mark = std::chrono::system_clock::now(); + auto es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); + T sum = 0; #if 1 // 0 if only want to run with pointers - std::vector A(N); - std::vector B(N); - std::vector Y(N); + std::vector A(N); + std::vector B(N); + std::vector Y(N); - mark = std::chrono::system_clock::now(); - sum = work(A, B, Y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; + mark = std::chrono::system_clock::now(); + sum = work(A, B, Y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); + std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; #endif #if 1 // 0 if only want to run with vectors - // allocate memory - where is this allocated? - T* a = new T[N]; - T* b = new T[N]; - T* y = new T[N]; - - sum = 0; - mark = std::chrono::system_clock::now(); - sum = work(a, b, y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl - << std::endl; + // allocate memory - where is this allocated? 
+ T* a = new T[N]; + T* b = new T[N]; + T* y = new T[N]; + + sum = 0; + mark = std::chrono::system_clock::now(); + sum = work(a, b, y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); + std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl << std::endl; #endif - // do not use scientific notation - std::cout << std::fixed << "sum: " << sum << "\n"; + // do not use scientific notation + std::cout << std::fixed << "sum: " << sum << "\n"; - return 0; + return 0; } \ No newline at end of file diff --git a/apps/comm-study/comm-study.cpp b/apps/comm-study/comm-study.cpp index 7629ce0..99abcfc 100644 --- a/apps/comm-study/comm-study.cpp +++ b/apps/comm-study/comm-study.cpp @@ -37,106 +37,91 @@ using time_point_t = std::chrono::system_clock::time_point; // must take in the pointers/vectors by reference template auto work(P& A, P& B, P& Y, int N) { - T sum = 0.0; - - // init A and B separately - will it cause an H2D copy? - sender auto s1 = then(just(), - [&] { - std::for_each(std::execution::par_unseq, &A[0], &A[N], - [&](T& ai) { ai = cos(M_PI / 4); }); - }) - // trigger a D2H here - | then([&] { - for (int i = 0; i < N / 3; i++) { - // read only or read-write operations - sum += A[i] / N; - - // this line if commented should not result in an H2D - // after this but it does. - // A[i] = sin(M_PI/4); - } - std::cout << std::endl; - }); - - // will it cause an H2D here? - sender auto s2 = then(just(), [&] { - std::for_each(std::execution::par_unseq, &B[0], &B[N], - [&](T& bi) { bi = sin(M_PI / 6); }); - }); - - // will s1 and s2 execute in parallel or not? - sync_wait(when_all(std::move(s1), std::move(s2))); - - // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) - sender auto s3 = - then(just(), - [&] { - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], - &A[0], [&](T& ai, T& bi) { return ai + bi; }); - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], - &Y[0], [&](T& ai, T& bi) { - return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); - }); - }) - // should trigger a D2H copy of N/3 elements - | then([&] { - for (int i = 0; i < N / 3; i++) - sum += Y[i] / N; - - std::cout << std::endl; - }) - // get sum(Y) - wonder if there is another H2D as we only read it in the - // last step - | then([&] { - return std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, - std::plus()); - }); - - auto [val] = sync_wait(s3).value(); - - return sum += val; + T sum = 0.0; + + // init A and B separately - will it cause an H2D copy? + sender auto s1 = + then(just(), + [&] { std::for_each(std::execution::par_unseq, &A[0], &A[N], [&](T& ai) { ai = cos(M_PI / 4); }); }) + // trigger a D2H here + | then([&] { + for (int i = 0; i < N / 3; i++) { + // read only or read-write operations + sum += A[i] / N; + + // this line if commented should not result in an H2D + // after this but it does. + // A[i] = sin(M_PI/4); + } + std::cout << std::endl; + }); + + // will it cause an H2D here? + sender auto s2 = then( + just(), [&] { std::for_each(std::execution::par_unseq, &B[0], &B[N], [&](T& bi) { bi = sin(M_PI / 6); }); }); + + // will s1 and s2 execute in parallel or not? 
+ sync_wait(when_all(std::move(s1), std::move(s2))); + + // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) + sender auto s3 = then(just(), + [&] { + std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], &A[0], + [&](T& ai, T& bi) { return ai + bi; }); + std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], &Y[0], + [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); + }) + // should trigger a D2H copy of N/3 elements + | then([&] { + for (int i = 0; i < N / 3; i++) + sum += Y[i] / N; + + std::cout << std::endl; + }) + // get sum(Y) - wonder if there is another H2D as we only read it in the + // last step + | then([&] { return std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus()); }); + + auto [val] = sync_wait(s3).value(); + + return sum += val; } int main(int argc, char* argv[]) { - constexpr int N = 1e9; - time_point_t mark = std::chrono::system_clock::now(); - auto es = - std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - T sum = 0.0; + constexpr int N = 1e9; + time_point_t mark = std::chrono::system_clock::now(); + auto es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); + T sum = 0.0; #if 1 // 0 if only arrays - std::vector A(N); - std::vector B(N); - std::vector Y(N); + std::vector A(N); + std::vector B(N); + std::vector Y(N); - mark = std::chrono::system_clock::now(); - sum = work(A, B, Y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; + mark = std::chrono::system_clock::now(); + sum = work(A, B, Y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); + std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; - std::cout << fixed << "sum: " << sum << "\n"; + std::cout << fixed << "sum: " << sum << "\n"; #endif #if 1 // 0 if only vectors - // allocate memory - can we just allocate it on device only? - T* a = new T[N]; - T* b = new T[N]; - T* y = new T[N]; - - sum = 0; - mark = std::chrono::system_clock::now(); - sum = work(a, b, y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl - << std::endl; - - // do not use scientific notation - std::cout << fixed << "sum: " << sum << "\n"; + // allocate memory - can we just allocate it on device only? + T* a = new T[N]; + T* b = new T[N]; + T* y = new T[N]; + + sum = 0; + mark = std::chrono::system_clock::now(); + sum = work(a, b, y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); + std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl << std::endl; + + // do not use scientific notation + std::cout << fixed << "sum: " << sum << "\n"; #endif - return 0; + return 0; } \ No newline at end of file diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp index b174b5a..02bbd7b 100644 --- a/apps/fft/fft-serial.cpp +++ b/apps/fft/fft-serial.cpp @@ -33,14 +33,12 @@ // // simulation // -int main(int argc, char* argv[]) -{ +int main(int argc, char* argv[]) { // parse params fft_params_t args = argparse::parse(argc, argv); // see if help wanted - if (args.help) - { + if (args.help) { args.print(); // prints all variables return 0; } @@ -60,8 +58,7 @@ int main(int argc, char* argv[]) sig_t x_n(N, sig_type); - if (!isPowOf2(N)) - { + if (!isPowOf2(N)) { N = ceilPowOf2(N); std::cout << "log_2(N) != integer. 
Padding zeros for N = " << N << std::endl; @@ -70,8 +67,7 @@ int main(int argc, char* argv[]) sig_t y_n(x_n); - if (print_sig) - { + if (print_sig) { std::cout << std::endl << "x[n] = "; x_n.printSignal(); std::cout << std::endl; @@ -80,40 +76,36 @@ int main(int argc, char* argv[]) // niterations int niters = ilog2(N); - std::function fft = [&](data_t *x, int lN, const int N) - { - int stride = N/lN; + std::function fft = [&](data_t* x, int lN, const int N) { + int stride = N / lN; - if (lN == 2) - { - auto x_0 = x[0] + x[1]* WNk(N, 0); - x[1] = x[0] - x[1]* WNk(N, 0); + if (lN == 2) { + auto x_0 = x[0] + x[1] * WNk(N, 0); + x[1] = x[0] - x[1] * WNk(N, 0); x[0] = x_0; return; } // vectors for left and right - std::vector e(lN/2); - std::vector o(lN/2); + std::vector e(lN / 2); + std::vector o(lN / 2); // copy data into vectors - for (auto k = 0; k < lN/2; k++) - { - e[k] = x[2*k]; - o[k] = x[2*k+1]; + for (auto k = 0; k < lN / 2; k++) { + e[k] = x[2 * k]; + o[k] = x[2 * k + 1]; } // compute N/2 pt FFT on even - fft(e.data(), lN/2, N); + fft(e.data(), lN / 2, N); // compute N/2 pt FFT on odd - fft(o.data(), lN/2, N); + fft(o.data(), lN / 2, N); // combine even and odd FFTs - for (int k = 0; k < lN/2; k++) - { + for (int k = 0; k < lN / 2; k++) { x[k] = e[k] + o[k] * WNk(N, k * stride); - x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride); + x[k + lN / 2] = e[k] - o[k] * WNk(N, k * stride); } return; @@ -122,8 +114,7 @@ int main(int argc, char* argv[]) // fft radix-2 algorithm with senders fft(y_n.data(), N, N); - if (print_sig) - { + if (print_sig) { std::cout << "X[k] = "; y_n.printSignal(); std::cout << std::endl; diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 80a7446..56354f8 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -32,10 +32,10 @@ #include #include -#include -#include #include #include +#include +#include #include #include "argparse/argparse.hpp" @@ -56,158 +56,135 @@ constexpr int radix = 2; // parameters struct fft_params_t : public argparse::Args { - sig_type_t& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(sig_type_t::box); - int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); - int& N = kwarg("N", "N-point FFT").set_default(1024); - bool& print_sig = flag("p,print", "print x[n] and X(k)"); + sig_type_t& sig = + kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(sig_type_t::box); + int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); + int& N = kwarg("N", "N-point FFT").set_default(1024); + bool& print_sig = flag("p,print", "print x[n] and X(k)"); #if defined(USE_OMP) - int& nthreads = kwarg("nthreads", "number of threads").set_default(1); + int& nthreads = kwarg("nthreads", "number of threads").set_default(1); #endif // USE_OMP - bool& help = flag("h, help", "print help"); - bool& print_time = flag("t,time", "print fft time"); + bool& help = flag("h, help", "print help"); + bool& print_time = flag("t,time", "print fft time"); }; inline bool isPowOf2(long long int x) { - return !(x == 0) && !(x & (x - 1)); + return !(x == 0) && !(x & (x - 1)); } template -void printVec(T &vec, int len) -{ +void printVec(T& vec, int len) { std::cout << "[ "; for (int i = 0; i < len; i++) - std::cout << vec[i] << " "; + std::cout << vec[i] << " "; std::cout << "]" << std::endl; } -inline std::complex WNk(int N, int k) -{ - return std::complex(exp(-2*M_PI*1/N*k*1i)); +inline std::complex WNk(int N, int k) { + return std::complex(exp(-2 * M_PI * 1 / N * k * 1i)); } 
-inline int ceilPowOf2(unsigned int v) -{ - return static_cast(std::bit_ceil(v)); +inline int ceilPowOf2(unsigned int v) { + return static_cast(std::bit_ceil(v)); } -inline int ilog2(uint32_t x) -{ +inline int ilog2(uint32_t x) { return static_cast(log2(x)); } -class signal -{ -public: - - signal() = default; - signal(int N) - { - if (N <= 0) - { - std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; - exit(1); +class signal { + public: + signal() = default; + + signal(int N) { + if (N <= 0) { + std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; + exit(1); + } + y.reserve(ceilPowOf2(N)); + y.resize(N); } - y.reserve(ceilPowOf2(N)); - y.resize(N); - } - - signal(signal &rhs) - { - y = rhs.y; - } - signal(std::vector &in) - { - y = std::move(in); - } - - signal(int N, sig_type type) - { - if (N <= 0) - { - std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; - exit(1); + + signal(signal& rhs) { y = rhs.y; } + + signal(std::vector& in) { y = std::move(in); } + + signal(int N, sig_type type) { + if (N <= 0) { + std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; + exit(1); + } + y.reserve(ceilPowOf2(N)); + y.resize(N); + signalGenerator(type); } - y.reserve(ceilPowOf2(N)); - y.resize(N); - signalGenerator(type); - } - - void signalGenerator(sig_type type=sig_type::box) - { - int N = y.size(); - - switch (type) { - case sig_type::square: - for (int n = 0; n < N; ++n) - y[n] = (n < N / 4 || n > 3 * N/4) ? 1.0 : -1.0; - break; - case sig_type::sinusoid: - for (int n = 0; n < N; ++n) - y[n] = std::sin(2.0 * M_PI * n / N); - break; - case sig_type::sawtooth: - for (int n = 0; n < N; ++n) - y[n] = 2.0 * (n / N) - 1.0; - break; - case sig_type::triangle: - for (int n = 0; n < N; ++n) - y[n] = 2.0 * std::abs(2.0 * (n / N) - 1.0) - 1.0; - break; - case sig_type::sinc: - y[0] = 1.0; - for (int n = 1; n < N; ++n) - y[n] = std::sin(2.0 * M_PI * n / N) / (2.0 * M_PI * n / N); - break; - case sig_type::box: - for (int n = 0; n < N; ++n) - y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; - break; - default: - std::cerr << "FATAL: Unknown signal type. exiting.." << std::endl; - exit(1); + + void signalGenerator(sig_type type = sig_type::box) { + int N = y.size(); + + switch (type) { + case sig_type::square: + for (int n = 0; n < N; ++n) + y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : -1.0; + break; + case sig_type::sinusoid: + for (int n = 0; n < N; ++n) + y[n] = std::sin(2.0 * M_PI * n / N); + break; + case sig_type::sawtooth: + for (int n = 0; n < N; ++n) + y[n] = 2.0 * (n / N) - 1.0; + break; + case sig_type::triangle: + for (int n = 0; n < N; ++n) + y[n] = 2.0 * std::abs(2.0 * (n / N) - 1.0) - 1.0; + break; + case sig_type::sinc: + y[0] = 1.0; + for (int n = 1; n < N; ++n) + y[n] = std::sin(2.0 * M_PI * n / N) / (2.0 * M_PI * n / N); + break; + case sig_type::box: + for (int n = 0; n < N; ++n) + y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; + break; + default: + std::cerr << "FATAL: Unknown signal type. exiting.." 
<< std::endl; + exit(1); + } } - } - ~signal() - { - y.clear(); - } + ~signal() { y.clear(); } - data_t *data() { return y.data(); } - int len() { return y.size(); } + data_t* data() { return y.data(); } - void resize(int N) - { - if (N != y.size()) - y.resize(N, 0); - } + int len() { return y.size(); } - data_t &operator[](int n) - { - return y[n]; - } + void resize(int N) { + if (N != y.size()) + y.resize(N, 0); + } - data_t &operator()(int n) - { - return y[n]; - } + data_t& operator[](int n) { return y[n]; } - void printSignal() { - std::cout << std::fixed << std::setprecision(2); + data_t& operator()(int n) { return y[n]; } - std::cout << "[ "; - for (auto &el : y) - std::cout << el << " "; + void printSignal() { + std::cout << std::fixed << std::setprecision(2); - std::cout << "]" << std::endl; - } + std::cout << "[ "; + for (auto& el : y) + std::cout << el << " "; + + std::cout << "]" << std::endl; + } -private: - // y[n] - std::vector y; + private: + // y[n] + std::vector y; }; using sig_t = signal; diff --git a/apps/heat-equation/heat-equation-cuda.cpp b/apps/heat-equation/heat-equation-cuda.cpp index 3ea2988..b8cca1b 100644 --- a/apps/heat-equation/heat-equation-cuda.cpp +++ b/apps/heat-equation/heat-equation-cuda.cpp @@ -41,15 +41,14 @@ __constant__ Real_t dx[2]; // error checking function template -static inline void check(T result, const char* const file, const int line, - bool is_fatal = true) { - if (result != cudaSuccess) { - std::cerr << "CUDA error at " << file << ":" << line << std::endl; - std::cerr << cudaGetErrorString(result) << std::endl; - - if (is_fatal) - exit(result); - } +static inline void check(T result, const char* const file, const int line, bool is_fatal = true) { + if (result != cudaSuccess) { + std::cerr << "CUDA error at " << file << ":" << line << std::endl; + std::cerr << cudaGetErrorString(result) << std::endl; + + if (is_fatal) + exit(result); + } } // @@ -57,24 +56,24 @@ static inline void check(T result, const char* const file, const int line, // template __global__ void initialize(T* phi, int ncells, int ghost_cells) { - int ind = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int gsize = ncells * ncells; + int ind = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + int gsize = ncells * ncells; - for (; ind < gsize; ind += blockDim.x * gridDim.x) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); + for (; ind < gsize; ind += blockDim.x * gridDim.x) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); - // phi(x,y) = 1 + exp(-r^2) - phi[(i)*phi_old_extent + j] = 1 + exp(-r2); - } + // phi(x,y) = 1 + exp(-r^2) + phi[(i)*phi_old_extent + j] = 1 + exp(-r2); + } } // @@ -82,57 +81,52 @@ __global__ void initialize(T* phi, int ncells, int ghost_cells) { // template __global__ void fillBoundary(T* phi_old, int ncells, int ghost_cells) { - int pos = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int len = phi_old_extent; + int pos = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + 
int len = phi_old_extent; - for (; pos < phi_old_extent - nghosts; pos += blockDim.x * gridDim.x) { - int i = pos + ghost_cells; + for (; pos < phi_old_extent - nghosts; pos += blockDim.x * gridDim.x) { + int i = pos + ghost_cells; - // fill boundary cells in phi_old - phi_old[i] = phi_old[i + (ghost_cells * len)]; + // fill boundary cells in phi_old + phi_old[i] = phi_old[i + (ghost_cells * len)]; - phi_old[i + (len * (len - ghost_cells))] = - phi_old[i + (len * (len - ghost_cells - 1))]; + phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; - phi_old[i * len] = phi_old[(ghost_cells * len) + i]; + phi_old[i * len] = phi_old[(ghost_cells * len) + i]; - phi_old[(len - ghost_cells) + (len * i)] = - phi_old[(len - ghost_cells - 1) + (len * i)]; - } + phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; + } } // // jacobi 2d stencil kernel // template -__global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, - Real_t dt) { - int pos = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int gsize = ncells * ncells; - - for (; pos < gsize; pos += blockDim.x * gridDim.x) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new[(i - 1) * ncells + j - 1] = - phi_old[(i)*phi_old_extent + j] + - alpha * dt * - - ((phi_old[(i + 1) * phi_old_extent + j] - - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i - 1) * phi_old_extent + j]) / - (dx[0] * dx[0]) + - - (phi_old[(i)*phi_old_extent + j + 1] - - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i)*phi_old_extent + j - 1]) / - (dx[1] * dx[1])); - } +__global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, Real_t dt) { + int pos = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + int gsize = ncells * ncells; + + for (; pos < gsize; pos += blockDim.x * gridDim.x) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new[(i - 1) * ncells + j - 1] = + phi_old[(i)*phi_old_extent + j] + + alpha * dt * + + ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i - 1) * phi_old_extent + j]) / + (dx[0] * dx[0]) + + + (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i)*phi_old_extent + j - 1]) / + (dx[1] * dx[1])); + } } // @@ -140,127 +134,121 @@ __global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, // template __global__ void parallelCopy(T* phi_old, T* phi_new, int ncells) { - int pos = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int gsize = ncells * ncells; - - for (; pos < gsize; pos += blockDim.x * gridDim.x) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; - } + int pos = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + int gsize = ncells * ncells; + + for (; pos < gsize; pos += blockDim.x * gridDim.x) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; + } } // // main simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); - // see 
if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; - // init simulation time - Real_t time = 0.0; + // init simulation time + Real_t time = 0.0; - // initialize dx, dy, dz - Real_t h_dx[dims]; - for (int i = 0; i < dims; ++i) - h_dx[i] = 1.0 / (ncells - 1); + // initialize dx, dy, dz + Real_t h_dx[dims]; + for (int i = 0; i < dims; ++i) + h_dx[i] = 1.0 / (ncells - 1); - cudaErrorCheck(cudaMemcpyToSymbol(dx, h_dx, sizeof(Real_t) * dims)); + cudaErrorCheck(cudaMemcpyToSymbol(dx, h_dx, sizeof(Real_t) * dims)); - // grid size - int gsize = ncells * ncells; + // grid size + int gsize = ncells * ncells; - // host memory for printing - Real_t* h_phi = nullptr; + // host memory for printing + Real_t* h_phi = nullptr; - // simulation setup (2D) - Real_t* phi_old = nullptr; - Real_t* phi_new = nullptr; + // simulation setup (2D) + Real_t* phi_old = nullptr; + Real_t* phi_new = nullptr; - cudaErrorCheck(cudaMalloc( - &phi_old, sizeof(Real_t) * ((ncells + nghosts) * (ncells + nghosts)))); - cudaErrorCheck(cudaMalloc(&phi_new, sizeof(Real_t) * ((ncells) * (ncells)))); + cudaErrorCheck(cudaMalloc(&phi_old, sizeof(Real_t) * ((ncells + nghosts) * (ncells + nghosts)))); + cudaErrorCheck(cudaMalloc(&phi_new, sizeof(Real_t) * ((ncells) * (ncells)))); - // setup grid - int blockSize = std::min(1024, gsize); // let's do at most 1024 threads. - int nBlocks = (gsize + blockSize - 1) / blockSize; + // setup grid + int blockSize = std::min(1024, gsize); // let's do at most 1024 threads. + int nBlocks = (gsize + blockSize - 1) / blockSize; - Timer timer; + Timer timer; - // initialize grid - initialize<<>>(phi_old, ncells, ghost_cells); + // initialize grid + initialize<<>>(phi_old, ncells, ghost_cells); - cudaErrorCheck(cudaDeviceSynchronize()); + cudaErrorCheck(cudaDeviceSynchronize()); - // print initial grid if needed - if (args.print_grid) { - // copy initial grid to host - h_phi = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - cudaErrorCheck( - cudaMemcpy(h_phi, phi_old, - sizeof(Real_t) * (ncells + nghosts) * (ncells + nghosts), - cudaMemcpyDeviceToHost)); + // print initial grid if needed + if (args.print_grid) { + // copy initial grid to host + h_phi = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + cudaErrorCheck(cudaMemcpy(h_phi, phi_old, sizeof(Real_t) * (ncells + nghosts) * (ncells + nghosts), + cudaMemcpyDeviceToHost)); - printGrid(h_phi, ncells + nghosts); - } + printGrid(h_phi, ncells + nghosts); + } - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static int fBblock = - std::min(1024, ncells); // let's do at most 1024 threads. - static int fBnBlocks = - (ncells + fBblock - 1) / fBblock; // fillBoundary blocks + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static int fBblock = std::min(1024, ncells); // let's do at most 1024 threads. 
+ static int fBnBlocks = (ncells + fBblock - 1) / fBblock; // fillBoundary blocks - // fillboundary - fillBoundary<<>>(phi_old, ncells, ghost_cells); + // fillboundary + fillBoundary<<>>(phi_old, ncells, ghost_cells); - // jacobi - jacobi<<>>(phi_old, phi_new, ncells, alpha, dt); + // jacobi + jacobi<<>>(phi_old, phi_new, ncells, alpha, dt); - // parallelCopy - parallelCopy<<>>(phi_old, phi_new, ncells); + // parallelCopy + parallelCopy<<>>(phi_old, phi_new, ncells); - cudaErrorCheck(cudaDeviceSynchronize()); + cudaErrorCheck(cudaDeviceSynchronize()); - // update time - time += dt; - } + // update time + time += dt; + } - auto elapsed = timer.stop(); + auto elapsed = timer.stop(); - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - // print final grid if needed - if (args.print_grid) { - cudaErrorCheck(cudaMemcpy(h_phi, phi_new, sizeof(Real_t) * gsize, - cudaMemcpyDeviceToHost)); - printGrid(h_phi, ncells); + // print final grid if needed + if (args.print_grid) { + cudaErrorCheck(cudaMemcpy(h_phi, phi_new, sizeof(Real_t) * gsize, cudaMemcpyDeviceToHost)); + printGrid(h_phi, ncells); - // free host memory - delete[] h_phi; - h_phi = nullptr; - } + // free host memory + delete[] h_phi; + h_phi = nullptr; + } - // free device memory - cudaErrorCheck(cudaFree(phi_old)); - cudaErrorCheck(cudaFree(phi_new)); + // free device memory + cudaErrorCheck(cudaFree(phi_old)); + cudaErrorCheck(cudaFree(phi_new)); - return 0; + return 0; } diff --git a/apps/heat-equation/heat-equation-gpu-scheduler.cpp b/apps/heat-equation/heat-equation-gpu-scheduler.cpp index b294235..2b9590d 100644 --- a/apps/heat-equation/heat-equation-gpu-scheduler.cpp +++ b/apps/heat-equation/heat-equation-gpu-scheduler.cpp @@ -44,138 +44,132 @@ using namespace nvexec; // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // init simulation time - Real_t time = 0.0; - - // initialize dx, dy, dz - thrust::universal_vector dx(dims); - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - thrust::universal_vector grid_old((ncells + nghosts) * - (ncells + nghosts)); - thrust::universal_vector grid_new(ncells * ncells); - - /* Real_t *grid_old = new Real_t[(ncells+nghosts) * (ncells+nghosts)]; + // parse params + heat_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // init simulation time + Real_t time = 0.0; + + // initialize dx, dy, dz + thrust::universal_vector dx(dims); + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + thrust::universal_vector grid_old((ncells + nghosts) * (ncells + nghosts)); + thrust::universal_vector grid_new(ncells * ncells); + + /* Real_t *grid_old = new 
Real_t[(ncells+nghosts) * (ncells+nghosts)]; Real_t *grid_new = new Real_t[(ncells) * (ncells)];*/ - // initialize grid - auto phi_old = thrust::raw_pointer_cast(grid_old.data()); - auto phi_new = thrust::raw_pointer_cast(grid_new.data()); - - Timer timer; - - // scheduler from gpu - nvexec::stream_context stream_ctx{}; - auto gpu = stream_ctx.get_scheduler(); - - auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), - thrust::raw_pointer_cast(dx.data()) + dx.size()}; - auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; - auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; - auto phi_old_extent = ncells + nghosts; - - int gsize = ncells * ncells; - auto heat_eq_init = ex::transfer_just(gpu, dx_span, phi_old_span) | - ex::bulk(gsize, [=](int pos, auto ds, auto phi) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - Real_t x = pos(i, ghost_cells, ds[0]); - Real_t y = pos(j, ghost_cells, ds[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi[(i)*phi_old_extent + j] = 1 + exp(-r2); - }); - - ex::sync_wait(std::move(heat_eq_init)); - if (args.print_grid) - printGrid(phi_old, ncells + nghosts); - - auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static auto evolve = - tx | - ex::bulk(phi_old_extent - nghosts, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = pos + ghost_cells; - int len = phi_old_extent; - // fill boundary cells in old_phi - phi_old[i] = phi_old[i + (ghost_cells * len)]; - phi_old[i + (len * (len - ghost_cells))] = - phi_old[i + (len * (len - ghost_cells - 1))]; - phi_old[i * len] = phi_old[(ghost_cells * len) + i]; - phi_old[(len - ghost_cells) + (len * i)] = - phi_old[(len - ghost_cells - 1) + (len * i)]; - }) | - ex::bulk(gsize, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new[(i - 1) * ncells + j - 1] = - phi_old[(i)*phi_old_extent + j] + - alpha * dt * - ((phi_old[(i + 1) * phi_old_extent + j] - - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i - 1) * phi_old_extent + j]) / - (ds[0] * ds[0]) + - (phi_old[(i)*phi_old_extent + j + 1] - - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i)*phi_old_extent + j - 1]) / - (ds[1] * ds[1])); - }) | - ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; - }); + // initialize grid + auto phi_old = thrust::raw_pointer_cast(grid_old.data()); + auto phi_new = thrust::raw_pointer_cast(grid_new.data()); - ex::sync_wait(std::move(evolve)); + Timer timer; - // update the simulation time - time += dt; - } + // scheduler from gpu + nvexec::stream_context stream_ctx{}; + auto gpu = stream_ctx.get_scheduler(); - auto elapsed = timer.stop(); + auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), thrust::raw_pointer_cast(dx.data()) + dx.size()}; + auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; + auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; + auto phi_old_extent = ncells + nghosts; - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } + int gsize = ncells * ncells; + auto heat_eq_init = + ex::transfer_just(gpu, dx_span, phi_old_span) | ex::bulk(gsize, [=](int pos, auto 
ds, auto phi) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); - auto finalize = ex::then(ex::just(), [&]() { - if (args.print_grid) - // print the final grid - printGrid(phi_new, ncells); - }); + Real_t x = pos(i, ghost_cells, ds[0]); + Real_t y = pos(j, ghost_cells, ds[1]); - // end the simulation - ex::sync_wait(std::move(finalize)); + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); - return 0; + // phi(x,y) = 1 + exp(-r^2) + phi[(i)*phi_old_extent + j] = 1 + exp(-r2); + }); + + ex::sync_wait(std::move(heat_eq_init)); + if (args.print_grid) + printGrid(phi_old, ncells + nghosts); + + auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static auto evolve = + tx | + ex::bulk(phi_old_extent - nghosts, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = pos + ghost_cells; + int len = phi_old_extent; + // fill boundary cells in old_phi + phi_old[i] = phi_old[i + (ghost_cells * len)]; + phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; + phi_old[i * len] = phi_old[(ghost_cells * len) + i]; + phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; + }) | + ex::bulk(gsize, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new[(i - 1) * ncells + j - 1] = + phi_old[(i)*phi_old_extent + j] + + alpha * dt * + ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i - 1) * phi_old_extent + j]) / + (ds[0] * ds[0]) + + (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i)*phi_old_extent + j - 1]) / + (ds[1] * ds[1])); + }) | + ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; + }); + + ex::sync_wait(std::move(evolve)); + + // update the simulation time + time += dt; + } + + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + auto finalize = ex::then(ex::just(), [&]() { + if (args.print_grid) + // print the final grid + printGrid(phi_new, ncells); + }); + + // end the simulation + ex::sync_wait(std::move(finalize)); + + return 0; } \ No newline at end of file diff --git a/apps/heat-equation/heat-equation-mdspan.cpp b/apps/heat-equation/heat-equation-mdspan.cpp index 1ae243b..f38b9ed 100644 --- a/apps/heat-equation/heat-equation-mdspan.cpp +++ b/apps/heat-equation/heat-equation-mdspan.cpp @@ -33,128 +33,121 @@ // fill boundary cells template void fill2Dboundaries_mdspan(T* grid, int len, int ghost_cells = 1) { - auto row_view = std::mdspan(grid, len, len); - - for (auto j = ghost_cells; j < row_view.extent(1) - ghost_cells; ++j) { - row_view(0, j) = row_view(ghost_cells, j); - row_view(row_view.extent(0) - ghost_cells, j) = - row_view(row_view.extent(0) - ghost_cells - 1, j); - } - - auto col_view = - std::mdspan(grid, len, len); - - for (auto i = 1; i < col_view.extent(1) - 1; ++i) { - col_view(0, i) = col_view(ghost_cells, i); - col_view(col_view.extent(0) - 1, i) = - col_view(col_view.extent(0) - ghost_cells - 1, i); - } + auto row_view = std::mdspan(grid, len, len); + + for (auto j = ghost_cells; j < row_view.extent(1) - ghost_cells; ++j) { + row_view(0, j) = row_view(ghost_cells, j); + 
row_view(row_view.extent(0) - ghost_cells, j) = row_view(row_view.extent(0) - ghost_cells - 1, j); + } + + auto col_view = std::mdspan(grid, len, len); + + for (auto i = 1; i < col_view.extent(1) - 1; ++i) { + col_view(0, i) = col_view(ghost_cells, i); + col_view(col_view.extent(0) - 1, i) = col_view(col_view.extent(0) - ghost_cells - 1, i); + } } // // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - - auto phi_old = std::mdspan( - grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = - std::mdspan(grid_new, ncells, ncells); - - Timer timer; - - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] - for (int i = 1; i < phi_old.extent(0) - 1; ++i) { - for (int j = 1; j < phi_old.extent(1) - 1; ++j) { - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - } - } - - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); - - // init simulation time - Real_t time = 0.0; - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - // fill boundary cells in old_phi - fill2Dboundaries_mdspan(grid_old, ncells + nghosts, ghost_cells); - - // update phi_new - for (auto i = 1; i < phi_old.extent(0) - 1; i++) { - for (auto j = 1; j < phi_old.extent(1) - 1; j++) { - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + - alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / - (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / - (dx[1] * dx[1])); - } + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; } - // update the simulation time - time += dt; + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - // parallel copy phi_new to phi_old - for (auto i = 1; i < phi_old.extent(0) - 1; i++) - for (auto j = 1; j < phi_old.extent(1) - 1; j++) - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - } + auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = std::mdspan(grid_new, ncells, ncells); - auto elapsed = timer.stop(); + Timer timer; - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " 
ms" << std::endl; - } + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + for (int i = 1; i < phi_old.extent(0) - 1; ++i) { + for (int j = 1; j < phi_old.extent(1) - 1; ++j) { + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); - // delete all memory - delete[] grid_old; - delete[] grid_new; + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + } + } + + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); + + // init simulation time + Real_t time = 0.0; + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + // fill boundary cells in old_phi + fill2Dboundaries_mdspan(grid_old, ncells + nghosts, ghost_cells); + + // update phi_new + for (auto i = 1; i < phi_old.extent(0) - 1; i++) { + for (auto j = 1; j < phi_old.extent(1) - 1; j++) { + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); + } + } + + // update the simulation time + time += dt; + + // parallel copy phi_new to phi_old + for (auto i = 1; i < phi_old.extent(0) - 1; i++) + for (auto j = 1; j < phi_old.extent(1) - 1; j++) + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + } - grid_old = nullptr; - grid_new = nullptr; + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - return 0; + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); + + // delete all memory + delete[] grid_old; + delete[] grid_new; + + grid_old = nullptr; + grid_new = nullptr; + + return 0; } diff --git a/apps/heat-equation/heat-equation-multigpu-scheduler.cpp b/apps/heat-equation/heat-equation-multigpu-scheduler.cpp index efcc9e5..d8e79b3 100644 --- a/apps/heat-equation/heat-equation-multigpu-scheduler.cpp +++ b/apps/heat-equation/heat-equation-multigpu-scheduler.cpp @@ -44,135 +44,129 @@ using namespace nvexec; // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // init simulation time - Real_t time = 0.0; - - // initialize dx, dy, dz - thrust::universal_vector dx(dims); - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - thrust::universal_vector grid_old((ncells + nghosts) * - (ncells + nghosts)); - thrust::universal_vector grid_new(ncells * ncells); - - // initialize grid - auto phi_old = thrust::raw_pointer_cast(grid_old.data()); - auto phi_new = thrust::raw_pointer_cast(grid_new.data()); - - Timer timer; - - // scheduler from gpu - nvexec::multi_gpu_stream_context stream_ctx{}; - auto gpu = stream_ctx.get_scheduler(); - - auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), - thrust::raw_pointer_cast(dx.data()) + dx.size()}; - auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; - auto phi_new_span = 
std::span{phi_new, phi_new + grid_new.size()}; - auto phi_old_extent = ncells + nghosts; - - int gsize = ncells * ncells; - auto heat_eq_init = ex::transfer_just(gpu, dx_span, phi_old_span) | - ex::bulk(gsize, [=](int pos, auto ds, auto phi) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - Real_t x = pos(i, ghost_cells, ds[0]); - Real_t y = pos(j, ghost_cells, ds[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi[(i)*phi_old_extent + j] = 1 + exp(-r2); - }); - - ex::sync_wait(std::move(heat_eq_init)); - if (args.print_grid) - printGrid(phi_old, ncells + nghosts); - - auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static auto evolve = - tx | - ex::bulk(phi_old_extent - nghosts, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = pos + ghost_cells; - int len = phi_old_extent; - // fill boundary cells in old_phi - phi_old[i] = phi_old[i + (ghost_cells * len)]; - phi_old[i + (len * (len - ghost_cells))] = - phi_old[i + (len * (len - ghost_cells - 1))]; - phi_old[i * len] = phi_old[(ghost_cells * len) + i]; - phi_old[(len - ghost_cells) + (len * i)] = - phi_old[(len - ghost_cells - 1) + (len * i)]; - }) | - ex::bulk(gsize, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new[(i - 1) * ncells + j - 1] = - phi_old[(i)*phi_old_extent + j] + - alpha * dt * - ((phi_old[(i + 1) * phi_old_extent + j] - - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i - 1) * phi_old_extent + j]) / - (ds[0] * ds[0]) + - (phi_old[(i)*phi_old_extent + j + 1] - - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i)*phi_old_extent + j - 1]) / - (ds[1] * ds[1])); - }) | - ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; + // parse params + heat_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // init simulation time + Real_t time = 0.0; + + // initialize dx, dy, dz + thrust::universal_vector dx(dims); + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + thrust::universal_vector grid_old((ncells + nghosts) * (ncells + nghosts)); + thrust::universal_vector grid_new(ncells * ncells); + + // initialize grid + auto phi_old = thrust::raw_pointer_cast(grid_old.data()); + auto phi_new = thrust::raw_pointer_cast(grid_new.data()); + + Timer timer; + + // scheduler from gpu + nvexec::multi_gpu_stream_context stream_ctx{}; + auto gpu = stream_ctx.get_scheduler(); + + auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), thrust::raw_pointer_cast(dx.data()) + dx.size()}; + auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; + auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; + auto phi_old_extent = ncells + nghosts; + + int gsize = ncells * ncells; + auto heat_eq_init = + ex::transfer_just(gpu, dx_span, phi_old_span) | ex::bulk(gsize, [=](int pos, auto ds, auto phi) { + int i = 1 + (pos / 
ncells); + int j = 1 + (pos % ncells); + + Real_t x = pos(i, ghost_cells, ds[0]); + Real_t y = pos(j, ghost_cells, ds[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi[(i)*phi_old_extent + j] = 1 + exp(-r2); }); - ex::sync_wait(std::move(evolve)); - - // update the simulation time - time += dt; - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - auto finalize = ex::then(ex::just(), [&]() { + ex::sync_wait(std::move(heat_eq_init)); if (args.print_grid) - // print the final grid - printGrid(phi_new, ncells); - }); + printGrid(phi_old, ncells + nghosts); + + auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static auto evolve = + tx | + ex::bulk(phi_old_extent - nghosts, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = pos + ghost_cells; + int len = phi_old_extent; + // fill boundary cells in old_phi + phi_old[i] = phi_old[i + (ghost_cells * len)]; + phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; + phi_old[i * len] = phi_old[(ghost_cells * len) + i]; + phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; + }) | + ex::bulk(gsize, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new[(i - 1) * ncells + j - 1] = + phi_old[(i)*phi_old_extent + j] + + alpha * dt * + ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i - 1) * phi_old_extent + j]) / + (ds[0] * ds[0]) + + (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i)*phi_old_extent + j - 1]) / + (ds[1] * ds[1])); + }) | + ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; + }); + + ex::sync_wait(std::move(evolve)); + + // update the simulation time + time += dt; + } + + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + auto finalize = ex::then(ex::just(), [&]() { + if (args.print_grid) + // print the final grid + printGrid(phi_new, ncells); + }); + + // end the simulation + ex::sync_wait(std::move(finalize)); - // end the simulation - ex::sync_wait(std::move(finalize)); - - return 0; + return 0; } \ No newline at end of file diff --git a/apps/heat-equation/heat-equation-omp.cpp b/apps/heat-equation/heat-equation-omp.cpp index 6af69b0..ebf89e2 100644 --- a/apps/heat-equation/heat-equation-omp.cpp +++ b/apps/heat-equation/heat-equation-omp.cpp @@ -33,134 +33,126 @@ // fill boundary cells OpenMP template -void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, - int ghost_cells = 1) { +void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, int ghost_cells = 1) { #pragma omp parallel for num_threads(nthreads) - for (int i = ghost_cells; i < len - ghost_cells; i++) { - grid[i] = grid[i + (ghost_cells * len)]; - grid[i + (len * (len - ghost_cells))] = - grid[i + (len * (len - ghost_cells - 1))]; - - grid[i * len] = grid[(ghost_cells * len) + i]; - grid[(len - ghost_cells) + (len * i)] = - grid[(len - ghost_cells - 1) + (len * i)]; - } + for (int i = ghost_cells; i < len - 
ghost_cells; i++) { + grid[i] = grid[i + (ghost_cells * len)]; + grid[i + (len * (len - ghost_cells))] = grid[i + (len * (len - ghost_cells - 1))]; + + grid[i * len] = grid[(ghost_cells * len) + i]; + grid[(len - ghost_cells) + (len * i)] = grid[(len - ghost_cells - 1) + (len * i)]; + } } // // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - int nthreads = args.nthreads; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + int nthreads = args.nthreads; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - auto phi_old = std::mdspan( - grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = - std::mdspan(grid_new, ncells, ncells); + auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = std::mdspan(grid_new, ncells, ncells); - int gsize = ncells * ncells; + int gsize = ncells * ncells; - Timer timer; + Timer timer; - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] #pragma omp parallel for num_threads(nthreads) - for (int pos = 0; pos < gsize; pos++) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); + for (int pos = 0; pos < gsize; pos++) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - } + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + } - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); - // init simulation time - Real_t time = 0.0; + // init simulation time + Real_t time = 0.0; - // evolve the system - for (auto step = 0; step < nsteps; step++) { - // fill boundary cells in old_phi - fill2Dboundaries_omp(grid_old, ncells + nghosts, ghost_cells, nthreads); + // evolve the system + for (auto step = 0; step < nsteps; step++) { + // fill boundary cells in old_phi + 
fill2Dboundaries_omp(grid_old, ncells + nghosts, ghost_cells, nthreads); #pragma omp parallel for num_threads(nthreads) - for (int pos = 0; pos < gsize; pos++) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + - alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / - (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / - (dx[1] * dx[1])); - } + for (int pos = 0; pos < gsize; pos++) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); + } - // update the simulation time - time += dt; + // update the simulation time + time += dt; - // parallel copy phi_new to phi_old + // parallel copy phi_new to phi_old #pragma omp parallel for num_threads(nthreads) - for (int pos = 0; pos < gsize; pos++) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); + for (int pos = 0; pos < gsize; pos++) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + } } - } - auto elapsed = timer.stop(); + auto elapsed = timer.stop(); - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); - // delete all memory - delete[] grid_old; - delete[] grid_new; + // delete all memory + delete[] grid_old; + delete[] grid_new; - grid_old = nullptr; - grid_new = nullptr; + grid_old = nullptr; + grid_new = nullptr; - return 0; + return 0; } diff --git a/apps/heat-equation/heat-equation-stdpar-senders.cpp b/apps/heat-equation/heat-equation-stdpar-senders.cpp index f83b113..5209f37 100644 --- a/apps/heat-equation/heat-equation-stdpar-senders.cpp +++ b/apps/heat-equation/heat-equation-stdpar-senders.cpp @@ -45,166 +45,156 @@ using stdexec::sync_wait; // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - // number of parallel tiles - int ntiles = args.ntiles; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // init simulation time - Real_t time = 0.0; - - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - - auto phi_old = std::mdspan( - grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = - std::mdspan(grid_new, ncells, ncells); - - Timer timer; - - // scheduler from a thread pool - exec::static_thread_pool ctx{ntiles}; - - scheduler auto sch = ctx.get_scheduler(); - sender auto begin = schedule(sch); - - 
// initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] - sender auto heat_eq_init = - bulk(begin, ntiles, - [&](int tile) { - int start = tile * (ncells * ncells) / ntiles; - int size = (ncells * ncells) / ntiles; - int remaining = (ncells * ncells) % ntiles; - size += (tile == ntiles - 1) ? remaining : 0; - - std::for_each_n(std::execution::par_unseq, - counting_iterator(start), size, [=](int pos) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - }); - }) | - then([&]() { - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); - }); - - // start the simulation - sync_wait(std::move(heat_eq_init)); - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static sender auto evolve = - then(begin, - [&]() { - // fill boundary cells in old_phi - fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); - }) | - bulk(ntiles, + // parse params + heat_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + // number of parallel tiles + int ntiles = args.ntiles; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // init simulation time + Real_t time = 0.0; + + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + + auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = std::mdspan(grid_new, ncells, ncells); + + Timer timer; + + // scheduler from a thread pool + exec::static_thread_pool ctx{ntiles}; + + scheduler auto sch = ctx.get_scheduler(); + sender auto begin = schedule(sch); + + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + sender auto heat_eq_init = + bulk(begin, ntiles, [&](int tile) { - int start = tile * (ncells * ncells) / ntiles; - int size = (ncells * ncells) / ntiles; - int remaining = (ncells * ncells) % ntiles; - size += (tile == ntiles - 1) ? remaining : 0; - - // update phi_new with stencil - std::for_each_n( - std::execution::par_unseq, counting_iterator(start), size, - [=](int pos) { + int start = tile * (ncells * ncells) / ntiles; + int size = (ncells * ncells) / ntiles; + int remaining = (ncells * ncells) % ntiles; + size += (tile == ntiles - 1) ? remaining : 0; + + std::for_each_n(std::execution::par_unseq, counting_iterator(start), size, [=](int pos) { int i = 1 + (pos / ncells); int j = 1 + (pos % ncells); - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + - alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + - phi_old(i - 1, j)) / - (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + - phi_old(i, j - 1)) / - (dx[1] * dx[1])); - }); - }) | - bulk(ntiles, - [&](int tile) { - int start = tile * (ncells * ncells) / ntiles; - int size = (ncells * ncells) / ntiles; - int remaining = (ncells * ncells) % ntiles; - size += (tile == ntiles - 1) ? 
remaining : 0; - - // parallel copy phi_new to phi_old - std::for_each_n(std::execution::par_unseq, - counting_iterator(start), size, [=](int pos) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - }); + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + }); }) | then([&]() { - // update the simulation time - time += dt; + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); }); - sync_wait(std::move(evolve)); - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - sender auto finalize = then(just(), - [&]() { - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); - }) | - then([&]() { - // delete all memory - delete[] grid_old; - delete[] grid_new; + // start the simulation + sync_wait(std::move(heat_eq_init)); + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static sender auto evolve = + then(begin, + [&]() { + // fill boundary cells in old_phi + fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); + }) | + bulk(ntiles, + [&](int tile) { + int start = tile * (ncells * ncells) / ntiles; + int size = (ncells * ncells) / ntiles; + int remaining = (ncells * ncells) % ntiles; + size += (tile == ntiles - 1) ? remaining : 0; + + // update phi_new with stencil + std::for_each_n(std::execution::par_unseq, counting_iterator(start), size, [=](int pos) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); + }); + }) | + bulk(ntiles, + [&](int tile) { + int start = tile * (ncells * ncells) / ntiles; + int size = (ncells * ncells) / ntiles; + int remaining = (ncells * ncells) % ntiles; + size += (tile == ntiles - 1) ? 
remaining : 0; + + // parallel copy phi_new to phi_old + std::for_each_n(std::execution::par_unseq, counting_iterator(start), size, [=](int pos) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + }); + }) | + then([&]() { + // update the simulation time + time += dt; + }); + + sync_wait(std::move(evolve)); + } + + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + sender auto finalize = then(just(), + [&]() { + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); + }) | + then([&]() { + // delete all memory + delete[] grid_old; + delete[] grid_new; + + grid_old = nullptr; + grid_new = nullptr; + }); + + // start the simulation + sync_wait(std::move(finalize)); - grid_old = nullptr; - grid_new = nullptr; - }); - - // start the simulation - sync_wait(std::move(finalize)); - - return 0; + return 0; } diff --git a/apps/heat-equation/heat-equation-stdpar.cpp b/apps/heat-equation/heat-equation-stdpar.cpp index b20fb68..164c482 100644 --- a/apps/heat-equation/heat-equation-stdpar.cpp +++ b/apps/heat-equation/heat-equation-stdpar.cpp @@ -34,117 +34,107 @@ // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + + auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = std::mdspan(grid_new, ncells, ncells); + + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + + Timer timer; + + std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); + + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + }); + + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); + + // init simulation time + Real_t time = 0.0; + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + // fill boundary cells in old_phi + fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); + + // update phi_new with stencil + std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); + + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); + }); + + // update the simulation time + time += dt; + + // parallel copy phi_new to phi_old + 
std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); + + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + }); + } + + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); + + // delete all memory + delete[] grid_old; + delete[] grid_new; + + grid_old = nullptr; + grid_new = nullptr; - // see if help wanted - if (args.help) { - args.print(); // prints all variables return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - - auto phi_old = std::mdspan( - grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = - std::mdspan(grid_new, ncells, ncells); - - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] - - Timer timer; - - std::for_each_n(std::execution::par_unseq, counting_iterator(0), - ncells * ncells, [=](int ind) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); - - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - }); - - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); - - // init simulation time - Real_t time = 0.0; - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - // fill boundary cells in old_phi - fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); - - // update phi_new with stencil - std::for_each_n(std::execution::par_unseq, counting_iterator(0), - ncells * ncells, [=](int ind) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); - - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + - alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + - phi_old(i - 1, j)) / - (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + - phi_old(i, j - 1)) / - (dx[1] * dx[1])); - }); - - // update the simulation time - time += dt; - - // parallel copy phi_new to phi_old - std::for_each_n(std::execution::par_unseq, counting_iterator(0), - ncells * ncells, [=](int ind) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); - - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - }); - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); - - // delete all memory - delete[] grid_old; - delete[] grid_new; - - grid_old = nullptr; - grid_new = nullptr; - - return 0; } diff --git a/apps/heat-equation/heat-equation.hpp b/apps/heat-equation/heat-equation.hpp index a226a45..94bf8b6 100644 --- a/apps/heat-equation/heat-equation.hpp +++ b/apps/heat-equation/heat-equation.hpp @@ -49,62 +49,57 @@ constexpr int nghosts = ghost_cells * dims; using view_2d = 
std::extents; // 3D view -using view_3d = std::extents; +using view_3d = std::extents; // macros to get x and y positions from indices #define pos(i, ghosts, dx) -0.5 + dx*(i - ghosts) // parameters struct heat_params_t : public argparse::Args { - int& ncells = kwarg("n,ncells", "number of cells on each side of the domain") - .set_default(32); - int& nsteps = kwarg("s,nsteps", "total steps in simulation").set_default(100); + int& ncells = kwarg("n,ncells", "number of cells on each side of the domain").set_default(32); + int& nsteps = kwarg("s,nsteps", "total steps in simulation").set_default(100); #if defined(HEQ_OMP) - int& nthreads = kwarg("nthreads", "number of threads").set_default(1); + int& nthreads = kwarg("nthreads", "number of threads").set_default(1); #endif // HEQ_OMP - Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f); - Real_t& dt = kwarg("t,dt", "time step").set_default(5.0e-5f); - bool& help = flag("h, help", "print help"); - bool& print_grid = flag("p,print", "print grids at step 0 and step n"); - bool& print_time = flag("time", "print simulation time"); + Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f); + Real_t& dt = kwarg("t,dt", "time step").set_default(5.0e-5f); + bool& help = flag("h, help", "print help"); + bool& print_grid = flag("p,print", "print grids at step 0 and step n"); + bool& print_time = flag("time", "print simulation time"); #if defined(TILING) - int& ntiles = kwarg("ntiles", "number of parallel tiles").set_default(4); + int& ntiles = kwarg("ntiles", "number of parallel tiles").set_default(4); #endif // TILING \ // future use if needed \ // int &max_grid_size = kwarg("g, max_grid_size", "size of each box (or - // grid)").set_default(32); bool &verbose = kwarg("v, verbose", "verbose - // mode").set_default(false); int &plot_int = kwarg("p, plot_int", "how often - // to write a plotfile").set_default(-1); + // grid)").set_default(32); bool &verbose = kwarg("v, verbose", "verbose + // mode").set_default(false); int &plot_int = kwarg("p, plot_int", "how often + // to write a plotfile").set_default(-1); }; template void printGrid(T* grid, int len) { - auto view = std::mdspan(grid, len, len); - std::cout << "Grid: " << std::endl; - std::cout << std::fixed << std::showpoint; - std::cout << std::setprecision(2); + auto view = std::mdspan(grid, len, len); + std::cout << "Grid: " << std::endl; + std::cout << std::fixed << std::showpoint; + std::cout << std::setprecision(2); - for (auto j = 0; j < view.extent(1); ++j) { - for (auto i = 0; i < view.extent(0); ++i) { - std::cout << view(i, j) << ", "; + for (auto j = 0; j < view.extent(1); ++j) { + for (auto i = 0; i < view.extent(0); ++i) { + std::cout << view(i, j) << ", "; + } + std::cout << std::endl; } std::cout << std::endl; - } - std::cout << std::endl; } // fill boundary cells template void fill2Dboundaries(T* grid, int len, int ghost_cells = 1) { - std::for_each_n(std::execution::par_unseq, counting_iterator(ghost_cells), - len - nghosts, [=](auto i) { - grid[i] = grid[i + (ghost_cells * len)]; - grid[i + (len * (len - ghost_cells))] = - grid[i + (len * (len - ghost_cells - 1))]; + std::for_each_n(std::execution::par_unseq, counting_iterator(ghost_cells), len - nghosts, [=](auto i) { + grid[i] = grid[i + (ghost_cells * len)]; + grid[i + (len * (len - ghost_cells))] = grid[i + (len * (len - ghost_cells - 1))]; - grid[i * len] = grid[(ghost_cells * len) + i]; - grid[(len - ghost_cells) + (len * i)] = - grid[(len - ghost_cells - 1) + (len * i)]; - }); + grid[i * 
len] = grid[(ghost_cells * len) + i]; + grid[(len - ghost_cells) + (len * i)] = grid[(len - ghost_cells - 1) + (len * i)]; + }); } \ No newline at end of file diff --git a/apps/mdspan-stdpar/mdspan-stdpar.cpp b/apps/mdspan-stdpar/mdspan-stdpar.cpp index 9c92a87..92dbdb9 100644 --- a/apps/mdspan-stdpar/mdspan-stdpar.cpp +++ b/apps/mdspan-stdpar/mdspan-stdpar.cpp @@ -30,58 +30,51 @@ using data_type = int; // 2D view -using extents_type = - std::extents; +using extents_type = std::extents; // 3D view (fix the first dimension to 2) -using extents_type2 = - std::extents; +using extents_type2 = std::extents; int main() { - constexpr int N = 1e9; - std::vector v(N); + constexpr int N = 1e9; + std::vector v(N); - // View data as contiguous memory representing 2 rows of 6 ints each - auto ms2 = std::mdspan(v.data(), - N / 2, 2); - // View the same data as a 3D array 2 (fixed above) x 3 x 2 - auto ms3 = std::mdspan(v.data(), - N / 4, 2); + // View data as contiguous memory representing 2 rows of 6 ints each + auto ms2 = std::mdspan(v.data(), N / 2, 2); + // View the same data as a 3D array 2 (fixed above) x 3 x 2 + auto ms3 = std::mdspan(v.data(), N / 4, 2); - // auto dim2 = [=](int i){int i1 = i/ms2.extent(1); int i2 = i%ms2.extent(1); - // return std::make_tuple(i1, i2);}; auto dim3 = [=](int i){int i1 = - // i/(ms3.extent(1)*ms3.extent(2)); int i2 = (i/ms3.extent(2))%ms3.extent(1); - // int i3 = i%ms3.extent(2); return std::make_tuple(i1, i2, i3);}; + // auto dim2 = [=](int i){int i1 = i/ms2.extent(1); int i2 = i%ms2.extent(1); + // return std::make_tuple(i1, i2);}; auto dim3 = [=](int i){int i1 = + // i/(ms3.extent(1)*ms3.extent(2)); int i2 = (i/ms3.extent(2))%ms3.extent(1); + // int i3 = i%ms3.extent(2); return std::make_tuple(i1, i2, i3);}; - std::for_each(std::execution::par_unseq, ms2.data_handle(), - ms2.data_handle() + ms2.size(), [=](int& i) { - auto global_idx = std::distance(ms2.data_handle(), &i); - dim2(global_idx, ms2); - // auto [i1, i2] = dim2(global_idx); - ms2(ii, ij) = global_idx; - }); + std::for_each(std::execution::par_unseq, ms2.data_handle(), ms2.data_handle() + ms2.size(), [=](int& i) { + auto global_idx = std::distance(ms2.data_handle(), &i); + dim2(global_idx, ms2); + // auto [i1, i2] = dim2(global_idx); + ms2(ii, ij) = global_idx; + }); - std::cout << std::endl << std::endl; + std::cout << std::endl << std::endl; - std::for_each(std::execution::par_unseq, ms2.data_handle(), - ms2.data_handle() + ms2.size(), [=](int& i) { - auto global_idx = std::distance(ms2.data_handle(), &i); - dim3(global_idx, ms3); - // auto [i1, i2, i3] = dim3(global_idx); - ms3(ii, ij, ik) = 1000 + global_idx; - }); + std::for_each(std::execution::par_unseq, ms2.data_handle(), ms2.data_handle() + ms2.size(), [=](int& i) { + auto global_idx = std::distance(ms2.data_handle(), &i); + dim3(global_idx, ms3); + // auto [i1, i2, i3] = dim3(global_idx); + ms3(ii, ij, ik) = 1000 + global_idx; + }); - // read subset of data using 3D view - for (size_t i = 0; i < ms3.extent(0); i++) { - for (size_t j = 0; j < 10; j++) { - for (size_t k = 0; k < ms3.extent(2); k++) { - assert(ms3(i, j, k) == 1000 + i * ms3.extent(1) * ms3.extent(2) + - j * ms3.extent(2) + k); - std::cout << ms3(i, j, k) << " "; - } - std::cout << std::endl; + // read subset of data using 3D view + for (size_t i = 0; i < ms3.extent(0); i++) { + for (size_t j = 0; j < 10; j++) { + for (size_t k = 0; k < ms3.extent(2); k++) { + assert(ms3(i, j, k) == 1000 + i * ms3.extent(1) * ms3.extent(2) + j * ms3.extent(2) + k); + std::cout << ms3(i, j, k) 
<< " "; + } + std::cout << std::endl; + } + std::cout << std::endl; } - std::cout << std::endl; - } - std::cout << ms3(0, 0, 1) << "\n"; + std::cout << ms3(0, 0, 1) << "\n"; } \ No newline at end of file diff --git a/include/commons.hpp b/include/commons.hpp index c043a20..cfacfa1 100644 --- a/include/commons.hpp +++ b/include/commons.hpp @@ -48,43 +48,38 @@ #include "counting_iterator.hpp" // get mdpsan 2d indices from 1d index -#define dim2(x, ms) \ - int ii = x / ms.extent(1); \ - int ij = x % ms.extent(1); +#define dim2(x, ms) \ + int ii = x / ms.extent(1); \ + int ij = x % ms.extent(1); // get mdspan 3d indices from 1d index -#define dim3(x, ms) \ - int ii = x / (ms3.extent(1) * ms.extent(2)); \ - int ij = (x / ms.extent(2)) % ms.extent(1); \ - int ik = x % ms.extent(2) +#define dim3(x, ms) \ + int ii = x / (ms3.extent(1) * ms.extent(2)); \ + int ij = (x / ms.extent(2)) % ms.extent(1); \ + int ik = x % ms.extent(2) class Timer { - public: - Timer() { start(); } + public: + Timer() { start(); } - ~Timer() { stop(); } + ~Timer() { stop(); } - void start() { start_time_point = std::chrono::high_resolution_clock::now(); } + void start() { start_time_point = std::chrono::high_resolution_clock::now(); } - double stop() { - end_time_point = std::chrono::high_resolution_clock::now(); - return duration(); - } + double stop() { + end_time_point = std::chrono::high_resolution_clock::now(); + return duration(); + } - double duration() { - auto start = std::chrono::time_point_cast( - start_time_point) - .time_since_epoch() - .count(); - auto end = - std::chrono::time_point_cast(end_time_point) - .time_since_epoch() - .count(); - auto duration = end - start; - double ms = duration * 0.001; - return ms; - } + double duration() { + auto start = + std::chrono::time_point_cast(start_time_point).time_since_epoch().count(); + auto end = std::chrono::time_point_cast(end_time_point).time_since_epoch().count(); + auto duration = end - start; + double ms = duration * 0.001; + return ms; + } - private: - std::chrono::time_point start_time_point; - std::chrono::time_point end_time_point; + private: + std::chrono::time_point start_time_point; + std::chrono::time_point end_time_point; }; diff --git a/include/counting_iterator.hpp b/include/counting_iterator.hpp index aae6a85..09d0fa2 100644 --- a/include/counting_iterator.hpp +++ b/include/counting_iterator.hpp @@ -36,96 +36,76 @@ using Index_t = int32_t; struct counting_iterator { - private: - using self = counting_iterator; - - public: - using value_type = Index_t; - using difference_type = typename std::make_signed::type; - using pointer = Index_t*; - using reference = Index_t&; - using iterator_category = std::random_access_iterator_tag; - - counting_iterator() : value(0) {} - - explicit counting_iterator(value_type v) : value(v) {} - - value_type operator*() const { return value; } - - value_type operator[](difference_type n) const { return value + n; } - - self& operator++() { - ++value; - return *this; - } - - self operator++(int) { - self result{value}; - ++value; - return result; - } - - self& operator--() { - --value; - return *this; - } - - self operator--(int) { - self result{value}; - --value; - return result; - } - - self& operator+=(difference_type n) { - value += n; - return *this; - } - - self& operator-=(difference_type n) { - value -= n; - return *this; - } - - friend self operator+(self const& i, difference_type n) { - return self(i.value + n); - } - - friend self operator+(difference_type n, self const& i) { - return self(i.value + n); - } - 
- friend difference_type operator-(self const& x, self const& y) { - return x.value - y.value; - } - - friend self operator-(self const& i, difference_type n) { - return self(i.value - n); - } - - friend bool operator==(self const& x, self const& y) { - return x.value == y.value; - } - - friend bool operator!=(self const& x, self const& y) { - return x.value != y.value; - } - - friend bool operator<(self const& x, self const& y) { - return x.value < y.value; - } - - friend bool operator<=(self const& x, self const& y) { - return x.value <= y.value; - } - - friend bool operator>(self const& x, self const& y) { - return x.value > y.value; - } - - friend bool operator>=(self const& x, self const& y) { - return x.value >= y.value; - } - - private: - value_type value; + private: + using self = counting_iterator; + + public: + using value_type = Index_t; + using difference_type = typename std::make_signed::type; + using pointer = Index_t*; + using reference = Index_t&; + using iterator_category = std::random_access_iterator_tag; + + counting_iterator() : value(0) {} + + explicit counting_iterator(value_type v) : value(v) {} + + value_type operator*() const { return value; } + + value_type operator[](difference_type n) const { return value + n; } + + self& operator++() { + ++value; + return *this; + } + + self operator++(int) { + self result{value}; + ++value; + return result; + } + + self& operator--() { + --value; + return *this; + } + + self operator--(int) { + self result{value}; + --value; + return result; + } + + self& operator+=(difference_type n) { + value += n; + return *this; + } + + self& operator-=(difference_type n) { + value -= n; + return *this; + } + + friend self operator+(self const& i, difference_type n) { return self(i.value + n); } + + friend self operator+(difference_type n, self const& i) { return self(i.value + n); } + + friend difference_type operator-(self const& x, self const& y) { return x.value - y.value; } + + friend self operator-(self const& i, difference_type n) { return self(i.value - n); } + + friend bool operator==(self const& x, self const& y) { return x.value == y.value; } + + friend bool operator!=(self const& x, self const& y) { return x.value != y.value; } + + friend bool operator<(self const& x, self const& y) { return x.value < y.value; } + + friend bool operator<=(self const& x, self const& y) { return x.value <= y.value; } + + friend bool operator>(self const& x, self const& y) { return x.value > y.value; } + + friend bool operator>=(self const& x, self const& y) { return x.value >= y.value; } + + private: + value_type value; }; \ No newline at end of file From e01a2b19d71ccfcde22d286afcb64ec83556a2d1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 01:07:26 -0700 Subject: [PATCH 19/20] Revert "clang-format" This reverts commit 2761f772913e40b34f1b23532f4b0c75c57cf16b. 
--- apps/1d_stencil/stencil_cuda.cpp | 175 ++++++----- apps/1d_stencil/stencil_serial.cpp | 190 +++++------ apps/1d_stencil/stencil_snd_gpu_m.cpp | 201 ++++++------ apps/1d_stencil/stencil_snd_gpu_s.cpp | 202 ++++++------ apps/1d_stencil/stencil_stdpar.cpp | 193 ++++++------ apps/1d_stencil/stencil_stdpar_snd.cpp | 254 ++++++++------- apps/1d_stencil/stencil_stdpar_snd_iter.cpp | 213 +++++++------ apps/choleskey/choleskey_serial.cpp | 140 +++++---- apps/choleskey/choleskey_stdpar.cpp | 124 ++++---- apps/choleskey/choleskey_stdpar_snd.cpp | 269 ++++++++-------- apps/choleskey/matrixutil.hpp | 41 +-- apps/comm-study/comm-study-no-senders.cpp | 108 ++++--- apps/comm-study/comm-study.cpp | 163 +++++----- apps/fft/fft-serial.cpp | 47 +-- apps/fft/fft.hpp | 215 +++++++------ apps/heat-equation/heat-equation-cuda.cpp | 294 ++++++++--------- .../heat-equation-gpu-scheduler.cpp | 244 ++++++++------- apps/heat-equation/heat-equation-mdspan.cpp | 209 +++++++------ .../heat-equation-multigpu-scheduler.cpp | 248 ++++++++------- apps/heat-equation/heat-equation-omp.cpp | 182 ++++++----- .../heat-equation-stdpar-senders.cpp | 296 +++++++++--------- apps/heat-equation/heat-equation-stdpar.cpp | 212 +++++++------ apps/heat-equation/heat-equation.hpp | 61 ++-- apps/mdspan-stdpar/mdspan-stdpar.cpp | 79 ++--- include/commons.hpp | 57 ++-- include/counting_iterator.hpp | 164 +++++----- 26 files changed, 2405 insertions(+), 2176 deletions(-) diff --git a/apps/1d_stencil/stencil_cuda.cpp b/apps/1d_stencil/stencil_cuda.cpp index 3436893..2c87bac 100644 --- a/apps/1d_stencil/stencil_cuda.cpp +++ b/apps/1d_stencil/stencil_cuda.cpp @@ -7,16 +7,20 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(false); - std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(false); + std::uint64_t& nx = + kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = + kwarg("no-header", "Do not print csv header row (default: false)") + .set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -28,95 +32,96 @@ constexpr double dx = 1.; // grid spacing // Our operator __device__ double heat(double left, double middle, double right) { - return middle + (k * dt / (dx * dx)) * (left - 2 * 
middle + right); + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); } __global__ void heat_equation(double* current, double* next, std::size_t size) { - std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; + std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) { - std::size_t left = (i == 0) ? size - 1 : i - 1; - std::size_t right = (i == size - 1) ? 0 : i + 1; - next[i] = heat(current[left], current[i], current[right]); - } + if (i < size) { + std::size_t left = (i == 0) ? size - 1 : i - 1; + std::size_t right = (i == size - 1) ? 0 : i + 1; + next[i] = heat(current[left], current[i], current[right]); + } } int benchmark(args_params_t const& args) { - // Parameters (for simplicity, some are hardcoded) - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. - std::size_t size = np * nx; - - double* h_current = nullptr; - double* h_next = nullptr; - - // Measure execution time. - Timer timer; - - // Memory allocation - if (args.results) { - h_current = new double[size]; - h_next = new double[size]; + // Parameters (for simplicity, some are hardcoded) + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + std::size_t size = np * nx; + + double* h_current = nullptr; + double* h_next = nullptr; + + // Measure execution time. + Timer timer; + + // Memory allocation + if (args.results) { + h_current = new double[size]; + h_next = new double[size]; + } + + double* d_current; + double* d_next; + cudaMalloc(&d_current, size * sizeof(double)); + cudaMalloc(&d_next, size * sizeof(double)); + thrust::sequence(thrust::device, d_current, d_current + size, 0); + thrust::sequence(thrust::device, d_next, d_next + size, 0); + + // CUDA kernel execution parameters + const int threadsPerBlock = std::min(1024, (int)size); + const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock; + + // Actual time step loop + for (std::size_t t = 0; t < nt; ++t) { + heat_equation<<>>(d_current, d_next, size); + std::swap(d_current, d_next); + } + cudaDeviceSynchronize(); + auto time = timer.stop(); + + if (args.results) { + // Copy result back to host + cudaMemcpy(h_current, d_current, size * sizeof(double), + cudaMemcpyDeviceToHost); + + // Print results + for (std::size_t i = 0; i < np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j < nx; ++j) { + std::cout << h_current[i * nx + j] << " "; + } + std::cout << "}\n"; } + // Cleanup + delete[] h_current; + delete[] h_next; + } - double* d_current; - double* d_next; - cudaMalloc(&d_current, size * sizeof(double)); - cudaMalloc(&d_next, size * sizeof(double)); - thrust::sequence(thrust::device, d_current, d_current + size, 0); - thrust::sequence(thrust::device, d_next, d_next + size, 0); - - // CUDA kernel execution parameters - const int threadsPerBlock = std::min(1024, (int)size); - const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock; - - // Actual time step loop - for (std::size_t t = 0; t < nt; ++t) { - heat_equation<<>>(d_current, d_next, size); - std::swap(d_current, d_next); - } - cudaDeviceSynchronize(); - auto time = timer.stop(); - - if (args.results) { - // Copy result back to host - cudaMemcpy(h_current, d_current, size * sizeof(double), cudaMemcpyDeviceToHost); - - // Print results - for (std::size_t i = 0; i < np; ++i) { - std::cout << "U[" << i << "] = {"; - for 
(std::size_t j = 0; j < nx; ++j) { - std::cout << h_current[i * nx + j] << " "; - } - std::cout << "}\n"; - } - // Cleanup - delete[] h_current; - delete[] h_next; - } + cudaFree(d_current); + cudaFree(d_next); - cudaFree(d_current); - cudaFree(d_next); - - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_serial.cpp b/apps/1d_stencil/stencil_serial.cpp index f4a7180..fce1d5d 100644 --- a/apps/1d_stencil/stencil_serial.cpp +++ b/apps/1d_stencil/stencil_serial.cpp @@ -32,16 +32,20 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(false); - std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(false); + std::uint64_t& nx = + kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = + kwarg("no-header", "Do not print csv header row (default: false)") + .set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -54,107 +58,107 @@ double dx = 1.; // grid spacing /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; + // Our partition type + typedef double partition; - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; - void init_value(auto& data, std::size_t np, std::size_t nx) { - for (std::size_t i = 0; i != np * nx; ++i) { - data[i] = double(i); - } + void init_value(auto& data, std::size_t np, std::size_t nx) { + for (std::size_t i = 0; i != np * nx; ++i) { + data[i] = double(i); } + } - // Our operator - 
double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, - const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } - - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); + // Our operator + double heat(double left, double middle, double right, const double k = ::k, + const double dt = ::dt, const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + } - return id + dir; + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; } - // do all the work on 'nx' data points for 'nt' time steps - space do_work(std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - auto current = space(current_ptr, size); - auto next = space(next_ptr, size); - - init_value(current, np, nx); - - // Actual time step loop - for (std::size_t t = 0; t != nt; ++t) { - for (std::size_t i = 0; i < np * nx; ++i) { - auto left = idx(i, -1, size); - auto right = idx(i, +1, size); - next[i] = heat(current[left], current[i], current[right], k, dt, dx); - } - std::swap(current, next); - } - - return current; + if (id == size - 1 && dir == +1) { + return (std::size_t)0; } + assert(id < size); + + return id + dir; + } + + // do all the work on 'nx' data points for 'nt' time steps + space do_work(std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + auto current = space(current_ptr, size); + auto next = space(next_ptr, size); + + init_value(current, np, nx); + + // Actual time step loop + for (std::size_t t = 0; t != nt; ++t) { + for (std::size_t i = 0; i < np * nx; ++i) { + auto left = idx(i, -1, size); + auto right = idx(i, +1, size); + next[i] = heat(current[left], current[i], current[right], k, dt, dx); + } + std::swap(current, next); + } + + return current; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. - - // Create the stepper object - stepper step; - - // Measure execution time. - Timer timer; - - // Execute nt time steps on nx grid points. - auto solution = step.do_work(np, nx, nt); - auto time = timer.stop(); - - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; - } + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + + // Create the stepper object + stepper step; + + // Measure execution time. + Timer timer; + + // Execute nt time steps on nx grid points. 
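   // do_work() seeds current[i] = i, then for nt steps applies the explicit update
   // next[i] = current[i] + k*dt/(dx*dx) * (current[i-1] - 2*current[i] + current[i+1]),
   // with idx() wrapping both ends (periodic boundary) and the buffers swapped after each step.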
+ auto solution = step.do_work(np, nx, nt);
+ auto time = timer.stop();
+
+ // Print the final solution
+ if (args.results) {
+ for (std::size_t i = 0; i != np; ++i) {
+ std::cout << "U[" << i << "] = {";
+ for (std::size_t j = 0; j != nx; ++j) {
+ std::cout << solution[i * nx + j] << " ";
+ }
+ std::cout << "}\n";
 }
+ }
- if (args.time) {
- std::cout << "Duration: " << time << " ms."
- << "\n";
- }
+ if (args.time) {
+ std::cout << "Duration: " << time << " ms."
+ << "\n";
+ }
- return 0;
+ return 0;
 }
 int main(int argc, char* argv[]) {
- // parse params
- args_params_t args = argparse::parse(argc, argv);
- // see if help wanted
- if (args.help) {
- args.print(); // prints all variables
- return 0;
- }
+ // parse params
+ args_params_t args = argparse::parse(argc, argv);
+ // see if help wanted
+ if (args.help) {
+ args.print(); // prints all variables
+ return 0;
+ }
- benchmark(args);
+ benchmark(args);
- return 0;
+ return 0;
 }
diff --git a/apps/1d_stencil/stencil_snd_gpu_m.cpp b/apps/1d_stencil/stencil_snd_gpu_m.cpp
index 0c385d9..83d3e16 100644
--- a/apps/1d_stencil/stencil_snd_gpu_m.cpp
+++ b/apps/1d_stencil/stencil_snd_gpu_m.cpp
@@ -40,16 +40,20 @@
 // parameters
 struct args_params_t : public argparse::Args {
- bool& results = kwarg("results", "print generated results (default: false)").set_default(false);
- std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10);
- std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
- std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10);
- bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);
- double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
- double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
- bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false);
- bool& help = flag("h, help", "print help");
- bool& time = kwarg("t, time", "print time").set_default(true);
+ bool& results = kwarg("results", "print generated results (default: false)")
+ .set_default(false);
+ std::uint64_t& nx =
+ kwarg("nx", "Local x dimension (of each partition)").set_default(10);
+ std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
+ std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10);
+ bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);
+ double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
+ double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
+ bool& no_header =
+ kwarg("no-header", "Do not print csv header row (default: false)")
+ .set_default(false);
+ bool& help = flag("h, help", "print help");
+ bool& time = kwarg("t, time", "print time").set_default(true);
 };
 ///////////////////////////////////////////////////////////////////////////////
@@ -60,118 +64,125 @@ double dt = 1.; // time step
 double dx = 1.; // grid spacing
 template
-using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>;
+using any_sender_of = typename exec::any_receiver_ref<
+ stdexec::completion_signatures>::template any_sender<>;
 ///////////////////////////////////////////////////////////////////////////////
 //[stepper_1
 struct stepper {
- // Our partition type
- typedef double partition;
+ // Our partition type
+ typedef double partition;
- // Our data for one time step
- typedef thrust::device_vector space;
+ // Our data for one time step
+ typedef thrust::device_vector space;
- // Our operator
- double
heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, - const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } + // Our operator + double heat(double left, double middle, double right, const double k = ::k, + const double dt = ::dt, const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + } - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } - return id + dir; + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; + } + + // do all the work on 'nx' data points for 'nt' time steps + space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, + std::size_t nt) { + std::size_t size = np * nx; + thrust::device_vector current_vec(size); + thrust::device_vector next_vec(size); + + auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); + auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); + + stdexec::sender auto init = + stdexec::transfer_just(sch, current_ptr, nx) | + stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { + current_ptr[i] = (double)i; + }); + stdexec::sync_wait(std::move(init)); + + for (std::size_t t = 0; t != nt; ++t) { + auto sender = stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, + dx, np, nx) | + stdexec::bulk(np * nx, [&](int i, auto current_ptr, + auto next_ptr, auto k, auto dt, + auto dx, auto np, auto nx) { + auto left = idx(i, -1, np * nx); + auto right = idx(i, +1, np * nx); + next_ptr[i] = heat(current_ptr[left], current_ptr[i], + current_ptr[right], k, dt, dx); + }); + stdexec::sync_wait(std::move(sender)); + std::swap(current_ptr, next_ptr); } - // do all the work on 'nx' data points for 'nt' time steps - space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - thrust::device_vector current_vec(size); - thrust::device_vector next_vec(size); - - auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); - auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); - - stdexec::sender auto init = - stdexec::transfer_just(sch, current_ptr, nx) | - stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { current_ptr[i] = (double)i; }); - stdexec::sync_wait(std::move(init)); - - for (std::size_t t = 0; t != nt; ++t) { - auto sender = stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, dx, np, nx) | - stdexec::bulk(np * nx, [&](int i, auto current_ptr, auto next_ptr, auto k, auto dt, auto dx, - auto np, auto nx) { - auto left = idx(i, -1, np * nx); - auto right = idx(i, +1, np * nx); - next_ptr[i] = heat(current_ptr[left], current_ptr[i], current_ptr[right], k, dt, dx); - }); - stdexec::sync_wait(std::move(sender)); - std::swap(current_ptr, next_ptr); - } - - if (nt % 2 == 0) { - return current_vec; - } - return next_vec; + if (nt % 2 == 0) { + return current_vec; } + return next_vec; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. 
+ std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. - // Create the stepper object - stepper step; + // Create the stepper object + stepper step; - nvexec::multi_gpu_stream_context stream_context{}; - stdexec::scheduler auto sch = stream_context.get_scheduler(); + nvexec::multi_gpu_stream_context stream_context{}; + stdexec::scheduler auto sch = stream_context.get_scheduler(); - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - // Execute nt time steps on nx grid points. - stepper::space solution = step.do_work(sch, np, nx, nt); + // Execute nt time steps on nx grid points. + stepper::space solution = step.do_work(sch, np, nx, nt); - auto time = timer.stop(); + auto time = timer.stop(); - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; - } + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_snd_gpu_s.cpp b/apps/1d_stencil/stencil_snd_gpu_s.cpp index 8144c52..58fc06c 100644 --- a/apps/1d_stencil/stencil_snd_gpu_s.cpp +++ b/apps/1d_stencil/stencil_snd_gpu_s.cpp @@ -40,16 +40,20 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(false); - std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(false); + std::uint64_t& nx = + kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", 
"Local x dimension").set_default(1.0); + bool& no_header = + kwarg("no-header", "Do not print csv header row (default: false)") + .set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -60,118 +64,126 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref< + stdexec::completion_signatures>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; + // Our partition type + typedef double partition; - // Our data for one time step - typedef thrust::device_vector space; + // Our data for one time step + typedef thrust::device_vector space; - // Our operator - double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, - const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); - } - - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } + // Our operator + double heat(double left, double middle, double right, const double k = ::k, + const double dt = ::dt, const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + } - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; + } - return id + dir; + if (id == size - 1 && dir == +1) { + return (std::size_t)0; + } + assert(id < size); + + return id + dir; + } + + // do all the work on 'nx' data points for 'nt' time steps + space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, + std::size_t nt) { + std::size_t size = np * nx; + thrust::device_vector current_vec(size); + thrust::device_vector next_vec(size); + + auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); + auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); + + stdexec::sender auto init = + stdexec::transfer_just(sch, current_ptr, nx) | + stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { + current_ptr[i] = (double)i; + }); + stdexec::sync_wait(std::move(init)); + + for (std::size_t t = 0; t != nt; ++t) { + auto sender = + stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, dx, np, nx, + size) | + stdexec::bulk(np * nx, + [&](int i, auto& current_ptr, auto& next_ptr, auto k, + auto dt, auto dx, auto np, auto nx, auto size) { + std::size_t left = (i == 0) ? size - 1 : i - 1; + std::size_t right = (i == size - 1) ? 
0 : i + 1; + next_ptr[i] = heat(current_ptr[left], current_ptr[i], + current_ptr[right], k, dt, dx); + }); + stdexec::sync_wait(std::move(sender)); + std::swap(current_ptr, next_ptr); } - // do all the work on 'nx' data points for 'nt' time steps - space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - thrust::device_vector current_vec(size); - thrust::device_vector next_vec(size); - - auto current_ptr = thrust::raw_pointer_cast(current_vec.data()); - auto next_ptr = thrust::raw_pointer_cast(next_vec.data()); - - stdexec::sender auto init = - stdexec::transfer_just(sch, current_ptr, nx) | - stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto nx) { current_ptr[i] = (double)i; }); - stdexec::sync_wait(std::move(init)); - - for (std::size_t t = 0; t != nt; ++t) { - auto sender = stdexec::transfer_just(sch, current_ptr, next_ptr, k, dt, dx, np, nx, size) | - stdexec::bulk(np * nx, [&](int i, auto& current_ptr, auto& next_ptr, auto k, auto dt, auto dx, - auto np, auto nx, auto size) { - std::size_t left = (i == 0) ? size - 1 : i - 1; - std::size_t right = (i == size - 1) ? 0 : i + 1; - next_ptr[i] = heat(current_ptr[left], current_ptr[i], current_ptr[right], k, dt, dx); - }); - stdexec::sync_wait(std::move(sender)); - std::swap(current_ptr, next_ptr); - } - - if (nt % 2 == 0) { - return current_vec; - } - return next_vec; + if (nt % 2 == 0) { + return current_vec; } + return next_vec; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. - // Create the stepper object - stepper step; + // Create the stepper object + stepper step; - nvexec::stream_context stream_ctx{}; - stdexec::scheduler auto sch = stream_ctx.get_scheduler(); + nvexec::stream_context stream_ctx{}; + stdexec::scheduler auto sch = stream_ctx.get_scheduler(); - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - // Execute nt time steps on nx grid points. - stepper::space solution = step.do_work(sch, np, nx, nt); + // Execute nt time steps on nx grid points. + stepper::space solution = step.do_work(sch, np, nx, nt); - auto time = timer.stop(); + auto time = timer.stop(); - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; - } + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_stdpar.cpp b/apps/1d_stencil/stencil_stdpar.cpp index e424620..c1e780c 100644 --- a/apps/1d_stencil/stencil_stdpar.cpp +++ b/apps/1d_stencil/stencil_stdpar.cpp @@ -34,16 +34,20 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(false); - std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(false); + std::uint64_t& nx = + kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = + kwarg("no-header", "Do not print csv header row (default: false)") + .set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -56,104 +60,105 @@ double dx = 1.; // grid spacing /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; - - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; - - // Our operator - double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, - const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + // Our partition type + typedef double partition; + + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; + + // Our operator + double heat(double left, double middle, double right, const double k = ::k, + const double dt = ::dt, const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + } + + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; } - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } - - if 
(id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); - - return id + dir; + if (id == size - 1 && dir == +1) { + return (std::size_t)0; } - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - - auto current = space(current_ptr, size); - auto next = space(next_ptr, size); - // parallel init - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=](std::size_t i) { current_ptr[i] = (double)i; }); - - // Actual time step loop - for (std::size_t t = 0; t != nt; ++t) { - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=, k = k, dt = dt, dx = dx](int32_t i) { - auto left = idx(i, -1, size); - auto right = idx(i, +1, size); - next[i] = heat(current[left], current[i], current[right], k, dt, dx); - }); - std::swap(current, next); - } - - return current; + assert(id < size); + + return id + dir; + } + + // do all the work on 'nx' data points for 'nt' time steps + space do_work(std::size_t np, std::size_t nx, std::size_t nt) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + + auto current = space(current_ptr, size); + auto next = space(next_ptr, size); + // parallel init + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=](std::size_t i) { current_ptr[i] = (double)i; }); + + // Actual time step loop + for (std::size_t t = 0; t != nt; ++t) { + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=, k = k, dt = dt, dx = dx](int32_t i) { + auto left = idx(i, -1, size); + auto right = idx(i, +1, size); + next[i] = heat(current[left], current[i], + current[right], k, dt, dx); + }); + std::swap(current, next); } + + return current; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. - - // Create the stepper object - stepper step; - - // Measure execution time. - Timer timer; - - // Execute nt time steps on nx grid points. - auto solution = step.do_work(np, nx, nt); - auto time = timer.stop(); - - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; - } + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + + // Create the stepper object + stepper step; + + // Measure execution time. + Timer timer; + + // Execute nt time steps on nx grid points. + auto solution = step.do_work(np, nx, nt); + auto time = timer.stop(); + + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_stdpar_snd.cpp b/apps/1d_stencil/stencil_stdpar_snd.cpp index 6a08a63..30cfca8 100644 --- a/apps/1d_stencil/stencil_stdpar_snd.cpp +++ b/apps/1d_stencil/stencil_stdpar_snd.cpp @@ -37,16 +37,20 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(false); - std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(false); + std::uint64_t& nx = + kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = + kwarg("no-header", "Do not print csv header row (default: false)") + .set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -57,134 +61,144 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref< + stdexec::completion_signatures>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; - - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; - - using any_space_sender = - any_sender_of; - - // Our operator - double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, - const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + // Our partition type + typedef double partition; + + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; + + using any_space_sender = + any_sender_of; + + // Our operator + double heat(double left, double middle, double right, const double k = ::k, + const double dt = ::dt, const double 
dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + } + + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; } - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } - - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); - - return id + dir; + if (id == size - 1 && dir == +1) { + return (std::size_t)0; } - - partition* current_ptr = nullptr; - partition* next_ptr = nullptr; - space current; - space next; - - // do all the work on 'nx' data points for 'nt' time steps - auto do_work(std::size_t np, std::size_t nx, std::size_t nt) -> any_space_sender { - if (nt == 0) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - current = space(current_ptr, size); - next = space(next_ptr, size); - - // parallel init - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=](std::size_t i) { current_ptr[i] = (double)i; }); - - return stdexec::just(current); - } - - return stdexec::just(nt - 1) | - stdexec::let_value([=](std::size_t nt_updated) { return do_work(np, nx, nt_updated); }) | - stdexec::bulk(np, - [&, k = k, dt = dt, dx = dx, nx = nx, np = np](std::size_t i, auto const& current) { - std::for_each_n( - std::execution::par, counting_iterator(0), nx, [=, next = next](std::size_t j) { - std::size_t id = i * nx + j; - auto left = idx(id, -1, np * nx); - auto right = idx(id, +1, np * nx); - next[id] = heat(current[left], current[id], current[right], k, dt, dx); - }); - }) | - stdexec::then([&](auto current) { - // TODO: return next? - std::swap(current, next); - return current; - }); + assert(id < size); + + return id + dir; + } + + partition* current_ptr = nullptr; + partition* next_ptr = nullptr; + space current; + space next; + + // do all the work on 'nx' data points for 'nt' time steps + auto do_work(std::size_t np, std::size_t nx, std::size_t nt) + -> any_space_sender { + if (nt == 0) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + current = space(current_ptr, size); + next = space(next_ptr, size); + + // parallel init + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=](std::size_t i) { current_ptr[i] = (double)i; }); + + return stdexec::just(current); } + + return stdexec::just(nt - 1) | + stdexec::let_value([=](std::size_t nt_updated) { + return do_work(np, nx, nt_updated); + }) | + stdexec::bulk(np, + [&, k = k, dt = dt, dx = dx, nx = nx, np = np]( + std::size_t i, auto const& current) { + std::for_each_n( + std::execution::par, counting_iterator(0), nx, + [=, next = next](std::size_t j) { + std::size_t id = i * nx + j; + auto left = idx(id, -1, np * nx); + auto right = idx(id, +1, np * nx); + next[id] = heat(current[left], current[id], + current[right], k, dt, dx); + }); + }) | + stdexec::then([&](auto current) { + // TODO: return next? + std::swap(current, next); + return current; + }); + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. 
- - // Create the stepper object - stepper step; - - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); - stdexec::sender auto begin = stdexec::schedule(sch); - - // Measure execution time. - Timer timer; - - // Execute nt time steps on nx grid points. - stdexec::sender auto sender = begin | stdexec::then([=]() { return nt; }) | - stdexec::let_value([=, &step](std::uint64_t nt) { return step.do_work(np, nx, nt); }); - - auto [solution] = stdexec::sync_wait(std::move(sender)).value(); - - auto time = timer.stop(); - - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; - } + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. + + // Create the stepper object + stepper step; + + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + // Measure execution time. + Timer timer; + + // Execute nt time steps on nx grid points. + stdexec::sender auto sender = + begin | stdexec::then([=]() { return nt; }) | + stdexec::let_value( + [=, &step](std::uint64_t nt) { return step.do_work(np, nx, nt); }); + + auto [solution] = stdexec::sync_wait(std::move(sender)).value(); + + auto time = timer.stop(); + + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/1d_stencil/stencil_stdpar_snd_iter.cpp b/apps/1d_stencil/stencil_stdpar_snd_iter.cpp index 0cbffc9..0c1280a 100644 --- a/apps/1d_stencil/stencil_stdpar_snd_iter.cpp +++ b/apps/1d_stencil/stencil_stdpar_snd_iter.cpp @@ -37,16 +37,20 @@ // parameters struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(false); - std::uint64_t& nx = kwarg("nx", "Local x dimension (of each partition)").set_default(10); - std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); - bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); - double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); - double& dx = kwarg("dx", "Local x dimension").set_default(1.0); - bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(false); + std::uint64_t& nx = + kwarg("nx", "Local x dimension (of each partition)").set_default(10); + std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(10); + bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); + double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); + double& dx = kwarg("dx", "Local x dimension").set_default(1.0); + bool& no_header = + kwarg("no-header", "Do not print csv header row (default: false)") + .set_default(false); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; /////////////////////////////////////////////////////////////////////////////// @@ -57,122 +61,127 @@ double dt = 1.; // time step double dx = 1.; // grid spacing template -using any_sender_of = typename exec::any_receiver_ref>::template any_sender<>; +using any_sender_of = typename exec::any_receiver_ref< + stdexec::completion_signatures>::template any_sender<>; /////////////////////////////////////////////////////////////////////////////// //[stepper_1 struct stepper { - // Our partition type - typedef double partition; - - // Our data for one time step - using view_1d = std::extents; - typedef std::mdspan space; - - // Our operator - double heat(double left, double middle, double right, const double k = ::k, const double dt = ::dt, - const double dx = ::dx) { - return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); + // Our partition type + typedef double partition; + + // Our data for one time step + using view_1d = std::extents; + typedef std::mdspan space; + + // Our operator + double heat(double left, double middle, double right, const double k = ::k, + const double dt = ::dt, const double dx = ::dx) { + return middle + (k * dt / (dx * dx)) * (left - 2 * middle 
+ right); + } + + inline std::size_t idx(std::size_t id, int dir, std::size_t size) { + if (id == 0 && dir == -1) { + return size - 1; } - inline std::size_t idx(std::size_t id, int dir, std::size_t size) { - if (id == 0 && dir == -1) { - return size - 1; - } - - if (id == size - 1 && dir == +1) { - return (std::size_t)0; - } - assert(id < size); - - return id + dir; + if (id == size - 1 && dir == +1) { + return (std::size_t)0; } - - partition* current_ptr = nullptr; - partition* next_ptr = nullptr; - space current; - space next; - - // do all the work on 'nx' data points for 'nt' time steps - space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, std::size_t nt) { - std::size_t size = np * nx; - partition* current_ptr = new partition[size]; - partition* next_ptr = new partition[size]; - - auto current = space(current_ptr, size); - auto next = space(next_ptr, size); - // parallel init - std::for_each_n(std::execution::par, counting_iterator(0), np * nx, - [=](std::size_t i) { current(i) = (double)i; }); - - // Actual time step loop - for (std::size_t t = 0; t != nt; ++t) { - auto sender = - stdexec::transfer_just(sch, current, next, k, dt, dx, np, nx) | - stdexec::bulk(np, [&](int i, auto& current, auto& next, auto k, auto dt, auto dx, auto np, auto nx) { - std::for_each_n(std::execution::par, counting_iterator(0), nx, [=](std::size_t j) { - std::size_t id = i * nx + j; - auto left = idx(id, -1, np * nx); - auto right = idx(id, +1, np * nx); - next(id) = heat(current(left), current(id), current(right), k, dt, dx); - }); - }); - stdexec::sync_wait(std::move(sender)); - std::swap(current, next); - } - - return current; + assert(id < size); + + return id + dir; + } + + partition* current_ptr = nullptr; + partition* next_ptr = nullptr; + space current; + space next; + + // do all the work on 'nx' data points for 'nt' time steps + space do_work(stdexec::scheduler auto& sch, std::size_t np, std::size_t nx, + std::size_t nt) { + std::size_t size = np * nx; + partition* current_ptr = new partition[size]; + partition* next_ptr = new partition[size]; + + auto current = space(current_ptr, size); + auto next = space(next_ptr, size); + // parallel init + std::for_each_n(std::execution::par, counting_iterator(0), np * nx, + [=](std::size_t i) { current(i) = (double)i; }); + + // Actual time step loop + for (std::size_t t = 0; t != nt; ++t) { + auto sender = + stdexec::transfer_just(sch, current, next, k, dt, dx, np, nx) | + stdexec::bulk(np, [&](int i, auto& current, auto& next, auto k, + auto dt, auto dx, auto np, auto nx) { + std::for_each_n(std::execution::par, counting_iterator(0), nx, + [=](std::size_t j) { + std::size_t id = i * nx + j; + auto left = idx(id, -1, np * nx); + auto right = idx(id, +1, np * nx); + next(id) = heat(current(left), current(id), + current(right), k, dt, dx); + }); + }); + stdexec::sync_wait(std::move(sender)); + std::swap(current, next); } + + return current; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t np = args.np; // Number of partitions. - std::uint64_t nx = args.nx; // Number of grid points. - std::uint64_t nt = args.nt; // Number of steps. + std::uint64_t np = args.np; // Number of partitions. + std::uint64_t nx = args.nx; // Number of grid points. + std::uint64_t nt = args.nt; // Number of steps. 
- // Create the stepper object - stepper step; + // Create the stepper object + stepper step; - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - stepper::space solution = step.do_work(sch, np, nx, nt); + stepper::space solution = step.do_work(sch, np, nx, nt); - auto time = timer.stop(); + auto time = timer.stop(); - // Print the final solution - if (args.results) { - for (std::size_t i = 0; i != np; ++i) { - std::cout << "U[" << i << "] = {"; - for (std::size_t j = 0; j != nx; ++j) { - std::cout << solution[i * nx + j] << " "; - } - std::cout << "}\n"; - } + // Print the final solution + if (args.results) { + for (std::size_t i = 0; i != np; ++i) { + std::cout << "U[" << i << "] = {"; + for (std::size_t j = 0; j != nx; ++j) { + std::cout << solution[i * nx + j] << " "; + } + std::cout << "}\n"; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/choleskey_serial.cpp b/apps/choleskey/choleskey_serial.cpp index 88d6824..5c82498 100644 --- a/apps/choleskey/choleskey_serial.cpp +++ b/apps/choleskey/choleskey_serial.cpp @@ -39,90 +39,92 @@ using namespace std; struct solver { - using view_2d = std::extents; - - typedef std::mdspan matrix_ms_t; - - template - matrix_ms_t Cholesky_Decomposition(std::vector& vec, int n) { - std::vector lower(n * n, 0); - - auto matrix_ms = std::mdspan(vec.data(), n, n); - auto lower_ms = std::mdspan(lower.data(), n, n); - - // Decomposing a matrix into Lower Triangular - for (int i = 0; i < matrix_ms.extent(0); i++) { - for (int j = 0; j <= i; j++) { - T sum = 0; - - if (j == i) { - // summation for diagonals - for (int k = 0; k < j; k++) - sum += pow(lower_ms(j, k), 2); - lower_ms(j, j) = sqrt(matrix_ms(i, j) - sum); - } else { - // Evaluating L(i, j) using L(j, j) - for (int k = 0; k < j; k++) - sum += (lower_ms(i, k) * lower_ms(j, k)); - lower_ms(i, j) = (matrix_ms(i, j) - sum) / lower_ms(j, j); - } - } + using view_2d = std::extents; + + typedef std::mdspan matrix_ms_t; + + template + matrix_ms_t Cholesky_Decomposition(std::vector& vec, int n) { + std::vector lower(n * n, 0); + + auto matrix_ms = + std::mdspan(vec.data(), n, n); + auto lower_ms = + std::mdspan(lower.data(), n, n); + + // Decomposing a matrix into Lower Triangular + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; + + if (j == i) { + // summation for diagonals + for (int k = 0; k < j; k++) + sum += pow(lower_ms(j, k), 2); + lower_ms(j, j) = sqrt(matrix_ms(i, j) - sum); + } else { + // Evaluating L(i, j) using L(j, j) + for (int k = 0; k < j; k++) + sum += (lower_ms(i, k) * lower_ms(j, k)); + lower_ms(i, j) = (matrix_ms(i, j) - sum) / lower_ms(j, j); } - return lower_ms; + } } + return lower_ms; + } }; 
/////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t nd = args.nd; // Number of matrix dimension. - - std::vector inputMatrix = generate_pascal_matrix(nd); - - // Create the solverobject - solver solve; - // Measure execution time. - Timer timer; - // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); - - // Print the final results - if (args.results) { - // Displaying Lower Triangular and its Transpose - cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; - for (int i = 0; i < nd; i++) { - // Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix(i, j) << "\t"; - cout << "\t"; - - // Transpose of Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix(j, i) << "\t"; - cout << endl; - } + std::uint64_t nd = args.nd; // Number of matrix dimension. + + std::vector inputMatrix = generate_pascal_matrix(nd); + + // Create the solverobject + solver solve; + // Measure execution time. + Timer timer; + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix(i, j) << "\t"; + cout << "\t"; + + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix(j, i) << "\t"; + cout << endl; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } // Driver Code for testing int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/choleskey_stdpar.cpp b/apps/choleskey/choleskey_stdpar.cpp index 33c6b87..2b19b7d 100644 --- a/apps/choleskey/choleskey_stdpar.cpp +++ b/apps/choleskey/choleskey_stdpar.cpp @@ -44,95 +44,99 @@ using namespace std; struct solver { - using view_2d = std::extents; + using view_2d = std::extents; - template - std::vector> Cholesky_Decomposition(std::vector& vec, int n) { - std::vector> lower(n, std::vector(n, 0)); + template + std::vector> Cholesky_Decomposition(std::vector& vec, + int n) { + std::vector> lower(n, std::vector(n, 0)); - auto matrix_ms = std::mdspan(vec.data(), n, n); + auto matrix_ms = + std::mdspan(vec.data(), n, n); - auto multiplier_lambda = [=](auto a, auto b) { - return a * b; - }; + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; - // Decomposing a matrix into Lower Triangular - for (int i = 0; i < matrix_ms.extent(0); i++) { - for (int j = 0; j <= i; j++) { - T sum = 0; + // Decomposing a matrix into Lower Triangular + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + T sum = 0; - if (j == i) // summation for diagonals - { - sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, 0, - std::plus{}, [=](int val) { return val * val; }); + if (j == i) // summation for diagonals + { + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), + lower[j].cbegin() + j, 0, std::plus{}, + [=](int val) { return val * val; }); - lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); - } else { // Evaluating L(i, j) using L(j, j) + } else { // Evaluating L(i, j) using L(j, j) - sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, - lower[i].cbegin(), 0, std::plus<>(), multiplier_lambda); + sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), + lower[j].cbegin() + j, lower[i].cbegin(), + 0, std::plus<>(), multiplier_lambda); - lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; - } - } + lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; } - return lower; + } } + return lower; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t nd = args.nd; // Number of matrix dimension. + std::uint64_t nd = args.nd; // Number of matrix dimension. - std::vector inputMatrix = generate_pascal_matrix(nd); + std::vector inputMatrix = generate_pascal_matrix(nd); - // Create the solver object - solver solve; - // Measure execution time. - Timer timer; + // Create the solver object + solver solve; + // Measure execution time. 
+ Timer timer; - // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); - // Print the final results - if (args.results) { - // Displaying Lower Triangular and its Transpose - cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; - for (int i = 0; i < nd; i++) { - // Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[i][j] << "\t"; - cout << "\t"; + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; - // Transpose of Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[j][i] << "\t"; - cout << endl; - } + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } - return 0; + return 0; } // Driver Code for testing int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp index 0a02682..4fa7d79 100644 --- a/apps/choleskey/choleskey_stdpar_snd.cpp +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -45,157 +45,166 @@ using namespace std; struct solver { - using view_2d = std::extents; - - template - std::vector> Cholesky_Decomposition(std::vector& vec, int n, int np) { - - // test here first, scheduler from a thread pool - exec::static_thread_pool pool(np); - stdexec::scheduler auto sch = pool.get_scheduler(); - stdexec::sender auto begin = stdexec::schedule(sch); - - std::vector> lower(n, std::vector(n, 0)); - - auto matrix_ms = std::mdspan(vec.data(), n, n); - - auto multiplier_lambda = [=](auto a, auto b) { - return a * b; - }; - - for (int i = 0; i < matrix_ms.extent(0); i++) { - for (int j = 0; j <= i; j++) { - // avoid over parallelize - if (j == 0) { - np = 1; - } else if (j > 0 && np > j) { - np = j; - } - - if (j == i) // summation for diagonals - { - - if (i == 0 && j == 0) { - lower[j][j] = std::sqrt(matrix_ms(i, j)); - } else { - - std::vector sum_vec(np); // sub res for each piece - int size = j; // there are j elements need to be calculated(power) - - stdexec::sender auto send1 = - stdexec::bulk(begin, np, - [&](int piece) { - int start = piece * size / np; - int chunk_size = size / np; - int remaining = size % np; - chunk_size += (piece == np - 1) ? 
remaining : 0; - - sum_vec[piece] = std::transform_reduce( - std::execution::par, counting_iterator(start), - counting_iterator(start + chunk_size), 0, std ::plus{}, - [=](int val) { return lower[j][val] * lower[j][val]; }); - }) | - stdexec::then([&sum_vec]() { - return std::reduce(std::execution::par, sum_vec.begin(), sum_vec.end()); - }); - - auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); - - lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); - } - - } else { - // Evaluating L(i, j) using L(j, j) - - if (j == 0) { - lower[i][j] = (matrix_ms(i, j)) / lower[j][j]; - } else { - - std::vector sum_vec(np); // sub res for each piece - int size_nondiag = j; - - stdexec::sender auto send2 = - stdexec::bulk(begin, np, - [&](int piece) { - int start = piece * size_nondiag / np; - int chunk_size = size_nondiag / np; - int remaining = size_nondiag % np; - chunk_size += (piece == np - 1) ? remaining : 0; - - sum_vec[piece] = std::transform_reduce( - std::execution::par, counting_iterator(start), - counting_iterator(start + chunk_size), 0, std ::plus{}, - [=](int k) { return lower[j][k] * lower[i][k]; }); - }) | - stdexec::then([&sum_vec]() { - return std::reduce(std::execution::par, sum_vec.begin(), sum_vec.end()); - }); - - auto [sum2] = stdexec::sync_wait(std::move(send2)).value(); - - lower[i][j] = (matrix_ms(i, j) - sum2) / lower[j][j]; - } - } - } + using view_2d = std::extents; + + template + std::vector> Cholesky_Decomposition(std::vector& vec, int n, + int np) { + + // test here first, scheduler from a thread pool + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + std::vector> lower(n, std::vector(n, 0)); + + auto matrix_ms = + std::mdspan(vec.data(), n, n); + + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; + + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + // avoid over parallelize + if (j == 0) { + np = 1; + } else if (j > 0 && np > j) { + np = j; + } + + if (j == i) // summation for diagonals + { + + if (i == 0 && j == 0) { + lower[j][j] = std::sqrt(matrix_ms(i, j)); + } else { + + std::vector sum_vec(np); // sub res for each piece + int size = j; // there are j elements need to be calculated(power) + + stdexec::sender auto send1 = + stdexec::bulk(begin, np, + [&](int piece) { + int start = piece * size / np; + int chunk_size = size / np; + int remaining = size % np; + chunk_size += (piece == np - 1) ? remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, + counting_iterator(start), + counting_iterator(start + chunk_size), 0, + std ::plus{}, [=](int val) { + return lower[j][val] * lower[j][val]; + }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), + sum_vec.end()); + }); + + auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); + + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); + } + + } else { + // Evaluating L(i, j) using L(j, j) + + if (j == 0) { + lower[i][j] = (matrix_ms(i, j)) / lower[j][j]; + } else { + + std::vector sum_vec(np); // sub res for each piece + int size_nondiag = j; + + stdexec::sender auto send2 = + stdexec::bulk( + begin, np, + [&](int piece) { + int start = piece * size_nondiag / np; + int chunk_size = size_nondiag / np; + int remaining = size_nondiag % np; + chunk_size += (piece == np - 1) ? 
remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, counting_iterator(start), + counting_iterator(start + chunk_size), 0, + std ::plus{}, + [=](int k) { return lower[j][k] * lower[i][k]; }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), + sum_vec.end()); + }); + + auto [sum2] = stdexec::sync_wait(std::move(send2)).value(); + + lower[i][j] = (matrix_ms(i, j) - sum2) / lower[j][j]; + } } - return lower; + } } + return lower; + } }; /////////////////////////////////////////////////////////////////////////////// int benchmark(args_params_t const& args) { - std::uint64_t nd = args.nd; // Number of matrix dimension. - std::uint64_t np = args.np; // Number of parallel partitions. + std::uint64_t nd = args.nd; // Number of matrix dimension. + std::uint64_t np = args.np; // Number of parallel partitions. - std::vector inputMatrix = generate_pascal_matrix(nd); + std::vector inputMatrix = generate_pascal_matrix(nd); - // Create the solver object - solver solve; + // Create the solver object + solver solve; - // Measure execution time. - Timer timer; + // Measure execution time. + Timer timer; - // start decomposation - auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np); + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np); - // Print the final results - if (args.results) { - // Displaying Lower Triangular and its Transpose - cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; - for (int i = 0; i < nd; i++) { - // Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[i][j] << "\t"; - cout << "\t"; + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; - // Transpose of Lower Triangular - for (int j = 0; j < nd; j++) - cout << setw(6) << res_matrix[j][i] << "\t"; - cout << endl; - } + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; } + } - if (args.time) { - std::cout << "Duration: " << time << " ms." - << "\n"; - } + if (args.time) { + std::cout << "Duration: " << time << " ms." 
+ << "\n"; + } - return 0; + return 0; } // Driver Code for testing int main(int argc, char* argv[]) { - // parse params - args_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - benchmark(args); + benchmark(args); - return 0; + return 0; } diff --git a/apps/choleskey/matrixutil.hpp b/apps/choleskey/matrixutil.hpp index 8b08fb1..44f0468 100644 --- a/apps/choleskey/matrixutil.hpp +++ b/apps/choleskey/matrixutil.hpp @@ -9,30 +9,33 @@ using Matrix = std::vector>; template std::vector generate_pascal_matrix(const int n) { - Matrix matrix(n, std::vector(n, static_cast(0))); + Matrix matrix(n, std::vector(n, static_cast(0))); - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - if (i == 0 || j == 0) { - matrix[i][j] = static_cast(1); - } else { - matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; - } - } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + if (i == 0 || j == 0) { + matrix[i][j] = static_cast(1); + } else { + matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; + } } + } - std::vector flattenedVector; - for (const auto& row : matrix) { - flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); - } - return std::move(flattenedVector); + std::vector flattenedVector; + for (const auto& row : matrix) { + flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); + } + return std::move(flattenedVector); } // parameters define struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)").set_default(true); - std::uint64_t& nd = kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)").set_default(10); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + bool& results = kwarg("results", "print generated results (default: false)") + .set_default(true); + std::uint64_t& nd = + kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)") + .set_default(10); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); }; diff --git a/apps/comm-study/comm-study-no-senders.cpp b/apps/comm-study/comm-study-no-senders.cpp index 87fa74b..1550094 100644 --- a/apps/comm-study/comm-study-no-senders.cpp +++ b/apps/comm-study/comm-study-no-senders.cpp @@ -37,79 +37,87 @@ using time_point_t = std::chrono::system_clock::time_point; // must take in the pointers/vectors by reference template auto work(P& A, P& B, P& Y, int N) { - // init A and B separately - will it cause an H2D copy? - std::for_each(std::execution::par_unseq, &A[0], &A[N], [&](T& ai) { ai = cos(M_PI / 4); }); + // init A and B separately - will it cause an H2D copy? + std::for_each(std::execution::par_unseq, &A[0], &A[N], + [&](T& ai) { ai = cos(M_PI / 4); }); - T sum = 0.0; + T sum = 0.0; - for (int i = 0; i < N / 3; i++) { - // read only or read-write operations - sum += A[i] / N; + for (int i = 0; i < N / 3; i++) { + // read only or read-write operations + sum += A[i] / N; - // this line if commented should not result in an H2D after this but it - // does. 
- // A[i] = sin(M_PI/4); - } + // this line if commented should not result in an H2D after this but it + // does. + // A[i] = sin(M_PI/4); + } - std::cout << std::endl; + std::cout << std::endl; - // will it cause an H2D here? - std::for_each(std::execution::par_unseq, &B[0], &B[N], [&](T& bi) { bi = sin(M_PI / 6); }); + // will it cause an H2D here? + std::for_each(std::execution::par_unseq, &B[0], &B[N], + [&](T& bi) { bi = sin(M_PI / 6); }); - // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) + // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) - std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &A[N / 2], - [&](T& ai, T& bi) { return ai + bi; }); - std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &Y[0], - [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); + std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &A[N / 2], + [&](T& ai, T& bi) { return ai + bi; }); + std::transform( + std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &Y[0], + [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); - // should trigger a D2H copy of N/5 elements - for (int i = 0; i < N / 3; i++) - sum += Y[i] / N; + // should trigger a D2H copy of N/5 elements + for (int i = 0; i < N / 3; i++) + sum += Y[i] / N; - std::cout << std::endl; + std::cout << std::endl; - // get sum(Y) - one last memcpy (not USM) D2H - sum += std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), - [](T& val) { return val * val; }); + // get sum(Y) - one last memcpy (not USM) D2H + sum += + std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), [](T &val){return val * val;}); - return sum / N; + return sum / N; } int main(int argc, char* argv[]) { - constexpr int N = 1e9; - time_point_t mark = std::chrono::system_clock::now(); - auto es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); - T sum = 0; + constexpr int N = 1e9; + time_point_t mark = std::chrono::system_clock::now(); + auto es = + std::chrono::duration(std::chrono::system_clock::now() - mark) + .count(); + T sum = 0; #if 1 // 0 if only want to run with pointers - std::vector A(N); - std::vector B(N); - std::vector Y(N); + std::vector A(N); + std::vector B(N); + std::vector Y(N); - mark = std::chrono::system_clock::now(); - sum = work(A, B, Y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); - std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; + mark = std::chrono::system_clock::now(); + sum = work(A, B, Y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark) + .count(); + std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; #endif #if 1 // 0 if only want to run with vectors - // allocate memory - where is this allocated? - T* a = new T[N]; - T* b = new T[N]; - T* y = new T[N]; - - sum = 0; - mark = std::chrono::system_clock::now(); - sum = work(a, b, y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); - std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl << std::endl; + // allocate memory - where is this allocated? 
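The comment above asks where these raw new T[N] allocations live. My understanding, stated as an assumption rather than a claim about this patch, is that nvc++ with -stdpar=gpu places heap allocations in CUDA managed (unified) memory so the parallel algorithms can touch them from the device; below is a hedged sketch of the explicit equivalent, with an illustrative helper name.

#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative sketch, not part of the patch: explicit managed allocation,
// roughly what -stdpar=gpu is assumed to arrange implicitly for new/malloc.
template <typename T>
T* alloc_managed(std::size_t n) {
    T* p = nullptr;
    if (cudaMallocManaged(&p, n * sizeof(T)) != cudaSuccess) {
        std::fprintf(stderr, "cudaMallocManaged failed\n");
        return nullptr;
    }
    return p;  // visible to host and device; release with cudaFree(p)
}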
+ T* a = new T[N]; + T* b = new T[N]; + T* y = new T[N]; + + sum = 0; + mark = std::chrono::system_clock::now(); + sum = work(a, b, y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark) + .count(); + std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl + << std::endl; #endif - // do not use scientific notation - std::cout << std::fixed << "sum: " << sum << "\n"; + // do not use scientific notation + std::cout << std::fixed << "sum: " << sum << "\n"; - return 0; + return 0; } \ No newline at end of file diff --git a/apps/comm-study/comm-study.cpp b/apps/comm-study/comm-study.cpp index 99abcfc..7629ce0 100644 --- a/apps/comm-study/comm-study.cpp +++ b/apps/comm-study/comm-study.cpp @@ -37,91 +37,106 @@ using time_point_t = std::chrono::system_clock::time_point; // must take in the pointers/vectors by reference template auto work(P& A, P& B, P& Y, int N) { - T sum = 0.0; - - // init A and B separately - will it cause an H2D copy? - sender auto s1 = - then(just(), - [&] { std::for_each(std::execution::par_unseq, &A[0], &A[N], [&](T& ai) { ai = cos(M_PI / 4); }); }) - // trigger a D2H here - | then([&] { - for (int i = 0; i < N / 3; i++) { - // read only or read-write operations - sum += A[i] / N; - - // this line if commented should not result in an H2D - // after this but it does. - // A[i] = sin(M_PI/4); - } - std::cout << std::endl; - }); - - // will it cause an H2D here? - sender auto s2 = then( - just(), [&] { std::for_each(std::execution::par_unseq, &B[0], &B[N], [&](T& bi) { bi = sin(M_PI / 6); }); }); - - // will s1 and s2 execute in parallel or not? - sync_wait(when_all(std::move(s1), std::move(s2))); - - // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) - sender auto s3 = then(just(), - [&] { - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], &A[0], - [&](T& ai, T& bi) { return ai + bi; }); - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], &Y[0], - [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); - }) - // should trigger a D2H copy of N/3 elements - | then([&] { - for (int i = 0; i < N / 3; i++) - sum += Y[i] / N; - - std::cout << std::endl; - }) - // get sum(Y) - wonder if there is another H2D as we only read it in the - // last step - | then([&] { return std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus()); }); - - auto [val] = sync_wait(s3).value(); - - return sum += val; + T sum = 0.0; + + // init A and B separately - will it cause an H2D copy? + sender auto s1 = then(just(), + [&] { + std::for_each(std::execution::par_unseq, &A[0], &A[N], + [&](T& ai) { ai = cos(M_PI / 4); }); + }) + // trigger a D2H here + | then([&] { + for (int i = 0; i < N / 3; i++) { + // read only or read-write operations + sum += A[i] / N; + + // this line if commented should not result in an H2D + // after this but it does. + // A[i] = sin(M_PI/4); + } + std::cout << std::endl; + }); + + // will it cause an H2D here? + sender auto s2 = then(just(), [&] { + std::for_each(std::execution::par_unseq, &B[0], &B[N], + [&](T& bi) { bi = sin(M_PI / 6); }); + }); + + // will s1 and s2 execute in parallel or not? 
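The comment above asks whether s1 and s2 execute in parallel. Since both are rooted in just() with no scheduler, my expectation, offered as an assumption, is that they are started one after the other by the thread calling sync_wait; the sketch below shows one way to give each branch its own worker, reusing the exec::static_thread_pool and stdexec algorithms that already appear elsewhere in this patch (lambda bodies elided).

#include <exec/static_thread_pool.hpp>
#include <stdexec/execution.hpp>
#include <utility>

// Illustrative sketch, not part of the patch.
int main() {
    exec::static_thread_pool pool(2);
    stdexec::scheduler auto sch = pool.get_scheduler();

    stdexec::sender auto init_a = stdexec::schedule(sch) | stdexec::then([] { /* fill A */ });
    stdexec::sender auto init_b = stdexec::schedule(sch) | stdexec::then([] { /* fill B */ });

    // when_all starts both branches; with two pool workers they can overlap.
    stdexec::sync_wait(stdexec::when_all(std::move(init_a), std::move(init_b)));
    return 0;
}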
+ sync_wait(when_all(std::move(s1), std::move(s2))); + + // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) + sender auto s3 = + then(just(), + [&] { + std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], + &A[0], [&](T& ai, T& bi) { return ai + bi; }); + std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], + &Y[0], [&](T& ai, T& bi) { + return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); + }); + }) + // should trigger a D2H copy of N/3 elements + | then([&] { + for (int i = 0; i < N / 3; i++) + sum += Y[i] / N; + + std::cout << std::endl; + }) + // get sum(Y) - wonder if there is another H2D as we only read it in the + // last step + | then([&] { + return std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, + std::plus()); + }); + + auto [val] = sync_wait(s3).value(); + + return sum += val; } int main(int argc, char* argv[]) { - constexpr int N = 1e9; - time_point_t mark = std::chrono::system_clock::now(); - auto es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); - T sum = 0.0; + constexpr int N = 1e9; + time_point_t mark = std::chrono::system_clock::now(); + auto es = + std::chrono::duration(std::chrono::system_clock::now() - mark) + .count(); + T sum = 0.0; #if 1 // 0 if only arrays - std::vector A(N); - std::vector B(N); - std::vector Y(N); + std::vector A(N); + std::vector B(N); + std::vector Y(N); - mark = std::chrono::system_clock::now(); - sum = work(A, B, Y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); - std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; + mark = std::chrono::system_clock::now(); + sum = work(A, B, Y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark) + .count(); + std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; - std::cout << fixed << "sum: " << sum << "\n"; + std::cout << fixed << "sum: " << sum << "\n"; #endif #if 1 // 0 if only vectors - // allocate memory - can we just allocate it on device only? - T* a = new T[N]; - T* b = new T[N]; - T* y = new T[N]; - - sum = 0; - mark = std::chrono::system_clock::now(); - sum = work(a, b, y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); - std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl << std::endl; - - // do not use scientific notation - std::cout << fixed << "sum: " << sum << "\n"; + // allocate memory - can we just allocate it on device only? + T* a = new T[N]; + T* b = new T[N]; + T* y = new T[N]; + + sum = 0; + mark = std::chrono::system_clock::now(); + sum = work(a, b, y, N); + es = std::chrono::duration(std::chrono::system_clock::now() - mark) + .count(); + std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl + << std::endl; + + // do not use scientific notation + std::cout << fixed << "sum: " << sum << "\n"; #endif - return 0; + return 0; } \ No newline at end of file diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp index 02bbd7b..b174b5a 100644 --- a/apps/fft/fft-serial.cpp +++ b/apps/fft/fft-serial.cpp @@ -33,12 +33,14 @@ // // simulation // -int main(int argc, char* argv[]) { +int main(int argc, char* argv[]) +{ // parse params fft_params_t args = argparse::parse(argc, argv); // see if help wanted - if (args.help) { + if (args.help) + { args.print(); // prints all variables return 0; } @@ -58,7 +60,8 @@ int main(int argc, char* argv[]) { sig_t x_n(N, sig_type); - if (!isPowOf2(N)) { + if (!isPowOf2(N)) + { N = ceilPowOf2(N); std::cout << "log_2(N) != integer. 
Padding zeros for N = " << N << std::endl; @@ -67,7 +70,8 @@ int main(int argc, char* argv[]) { sig_t y_n(x_n); - if (print_sig) { + if (print_sig) + { std::cout << std::endl << "x[n] = "; x_n.printSignal(); std::cout << std::endl; @@ -76,36 +80,40 @@ int main(int argc, char* argv[]) { // niterations int niters = ilog2(N); - std::function fft = [&](data_t* x, int lN, const int N) { - int stride = N / lN; + std::function fft = [&](data_t *x, int lN, const int N) + { + int stride = N/lN; - if (lN == 2) { - auto x_0 = x[0] + x[1] * WNk(N, 0); - x[1] = x[0] - x[1] * WNk(N, 0); + if (lN == 2) + { + auto x_0 = x[0] + x[1]* WNk(N, 0); + x[1] = x[0] - x[1]* WNk(N, 0); x[0] = x_0; return; } // vectors for left and right - std::vector e(lN / 2); - std::vector o(lN / 2); + std::vector e(lN/2); + std::vector o(lN/2); // copy data into vectors - for (auto k = 0; k < lN / 2; k++) { - e[k] = x[2 * k]; - o[k] = x[2 * k + 1]; + for (auto k = 0; k < lN/2; k++) + { + e[k] = x[2*k]; + o[k] = x[2*k+1]; } // compute N/2 pt FFT on even - fft(e.data(), lN / 2, N); + fft(e.data(), lN/2, N); // compute N/2 pt FFT on odd - fft(o.data(), lN / 2, N); + fft(o.data(), lN/2, N); // combine even and odd FFTs - for (int k = 0; k < lN / 2; k++) { + for (int k = 0; k < lN/2; k++) + { x[k] = e[k] + o[k] * WNk(N, k * stride); - x[k + lN / 2] = e[k] - o[k] * WNk(N, k * stride); + x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride); } return; @@ -114,7 +122,8 @@ int main(int argc, char* argv[]) { // fft radix-2 algorithm with senders fft(y_n.data(), N, N); - if (print_sig) { + if (print_sig) + { std::cout << "X[k] = "; y_n.printSignal(); std::cout << std::endl; diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 56354f8..80a7446 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -32,10 +32,10 @@ #include #include +#include +#include #include #include -#include -#include #include #include "argparse/argparse.hpp" @@ -56,135 +56,158 @@ constexpr int radix = 2; // parameters struct fft_params_t : public argparse::Args { - sig_type_t& sig = - kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(sig_type_t::box); - int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); - int& N = kwarg("N", "N-point FFT").set_default(1024); - bool& print_sig = flag("p,print", "print x[n] and X(k)"); + sig_type_t& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(sig_type_t::box); + int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); + int& N = kwarg("N", "N-point FFT").set_default(1024); + bool& print_sig = flag("p,print", "print x[n] and X(k)"); #if defined(USE_OMP) - int& nthreads = kwarg("nthreads", "number of threads").set_default(1); + int& nthreads = kwarg("nthreads", "number of threads").set_default(1); #endif // USE_OMP - bool& help = flag("h, help", "print help"); - bool& print_time = flag("t,time", "print fft time"); + bool& help = flag("h, help", "print help"); + bool& print_time = flag("t,time", "print fft time"); }; inline bool isPowOf2(long long int x) { - return !(x == 0) && !(x & (x - 1)); + return !(x == 0) && !(x & (x - 1)); } template -void printVec(T& vec, int len) { +void printVec(T &vec, int len) +{ std::cout << "[ "; for (int i = 0; i < len; i++) - std::cout << vec[i] << " "; + std::cout << vec[i] << " "; std::cout << "]" << std::endl; } -inline std::complex WNk(int N, int k) { - return std::complex(exp(-2 * M_PI * 1 / N * k * 1i)); +inline std::complex WNk(int N, int k) +{ + return std::complex(exp(-2*M_PI*1/N*k*1i)); } 
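The radix-2 recursion above splits the signal into even and odd halves and recombines them with the twiddle factor W_N^k = exp(-2*pi*i*k/N). Below is a minimal sketch of that factor and of one butterfly, mirroring the combine loop in fft-serial.cpp; names are illustrative and the sketch is not part of the patch.

#include <cmath>
#include <complex>

// W_N^k = e^{-2*pi*i*k/N}, via Euler's formula.
inline std::complex<double> twiddle(int N, int k) {
    const double theta = -2.0 * M_PI * k / static_cast<double>(N);
    return {std::cos(theta), std::sin(theta)};
}

// One radix-2 butterfly: even/odd half-FFT outputs e and o are combined into
// output bins k and k + N/2, as in the "combine even and odd FFTs" loop.
inline void butterfly(std::complex<double>& xk, std::complex<double>& xk_half,
                      std::complex<double> e, std::complex<double> o, int N, int k) {
    const std::complex<double> w = twiddle(N, k);
    xk      = e + o * w;
    xk_half = e - o * w;
}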
-inline int ceilPowOf2(unsigned int v) { - return static_cast(std::bit_ceil(v)); +inline int ceilPowOf2(unsigned int v) +{ + return static_cast(std::bit_ceil(v)); } -inline int ilog2(uint32_t x) { +inline int ilog2(uint32_t x) +{ return static_cast(log2(x)); } -class signal { - public: - signal() = default; - - signal(int N) { - if (N <= 0) { - std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; - exit(1); - } - y.reserve(ceilPowOf2(N)); - y.resize(N); +class signal +{ +public: + + signal() = default; + signal(int N) + { + if (N <= 0) + { + std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; + exit(1); } - - signal(signal& rhs) { y = rhs.y; } - - signal(std::vector& in) { y = std::move(in); } - - signal(int N, sig_type type) { - if (N <= 0) { - std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; - exit(1); - } - y.reserve(ceilPowOf2(N)); - y.resize(N); - signalGenerator(type); + y.reserve(ceilPowOf2(N)); + y.resize(N); + } + + signal(signal &rhs) + { + y = rhs.y; + } + signal(std::vector &in) + { + y = std::move(in); + } + + signal(int N, sig_type type) + { + if (N <= 0) + { + std::cerr << "FATAL: N must be > 0. exiting.." << std::endl; + exit(1); } - - void signalGenerator(sig_type type = sig_type::box) { - int N = y.size(); - - switch (type) { - case sig_type::square: - for (int n = 0; n < N; ++n) - y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : -1.0; - break; - case sig_type::sinusoid: - for (int n = 0; n < N; ++n) - y[n] = std::sin(2.0 * M_PI * n / N); - break; - case sig_type::sawtooth: - for (int n = 0; n < N; ++n) - y[n] = 2.0 * (n / N) - 1.0; - break; - case sig_type::triangle: - for (int n = 0; n < N; ++n) - y[n] = 2.0 * std::abs(2.0 * (n / N) - 1.0) - 1.0; - break; - case sig_type::sinc: - y[0] = 1.0; - for (int n = 1; n < N; ++n) - y[n] = std::sin(2.0 * M_PI * n / N) / (2.0 * M_PI * n / N); - break; - case sig_type::box: - for (int n = 0; n < N; ++n) - y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; - break; - default: - std::cerr << "FATAL: Unknown signal type. exiting.." << std::endl; - exit(1); - } + y.reserve(ceilPowOf2(N)); + y.resize(N); + signalGenerator(type); + } + + void signalGenerator(sig_type type=sig_type::box) + { + int N = y.size(); + + switch (type) { + case sig_type::square: + for (int n = 0; n < N; ++n) + y[n] = (n < N / 4 || n > 3 * N/4) ? 1.0 : -1.0; + break; + case sig_type::sinusoid: + for (int n = 0; n < N; ++n) + y[n] = std::sin(2.0 * M_PI * n / N); + break; + case sig_type::sawtooth: + for (int n = 0; n < N; ++n) + y[n] = 2.0 * (n / N) - 1.0; + break; + case sig_type::triangle: + for (int n = 0; n < N; ++n) + y[n] = 2.0 * std::abs(2.0 * (n / N) - 1.0) - 1.0; + break; + case sig_type::sinc: + y[0] = 1.0; + for (int n = 1; n < N; ++n) + y[n] = std::sin(2.0 * M_PI * n / N) / (2.0 * M_PI * n / N); + break; + case sig_type::box: + for (int n = 0; n < N; ++n) + y[n] = (n < N / 4 || n > 3 * N / 4) ? 1.0 : 0.0; + break; + default: + std::cerr << "FATAL: Unknown signal type. exiting.." 
<< std::endl; + exit(1); } + } - ~signal() { y.clear(); } - - data_t* data() { return y.data(); } + ~signal() + { + y.clear(); + } - int len() { return y.size(); } + data_t *data() { return y.data(); } + int len() { return y.size(); } - void resize(int N) { - if (N != y.size()) - y.resize(N, 0); - } + void resize(int N) + { + if (N != y.size()) + y.resize(N, 0); + } - data_t& operator[](int n) { return y[n]; } + data_t &operator[](int n) + { + return y[n]; + } - data_t& operator()(int n) { return y[n]; } + data_t &operator()(int n) + { + return y[n]; + } - void printSignal() { - std::cout << std::fixed << std::setprecision(2); + void printSignal() { + std::cout << std::fixed << std::setprecision(2); - std::cout << "[ "; - for (auto& el : y) - std::cout << el << " "; + std::cout << "[ "; + for (auto &el : y) + std::cout << el << " "; - std::cout << "]" << std::endl; - } + std::cout << "]" << std::endl; + } - private: - // y[n] - std::vector y; +private: + // y[n] + std::vector y; }; using sig_t = signal; diff --git a/apps/heat-equation/heat-equation-cuda.cpp b/apps/heat-equation/heat-equation-cuda.cpp index b8cca1b..3ea2988 100644 --- a/apps/heat-equation/heat-equation-cuda.cpp +++ b/apps/heat-equation/heat-equation-cuda.cpp @@ -41,14 +41,15 @@ __constant__ Real_t dx[2]; // error checking function template -static inline void check(T result, const char* const file, const int line, bool is_fatal = true) { - if (result != cudaSuccess) { - std::cerr << "CUDA error at " << file << ":" << line << std::endl; - std::cerr << cudaGetErrorString(result) << std::endl; - - if (is_fatal) - exit(result); - } +static inline void check(T result, const char* const file, const int line, + bool is_fatal = true) { + if (result != cudaSuccess) { + std::cerr << "CUDA error at " << file << ":" << line << std::endl; + std::cerr << cudaGetErrorString(result) << std::endl; + + if (is_fatal) + exit(result); + } } // @@ -56,24 +57,24 @@ static inline void check(T result, const char* const file, const int line, bool // template __global__ void initialize(T* phi, int ncells, int ghost_cells) { - int ind = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int gsize = ncells * ncells; + int ind = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + int gsize = ncells * ncells; - for (; ind < gsize; ind += blockDim.x * gridDim.x) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); + for (; ind < gsize; ind += blockDim.x * gridDim.x) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); - // phi(x,y) = 1 + exp(-r^2) - phi[(i)*phi_old_extent + j] = 1 + exp(-r2); - } + // phi(x,y) = 1 + exp(-r^2) + phi[(i)*phi_old_extent + j] = 1 + exp(-r2); + } } // @@ -81,52 +82,57 @@ __global__ void initialize(T* phi, int ncells, int ghost_cells) { // template __global__ void fillBoundary(T* phi_old, int ncells, int ghost_cells) { - int pos = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int len = phi_old_extent; + int pos = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + 
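All of the kernels in this file walk their index space with a grid-stride loop, so any block/grid configuration covers the whole range. Below is a standalone sketch of the pattern with an illustrative kernel name; it is not part of the patch.

// Each thread starts at its global index and strides by the total number of
// threads in the grid until [0, n) is exhausted.
template <typename T>
__global__ void scale(T* x, int n, T s) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x)
        x[i] *= s;
}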
int len = phi_old_extent; - for (; pos < phi_old_extent - nghosts; pos += blockDim.x * gridDim.x) { - int i = pos + ghost_cells; + for (; pos < phi_old_extent - nghosts; pos += blockDim.x * gridDim.x) { + int i = pos + ghost_cells; - // fill boundary cells in phi_old - phi_old[i] = phi_old[i + (ghost_cells * len)]; + // fill boundary cells in phi_old + phi_old[i] = phi_old[i + (ghost_cells * len)]; - phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; + phi_old[i + (len * (len - ghost_cells))] = + phi_old[i + (len * (len - ghost_cells - 1))]; - phi_old[i * len] = phi_old[(ghost_cells * len) + i]; + phi_old[i * len] = phi_old[(ghost_cells * len) + i]; - phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; - } + phi_old[(len - ghost_cells) + (len * i)] = + phi_old[(len - ghost_cells - 1) + (len * i)]; + } } // // jacobi 2d stencil kernel // template -__global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, Real_t dt) { - int pos = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int gsize = ncells * ncells; - - for (; pos < gsize; pos += blockDim.x * gridDim.x) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new[(i - 1) * ncells + j - 1] = - phi_old[(i)*phi_old_extent + j] + - alpha * dt * - - ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i - 1) * phi_old_extent + j]) / - (dx[0] * dx[0]) + - - (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i)*phi_old_extent + j - 1]) / - (dx[1] * dx[1])); - } +__global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, + Real_t dt) { + int pos = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + int gsize = ncells * ncells; + + for (; pos < gsize; pos += blockDim.x * gridDim.x) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new[(i - 1) * ncells + j - 1] = + phi_old[(i)*phi_old_extent + j] + + alpha * dt * + + ((phi_old[(i + 1) * phi_old_extent + j] - + 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i - 1) * phi_old_extent + j]) / + (dx[0] * dx[0]) + + + (phi_old[(i)*phi_old_extent + j + 1] - + 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i)*phi_old_extent + j - 1]) / + (dx[1] * dx[1])); + } } // @@ -134,121 +140,127 @@ __global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, Real_t // template __global__ void parallelCopy(T* phi_old, T* phi_new, int ncells) { - int pos = blockIdx.x * blockDim.x + threadIdx.x; - int d_nghosts = nghosts; - int phi_old_extent = ncells + d_nghosts; - int gsize = ncells * ncells; - - for (; pos < gsize; pos += blockDim.x * gridDim.x) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; - } + int pos = blockIdx.x * blockDim.x + threadIdx.x; + int d_nghosts = nghosts; + int phi_old_extent = ncells + d_nghosts; + int gsize = ncells * ncells; + + for (; pos < gsize; pos += blockDim.x * gridDim.x) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; + } } // // main simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); - 
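The jacobi kernel above advances the 2D heat equation with an explicit forward-Euler step, phi_new = phi_old + alpha * dt * Laplacian(phi_old), using a 5-point stencil. Below is a per-cell sketch of that update; the helper name is illustrative and the sketch is not part of the patch.

// c = centre value, e/w/n/s = east/west/north/south neighbours,
// hx/hy = grid spacings (dx[0] and dx[1] in the kernels above).
inline double jacobi_update(double c, double e, double w, double n, double s,
                            double alpha, double dt, double hx, double hy) {
    const double lap = (e - 2.0 * c + w) / (hx * hx) +
                       (n - 2.0 * c + s) / (hy * hy);
    return c + alpha * dt * lap;  // one explicit time step of dphi/dt = alpha * Laplacian(phi)
}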
// see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; - // init simulation time - Real_t time = 0.0; + // init simulation time + Real_t time = 0.0; - // initialize dx, dy, dz - Real_t h_dx[dims]; - for (int i = 0; i < dims; ++i) - h_dx[i] = 1.0 / (ncells - 1); + // initialize dx, dy, dz + Real_t h_dx[dims]; + for (int i = 0; i < dims; ++i) + h_dx[i] = 1.0 / (ncells - 1); - cudaErrorCheck(cudaMemcpyToSymbol(dx, h_dx, sizeof(Real_t) * dims)); + cudaErrorCheck(cudaMemcpyToSymbol(dx, h_dx, sizeof(Real_t) * dims)); - // grid size - int gsize = ncells * ncells; + // grid size + int gsize = ncells * ncells; - // host memory for printing - Real_t* h_phi = nullptr; + // host memory for printing + Real_t* h_phi = nullptr; - // simulation setup (2D) - Real_t* phi_old = nullptr; - Real_t* phi_new = nullptr; + // simulation setup (2D) + Real_t* phi_old = nullptr; + Real_t* phi_new = nullptr; - cudaErrorCheck(cudaMalloc(&phi_old, sizeof(Real_t) * ((ncells + nghosts) * (ncells + nghosts)))); - cudaErrorCheck(cudaMalloc(&phi_new, sizeof(Real_t) * ((ncells) * (ncells)))); + cudaErrorCheck(cudaMalloc( + &phi_old, sizeof(Real_t) * ((ncells + nghosts) * (ncells + nghosts)))); + cudaErrorCheck(cudaMalloc(&phi_new, sizeof(Real_t) * ((ncells) * (ncells)))); - // setup grid - int blockSize = std::min(1024, gsize); // let's do at most 1024 threads. - int nBlocks = (gsize + blockSize - 1) / blockSize; + // setup grid + int blockSize = std::min(1024, gsize); // let's do at most 1024 threads. + int nBlocks = (gsize + blockSize - 1) / blockSize; - Timer timer; + Timer timer; - // initialize grid - initialize<<>>(phi_old, ncells, ghost_cells); + // initialize grid + initialize<<>>(phi_old, ncells, ghost_cells); - cudaErrorCheck(cudaDeviceSynchronize()); + cudaErrorCheck(cudaDeviceSynchronize()); - // print initial grid if needed - if (args.print_grid) { - // copy initial grid to host - h_phi = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - cudaErrorCheck(cudaMemcpy(h_phi, phi_old, sizeof(Real_t) * (ncells + nghosts) * (ncells + nghosts), - cudaMemcpyDeviceToHost)); + // print initial grid if needed + if (args.print_grid) { + // copy initial grid to host + h_phi = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + cudaErrorCheck( + cudaMemcpy(h_phi, phi_old, + sizeof(Real_t) * (ncells + nghosts) * (ncells + nghosts), + cudaMemcpyDeviceToHost)); - printGrid(h_phi, ncells + nghosts); - } + printGrid(h_phi, ncells + nghosts); + } - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static int fBblock = std::min(1024, ncells); // let's do at most 1024 threads. - static int fBnBlocks = (ncells + fBblock - 1) / fBblock; // fillBoundary blocks + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static int fBblock = + std::min(1024, ncells); // let's do at most 1024 threads. 
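The launch configurations in this loop follow the usual ceiling-division rule: at most 1024 threads per block, and enough blocks that nBlocks * blockSize covers the work. A one-line sketch of that arithmetic, with an illustrative name, not part of the patch:

// ceil(a / b) for positive integers, e.g. nBlocks = ceil_div(ncells, fBblock).
constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }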
+ static int fBnBlocks = + (ncells + fBblock - 1) / fBblock; // fillBoundary blocks - // fillboundary - fillBoundary<<>>(phi_old, ncells, ghost_cells); + // fillboundary + fillBoundary<<>>(phi_old, ncells, ghost_cells); - // jacobi - jacobi<<>>(phi_old, phi_new, ncells, alpha, dt); + // jacobi + jacobi<<>>(phi_old, phi_new, ncells, alpha, dt); - // parallelCopy - parallelCopy<<>>(phi_old, phi_new, ncells); + // parallelCopy + parallelCopy<<>>(phi_old, phi_new, ncells); - cudaErrorCheck(cudaDeviceSynchronize()); + cudaErrorCheck(cudaDeviceSynchronize()); - // update time - time += dt; - } + // update time + time += dt; + } - auto elapsed = timer.stop(); + auto elapsed = timer.stop(); - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - // print final grid if needed - if (args.print_grid) { - cudaErrorCheck(cudaMemcpy(h_phi, phi_new, sizeof(Real_t) * gsize, cudaMemcpyDeviceToHost)); - printGrid(h_phi, ncells); + // print final grid if needed + if (args.print_grid) { + cudaErrorCheck(cudaMemcpy(h_phi, phi_new, sizeof(Real_t) * gsize, + cudaMemcpyDeviceToHost)); + printGrid(h_phi, ncells); - // free host memory - delete[] h_phi; - h_phi = nullptr; - } + // free host memory + delete[] h_phi; + h_phi = nullptr; + } - // free device memory - cudaErrorCheck(cudaFree(phi_old)); - cudaErrorCheck(cudaFree(phi_new)); + // free device memory + cudaErrorCheck(cudaFree(phi_old)); + cudaErrorCheck(cudaFree(phi_new)); - return 0; + return 0; } diff --git a/apps/heat-equation/heat-equation-gpu-scheduler.cpp b/apps/heat-equation/heat-equation-gpu-scheduler.cpp index 2b9590d..b294235 100644 --- a/apps/heat-equation/heat-equation-gpu-scheduler.cpp +++ b/apps/heat-equation/heat-equation-gpu-scheduler.cpp @@ -44,132 +44,138 @@ using namespace nvexec; // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // init simulation time - Real_t time = 0.0; - - // initialize dx, dy, dz - thrust::universal_vector dx(dims); - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - thrust::universal_vector grid_old((ncells + nghosts) * (ncells + nghosts)); - thrust::universal_vector grid_new(ncells * ncells); - - /* Real_t *grid_old = new Real_t[(ncells+nghosts) * (ncells+nghosts)]; - Real_t *grid_new = new Real_t[(ncells) * (ncells)];*/ - - // initialize grid - auto phi_old = thrust::raw_pointer_cast(grid_old.data()); - auto phi_new = thrust::raw_pointer_cast(grid_new.data()); - - Timer timer; + // parse params + heat_params_t args = argparse::parse(argc, argv); - // scheduler from gpu - nvexec::stream_context stream_ctx{}; - auto gpu = stream_ctx.get_scheduler(); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // init simulation time + Real_t time = 0.0; + 
+ // initialize dx, dy, dz + thrust::universal_vector dx(dims); + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + thrust::universal_vector grid_old((ncells + nghosts) * + (ncells + nghosts)); + thrust::universal_vector grid_new(ncells * ncells); + + /* Real_t *grid_old = new Real_t[(ncells+nghosts) * (ncells+nghosts)]; + Real_t *grid_new = new Real_t[(ncells) * (ncells)];*/ - auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), thrust::raw_pointer_cast(dx.data()) + dx.size()}; - auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; - auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; - auto phi_old_extent = ncells + nghosts; + // initialize grid + auto phi_old = thrust::raw_pointer_cast(grid_old.data()); + auto phi_new = thrust::raw_pointer_cast(grid_new.data()); + + Timer timer; + + // scheduler from gpu + nvexec::stream_context stream_ctx{}; + auto gpu = stream_ctx.get_scheduler(); + + auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), + thrust::raw_pointer_cast(dx.data()) + dx.size()}; + auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; + auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; + auto phi_old_extent = ncells + nghosts; + + int gsize = ncells * ncells; + auto heat_eq_init = ex::transfer_just(gpu, dx_span, phi_old_span) | + ex::bulk(gsize, [=](int pos, auto ds, auto phi) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + Real_t x = pos(i, ghost_cells, ds[0]); + Real_t y = pos(j, ghost_cells, ds[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi[(i)*phi_old_extent + j] = 1 + exp(-r2); + }); + + ex::sync_wait(std::move(heat_eq_init)); + if (args.print_grid) + printGrid(phi_old, ncells + nghosts); + + auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static auto evolve = + tx | + ex::bulk(phi_old_extent - nghosts, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = pos + ghost_cells; + int len = phi_old_extent; + // fill boundary cells in old_phi + phi_old[i] = phi_old[i + (ghost_cells * len)]; + phi_old[i + (len * (len - ghost_cells))] = + phi_old[i + (len * (len - ghost_cells - 1))]; + phi_old[i * len] = phi_old[(ghost_cells * len) + i]; + phi_old[(len - ghost_cells) + (len * i)] = + phi_old[(len - ghost_cells - 1) + (len * i)]; + }) | + ex::bulk(gsize, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new[(i - 1) * ncells + j - 1] = + phi_old[(i)*phi_old_extent + j] + + alpha * dt * + ((phi_old[(i + 1) * phi_old_extent + j] - + 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i - 1) * phi_old_extent + j]) / + (ds[0] * ds[0]) + + (phi_old[(i)*phi_old_extent + j + 1] - + 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i)*phi_old_extent + j - 1]) / + (ds[1] * ds[1])); + }) | + ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; + }); - int gsize = ncells * ncells; - auto heat_eq_init = - ex::transfer_just(gpu, dx_span, phi_old_span) | ex::bulk(gsize, [=](int pos, auto ds, auto phi) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); + ex::sync_wait(std::move(evolve)); - Real_t x = pos(i, ghost_cells, ds[0]); - Real_t 
y = pos(j, ghost_cells, ds[1]); + // update the simulation time + time += dt; + } - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); + auto elapsed = timer.stop(); - // phi(x,y) = 1 + exp(-r^2) - phi[(i)*phi_old_extent + j] = 1 + exp(-r2); - }); + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - ex::sync_wait(std::move(heat_eq_init)); + auto finalize = ex::then(ex::just(), [&]() { if (args.print_grid) - printGrid(phi_old, ncells + nghosts); - - auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static auto evolve = - tx | - ex::bulk(phi_old_extent - nghosts, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = pos + ghost_cells; - int len = phi_old_extent; - // fill boundary cells in old_phi - phi_old[i] = phi_old[i + (ghost_cells * len)]; - phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; - phi_old[i * len] = phi_old[(ghost_cells * len) + i]; - phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; - }) | - ex::bulk(gsize, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new[(i - 1) * ncells + j - 1] = - phi_old[(i)*phi_old_extent + j] + - alpha * dt * - ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i - 1) * phi_old_extent + j]) / - (ds[0] * ds[0]) + - (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i)*phi_old_extent + j - 1]) / - (ds[1] * ds[1])); - }) | - ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; - }); - - ex::sync_wait(std::move(evolve)); - - // update the simulation time - time += dt; - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - auto finalize = ex::then(ex::just(), [&]() { - if (args.print_grid) - // print the final grid - printGrid(phi_new, ncells); - }); - - // end the simulation - ex::sync_wait(std::move(finalize)); + // print the final grid + printGrid(phi_new, ncells); + }); - return 0; + // end the simulation + ex::sync_wait(std::move(finalize)); + + return 0; } \ No newline at end of file diff --git a/apps/heat-equation/heat-equation-mdspan.cpp b/apps/heat-equation/heat-equation-mdspan.cpp index f38b9ed..1ae243b 100644 --- a/apps/heat-equation/heat-equation-mdspan.cpp +++ b/apps/heat-equation/heat-equation-mdspan.cpp @@ -33,121 +33,128 @@ // fill boundary cells template void fill2Dboundaries_mdspan(T* grid, int len, int ghost_cells = 1) { - auto row_view = std::mdspan(grid, len, len); - - for (auto j = ghost_cells; j < row_view.extent(1) - ghost_cells; ++j) { - row_view(0, j) = row_view(ghost_cells, j); - row_view(row_view.extent(0) - ghost_cells, j) = row_view(row_view.extent(0) - ghost_cells - 1, j); - } - - auto col_view = std::mdspan(grid, len, len); - - for (auto i = 1; i < col_view.extent(1) - 1; ++i) { - col_view(0, i) = col_view(ghost_cells, i); - col_view(col_view.extent(0) - 1, i) = col_view(col_view.extent(0) - ghost_cells - 1, i); - } + auto row_view = std::mdspan(grid, len, len); + + for (auto j = ghost_cells; j < row_view.extent(1) - ghost_cells; ++j) { + row_view(0, j) = 
row_view(ghost_cells, j); + row_view(row_view.extent(0) - ghost_cells, j) = + row_view(row_view.extent(0) - ghost_cells - 1, j); + } + + auto col_view = + std::mdspan(grid, len, len); + + for (auto i = 1; i < col_view.extent(1) - 1; ++i) { + col_view(0, i) = col_view(ghost_cells, i); + col_view(col_view.extent(0) - 1, i) = + col_view(col_view.extent(0) - ghost_cells - 1, i); + } } // // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + + auto phi_old = std::mdspan( + grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = + std::mdspan(grid_new, ncells, ncells); + + Timer timer; + + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + for (int i = 1; i < phi_old.extent(0) - 1; ++i) { + for (int j = 1; j < phi_old.extent(1) - 1; ++j) { + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - - auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = std::mdspan(grid_new, ncells, ncells); - - Timer timer; - - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] - for (int i = 1; i < phi_old.extent(0) - 1; ++i) { - for (int j = 1; j < phi_old.extent(1) - 1; ++j) { - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - } + } + + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); + + // init simulation time + Real_t time = 0.0; + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + // fill boundary cells in old_phi + fill2Dboundaries_mdspan(grid_old, ncells + nghosts, ghost_cells); + + // update phi_new + for (auto i = 1; i < phi_old.extent(0) - 1; i++) { + for (auto j = 1; j < phi_old.extent(1) - 1; j++) { + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / + (dx[0] * dx[0]) + + (phi_old(i, j + 
1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / + (dx[1] * dx[1])); + } } - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); - - // init simulation time - Real_t time = 0.0; - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - // fill boundary cells in old_phi - fill2Dboundaries_mdspan(grid_old, ncells + nghosts, ghost_cells); - - // update phi_new - for (auto i = 1; i < phi_old.extent(0) - 1; i++) { - for (auto j = 1; j < phi_old.extent(1) - 1; j++) { - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + - alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); - } - } - - // update the simulation time - time += dt; - - // parallel copy phi_new to phi_old - for (auto i = 1; i < phi_old.extent(0) - 1; i++) - for (auto j = 1; j < phi_old.extent(1) - 1; j++) - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - } + // update the simulation time + time += dt; - auto elapsed = timer.stop(); + // parallel copy phi_new to phi_old + for (auto i = 1; i < phi_old.extent(0) - 1; i++) + for (auto j = 1; j < phi_old.extent(1) - 1; j++) + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + } - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } + auto elapsed = timer.stop(); - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - // delete all memory - delete[] grid_old; - delete[] grid_new; + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); - grid_old = nullptr; - grid_new = nullptr; + // delete all memory + delete[] grid_old; + delete[] grid_new; - return 0; + grid_old = nullptr; + grid_new = nullptr; + + return 0; } diff --git a/apps/heat-equation/heat-equation-multigpu-scheduler.cpp b/apps/heat-equation/heat-equation-multigpu-scheduler.cpp index d8e79b3..efcc9e5 100644 --- a/apps/heat-equation/heat-equation-multigpu-scheduler.cpp +++ b/apps/heat-equation/heat-equation-multigpu-scheduler.cpp @@ -44,129 +44,135 @@ using namespace nvexec; // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // init simulation time - Real_t time = 0.0; - - // initialize dx, dy, dz - thrust::universal_vector dx(dims); - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - thrust::universal_vector grid_old((ncells + nghosts) * (ncells + nghosts)); - thrust::universal_vector grid_new(ncells * ncells); - - // initialize grid - auto phi_old = thrust::raw_pointer_cast(grid_old.data()); - auto phi_new = thrust::raw_pointer_cast(grid_new.data()); - - Timer timer; - - // scheduler from gpu - nvexec::multi_gpu_stream_context stream_ctx{}; - auto gpu = stream_ctx.get_scheduler(); - - auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), thrust::raw_pointer_cast(dx.data()) + dx.size()}; - auto phi_old_span = std::span{phi_old, phi_old + 
grid_old.size()}; - auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; - auto phi_old_extent = ncells + nghosts; - - int gsize = ncells * ncells; - auto heat_eq_init = - ex::transfer_just(gpu, dx_span, phi_old_span) | ex::bulk(gsize, [=](int pos, auto ds, auto phi) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - Real_t x = pos(i, ghost_cells, ds[0]); - Real_t y = pos(j, ghost_cells, ds[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi[(i)*phi_old_extent + j] = 1 + exp(-r2); + // parse params + heat_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // init simulation time + Real_t time = 0.0; + + // initialize dx, dy, dz + thrust::universal_vector dx(dims); + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + thrust::universal_vector grid_old((ncells + nghosts) * + (ncells + nghosts)); + thrust::universal_vector grid_new(ncells * ncells); + + // initialize grid + auto phi_old = thrust::raw_pointer_cast(grid_old.data()); + auto phi_new = thrust::raw_pointer_cast(grid_new.data()); + + Timer timer; + + // scheduler from gpu + nvexec::multi_gpu_stream_context stream_ctx{}; + auto gpu = stream_ctx.get_scheduler(); + + auto dx_span = std::span{thrust::raw_pointer_cast(dx.data()), + thrust::raw_pointer_cast(dx.data()) + dx.size()}; + auto phi_old_span = std::span{phi_old, phi_old + grid_old.size()}; + auto phi_new_span = std::span{phi_new, phi_new + grid_new.size()}; + auto phi_old_extent = ncells + nghosts; + + int gsize = ncells * ncells; + auto heat_eq_init = ex::transfer_just(gpu, dx_span, phi_old_span) | + ex::bulk(gsize, [=](int pos, auto ds, auto phi) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + Real_t x = pos(i, ghost_cells, ds[0]); + Real_t y = pos(j, ghost_cells, ds[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi[(i)*phi_old_extent + j] = 1 + exp(-r2); + }); + + ex::sync_wait(std::move(heat_eq_init)); + if (args.print_grid) + printGrid(phi_old, ncells + nghosts); + + auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static auto evolve = + tx | + ex::bulk(phi_old_extent - nghosts, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = pos + ghost_cells; + int len = phi_old_extent; + // fill boundary cells in old_phi + phi_old[i] = phi_old[i + (ghost_cells * len)]; + phi_old[i + (len * (len - ghost_cells))] = + phi_old[i + (len * (len - ghost_cells - 1))]; + phi_old[i * len] = phi_old[(ghost_cells * len) + i]; + phi_old[(len - ghost_cells) + (len * i)] = + phi_old[(len - ghost_cells - 1) + (len * i)]; + }) | + ex::bulk(gsize, + [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new[(i - 1) * ncells + j - 1] = + phi_old[(i)*phi_old_extent + j] + + alpha * dt * + ((phi_old[(i + 1) * phi_old_extent + j] - + 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i - 1) * phi_old_extent + j]) / + (ds[0] * ds[0]) + + (phi_old[(i)*phi_old_extent + j + 1] 
- + 2.0 * phi_old[(i)*phi_old_extent + j] + + phi_old[(i)*phi_old_extent + j - 1]) / + (ds[1] * ds[1])); + }) | + ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; }); - ex::sync_wait(std::move(heat_eq_init)); + ex::sync_wait(std::move(evolve)); + + // update the simulation time + time += dt; + } + + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + auto finalize = ex::then(ex::just(), [&]() { if (args.print_grid) - printGrid(phi_old, ncells + nghosts); - - auto tx = ex::transfer_just(gpu, dx_span, phi_old_span, phi_new_span); - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static auto evolve = - tx | - ex::bulk(phi_old_extent - nghosts, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = pos + ghost_cells; - int len = phi_old_extent; - // fill boundary cells in old_phi - phi_old[i] = phi_old[i + (ghost_cells * len)]; - phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; - phi_old[i * len] = phi_old[(ghost_cells * len) + i]; - phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; - }) | - ex::bulk(gsize, - [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new[(i - 1) * ncells + j - 1] = - phi_old[(i)*phi_old_extent + j] + - alpha * dt * - ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i - 1) * phi_old_extent + j]) / - (ds[0] * ds[0]) + - (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + - phi_old[(i)*phi_old_extent + j - 1]) / - (ds[1] * ds[1])); - }) | - ex::bulk(gsize, [=](int pos, auto ds, auto phi_old, auto phi_new) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; - }); - - ex::sync_wait(std::move(evolve)); - - // update the simulation time - time += dt; - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - auto finalize = ex::then(ex::just(), [&]() { - if (args.print_grid) - // print the final grid - printGrid(phi_new, ncells); - }); - - // end the simulation - ex::sync_wait(std::move(finalize)); + // print the final grid + printGrid(phi_new, ncells); + }); - return 0; + // end the simulation + ex::sync_wait(std::move(finalize)); + + return 0; } \ No newline at end of file diff --git a/apps/heat-equation/heat-equation-omp.cpp b/apps/heat-equation/heat-equation-omp.cpp index ebf89e2..6af69b0 100644 --- a/apps/heat-equation/heat-equation-omp.cpp +++ b/apps/heat-equation/heat-equation-omp.cpp @@ -33,126 +33,134 @@ // fill boundary cells OpenMP template -void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, int ghost_cells = 1) { +void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, + int ghost_cells = 1) { #pragma omp parallel for num_threads(nthreads) - for (int i = ghost_cells; i < len - ghost_cells; i++) { - grid[i] = grid[i + (ghost_cells * len)]; - grid[i + (len * (len - ghost_cells))] = grid[i + (len * (len - ghost_cells - 1))]; - - grid[i * len] = grid[(ghost_cells * len) + i]; - grid[(len - ghost_cells) + (len * i)] = grid[(len - ghost_cells - 1) + (len * i)]; - } + for (int i = 
ghost_cells; i < len - ghost_cells; i++) { + grid[i] = grid[i + (ghost_cells * len)]; + grid[i + (len * (len - ghost_cells))] = + grid[i + (len * (len - ghost_cells - 1))]; + + grid[i * len] = grid[(ghost_cells * len) + i]; + grid[(len - ghost_cells) + (len * i)] = + grid[(len - ghost_cells - 1) + (len * i)]; + } } // // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); + // parse params + heat_params_t args = argparse::parse(argc, argv); - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - int nthreads = args.nthreads; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + int nthreads = args.nthreads; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = std::mdspan(grid_new, ncells, ncells); + auto phi_old = std::mdspan( + grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = + std::mdspan(grid_new, ncells, ncells); - int gsize = ncells * ncells; + int gsize = ncells * ncells; - Timer timer; + Timer timer; - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] #pragma omp parallel for num_threads(nthreads) - for (int pos = 0; pos < gsize; pos++) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); + for (int pos = 0; pos < gsize; pos++) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - } + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + } - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); - // init simulation time - Real_t time = 0.0; + // init simulation time + Real_t time = 0.0; - // evolve the system - for (auto step = 0; step < nsteps; step++) { - // fill boundary cells in old_phi - fill2Dboundaries_omp(grid_old, ncells + nghosts, ghost_cells, nthreads); + // evolve the system + for (auto step = 0; step < nsteps; step++) { + // fill boundary cells in 
old_phi + fill2Dboundaries_omp(grid_old, ncells + nghosts, ghost_cells, nthreads); #pragma omp parallel for num_threads(nthreads) - for (int pos = 0; pos < gsize; pos++) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); - } + for (int pos = 0; pos < gsize; pos++) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / + (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / + (dx[1] * dx[1])); + } - // update the simulation time - time += dt; + // update the simulation time + time += dt; - // parallel copy phi_new to phi_old + // parallel copy phi_new to phi_old #pragma omp parallel for num_threads(nthreads) - for (int pos = 0; pos < gsize; pos++) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); + for (int pos = 0; pos < gsize; pos++) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - } + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); } + } - auto elapsed = timer.stop(); + auto elapsed = timer.stop(); - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); - // delete all memory - delete[] grid_old; - delete[] grid_new; + // delete all memory + delete[] grid_old; + delete[] grid_new; - grid_old = nullptr; - grid_new = nullptr; + grid_old = nullptr; + grid_new = nullptr; - return 0; + return 0; } diff --git a/apps/heat-equation/heat-equation-stdpar-senders.cpp b/apps/heat-equation/heat-equation-stdpar-senders.cpp index 5209f37..f83b113 100644 --- a/apps/heat-equation/heat-equation-stdpar-senders.cpp +++ b/apps/heat-equation/heat-equation-stdpar-senders.cpp @@ -45,156 +45,166 @@ using stdexec::sync_wait; // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - // number of parallel tiles - int ntiles = args.ntiles; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // init simulation time - Real_t time = 0.0; - - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - - auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = std::mdspan(grid_new, ncells, ncells); - - Timer timer; - - // scheduler from a thread pool - exec::static_thread_pool ctx{ntiles}; - - scheduler auto sch = ctx.get_scheduler(); - sender auto begin = 
schedule(sch); - - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] - sender auto heat_eq_init = - bulk(begin, ntiles, - [&](int tile) { - int start = tile * (ncells * ncells) / ntiles; - int size = (ncells * ncells) / ntiles; - int remaining = (ncells * ncells) % ntiles; - size += (tile == ntiles - 1) ? remaining : 0; + // parse params + heat_params_t args = argparse::parse(argc, argv); - std::for_each_n(std::execution::par_unseq, counting_iterator(start), size, [=](int pos) { + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + // number of parallel tiles + int ntiles = args.ntiles; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // init simulation time + Real_t time = 0.0; + + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + + auto phi_old = std::mdspan( + grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = + std::mdspan(grid_new, ncells, ncells); + + Timer timer; + + // scheduler from a thread pool + exec::static_thread_pool ctx{ntiles}; + + scheduler auto sch = ctx.get_scheduler(); + sender auto begin = schedule(sch); + + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + sender auto heat_eq_init = + bulk(begin, ntiles, + [&](int tile) { + int start = tile * (ncells * ncells) / ntiles; + int size = (ncells * ncells) / ntiles; + int remaining = (ncells * ncells) % ntiles; + size += (tile == ntiles - 1) ? remaining : 0; + + std::for_each_n(std::execution::par_unseq, + counting_iterator(start), size, [=](int pos) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + }); + }) | + then([&]() { + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); + }); + + // start the simulation + sync_wait(std::move(heat_eq_init)); + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + static sender auto evolve = + then(begin, + [&]() { + // fill boundary cells in old_phi + fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); + }) | + bulk(ntiles, + [&](int tile) { + int start = tile * (ncells * ncells) / ntiles; + int size = (ncells * ncells) / ntiles; + int remaining = (ncells * ncells) % ntiles; + size += (tile == ntiles - 1) ? 
remaining : 0; + + // update phi_new with stencil + std::for_each_n( + std::execution::par_unseq, counting_iterator(start), size, + [=](int pos) { int i = 1 + (pos / ncells); int j = 1 + (pos % ncells); - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - }); + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + + phi_old(i - 1, j)) / + (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + + phi_old(i, j - 1)) / + (dx[1] * dx[1])); + }); + }) | + bulk(ntiles, + [&](int tile) { + int start = tile * (ncells * ncells) / ntiles; + int size = (ncells * ncells) / ntiles; + int remaining = (ncells * ncells) % ntiles; + size += (tile == ntiles - 1) ? remaining : 0; + + // parallel copy phi_new to phi_old + std::for_each_n(std::execution::par_unseq, + counting_iterator(start), size, [=](int pos) { + int i = 1 + (pos / ncells); + int j = 1 + (pos % ncells); + + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + }); }) | then([&]() { - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); + // update the simulation time + time += dt; }); - // start the simulation - sync_wait(std::move(heat_eq_init)); - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - static sender auto evolve = - then(begin, - [&]() { - // fill boundary cells in old_phi - fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); - }) | - bulk(ntiles, - [&](int tile) { - int start = tile * (ncells * ncells) / ntiles; - int size = (ncells * ncells) / ntiles; - int remaining = (ncells * ncells) % ntiles; - size += (tile == ntiles - 1) ? remaining : 0; - - // update phi_new with stencil - std::for_each_n(std::execution::par_unseq, counting_iterator(start), size, [=](int pos) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // Jacobi iteration - phi_new(i - 1, j - 1) = - phi_old(i, j) + - alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); - }); - }) | - bulk(ntiles, - [&](int tile) { - int start = tile * (ncells * ncells) / ntiles; - int size = (ncells * ncells) / ntiles; - int remaining = (ncells * ncells) % ntiles; - size += (tile == ntiles - 1) ? 
remaining : 0; - - // parallel copy phi_new to phi_old - std::for_each_n(std::execution::par_unseq, counting_iterator(start), size, [=](int pos) { - int i = 1 + (pos / ncells); - int j = 1 + (pos % ncells); - - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - }); - }) | - then([&]() { - // update the simulation time - time += dt; - }); - - sync_wait(std::move(evolve)); - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - sender auto finalize = then(just(), - [&]() { - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); - }) | - then([&]() { - // delete all memory - delete[] grid_old; - delete[] grid_new; - - grid_old = nullptr; - grid_new = nullptr; - }); - - // start the simulation - sync_wait(std::move(finalize)); + sync_wait(std::move(evolve)); + } - return 0; + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + sender auto finalize = then(just(), + [&]() { + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); + }) | + then([&]() { + // delete all memory + delete[] grid_old; + delete[] grid_new; + + grid_old = nullptr; + grid_new = nullptr; + }); + + // start the simulation + sync_wait(std::move(finalize)); + + return 0; } diff --git a/apps/heat-equation/heat-equation-stdpar.cpp b/apps/heat-equation/heat-equation-stdpar.cpp index 164c482..b20fb68 100644 --- a/apps/heat-equation/heat-equation-stdpar.cpp +++ b/apps/heat-equation/heat-equation-stdpar.cpp @@ -34,107 +34,117 @@ // simulation // int main(int argc, char* argv[]) { - // parse params - heat_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int ncells = args.ncells; - int nsteps = args.nsteps; - Real_t dt = args.dt; - Real_t alpha = args.alpha; - // future if needed to split in multiple grids - // int max_grid_size = args.max_grid_size; - - // initialize dx, dy, dz - auto* dx = new Real_t[dims]; - for (int i = 0; i < dims; ++i) - dx[i] = 1.0 / (ncells - 1); - - // simulation setup (2D) - Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; - Real_t* grid_new = new Real_t[(ncells) * (ncells)]; - - auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); - auto phi_new = std::mdspan(grid_new, ncells, ncells); - - // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] - - Timer timer; - - std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); - - Real_t x = pos(i, ghost_cells, dx[0]); - Real_t y = pos(j, ghost_cells, dx[1]); - - // L2 distance (r2 from origin) - Real_t r2 = (x * x + y * y) / (0.01); - - // phi(x,y) = 1 + exp(-r^2) - phi_old(i, j) = 1 + exp(-r2); - }); - - if (args.print_grid) - // print the initial grid - printGrid(grid_old, ncells + nghosts); - - // init simulation time - Real_t time = 0.0; - - // evolve the system - for (auto step = 0; step < nsteps; step++) { - // fill boundary cells in old_phi - fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); - - // update phi_new with stencil - std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); - - // Jacobi iteration - phi_new(i - 
1, j - 1) = - phi_old(i, j) + alpha * dt * - ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + - (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); - }); - - // update the simulation time - time += dt; - - // parallel copy phi_new to phi_old - std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { - int i = 1 + (ind / ncells); - int j = 1 + (ind % ncells); - - // copy phi_new to phi_old - phi_old(i, j) = phi_new(i - 1, j - 1); - }); - } - - auto elapsed = timer.stop(); - - // print timing - if (args.print_time) { - std::cout << "Time: " << elapsed << " ms" << std::endl; - } - - if (args.print_grid) - // print the final grid - printGrid(grid_new, ncells); - - // delete all memory - delete[] grid_old; - delete[] grid_new; - - grid_old = nullptr; - grid_new = nullptr; + // parse params + heat_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables return 0; + } + + // simulation variables + int ncells = args.ncells; + int nsteps = args.nsteps; + Real_t dt = args.dt; + Real_t alpha = args.alpha; + // future if needed to split in multiple grids + // int max_grid_size = args.max_grid_size; + + // initialize dx, dy, dz + auto* dx = new Real_t[dims]; + for (int i = 0; i < dims; ++i) + dx[i] = 1.0 / (ncells - 1); + + // simulation setup (2D) + Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; + Real_t* grid_new = new Real_t[(ncells) * (ncells)]; + + auto phi_old = std::mdspan( + grid_old, ncells + nghosts, ncells + nghosts); + auto phi_new = + std::mdspan(grid_new, ncells, ncells); + + // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] + + Timer timer; + + std::for_each_n(std::execution::par_unseq, counting_iterator(0), + ncells * ncells, [=](int ind) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); + + Real_t x = pos(i, ghost_cells, dx[0]); + Real_t y = pos(j, ghost_cells, dx[1]); + + // L2 distance (r2 from origin) + Real_t r2 = (x * x + y * y) / (0.01); + + // phi(x,y) = 1 + exp(-r^2) + phi_old(i, j) = 1 + exp(-r2); + }); + + if (args.print_grid) + // print the initial grid + printGrid(grid_old, ncells + nghosts); + + // init simulation time + Real_t time = 0.0; + + // evolve the system + for (auto step = 0; step < nsteps; step++) { + // fill boundary cells in old_phi + fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells); + + // update phi_new with stencil + std::for_each_n(std::execution::par_unseq, counting_iterator(0), + ncells * ncells, [=](int ind) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); + + // Jacobi iteration + phi_new(i - 1, j - 1) = + phi_old(i, j) + + alpha * dt * + ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + + phi_old(i - 1, j)) / + (dx[0] * dx[0]) + + (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + + phi_old(i, j - 1)) / + (dx[1] * dx[1])); + }); + + // update the simulation time + time += dt; + + // parallel copy phi_new to phi_old + std::for_each_n(std::execution::par_unseq, counting_iterator(0), + ncells * ncells, [=](int ind) { + int i = 1 + (ind / ncells); + int j = 1 + (ind % ncells); + + // copy phi_new to phi_old + phi_old(i, j) = phi_new(i - 1, j - 1); + }); + } + + auto elapsed = timer.stop(); + + // print timing + if (args.print_time) { + std::cout << "Time: " << elapsed << " ms" << std::endl; + } + + if (args.print_grid) + // print the final grid + printGrid(grid_new, ncells); + + // delete all memory + delete[] 
grid_old; + delete[] grid_new; + + grid_old = nullptr; + grid_new = nullptr; + + return 0; } diff --git a/apps/heat-equation/heat-equation.hpp b/apps/heat-equation/heat-equation.hpp index 94bf8b6..a226a45 100644 --- a/apps/heat-equation/heat-equation.hpp +++ b/apps/heat-equation/heat-equation.hpp @@ -49,57 +49,62 @@ constexpr int nghosts = ghost_cells * dims; using view_2d = std::extents; // 3D view -using view_3d = std::extents; +using view_3d = std::extents; // macros to get x and y positions from indices #define pos(i, ghosts, dx) -0.5 + dx*(i - ghosts) // parameters struct heat_params_t : public argparse::Args { - int& ncells = kwarg("n,ncells", "number of cells on each side of the domain").set_default(32); - int& nsteps = kwarg("s,nsteps", "total steps in simulation").set_default(100); + int& ncells = kwarg("n,ncells", "number of cells on each side of the domain") + .set_default(32); + int& nsteps = kwarg("s,nsteps", "total steps in simulation").set_default(100); #if defined(HEQ_OMP) - int& nthreads = kwarg("nthreads", "number of threads").set_default(1); + int& nthreads = kwarg("nthreads", "number of threads").set_default(1); #endif // HEQ_OMP - Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f); - Real_t& dt = kwarg("t,dt", "time step").set_default(5.0e-5f); - bool& help = flag("h, help", "print help"); - bool& print_grid = flag("p,print", "print grids at step 0 and step n"); - bool& print_time = flag("time", "print simulation time"); + Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f); + Real_t& dt = kwarg("t,dt", "time step").set_default(5.0e-5f); + bool& help = flag("h, help", "print help"); + bool& print_grid = flag("p,print", "print grids at step 0 and step n"); + bool& print_time = flag("time", "print simulation time"); #if defined(TILING) - int& ntiles = kwarg("ntiles", "number of parallel tiles").set_default(4); + int& ntiles = kwarg("ntiles", "number of parallel tiles").set_default(4); #endif // TILING \ // future use if needed \ // int &max_grid_size = kwarg("g, max_grid_size", "size of each box (or - // grid)").set_default(32); bool &verbose = kwarg("v, verbose", "verbose - // mode").set_default(false); int &plot_int = kwarg("p, plot_int", "how often - // to write a plotfile").set_default(-1); + // grid)").set_default(32); bool &verbose = kwarg("v, verbose", "verbose + // mode").set_default(false); int &plot_int = kwarg("p, plot_int", "how often + // to write a plotfile").set_default(-1); }; template void printGrid(T* grid, int len) { - auto view = std::mdspan(grid, len, len); - std::cout << "Grid: " << std::endl; - std::cout << std::fixed << std::showpoint; - std::cout << std::setprecision(2); + auto view = std::mdspan(grid, len, len); + std::cout << "Grid: " << std::endl; + std::cout << std::fixed << std::showpoint; + std::cout << std::setprecision(2); - for (auto j = 0; j < view.extent(1); ++j) { - for (auto i = 0; i < view.extent(0); ++i) { - std::cout << view(i, j) << ", "; - } - std::cout << std::endl; + for (auto j = 0; j < view.extent(1); ++j) { + for (auto i = 0; i < view.extent(0); ++i) { + std::cout << view(i, j) << ", "; } std::cout << std::endl; + } + std::cout << std::endl; } // fill boundary cells template void fill2Dboundaries(T* grid, int len, int ghost_cells = 1) { - std::for_each_n(std::execution::par_unseq, counting_iterator(ghost_cells), len - nghosts, [=](auto i) { - grid[i] = grid[i + (ghost_cells * len)]; - grid[i + (len * (len - ghost_cells))] = grid[i + (len * (len - ghost_cells - 1))]; + 
std::for_each_n(std::execution::par_unseq, counting_iterator(ghost_cells), + len - nghosts, [=](auto i) { + grid[i] = grid[i + (ghost_cells * len)]; + grid[i + (len * (len - ghost_cells))] = + grid[i + (len * (len - ghost_cells - 1))]; - grid[i * len] = grid[(ghost_cells * len) + i]; - grid[(len - ghost_cells) + (len * i)] = grid[(len - ghost_cells - 1) + (len * i)]; - }); + grid[i * len] = grid[(ghost_cells * len) + i]; + grid[(len - ghost_cells) + (len * i)] = + grid[(len - ghost_cells - 1) + (len * i)]; + }); } \ No newline at end of file diff --git a/apps/mdspan-stdpar/mdspan-stdpar.cpp b/apps/mdspan-stdpar/mdspan-stdpar.cpp index 92dbdb9..9c92a87 100644 --- a/apps/mdspan-stdpar/mdspan-stdpar.cpp +++ b/apps/mdspan-stdpar/mdspan-stdpar.cpp @@ -30,51 +30,58 @@ using data_type = int; // 2D view -using extents_type = std::extents; +using extents_type = + std::extents; // 3D view (fix the first dimension to 2) -using extents_type2 = std::extents; +using extents_type2 = + std::extents; int main() { - constexpr int N = 1e9; - std::vector v(N); + constexpr int N = 1e9; + std::vector v(N); - // View data as contiguous memory representing 2 rows of 6 ints each - auto ms2 = std::mdspan(v.data(), N / 2, 2); - // View the same data as a 3D array 2 (fixed above) x 3 x 2 - auto ms3 = std::mdspan(v.data(), N / 4, 2); + // View data as contiguous memory representing 2 rows of 6 ints each + auto ms2 = std::mdspan(v.data(), + N / 2, 2); + // View the same data as a 3D array 2 (fixed above) x 3 x 2 + auto ms3 = std::mdspan(v.data(), + N / 4, 2); - // auto dim2 = [=](int i){int i1 = i/ms2.extent(1); int i2 = i%ms2.extent(1); - // return std::make_tuple(i1, i2);}; auto dim3 = [=](int i){int i1 = - // i/(ms3.extent(1)*ms3.extent(2)); int i2 = (i/ms3.extent(2))%ms3.extent(1); - // int i3 = i%ms3.extent(2); return std::make_tuple(i1, i2, i3);}; + // auto dim2 = [=](int i){int i1 = i/ms2.extent(1); int i2 = i%ms2.extent(1); + // return std::make_tuple(i1, i2);}; auto dim3 = [=](int i){int i1 = + // i/(ms3.extent(1)*ms3.extent(2)); int i2 = (i/ms3.extent(2))%ms3.extent(1); + // int i3 = i%ms3.extent(2); return std::make_tuple(i1, i2, i3);}; - std::for_each(std::execution::par_unseq, ms2.data_handle(), ms2.data_handle() + ms2.size(), [=](int& i) { - auto global_idx = std::distance(ms2.data_handle(), &i); - dim2(global_idx, ms2); - // auto [i1, i2] = dim2(global_idx); - ms2(ii, ij) = global_idx; - }); + std::for_each(std::execution::par_unseq, ms2.data_handle(), + ms2.data_handle() + ms2.size(), [=](int& i) { + auto global_idx = std::distance(ms2.data_handle(), &i); + dim2(global_idx, ms2); + // auto [i1, i2] = dim2(global_idx); + ms2(ii, ij) = global_idx; + }); - std::cout << std::endl << std::endl; + std::cout << std::endl << std::endl; - std::for_each(std::execution::par_unseq, ms2.data_handle(), ms2.data_handle() + ms2.size(), [=](int& i) { - auto global_idx = std::distance(ms2.data_handle(), &i); - dim3(global_idx, ms3); - // auto [i1, i2, i3] = dim3(global_idx); - ms3(ii, ij, ik) = 1000 + global_idx; - }); + std::for_each(std::execution::par_unseq, ms2.data_handle(), + ms2.data_handle() + ms2.size(), [=](int& i) { + auto global_idx = std::distance(ms2.data_handle(), &i); + dim3(global_idx, ms3); + // auto [i1, i2, i3] = dim3(global_idx); + ms3(ii, ij, ik) = 1000 + global_idx; + }); - // read subset of data using 3D view - for (size_t i = 0; i < ms3.extent(0); i++) { - for (size_t j = 0; j < 10; j++) { - for (size_t k = 0; k < ms3.extent(2); k++) { - assert(ms3(i, j, k) == 1000 + i * ms3.extent(1) * 
ms3.extent(2) + j * ms3.extent(2) + k); - std::cout << ms3(i, j, k) << " "; - } - std::cout << std::endl; - } - std::cout << std::endl; + // read subset of data using 3D view + for (size_t i = 0; i < ms3.extent(0); i++) { + for (size_t j = 0; j < 10; j++) { + for (size_t k = 0; k < ms3.extent(2); k++) { + assert(ms3(i, j, k) == 1000 + i * ms3.extent(1) * ms3.extent(2) + + j * ms3.extent(2) + k); + std::cout << ms3(i, j, k) << " "; + } + std::cout << std::endl; } + std::cout << std::endl; + } - std::cout << ms3(0, 0, 1) << "\n"; + std::cout << ms3(0, 0, 1) << "\n"; } \ No newline at end of file diff --git a/include/commons.hpp b/include/commons.hpp index cfacfa1..c043a20 100644 --- a/include/commons.hpp +++ b/include/commons.hpp @@ -48,38 +48,43 @@ #include "counting_iterator.hpp" // get mdpsan 2d indices from 1d index -#define dim2(x, ms) \ - int ii = x / ms.extent(1); \ - int ij = x % ms.extent(1); +#define dim2(x, ms) \ + int ii = x / ms.extent(1); \ + int ij = x % ms.extent(1); // get mdspan 3d indices from 1d index -#define dim3(x, ms) \ - int ii = x / (ms3.extent(1) * ms.extent(2)); \ - int ij = (x / ms.extent(2)) % ms.extent(1); \ - int ik = x % ms.extent(2) +#define dim3(x, ms) \ + int ii = x / (ms3.extent(1) * ms.extent(2)); \ + int ij = (x / ms.extent(2)) % ms.extent(1); \ + int ik = x % ms.extent(2) class Timer { - public: - Timer() { start(); } + public: + Timer() { start(); } - ~Timer() { stop(); } + ~Timer() { stop(); } - void start() { start_time_point = std::chrono::high_resolution_clock::now(); } + void start() { start_time_point = std::chrono::high_resolution_clock::now(); } - double stop() { - end_time_point = std::chrono::high_resolution_clock::now(); - return duration(); - } + double stop() { + end_time_point = std::chrono::high_resolution_clock::now(); + return duration(); + } - double duration() { - auto start = - std::chrono::time_point_cast(start_time_point).time_since_epoch().count(); - auto end = std::chrono::time_point_cast(end_time_point).time_since_epoch().count(); - auto duration = end - start; - double ms = duration * 0.001; - return ms; - } + double duration() { + auto start = std::chrono::time_point_cast( + start_time_point) + .time_since_epoch() + .count(); + auto end = + std::chrono::time_point_cast(end_time_point) + .time_since_epoch() + .count(); + auto duration = end - start; + double ms = duration * 0.001; + return ms; + } - private: - std::chrono::time_point start_time_point; - std::chrono::time_point end_time_point; + private: + std::chrono::time_point start_time_point; + std::chrono::time_point end_time_point; }; diff --git a/include/counting_iterator.hpp b/include/counting_iterator.hpp index 09d0fa2..aae6a85 100644 --- a/include/counting_iterator.hpp +++ b/include/counting_iterator.hpp @@ -36,76 +36,96 @@ using Index_t = int32_t; struct counting_iterator { - private: - using self = counting_iterator; - - public: - using value_type = Index_t; - using difference_type = typename std::make_signed::type; - using pointer = Index_t*; - using reference = Index_t&; - using iterator_category = std::random_access_iterator_tag; - - counting_iterator() : value(0) {} - - explicit counting_iterator(value_type v) : value(v) {} - - value_type operator*() const { return value; } - - value_type operator[](difference_type n) const { return value + n; } - - self& operator++() { - ++value; - return *this; - } - - self operator++(int) { - self result{value}; - ++value; - return result; - } - - self& operator--() { - --value; - return *this; - } - - self operator--(int) 
{ - self result{value}; - --value; - return result; - } - - self& operator+=(difference_type n) { - value += n; - return *this; - } - - self& operator-=(difference_type n) { - value -= n; - return *this; - } - - friend self operator+(self const& i, difference_type n) { return self(i.value + n); } - - friend self operator+(difference_type n, self const& i) { return self(i.value + n); } - - friend difference_type operator-(self const& x, self const& y) { return x.value - y.value; } - - friend self operator-(self const& i, difference_type n) { return self(i.value - n); } - - friend bool operator==(self const& x, self const& y) { return x.value == y.value; } - - friend bool operator!=(self const& x, self const& y) { return x.value != y.value; } - - friend bool operator<(self const& x, self const& y) { return x.value < y.value; } - - friend bool operator<=(self const& x, self const& y) { return x.value <= y.value; } - - friend bool operator>(self const& x, self const& y) { return x.value > y.value; } - - friend bool operator>=(self const& x, self const& y) { return x.value >= y.value; } - - private: - value_type value; + private: + using self = counting_iterator; + + public: + using value_type = Index_t; + using difference_type = typename std::make_signed::type; + using pointer = Index_t*; + using reference = Index_t&; + using iterator_category = std::random_access_iterator_tag; + + counting_iterator() : value(0) {} + + explicit counting_iterator(value_type v) : value(v) {} + + value_type operator*() const { return value; } + + value_type operator[](difference_type n) const { return value + n; } + + self& operator++() { + ++value; + return *this; + } + + self operator++(int) { + self result{value}; + ++value; + return result; + } + + self& operator--() { + --value; + return *this; + } + + self operator--(int) { + self result{value}; + --value; + return result; + } + + self& operator+=(difference_type n) { + value += n; + return *this; + } + + self& operator-=(difference_type n) { + value -= n; + return *this; + } + + friend self operator+(self const& i, difference_type n) { + return self(i.value + n); + } + + friend self operator+(difference_type n, self const& i) { + return self(i.value + n); + } + + friend difference_type operator-(self const& x, self const& y) { + return x.value - y.value; + } + + friend self operator-(self const& i, difference_type n) { + return self(i.value - n); + } + + friend bool operator==(self const& x, self const& y) { + return x.value == y.value; + } + + friend bool operator!=(self const& x, self const& y) { + return x.value != y.value; + } + + friend bool operator<(self const& x, self const& y) { + return x.value < y.value; + } + + friend bool operator<=(self const& x, self const& y) { + return x.value <= y.value; + } + + friend bool operator>(self const& x, self const& y) { + return x.value > y.value; + } + + friend bool operator>=(self const& x, self const& y) { + return x.value >= y.value; + } + + private: + value_type value; }; \ No newline at end of file From 003698871603bc9c99bd7d28655e6c7e6ee26cf2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Sat, 7 Oct 2023 01:33:17 -0700 Subject: [PATCH 20/20] removing stale log files --- apps/fft/fft-serial.cpp | 2 +- log-gcc.txt | 40 ---------------------- log.txt | 76 ----------------------------------------- 3 files changed, 1 insertion(+), 117 deletions(-) delete mode 100644 log-gcc.txt delete mode 100644 log.txt diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp index b174b5a..21d66f6 100644 --- 
a/apps/fft/fft-serial.cpp +++ b/apps/fft/fft-serial.cpp @@ -135,4 +135,4 @@ int main(int argc, char* argv[]) std::cout << "Elapsed Time: " << elapsed << " ms" << std::endl; return 0; -} \ No newline at end of file +} diff --git a/log-gcc.txt b/log-gcc.txt deleted file mode 100644 index 6d41374..0000000 --- a/log-gcc.txt +++ /dev/null @@ -1,40 +0,0 @@ -+ cd /global/homes/m/mhaseeb/repos/nvstdpar/build-gcc/apps/heat-equation -+ ./heat-equation-mdspan -s=50 -n=30000 --time -+ tee gcc-md.txt -Time: 155095 ms -+ T=(128 64 32 16 8 4 2 1) -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=128 -+ tee gcc-omp-128.txt -Time: 15310.8 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=64 -+ tee gcc-omp-64.txt -Time: 15362.4 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=32 -+ tee gcc-omp-32.txt -Time: 15631.2 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=16 -+ tee gcc-omp-16.txt -Time: 18824.7 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=8 -+ tee gcc-omp-8.txt -Time: 30255 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=4 -+ tee gcc-omp-4.txt -Time: 56973.2 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=2 -+ tee gcc-omp-2.txt -Time: 117583 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=1 -+ tee gcc-omp-1.txt -Time: 231557 ms -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee gcc-stdpar-1.txt -Time: 15924.2 ms \ No newline at end of file diff --git a/log.txt b/log.txt deleted file mode 100644 index ed41625..0000000 --- a/log.txt +++ /dev/null @@ -1,76 +0,0 @@ -+ cd /global/homes/m/mhaseeb/repos/nvstdpar/build/apps/heat-equation -+ ./heat-equation-mdspan -s=50 -n=30000 --time -+ tee md.txt -Time: 72373.3 ms -+ T=(1 2 4 8 16 32 64) -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=1 -+ OMP_NUM_THREADS=1 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-1.txt -Time: 704823 ms -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=2 -+ OMP_NUM_THREADS=2 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-2.txt -Time: 352537 ms -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=4 -+ OMP_NUM_THREADS=4 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-4.txt -Time: 179607 ms -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=8 -+ OMP_NUM_THREADS=8 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-8.txt -Time: 91341.8 ms -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=16 -+ OMP_NUM_THREADS=16 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-16.txt -Time: 45602.9 ms -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=32 -+ OMP_NUM_THREADS=32 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-32.txt -Time: 24956.7 ms -+ for i in "${T[@]}" -+ export OMP_NUM_THREADS=64 -+ OMP_NUM_THREADS=64 -+ ./heat-equation-stdpar -s=50 -n=30000 --time -+ tee stdpar-64.txt -Time: 12437.9 ms -+ unset OMP_NUM_THREADS -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=1 -+ tee omp-1.txt -Time: 258170 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=2 -+ tee omp-2.txt -Time: 129542 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=4 -+ tee omp-4.txt -Time: 65776.1 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=8 -+ tee omp-8.txt -Time: 32570 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 
-n=30000 --time --nthreads=16 -+ tee omp-16.txt -Time: 16814.6 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=32 -+ tee omp-32.txt -Time: 11322.6 ms -+ for i in "${T[@]}" -+ ./heat-equation-omp -s=50 -n=30000 --time --nthreads=64 -+ tee omp-64.txt -Time: 15135.6 ms