diff --git a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc b/common/cuda_hip/factorization/par_ic_kernels.hpp.inc index 9a4d605c6a3..7a3b3da8e32 100644 --- a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ic_kernels.hpp.inc @@ -78,16 +78,18 @@ __global__ __launch_bounds__(default_block_size) void ic_sweep( auto l_col = l_col_idxs[l_row_begin]; auto lh_row = l_col_idxs[lh_col_begin]; if (l_col == lh_row && l_col < last_entry) { - sum += l_vals[l_row_begin] * conj(l_vals[lh_col_begin]); + sum += load_relaxed(l_vals + l_row_begin) * + conj(load_relaxed(l_vals + lh_col_begin)); } l_row_begin += l_col <= lh_row; lh_col_begin += l_col >= lh_row; } - auto to_write = row == col - ? sqrt(a_val - sum) - : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1]; + auto to_write = + row == col + ? sqrt(a_val - sum) + : (a_val - sum) / load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1)); if (is_finite(to_write)) { - l_vals[l_nz] = to_write; + store_relaxed(l_vals + l_nz, to_write); } } diff --git a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc index 7eccbda61d2..d54fe3c6c77 100644 --- a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc @@ -75,8 +75,8 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep( // we don't need to use the `bool valid` because last_entry is // already a smaller sentinel value than the one used in group_merge if (l_col == lh_row && l_col < last_entry) { - sum += l_vals[l_idx + l_row_begin] * - conj(l_vals[lh_idx + lh_col_begin]); + sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * + conj(load_relaxed(l_vals + (lh_idx + lh_col_begin))); } // remember the transposed element auto found_transp = subwarp.ballot(lh_row == row); @@ -90,11 +90,12 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep( sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); if (subwarp.thread_rank() == 0) { - auto to_write = row == col - ? sqrt(a_val - sum) - : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1]; + auto to_write = + row == col ? sqrt(a_val - sum) + : (a_val - sum) / + load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1)); if (is_finite(to_write)) { - l_vals[l_nz] = to_write; + store_relaxed(l_vals + l_nz, to_write); } } } diff --git a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc index 08bd5bf8b4e..6785c161674 100644 --- a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc @@ -57,7 +57,8 @@ __global__ __launch_bounds__(default_block_size) void compute_l_u_factors( const auto u_col = u_col_idxs[u_idx]; last_operation = zero(); if (l_col == u_col) { - last_operation = l_values[l_idx] * u_values[u_idx]; + last_operation = load_relaxed(l_values + l_idx) * + load_relaxed(u_values + u_idx); sum -= last_operation; } l_idx += (l_col <= u_col); @@ -65,14 +66,15 @@ __global__ __launch_bounds__(default_block_size) void compute_l_u_factors( } sum += last_operation; // undo the last operation if (row > col) { - auto to_write = sum / u_values[u_row_ptrs[col + 1] - 1]; + auto to_write = + sum / load_relaxed(u_values + (u_row_ptrs[col + 1] - 1)); if (is_finite(to_write)) { - l_values[l_idx - 1] = to_write; + store_relaxed(l_values + (l_idx - 1), to_write); } } else { auto to_write = sum; if (is_finite(to_write)) { - u_values[u_idx - 1] = to_write; + store_relaxed(u_values + (u_idx - 1), to_write); } } } diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc index e99888b35b3..d3cc4330c39 100644 --- a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc @@ -87,8 +87,8 @@ __global__ __launch_bounds__(default_block_size) void sweep( // we don't need to use the `bool valid` because last_entry is // already a smaller sentinel value than the one used in group_merge if (l_col == ut_row && l_col < last_entry) { - sum += l_vals[l_idx + l_row_begin] * - ut_vals[ut_idx + ut_col_begin]; + sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * + load_relaxed(ut_vals + (ut_idx + ut_col_begin)); } // remember the transposed element auto found_transp = subwarp.ballot(ut_row == row); @@ -103,15 +103,16 @@ __global__ __launch_bounds__(default_block_size) void sweep( if (subwarp.thread_rank() == 0) { if (lower) { - auto to_write = (a_val - sum) / ut_vals[ut_col_ptrs[col + 1] - 1]; + auto to_write = (a_val - sum) / + load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1)); if (is_finite(to_write)) { - l_vals[l_nz] = to_write; + store_relaxed(l_vals + l_nz, to_write); } } else { auto to_write = a_val - sum; if (is_finite(to_write)) { - u_vals[u_nz] = to_write; - ut_vals[ut_nz] = to_write; + store_relaxed(u_vals + u_nz, to_write); + store_relaxed(ut_vals + ut_nz, to_write); } } } diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu index b700be483ea..0f54e5b4a98 100644 --- a/cuda/factorization/par_ic_kernels.cu +++ b/cuda/factorization/par_ic_kernels.cu @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" +#include "cuda/components/memory.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu index f2a5f9f4754..66f64e5959b 100644 --- a/cuda/factorization/par_ict_kernels.cu +++ b/cuda/factorization/par_ict_kernels.cu @@ -47,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/math.hpp" #include "cuda/components/intrinsics.cuh" +#include "cuda/components/memory.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu index 9796ee343fc..3b45c2993f2 100644 --- a/cuda/factorization/par_ilu_kernels.cu +++ b/cuda/factorization/par_ilu_kernels.cu @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" +#include "cuda/components/memory.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/factorization/par_ilut_sweep_kernel.cu b/cuda/factorization/par_ilut_sweep_kernel.cu index c4b292402ac..98cd8c5de48 100644 --- a/cuda/factorization/par_ilut_sweep_kernel.cu +++ b/cuda/factorization/par_ilut_sweep_kernel.cu @@ -47,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/math.hpp" #include "cuda/components/intrinsics.cuh" +#include "cuda/components/memory.cuh" #include "cuda/components/merging.cuh" #include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp index c8209f2c9dd..deb7d2b83f8 100644 --- a/hip/factorization/par_ic_kernels.hip.cpp +++ b/hip/factorization/par_ic_kernels.hip.cpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" +#include "hip/components/memory.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp index fa914f4d33c..24857fe6807 100644 --- a/hip/factorization/par_ict_kernels.hip.cpp +++ b/hip/factorization/par_ict_kernels.hip.cpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" #include "hip/components/intrinsics.hip.hpp" +#include "hip/components/memory.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp index 42e5fd55425..b283e00b8fd 100644 --- a/hip/factorization/par_ilu_kernels.hip.cpp +++ b/hip/factorization/par_ilu_kernels.hip.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" +#include "hip/components/memory.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp b/hip/factorization/par_ilut_sweep_kernel.hip.cpp index 6e8ed1d8822..f566aa5a159 100644 --- a/hip/factorization/par_ilut_sweep_kernel.hip.cpp +++ b/hip/factorization/par_ilut_sweep_kernel.hip.cpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/math.hip.hpp" #include "hip/components/intrinsics.hip.hpp" +#include "hip/components/memory.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp"