From 57e6faedaa7002c8a786bf58eddc51923c35a625 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 30 Jan 2022 22:16:38 +0100 Subject: [PATCH] warp-parallel symbolic Cholesky --- .../factorization/cholesky_kernels.hpp.inc | 52 +++++++++++++------ cuda/factorization/cholesky_kernels.cu | 23 +++++--- hip/factorization/cholesky_kernels.hip.cpp | 23 +++++--- 3 files changed, 67 insertions(+), 31 deletions(-) diff --git a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc b/common/cuda_hip/factorization/cholesky_kernels.hpp.inc index cbd36090030..545e429dd0d 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc +++ b/common/cuda_hip/factorization/cholesky_kernels.hpp.inc @@ -72,21 +72,24 @@ __global__ __launch_bounds__(default_block_size) void pointer_doubling_kernel( } -template +template __global__ __launch_bounds__(default_block_size) void cholesky_symbolic_count_kernel( IndexType num_rows, const IndexType* row_ptrs, const IndexType* lower_ends, const IndexType* postorder_cols, const IndexType* postorder_parent, IndexType* row_nnz) { - const auto row = thread::get_thread_id_flat(); + const auto row = thread::get_subwarp_id_flat(); if (row >= num_rows) { return; } const auto row_begin = row_ptrs[row]; const auto lower_end = lower_ends[row]; + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + const auto lane = subwarp.thread_rank(); IndexType count{}; - for (auto nz = row_begin; nz < lower_end - 1; ++nz) { + for (auto nz = row_begin + lane; nz < lower_end - 1; nz += subwarp_size) { auto node = postorder_cols[nz]; const auto next_node = postorder_cols[nz + 1]; while (node < next_node) { @@ -94,11 +97,16 @@ __global__ node = postorder_parent[node]; } } - row_nnz[row] = count + 1; // lower entries plus diagonal + count = + reduce(subwarp, count, [](IndexType a, IndexType b) { return a + b; }) + + 1; // lower entries plus diagonal + if (lane == 0) { + row_nnz[row] = count; + } } -template +template __global__ __launch_bounds__(default_block_size) void cholesky_symbolic_factorize_kernel( IndexType num_rows, const IndexType* row_ptrs, @@ -106,22 +114,36 @@ __global__ const IndexType* postorder, const IndexType* postorder_parent, const IndexType* out_row_ptrs, IndexType* out_cols) { - const auto row = thread::get_thread_id_flat(); + const auto row = thread::get_subwarp_id_flat(); if (row >= num_rows) { return; } const auto row_begin = row_ptrs[row]; const auto lower_end = lower_ends[row]; - auto out_nz = out_row_ptrs[row]; - for (auto nz = row_begin; nz < lower_end - 1; ++nz) { - auto node = postorder_cols[nz]; - const auto next_node = postorder_cols[nz + 1]; - while (node < next_node) { - out_cols[out_nz] = postorder[node]; - out_nz++; - node = postorder_parent[node]; + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + const auto lane = subwarp.thread_rank(); + const auto prefix_mask = (config::lane_mask_type(1) << lane) - 1; + auto out_base = out_row_ptrs[row]; + for (auto base = row_begin; base < lower_end - 1; base += subwarp_size) { + auto nz = base + lane; + auto node = nz < lower_end - 1 ? postorder_cols[nz] : -1; + const auto next_node = nz < lower_end - 1 ? postorder_cols[nz + 1] : -1; + bool pred = node < next_node; + auto mask = subwarp.ballot(pred); + while (mask) { + if (pred) { + const auto out_nz = out_base + popcnt(mask & prefix_mask); + out_cols[out_nz] = postorder[node]; + node = postorder_parent[node]; + pred = node < next_node; + } + out_base += popcnt(mask); + mask = subwarp.ballot(pred); } } // add diagonal entry - out_cols[out_nz] = row; + if (lane == 0) { + out_cols[out_base] = row; + } } diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu index 9593d4b11c6..99abdce4bf6 100644 --- a/cuda/factorization/cholesky_kernels.cu +++ b/cuda/factorization/cholesky_kernels.cu @@ -44,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/factorization/elimination_forest.hpp" #include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/intrinsics.cuh" +#include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" @@ -108,10 +111,12 @@ void cholesky_symbolic_count( } // count nonzeros in L { - const auto num_blocks = ceildiv(num_rows, default_block_size); - cholesky_symbolic_count_kernel<<>>( - num_rows, row_ptrs, lower_ends, postorder_cols, postorder_parent, - row_nnz); + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + cholesky_symbolic_count_kernel + <<>>(num_rows, row_ptrs, lower_ends, + postorder_cols, + postorder_parent, row_nnz); } } @@ -140,10 +145,12 @@ void cholesky_symbolic_factorize( const auto postorder_parent = forest.postorder_parents.get_const_data(); const auto out_row_ptrs = l_factor->get_const_row_ptrs(); const auto out_cols = l_factor->get_col_idxs(); - const auto num_blocks = ceildiv(num_rows, default_block_size); - cholesky_symbolic_factorize_kernel<<>>( - num_rows, row_ptrs, lower_ends, postorder_cols, postorder, - postorder_parent, out_row_ptrs, out_cols); + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + cholesky_symbolic_factorize_kernel + <<>>( + num_rows, row_ptrs, lower_ends, postorder_cols, postorder, + postorder_parent, out_row_ptrs, out_cols); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/cholesky_kernels.hip.cpp b/hip/factorization/cholesky_kernels.hip.cpp index 10b4eee6532..f069dd64b90 100644 --- a/hip/factorization/cholesky_kernels.hip.cpp +++ b/hip/factorization/cholesky_kernels.hip.cpp @@ -44,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/factorization/elimination_forest.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" #include "hip/base/math.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/intrinsics.hip.hpp" +#include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" @@ -108,10 +111,12 @@ void cholesky_symbolic_count( } // count nonzeros in L { - const auto num_blocks = ceildiv(num_rows, default_block_size); - cholesky_symbolic_count_kernel<<>>( - num_rows, row_ptrs, lower_ends, postorder_cols, postorder_parent, - row_nnz); + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + cholesky_symbolic_count_kernel + <<>>(num_rows, row_ptrs, lower_ends, + postorder_cols, + postorder_parent, row_nnz); } } @@ -140,10 +145,12 @@ void cholesky_symbolic_factorize( const auto postorder_parent = forest.postorder_parents.get_const_data(); const auto out_row_ptrs = l_factor->get_const_row_ptrs(); const auto out_cols = l_factor->get_col_idxs(); - const auto num_blocks = ceildiv(num_rows, default_block_size); - cholesky_symbolic_factorize_kernel<<>>( - num_rows, row_ptrs, lower_ends, postorder_cols, postorder, - postorder_parent, out_row_ptrs, out_cols); + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + cholesky_symbolic_factorize_kernel + <<>>( + num_rows, row_ptrs, lower_ends, postorder_cols, postorder, + postorder_parent, out_row_ptrs, out_cols); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(