Skip to content

Commit

Permalink
warp-parallel symbolic Cholesky
Browse files Browse the repository at this point in the history
  • Loading branch information
upsj committed Apr 21, 2022
1 parent 17d8086 commit 57e6fae
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 31 deletions.
52 changes: 37 additions & 15 deletions common/cuda_hip/factorization/cholesky_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -72,56 +72,78 @@ __global__ __launch_bounds__(default_block_size) void pointer_doubling_kernel(
}


template <typename IndexType>
template <int subwarp_size, typename IndexType>
__global__
__launch_bounds__(default_block_size) void cholesky_symbolic_count_kernel(
IndexType num_rows, const IndexType* row_ptrs,
const IndexType* lower_ends, const IndexType* postorder_cols,
const IndexType* postorder_parent, IndexType* row_nnz)
{
const auto row = thread::get_thread_id_flat<IndexType>();
const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
if (row >= num_rows) {
return;
}
const auto row_begin = row_ptrs[row];
const auto lower_end = lower_ends[row];
const auto subwarp =
group::tiled_partition<subwarp_size>(group::this_thread_block());
const auto lane = subwarp.thread_rank();
IndexType count{};
for (auto nz = row_begin; nz < lower_end - 1; ++nz) {
for (auto nz = row_begin + lane; nz < lower_end - 1; nz += subwarp_size) {
auto node = postorder_cols[nz];
const auto next_node = postorder_cols[nz + 1];
while (node < next_node) {
count++;
node = postorder_parent[node];
}
}
row_nnz[row] = count + 1; // lower entries plus diagonal
count =
reduce(subwarp, count, [](IndexType a, IndexType b) { return a + b; }) +
1; // lower entries plus diagonal
if (lane == 0) {
row_nnz[row] = count;
}
}


template <typename IndexType>
template <int subwarp_size, typename IndexType>
__global__
__launch_bounds__(default_block_size) void cholesky_symbolic_factorize_kernel(
IndexType num_rows, const IndexType* row_ptrs,
const IndexType* lower_ends, const IndexType* postorder_cols,
const IndexType* postorder, const IndexType* postorder_parent,
const IndexType* out_row_ptrs, IndexType* out_cols)
{
const auto row = thread::get_thread_id_flat<IndexType>();
const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
if (row >= num_rows) {
return;
}
const auto row_begin = row_ptrs[row];
const auto lower_end = lower_ends[row];
auto out_nz = out_row_ptrs[row];
for (auto nz = row_begin; nz < lower_end - 1; ++nz) {
auto node = postorder_cols[nz];
const auto next_node = postorder_cols[nz + 1];
while (node < next_node) {
out_cols[out_nz] = postorder[node];
out_nz++;
node = postorder_parent[node];
const auto subwarp =
group::tiled_partition<subwarp_size>(group::this_thread_block());
const auto lane = subwarp.thread_rank();
const auto prefix_mask = (config::lane_mask_type(1) << lane) - 1;
auto out_base = out_row_ptrs[row];
for (auto base = row_begin; base < lower_end - 1; base += subwarp_size) {
auto nz = base + lane;
auto node = nz < lower_end - 1 ? postorder_cols[nz] : -1;
const auto next_node = nz < lower_end - 1 ? postorder_cols[nz + 1] : -1;
bool pred = node < next_node;
auto mask = subwarp.ballot(pred);
while (mask) {
if (pred) {
const auto out_nz = out_base + popcnt(mask & prefix_mask);
out_cols[out_nz] = postorder[node];
node = postorder_parent[node];
pred = node < next_node;
}
out_base += popcnt(mask);
mask = subwarp.ballot(pred);
}
}
// add diagonal entry
out_cols[out_nz] = row;
if (lane == 0) {
out_cols[out_base] = row;
}
}
23 changes: 15 additions & 8 deletions cuda/factorization/cholesky_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "core/factorization/elimination_forest.hpp"
#include "cuda/base/cusparse_bindings.hpp"
#include "cuda/base/math.hpp"
#include "cuda/components/cooperative_groups.cuh"
#include "cuda/components/intrinsics.cuh"
#include "cuda/components/reduction.cuh"
#include "cuda/components/thread_ids.cuh"


Expand Down Expand Up @@ -108,10 +111,12 @@ void cholesky_symbolic_count(
}
// count nonzeros in L
{
const auto num_blocks = ceildiv(num_rows, default_block_size);
cholesky_symbolic_count_kernel<<<num_blocks, default_block_size>>>(
num_rows, row_ptrs, lower_ends, postorder_cols, postorder_parent,
row_nnz);
const auto num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
cholesky_symbolic_count_kernel<config::warp_size>
<<<num_blocks, default_block_size>>>(num_rows, row_ptrs, lower_ends,
postorder_cols,
postorder_parent, row_nnz);
}
}

Expand Down Expand Up @@ -140,10 +145,12 @@ void cholesky_symbolic_factorize(
const auto postorder_parent = forest.postorder_parents.get_const_data();
const auto out_row_ptrs = l_factor->get_const_row_ptrs();
const auto out_cols = l_factor->get_col_idxs();
const auto num_blocks = ceildiv(num_rows, default_block_size);
cholesky_symbolic_factorize_kernel<<<num_blocks, default_block_size>>>(
num_rows, row_ptrs, lower_ends, postorder_cols, postorder,
postorder_parent, out_row_ptrs, out_cols);
const auto num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
cholesky_symbolic_factorize_kernel<config::warp_size>
<<<num_blocks, default_block_size>>>(
num_rows, row_ptrs, lower_ends, postorder_cols, postorder,
postorder_parent, out_row_ptrs, out_cols);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
Expand Down
23 changes: 15 additions & 8 deletions hip/factorization/cholesky_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "core/factorization/elimination_forest.hpp"
#include "hip/base/hipsparse_bindings.hip.hpp"
#include "hip/base/math.hip.hpp"
#include "hip/components/cooperative_groups.hip.hpp"
#include "hip/components/intrinsics.hip.hpp"
#include "hip/components/reduction.hip.hpp"
#include "hip/components/thread_ids.hip.hpp"


Expand Down Expand Up @@ -108,10 +111,12 @@ void cholesky_symbolic_count(
}
// count nonzeros in L
{
const auto num_blocks = ceildiv(num_rows, default_block_size);
cholesky_symbolic_count_kernel<<<num_blocks, default_block_size>>>(
num_rows, row_ptrs, lower_ends, postorder_cols, postorder_parent,
row_nnz);
const auto num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
cholesky_symbolic_count_kernel<config::warp_size>
<<<num_blocks, default_block_size>>>(num_rows, row_ptrs, lower_ends,
postorder_cols,
postorder_parent, row_nnz);
}
}

Expand Down Expand Up @@ -140,10 +145,12 @@ void cholesky_symbolic_factorize(
const auto postorder_parent = forest.postorder_parents.get_const_data();
const auto out_row_ptrs = l_factor->get_const_row_ptrs();
const auto out_cols = l_factor->get_col_idxs();
const auto num_blocks = ceildiv(num_rows, default_block_size);
cholesky_symbolic_factorize_kernel<<<num_blocks, default_block_size>>>(
num_rows, row_ptrs, lower_ends, postorder_cols, postorder,
postorder_parent, out_row_ptrs, out_cols);
const auto num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
cholesky_symbolic_factorize_kernel<config::warp_size>
<<<num_blocks, default_block_size>>>(
num_rows, row_ptrs, lower_ends, postorder_cols, postorder,
postorder_parent, out_row_ptrs, out_cols);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
Expand Down

0 comments on commit 57e6fae

Please sign in to comment.