From 3c8191c562de2f0d0a6ac62d783def6b415ab394 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 18 Oct 2023 10:15:26 +0200 Subject: [PATCH 1/2] add dpcpp csr diagonal missing components - check_diagonal_entries - add_scaled_identity --- dpcpp/matrix/csr_kernels.dp.cpp | 102 +++++++++++++++++++++++++++++++- test/matrix/csr_kernels2.cpp | 6 -- 2 files changed, 99 insertions(+), 9 deletions(-) diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 11309b67b9b..c5a8e3ef4d4 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -871,6 +871,74 @@ void extract_diagonal(size_type diag_size, size_type nnz, GKO_ENABLE_DEFAULT_HOST(extract_diagonal, extract_diagonal); +template +void check_diagonal_entries(const IndexType num_min_rows_cols, + const IndexType* const __restrict__ row_ptrs, + const IndexType* const __restrict__ col_idxs, + bool* const __restrict__ has_all_diags, + sycl::nd_item<3> item_ct1) +{ + constexpr int subgroup_size = config::warp_size; + auto tile_grp = group::tiled_partition( + group::this_thread_block(item_ct1)); + const auto row = + thread::get_subwarp_id_flat(item_ct1); + if (row < num_min_rows_cols) { + const auto tid_in_warp = tile_grp.thread_rank(); + const auto row_start = row_ptrs[row]; + const auto num_nz = row_ptrs[row + 1] - row_start; + bool row_has_diag_local{false}; + for (IndexType iz = tid_in_warp; iz < num_nz; iz += subgroup_size) { + if (col_idxs[iz + row_start] == row) { + row_has_diag_local = true; + break; + } + } + auto row_has_diag = static_cast(tile_grp.any(row_has_diag_local)); + if (!row_has_diag) { + if (tile_grp.thread_rank() == 0) { + *has_all_diags = false; + } + return; + } + } +} + +GKO_ENABLE_DEFAULT_HOST(check_diagonal_entries, check_diagonal_entries); + + +template +void add_scaled_identity(const ValueType* const __restrict__ alpha, + const ValueType* const __restrict__ beta, + const IndexType num_rows, + const IndexType* const __restrict__ row_ptrs, + const IndexType* const __restrict__ col_idxs, + ValueType* const __restrict__ values, + sycl::nd_item<3> item_ct1) +{ + constexpr int subgroup_size = config::warp_size; + auto tile_grp = group::tiled_partition( + group::this_thread_block(item_ct1)); + const auto row = + thread::get_subwarp_id_flat(item_ct1); + const auto num_warps = + thread::get_subwarp_num_flat(item_ct1); + if (row < num_rows) { + const auto tid_in_warp = tile_grp.thread_rank(); + const auto row_start = row_ptrs[row]; + const auto num_nz = row_ptrs[row + 1] - row_start; + for (IndexType iz = tid_in_warp; iz < num_nz; iz += subgroup_size) { + values[iz + row_start] *= beta[0]; + if (col_idxs[iz + row_start] == row) { + values[iz + row_start] += alpha[0]; + } + } + } +} + +GKO_ENABLE_DEFAULT_HOST(add_scaled_identity, add_scaled_identity); + + } // namespace kernel @@ -2364,8 +2432,24 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL); template void check_diagonal_entries_exist( std::shared_ptr exec, - const matrix::Csr* const mtx, - bool& has_all_diags) GKO_NOT_IMPLEMENTED; + const matrix::Csr* const mtx, bool& has_all_diags) +{ + const size_type num_subgroup = mtx->get_size()[0]; + if (num_subgroup > 0) { + const size_type num_blocks = + num_subgroup / (default_block_size / config::warp_size); + array has_diags(exec, {true}); + kernel::check_diagonal_entries( + num_blocks, default_block_size, 0, exec->get_queue(), + static_cast( + std::min(mtx->get_size()[0], mtx->get_size()[1])), + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + has_diags.get_data()); + has_all_diags = exec->copy_val_to_host(has_diags.get_const_data()); + } else { + has_all_diags = true; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST); @@ -2376,7 +2460,19 @@ void add_scaled_identity(std::shared_ptr exec, const matrix::Dense* const alpha, const matrix::Dense* const beta, matrix::Csr* const mtx) - GKO_NOT_IMPLEMENTED; +{ + const auto nrows = mtx->get_size()[0]; + if (nrows == 0) { + return; + } + const auto nthreads = nrows * config::warp_size; + const auto nblocks = ceildiv(nthreads, default_block_size); + kernel::add_scaled_identity( + nblocks, default_block_size, 0, exec->get_queue(), + alpha->get_const_values(), beta->get_const_values(), + static_cast(nrows), mtx->get_const_row_ptrs(), + mtx->get_const_col_idxs(), mtx->get_values()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp index 4d3ffa61323..412f9a41158 100644 --- a/test/matrix/csr_kernels2.cpp +++ b/test/matrix/csr_kernels2.cpp @@ -1311,9 +1311,6 @@ TEST_F(Csr, CreateSubMatrixIsEquivalentToRef) } -#ifndef GKO_COMPILING_DPCPP - - TEST_F(Csr, CanDetectMissingDiagonalEntry) { using T = double; @@ -1359,6 +1356,3 @@ TEST_F(Csr, AddScaledIdentityToNonSquare) GKO_ASSERT_MTX_NEAR(mtx, dmtx, r::value); } - - -#endif // GKO_COMPILING_DPCPP From a1225b550ed6f10e4641cd2f96200de5edbcf1a0 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 23 Oct 2023 22:50:16 +0200 Subject: [PATCH 2/2] refine the kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Pratik Nayak Co-authored-by: Thomas Grützmacher --- common/cuda_hip/matrix/csr_common.hpp.inc | 1 - common/cuda_hip/matrix/csr_kernels.hpp.inc | 12 ++++++++---- dpcpp/matrix/csr_kernels.dp.cpp | 14 ++++++++------ omp/matrix/csr_kernels.cpp | 11 ++++++++--- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/common/cuda_hip/matrix/csr_common.hpp.inc b/common/cuda_hip/matrix/csr_common.hpp.inc index 0fce02aecfa..35718464c42 100644 --- a/common/cuda_hip/matrix/csr_common.hpp.inc +++ b/common/cuda_hip/matrix/csr_common.hpp.inc @@ -102,7 +102,6 @@ __global__ __launch_bounds__(default_block_size) void check_diagonal_entries( if (tile_grp.thread_rank() == 0) { *has_all_diags = false; } - return; } } } diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 3f02337747e..4bc601c5067 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -826,15 +826,19 @@ __global__ __launch_bounds__(default_block_size) void add_scaled_identity( auto tile_grp = group::tiled_partition(group::this_thread_block()); const auto warpid = thread::get_subwarp_id_flat(); - const auto num_warps = thread::get_subwarp_num_flat(); if (warpid < num_rows) { const auto tid_in_warp = tile_grp.thread_rank(); const IndexType row_start = row_ptrs[warpid]; const IndexType num_nz = row_ptrs[warpid + 1] - row_start; + const auto beta_val = beta[0]; + const auto alpha_val = alpha[0]; for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) { - values[iz + row_start] *= beta[0]; - if (col_idxs[iz + row_start] == warpid) { - values[iz + row_start] += alpha[0]; + if (beta_val != one()) { + values[iz + row_start] *= beta_val; + } + if (col_idxs[iz + row_start] == warpid && + alpha_val != zero()) { + values[iz + row_start] += alpha_val; } } } diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index c5a8e3ef4d4..915e2027a26 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -899,7 +899,6 @@ void check_diagonal_entries(const IndexType num_min_rows_cols, if (tile_grp.thread_rank() == 0) { *has_all_diags = false; } - return; } } } @@ -921,16 +920,19 @@ void add_scaled_identity(const ValueType* const __restrict__ alpha, group::this_thread_block(item_ct1)); const auto row = thread::get_subwarp_id_flat(item_ct1); - const auto num_warps = - thread::get_subwarp_num_flat(item_ct1); if (row < num_rows) { const auto tid_in_warp = tile_grp.thread_rank(); const auto row_start = row_ptrs[row]; const auto num_nz = row_ptrs[row + 1] - row_start; + const auto beta_val = beta[0]; + const auto alpha_val = alpha[0]; for (IndexType iz = tid_in_warp; iz < num_nz; iz += subgroup_size) { - values[iz + row_start] *= beta[0]; - if (col_idxs[iz + row_start] == row) { - values[iz + row_start] += alpha[0]; + if (beta_val != one()) { + values[iz + row_start] *= beta_val; + } + if (col_idxs[iz + row_start] == row && + alpha_val != zero()) { + values[iz + row_start] += alpha_val; } } } diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 7d4a5a7ebd1..1757b4b8a25 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1134,12 +1134,17 @@ void add_scaled_identity(std::shared_ptr exec, const auto nrows = static_cast(mtx->get_size()[0]); const auto row_ptrs = mtx->get_const_row_ptrs(); const auto vals = mtx->get_values(); + const auto beta_val = beta->get_const_values()[0]; + const auto alpha_val = alpha->get_const_values()[0]; #pragma omp parallel for for (IndexType row = 0; row < nrows; row++) { for (IndexType iz = row_ptrs[row]; iz < row_ptrs[row + 1]; iz++) { - vals[iz] *= beta->get_const_values()[0]; - if (row == mtx->get_const_col_idxs()[iz]) { - vals[iz] += alpha->get_const_values()[0]; + if (beta_val != one()) { + vals[iz] *= beta_val; + } + if (row == mtx->get_const_col_idxs()[iz] && + alpha_val != zero()) { + vals[iz] += alpha_val; } } }