From 3c8191c562de2f0d0a6ac62d783def6b415ab394 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 18 Oct 2023 10:15:26 +0200
Subject: [PATCH 1/2] add dpcpp csr diagonal missing components -
 check_diagonal_entries - add_scaled_identity

---
 dpcpp/matrix/csr_kernels.dp.cpp | 102 +++++++++++++++++++++++++++++++-
 test/matrix/csr_kernels2.cpp    |   6 --
 2 files changed, 99 insertions(+), 9 deletions(-)
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 11309b67b9b..c5a8e3ef4d4 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -871,6 +871,74 @@ void extract_diagonal(size_type diag_size, size_type nnz,
 GKO_ENABLE_DEFAULT_HOST(extract_diagonal, extract_diagonal);
 
 
+template <typename IndexType>
+void check_diagonal_entries(const IndexType num_min_rows_cols,
+                            const IndexType* const __restrict__ row_ptrs,
+                            const IndexType* const __restrict__ col_idxs,
+                            bool* const __restrict__ has_all_diags,
+                            sycl::nd_item<3> item_ct1)
+{
+    constexpr int subgroup_size = config::warp_size;
+    auto tile_grp = group::tiled_partition<subgroup_size>(
+        group::this_thread_block(item_ct1));
+    const auto row =
+        thread::get_subwarp_id_flat<subgroup_size, IndexType>(item_ct1);
+    if (row < num_min_rows_cols) {
+        const auto tid_in_warp = tile_grp.thread_rank();
+        const auto row_start = row_ptrs[row];
+        const auto num_nz = row_ptrs[row + 1] - row_start;
+        bool row_has_diag_local{false};
+        for (IndexType iz = tid_in_warp; iz < num_nz; iz += subgroup_size) {
+            if (col_idxs[iz + row_start] == row) {
+                row_has_diag_local = true;
+                break;
+            }
+        }
+        auto row_has_diag = static_cast<bool>(tile_grp.any(row_has_diag_local));
+        if (!row_has_diag) {
+            if (tile_grp.thread_rank() == 0) {
+                *has_all_diags = false;
+            }
+            return;
+        }
+    }
+}
+
+GKO_ENABLE_DEFAULT_HOST(check_diagonal_entries, check_diagonal_entries);
+
+
+template <typename ValueType, typename IndexType>
+void add_scaled_identity(const ValueType* const __restrict__ alpha,
+                         const ValueType* const __restrict__ beta,
+                         const IndexType num_rows,
+                         const IndexType* const __restrict__ row_ptrs,
+                         const IndexType* const __restrict__ col_idxs,
+                         ValueType* const __restrict__ values,
+                         sycl::nd_item<3> item_ct1)
+{
+    constexpr int subgroup_size = config::warp_size;
+    auto tile_grp = group::tiled_partition<subgroup_size>(
+        group::this_thread_block(item_ct1));
+    const auto row =
+        thread::get_subwarp_id_flat<subgroup_size, IndexType>(item_ct1);
+    const auto num_warps =
+        thread::get_subwarp_num_flat<subgroup_size, IndexType>(item_ct1);
+    if (row < num_rows) {
+        const auto tid_in_warp = tile_grp.thread_rank();
+        const auto row_start = row_ptrs[row];
+        const auto num_nz = row_ptrs[row + 1] - row_start;
+        for (IndexType iz = tid_in_warp; iz < num_nz; iz += subgroup_size) {
+            values[iz + row_start] *= beta[0];
+            if (col_idxs[iz + row_start] == row) {
+                values[iz + row_start] += alpha[0];
+            }
+        }
+    }
+}
+
+GKO_ENABLE_DEFAULT_HOST(add_scaled_identity, add_scaled_identity);
+
+
 }  // namespace kernel
 
 
@@ -2364,8 +2432,24 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
 template <typename ValueType, typename IndexType>
 void check_diagonal_entries_exist(
     std::shared_ptr<const DpcppExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* const mtx,
-    bool& has_all_diags) GKO_NOT_IMPLEMENTED;
+    const matrix::Csr<ValueType, IndexType>* const mtx, bool& has_all_diags)
+{
+    const size_type num_subgroup = mtx->get_size()[0];
+    if (num_subgroup > 0) {
+        const size_type num_blocks =
+            num_subgroup / (default_block_size / config::warp_size);
+        array<bool> has_diags(exec, {true});
+        kernel::check_diagonal_entries(
+            num_blocks, default_block_size, 0, exec->get_queue(),
+            static_cast<IndexType>(
+                std::min(mtx->get_size()[0], mtx->get_size()[1])),
+            mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
+            has_diags.get_data());
+        has_all_diags = exec->copy_val_to_host(has_diags.get_const_data());
+    } else {
+        has_all_diags = true;
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
@@ -2376,7 +2460,19 @@ void add_scaled_identity(std::shared_ptr<const DpcppExecutor> exec,
                          const matrix::Dense<ValueType>* const alpha,
                          const matrix::Dense<ValueType>* const beta,
                          matrix::Csr<ValueType, IndexType>* const mtx)
-    GKO_NOT_IMPLEMENTED;
+{
+    const auto nrows = mtx->get_size()[0];
+    if (nrows == 0) {
+        return;
+    }
+    const auto nthreads = nrows * config::warp_size;
+    const auto nblocks = ceildiv(nthreads, default_block_size);
+    kernel::add_scaled_identity(
+        nblocks, default_block_size, 0, exec->get_queue(),
+        alpha->get_const_values(), beta->get_const_values(),
+        static_cast<IndexType>(nrows), mtx->get_const_row_ptrs(),
+        mtx->get_const_col_idxs(), mtx->get_values());
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp
index 4d3ffa61323..412f9a41158 100644
--- a/test/matrix/csr_kernels2.cpp
+++ b/test/matrix/csr_kernels2.cpp
@@ -1311,9 +1311,6 @@ TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
 }
 
 
-#ifndef GKO_COMPILING_DPCPP
-
-
 TEST_F(Csr, CanDetectMissingDiagonalEntry)
 {
     using T = double;
@@ -1359,6 +1356,3 @@ TEST_F(Csr, AddScaledIdentityToNonSquare)
 
     GKO_ASSERT_MTX_NEAR(mtx, dmtx, r<value_type>::value);
 }
-
-
-#endif  // GKO_COMPILING_DPCPP

From a1225b550ed6f10e4641cd2f96200de5edbcf1a0 Mon Sep 17 00:00:00 2001
From: "Yu-Hsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 23 Oct 2023 22:50:16 +0200
Subject: [PATCH 2/2] refine the kernel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Pratik Nayak <pratikvn@protonmail.com>
Co-authored-by: Thomas Grützmacher <thomas.gruetzmacher@kit.edu>
---
 common/cuda_hip/matrix/csr_common.hpp.inc  |  1 -
 common/cuda_hip/matrix/csr_kernels.hpp.inc | 12 ++++++++----
 dpcpp/matrix/csr_kernels.dp.cpp            | 14 ++++++++------
 omp/matrix/csr_kernels.cpp                 | 11 ++++++++---
 4 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/common/cuda_hip/matrix/csr_common.hpp.inc b/common/cuda_hip/matrix/csr_common.hpp.inc
index 0fce02aecfa..35718464c42 100644
--- a/common/cuda_hip/matrix/csr_common.hpp.inc
+++ b/common/cuda_hip/matrix/csr_common.hpp.inc
@@ -102,7 +102,6 @@ __global__ __launch_bounds__(default_block_size) void check_diagonal_entries(
             if (tile_grp.thread_rank() == 0) {
                 *has_all_diags = false;
             }
-            return;
         }
     }
 }
diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
index 3f02337747e..4bc601c5067 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -826,15 +826,19 @@ __global__ __launch_bounds__(default_block_size) void add_scaled_identity(
     auto tile_grp =
         group::tiled_partition<warp_size>(group::this_thread_block());
     const auto warpid = thread::get_subwarp_id_flat<warp_size, IndexType>();
-    const auto num_warps = thread::get_subwarp_num_flat<warp_size, IndexType>();
     if (warpid < num_rows) {
         const auto tid_in_warp = tile_grp.thread_rank();
         const IndexType row_start = row_ptrs[warpid];
         const IndexType num_nz = row_ptrs[warpid + 1] - row_start;
+        const auto beta_val = beta[0];
+        const auto alpha_val = alpha[0];
         for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) {
-            values[iz + row_start] *= beta[0];
-            if (col_idxs[iz + row_start] == warpid) {
-                values[iz + row_start] += alpha[0];
+            if (beta_val != one<ValueType>()) {
+                values[iz + row_start] *= beta_val;
+            }
+            if (col_idxs[iz + row_start] == warpid &&
+                alpha_val != zero<ValueType>()) {
+                values[iz + row_start] += alpha_val;
             }
         }
     }
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index c5a8e3ef4d4..915e2027a26 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -899,7 +899,6 @@ void check_diagonal_entries(const IndexType num_min_rows_cols,
             if (tile_grp.thread_rank() == 0) {
                 *has_all_diags = false;
             }
-            return;
         }
     }
 }
@@ -921,16 +920,19 @@ void add_scaled_identity(const ValueType* const __restrict__ alpha,
         group::this_thread_block(item_ct1));
     const auto row =
         thread::get_subwarp_id_flat<subgroup_size, IndexType>(item_ct1);
-    const auto num_warps =
-        thread::get_subwarp_num_flat<subgroup_size, IndexType>(item_ct1);
     if (row < num_rows) {
         const auto tid_in_warp = tile_grp.thread_rank();
         const auto row_start = row_ptrs[row];
         const auto num_nz = row_ptrs[row + 1] - row_start;
+        const auto beta_val = beta[0];
+        const auto alpha_val = alpha[0];
         for (IndexType iz = tid_in_warp; iz < num_nz; iz += subgroup_size) {
-            values[iz + row_start] *= beta[0];
-            if (col_idxs[iz + row_start] == row) {
-                values[iz + row_start] += alpha[0];
+            if (beta_val != one<ValueType>()) {
+                values[iz + row_start] *= beta_val;
+            }
+            if (col_idxs[iz + row_start] == row &&
+                alpha_val != zero<ValueType>()) {
+                values[iz + row_start] += alpha_val;
             }
         }
     }
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 7d4a5a7ebd1..1757b4b8a25 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -1134,12 +1134,17 @@ void add_scaled_identity(std::shared_ptr<const OmpExecutor> exec,
     const auto nrows = static_cast<IndexType>(mtx->get_size()[0]);
     const auto row_ptrs = mtx->get_const_row_ptrs();
     const auto vals = mtx->get_values();
+    const auto beta_val = beta->get_const_values()[0];
+    const auto alpha_val = alpha->get_const_values()[0];
 #pragma omp parallel for
     for (IndexType row = 0; row < nrows; row++) {
         for (IndexType iz = row_ptrs[row]; iz < row_ptrs[row + 1]; iz++) {
-            vals[iz] *= beta->get_const_values()[0];
-            if (row == mtx->get_const_col_idxs()[iz]) {
-                vals[iz] += alpha->get_const_values()[0];
+            if (beta_val != one<ValueType>()) {
+                vals[iz] *= beta_val;
+            }
+            if (row == mtx->get_const_col_idxs()[iz] &&
+                alpha_val != zero<ValueType>()) {
+                vals[iz] += alpha_val;
             }
         }
     }