Scalar Jacobi specialization and kernels #808

Merged · 5 commits · Jul 2, 2021

47 changes: 47 additions & 0 deletions common/preconditioner/jacobi_kernels.hpp.inc
@@ -213,3 +213,50 @@ __launch_bounds__(warps_per_block *config::warp_size) adaptive_transpose_jacobi(
});
}
}


namespace kernel {


template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void scalar_apply(
size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag,
const ValueType *__restrict__ alpha, size_type source_stride,
const ValueType *__restrict__ source_values,
const ValueType *__restrict__ beta, size_type result_stride,
ValueType *__restrict__ result_values)
{
const auto tidx = thread::get_thread_id_flat();
const auto row = tidx / num_cols;
const auto col = tidx % num_cols;

if (row < num_rows) {
auto diag_val =
diag[row] == zero<ValueType>() ? one<ValueType>() : diag[row];
result_values[row * result_stride + col] =
result_values[row * result_stride + col] * beta[0] +
alpha[0] * source_values[row * source_stride + col] / diag_val;
}
}


template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void simple_scalar_apply(
size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag,
size_type source_stride, const ValueType *__restrict__ source_values,
size_type result_stride, ValueType *__restrict__ result_values)
{
const auto tidx = thread::get_thread_id_flat();
const auto row = tidx / num_cols;
const auto col = tidx % num_cols;

if (row < num_rows) {
auto diag_val =
diag[row] == zero<ValueType>() ? one<ValueType>() : diag[row];
result_values[row * result_stride + col] =
source_values[row * source_stride + col] / diag_val;
}
}


} // namespace kernel
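
For reference, a minimal host-side sketch (not part of this diff) of the per-entry update the two kernels above perform, namely x = beta * x + alpha * inv(D) * b, with zero diagonal entries treated as one to avoid division by zero; the function name and plain-loop structure are illustrative only:

#include <cstddef>

// Reference loop mirroring kernel::scalar_apply: each (row, col) entry of x is
// updated as beta * x + alpha * b / diag[row], guarding against zero diagonals.
template <typename ValueType>
void scalar_apply_reference(std::size_t num_rows, std::size_t num_cols,
                            const ValueType *diag, ValueType alpha,
                            std::size_t b_stride, const ValueType *b,
                            ValueType beta, std::size_t x_stride, ValueType *x)
{
    for (std::size_t row = 0; row < num_rows; ++row) {
        // Same zero-diagonal guard as the device kernels above.
        const auto diag_val =
            diag[row] == ValueType{} ? ValueType{1} : diag[row];
        for (std::size_t col = 0; col < num_cols; ++col) {
            x[row * x_stride + col] =
                beta * x[row * x_stride + col] +
                alpha * b[row * b_stride + col] / diag_val;
        }
    }
}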
11 changes: 11 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
@@ -1081,6 +1081,17 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);

template <typename ValueType>
GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);

template <typename ValueType>
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
88 changes: 55 additions & 33 deletions core/preconditioner/jacobi.cpp
@@ -57,7 +57,9 @@ namespace jacobi {


GKO_REGISTER_OPERATION(simple_apply, jacobi::simple_apply);
GKO_REGISTER_OPERATION(simple_scalar_apply, jacobi::simple_scalar_apply);
GKO_REGISTER_OPERATION(apply, jacobi::apply);
GKO_REGISTER_OPERATION(scalar_apply, jacobi::scalar_apply);
GKO_REGISTER_OPERATION(find_blocks, jacobi::find_blocks);
GKO_REGISTER_OPERATION(generate, jacobi::generate);
GKO_REGISTER_OPERATION(transpose_jacobi, jacobi::transpose_jacobi);
@@ -74,10 +76,15 @@ void Jacobi<ValueType, IndexType>::apply_impl(const LinOp *b, LinOp *x) const
{
precision_dispatch_real_complex<ValueType>(
[this](auto dense_b, auto dense_x) {
this->get_executor()->run(jacobi::make_simple_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_b, dense_x));
if (parameters_.max_block_size == 1) {
this->get_executor()->run(jacobi::make_simple_scalar_apply(
this->blocks_, dense_b, dense_x));
} else {
this->get_executor()->run(jacobi::make_simple_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_b, dense_x));
}
},
b, x);
}
@@ -90,11 +97,16 @@ void Jacobi<ValueType, IndexType>::apply_impl(const LinOp *alpha,
{
precision_dispatch_real_complex<ValueType>(
[this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
this->get_executor()->run(jacobi::make_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_alpha, dense_b,
dense_beta, dense_x));
if (parameters_.max_block_size == 1) {
this->get_executor()->run(jacobi::make_scalar_apply(
this->blocks_, dense_alpha, dense_b, dense_beta, dense_x));
} else {
this->get_executor()->run(jacobi::make_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_alpha, dense_b,
dense_beta, dense_x));
}
},
alpha, b, beta, x);
}
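
A usage sketch (assumptions: the standard Ginkgo factory chain and public apply signature; only the max_block_size parameter comes from this PR's parameters_): building the preconditioner with max_block_size set to 1 routes both apply overloads above through the new scalar kernels.

#include <memory>

#include <ginkgo/ginkgo.hpp>

// Hypothetical helper: generates a scalar Jacobi preconditioner and applies it
// once, computing x = inv(diag(A)) * b on whichever executor is passed in.
template <typename ValueType, typename IndexType>
void apply_scalar_jacobi(
    std::shared_ptr<const gko::Executor> exec,
    std::shared_ptr<gko::matrix::Csr<ValueType, IndexType>> mtx,
    const gko::matrix::Dense<ValueType> *b, gko::matrix::Dense<ValueType> *x)
{
    auto factory = gko::preconditioner::Jacobi<ValueType, IndexType>::build()
                       .with_max_block_size(1u)  // block size 1 -> scalar path
                       .on(exec);
    auto precond = factory->generate(mtx);
    precond->apply(b, x);  // dispatches to simple_scalar_apply above
}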
@@ -218,33 +230,43 @@ void Jacobi<ValueType, IndexType>::generate(const LinOp *system_matrix,
GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
using csr_type = matrix::Csr<ValueType, IndexType>;
const auto exec = this->get_executor();
auto csr_mtx =
convert_to_with_sorting<csr_type>(exec, system_matrix, skip_sorting);

if (parameters_.block_pointers.get_data() == nullptr) {
this->detect_blocks(csr_mtx.get());
}
if (parameters_.max_block_size == 1) {
auto diag = as<DiagonalExtractable<ValueType>>(system_matrix)
->extract_diagonal();
auto temp = gko::Array<ValueType>::view(
exec, system_matrix->get_size()[0], diag->get_values());
this->blocks_ = temp;
} else {
auto csr_mtx = convert_to_with_sorting<csr_type>(exec, system_matrix,
skip_sorting);

if (parameters_.block_pointers.get_data() == nullptr) {
this->detect_blocks(csr_mtx.get());
}

const auto all_block_opt = parameters_.storage_optimization.of_all_blocks;
auto &precisions = parameters_.storage_optimization.block_wise;
// if adaptive version is used, make sure that the precision array is of
// the correct size by replicating it multiple times if needed
if (parameters_.storage_optimization.is_block_wise ||
all_block_opt != precision_reduction(0, 0)) {
if (!parameters_.storage_optimization.is_block_wise) {
precisions = gko::Array<precision_reduction>(exec, {all_block_opt});
const auto all_block_opt =
parameters_.storage_optimization.of_all_blocks;
auto &precisions = parameters_.storage_optimization.block_wise;
// if adaptive version is used, make sure that the precision array is of
// the correct size by replicating it multiple times if needed
if (parameters_.storage_optimization.is_block_wise ||
all_block_opt != precision_reduction(0, 0)) {
if (!parameters_.storage_optimization.is_block_wise) {
precisions =
gko::Array<precision_reduction>(exec, {all_block_opt});
}
Array<precision_reduction> tmp(
exec, parameters_.block_pointers.get_num_elems() - 1);
exec->run(jacobi::make_initialize_precisions(precisions, tmp));
precisions = std::move(tmp);
conditioning_.resize_and_reset(num_blocks_);
}
Array<precision_reduction> tmp(
exec, parameters_.block_pointers.get_num_elems() - 1);
exec->run(jacobi::make_initialize_precisions(precisions, tmp));
precisions = std::move(tmp);
conditioning_.resize_and_reset(num_blocks_);
}

exec->run(jacobi::make_generate(
csr_mtx.get(), num_blocks_, parameters_.max_block_size,
parameters_.accuracy, storage_scheme_, conditioning_, precisions,
parameters_.block_pointers, blocks_));
exec->run(jacobi::make_generate(
csr_mtx.get(), num_blocks_, parameters_.max_block_size,
parameters_.accuracy, storage_scheme_, conditioning_, precisions,
parameters_.block_pointers, blocks_));
}
}


17 changes: 17 additions & 0 deletions core/preconditioner/jacobi_kernels.hpp
@@ -74,6 +74,12 @@ namespace kernels {
const matrix::Dense<ValueType> *b, \
const matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType) \
void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec, \
const Array<ValueType> &diag, \
const matrix::Dense<ValueType> *b, \
matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType) \
void simple_apply( \
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
@@ -85,6 +91,13 @@ namespace kernels {
const Array<ValueType> &blocks, const matrix::Dense<ValueType> *b, \
matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType) \
void scalar_apply( \
std::shared_ptr<const DefaultExecutor> exec, \
const Array<ValueType> &diag, const matrix::Dense<ValueType> *alpha, \
const matrix::Dense<ValueType> *b, \
const matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType) \
void transpose_jacobi( \
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
@@ -127,8 +140,12 @@ namespace kernels {
GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_GENERATE_KERNEL(ValueType, IndexType); \
template <typename ValueType> \
GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_APPLY_KERNEL(ValueType, IndexType); \
template <typename ValueType> \
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
67 changes: 62 additions & 5 deletions cuda/preconditioner/jacobi_kernels.cu
@@ -65,12 +65,14 @@ constexpr int default_num_warps = 32;
// current GPUs have at most 84 SMs)
constexpr int default_grid_size = 32 * 32 * 128;

constexpr int default_block_size = 512;


#include "common/preconditioner/jacobi_kernels.hpp.inc"


template <typename ValueType, typename IndexType>
size_type find_natural_blocks(std::shared_ptr<const CudaExecutor> exec,
size_type find_natural_blocks(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType> *mtx,
int32 max_block_size,
IndexType *__restrict__ block_ptrs)
@@ -95,7 +97,7 @@ size_type find_natural_blocks(std::shared_ptr<const CudaExecutor> exec,

template <typename IndexType>
inline size_type agglomerate_supervariables(
std::shared_ptr<const CudaExecutor> exec, int32 max_block_size,
std::shared_ptr<const DefaultExecutor> exec, int32 max_block_size,
size_type num_natural_blocks, IndexType *block_ptrs)
{
Array<size_type> nums(exec, 1);
@@ -111,7 +113,7 @@ inline size_type agglomerate_supervariables(
} // namespace


void initialize_precisions(std::shared_ptr<const CudaExecutor> exec,
void initialize_precisions(std::shared_ptr<const DefaultExecutor> exec,
const Array<precision_reduction> &source,
Array<precision_reduction> &precisions)
{
@@ -126,7 +128,7 @@ void initialize_precisions(std::shared_ptr<const CudaExecutor> exec,


template <typename ValueType, typename IndexType>
void find_blocks(std::shared_ptr<const CudaExecutor> exec,
void find_blocks(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType> *system_matrix,
uint32 max_block_size, size_type &num_blocks,
Array<IndexType> &block_pointers)
@@ -230,7 +232,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

template <typename ValueType, typename IndexType>
void convert_to_dense(
std::shared_ptr<const CudaExecutor> exec, size_type num_blocks,
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
const Array<precision_reduction> &block_precisions,
const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
const preconditioner::block_interleaved_storage_scheme<IndexType>
@@ -241,6 +243,61 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);


template <typename ValueType>
void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
const Array<ValueType> &diag,
const matrix::Dense<ValueType> *alpha,
const matrix::Dense<ValueType> *b,
const matrix::Dense<ValueType> *beta,
matrix::Dense<ValueType> *x)
{
const auto b_size = b->get_size();
const auto num_rows = b_size[0];
const auto num_cols = b_size[1];
const auto b_stride = b->get_stride();
const auto x_stride = x->get_stride();
const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size);

const auto b_values = b->get_const_values();
const auto diag_values = diag.get_const_data();
auto x_values = x->get_values();

kernel::scalar_apply<<<grid_dim, default_block_size>>>(
num_rows, num_cols, as_cuda_type(diag_values),
as_cuda_type(alpha->get_const_values()), b_stride,
as_cuda_type(b_values), as_cuda_type(beta->get_const_values()),
x_stride, as_cuda_type(x_values));
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);


template <typename ValueType>
void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
const Array<ValueType> &diag,
const matrix::Dense<ValueType> *b,
matrix::Dense<ValueType> *x)
{
const auto b_size = b->get_size();
const auto num_rows = b_size[0];
const auto num_cols = b_size[1];
const auto b_stride = b->get_stride();
const auto x_stride = x->get_stride();
const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size);

const auto b_values = b->get_const_values();
const auto diag_values = diag.get_const_data();
auto x_values = x->get_values();

kernel::simple_scalar_apply<<<grid_dim, default_block_size>>>(
num_rows, num_cols, as_cuda_type(diag_values), b_stride,
as_cuda_type(b_values), x_stride, as_cuda_type(x_values));
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);


} // namespace jacobi
} // namespace cuda
} // namespace kernels
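
As a standalone illustration (plain CUDA with double data, no Ginkgo types; all names here are made up for the example) of the launch scheme used above: one thread per entry of the dense right-hand side, where tidx / num_cols gives the row, tidx % num_cols the column, and the grid is sized as ceildiv(num_rows * num_cols, default_block_size).

#include <cstddef>

constexpr int demo_block_size = 512;  // mirrors default_block_size above

// Flat thread-to-entry mapping: thread tidx handles entry
// (tidx / num_cols, tidx % num_cols) of the strided dense vectors.
__global__ void simple_scalar_apply_demo(std::size_t num_rows,
                                         std::size_t num_cols,
                                         const double *diag,
                                         std::size_t b_stride, const double *b,
                                         std::size_t x_stride, double *x)
{
    const auto tidx =
        static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const auto row = tidx / num_cols;
    const auto col = tidx % num_cols;
    if (row < num_rows) {
        const auto diag_val = diag[row] == 0.0 ? 1.0 : diag[row];
        x[row * x_stride + col] = b[row * b_stride + col] / diag_val;
    }
}

// Grid sizing equivalent to ceildiv(num_rows * num_cols, default_block_size).
void launch_simple_scalar_apply_demo(std::size_t num_rows,
                                     std::size_t num_cols, const double *diag,
                                     std::size_t b_stride, const double *b,
                                     std::size_t x_stride, double *x)
{
    const auto num_entries = num_rows * num_cols;
    const auto grid_size = static_cast<unsigned int>(
        (num_entries + demo_block_size - 1) / demo_block_size);
    simple_scalar_apply_demo<<<grid_size, demo_block_size>>>(
        num_rows, num_cols, diag, b_stride, b, x_stride, x);
}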