Improve update hessenberg of gmres #363

Merged: 8 commits, merged on Oct 15, 2019
Changes from all commits
benchmark/solver/solver.cpp (3 changes: 2 additions & 1 deletion)
@@ -123,7 +123,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
     solver_factory{{"cg", create_solver<gko::solver::Cg<>>},
                    {"bicgstab", create_solver<gko::solver::Bicgstab<>>},
                    {"cgs", create_solver<gko::solver::Cgs<>>},
-                   {"fcg", create_solver<gko::solver::Fcg<>>}};
+                   {"fcg", create_solver<gko::solver::Fcg<>>},
+                   {"gmres", create_solver<gko::solver::Gmres<>>}};


 // TODO: Workaround until GPU matrix conversions are implemented
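For context, here is a minimal sketch (not part of this PR) of how a GMRES solver is built through Ginkgo's public API, which is what the new "gmres" benchmark entry ultimately dispatches to through create_solver. The stopping criterion and all values below are illustrative assumptions, not the benchmark's defaults.

#include <ginkgo/ginkgo.hpp>

int main()
{
    // Illustrative setup: a reference executor and a tiny dense system.
    auto exec = gko::ReferenceExecutor::create();
    auto A = gko::initialize<gko::matrix::Dense<>>({{4.0, 1.0}, {1.0, 3.0}}, exec);
    auto b = gko::initialize<gko::matrix::Dense<>>({1.0, 2.0}, exec);
    auto x = gko::initialize<gko::matrix::Dense<>>({0.0, 0.0}, exec);

    // Build a GMRES factory, generate a solver for A, and solve A x = b.
    auto gmres =
        gko::solver::Gmres<>::build()
            .with_criteria(
                gko::stop::Iteration::build().with_max_iters(20u).on(exec))
            .on(exec)
            ->generate(gko::share(A));
    gmres->apply(gko::lend(b), gko::lend(x));
}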
core/solver/gmres.cpp (31 changes: 20 additions & 11 deletions)
@@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/identity.hpp>


 #include "core/solver/gmres_kernels.hpp"
@@ -66,18 +67,25 @@ namespace {


 template <typename ValueType>
-void apply_preconditioner(const LinOp *preconditioner,
-                          matrix::Dense<ValueType> *krylov_bases,
-                          matrix::Dense<ValueType> *preconditioned_vector,
-                          const size_type iter)
+void apply_preconditioner(
+    const LinOp *preconditioner, matrix::Dense<ValueType> *krylov_bases,
+    std::shared_ptr<matrix::Dense<ValueType>> &preconditioned_vector,
+    const size_type iter)
 {
-    auto target_basis = krylov_bases->create_submatrix(
-        span{0, krylov_bases->get_size()[0]},
-        span{iter * preconditioned_vector->get_size()[1],
-             (iter + 1) * preconditioned_vector->get_size()[1]});
+    std::shared_ptr<matrix::Dense<ValueType>> target_basis =
+        krylov_bases->create_submatrix(
+            span{0, krylov_bases->get_size()[0]},
+            span{iter * preconditioned_vector->get_size()[1],
+                 (iter + 1) * preconditioned_vector->get_size()[1]});

     // Apply preconditioner
-    preconditioner->apply(target_basis.get(), preconditioned_vector);
+    auto identity_pointer =
+        dynamic_cast<const matrix::Identity<ValueType> *>(preconditioner);
+    if (identity_pointer) {
+        preconditioned_vector = target_basis;
+    } else {
+        preconditioner->apply(target_basis.get(), preconditioned_vector.get());
+    }
 }

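Ginkgo installs an Identity operator as the preconditioner when none is configured, so in the unpreconditioned case the apply above was only a copy. The rewritten helper skips that copy by letting preconditioned_vector alias the Krylov basis block, which is why it now takes a shared_ptr by reference. A standalone sketch of the pattern, with hypothetical names and assuming a Dense block and an arbitrary LinOp preconditioner:

#include <memory>
#include <ginkgo/ginkgo.hpp>

// Hypothetical helper showing the aliasing shortcut: when the operator is the
// Identity, reseat the output pointer onto the input block instead of copying
// it through apply().
template <typename ValueType>
void precondition_or_alias(
    const gko::LinOp *preconditioner,
    std::shared_ptr<gko::matrix::Dense<ValueType>> basis_block,
    std::shared_ptr<gko::matrix::Dense<ValueType>> &preconditioned)
{
    if (dynamic_cast<const gko::matrix::Identity<ValueType> *>(preconditioner)) {
        preconditioned = basis_block;  // no data movement needed
    } else {
        preconditioner->apply(basis_block.get(), preconditioned.get());
    }
}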
@@ -105,7 +113,8 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
         exec, dim<2>{system_matrix_->get_size()[1],
                      (krylov_dim_ + 1) * dense_b->get_size()[1]});
     auto next_krylov_basis = Vector::create_with_config_of(dense_b);
-    auto preconditioned_vector = Vector::create_with_config_of(dense_b);
+    std::shared_ptr<matrix::Dense<ValueType>> preconditioned_vector =
+        Vector::create_with_config_of(dense_b);
     auto hessenberg = Vector::create(
         exec, dim<2>{krylov_dim_ + 1, krylov_dim_ * dense_b->get_size()[1]});
     auto givens_sin =
@@ -200,7 +209,7 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
         }

         apply_preconditioner(get_preconditioner().get(), krylov_bases.get(),
-                             preconditioned_vector.get(), restart_iter);
+                             preconditioned_vector, restart_iter);
         // preconditioned_vector = get_preconditioner() *
         // krylov_bases(:, restart_iter)

cuda/solver/gmres_kernels.cu (74 changes: 43 additions & 31 deletions)
@@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/base/cublas_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
+#include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -62,6 +63,8 @@ namespace gmres {


 constexpr int default_block_size = 512;
+constexpr int default_dot_dim = cuda_config::warp_size;
+constexpr int default_dot_size = default_dot_dim * default_dot_dim;


 // Must be called with at least `max(stride_b * num_rows, krylov_dim *
@@ -227,10 +230,8 @@ __global__
 }


-// Must be called with at least `num_cols` blocks, each with `block_size`
-// threads. `block_size` must be a power of 2.
-template <int block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void update_hessenberg_kernel(
+template <typename ValueType>
+__global__ __launch_bounds__(default_dot_size) void multidot_kernel(
     size_type k, size_type num_rows, size_type num_cols,
     const ValueType *__restrict__ next_krylov_basis,
     size_type stride_next_krylov, const ValueType *__restrict__ krylov_bases,
@@ -239,34 +240,43 @@ __global__ __launch_bounds__(block_size) void update_hessenberg_kernel(
     const stopping_status *__restrict__ stop_status)
 {
     const auto tidx = threadIdx.x;
-    const auto col_idx = blockIdx.x;
-
+    const auto tidy = threadIdx.y;
+    const auto col_idx = blockIdx.x * default_dot_dim + tidx;
+    const auto num = ceildiv(num_rows, gridDim.y);
+    const auto start_row = blockIdx.y * num;
+    const auto end_row =
+        ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;
     // Used that way to get around dynamic initialization warning and
     // template error when using `reduction_helper_array` directly in `reduce`
-    __shared__ UninitializedArray<ValueType, block_size> reduction_helper_array;
+    __shared__
+        UninitializedArray<ValueType, default_dot_dim *(default_dot_dim + 1)>
+            reduction_helper_array;
     ValueType *__restrict__ reduction_helper = reduction_helper_array;

+    ValueType local_res = zero<ValueType>();
+    const auto krylov_col = k * num_cols + col_idx;
     if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) {
-        ValueType local_res{};
-        const auto krylov_col = k * num_cols + col_idx;
-        for (size_type i = tidx; i < num_rows; i += block_size) {
+        for (size_type i = start_row + tidy; i < end_row;
+             i += default_dot_dim) {
             const auto next_krylov_idx = i * stride_next_krylov + col_idx;
             const auto krylov_idx = i * stride_krylov + krylov_col;
-
             local_res +=
                 next_krylov_basis[next_krylov_idx] * krylov_bases[krylov_idx];
         }
-
-        reduction_helper[tidx] = local_res;
-
-        // Perform thread block reduction. Result is in reduction_helper[0]
-        reduce(group::this_thread_block(), reduction_helper,
+    }
+    reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res;
+    __syncthreads();
+    local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx];
+    const auto tile_block =
+        group::tiled_partition<default_dot_dim>(group::this_thread_block());
+    const auto sum =
+        reduce(tile_block, local_res,
               [](const ValueType &a, const ValueType &b) { return a + b; });
-
-        if (tidx == 0) {
-            const auto hessenberg_idx = k * stride_hessenberg + col_idx;
-            hessenberg_iter[hessenberg_idx] = reduction_helper[0];
-        }
+    const auto new_col_idx = blockIdx.x * default_dot_dim + tidy;
+    if (tidx == 0 && new_col_idx < num_cols &&
+        !stop_status[new_col_idx].has_stopped()) {
+        const auto hessenberg_idx = k * stride_hessenberg + new_col_idx;
+        atomic_add(hessenberg_iter + hessenberg_idx, sum);
     }
 }

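To clarify what the new kernel computes, the following sequential C++ reference (written for this description, not taken from the PR) forms row k of the Hessenberg matrix, one dot product per right-hand side; the stop_status handling is omitted. The CUDA version splits the row range over blockIdx.y, so each block only produces a partial sum per column, which is why the result is accumulated with atomic_add into a Hessenberg row that the caller zeroes beforehand.

#include <cstddef>
#include <vector>

// hessenberg_row[j] = sum_i krylov_bases(i, k * num_cols + j) * next_krylov(i, j)
void multidot_reference(std::size_t k, std::size_t num_rows, std::size_t num_cols,
                        const std::vector<double> &next_krylov,   // row-major, leading dim stride_next_krylov
                        std::size_t stride_next_krylov,
                        const std::vector<double> &krylov_bases,  // row-major, leading dim stride_krylov
                        std::size_t stride_krylov,
                        std::vector<double> &hessenberg_row)      // num_cols entries
{
    for (std::size_t j = 0; j < num_cols; ++j) {
        double sum = 0.0;
        const auto krylov_col = k * num_cols + j;
        for (std::size_t i = 0; i < num_rows; ++i) {
            sum += next_krylov[i * stride_next_krylov + j] *
                   krylov_bases[i * stride_krylov + krylov_col];
        }
        hessenberg_row[j] = sum;
    }
}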
@@ -381,17 +391,19 @@ void finish_arnoldi(std::shared_ptr<const CudaExecutor> exec,
     const auto stride_krylov = krylov_bases->get_stride();
     const auto stride_hessenberg = hessenberg_iter->get_stride();
     const auto dim_size = next_krylov_basis->get_size();
-
+    auto cublas_handle = exec->get_cublas_handle();
+    const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim),
+                         exec->get_num_multiprocessor() * 2);
+    const dim3 block_size(default_dot_dim, default_dot_dim);
     for (size_type k = 0; k < iter + 1; ++k) {
-        update_hessenberg_kernel<default_block_size>
-            <<<dim_size[1], default_block_size>>>(
-                k, dim_size[0], dim_size[1],
-                as_cuda_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov,
-                as_cuda_type(krylov_bases->get_const_values()), stride_krylov,
-                as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg,
-                as_cuda_type(stop_status));
-
+        zero_array(dim_size[1],
+                   hessenberg_iter->get_values() + k * stride_hessenberg);
+        multidot_kernel<<<grid_size, block_size>>>(
+            k, dim_size[0], dim_size[1],
+            as_cuda_type(next_krylov_basis->get_const_values()),
+            stride_next_krylov, as_cuda_type(krylov_bases->get_const_values()),
+            stride_krylov, as_cuda_type(hessenberg_iter->get_values()),
+            stride_hessenberg, as_cuda_type(stop_status));
         update_next_krylov_kernel<default_block_size>
             <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
                default_block_size>>>(
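The new launch configuration replaces the old one-block-per-column scheme: threads are arranged as default_dot_dim x default_dot_dim blocks (warp size squared), columns are tiled along blockIdx.x, and rows are split into 2 * num_multiprocessors slices along blockIdx.y, so even a single right-hand side keeps many blocks busy. Because every row slice adds its partial dot products into the same Hessenberg entries, the row is cleared with zero_array first and then accumulated with atomic_add. A rough back-of-the-envelope sketch with assumed example values (not measurements):

#include <cstddef>
#include <iostream>

int main()
{
    // Assumed example values: warp size 32, one right-hand side, 80 SMs.
    const std::size_t default_dot_dim = 32;
    const std::size_t num_cols = 1;
    const std::size_t num_sms = 80;

    const auto grid_x = (num_cols + default_dot_dim - 1) / default_dot_dim;  // ceildiv -> 1
    const auto grid_y = 2 * num_sms;                                         // 160 row slices
    std::cout << grid_x * grid_y << " blocks of "
              << default_dot_dim * default_dot_dim << " threads\n";          // 160 x 1024
}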