From bd770307e7833f6c435e1b8064e021935aa2ea3e Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 10 Oct 2019 16:25:33 +0200
Subject: [PATCH 1/8] use dot

---
 cuda/solver/gmres_kernels.cu | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)
diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu
index e0294ba35e8..eeb28532758 100644
--- a/cuda/solver/gmres_kernels.cu
+++ b/cuda/solver/gmres_kernels.cu
@@ -381,16 +381,26 @@ void finish_arnoldi(std::shared_ptr<const CudaExecutor> exec,
     const auto stride_krylov = krylov_bases->get_stride();
     const auto stride_hessenberg = hessenberg_iter->get_stride();
     const auto dim_size = next_krylov_basis->get_size();
-
+    auto cublas_handle = exec->get_cublas_handle();
     for (size_type k = 0; k < iter + 1; ++k) {
-        update_hessenberg_kernel<default_block_size>
-            <<<dim_size[1], default_block_size>>>(
-                k, dim_size[0], dim_size[1],
-                as_cuda_type(next_krylov_basis->get_const_values()),
+        // update_hessenberg_kernel<default_block_size>
+        //     <<<dim_size[1], default_block_size>>>(
+        //         k, dim_size[0], dim_size[1],
+        //         as_cuda_type(next_krylov_basis->get_const_values()),
+        //         stride_next_krylov,
+        //         as_cuda_type(krylov_bases->get_const_values()),
+        //         stride_krylov, as_cuda_type(hessenberg_iter->get_values()),
+        //         stride_hessenberg, as_cuda_type(stop_status));
+        for (size_type col_idx = 0; col_idx < dim_size[1]; col_idx++) {
+            cublas::dot(
+                cublas_handle, dim_size[0],
+                next_krylov_basis->get_const_values() + col_idx,
                 stride_next_krylov,
-                as_cuda_type(krylov_bases->get_const_values()), stride_krylov,
-                as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg,
-                as_cuda_type(stop_status));
+                krylov_bases->get_const_values() + k * dim_size[1] + col_idx,
+                stride_krylov,
+                hessenberg_iter->get_values() + k * stride_hessenberg +
+                    col_idx);
+        }
 
         update_next_krylov_kernel<default_block_size>
             <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),

From f8db7898ecb1886e415e26a6015c840eb6176b89 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 11 Oct 2019 17:28:24 +0200
Subject: [PATCH 2/8] implement multidot_kernel

---
 cuda/solver/gmres_kernels.cu | 89 +++++++++++++++++++++++++++++-------
 1 file changed, 72 insertions(+), 17 deletions(-)

diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu
index eeb28532758..df898724e5d 100644
--- a/cuda/solver/gmres_kernels.cu
+++ b/cuda/solver/gmres_kernels.cu
@@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/base/cublas_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
+#include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -271,6 +272,55 @@ __global__ __launch_bounds__(block_size) void update_hessenberg_kernel(
 }
 
 
+template <int tile_dim, typename ValueType>
+__global__ __launch_bounds__(tile_dim *tile_dim) void multidot_kernel(
+    size_type k, size_type num_rows, size_type num_cols,
+    const ValueType *__restrict__ next_krylov_basis,
+    size_type stride_next_krylov, const ValueType *__restrict__ krylov_bases,
+    size_type stride_krylov, ValueType *__restrict__ hessenberg_iter,
+    size_type stride_hessenberg,
+    const stopping_status *__restrict__ stop_status)
+{
+    const auto tidy = threadIdx.y;
+    const auto tidx = threadIdx.x;
+    const auto col_idx = blockIdx.x * tile_dim + tidx;
+    const auto num = ceildiv(num_rows, blockDim.y);
+    const auto start_row = blockIdx.y * num;
+    const auto end_row =
+        ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;
+    // Used that way to get around dynamic initialization warning and
+    // template error when using `reduction_helper_array` directly in `reduce`
+    __shared__ UninitializedArray<ValueType, tile_dim *(tile_dim + 1)>
+        reduction_helper_array;
+    ValueType *__restrict__ reduction_helper = reduction_helper_array;
+
+    ValueType local_res = zero<ValueType>();
+    const auto krylov_col = k * num_cols + col_idx;
+    if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) {
+        for (size_type i = start_row + tidy; i < end_row; i += blockDim.y) {
+            const auto next_krylov_idx = i * stride_next_krylov + col_idx;
+            const auto krylov_idx = i * stride_krylov + krylov_col;
+            local_res +=
+                next_krylov_basis[next_krylov_idx] * krylov_bases[krylov_idx];
+        }
+    }
+    reduction_helper[tidx * (tile_dim + 1) + tidy] = local_res;
+    __syncthreads();
+    local_res = reduction_helper[tidy * (tile_dim + 1) + tidx];
+    const auto tile_block =
+        group::tiled_partition<tile_dim>(group::this_thread_block());
+    const auto sum =
+        reduce(tile_block, local_res,
+               [](const ValueType &a, const ValueType &b) { return a + b; });
+    const auto new_col_idx = blockIdx.x * tile_dim + tidy;
+    if (tidx == 0 && new_col_idx < num_cols &&
+        !stop_status[new_col_idx].has_stopped()) {
+        const auto hessenberg_idx = k * stride_hessenberg + new_col_idx;
+        atomic_add(hessenberg_iter + hessenberg_idx, sum);
+    }
+}
+
+
 // Must be called with at least `num_rows * stride_next_krylov` threads in
 // total.
 template <int block_size, typename ValueType>
@@ -383,25 +433,30 @@ void finish_arnoldi(std::shared_ptr<const CudaExecutor> exec,
     const auto dim_size = next_krylov_basis->get_size();
     auto cublas_handle = exec->get_cublas_handle();
     for (size_type k = 0; k < iter + 1; ++k) {
-        // update_hessenberg_kernel<default_block_size>
-        //     <<<dim_size[1], default_block_size>>>(
-        //         k, dim_size[0], dim_size[1],
-        //         as_cuda_type(next_krylov_basis->get_const_values()),
-        //         stride_next_krylov,
-        //         as_cuda_type(krylov_bases->get_const_values()),
-        //         stride_krylov, as_cuda_type(hessenberg_iter->get_values()),
-        //         stride_hessenberg, as_cuda_type(stop_status));
-        for (size_type col_idx = 0; col_idx < dim_size[1]; col_idx++) {
-            cublas::dot(
-                cublas_handle, dim_size[0],
-                next_krylov_basis->get_const_values() + col_idx,
+        if (dim_size[1] <= 4) {
+            for (size_type col_idx = 0; col_idx < dim_size[1]; col_idx++) {
+                cublas::dot(cublas_handle, dim_size[0],
+                            next_krylov_basis->get_const_values() + col_idx,
+                            stride_next_krylov,
+                            krylov_bases->get_const_values() + k * dim_size[1] +
+                                col_idx,
+                            stride_krylov,
+                            hessenberg_iter->get_values() +
+                                k * stride_hessenberg + col_idx);
+            }
+        } else {
+            zero_array(dim_size[1],
+                       hessenberg_iter->get_values() + k * stride_hessenberg);
+            multidot_kernel<32><<<dim3{ceildiv(dim_size[1], 32),
+                                       exec->get_num_multiprocessor() * 2},
+                                  dim3{32, 32}>>>(
+                k, dim_size[0], dim_size[1],
+                as_cuda_type(next_krylov_basis->get_const_values()),
                 stride_next_krylov,
-                krylov_bases->get_const_values() + k * dim_size[1] + col_idx,
-                stride_krylov,
-                hessenberg_iter->get_values() + k * stride_hessenberg +
-                    col_idx);
+                as_cuda_type(krylov_bases->get_const_values()), stride_krylov,
+                as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg,
+                as_cuda_type(stop_status));
         }
-
         update_next_krylov_kernel<default_block_size>
             <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
                default_block_size>>>(

From 1002e9d5a98c8affaec97912a851bf48e86ae3e3 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Sun, 13 Oct 2019 14:48:03 +0200
Subject: [PATCH 3/8] try non-square transpose

---
 cuda/solver/gmres_kernels.cu | 57 ++++++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu
index df898724e5d..779825b76ba 100644
--- a/cuda/solver/gmres_kernels.cu
+++ b/cuda/solver/gmres_kernels.cu
@@ -272,8 +272,8 @@ __global__ __launch_bounds__(block_size) void update_hessenberg_kernel(
 }
 
 
-template <int tile_dim, typename ValueType>
-__global__ __launch_bounds__(tile_dim *tile_dim) void multidot_kernel(
+template <int tile_dimx, int tile_dimy, typename ValueType>
+__global__ __launch_bounds__(tile_dimx *tile_dimy) void multidot_kernel(
     size_type k, size_type num_rows, size_type num_cols,
     const ValueType *__restrict__ next_krylov_basis,
     size_type stride_next_krylov, const ValueType *__restrict__ krylov_bases,
@@ -281,39 +281,49 @@ __global__ __launch_bounds__(tile_dim *tile_dim) void multidot_kernel(
     size_type stride_hessenberg,
     const stopping_status *__restrict__ stop_status)
 {
-    const auto tidy = threadIdx.y;
-    const auto tidx = threadIdx.x;
-    const auto col_idx = blockIdx.x * tile_dim + tidx;
-    const auto num = ceildiv(num_rows, blockDim.y);
+    const auto tid = threadIdx.x;
+    const auto tidy = tid / tile_dimx;
+    const auto tidx = tid % tile_dimx;
+    const auto col_idx = blockIdx.x * tile_dimx + tidx;
+    const auto num = ceildiv(num_rows, tile_dimy);
     const auto start_row = blockIdx.y * num;
     const auto end_row =
         ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;
     // Used that way to get around dynamic initialization warning and
     // template error when using `reduction_helper_array` directly in `reduce`
-    __shared__ UninitializedArray<ValueType, tile_dim *(tile_dim + 1)>
+    __shared__ UninitializedArray<ValueType, (tile_dimx) * (tile_dimy + 1)>
         reduction_helper_array;
     ValueType *__restrict__ reduction_helper = reduction_helper_array;
 
     ValueType local_res = zero<ValueType>();
     const auto krylov_col = k * num_cols + col_idx;
     if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) {
-        for (size_type i = start_row + tidy; i < end_row; i += blockDim.y) {
+        for (size_type i = start_row + tidy; i < end_row; i += tile_dimy) {
             const auto next_krylov_idx = i * stride_next_krylov + col_idx;
             const auto krylov_idx = i * stride_krylov + krylov_col;
             local_res +=
                 next_krylov_basis[next_krylov_idx] * krylov_bases[krylov_idx];
         }
     }
-    reduction_helper[tidx * (tile_dim + 1) + tidy] = local_res;
+    const auto new_tidx = tid % tile_dimy;
+    const auto new_tidy = tid / tile_dimy;
+    // if (new_tidy != tidx) {
+    // (tidx, tidy)
+    reduction_helper[tidx * (tile_dimy + 1) + tidy] = local_res;
+    // }
     __syncthreads();
-    local_res = reduction_helper[tidy * (tile_dim + 1) + tidx];
+    // if (new_tidy != tidx) {
+    // (new_tidy, new_tidx) takes (new_tidx, new_tidy) value
+    // (new_tidx, new_tidy) take (new_tidy, new_tidx)
+    local_res = reduction_helper[new_tidy * (tile_dimy + 1) + new_tidx];
+    // }
     const auto tile_block =
-        group::tiled_partition<tile_dim>(group::this_thread_block());
+        group::tiled_partition<tile_dimy>(group::this_thread_block());
     const auto sum =
         reduce(tile_block, local_res,
                [](const ValueType &a, const ValueType &b) { return a + b; });
-    const auto new_col_idx = blockIdx.x * tile_dim + tidy;
-    if (tidx == 0 && new_col_idx < num_cols &&
+    const auto new_col_idx = blockIdx.x * tile_dimx + new_tidy;
+    if (new_tidx == 0 && new_col_idx < num_cols &&
         !stop_status[new_col_idx].has_stopped()) {
         const auto hessenberg_idx = k * stride_hessenberg + new_col_idx;
         atomic_add(hessenberg_iter + hessenberg_idx, sum);
@@ -445,17 +455,20 @@ void finish_arnoldi(std::shared_ptr<const CudaExecutor> exec,
                                 k * stride_hessenberg + col_idx);
             }
         } else {
+            constexpr int cols_per_block = 32;
+            constexpr int rows_per_block = 32;
             zero_array(dim_size[1],
                        hessenberg_iter->get_values() + k * stride_hessenberg);
-            multidot_kernel<32><<<dim3{ceildiv(dim_size[1], 32),
-                                       exec->get_num_multiprocessor() * 2},
-                                  dim3{32, 32}>>>(
-                k, dim_size[0], dim_size[1],
-                as_cuda_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov,
-                as_cuda_type(krylov_bases->get_const_values()), stride_krylov,
-                as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg,
-                as_cuda_type(stop_status));
+            multidot_kernel<rows_per_block, cols_per_block>
+                <<<dim3{static_cast<uint>(ceildiv(dim_size[1], cols_per_block)),
+                        static_cast<uint>(exec->get_num_multiprocessor() * 2)},
+                   cols_per_block * rows_per_block>>>(
+                    k, dim_size[0], dim_size[1],
+                    as_cuda_type(next_krylov_basis->get_const_values()),
+                    stride_next_krylov,
+                    as_cuda_type(krylov_bases->get_const_values()),
+                    stride_krylov, as_cuda_type(hessenberg_iter->get_values()),
+                    stride_hessenberg, as_cuda_type(stop_status));
         }
         update_next_krylov_kernel<default_block_size>
             <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),

From 6b6669450471d3fd4cd220bdea8d6ae00f7c149a Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Sun, 13 Oct 2019 21:47:49 +0200
Subject: [PATCH 4/8] improve square multidot kernel

---
 cuda/solver/gmres_kernels.cu | 82 +++++++++++++++---------------------
 1 file changed, 35 insertions(+), 47 deletions(-)

diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu
index 779825b76ba..c2d6af58d7b 100644
--- a/cuda/solver/gmres_kernels.cu
+++ b/cuda/solver/gmres_kernels.cu
@@ -63,6 +63,8 @@ namespace gmres {
 
 
 constexpr int default_block_size = 512;
+constexpr int default_dot_dim = cuda_config::warp_size;
+constexpr int default_dot_size = default_dot_dim * default_dot_dim;
 
 
 // Must be called with at least `max(stride_b * num_rows, krylov_dim *
@@ -272,8 +274,8 @@ __global__ __launch_bounds__(block_size) void update_hessenberg_kernel(
 }
 
 
-template <int tile_dimx, int tile_dimy, typename ValueType>
-__global__ __launch_bounds__(tile_dimx *tile_dimy) void multidot_kernel(
+template <typename ValueType>
+__global__ __launch_bounds__(default_dot_size) void multidot_kernel(
     size_type k, size_type num_rows, size_type num_cols,
     const ValueType *__restrict__ next_krylov_basis,
     size_type stride_next_krylov, const ValueType *__restrict__ krylov_bases,
@@ -281,49 +283,41 @@ __global__ __launch_bounds__(tile_dimx *tile_dimy) void multidot_kernel(
     size_type stride_hessenberg,
     const stopping_status *__restrict__ stop_status)
 {
-    const auto tid = threadIdx.x;
-    const auto tidy = tid / tile_dimx;
-    const auto tidx = tid % tile_dimx;
-    const auto col_idx = blockIdx.x * tile_dimx + tidx;
-    const auto num = ceildiv(num_rows, tile_dimy);
+    const auto tidx = threadIdx.x;
+    const auto tidy = threadIdx.y;
+    const auto col_idx = blockIdx.x * default_dot_dim + tidx;
+    const auto num = ceildiv(num_rows, default_dot_dim);
     const auto start_row = blockIdx.y * num;
     const auto end_row =
         ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;
     // Used that way to get around dynamic initialization warning and
     // template error when using `reduction_helper_array` directly in `reduce`
-    __shared__ UninitializedArray<ValueType, (tile_dimx) * (tile_dimy + 1)>
-        reduction_helper_array;
+    __shared__
+        UninitializedArray<ValueType, default_dot_dim *(default_dot_dim + 1)>
+            reduction_helper_array;
     ValueType *__restrict__ reduction_helper = reduction_helper_array;
 
     ValueType local_res = zero<ValueType>();
     const auto krylov_col = k * num_cols + col_idx;
     if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) {
-        for (size_type i = start_row + tidy; i < end_row; i += tile_dimy) {
+        for (size_type i = start_row + tidy; i < end_row;
+             i += default_dot_dim) {
             const auto next_krylov_idx = i * stride_next_krylov + col_idx;
             const auto krylov_idx = i * stride_krylov + krylov_col;
             local_res +=
                 next_krylov_basis[next_krylov_idx] * krylov_bases[krylov_idx];
         }
     }
-    const auto new_tidx = tid % tile_dimy;
-    const auto new_tidy = tid / tile_dimy;
-    // if (new_tidy != tidx) {
-    // (tidx, tidy)
-    reduction_helper[tidx * (tile_dimy + 1) + tidy] = local_res;
-    // }
+    reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res;
     __syncthreads();
-    // if (new_tidy != tidx) {
-    // (new_tidy, new_tidx) takes (new_tidx, new_tidy) value
-    // (new_tidx, new_tidy) take (new_tidy, new_tidx)
-    local_res = reduction_helper[new_tidy * (tile_dimy + 1) + new_tidx];
-    // }
+    local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx];
     const auto tile_block =
-        group::tiled_partition<tile_dimy>(group::this_thread_block());
+        group::tiled_partition<default_dot_dim>(group::this_thread_block());
     const auto sum =
         reduce(tile_block, local_res,
                [](const ValueType &a, const ValueType &b) { return a + b; });
-    const auto new_col_idx = blockIdx.x * tile_dimx + new_tidy;
-    if (new_tidx == 0 && new_col_idx < num_cols &&
+    const auto new_col_idx = blockIdx.x * default_dot_dim + tidy;
+    if (tidx == 0 && new_col_idx < num_cols &&
         !stop_status[new_col_idx].has_stopped()) {
         const auto hessenberg_idx = k * stride_hessenberg + new_col_idx;
         atomic_add(hessenberg_iter + hessenberg_idx, sum);
@@ -442,33 +436,27 @@ void finish_arnoldi(std::shared_ptr<const CudaExecutor> exec,
     const auto stride_hessenberg = hessenberg_iter->get_stride();
     const auto dim_size = next_krylov_basis->get_size();
     auto cublas_handle = exec->get_cublas_handle();
+    const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim),
+                         exec->get_num_multiprocessor() * 2);
+    const dim3 block_size(default_dot_dim, default_dot_dim);
     for (size_type k = 0; k < iter + 1; ++k) {
-        if (dim_size[1] <= 4) {
-            for (size_type col_idx = 0; col_idx < dim_size[1]; col_idx++) {
-                cublas::dot(cublas_handle, dim_size[0],
-                            next_krylov_basis->get_const_values() + col_idx,
-                            stride_next_krylov,
-                            krylov_bases->get_const_values() + k * dim_size[1] +
-                                col_idx,
-                            stride_krylov,
-                            hessenberg_iter->get_values() +
-                                k * stride_hessenberg + col_idx);
-            }
+        if (dim_size[1] == 1) {
+            cublas::dot(cublas_handle, dim_size[0],
+                        next_krylov_basis->get_const_values(),
+                        stride_next_krylov,
+                        krylov_bases->get_const_values() + k * dim_size[1],
+                        stride_krylov,
+                        hessenberg_iter->get_values() + k * stride_hessenberg);
         } else {
-            constexpr int cols_per_block = 32;
-            constexpr int rows_per_block = 32;
             zero_array(dim_size[1],
                        hessenberg_iter->get_values() + k * stride_hessenberg);
-            multidot_kernel<rows_per_block, cols_per_block>
-                <<<dim3{static_cast<uint>(ceildiv(dim_size[1], cols_per_block)),
-                        static_cast<uint>(exec->get_num_multiprocessor() * 2)},
-                   cols_per_block * rows_per_block>>>(
-                    k, dim_size[0], dim_size[1],
-                    as_cuda_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov,
-                    as_cuda_type(krylov_bases->get_const_values()),
-                    stride_krylov, as_cuda_type(hessenberg_iter->get_values()),
-                    stride_hessenberg, as_cuda_type(stop_status));
+            multidot_kernel<<<grid_size, block_size>>>(
+                k, dim_size[0], dim_size[1],
+                as_cuda_type(next_krylov_basis->get_const_values()),
+                stride_next_krylov,
+                as_cuda_type(krylov_bases->get_const_values()), stride_krylov,
+                as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg,
+                as_cuda_type(stop_status));
         }
         update_next_krylov_kernel<default_block_size>
             <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),

From 7e726bc3f4abfef43e3b17d00a81f400260a5fb4 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 14 Oct 2019 01:05:21 +0200
Subject: [PATCH 5/8] fix kernel error

---
 cuda/solver/gmres_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu
index c2d6af58d7b..4bae0231ded 100644
--- a/cuda/solver/gmres_kernels.cu
+++ b/cuda/solver/gmres_kernels.cu
@@ -286,7 +286,7 @@ __global__ __launch_bounds__(default_dot_size) void multidot_kernel(
     const auto tidx = threadIdx.x;
     const auto tidy = threadIdx.y;
     const auto col_idx = blockIdx.x * default_dot_dim + tidx;
-    const auto num = ceildiv(num_rows, default_dot_dim);
+    const auto num = ceildiv(num_rows, gridDim.y);
     const auto start_row = blockIdx.y * num;
     const auto end_row =
         ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;

From fce752f23dcdb1d4b45ee701bbdddec353b36382 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 14 Oct 2019 10:13:34 +0200
Subject: [PATCH 6/8] move zero_array

---
 cuda/solver/gmres_kernels.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu
index 4bae0231ded..7928d34c8c7 100644
--- a/cuda/solver/gmres_kernels.cu
+++ b/cuda/solver/gmres_kernels.cu
@@ -440,6 +440,8 @@ void finish_arnoldi(std::shared_ptr<const CudaExecutor> exec,
                          exec->get_num_multiprocessor() * 2);
     const dim3 block_size(default_dot_dim, default_dot_dim);
     for (size_type k = 0; k < iter + 1; ++k) {
+        zero_array(dim_size[1],
+                   hessenberg_iter->get_values() + k * stride_hessenberg);
         if (dim_size[1] == 1) {
             cublas::dot(cublas_handle, dim_size[0],
                         next_krylov_basis->get_const_values(),
@@ -448,8 +450,6 @@ void finish_arnoldi(std::shared_ptr<const CudaExecutor> exec,
                         stride_krylov,
                         hessenberg_iter->get_values() + k * stride_hessenberg);
         } else {
-            zero_array(dim_size[1],
-                       hessenberg_iter->get_values() + k * stride_hessenberg);
             multidot_kernel<<<grid_size, block_size>>>(
                 k, dim_size[0], dim_size[1],
                 as_cuda_type(next_krylov_basis->get_const_values()),

From 6fe81b13dcd94b09a798bcfa28cc5c4199fdd9c4 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 14 Oct 2019 10:48:49 +0200
Subject: [PATCH 7/8] use implementation and delete unused function

---
 cuda/solver/gmres_kernels.cu | 66 ++++--------------------------------
 1 file changed, 6 insertions(+), 60 deletions(-)

diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu
index 7928d34c8c7..56496cf5dd8 100644
--- a/cuda/solver/gmres_kernels.cu
+++ b/cuda/solver/gmres_kernels.cu
@@ -230,50 +230,6 @@ __global__
 }
 
 
-// Must be called with at least `num_cols` blocks, each with `block_size`
-// threads. `block_size` must be a power of 2.
-template <int block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void update_hessenberg_kernel(
-    size_type k, size_type num_rows, size_type num_cols,
-    const ValueType *__restrict__ next_krylov_basis,
-    size_type stride_next_krylov, const ValueType *__restrict__ krylov_bases,
-    size_type stride_krylov, ValueType *__restrict__ hessenberg_iter,
-    size_type stride_hessenberg,
-    const stopping_status *__restrict__ stop_status)
-{
-    const auto tidx = threadIdx.x;
-    const auto col_idx = blockIdx.x;
-
-    // Used that way to get around dynamic initialization warning and
-    // template error when using `reduction_helper_array` directly in `reduce`
-    __shared__ UninitializedArray<ValueType, block_size> reduction_helper_array;
-    ValueType *__restrict__ reduction_helper = reduction_helper_array;
-
-    if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) {
-        ValueType local_res{};
-        const auto krylov_col = k * num_cols + col_idx;
-        for (size_type i = tidx; i < num_rows; i += block_size) {
-            const auto next_krylov_idx = i * stride_next_krylov + col_idx;
-            const auto krylov_idx = i * stride_krylov + krylov_col;
-
-            local_res +=
-                next_krylov_basis[next_krylov_idx] * krylov_bases[krylov_idx];
-        }
-
-        reduction_helper[tidx] = local_res;
-
-        // Perform thread block reduction. Result is in reduction_helper[0]
-        reduce(group::this_thread_block(), reduction_helper,
-               [](const ValueType &a, const ValueType &b) { return a + b; });
-
-        if (tidx == 0) {
-            const auto hessenberg_idx = k * stride_hessenberg + col_idx;
-            hessenberg_iter[hessenberg_idx] = reduction_helper[0];
-        }
-    }
-}
-
-
 template <typename ValueType>
 __global__ __launch_bounds__(default_dot_size) void multidot_kernel(
     size_type k, size_type num_rows, size_type num_cols,
@@ -442,22 +398,12 @@ void finish_arnoldi(std::shared_ptr<const CudaExecutor> exec,
     for (size_type k = 0; k < iter + 1; ++k) {
         zero_array(dim_size[1],
                    hessenberg_iter->get_values() + k * stride_hessenberg);
-        if (dim_size[1] == 1) {
-            cublas::dot(cublas_handle, dim_size[0],
-                        next_krylov_basis->get_const_values(),
-                        stride_next_krylov,
-                        krylov_bases->get_const_values() + k * dim_size[1],
-                        stride_krylov,
-                        hessenberg_iter->get_values() + k * stride_hessenberg);
-        } else {
-            multidot_kernel<<<grid_size, block_size>>>(
-                k, dim_size[0], dim_size[1],
-                as_cuda_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov,
-                as_cuda_type(krylov_bases->get_const_values()), stride_krylov,
-                as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg,
-                as_cuda_type(stop_status));
-        }
+        multidot_kernel<<<grid_size, block_size>>>(
+            k, dim_size[0], dim_size[1],
+            as_cuda_type(next_krylov_basis->get_const_values()),
+            stride_next_krylov, as_cuda_type(krylov_bases->get_const_values()),
+            stride_krylov, as_cuda_type(hessenberg_iter->get_values()),
+            stride_hessenberg, as_cuda_type(stop_status));
         update_next_krylov_kernel<default_block_size>
             <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
                default_block_size>>>(

From e3284454409e3c2585b54a081a2989149333a866 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= <thomas.gruetzmacher@kit.edu>
Date: Thu, 10 Oct 2019 19:43:58 +0200
Subject: [PATCH 8/8] Remove need for copying matrix inside gmres

---
 benchmark/solver/solver.cpp |  3 ++-
 core/solver/gmres.cpp       | 31 ++++++++++++++++++++-----------
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp
index fdff89b647d..591fb1a3423 100644
--- a/benchmark/solver/solver.cpp
+++ b/benchmark/solver/solver.cpp
@@ -123,7 +123,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
     solver_factory{{"cg", create_solver<gko::solver::Cg<>>},
                    {"bicgstab", create_solver<gko::solver::Bicgstab<>>},
                    {"cgs", create_solver<gko::solver::Cgs<>>},
-                   {"fcg", create_solver<gko::solver::Fcg<>>}};
+                   {"fcg", create_solver<gko::solver::Fcg<>>},
+                   {"gmres", create_solver<gko::solver::Gmres<>>}};
 
 
 // TODO: Workaround until GPU matrix conversions are implemented
diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
index 32c79f2455f..b56ef394d7b 100644
--- a/core/solver/gmres.cpp
+++ b/core/solver/gmres.cpp
@@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
 
 
 #include "core/solver/gmres_kernels.hpp"
@@ -66,18 +67,25 @@ namespace {
 
 
 template <typename ValueType>
-void apply_preconditioner(const LinOp *preconditioner,
-                          matrix::Dense<ValueType> *krylov_bases,
-                          matrix::Dense<ValueType> *preconditioned_vector,
-                          const size_type iter)
+void apply_preconditioner(
+    const LinOp *preconditioner, matrix::Dense<ValueType> *krylov_bases,
+    std::shared_ptr<matrix::Dense<ValueType>> &preconditioned_vector,
+    const size_type iter)
 {
-    auto target_basis = krylov_bases->create_submatrix(
-        span{0, krylov_bases->get_size()[0]},
-        span{iter * preconditioned_vector->get_size()[1],
-             (iter + 1) * preconditioned_vector->get_size()[1]});
+    std::shared_ptr<matrix::Dense<ValueType>> target_basis =
+        krylov_bases->create_submatrix(
+            span{0, krylov_bases->get_size()[0]},
+            span{iter * preconditioned_vector->get_size()[1],
+                 (iter + 1) * preconditioned_vector->get_size()[1]});
 
     // Apply preconditioner
-    preconditioner->apply(target_basis.get(), preconditioned_vector);
+    auto identity_pointer =
+        dynamic_cast<const matrix::Identity<ValueType> *>(preconditioner);
+    if (identity_pointer) {
+        preconditioned_vector = target_basis;
+    } else {
+        preconditioner->apply(target_basis.get(), preconditioned_vector.get());
+    }
 }
 
 
@@ -105,7 +113,8 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
         exec, dim<2>{system_matrix_->get_size()[1],
                      (krylov_dim_ + 1) * dense_b->get_size()[1]});
     auto next_krylov_basis = Vector::create_with_config_of(dense_b);
-    auto preconditioned_vector = Vector::create_with_config_of(dense_b);
+    std::shared_ptr<matrix::Dense<ValueType>> preconditioned_vector =
+        Vector::create_with_config_of(dense_b);
     auto hessenberg = Vector::create(
         exec, dim<2>{krylov_dim_ + 1, krylov_dim_ * dense_b->get_size()[1]});
     auto givens_sin =
@@ -200,7 +209,7 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
         }
 
         apply_preconditioner(get_preconditioner().get(), krylov_bases.get(),
-                             preconditioned_vector.get(), restart_iter);
+                             preconditioned_vector, restart_iter);
         // preconditioned_vector = get_preconditioner() *
         //                         krylov_bases(:, restart_iter)