From bd770307e7833f6c435e1b8064e021935aa2ea3e Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 10 Oct 2019 16:25:33 +0200 Subject: [PATCH 1/8] use dot --- cuda/solver/gmres_kernels.cu | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index e0294ba35e8..eeb28532758 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -381,16 +381,26 @@ void finish_arnoldi(std::shared_ptr exec, const auto stride_krylov = krylov_bases->get_stride(); const auto stride_hessenberg = hessenberg_iter->get_stride(); const auto dim_size = next_krylov_basis->get_size(); - + auto cublas_handle = exec->get_cublas_handle(); for (size_type k = 0; k < iter + 1; ++k) { - update_hessenberg_kernel - <<>>( - k, dim_size[0], dim_size[1], - as_cuda_type(next_krylov_basis->get_const_values()), + // update_hessenberg_kernel + // <<>>( + // k, dim_size[0], dim_size[1], + // as_cuda_type(next_krylov_basis->get_const_values()), + // stride_next_krylov, + // as_cuda_type(krylov_bases->get_const_values()), + // stride_krylov, as_cuda_type(hessenberg_iter->get_values()), + // stride_hessenberg, as_cuda_type(stop_status)); + for (size_type col_idx = 0; col_idx < dim_size[1]; col_idx++) { + cublas::dot( + cublas_handle, dim_size[0], + next_krylov_basis->get_const_values() + col_idx, stride_next_krylov, - as_cuda_type(krylov_bases->get_const_values()), stride_krylov, - as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg, - as_cuda_type(stop_status)); + krylov_bases->get_const_values() + k * dim_size[1] + col_idx, + stride_krylov, + hessenberg_iter->get_values() + k * stride_hessenberg + + col_idx); + } update_next_krylov_kernel << Date: Fri, 11 Oct 2019 17:28:24 +0200 Subject: [PATCH 2/8] implement multidot_kernel --- cuda/solver/gmres_kernels.cu | 89 +++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 17 deletions(-) diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index eeb28532758..df898724e5d 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" +#include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -271,6 +272,55 @@ __global__ __launch_bounds__(block_size) void update_hessenberg_kernel( } +template +__global__ __launch_bounds__(tile_dim *tile_dim) void multidot_kernel( + size_type k, size_type num_rows, size_type num_cols, + const ValueType *__restrict__ next_krylov_basis, + size_type stride_next_krylov, const ValueType *__restrict__ krylov_bases, + size_type stride_krylov, ValueType *__restrict__ hessenberg_iter, + size_type stride_hessenberg, + const stopping_status *__restrict__ stop_status) +{ + const auto tidy = threadIdx.y; + const auto tidx = threadIdx.x; + const auto col_idx = blockIdx.x * tile_dim + tidx; + const auto num = ceildiv(num_rows, blockDim.y); + const auto start_row = blockIdx.y * num; + const auto end_row = + ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num; + // Used that way to get around dynamic initialization warning and + // template error when using `reduction_helper_array` directly in `reduce` + __shared__ UninitializedArray + reduction_helper_array; + ValueType *__restrict__ reduction_helper = reduction_helper_array; + + ValueType local_res = zero(); + const auto krylov_col = k * num_cols + col_idx; + if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { + for (size_type i = start_row + tidy; i < end_row; i += blockDim.y) { + const auto next_krylov_idx = i * stride_next_krylov + col_idx; + const auto krylov_idx = i * stride_krylov + krylov_col; + local_res += + next_krylov_basis[next_krylov_idx] * krylov_bases[krylov_idx]; + } + } + reduction_helper[tidx * (tile_dim + 1) + tidy] = local_res; + __syncthreads(); + local_res = reduction_helper[tidy * (tile_dim + 1) + tidx]; + const auto tile_block = + group::tiled_partition(group::this_thread_block()); + const auto sum = + reduce(tile_block, local_res, + [](const ValueType &a, const ValueType &b) { return a + b; }); + const auto new_col_idx = blockIdx.x * tile_dim + tidy; + if (tidx == 0 && new_col_idx < num_cols && + !stop_status[new_col_idx].has_stopped()) { + const auto hessenberg_idx = k * stride_hessenberg + new_col_idx; + atomic_add(hessenberg_iter + hessenberg_idx, sum); + } +} + + // Must be called with at least `num_rows * stride_next_krylov` threads in // total. template @@ -383,25 +433,30 @@ void finish_arnoldi(std::shared_ptr exec, const auto dim_size = next_krylov_basis->get_size(); auto cublas_handle = exec->get_cublas_handle(); for (size_type k = 0; k < iter + 1; ++k) { - // update_hessenberg_kernel - // <<>>( - // k, dim_size[0], dim_size[1], - // as_cuda_type(next_krylov_basis->get_const_values()), - // stride_next_krylov, - // as_cuda_type(krylov_bases->get_const_values()), - // stride_krylov, as_cuda_type(hessenberg_iter->get_values()), - // stride_hessenberg, as_cuda_type(stop_status)); - for (size_type col_idx = 0; col_idx < dim_size[1]; col_idx++) { - cublas::dot( - cublas_handle, dim_size[0], - next_krylov_basis->get_const_values() + col_idx, + if (dim_size[1] <= 4) { + for (size_type col_idx = 0; col_idx < dim_size[1]; col_idx++) { + cublas::dot(cublas_handle, dim_size[0], + next_krylov_basis->get_const_values() + col_idx, + stride_next_krylov, + krylov_bases->get_const_values() + k * dim_size[1] + + col_idx, + stride_krylov, + hessenberg_iter->get_values() + + k * stride_hessenberg + col_idx); + } + } else { + zero_array(dim_size[1], + hessenberg_iter->get_values() + k * stride_hessenberg); + multidot_kernel<32><<get_num_multiprocessor() * 2}, + dim3{32, 32}>>>( + k, dim_size[0], dim_size[1], + as_cuda_type(next_krylov_basis->get_const_values()), stride_next_krylov, - krylov_bases->get_const_values() + k * dim_size[1] + col_idx, - stride_krylov, - hessenberg_iter->get_values() + k * stride_hessenberg + - col_idx); + as_cuda_type(krylov_bases->get_const_values()), stride_krylov, + as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg, + as_cuda_type(stop_status)); } - update_next_krylov_kernel <<>>( From 1002e9d5a98c8affaec97912a851bf48e86ae3e3 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Sun, 13 Oct 2019 14:48:03 +0200 Subject: [PATCH 3/8] try non-square transpose --- cuda/solver/gmres_kernels.cu | 57 ++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index df898724e5d..779825b76ba 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -272,8 +272,8 @@ __global__ __launch_bounds__(block_size) void update_hessenberg_kernel( } -template -__global__ __launch_bounds__(tile_dim *tile_dim) void multidot_kernel( +template +__global__ __launch_bounds__(tile_dimx *tile_dimy) void multidot_kernel( size_type k, size_type num_rows, size_type num_cols, const ValueType *__restrict__ next_krylov_basis, size_type stride_next_krylov, const ValueType *__restrict__ krylov_bases, @@ -281,39 +281,49 @@ __global__ __launch_bounds__(tile_dim *tile_dim) void multidot_kernel( size_type stride_hessenberg, const stopping_status *__restrict__ stop_status) { - const auto tidy = threadIdx.y; - const auto tidx = threadIdx.x; - const auto col_idx = blockIdx.x * tile_dim + tidx; - const auto num = ceildiv(num_rows, blockDim.y); + const auto tid = threadIdx.x; + const auto tidy = tid / tile_dimx; + const auto tidx = tid % tile_dimx; + const auto col_idx = blockIdx.x * tile_dimx + tidx; + const auto num = ceildiv(num_rows, tile_dimy); const auto start_row = blockIdx.y * num; const auto end_row = ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num; // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - __shared__ UninitializedArray + __shared__ UninitializedArray reduction_helper_array; ValueType *__restrict__ reduction_helper = reduction_helper_array; ValueType local_res = zero(); const auto krylov_col = k * num_cols + col_idx; if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { - for (size_type i = start_row + tidy; i < end_row; i += blockDim.y) { + for (size_type i = start_row + tidy; i < end_row; i += tile_dimy) { const auto next_krylov_idx = i * stride_next_krylov + col_idx; const auto krylov_idx = i * stride_krylov + krylov_col; local_res += next_krylov_basis[next_krylov_idx] * krylov_bases[krylov_idx]; } } - reduction_helper[tidx * (tile_dim + 1) + tidy] = local_res; + const auto new_tidx = tid % tile_dimy; + const auto new_tidy = tid / tile_dimy; + // if (new_tidy != tidx) { + // (tidx, tidy) + reduction_helper[tidx * (tile_dimy + 1) + tidy] = local_res; + // } __syncthreads(); - local_res = reduction_helper[tidy * (tile_dim + 1) + tidx]; + // if (new_tidy != tidx) { + // (new_tidy, new_tidx) takes (new_tidx, new_tidy) value + // (new_tidx, new_tidy) take (new_tidy, new_tidx) + local_res = reduction_helper[new_tidy * (tile_dimy + 1) + new_tidx]; + // } const auto tile_block = - group::tiled_partition(group::this_thread_block()); + group::tiled_partition(group::this_thread_block()); const auto sum = reduce(tile_block, local_res, [](const ValueType &a, const ValueType &b) { return a + b; }); - const auto new_col_idx = blockIdx.x * tile_dim + tidy; - if (tidx == 0 && new_col_idx < num_cols && + const auto new_col_idx = blockIdx.x * tile_dimx + new_tidy; + if (new_tidx == 0 && new_col_idx < num_cols && !stop_status[new_col_idx].has_stopped()) { const auto hessenberg_idx = k * stride_hessenberg + new_col_idx; atomic_add(hessenberg_iter + hessenberg_idx, sum); @@ -445,17 +455,20 @@ void finish_arnoldi(std::shared_ptr exec, k * stride_hessenberg + col_idx); } } else { + constexpr int cols_per_block = 32; + constexpr int rows_per_block = 32; zero_array(dim_size[1], hessenberg_iter->get_values() + k * stride_hessenberg); - multidot_kernel<32><<get_num_multiprocessor() * 2}, - dim3{32, 32}>>>( - k, dim_size[0], dim_size[1], - as_cuda_type(next_krylov_basis->get_const_values()), - stride_next_krylov, - as_cuda_type(krylov_bases->get_const_values()), stride_krylov, - as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg, - as_cuda_type(stop_status)); + multidot_kernel + <<(ceildiv(dim_size[1], cols_per_block)), + static_cast(exec->get_num_multiprocessor() * 2)}, + cols_per_block * rows_per_block>>>( + k, dim_size[0], dim_size[1], + as_cuda_type(next_krylov_basis->get_const_values()), + stride_next_krylov, + as_cuda_type(krylov_bases->get_const_values()), + stride_krylov, as_cuda_type(hessenberg_iter->get_values()), + stride_hessenberg, as_cuda_type(stop_status)); } update_next_krylov_kernel << Date: Sun, 13 Oct 2019 21:47:49 +0200 Subject: [PATCH 4/8] improve square multidot kernel --- cuda/solver/gmres_kernels.cu | 82 +++++++++++++++--------------------- 1 file changed, 35 insertions(+), 47 deletions(-) diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index 779825b76ba..c2d6af58d7b 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -63,6 +63,8 @@ namespace gmres { constexpr int default_block_size = 512; +constexpr int default_dot_dim = cuda_config::warp_size; +constexpr int default_dot_size = default_dot_dim * default_dot_dim; // Must be called with at least `max(stride_b * num_rows, krylov_dim * @@ -272,8 +274,8 @@ __global__ __launch_bounds__(block_size) void update_hessenberg_kernel( } -template -__global__ __launch_bounds__(tile_dimx *tile_dimy) void multidot_kernel( +template +__global__ __launch_bounds__(default_dot_size) void multidot_kernel( size_type k, size_type num_rows, size_type num_cols, const ValueType *__restrict__ next_krylov_basis, size_type stride_next_krylov, const ValueType *__restrict__ krylov_bases, @@ -281,49 +283,41 @@ __global__ __launch_bounds__(tile_dimx *tile_dimy) void multidot_kernel( size_type stride_hessenberg, const stopping_status *__restrict__ stop_status) { - const auto tid = threadIdx.x; - const auto tidy = tid / tile_dimx; - const auto tidx = tid % tile_dimx; - const auto col_idx = blockIdx.x * tile_dimx + tidx; - const auto num = ceildiv(num_rows, tile_dimy); + const auto tidx = threadIdx.x; + const auto tidy = threadIdx.y; + const auto col_idx = blockIdx.x * default_dot_dim + tidx; + const auto num = ceildiv(num_rows, default_dot_dim); const auto start_row = blockIdx.y * num; const auto end_row = ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num; // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - __shared__ UninitializedArray - reduction_helper_array; + __shared__ + UninitializedArray + reduction_helper_array; ValueType *__restrict__ reduction_helper = reduction_helper_array; ValueType local_res = zero(); const auto krylov_col = k * num_cols + col_idx; if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { - for (size_type i = start_row + tidy; i < end_row; i += tile_dimy) { + for (size_type i = start_row + tidy; i < end_row; + i += default_dot_dim) { const auto next_krylov_idx = i * stride_next_krylov + col_idx; const auto krylov_idx = i * stride_krylov + krylov_col; local_res += next_krylov_basis[next_krylov_idx] * krylov_bases[krylov_idx]; } } - const auto new_tidx = tid % tile_dimy; - const auto new_tidy = tid / tile_dimy; - // if (new_tidy != tidx) { - // (tidx, tidy) - reduction_helper[tidx * (tile_dimy + 1) + tidy] = local_res; - // } + reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res; __syncthreads(); - // if (new_tidy != tidx) { - // (new_tidy, new_tidx) takes (new_tidx, new_tidy) value - // (new_tidx, new_tidy) take (new_tidy, new_tidx) - local_res = reduction_helper[new_tidy * (tile_dimy + 1) + new_tidx]; - // } + local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; const auto tile_block = - group::tiled_partition(group::this_thread_block()); + group::tiled_partition(group::this_thread_block()); const auto sum = reduce(tile_block, local_res, [](const ValueType &a, const ValueType &b) { return a + b; }); - const auto new_col_idx = blockIdx.x * tile_dimx + new_tidy; - if (new_tidx == 0 && new_col_idx < num_cols && + const auto new_col_idx = blockIdx.x * default_dot_dim + tidy; + if (tidx == 0 && new_col_idx < num_cols && !stop_status[new_col_idx].has_stopped()) { const auto hessenberg_idx = k * stride_hessenberg + new_col_idx; atomic_add(hessenberg_iter + hessenberg_idx, sum); @@ -442,33 +436,27 @@ void finish_arnoldi(std::shared_ptr exec, const auto stride_hessenberg = hessenberg_iter->get_stride(); const auto dim_size = next_krylov_basis->get_size(); auto cublas_handle = exec->get_cublas_handle(); + const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim), + exec->get_num_multiprocessor() * 2); + const dim3 block_size(default_dot_dim, default_dot_dim); for (size_type k = 0; k < iter + 1; ++k) { - if (dim_size[1] <= 4) { - for (size_type col_idx = 0; col_idx < dim_size[1]; col_idx++) { - cublas::dot(cublas_handle, dim_size[0], - next_krylov_basis->get_const_values() + col_idx, - stride_next_krylov, - krylov_bases->get_const_values() + k * dim_size[1] + - col_idx, - stride_krylov, - hessenberg_iter->get_values() + - k * stride_hessenberg + col_idx); - } + if (dim_size[1] == 1) { + cublas::dot(cublas_handle, dim_size[0], + next_krylov_basis->get_const_values(), + stride_next_krylov, + krylov_bases->get_const_values() + k * dim_size[1], + stride_krylov, + hessenberg_iter->get_values() + k * stride_hessenberg); } else { - constexpr int cols_per_block = 32; - constexpr int rows_per_block = 32; zero_array(dim_size[1], hessenberg_iter->get_values() + k * stride_hessenberg); - multidot_kernel - <<(ceildiv(dim_size[1], cols_per_block)), - static_cast(exec->get_num_multiprocessor() * 2)}, - cols_per_block * rows_per_block>>>( - k, dim_size[0], dim_size[1], - as_cuda_type(next_krylov_basis->get_const_values()), - stride_next_krylov, - as_cuda_type(krylov_bases->get_const_values()), - stride_krylov, as_cuda_type(hessenberg_iter->get_values()), - stride_hessenberg, as_cuda_type(stop_status)); + multidot_kernel<<>>( + k, dim_size[0], dim_size[1], + as_cuda_type(next_krylov_basis->get_const_values()), + stride_next_krylov, + as_cuda_type(krylov_bases->get_const_values()), stride_krylov, + as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg, + as_cuda_type(stop_status)); } update_next_krylov_kernel << Date: Mon, 14 Oct 2019 01:05:21 +0200 Subject: [PATCH 5/8] fix kernel error --- cuda/solver/gmres_kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index c2d6af58d7b..4bae0231ded 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -286,7 +286,7 @@ __global__ __launch_bounds__(default_dot_size) void multidot_kernel( const auto tidx = threadIdx.x; const auto tidy = threadIdx.y; const auto col_idx = blockIdx.x * default_dot_dim + tidx; - const auto num = ceildiv(num_rows, default_dot_dim); + const auto num = ceildiv(num_rows, gridDim.y); const auto start_row = blockIdx.y * num; const auto end_row = ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num; From fce752f23dcdb1d4b45ee701bbdddec353b36382 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 14 Oct 2019 10:13:34 +0200 Subject: [PATCH 6/8] move zero_array --- cuda/solver/gmres_kernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index 4bae0231ded..7928d34c8c7 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -440,6 +440,8 @@ void finish_arnoldi(std::shared_ptr exec, exec->get_num_multiprocessor() * 2); const dim3 block_size(default_dot_dim, default_dot_dim); for (size_type k = 0; k < iter + 1; ++k) { + zero_array(dim_size[1], + hessenberg_iter->get_values() + k * stride_hessenberg); if (dim_size[1] == 1) { cublas::dot(cublas_handle, dim_size[0], next_krylov_basis->get_const_values(), @@ -448,8 +450,6 @@ void finish_arnoldi(std::shared_ptr exec, stride_krylov, hessenberg_iter->get_values() + k * stride_hessenberg); } else { - zero_array(dim_size[1], - hessenberg_iter->get_values() + k * stride_hessenberg); multidot_kernel<<>>( k, dim_size[0], dim_size[1], as_cuda_type(next_krylov_basis->get_const_values()), From 6fe81b13dcd94b09a798bcfa28cc5c4199fdd9c4 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 14 Oct 2019 10:48:49 +0200 Subject: [PATCH 7/8] use implementation and delete unused function --- cuda/solver/gmres_kernels.cu | 66 ++++-------------------------------- 1 file changed, 6 insertions(+), 60 deletions(-) diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index 7928d34c8c7..56496cf5dd8 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -230,50 +230,6 @@ __global__ } -// Must be called with at least `num_cols` blocks, each with `block_size` -// threads. `block_size` must be a power of 2. -template -__global__ __launch_bounds__(block_size) void update_hessenberg_kernel( - size_type k, size_type num_rows, size_type num_cols, - const ValueType *__restrict__ next_krylov_basis, - size_type stride_next_krylov, const ValueType *__restrict__ krylov_bases, - size_type stride_krylov, ValueType *__restrict__ hessenberg_iter, - size_type stride_hessenberg, - const stopping_status *__restrict__ stop_status) -{ - const auto tidx = threadIdx.x; - const auto col_idx = blockIdx.x; - - // Used that way to get around dynamic initialization warning and - // template error when using `reduction_helper_array` directly in `reduce` - __shared__ UninitializedArray reduction_helper_array; - ValueType *__restrict__ reduction_helper = reduction_helper_array; - - if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { - ValueType local_res{}; - const auto krylov_col = k * num_cols + col_idx; - for (size_type i = tidx; i < num_rows; i += block_size) { - const auto next_krylov_idx = i * stride_next_krylov + col_idx; - const auto krylov_idx = i * stride_krylov + krylov_col; - - local_res += - next_krylov_basis[next_krylov_idx] * krylov_bases[krylov_idx]; - } - - reduction_helper[tidx] = local_res; - - // Perform thread block reduction. Result is in reduction_helper[0] - reduce(group::this_thread_block(), reduction_helper, - [](const ValueType &a, const ValueType &b) { return a + b; }); - - if (tidx == 0) { - const auto hessenberg_idx = k * stride_hessenberg + col_idx; - hessenberg_iter[hessenberg_idx] = reduction_helper[0]; - } - } -} - - template __global__ __launch_bounds__(default_dot_size) void multidot_kernel( size_type k, size_type num_rows, size_type num_cols, @@ -442,22 +398,12 @@ void finish_arnoldi(std::shared_ptr exec, for (size_type k = 0; k < iter + 1; ++k) { zero_array(dim_size[1], hessenberg_iter->get_values() + k * stride_hessenberg); - if (dim_size[1] == 1) { - cublas::dot(cublas_handle, dim_size[0], - next_krylov_basis->get_const_values(), - stride_next_krylov, - krylov_bases->get_const_values() + k * dim_size[1], - stride_krylov, - hessenberg_iter->get_values() + k * stride_hessenberg); - } else { - multidot_kernel<<>>( - k, dim_size[0], dim_size[1], - as_cuda_type(next_krylov_basis->get_const_values()), - stride_next_krylov, - as_cuda_type(krylov_bases->get_const_values()), stride_krylov, - as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg, - as_cuda_type(stop_status)); - } + multidot_kernel<<>>( + k, dim_size[0], dim_size[1], + as_cuda_type(next_krylov_basis->get_const_values()), + stride_next_krylov, as_cuda_type(krylov_bases->get_const_values()), + stride_krylov, as_cuda_type(hessenberg_iter->get_values()), + stride_hessenberg, as_cuda_type(stop_status)); update_next_krylov_kernel <<>>( From e3284454409e3c2585b54a081a2989149333a866 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Thu, 10 Oct 2019 19:43:58 +0200 Subject: [PATCH 8/8] Remove need for copying matrix inside gmres --- benchmark/solver/solver.cpp | 3 ++- core/solver/gmres.cpp | 31 ++++++++++++++++++++----------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp index fdff89b647d..591fb1a3423 100644 --- a/benchmark/solver/solver.cpp +++ b/benchmark/solver/solver.cpp @@ -123,7 +123,8 @@ const std::map( solver_factory{{"cg", create_solver>}, {"bicgstab", create_solver>}, {"cgs", create_solver>}, - {"fcg", create_solver>}}; + {"fcg", create_solver>}, + {"gmres", create_solver>}}; // TODO: Workaround until GPU matrix conversions are implemented diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index 32c79f2455f..b56ef394d7b 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "core/solver/gmres_kernels.hpp" @@ -66,18 +67,25 @@ namespace { template -void apply_preconditioner(const LinOp *preconditioner, - matrix::Dense *krylov_bases, - matrix::Dense *preconditioned_vector, - const size_type iter) +void apply_preconditioner( + const LinOp *preconditioner, matrix::Dense *krylov_bases, + std::shared_ptr> &preconditioned_vector, + const size_type iter) { - auto target_basis = krylov_bases->create_submatrix( - span{0, krylov_bases->get_size()[0]}, - span{iter * preconditioned_vector->get_size()[1], - (iter + 1) * preconditioned_vector->get_size()[1]}); + std::shared_ptr> target_basis = + krylov_bases->create_submatrix( + span{0, krylov_bases->get_size()[0]}, + span{iter * preconditioned_vector->get_size()[1], + (iter + 1) * preconditioned_vector->get_size()[1]}); // Apply preconditioner - preconditioner->apply(target_basis.get(), preconditioned_vector); + auto identity_pointer = + dynamic_cast *>(preconditioner); + if (identity_pointer) { + preconditioned_vector = target_basis; + } else { + preconditioner->apply(target_basis.get(), preconditioned_vector.get()); + } } @@ -105,7 +113,8 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const exec, dim<2>{system_matrix_->get_size()[1], (krylov_dim_ + 1) * dense_b->get_size()[1]}); auto next_krylov_basis = Vector::create_with_config_of(dense_b); - auto preconditioned_vector = Vector::create_with_config_of(dense_b); + std::shared_ptr> preconditioned_vector = + Vector::create_with_config_of(dense_b); auto hessenberg = Vector::create( exec, dim<2>{krylov_dim_ + 1, krylov_dim_ * dense_b->get_size()[1]}); auto givens_sin = @@ -200,7 +209,7 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const } apply_preconditioner(get_preconditioner().get(), krylov_bases.get(), - preconditioned_vector.get(), restart_iter); + preconditioned_vector, restart_iter); // preconditioned_vector = get_preconditioner() * // krylov_bases(:, restart_iter)