Improve update hessenberg of gmres #363

Merged: 8 commits, merged on Oct 15, 2019
Changes from all commits
benchmark/solver/solver.cpp (3 changes: 2 additions & 1 deletion)
@@ -123,7 +123,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
     solver_factory{{"cg", create_solver<gko::solver::Cg<>>},
                    {"bicgstab", create_solver<gko::solver::Bicgstab<>>},
                    {"cgs", create_solver<gko::solver::Cgs<>>},
-                   {"fcg", create_solver<gko::solver::Fcg<>>}};
+                   {"fcg", create_solver<gko::solver::Fcg<>>},
+                   {"gmres", create_solver<gko::solver::Gmres<>>}};


 // TODO: Workaround until GPU matrix conversions are implemented
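For context, here is a minimal sketch (not part of this PR) of how a GMRES solver is built through Ginkgo's public API, which is what the new "gmres" benchmark entry ultimately dispatches to through create_solver. The stopping criterion and all values below are illustrative assumptions, not the benchmark's defaults.

#include <ginkgo/ginkgo.hpp>

int main()
{
    // Illustrative setup: a reference executor and a tiny dense system.
    auto exec = gko::ReferenceExecutor::create();
    auto A = gko::initialize<gko::matrix::Dense<>>({{4.0, 1.0}, {1.0, 3.0}}, exec);
    auto b = gko::initialize<gko::matrix::Dense<>>({1.0, 2.0}, exec);
    auto x = gko::initialize<gko::matrix::Dense<>>({0.0, 0.0}, exec);

    // Build a GMRES factory, generate a solver for A, and solve A x = b.
    auto gmres =
        gko::solver::Gmres<>::build()
            .with_criteria(
                gko::stop::Iteration::build().with_max_iters(20u).on(exec))
            .on(exec)
            ->generate(gko::share(A));
    gmres->apply(gko::lend(b), gko::lend(x));
}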
core/solver/gmres.cpp (31 changes: 20 additions & 11 deletions)
@@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/identity.hpp>


 #include "core/solver/gmres_kernels.hpp"
@@ -66,18 +67,25 @@ namespace {


 template <typename ValueType>
-void apply_preconditioner(const LinOp *preconditioner,
-                          matrix::Dense<ValueType> *krylov_bases,
-                          matrix::Dense<ValueType> *preconditioned_vector,
-                          const size_type iter)
+void apply_preconditioner(
+    const LinOp *preconditioner, matrix::Dense<ValueType> *krylov_bases,
+    std::shared_ptr<matrix::Dense<ValueType>> &preconditioned_vector,
+    const size_type iter)
 {
-    auto target_basis = krylov_bases->create_submatrix(
-        span{0, krylov_bases->get_size()[0]},
-        span{iter * preconditioned_vector->get_size()[1],
-             (iter + 1) * preconditioned_vector->get_size()[1]});
+    std::shared_ptr<matrix::Dense<ValueType>> target_basis =
+        krylov_bases->create_submatrix(
+            span{0, krylov_bases->get_size()[0]},
+            span{iter * preconditioned_vector->get_size()[1],
+                 (iter + 1) * preconditioned_vector->get_size()[1]});

     // Apply preconditioner
-    preconditioner->apply(target_basis.get(), preconditioned_vector);
+    auto identity_pointer =
+        dynamic_cast<const matrix::Identity<ValueType> *>(preconditioner);
+    if (identity_pointer) {
+        preconditioned_vector = target_basis;
+    } else {
+        preconditioner->apply(target_basis.get(), preconditioned_vector.get());
+    }
 }

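Ginkgo installs an Identity operator as the preconditioner when none is configured, so in the unpreconditioned case the apply above was only a copy. The rewritten helper skips that copy by letting preconditioned_vector alias the Krylov basis block, which is why it now takes a shared_ptr by reference. A standalone sketch of the pattern, with hypothetical names and assuming a Dense block and an arbitrary LinOp preconditioner:

#include <memory>
#include <ginkgo/ginkgo.hpp>

// Hypothetical helper showing the aliasing shortcut: when the operator is the
// Identity, reseat the output pointer onto the input block instead of copying
// it through apply().
template <typename ValueType>
void precondition_or_alias(
    const gko::LinOp *preconditioner,
    std::shared_ptr<gko::matrix::Dense<ValueType>> basis_block,
    std::shared_ptr<gko::matrix::Dense<ValueType>> &preconditioned)
{
    if (dynamic_cast<const gko::matrix::Identity<ValueType> *>(preconditioner)) {
        preconditioned = basis_block;  // no data movement needed
    } else {
        preconditioner->apply(basis_block.get(), preconditioned.get());
    }
}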
@@ -105,7 +113,8 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
         exec, dim<2>{system_matrix_->get_size()[1],
                      (krylov_dim_ + 1) * dense_b->get_size()[1]});
     auto next_krylov_basis = Vector::create_with_config_of(dense_b);
-    auto preconditioned_vector = Vector::create_with_config_of(dense_b);
+    std::shared_ptr<matrix::Dense<ValueType>> preconditioned_vector =
+        Vector::create_with_config_of(dense_b);
     auto hessenberg = Vector::create(
         exec, dim<2>{krylov_dim_ + 1, krylov_dim_ * dense_b->get_size()[1]});
     auto givens_sin =
@@ -200,7 +209,7 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
         }

         apply_preconditioner(get_preconditioner().get(), krylov_bases.get(),
-                             preconditioned_vector.get(), restart_iter);
+                             preconditioned_vector, restart_iter);
         // preconditioned_vector = get_preconditioner() *
         // krylov_bases(:, restart_iter)

cuda/solver/gmres_kernels.cu (74 changes: 43 additions & 31 deletions)
@@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/base/cublas_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
+#include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -62,6 +63,8 @@ namespace gmres {


 constexpr int default_block_size = 512;
+constexpr int default_dot_dim = cuda_config::warp_size;
+constexpr int default_dot_size = default_dot_dim * default_dot_dim;


 // Must be called with at least `max(stride_b * num_rows, krylov_dim *
@@ -227,10 +230,8 @@ __global__
 }


-// Must be called with at least `num_cols` blocks, each with `block_size`
-// threads. `block_size` must be a power of 2.
-template <int block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void update_hessenberg_kernel(
+template <typename ValueType>
+__global__ __launch_bounds__(default_dot_size) void multidot_kernel(
     size_type k, size_type num_rows, size_type num_cols,
     const ValueType *__restrict__ next_krylov_basis,
     size_type stride_next_krylov, const ValueType *__restrict__ krylov_bases,
@@ -239,34 +240,43 @@ __global__ __launch_bounds__(block_size) void update_hessenberg_kernel(
     const stopping_status *__restrict__ stop_status)
 {
     const auto tidx = threadIdx.x;
-    const auto col_idx = blockIdx.x;
-
+    const auto tidy = threadIdx.y;
+    const auto col_idx = blockIdx.x * default_dot_dim + tidx;
+    const auto num = ceildiv(num_rows, gridDim.y);
+    const auto start_row = blockIdx.y * num;
+    const auto end_row =
+        ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;
     // Used that way to get around dynamic initialization warning and
     // template error when using `reduction_helper_array` directly in `reduce`
-    __shared__ UninitializedArray<ValueType, block_size> reduction_helper_array;
+    __shared__
+        UninitializedArray<ValueType, default_dot_dim *(default_dot_dim + 1)>
+            reduction_helper_array;
     ValueType *__restrict__ reduction_helper = reduction_helper_array;

+    ValueType local_res = zero<ValueType>();
+    const auto krylov_col = k * num_cols + col_idx;
     if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) {
-        ValueType local_res{};
-        const auto krylov_col = k * num_cols + col_idx;
-        for (size_type i = tidx; i < num_rows; i += block_size) {
+        for (size_type i = start_row + tidy; i < end_row;
+             i += default_dot_dim) {
             const auto next_krylov_idx = i * stride_next_krylov + col_idx;
             const auto krylov_idx = i * stride_krylov + krylov_col;
-
             local_res +=
                 next_krylov_basis[next_krylov_idx] * krylov_bases[krylov_idx];
         }
-
-        reduction_helper[tidx] = local_res;
-
-        // Perform thread block reduction. Result is in reduction_helper[0]
-        reduce(group::this_thread_block(), reduction_helper,
+    }
+    reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res;
+    __syncthreads();
+    local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx];
+    const auto tile_block =
+        group::tiled_partition<default_dot_dim>(group::this_thread_block());
+    const auto sum =
+        reduce(tile_block, local_res,
               [](const ValueType &a, const ValueType &b) { return a + b; });
-
-        if (tidx == 0) {
-            const auto hessenberg_idx = k * stride_hessenberg + col_idx;
-            hessenberg_iter[hessenberg_idx] = reduction_helper[0];
-        }
+    const auto new_col_idx = blockIdx.x * default_dot_dim + tidy;
+    if (tidx == 0 && new_col_idx < num_cols &&
+        !stop_status[new_col_idx].has_stopped()) {
+        const auto hessenberg_idx = k * stride_hessenberg + new_col_idx;
+        atomic_add(hessenberg_iter + hessenberg_idx, sum);
     }
 }

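To clarify what the new kernel computes, the following sequential C++ reference (written for this description, not taken from the PR) forms row k of the Hessenberg matrix, one dot product per right-hand side; the stop_status handling is omitted. The CUDA version splits the row range over blockIdx.y, so each block only produces a partial sum per column, which is why the result is accumulated with atomic_add into a Hessenberg row that the caller zeroes beforehand.

#include <cstddef>
#include <vector>

// hessenberg_row[j] = sum_i krylov_bases(i, k * num_cols + j) * next_krylov(i, j)
void multidot_reference(std::size_t k, std::size_t num_rows, std::size_t num_cols,
                        const std::vector<double> &next_krylov,   // row-major, leading dim stride_next_krylov
                        std::size_t stride_next_krylov,
                        const std::vector<double> &krylov_bases,  // row-major, leading dim stride_krylov
                        std::size_t stride_krylov,
                        std::vector<double> &hessenberg_row)      // num_cols entries
{
    for (std::size_t j = 0; j < num_cols; ++j) {
        double sum = 0.0;
        const auto krylov_col = k * num_cols + j;
        for (std::size_t i = 0; i < num_rows; ++i) {
            sum += next_krylov[i * stride_next_krylov + j] *
                   krylov_bases[i * stride_krylov + krylov_col];
        }
        hessenberg_row[j] = sum;
    }
}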
@@ -381,17 +391,19 @@ void finish_arnoldi(std::shared_ptr<const CudaExecutor> exec,
     const auto stride_krylov = krylov_bases->get_stride();
     const auto stride_hessenberg = hessenberg_iter->get_stride();
     const auto dim_size = next_krylov_basis->get_size();
-
+    auto cublas_handle = exec->get_cublas_handle();
+    const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim),
+                         exec->get_num_multiprocessor() * 2);
+    const dim3 block_size(default_dot_dim, default_dot_dim);
     for (size_type k = 0; k < iter + 1; ++k) {
-        update_hessenberg_kernel<default_block_size>
-            <<<dim_size[1], default_block_size>>>(
-                k, dim_size[0], dim_size[1],
-                as_cuda_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov,
-                as_cuda_type(krylov_bases->get_const_values()), stride_krylov,
-                as_cuda_type(hessenberg_iter->get_values()), stride_hessenberg,
-                as_cuda_type(stop_status));
-
+        zero_array(dim_size[1],
+                   hessenberg_iter->get_values() + k * stride_hessenberg);
+        multidot_kernel<<<grid_size, block_size>>>(
+            k, dim_size[0], dim_size[1],
+            as_cuda_type(next_krylov_basis->get_const_values()),
+            stride_next_krylov, as_cuda_type(krylov_bases->get_const_values()),
+            stride_krylov, as_cuda_type(hessenberg_iter->get_values()),
+            stride_hessenberg, as_cuda_type(stop_status));
         update_next_krylov_kernel<default_block_size>
             <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
                default_block_size>>>(
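The new launch configuration replaces the old one-block-per-column scheme: threads are arranged as default_dot_dim x default_dot_dim blocks (warp size squared), columns are tiled along blockIdx.x, and rows are split into 2 * num_multiprocessors slices along blockIdx.y, so even a single right-hand side keeps many blocks busy. Because every row slice adds its partial dot products into the same Hessenberg entries, the row is cleared with zero_array first and then accumulated with atomic_add. A rough back-of-the-envelope sketch with assumed example values (not measurements):

#include <cstddef>
#include <iostream>

int main()
{
    // Assumed example values: warp size 32, one right-hand side, 80 SMs.
    const std::size_t default_dot_dim = 32;
    const std::size_t num_cols = 1;
    const std::size_t num_sms = 80;

    const auto grid_x = (num_cols + default_dot_dim - 1) / default_dot_dim;  // ceildiv -> 1
    const auto grid_y = 2 * num_sms;                                         // 160 row slices
    std::cout << grid_x * grid_y << " blocks of "
              << default_dot_dim * default_dot_dim << " threads\n";          // 160 x 1024
}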