Scalar Jacobi specialization and kernels #808

Merged · 5 commits · Jul 2, 2021

47 changes: 47 additions & 0 deletions common/preconditioner/jacobi_kernels.hpp.inc
@@ -213,3 +213,50 @@ __launch_bounds__(warps_per_block *config::warp_size) adaptive_transpose_jacobi(
});
}
}


namespace kernel {


template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void scalar_apply(
size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag,
const ValueType *__restrict__ alpha, size_type source_stride,
const ValueType *__restrict__ source_values,
const ValueType *__restrict__ beta, size_type result_stride,
ValueType *__restrict__ result_values)
{
const auto tidx = thread::get_thread_id_flat();
const auto row = tidx / num_cols;
const auto col = tidx % num_cols;

if (row < num_rows) {
auto diag_val =
diag[row] == zero<ValueType>() ? one<ValueType>() : diag[row];
result_values[row * result_stride + col] =
result_values[row * result_stride + col] * beta[0] +
alpha[0] * source_values[row * source_stride + col] / diag_val;
}
}


template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void simple_scalar_apply(
size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag,
size_type source_stride, const ValueType *__restrict__ source_values,
size_type result_stride, ValueType *__restrict__ result_values)
{
const auto tidx = thread::get_thread_id_flat();
const auto row = tidx / num_cols;
const auto col = tidx % num_cols;

if (row < num_rows) {
auto diag_val =
diag[row] == zero<ValueType>() ? one<ValueType>() : diag[row];
result_values[row * result_stride + col] =
source_values[row * source_stride + col] / diag_val;
}
}


} // namespace kernel
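
For reference, a minimal host-side sketch (not part of this diff) of the per-entry update the two kernels above perform, namely x = beta * x + alpha * inv(D) * b, with zero diagonal entries treated as one to avoid division by zero; the function name and plain-loop structure are illustrative only:

#include <cstddef>

// Reference loop mirroring kernel::scalar_apply: each (row, col) entry of x is
// updated as beta * x + alpha * b / diag[row], guarding against zero diagonals.
template <typename ValueType>
void scalar_apply_reference(std::size_t num_rows, std::size_t num_cols,
                            const ValueType *diag, ValueType alpha,
                            std::size_t b_stride, const ValueType *b,
                            ValueType beta, std::size_t x_stride, ValueType *x)
{
    for (std::size_t row = 0; row < num_rows; ++row) {
        // Same zero-diagonal guard as the device kernels above.
        const auto diag_val =
            diag[row] == ValueType{} ? ValueType{1} : diag[row];
        for (std::size_t col = 0; col < num_cols; ++col) {
            x[row * x_stride + col] =
                beta * x[row * x_stride + col] +
                alpha * b[row * b_stride + col] / diag_val;
        }
    }
}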
11 changes: 11 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
@@ -1081,6 +1081,17 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);

template <typename ValueType>
GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);

template <typename ValueType>
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
88 changes: 55 additions & 33 deletions core/preconditioner/jacobi.cpp
@@ -57,7 +57,9 @@ namespace jacobi {


GKO_REGISTER_OPERATION(simple_apply, jacobi::simple_apply);
GKO_REGISTER_OPERATION(simple_scalar_apply, jacobi::simple_scalar_apply);
GKO_REGISTER_OPERATION(apply, jacobi::apply);
GKO_REGISTER_OPERATION(scalar_apply, jacobi::scalar_apply);
GKO_REGISTER_OPERATION(find_blocks, jacobi::find_blocks);
GKO_REGISTER_OPERATION(generate, jacobi::generate);
GKO_REGISTER_OPERATION(transpose_jacobi, jacobi::transpose_jacobi);
@@ -74,10 +76,15 @@ void Jacobi<ValueType, IndexType>::apply_impl(const LinOp *b, LinOp *x) const
{
precision_dispatch_real_complex<ValueType>(
[this](auto dense_b, auto dense_x) {
this->get_executor()->run(jacobi::make_simple_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_b, dense_x));
if (parameters_.max_block_size == 1) {
this->get_executor()->run(jacobi::make_simple_scalar_apply(
this->blocks_, dense_b, dense_x));
} else {
this->get_executor()->run(jacobi::make_simple_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_b, dense_x));
}
},
b, x);
}
@@ -90,11 +97,16 @@ void Jacobi<ValueType, IndexType>::apply_impl(const LinOp *alpha,
{
precision_dispatch_real_complex<ValueType>(
[this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
this->get_executor()->run(jacobi::make_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_alpha, dense_b,
dense_beta, dense_x));
if (parameters_.max_block_size == 1) {
this->get_executor()->run(jacobi::make_scalar_apply(
this->blocks_, dense_alpha, dense_b, dense_beta, dense_x));
} else {
this->get_executor()->run(jacobi::make_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_alpha, dense_b,
dense_beta, dense_x));
}
},
alpha, b, beta, x);
}
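
A usage sketch (assumptions: the standard Ginkgo factory chain and public apply signature; only the max_block_size parameter comes from this PR's parameters_): building the preconditioner with max_block_size set to 1 routes both apply overloads above through the new scalar kernels.

#include <memory>

#include <ginkgo/ginkgo.hpp>

// Hypothetical helper: generates a scalar Jacobi preconditioner and applies it
// once, computing x = inv(diag(A)) * b on whichever executor is passed in.
template <typename ValueType, typename IndexType>
void apply_scalar_jacobi(
    std::shared_ptr<const gko::Executor> exec,
    std::shared_ptr<gko::matrix::Csr<ValueType, IndexType>> mtx,
    const gko::matrix::Dense<ValueType> *b, gko::matrix::Dense<ValueType> *x)
{
    auto factory = gko::preconditioner::Jacobi<ValueType, IndexType>::build()
                       .with_max_block_size(1u)  // block size 1 -> scalar path
                       .on(exec);
    auto precond = factory->generate(mtx);
    precond->apply(b, x);  // dispatches to simple_scalar_apply above
}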
@@ -218,33 +230,43 @@ void Jacobi<ValueType, IndexType>::generate(const LinOp *system_matrix,
GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
using csr_type = matrix::Csr<ValueType, IndexType>;
const auto exec = this->get_executor();
auto csr_mtx =
convert_to_with_sorting<csr_type>(exec, system_matrix, skip_sorting);

if (parameters_.block_pointers.get_data() == nullptr) {
this->detect_blocks(csr_mtx.get());
}
if (parameters_.max_block_size == 1) {
auto diag = as<DiagonalExtractable<ValueType>>(system_matrix)
->extract_diagonal();
auto temp = gko::Array<ValueType>::view(
exec, system_matrix->get_size()[0], diag->get_values());
this->blocks_ = temp;
} else {
auto csr_mtx = convert_to_with_sorting<csr_type>(exec, system_matrix,
skip_sorting);

if (parameters_.block_pointers.get_data() == nullptr) {
this->detect_blocks(csr_mtx.get());
}

const auto all_block_opt = parameters_.storage_optimization.of_all_blocks;
auto &precisions = parameters_.storage_optimization.block_wise;
// if adaptive version is used, make sure that the precision array is of
// the correct size by replicating it multiple times if needed
if (parameters_.storage_optimization.is_block_wise ||
all_block_opt != precision_reduction(0, 0)) {
if (!parameters_.storage_optimization.is_block_wise) {
precisions = gko::Array<precision_reduction>(exec, {all_block_opt});
const auto all_block_opt =
parameters_.storage_optimization.of_all_blocks;
auto &precisions = parameters_.storage_optimization.block_wise;
// if adaptive version is used, make sure that the precision array is of
// the correct size by replicating it multiple times if needed
if (parameters_.storage_optimization.is_block_wise ||
all_block_opt != precision_reduction(0, 0)) {
if (!parameters_.storage_optimization.is_block_wise) {
precisions =
gko::Array<precision_reduction>(exec, {all_block_opt});
}
Array<precision_reduction> tmp(
exec, parameters_.block_pointers.get_num_elems() - 1);
exec->run(jacobi::make_initialize_precisions(precisions, tmp));
precisions = std::move(tmp);
conditioning_.resize_and_reset(num_blocks_);
}
Array<precision_reduction> tmp(
exec, parameters_.block_pointers.get_num_elems() - 1);
exec->run(jacobi::make_initialize_precisions(precisions, tmp));
precisions = std::move(tmp);
conditioning_.resize_and_reset(num_blocks_);
}

exec->run(jacobi::make_generate(
csr_mtx.get(), num_blocks_, parameters_.max_block_size,
parameters_.accuracy, storage_scheme_, conditioning_, precisions,
parameters_.block_pointers, blocks_));
exec->run(jacobi::make_generate(
csr_mtx.get(), num_blocks_, parameters_.max_block_size,
parameters_.accuracy, storage_scheme_, conditioning_, precisions,
parameters_.block_pointers, blocks_));
}
}


17 changes: 17 additions & 0 deletions core/preconditioner/jacobi_kernels.hpp
@@ -74,6 +74,12 @@ namespace kernels {
const matrix::Dense<ValueType> *b, \
const matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType) \
void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec, \
const Array<ValueType> &diag, \
const matrix::Dense<ValueType> *b, \
matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType) \
void simple_apply( \
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
@@ -85,6 +91,13 @@ namespace kernels {
const Array<ValueType> &blocks, const matrix::Dense<ValueType> *b, \
matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType) \
void scalar_apply( \
std::shared_ptr<const DefaultExecutor> exec, \
const Array<ValueType> &diag, const matrix::Dense<ValueType> *alpha, \
const matrix::Dense<ValueType> *b, \
const matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType) \
void transpose_jacobi( \
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
@@ -127,8 +140,12 @@ namespace kernels {
GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_GENERATE_KERNEL(ValueType, IndexType); \
template <typename ValueType> \
GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_APPLY_KERNEL(ValueType, IndexType); \
template <typename ValueType> \
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
67 changes: 62 additions & 5 deletions cuda/preconditioner/jacobi_kernels.cu
@@ -65,12 +65,14 @@ constexpr int default_num_warps = 32;
// current GPUs have at most 84 SMs)
constexpr int default_grid_size = 32 * 32 * 128;

constexpr int default_block_size = 512;


#include "common/preconditioner/jacobi_kernels.hpp.inc"


template <typename ValueType, typename IndexType>
size_type find_natural_blocks(std::shared_ptr<const CudaExecutor> exec,
size_type find_natural_blocks(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType> *mtx,
int32 max_block_size,
IndexType *__restrict__ block_ptrs)
@@ -95,7 +97,7 @@ size_type find_natural_blocks(std::shared_ptr<const CudaExecutor> exec,

template <typename IndexType>
inline size_type agglomerate_supervariables(
std::shared_ptr<const CudaExecutor> exec, int32 max_block_size,
std::shared_ptr<const DefaultExecutor> exec, int32 max_block_size,
size_type num_natural_blocks, IndexType *block_ptrs)
{
Array<size_type> nums(exec, 1);
@@ -111,7 +113,7 @@ inline size_type agglomerate_supervariables(
} // namespace


void initialize_precisions(std::shared_ptr<const CudaExecutor> exec,
void initialize_precisions(std::shared_ptr<const DefaultExecutor> exec,
const Array<precision_reduction> &source,
Array<precision_reduction> &precisions)
{
@@ -126,7 +128,7 @@ void initialize_precisions(std::shared_ptr<const CudaExecutor> exec,


template <typename ValueType, typename IndexType>
void find_blocks(std::shared_ptr<const CudaExecutor> exec,
void find_blocks(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType> *system_matrix,
uint32 max_block_size, size_type &num_blocks,
Array<IndexType> &block_pointers)
@@ -230,7 +232,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

template <typename ValueType, typename IndexType>
void convert_to_dense(
std::shared_ptr<const CudaExecutor> exec, size_type num_blocks,
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
const Array<precision_reduction> &block_precisions,
const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
const preconditioner::block_interleaved_storage_scheme<IndexType>
@@ -241,6 +243,61 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);


template <typename ValueType>
void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
const Array<ValueType> &diag,
const matrix::Dense<ValueType> *alpha,
const matrix::Dense<ValueType> *b,
const matrix::Dense<ValueType> *beta,
matrix::Dense<ValueType> *x)
{
const auto b_size = b->get_size();
const auto num_rows = b_size[0];
const auto num_cols = b_size[1];
const auto b_stride = b->get_stride();
const auto x_stride = x->get_stride();
const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size);

const auto b_values = b->get_const_values();
const auto diag_values = diag.get_const_data();
auto x_values = x->get_values();

kernel::scalar_apply<<<grid_dim, default_block_size>>>(
num_rows, num_cols, as_cuda_type(diag_values),
as_cuda_type(alpha->get_const_values()), b_stride,
as_cuda_type(b_values), as_cuda_type(beta->get_const_values()),
x_stride, as_cuda_type(x_values));
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);


template <typename ValueType>
void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
const Array<ValueType> &diag,
const matrix::Dense<ValueType> *b,
matrix::Dense<ValueType> *x)
{
const auto b_size = b->get_size();
const auto num_rows = b_size[0];
const auto num_cols = b_size[1];
const auto b_stride = b->get_stride();
const auto x_stride = x->get_stride();
const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size);

const auto b_values = b->get_const_values();
const auto diag_values = diag.get_const_data();
auto x_values = x->get_values();

kernel::simple_scalar_apply<<<grid_dim, default_block_size>>>(
num_rows, num_cols, as_cuda_type(diag_values), b_stride,
as_cuda_type(b_values), x_stride, as_cuda_type(x_values));
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);


} // namespace jacobi
} // namespace cuda
} // namespace kernels
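
As a standalone illustration (plain CUDA with double data, no Ginkgo types; all names here are made up for the example) of the launch scheme used above: one thread per entry of the dense right-hand side, where tidx / num_cols gives the row, tidx % num_cols the column, and the grid is sized as ceildiv(num_rows * num_cols, default_block_size).

#include <cstddef>

constexpr int demo_block_size = 512;  // mirrors default_block_size above

// Flat thread-to-entry mapping: thread tidx handles entry
// (tidx / num_cols, tidx % num_cols) of the strided dense vectors.
__global__ void simple_scalar_apply_demo(std::size_t num_rows,
                                         std::size_t num_cols,
                                         const double *diag,
                                         std::size_t b_stride, const double *b,
                                         std::size_t x_stride, double *x)
{
    const auto tidx =
        static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const auto row = tidx / num_cols;
    const auto col = tidx % num_cols;
    if (row < num_rows) {
        const auto diag_val = diag[row] == 0.0 ? 1.0 : diag[row];
        x[row * x_stride + col] = b[row * b_stride + col] / diag_val;
    }
}

// Grid sizing equivalent to ceildiv(num_rows * num_cols, default_block_size).
void launch_simple_scalar_apply_demo(std::size_t num_rows,
                                     std::size_t num_cols, const double *diag,
                                     std::size_t b_stride, const double *b,
                                     std::size_t x_stride, double *x)
{
    const auto num_entries = num_rows * num_cols;
    const auto grid_size = static_cast<unsigned int>(
        (num_entries + demo_block_size - 1) / demo_block_size);
    simple_scalar_apply_demo<<<grid_size, demo_block_size>>>(
        num_rows, num_cols, diag, b_stride, b, x_stride, x);
}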