Skip to content

Commit

Permalink
add Jacobi transpose kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
upsj committed Jun 7, 2020
1 parent 02d9ff5 commit 177e0d8
Show file tree
Hide file tree
Showing 13 changed files with 774 additions and 7 deletions.
72 changes: 72 additions & 0 deletions common/preconditioner/jacobi_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,75 @@ __global__ void agglomerate_supervariables_kernel(
block_ptrs[num_blocks] = block_ptrs[num_natural_blocks];
num_blocks_arr[0] = num_blocks;
}


template <bool conjugate, int max_block_size, int subwarp_size,
int warps_per_block, typename ValueType, typename IndexType>
__global__ void __launch_bounds__(warps_per_block *config::warp_size)
transpose_jacobi(const ValueType *__restrict__ blocks,
preconditioner::block_interleaved_storage_scheme<IndexType>
storage_scheme,
const IndexType *__restrict__ block_ptrs,
size_type num_blocks, ValueType *__restrict__ out_blocks)
{
const auto block_id =
thread::get_subwarp_id<subwarp_size, warps_per_block>();
const auto subwarp =
group::tiled_partition<subwarp_size>(group::this_thread_block());
if (block_id >= num_blocks) {
return;
}
const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];

const auto block_ofs = storage_scheme.get_global_block_offset(block_id);
const auto block_stride = storage_scheme.get_stride();
const auto rank = subwarp.thread_rank();
if (rank < block_size) {
for (IndexType i = 0; i < block_size; ++i) {
auto val = blocks[block_ofs + i * block_stride + rank];
out_blocks[block_ofs + i + rank * block_stride] =
conjugate ? conj(val) : val;
}
}
}


template <bool conjugate, int max_block_size, int subwarp_size,
int warps_per_block, typename ValueType, typename IndexType>
__global__ void
__launch_bounds__(warps_per_block *config::warp_size) adaptive_transpose_jacobi(
const ValueType *__restrict__ blocks,
preconditioner::block_interleaved_storage_scheme<IndexType> storage_scheme,
const precision_reduction *__restrict__ block_precisions,
const IndexType *__restrict__ block_ptrs, size_type num_blocks,
ValueType *__restrict__ out_blocks)
{
const auto block_id =
thread::get_subwarp_id<subwarp_size, warps_per_block>();
const auto subwarp =
group::tiled_partition<subwarp_size>(group::this_thread_block());
if (block_id >= num_blocks) {
return;
}
const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id];

const auto block_stride = storage_scheme.get_stride();
const auto rank = subwarp.thread_rank();
if (rank < block_size) {
GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
ValueType, block_precisions[block_id],
auto local_block =
reinterpret_cast<const resolved_precision *>(
blocks + storage_scheme.get_group_offset(block_id)) +
storage_scheme.get_block_offset(block_id);
auto local_out_block =
reinterpret_cast<resolved_precision *>(
out_blocks + storage_scheme.get_group_offset(block_id)) +
storage_scheme.get_block_offset(block_id);
for (IndexType i = 0; i < block_size; ++i) {
auto val = local_block[i * block_stride + rank];
local_out_block[i + rank * block_stride] =
conjugate ? conj(val) : val;
});
}
}
18 changes: 18 additions & 0 deletions core/base/extended_float.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,14 @@ class half {
return static_cast<float64>(static_cast<float32>(*this));
}

GKO_ATTRIBUTES half operator-() const noexcept
{
auto res = *this;
// flip sign bit
res.data_ ^= f16_traits::sign_mask;
return res;
}

private:
using f16_traits = detail::float_traits<float16>;
using f32_traits = detail::float_traits<float32>;
Expand Down Expand Up @@ -456,6 +464,16 @@ class truncated {
return reinterpret_cast<const float_type &>(bits);
}

GKO_ATTRIBUTES truncated operator-() const noexcept
{
auto res = *this;
// flip sign bit
if (ComponentId == 0) {
res.data_ ^= bits_type{1} << (8 * sizeof(bits_type) - 1);
}
return res;
}

private:
bits_type data_;
};
Expand Down
12 changes: 12 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,18 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
Expand Down
45 changes: 43 additions & 2 deletions core/preconditioner/jacobi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ GKO_REGISTER_OPERATION(simple_apply, jacobi::simple_apply);
GKO_REGISTER_OPERATION(apply, jacobi::apply);
GKO_REGISTER_OPERATION(find_blocks, jacobi::find_blocks);
GKO_REGISTER_OPERATION(generate, jacobi::generate);
GKO_REGISTER_OPERATION(transpose_jacobi, jacobi::transpose_jacobi);
GKO_REGISTER_OPERATION(conj_transpose_jacobi, jacobi::conj_transpose_jacobi);
GKO_REGISTER_OPERATION(convert_to_dense, jacobi::convert_to_dense);
GKO_REGISTER_OPERATION(initialize_precisions, jacobi::initialize_precisions);

Expand Down Expand Up @@ -142,6 +144,46 @@ void Jacobi<ValueType, IndexType>::write(mat_data &data) const
}


template <typename ValueType, typename IndexType>
std::unique_ptr<LinOp> Jacobi<ValueType, IndexType>::transpose() const
{
auto res = std::unique_ptr<Jacobi<ValueType, IndexType>>(
new Jacobi<ValueType, IndexType>(this->get_executor()));
res->set_size(this->get_size());
res->storage_scheme_ = storage_scheme_;
res->num_blocks_ = num_blocks_;
res->blocks_.resize_and_reset(blocks_.get_num_elems());
res->conditioning_ = conditioning_;
res->parameters_ = parameters_;
this->get_executor()->run(jacobi::make_transpose_jacobi(
num_blocks_, parameters_.max_block_size,
parameters_.storage_optimization.block_wise, parameters_.block_pointers,
blocks_, storage_scheme_, res->blocks_));

return res;
}


template <typename ValueType, typename IndexType>
std::unique_ptr<LinOp> Jacobi<ValueType, IndexType>::conj_transpose() const
{
auto res = std::unique_ptr<Jacobi<ValueType, IndexType>>(
new Jacobi<ValueType, IndexType>(this->get_executor()));
res->set_size(this->get_size());
res->storage_scheme_ = storage_scheme_;
res->num_blocks_ = num_blocks_;
res->blocks_.resize_and_reset(blocks_.get_num_elems());
res->conditioning_ = conditioning_;
res->parameters_ = parameters_;
this->get_executor()->run(jacobi::make_conj_transpose_jacobi(
num_blocks_, parameters_.max_block_size,
parameters_.storage_optimization.block_wise, parameters_.block_pointers,
blocks_, storage_scheme_, res->blocks_));

return res;
}


template <typename ValueType, typename IndexType>
void Jacobi<ValueType, IndexType>::detect_blocks(
const matrix::Csr<ValueType, IndexType> *system_matrix)
Expand All @@ -159,8 +201,7 @@ void Jacobi<ValueType, IndexType>::detect_blocks(
template <typename ValueType, typename IndexType>
void Jacobi<ValueType, IndexType>::generate(const LinOp *system_matrix)
{
GKO_ASSERT_EQUAL_DIMENSIONS(system_matrix,
transpose(system_matrix->get_size()));
GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
const auto exec = this->get_executor();
const auto csr_mtx = copy_and_convert_to<matrix::Csr<ValueType, IndexType>>(
exec, system_matrix);
Expand Down
26 changes: 26 additions & 0 deletions core/preconditioner/jacobi_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,28 @@ namespace kernels {
const Array<ValueType> &blocks, const matrix::Dense<ValueType> *b, \
matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType) \
void transpose_jacobi( \
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
uint32 max_block_size, \
const Array<precision_reduction> &block_precisions, \
const Array<IndexType> &block_pointers, \
const Array<ValueType> &blocks, \
const preconditioner::block_interleaved_storage_scheme<IndexType> \
&storage_scheme, \
Array<ValueType> &out_blocks)

#define GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType) \
void conj_transpose_jacobi( \
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
uint32 max_block_size, \
const Array<precision_reduction> &block_precisions, \
const Array<IndexType> &block_pointers, \
const Array<ValueType> &blocks, \
const preconditioner::block_interleaved_storage_scheme<IndexType> \
&storage_scheme, \
Array<ValueType> &out_blocks)

#define GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \
void convert_to_dense( \
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
Expand All @@ -110,6 +132,10 @@ namespace kernels {
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \
GKO_DECLARE_JACOBI_INITIALIZE_PRECISIONS_KERNEL()

Expand Down
90 changes: 90 additions & 0 deletions cuda/preconditioner/jacobi_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include "core/base/extended_float.hpp"
#include "core/preconditioner/jacobi_utils.hpp"
#include "core/synthesizer/implementation_selection.hpp"
#include "cuda/base/config.hpp"
#include "cuda/base/math.hpp"
#include "cuda/base/types.hpp"
#include "cuda/components/cooperative_groups.cuh"
#include "cuda/components/thread_ids.cuh"
#include "cuda/preconditioner/jacobi_common.hpp"


namespace gko {
Expand Down Expand Up @@ -138,6 +141,93 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL);


namespace {


template <bool conjugate, int warps_per_block, int max_block_size,
typename ValueType, typename IndexType>
void transpose_jacobi(
syn::value_list<int, max_block_size>, size_type num_blocks,
const precision_reduction *block_precisions,
const IndexType *block_pointers, const ValueType *blocks,
const preconditioner::block_interleaved_storage_scheme<IndexType>
&storage_scheme,
ValueType *out_blocks)
{
constexpr int subwarp_size = get_larger_power(max_block_size);
constexpr int blocks_per_warp = config::warp_size / subwarp_size;
const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp),
1, 1);
const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block);

if (block_precisions) {
adaptive_transpose_jacobi<conjugate, max_block_size, subwarp_size,
warps_per_block>
<<<grid_size, block_size, 0, 0>>>(
as_cuda_type(blocks), storage_scheme, block_precisions,
block_pointers, num_blocks, as_cuda_type(out_blocks));
} else {
transpose_jacobi<conjugate, max_block_size, subwarp_size,
warps_per_block><<<grid_size, block_size, 0, 0>>>(
as_cuda_type(blocks), storage_scheme, block_pointers, num_blocks,
as_cuda_type(out_blocks));
}
}

GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_jacobi, transpose_jacobi);


} // namespace


template <typename ValueType, typename IndexType>
void transpose_jacobi(
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
uint32 max_block_size, const Array<precision_reduction> &block_precisions,
const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
const preconditioner::block_interleaved_storage_scheme<IndexType>
&storage_scheme,
Array<ValueType> &out_blocks)
{
select_transpose_jacobi(
compiled_kernels(),
[&](int compiled_block_size) {
return max_block_size <= compiled_block_size;
},
syn::value_list<int, false, config::min_warps_per_block>(),
syn::type_list<>(), num_blocks, block_precisions.get_const_data(),
block_pointers.get_const_data(), blocks.get_const_data(),
storage_scheme, out_blocks.get_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL);


template <typename ValueType, typename IndexType>
void conj_transpose_jacobi(
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
uint32 max_block_size, const Array<precision_reduction> &block_precisions,
const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
const preconditioner::block_interleaved_storage_scheme<IndexType>
&storage_scheme,
Array<ValueType> &out_blocks)
{
select_transpose_jacobi(
compiled_kernels(),
[&](int compiled_block_size) {
return max_block_size <= compiled_block_size;
},
syn::value_list<int, true, config::min_warps_per_block>(),
syn::type_list<>(), num_blocks, block_precisions.get_const_data(),
block_pointers.get_const_data(), blocks.get_const_data(),
storage_scheme, out_blocks.get_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_CONJ_TRANSPOSE_KERNEL);


template <typename ValueType, typename IndexType>
void convert_to_dense(
std::shared_ptr<const CudaExecutor> exec, size_type num_blocks,
Expand Down
Loading

0 comments on commit 177e0d8

Please sign in to comment.