fix ell error on small mtx and flexible warp_size
yhmtsai committed Oct 30, 2019
1 parent ab7e3ab commit 29c6692
Showing 2 changed files with 108 additions and 39 deletions.
69 changes: 36 additions & 33 deletions cuda/matrix/ell_kernels.cu
@@ -85,10 +85,11 @@ constexpr double ratio = 1e-2;
/**
* A compile-time list of sub-warp sizes for which the spmv kernels should be
* compiled.
* 0 is a special case where it uses a sub-warp size of 32 in
* 0 is a special case where it uses a sub-warp size of warp_size in
* combination with atomic_adds.
*/
using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
using compiled_kernels =
syn::value_list<int, 0, 1, 2, 4, 8, 16, 32, cuda_config::warp_size>;
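The new list appends cuda_config::warp_size after the literal sizes, so both the special atomic entry 0 and a full warp stay available even when the warp size is not 32. As a rough illustration of how such a compile-time value list can drive runtime kernel selection, here is a minimal stand-in (hypothetical names, warp_size hard-coded to 32; not Ginkgo's actual syn machinery):

#include <cstdio>
#include <type_traits>

template <typename T, T... Values>
struct value_list {};

constexpr int warp_size = 32;  // assumption: warp size of current CUDA devices

// Walk the list until the predicate accepts a value, then hand that value
// back to fn as a compile-time constant (mirroring template instantiation).
template <int First, int... Rest, typename Predicate, typename Fn>
void select(value_list<int, First, Rest...>, Predicate pred, Fn fn)
{
    if (pred(First)) {
        fn(std::integral_constant<int, First>{});
    } else if constexpr (sizeof...(Rest) > 0) {
        select(value_list<int, Rest...>{}, pred, fn);
    }
}

using compiled_kernels = value_list<int, 0, 1, 2, 4, 8, 16, 32, warp_size>;

int main()
{
    const int info = 8;  // runtime choice, e.g. the computed sub-warp size
    select(compiled_kernels{}, [&](int v) { return v == info; },
           [](auto compiled) { std::printf("launch<%d>\n", compiled.value); });
}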


namespace kernel {
@@ -97,7 +98,7 @@ namespace {

template <int subwarp_size, bool atomic, typename ValueType, typename IndexType,
typename Closure>
__device__ void spmv_kernel(const size_type num_rows,
__device__ void spmv_kernel(const size_type num_rows, const int nwarps_per_row,
const ValueType *__restrict__ val,
const IndexType *__restrict__ col,
const size_type stride,
@@ -108,9 +109,7 @@ __device__ void spmv_kernel(const size_type num_rows,
{
const auto tidx =
static_cast<IndexType>(blockDim.x) * blockIdx.x + threadIdx.x;
const auto nwarps_per_row =
gridDim.x * blockDim.x / num_rows / subwarp_size;
const auto x = tidx / subwarp_size / nwarps_per_row;
const IndexType x = tidx / subwarp_size / nwarps_per_row;
const auto warp_id = tidx / subwarp_size % nwarps_per_row;
const auto y_start = tidx % subwarp_size +
num_stored_elements_per_row * warp_id / nwarps_per_row;
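The index arithmetic above assigns each row nwarps_per_row sub-warp slices of subwarp_size threads, with each slice starting at a different offset into the row's stored elements. A small host-side sketch (same formulas as the kernel, illustrative sizes) makes the mapping concrete:

#include <cstdio>

int main()
{
    // Illustrative sizes: 4-thread sub-warps, 2 slices per row, 8 stored
    // elements per row; 16 threads then cover rows 0 and 1.
    const int subwarp_size = 4, nwarps_per_row = 2;
    const int num_stored_elements_per_row = 8;
    for (int tidx = 0; tidx < 16; ++tidx) {
        const int x = tidx / subwarp_size / nwarps_per_row;        // row
        const int warp_id = tidx / subwarp_size % nwarps_per_row;  // slice
        const int y_start =
            tidx % subwarp_size +
            num_stored_elements_per_row * warp_id / nwarps_per_row;
        std::printf("tidx %2d -> row %d, slice %d, y_start %d\n", tidx, x,
                    warp_id, y_start);
    }
}

Passing nwarps_per_row in as a kernel argument rather than re-deriving it from gridDim appears to be the small-matrix fix: once the grid is rounded up to whole blocks, the removed expression gridDim.x * blockDim.x / num_rows / subwarp_size over-counts (one row with sub-warp size 8 and a 512-thread block yields 64 instead of 1), mis-mapping threads to rows.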
@@ -148,24 +147,26 @@ __device__ void spmv_kernel(const size_type num_rows,
template <int subwarp_size, bool atomic = false, typename ValueType,
typename IndexType>
__global__ __launch_bounds__(default_block_size) void spmv(
const size_type num_rows, const ValueType *__restrict__ val,
const IndexType *__restrict__ col, const size_type stride,
const size_type num_stored_elements_per_row,
const size_type num_rows, const int nwarps_per_row,
const ValueType *__restrict__ val, const IndexType *__restrict__ col,
const size_type stride, const size_type num_stored_elements_per_row,
const ValueType *__restrict__ b, const size_type b_stride,
ValueType *__restrict__ c, const size_type c_stride)
{
spmv_kernel<subwarp_size, atomic>(
num_rows, val, col, stride, num_stored_elements_per_row, b, b_stride, c,
c_stride, [](const ValueType &x, const ValueType &y) { return x; });
num_rows, nwarps_per_row, val, col, stride, num_stored_elements_per_row,
b, b_stride, c, c_stride,
[](const ValueType &x, const ValueType &y) { return x; });
}


template <int subwarp_size, bool atomic = false, typename ValueType,
typename IndexType>
__global__ __launch_bounds__(default_block_size) void spmv(
const size_type num_rows, const ValueType *__restrict__ alpha,
const ValueType *__restrict__ val, const IndexType *__restrict__ col,
const size_type stride, const size_type num_stored_elements_per_row,
const size_type num_rows, const int nwarps_per_row,
const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
const IndexType *__restrict__ col, const size_type stride,
const size_type num_stored_elements_per_row,
const ValueType *__restrict__ b, const size_type b_stride,
const ValueType *__restrict__ beta, ValueType *__restrict__ c,
const size_type c_stride)
@@ -178,15 +179,15 @@ __global__ __launch_bounds__(default_block_size) void spmv(
// operation.
if (atomic) {
spmv_kernel<subwarp_size, atomic>(
num_rows, val, col, stride, num_stored_elements_per_row, b,
b_stride, c, c_stride,
num_rows, nwarps_per_row, val, col, stride,
num_stored_elements_per_row, b, b_stride, c, c_stride,
[&alpha_val](const ValueType &x, const ValueType &y) {
return alpha_val * x;
});
} else {
spmv_kernel<subwarp_size, atomic>(
num_rows, val, col, stride, num_stored_elements_per_row, b,
b_stride, c, c_stride,
num_rows, nwarps_per_row, val, col, stride,
num_stored_elements_per_row, b, b_stride, c, c_stride,
[&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
return alpha_val * x + beta_val * y;
});
Expand All @@ -210,21 +211,22 @@ void abstract_spmv(syn::value_list<int, info>, int nwarps_per_row,
const matrix::Dense<ValueType> *beta = nullptr)
{
const auto nrows = a->get_size()[0];
constexpr int subwarp_size = (info == 0) ? 32 : info;
constexpr int subwarp_size = (info == 0) ? cuda_config::warp_size : info;
constexpr bool atomic = (info == 0);
const dim3 block_size(default_block_size, 1, 1);
const dim3 grid_size(
ceildiv(nrows * subwarp_size * nwarps_per_row, block_size.x),
b->get_size()[1], 1);
if (alpha == nullptr && beta == nullptr) {
kernel::spmv<subwarp_size, atomic><<<grid_size, block_size, 0, 0>>>(
nrows, as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
a->get_stride(), a->get_num_stored_elements_per_row(),
nrows, nwarps_per_row, as_cuda_type(a->get_const_values()),
a->get_const_col_idxs(), a->get_stride(),
a->get_num_stored_elements_per_row(),
as_cuda_type(b->get_const_values()), b->get_stride(),
as_cuda_type(c->get_values()), c->get_stride());
} else if (alpha != nullptr && beta != nullptr) {
kernel::spmv<subwarp_size, atomic><<<grid_size, block_size, 0, 0>>>(
nrows, as_cuda_type(alpha->get_const_values()),
nrows, nwarps_per_row, as_cuda_type(alpha->get_const_values()),
as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
a->get_stride(), a->get_num_stored_elements_per_row(),
as_cuda_type(b->get_const_values()), b->get_stride(),
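The grid allocates subwarp_size * nwarps_per_row threads per row, with the y dimension spanning the right-hand-side columns. A sketch of that sizing in isolation (default_block_size assumed to be 512 here; ceildiv behaves like Ginkgo's helper):

// One thread per (row, lane, slice) triple, rounded up to whole blocks.
constexpr unsigned default_block_size = 512;  // assumption

inline unsigned long long ceildiv(unsigned long long num,
                                  unsigned long long den)
{
    return (num + den - 1) / den;
}

dim3 launch_grid(unsigned long long nrows, int subwarp_size,
                 int nwarps_per_row, unsigned num_rhs)
{
    const dim3 block(default_block_size, 1, 1);
    return dim3(ceildiv(nrows * subwarp_size * nwarps_per_row, block.x),
                num_rhs, 1);
}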
@@ -255,16 +257,17 @@ std::array<int, 3> compute_subwarp_size_and_atomicity(
// Use multiple threads to perform the reduction on each row when the matrix
// is wide.
// To make every thread have work to do, pick the value which is the
// power of 2 less than 32 and is less than or equal to ell_ncols. If the
// subwarp_size is 32 and allow more than one warps to work on the same row,
// use atomic add to handle the warps write the value into the same
// position. The #warps is decided according to the number of warps allowed
// on GPU.
// power of 2 that is less than warp_size and less than or equal to ell_ncols.
// If the subwarp_size equals warp_size and more than one warp is allowed to
// work on the same row, use atomic adds to handle multiple warps writing to
// the same position. The number of warps per row is chosen according to the
// number of warps allowed on the GPU.
if (static_cast<double>(ell_ncols) / nrows > ratio) {
while (subwarp_size < 32 && (subwarp_size << 1) <= ell_ncols) {
while (subwarp_size < cuda_config::warp_size &&
(subwarp_size << 1) <= ell_ncols) {
subwarp_size <<= 1;
}
if (subwarp_size == 32) {
if (subwarp_size == cuda_config::warp_size) {
nwarps_per_row =
std::min(ell_ncols / cuda_config::warp_size, nwarps / nrows);
nwarps_per_row = std::max(nwarps_per_row, 1);
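Taken together, the heuristic doubles subwarp_size while the next power of two still fits into ell_ncols, and only once a full warp is reached does it spread multiple warps over a row and pay for atomics. A standalone sketch of the selection under assumed constants (warp_size fixed to 32, ratio as defined earlier; the atomic flag is inferred, since the hunk is truncated here):

#include <algorithm>
#include <array>

constexpr int warp_size = 32;   // assumption: cuda_config::warp_size
constexpr double ratio = 1e-2;  // as defined earlier in this file

// Returns {subwarp_size, nwarps_per_row, atomic}.
std::array<int, 3> pick_spmv_config(int nrows, int ell_ncols, int nwarps)
{
    int subwarp_size = 1;
    int nwarps_per_row = 1;
    if (static_cast<double>(ell_ncols) / nrows > ratio) {
        while (subwarp_size < warp_size && (subwarp_size << 1) <= ell_ncols) {
            subwarp_size <<= 1;
        }
        if (subwarp_size == warp_size) {
            nwarps_per_row = std::min(ell_ncols / warp_size, nwarps / nrows);
            nwarps_per_row = std::max(nwarps_per_row, 1);
        }
    }
    const int atomic = nwarps_per_row > 1;  // inferred, see lead-in
    return {subwarp_size, nwarps_per_row, atomic};
}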
@@ -292,8 +295,8 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,

/**
* info is the parameter for selecting the cuda kernel.
* for info == 0, it uses the kernel by 32 threads with atomic operation
* for other value, it uses the kernel without atomic_add
* for info == 0, it uses the kernel with warp_size threads and atomic
* operations; for any other value, it uses the kernel without atomic_add
*/
const int info = (!atomic) * subwarp_size;
if (atomic) {
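The encoding const int info = (!atomic) * subwarp_size folds both dispatch inputs into one key: a non-atomic configuration keeps its sub-warp size, while every atomic configuration collapses to 0, the value compiled_kernels reserves for the warp_size-with-atomics kernel. In sketch form (hypothetical helper, not a Ginkgo API):

// info == 0 -> atomic kernel, warp_size threads per row
// info == k -> non-atomic kernel with sub-warp size k
inline int encode_kernel_choice(bool atomic, int subwarp_size)
{
    return (!atomic) * subwarp_size;
}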
@@ -323,8 +326,8 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,

/**
* info is the parameter for selecting the cuda kernel.
* for info == 0, it uses the kernel by 32 threads with atomic operation
* for other value, it uses the kernel without atomic_add
* for info == 0, it uses the kernel with warp_size threads and atomic
* operations; for any other value, it uses the kernel without atomic_add
*/
const int info = (!atomic) * subwarp_size;
if (atomic) {
78 changes: 72 additions & 6 deletions cuda/test/matrix/ell_kernels.cpp
@@ -82,8 +82,8 @@ class Ell : public ::testing::Test {
}

void set_up_apply_data(int num_rows = 532, int num_cols = 231,
int num_stored_elements_per_row = 0, int stride = 0,
int num_vectors = 1)
int num_vectors = 1,
int num_stored_elements_per_row = 0, int stride = 0)
{
mtx = Mtx::create(ref, gko::dim<2>{}, num_stored_elements_per_row,
stride);
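With num_vectors moved ahead of the ELL storage parameters, call sites that only vary the vector count no longer need to spell out the storage defaults; for instance (values as used in the tests below):

// 10 x 10000 matrix, 3 right-hand sides, default storage -> atomic path
set_up_apply_data(10, 10000, 3);
// explicit elements-per-row (300) and stride (600) now come last
set_up_apply_data(532, 231, 3, 300, 600);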
@@ -148,7 +148,7 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef)

TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
{
set_up_apply_data(532, 231, 300, 600);
set_up_apply_data(532, 231, 1, 300, 600);

mtx->apply(y.get(), expected.get());
dmtx->apply(dy.get(), dresult.get());
@@ -159,7 +159,7 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)

TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
{
set_up_apply_data(532, 231, 300, 600);
set_up_apply_data(532, 231, 1, 300, 600);
mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());

@@ -169,7 +169,7 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)

TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
{
set_up_apply_data(532, 231, 300, 600, 3);
set_up_apply_data(532, 231, 3, 300, 600);

mtx->apply(y.get(), expected.get());
dmtx->apply(dy.get(), dresult.get());
@@ -180,7 +180,7 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)

TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
{
set_up_apply_data(532, 231, 300, 600, 3);
set_up_apply_data(532, 231, 3, 300, 600);

mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
@@ -211,6 +211,72 @@ TEST_F(Ell, AdvancedByAtomicApplyIsEquivalentToRef)
}


TEST_F(Ell, SimpleApplyByAtomicToDenseMatrixIsEquivalentToRef)
{
set_up_apply_data(10, 10000, 3);

mtx->apply(y.get(), expected.get());
dmtx->apply(dy.get(), dresult.get());

GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
}


TEST_F(Ell, AdvancedByAtomicToDenseMatrixApplyIsEquivalentToRef)
{
set_up_apply_data(10, 10000, 3);

mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());

GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
}


TEST_F(Ell, SimpleApplyOnSmallMatrixIsEquivalentToRef)
{
set_up_apply_data(1, 10);

mtx->apply(y.get(), expected.get());
dmtx->apply(dy.get(), dresult.get());

GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
}


TEST_F(Ell, AdvancedApplyOnSmallMatrixToDenseMatrixIsEquivalentToRef)
{
set_up_apply_data(1, 10, 3);

mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());

GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
}


TEST_F(Ell, SimpleApplyOnSmallMatrixToDenseMatrixIsEquivalentToRef)
{
set_up_apply_data(1, 10, 3);

mtx->apply(y.get(), expected.get());
dmtx->apply(dy.get(), dresult.get());

GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
}


TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef)
{
set_up_apply_data(1, 10);

mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());

GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
}


TEST_F(Ell, ConvertToDenseIsEquivalentToRef)
{
set_up_apply_data();
