diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index cc82d30a240..76abffe5859 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -85,10 +85,11 @@ constexpr double ratio = 1e-2;
 /**
  * A compile-time list of sub-warp sizes for which the spmv kernels should be
  * compiled.
- * 0 is a special case where it uses a sub-warp size of 32 in
+ * 0 is a special case where it uses a sub-warp size of warp_size in
  * combination with atomic_adds.
  */
-using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
+using compiled_kernels =
+    syn::value_list<int, 0, 1, 2, 4, 8, 16, cuda_config::warp_size>;


 namespace kernel {
@@ -97,7 +98,7 @@ namespace {


 template <int subwarp_size, bool atomic, typename ValueType,
           typename IndexType, typename Closure>
-__device__ void spmv_kernel(const size_type num_rows,
+__device__ void spmv_kernel(const size_type num_rows, const int nwarps_per_row,
                             const ValueType *__restrict__ val,
                             const IndexType *__restrict__ col,
                             const size_type stride,
@@ -108,9 +109,7 @@ __device__ void spmv_kernel(const size_type num_rows,
 {
     const auto tidx =
         static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto nwarps_per_row =
-        gridDim.x * blockDim.x / num_rows / subwarp_size;
-    const auto x = tidx / subwarp_size / nwarps_per_row;
+    const IndexType x = tidx / subwarp_size / nwarps_per_row;
     const auto warp_id = tidx / subwarp_size % nwarps_per_row;
     const auto y_start = tidx % subwarp_size +
                          num_stored_elements_per_row * warp_id / nwarps_per_row;
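Side note on the hunk above: the kernel no longer re-derives nwarps_per_row from the launch grid; the host passes it in, and each row is shared by nwarps_per_row sub-warps of subwarp_size threads. A minimal host-side sketch of the resulting index arithmetic — the concrete sizes are made-up example values, not anything this diff prescribes:

```cpp
// Host-side sketch of the thread-to-work mapping in spmv_kernel above.
// subwarp_size, nwarps_per_row and the row width are example values only.
#include <cstdint>
#include <cstdio>

int main()
{
    using size_type = std::int64_t;
    const int subwarp_size = 8;     // example template parameter
    const int nwarps_per_row = 2;   // now chosen on the host and passed in
    const size_type num_stored_elements_per_row = 64;

    for (size_type tidx = 0; tidx < 32; ++tidx) {
        // Row handled by this thread.
        const auto x = tidx / subwarp_size / nwarps_per_row;
        // Which of the row's sub-warps this thread belongs to.
        const auto warp_id = tidx / subwarp_size % nwarps_per_row;
        // First stored element this thread touches: its lane within the
        // sub-warp, offset by the sub-warp's share of the row.
        const auto y_start =
            tidx % subwarp_size +
            num_stored_elements_per_row * warp_id / nwarps_per_row;
        std::printf("tidx=%2lld -> row=%lld warp=%lld y_start=%lld\n",
                    static_cast<long long>(tidx), static_cast<long long>(x),
                    static_cast<long long>(warp_id),
                    static_cast<long long>(y_start));
    }
}
```

Threads 0-7 map to row 0, sub-warp 0 (slots 0-7), threads 8-15 to row 0, sub-warp 1 starting at slot 32, threads 16-23 to row 1, and so on: each sub-warp covers a contiguous share of its row's stored elements.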
@@ -148,24 +147,26 @@ __device__ void spmv_kernel(const size_type num_rows,
 template <int subwarp_size, bool atomic, typename ValueType,
           typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const ValueType *__restrict__ val,
-    const IndexType *__restrict__ col, const size_type stride,
-    const size_type num_stored_elements_per_row,
+    const size_type num_rows, const int nwarps_per_row,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const size_type stride, const size_type num_stored_elements_per_row,
     const ValueType *__restrict__ b, const size_type b_stride,
     ValueType *__restrict__ c, const size_type c_stride)
 {
     spmv_kernel<subwarp_size, atomic>(
-        num_rows, val, col, stride, num_stored_elements_per_row, b, b_stride, c,
-        c_stride, [](const ValueType &x, const ValueType &y) { return x; });
+        num_rows, nwarps_per_row, val, col, stride, num_stored_elements_per_row,
+        b, b_stride, c, c_stride,
+        [](const ValueType &x, const ValueType &y) { return x; });
 }


 template <int subwarp_size, bool atomic, typename ValueType,
           typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const ValueType *__restrict__ alpha,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
-    const size_type stride, const size_type num_stored_elements_per_row,
+    const size_type num_rows, const int nwarps_per_row,
+    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
+    const IndexType *__restrict__ col, const size_type stride,
+    const size_type num_stored_elements_per_row,
     const ValueType *__restrict__ b, const size_type b_stride,
     const ValueType *__restrict__ beta, ValueType *__restrict__ c,
     const size_type c_stride)
@@ -178,15 +179,15 @@ __global__ __launch_bounds__(default_block_size) void spmv(
     // operation.
     if (atomic) {
         spmv_kernel<subwarp_size, atomic>(
-            num_rows, val, col, stride, num_stored_elements_per_row, b,
-            b_stride, c, c_stride,
+            num_rows, nwarps_per_row, val, col, stride,
+            num_stored_elements_per_row, b, b_stride, c, c_stride,
             [&alpha_val](const ValueType &x, const ValueType &y) {
                 return alpha_val * x;
             });
     } else {
         spmv_kernel<subwarp_size, atomic>(
-            num_rows, val, col, stride, num_stored_elements_per_row, b,
-            b_stride, c, c_stride,
+            num_rows, nwarps_per_row, val, col, stride,
+            num_stored_elements_per_row, b, b_stride, c, c_stride,
             [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
                 return alpha_val * x + beta_val * y;
             });
@@ -210,7 +211,7 @@ void abstract_spmv(syn::value_list<int, info>, int nwarps_per_row,
     const matrix::Dense<ValueType> *beta = nullptr)
 {
     const auto nrows = a->get_size()[0];
-    constexpr int subwarp_size = (info == 0) ? 32 : info;
+    constexpr int subwarp_size = (info == 0) ? cuda_config::warp_size : info;
     constexpr bool atomic = (info == 0);
     const dim3 block_size(default_block_size, 1, 1);
     const dim3 grid_size(
@@ -218,13 +219,14 @@ void abstract_spmv(syn::value_list<int, info>, int nwarps_per_row,
         b->get_size()[1], 1);
     if (alpha == nullptr && beta == nullptr) {
         kernel::spmv<subwarp_size, atomic><<<grid_size, block_size>>>(
-            nrows, as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
-            a->get_stride(), a->get_num_stored_elements_per_row(),
+            nrows, nwarps_per_row, as_cuda_type(a->get_const_values()),
+            a->get_const_col_idxs(), a->get_stride(),
+            a->get_num_stored_elements_per_row(),
             as_cuda_type(b->get_const_values()), b->get_stride(),
             as_cuda_type(c->get_values()), c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
         kernel::spmv<subwarp_size, atomic><<<grid_size, block_size>>>(
-            nrows, as_cuda_type(alpha->get_const_values()),
+            nrows, nwarps_per_row, as_cuda_type(alpha->get_const_values()),
             as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
             a->get_stride(), a->get_num_stored_elements_per_row(),
             as_cuda_type(b->get_const_values()), b->get_stride(),
@@ -255,16 +257,17 @@ std::array<int, 2> compute_subwarp_size_and_atomicity(
     // Use multithreads to perform the reduction on each row when the matrix is
     // wide.
     // To make every thread have computation, so pick the value which is the
-    // power of 2 less than 32 and is less than or equal to ell_ncols. If the
-    // subwarp_size is 32 and allow more than one warps to work on the same row,
-    // use atomic add to handle the warps write the value into the same
-    // position. The #warps is decided according to the number of warps allowed
-    // on GPU.
+    // largest power of 2 that is at most warp_size and does not exceed
+    // ell_ncols. If the subwarp_size reaches warp_size, allow more than one
+    // warp to work on the same row and use atomic adds to resolve their
+    // concurrent writes to the same position. The number of warps per row is
+    // chosen according to the number of warps allowed on the GPU.
     if (static_cast<double>(ell_ncols) / nrows > ratio) {
-        while (subwarp_size < 32 && (subwarp_size << 1) <= ell_ncols) {
+        while (subwarp_size < cuda_config::warp_size &&
+               (subwarp_size << 1) <= ell_ncols) {
             subwarp_size <<= 1;
         }
-        if (subwarp_size == 32) {
+        if (subwarp_size == cuda_config::warp_size) {
             nwarps_per_row =
                 std::min(ell_ncols / cuda_config::warp_size, nwarps / nrows);
             nwarps_per_row = std::max(nwarps_per_row, 1);
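To make the heuristic above easier to follow, here is a self-contained host-side sketch of the selection logic from compute_subwarp_size_and_atomicity, with warp_size and ratio hard-coded to this file's values. The tail of the function is not part of this diff, so the final atomic flag below is an assumption based on the surrounding comment (atomics are only needed once several warps share a row):

```cpp
#include <algorithm>
#include <array>
#include <cstdint>

// Sketch of the subwarp-size heuristic; nwarps stands in for the
// GPU-dependent warp budget that the real code derives from the device.
std::array<std::int64_t, 2> select_subwarp_size_and_atomicity(
    std::int64_t nrows, std::int64_t ell_ncols, std::int64_t nwarps)
{
    constexpr std::int64_t warp_size = 32;  // cuda_config::warp_size on CUDA
    constexpr double ratio = 1e-2;          // same threshold as in this file
    std::int64_t subwarp_size = 1;
    std::int64_t nwarps_per_row = 1;
    if (static_cast<double>(ell_ncols) / nrows > ratio) {
        // Largest power of 2 that is at most warp_size and does not exceed
        // ell_ncols, so every thread of a sub-warp has an element to read.
        while (subwarp_size < warp_size && (subwarp_size << 1) <= ell_ncols) {
            subwarp_size <<= 1;
        }
        // When even a full warp per row is too little, let several warps
        // share a row, bounded by the row length and the per-row budget.
        if (subwarp_size == warp_size) {
            nwarps_per_row = std::min(ell_ncols / warp_size, nwarps / nrows);
            nwarps_per_row = std::max(nwarps_per_row, std::int64_t{1});
        }
    }
    // Assumption: atomic adds are required exactly when rows are shared.
    return {subwarp_size,
            nwarps_per_row > 1 ? std::int64_t{1} : std::int64_t{0}};
}
```

Hoisting this decision to the host is what lets nwarps_per_row be passed into the kernels instead of being re-derived from gridDim by every thread.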
@@ -292,8 +295,8 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
     /**
      * info is the parameter for selecting the cuda kernel.
-     * for info == 0, it uses the kernel by 32 threads with atomic operation
-     * for other value, it uses the kernel without atomic_add
+     * for info == 0, it uses the kernel with warp_size threads and atomic
+     * operations; for any other value, it uses the kernel without atomic_add
      */
     const int info = (!atomic) * subwarp_size;
     if (atomic) {
@@ -323,8 +326,8 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
     /**
      * info is the parameter for selecting the cuda kernel.
-     * for info == 0, it uses the kernel by 32 threads with atomic operation
-     * for other value, it uses the kernel without atomic_add
+     * for info == 0, it uses the kernel with warp_size threads and atomic
+     * operations; for any other value, it uses the kernel without atomic_add
      */
     const int info = (!atomic) * subwarp_size;
     if (atomic) {
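Applying that selection to the shapes used in the tests below shows which kernel variant each one exercises; this quick check uses warp_size = 32 and an arbitrary warp budget of 1024 as stand-ins for the device-dependent values:

```cpp
#include <algorithm>
#include <cstdio>

int main()
{
    constexpr long warp_size = 32;  // stand-in for cuda_config::warp_size
    constexpr double ratio = 1e-2;
    const long nwarps = 1024;       // stand-in for the GPU warp budget
    const long shapes[][2] = {{10, 10000}, {1, 10}};
    for (const auto &s : shapes) {
        const long nrows = s[0], ell_ncols = s[1];
        long subwarp_size = 1, nwarps_per_row = 1;
        if (static_cast<double>(ell_ncols) / nrows > ratio) {
            while (subwarp_size < warp_size &&
                   (subwarp_size << 1) <= ell_ncols) {
                subwarp_size <<= 1;
            }
            if (subwarp_size == warp_size) {
                nwarps_per_row = std::max(
                    std::min(ell_ncols / warp_size, nwarps / nrows), 1L);
            }
        }
        const bool atomic = nwarps_per_row > 1;  // assumption, as above
        // info == 0 selects the atomic warp_size kernel from compiled_kernels;
        // any other value selects the non-atomic sub-warp kernel of that size.
        const int info = (!atomic) * static_cast<int>(subwarp_size);
        std::printf("%ldx%ld -> subwarp_size=%ld, atomic=%d, info=%d\n", nrows,
                    ell_ncols, subwarp_size, atomic ? 1 : 0, info);
    }
}
```

This prints info == 0 for the wide 10x10000 matrix (the atomic warp_size kernel, hence the ByAtomic test names) and info == 8 for the 1x10 case (a plain sub-warp kernel), which is what the new small-matrix tests cover.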
diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp
index 82e3d43d232..ff4ae0b8b88 100644
--- a/cuda/test/matrix/ell_kernels.cpp
+++ b/cuda/test/matrix/ell_kernels.cpp
@@ -82,8 +82,8 @@ class Ell : public ::testing::Test {
     }

     void set_up_apply_data(int num_rows = 532, int num_cols = 231,
-                           int num_stored_elements_per_row = 0, int stride = 0,
-                           int num_vectors = 1)
+                           int num_vectors = 1,
+                           int num_stored_elements_per_row = 0, int stride = 0)
     {
         mtx = Mtx::create(ref, gko::dim<2>{}, num_stored_elements_per_row,
                           stride);
@@ -148,7 +148,7 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef)

 TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 300, 600);
+    set_up_apply_data(532, 231, 1, 300, 600);

     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -159,7 +159,7 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)

 TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 300, 600);
+    set_up_apply_data(532, 231, 1, 300, 600);

     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
@@ -169,7 +169,7 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)

 TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 300, 600, 3);
+    set_up_apply_data(532, 231, 3, 300, 600);

     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -180,7 +180,7 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)

 TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 300, 600, 3);
+    set_up_apply_data(532, 231, 3, 300, 600);

     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
@@ -211,6 +211,72 @@ TEST_F(Ell, AdvancedByAtomicApplyIsEquivalentToRef)
 }


+TEST_F(Ell, SimpleApplyByAtomicToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(10, 10000, 3);
+
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedApplyByAtomicToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(10, 10000, 3);
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, SimpleApplyOnSmallMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(1, 10);
+
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedApplyOnSmallMatrixToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(1, 10, 3);
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, SimpleApplyOnSmallMatrixToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(1, 10, 3);
+
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(1, 10);
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
 TEST_F(Ell, ConvertToDenseIsEquivalentToRef)
 {
     set_up_apply_data();
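All of these tests compare the CUDA kernels against the reference executor on the same data. For readers new to the format, here is a tiny CPU model of the ELL product the kernels compute — a sketch assuming Ginkgo's column-major slot layout val[row + stride * slot], with padded slots holding a zero value:

```cpp
#include <cstdio>
#include <vector>

// c = A * b for an ELL matrix: slot i of `row` lives at val[row + stride * i].
void ell_spmv_ref(int num_rows, int num_stored_elements_per_row, int stride,
                  const std::vector<double> &val, const std::vector<int> &col,
                  const std::vector<double> &b, std::vector<double> &c)
{
    for (int row = 0; row < num_rows; ++row) {
        double sum = 0.0;
        for (int i = 0; i < num_stored_elements_per_row; ++i) {
            const auto idx = row + stride * i;
            // Padded slots hold value zero, so they do not affect the sum.
            sum += val[idx] * b[col[idx]];
        }
        c[row] = sum;
    }
}

int main()
{
    // 2x2 example: A = [[1, 2], [0, 3]]; the second row has one padded slot.
    const int num_rows = 2, stored_per_row = 2, stride = 2;
    const std::vector<double> val = {1.0, 3.0, 2.0, 0.0};  // column-major slots
    const std::vector<int> col = {0, 1, 1, 0};             // padding points at 0
    const std::vector<double> b = {10.0, 100.0};
    std::vector<double> c(num_rows);
    ell_spmv_ref(num_rows, stored_per_row, stride, val, col, b, c);
    std::printf("c = [%g, %g]\n", c[0], c[1]);  // expected: [210, 300]
}
```

The num_stored_elements_per_row/stride pair in set_up_apply_data controls exactly these two layout parameters, which is why the stride tests vary them (300/600) alongside the matrix shape.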