From 754be29097638a0b0877cb91a40d2214c8214338 Mon Sep 17 00:00:00 2001 From: Aditya Kashi Date: Tue, 5 Oct 2021 23:42:23 +0200 Subject: [PATCH] review comments Co-authored-by: Yu-hsiang Tsai --- common/cuda_hip/matrix/fbcsr_kernels.hpp.inc | 7 ++--- core/test/utils/fb_matrix_generator.hpp | 27 ++++++++++---------- core/test/utils/fb_matrix_generator_test.cpp | 3 ++- cuda/matrix/fbcsr_kernels.cu | 5 +++- cuda/test/matrix/fbcsr_kernels.cpp | 11 ++++---- omp/test/matrix/fbcsr_kernels.cpp | 10 ++++---- reference/matrix/fbcsr_kernels.cpp | 3 +++ 7 files changed, 38 insertions(+), 28 deletions(-) diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index 465d67f9aca..ea44a802700 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -30,9 +30,9 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - namespace kernel { + template __global__ __launch_bounds__(default_block_size) void transpose_blocks( @@ -50,10 +50,10 @@ __global__ __launch_bounds__(default_block_size) void transpose_blocks( const unsigned sw_id_in_threadblock{threadIdx.x / subwarp_size}; constexpr int mat_blk_sz_2{mat_blk_sz * mat_blk_sz}; + __shared__ UninitializedArray + origblocks; for (auto ibz = begin_blk; ibz < nbnz; ibz += total_subwarp_count) { - __shared__ UninitializedArray - origblocks; for (int i = sw_threadidx; i < mat_blk_sz_2; i += subwarp_size) { origblocks[sw_id_in_threadblock * mat_blk_sz_2 + i] = values[ibz * mat_blk_sz_2 + i]; @@ -70,4 +70,5 @@ __global__ __launch_bounds__(default_block_size) void transpose_blocks( } } + } // namespace kernel diff --git a/core/test/utils/fb_matrix_generator.hpp b/core/test/utils/fb_matrix_generator.hpp index 27b2c3249f5..dbab091849a 100644 --- a/core/test/utils/fb_matrix_generator.hpp +++ b/core/test/utils/fb_matrix_generator.hpp @@ -78,22 +78,23 @@ namespace test { template , typename NonzeroDistribution, typename ValueDistribution, typename Engine, typename... MatrixArgs> std::unique_ptr generate_random_matrix_with_diag( - size_type num_rows, size_type num_cols, NonzeroDistribution&& nonzero_dist, - ValueDistribution&& value_dist, Engine&& engine, - std::shared_ptr exec, MatrixArgs&&... args) + typename MatrixType::index_type num_rows, + typename MatrixType::index_type num_cols, + NonzeroDistribution&& nonzero_dist, ValueDistribution&& value_dist, + Engine&& engine, std::shared_ptr exec, MatrixArgs&&... args) { using value_type = typename MatrixType::value_type; using index_type = typename MatrixType::index_type; - matrix_data data{gko::dim<2>{num_rows, num_cols}, + matrix_data data{gko::dim<2>(num_rows, num_cols), {}}; - for (size_type row = 0; row < num_rows; ++row) { - std::vector col_idx(num_cols); + for (index_type row = 0; row < num_rows; ++row) { + std::vector col_idx(num_cols); std::iota(col_idx.begin(), col_idx.end(), size_type(0)); // randomly generate number of nonzeros in this row - auto nnz_in_row = static_cast(nonzero_dist(engine)); - nnz_in_row = std::max(size_type(1), std::min(nnz_in_row, num_cols)); + auto nnz_in_row = static_cast(nonzero_dist(engine)); + nnz_in_row = std::max(1, std::min(nnz_in_row, num_cols)); // select a subset of `nnz_in_row` column indexes, and fill these // locations with random values std::shuffle(col_idx.begin(), col_idx.end(), engine); @@ -103,7 +104,7 @@ std::unique_ptr generate_random_matrix_with_diag( col_idx[nnz_in_row - 1] = row; } std::for_each( - begin(col_idx), begin(col_idx) + nnz_in_row, [&](size_type col) { + begin(col_idx), begin(col_idx) + nnz_in_row, [&](index_type col) { data.nonzeros.emplace_back( row, col, detail::get_rand_value(value_dist, engine)); @@ -211,7 +212,6 @@ std::unique_ptr> generate_fbcsr_from_csr( * entry in each block-row. * * @param exec Reference executor. - * @param engine Random number engine to use, such as std::ranlux48. * @param nbrows The number of block-rows in the generated matrix. * @param nbcols The number of block-columns in the generated matrix. * @param mat_blk_sz Block size of the generated matrix. @@ -220,12 +220,13 @@ std::unique_ptr> generate_fbcsr_from_csr( * @param unsort If true, the blocks of the generated matrix within each * block-row are ordered randomly. Otherwise, blocks in each row * are ordered by block-column index. + * @param engine Random number engine to use, such as std::ranlux48. */ template std::unique_ptr> generate_random_fbcsr( - std::shared_ptr ref, RandEngine engine, - const IndexType nbrows, const IndexType nbcols, const int mat_blk_sz, - const bool diag_dominant, const bool unsort) + std::shared_ptr ref, const IndexType nbrows, + const IndexType nbcols, const int mat_blk_sz, const bool diag_dominant, + const bool unsort, RandEngine&& engine) { using real_type = gko::remove_complex; std::unique_ptr> rand_csr_ref = diff --git a/core/test/utils/fb_matrix_generator_test.cpp b/core/test/utils/fb_matrix_generator_test.cpp index 693b890c329..7b073ade94c 100644 --- a/core/test/utils/fb_matrix_generator_test.cpp +++ b/core/test/utils/fb_matrix_generator_test.cpp @@ -67,7 +67,7 @@ class BlockMatrixGenerator : public ::testing::Test { rbmtx_dd(gko::test::generate_fbcsr_from_csr(exec, mtx.get(), blk_sz, true, std::ranlux48(42))), cbmtx(gko::test::generate_random_fbcsr( - exec, std::ranlux48(42), nbrows, nbcols, blk_sz, true, false)) + exec, nbrows, nbcols, blk_sz, true, false, std::ranlux48(42))) {} const int nbrows = 100; @@ -107,6 +107,7 @@ TEST_F(BlockMatrixGenerator, OutputHasCorrectSize) ASSERT_EQ(cbmtx->get_block_size(), blk_sz); } + TEST_F(BlockMatrixGenerator, OutputHasCorrectSparsityPattern) { ASSERT_EQ(mtx->get_num_stored_elements(), diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index ba0ecc5d471..ea8f93cf79e 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -201,6 +201,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( namespace { + template void transpose_blocks_impl(syn::value_list, matrix::Fbcsr* const mat) @@ -208,7 +209,8 @@ void transpose_blocks_impl(syn::value_list, constexpr int subwarp_size = config::warp_size; const size_type nbnz = mat->get_num_stored_blocks(); const size_type numthreads = nbnz * subwarp_size; - const size_type numblocks = (numthreads - 1) / default_block_size + 1; + // const size_type numblocks = (numthreads - 1) / default_block_size + 1; + const size_type numblocks = ceildiv(numthreads, default_block_size); const dim3 block_size{static_cast(default_block_size), 1, 1}; const dim3 grid_dim{static_cast(numblocks), 1, 1}; kernel::transpose_blocks @@ -218,6 +220,7 @@ void transpose_blocks_impl(syn::value_list, GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks, transpose_blocks_impl); + } // namespace diff --git a/cuda/test/matrix/fbcsr_kernels.cpp b/cuda/test/matrix/fbcsr_kernels.cpp index c58b30aa29f..6c8f79c805d 100644 --- a/cuda/test/matrix/fbcsr_kernels.cpp +++ b/cuda/test/matrix/fbcsr_kernels.cpp @@ -70,8 +70,8 @@ class Fbcsr : public ::testing::Test { const index_type rand_bcols = 70; const int block_size = 3; rsorted_ref = gko::test::generate_random_fbcsr( - ref, std::ranlux48(43), rand_brows, rand_bcols, block_size, false, - false); + ref, rand_brows, rand_bcols, block_size, false, false, + std::ranlux48(43)); } void TearDown() @@ -148,13 +148,13 @@ TYPED_TEST(Fbcsr, TransposeIsEquivalentToRefSortedBS7) const int block_size = 7; auto rsorted_ref2 = gko::test::generate_random_fbcsr( - this->ref, std::ranlux48(43), rand_brows, rand_bcols, block_size, - false, false); + this->ref, rand_brows, rand_bcols, block_size, false, false, + std::ranlux48(43)); rand_cuda->copy_from(gko::lend(rsorted_ref2)); + auto trans_ref_linop = rsorted_ref2->transpose(); std::unique_ptr trans_ref = gko::as(std::move(trans_ref_linop)); - auto trans_cuda_linop = rand_cuda->transpose(); std::unique_ptr trans_cuda = gko::as(std::move(trans_cuda_linop)); @@ -310,4 +310,5 @@ TYPED_TEST(Fbcsr, OutplaceAbsoluteMatrixIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(abs_mtx, dabs_mtx, 5 * tol); } + } // namespace diff --git a/omp/test/matrix/fbcsr_kernels.cpp b/omp/test/matrix/fbcsr_kernels.cpp index 9ed4dd4a670..76e6c2700df 100644 --- a/omp/test/matrix/fbcsr_kernels.cpp +++ b/omp/test/matrix/fbcsr_kernels.cpp @@ -97,11 +97,11 @@ class Fbcsr : public ::testing::Test { void set_up_apply_data(int num_vectors = 1) { mtx = gko::test::generate_random_fbcsr( - ref, rand_engine, num_brows, num_bcols, blk_sz, false, false); + ref, num_brows, num_bcols, blk_sz, false, false, rand_engine); complex_mtx = gko::test::generate_random_fbcsr>( - ref, rand_engine, num_brows, num_bcols, blk_sz, false, false); + ref, num_brows, num_bcols, blk_sz, false, false, rand_engine); square_mtx = gko::test::generate_random_fbcsr( - ref, rand_engine, num_brows, num_brows, blk_sz, false, false); + ref, num_brows, num_brows, blk_sz, false, false, rand_engine); dmtx = Mtx::create(omp); dmtx->copy_from(mtx.get()); complex_dmtx = ComplexMtx::create(omp); @@ -131,7 +131,7 @@ class Fbcsr : public ::testing::Test { { constexpr int min_nnz_per_row{2}; auto local_mtx_ref = gko::test::generate_random_fbcsr( - ref, rand_engine, num_brows, num_bcols, blk_sz, false, true); + ref, num_brows, num_bcols, blk_sz, false, true, rand_engine); auto local_mtx_omp = Mtx::create(omp); local_mtx_omp->copy_from(local_mtx_ref.get()); @@ -404,7 +404,7 @@ TEST_F(Fbcsr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef) TEST_F(Fbcsr, MaxNnzPerRowIsEquivalentToRefSortedBS3) { auto mtx_ref = gko::test::generate_random_fbcsr( - ref, rand_engine, num_brows, num_bcols, blk_sz, false, false); + ref, num_brows, num_bcols, blk_sz, false, false, rand_engine); auto rand_omp = Mtx::create(omp); rand_omp->copy_from(gko::lend(mtx_ref)); gko::size_type ref_max_nnz{}, omp_max_nnz{}; diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index 06c05d23b00..1bd356b470b 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -412,6 +412,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( namespace { + template void sort_by_column_index_impl( syn::value_list, @@ -447,8 +448,10 @@ void sort_by_column_index_impl( GKO_ENABLE_IMPLEMENTATION_SELECTION(select_sort_col_idx, sort_by_column_index_impl); + } // namespace + template void sort_by_column_index(const std::shared_ptr exec, matrix::Fbcsr* const to_sort)