review comments

Co-authored-by: Yu-hsiang Tsai <yu-hsiang.tsai@kit.edu>
ginkgo-project · Oct 5, 2021 · 754be29 · 754be29
1 parent a5b3f65
commit 754be29
Show file tree

Hide file tree

Showing 7 changed files with 38 additions and 28 deletions.
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
@@ -30,9 +30,9 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 namespace kernel {
 
+
 template <int mat_blk_sz, int subwarp_size, typename ValueType,
           typename IndexType>
 __global__ __launch_bounds__(default_block_size) void transpose_blocks(
@@ -50,10 +50,10 @@ __global__ __launch_bounds__(default_block_size) void transpose_blocks(
     const unsigned sw_id_in_threadblock{threadIdx.x / subwarp_size};
 
     constexpr int mat_blk_sz_2{mat_blk_sz * mat_blk_sz};
+    __shared__ UninitializedArray<ValueType, warps_in_block * mat_blk_sz_2>
+        origblocks;
 
     for (auto ibz = begin_blk; ibz < nbnz; ibz += total_subwarp_count) {
-        __shared__ UninitializedArray<ValueType, warps_in_block * mat_blk_sz_2>
-            origblocks;
         for (int i = sw_threadidx; i < mat_blk_sz_2; i += subwarp_size) {
             origblocks[sw_id_in_threadblock * mat_blk_sz_2 + i] =
                 values[ibz * mat_blk_sz_2 + i];
@@ -70,4 +70,5 @@ __global__ __launch_bounds__(default_block_size) void transpose_blocks(
     }
 }
 
+
 }  // namespace kernel
diff --git a/core/test/utils/fb_matrix_generator.hpp b/core/test/utils/fb_matrix_generator.hpp
@@ -78,22 +78,23 @@ namespace test {
 template <typename MatrixType = matrix::Dense<>, typename NonzeroDistribution,
           typename ValueDistribution, typename Engine, typename... MatrixArgs>
 std::unique_ptr<MatrixType> generate_random_matrix_with_diag(
-    size_type num_rows, size_type num_cols, NonzeroDistribution&& nonzero_dist,
-    ValueDistribution&& value_dist, Engine&& engine,
-    std::shared_ptr<const Executor> exec, MatrixArgs&&... args)
+    typename MatrixType::index_type num_rows,
+    typename MatrixType::index_type num_cols,
+    NonzeroDistribution&& nonzero_dist, ValueDistribution&& value_dist,
+    Engine&& engine, std::shared_ptr<const Executor> exec, MatrixArgs&&... args)
 {
     using value_type = typename MatrixType::value_type;
     using index_type = typename MatrixType::index_type;
 
-    matrix_data<value_type, index_type> data{gko::dim<2>{num_rows, num_cols},
+    matrix_data<value_type, index_type> data{gko::dim<2>(num_rows, num_cols),
                                              {}};
 
-    for (size_type row = 0; row < num_rows; ++row) {
-        std::vector<size_type> col_idx(num_cols);
+    for (index_type row = 0; row < num_rows; ++row) {
+        std::vector<index_type> col_idx(num_cols);
         std::iota(col_idx.begin(), col_idx.end(), size_type(0));
         // randomly generate number of nonzeros in this row
-        auto nnz_in_row = static_cast<size_type>(nonzero_dist(engine));
-        nnz_in_row = std::max(size_type(1), std::min(nnz_in_row, num_cols));
+        auto nnz_in_row = static_cast<index_type>(nonzero_dist(engine));
+        nnz_in_row = std::max(1, std::min(nnz_in_row, num_cols));
         // select a subset of `nnz_in_row` column indexes, and fill these
         // locations with random values
         std::shuffle(col_idx.begin(), col_idx.end(), engine);
@@ -103,7 +104,7 @@ std::unique_ptr<MatrixType> generate_random_matrix_with_diag(
             col_idx[nnz_in_row - 1] = row;
         }
         std::for_each(
-            begin(col_idx), begin(col_idx) + nnz_in_row, [&](size_type col) {
+            begin(col_idx), begin(col_idx) + nnz_in_row, [&](index_type col) {
                 data.nonzeros.emplace_back(
                     row, col,
                     detail::get_rand_value<value_type>(value_dist, engine));
@@ -211,7 +212,6 @@ std::unique_ptr<matrix::Fbcsr<ValueType, IndexType>> generate_fbcsr_from_csr(
  * entry in each block-row.
  *
  * @param exec  Reference executor.
- * @param engine  Random number engine to use, such as std::ranlux48.
  * @param nbrows  The number of block-rows in the generated matrix.
  * @param nbcols  The number of block-columns in the generated matrix.
  * @param mat_blk_sz  Block size of the generated matrix.
@@ -220,12 +220,13 @@ std::unique_ptr<matrix::Fbcsr<ValueType, IndexType>> generate_fbcsr_from_csr(
  * @param unsort  If true, the blocks of the generated matrix within each
  *                block-row are ordered randomly. Otherwise, blocks in each row
  *                are ordered by block-column index.
+ * @param engine  Random number engine to use, such as std::ranlux48.
  */
 template <typename ValueType, typename IndexType, typename RandEngine>
 std::unique_ptr<matrix::Fbcsr<ValueType, IndexType>> generate_random_fbcsr(
-    std::shared_ptr<const ReferenceExecutor> ref, RandEngine engine,
-    const IndexType nbrows, const IndexType nbcols, const int mat_blk_sz,
-    const bool diag_dominant, const bool unsort)
+    std::shared_ptr<const ReferenceExecutor> ref, const IndexType nbrows,
+    const IndexType nbcols, const int mat_blk_sz, const bool diag_dominant,
+    const bool unsort, RandEngine&& engine)
 {
     using real_type = gko::remove_complex<ValueType>;
     std::unique_ptr<matrix::Csr<ValueType, IndexType>> rand_csr_ref =

diff --git a/core/test/utils/fb_matrix_generator_test.cpp b/core/test/utils/fb_matrix_generator_test.cpp
@@ -67,7 +67,7 @@ class BlockMatrixGenerator : public ::testing::Test {
           rbmtx_dd(gko::test::generate_fbcsr_from_csr(exec, mtx.get(), blk_sz,
                                                       true, std::ranlux48(42))),
           cbmtx(gko::test::generate_random_fbcsr<value_type>(
-              exec, std::ranlux48(42), nbrows, nbcols, blk_sz, true, false))
+              exec, nbrows, nbcols, blk_sz, true, false, std::ranlux48(42)))
     {}
 
     const int nbrows = 100;
@@ -107,6 +107,7 @@ TEST_F(BlockMatrixGenerator, OutputHasCorrectSize)
     ASSERT_EQ(cbmtx->get_block_size(), blk_sz);
 }
 
+
 TEST_F(BlockMatrixGenerator, OutputHasCorrectSparsityPattern)
 {
     ASSERT_EQ(mtx->get_num_stored_elements(),

diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu
@@ -201,14 +201,16 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 namespace {
 
+
 template <int mat_blk_sz, typename ValueType, typename IndexType>
 void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
                            matrix::Fbcsr<ValueType, IndexType>* const mat)
 {
     constexpr int subwarp_size = config::warp_size;
     const size_type nbnz = mat->get_num_stored_blocks();
     const size_type numthreads = nbnz * subwarp_size;
-    const size_type numblocks = (numthreads - 1) / default_block_size + 1;
+    // const size_type numblocks = (numthreads - 1) / default_block_size + 1;
+    const size_type numblocks = ceildiv(numthreads, default_block_size);
     const dim3 block_size{static_cast<unsigned>(default_block_size), 1, 1};
     const dim3 grid_dim{static_cast<unsigned>(numblocks), 1, 1};
     kernel::transpose_blocks<mat_blk_sz, subwarp_size>
@@ -218,6 +220,7 @@ void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
 GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks,
                                     transpose_blocks_impl);
 
+
 }  // namespace
 
 

diff --git a/cuda/test/matrix/fbcsr_kernels.cpp b/cuda/test/matrix/fbcsr_kernels.cpp
@@ -70,8 +70,8 @@ class Fbcsr : public ::testing::Test {
         const index_type rand_bcols = 70;
         const int block_size = 3;
         rsorted_ref = gko::test::generate_random_fbcsr<value_type, index_type>(
-            ref, std::ranlux48(43), rand_brows, rand_bcols, block_size, false,
-            false);
+            ref, rand_brows, rand_bcols, block_size, false, false,
+            std::ranlux48(43));
     }
 
     void TearDown()
@@ -148,13 +148,13 @@ TYPED_TEST(Fbcsr, TransposeIsEquivalentToRefSortedBS7)
     const int block_size = 7;
     auto rsorted_ref2 =
         gko::test::generate_random_fbcsr<value_type, index_type>(
-            this->ref, std::ranlux48(43), rand_brows, rand_bcols, block_size,
-            false, false);
+            this->ref, rand_brows, rand_bcols, block_size, false, false,
+            std::ranlux48(43));
     rand_cuda->copy_from(gko::lend(rsorted_ref2));
+
     auto trans_ref_linop = rsorted_ref2->transpose();
     std::unique_ptr<const Mtx> trans_ref =
         gko::as<const Mtx>(std::move(trans_ref_linop));
-
     auto trans_cuda_linop = rand_cuda->transpose();
     std::unique_ptr<const Mtx> trans_cuda =
         gko::as<const Mtx>(std::move(trans_cuda_linop));
@@ -310,4 +310,5 @@ TYPED_TEST(Fbcsr, OutplaceAbsoluteMatrixIsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(abs_mtx, dabs_mtx, 5 * tol);
 }
 
+
 }  // namespace
diff --git a/omp/test/matrix/fbcsr_kernels.cpp b/omp/test/matrix/fbcsr_kernels.cpp
@@ -97,11 +97,11 @@ class Fbcsr : public ::testing::Test {
     void set_up_apply_data(int num_vectors = 1)
     {
         mtx = gko::test::generate_random_fbcsr<real_type>(
-            ref, rand_engine, num_brows, num_bcols, blk_sz, false, false);
+            ref, num_brows, num_bcols, blk_sz, false, false, rand_engine);
         complex_mtx = gko::test::generate_random_fbcsr<std::complex<real_type>>(
-            ref, rand_engine, num_brows, num_bcols, blk_sz, false, false);
+            ref, num_brows, num_bcols, blk_sz, false, false, rand_engine);
         square_mtx = gko::test::generate_random_fbcsr<real_type>(
-            ref, rand_engine, num_brows, num_brows, blk_sz, false, false);
+            ref, num_brows, num_brows, blk_sz, false, false, rand_engine);
         dmtx = Mtx::create(omp);
         dmtx->copy_from(mtx.get());
         complex_dmtx = ComplexMtx::create(omp);
@@ -131,7 +131,7 @@ class Fbcsr : public ::testing::Test {
     {
         constexpr int min_nnz_per_row{2};
         auto local_mtx_ref = gko::test::generate_random_fbcsr<real_type>(
-            ref, rand_engine, num_brows, num_bcols, blk_sz, false, true);
+            ref, num_brows, num_bcols, blk_sz, false, true, rand_engine);
 
         auto local_mtx_omp = Mtx::create(omp);
         local_mtx_omp->copy_from(local_mtx_ref.get());
@@ -404,7 +404,7 @@ TEST_F(Fbcsr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
 TEST_F(Fbcsr, MaxNnzPerRowIsEquivalentToRefSortedBS3)
 {
     auto mtx_ref = gko::test::generate_random_fbcsr<real_type>(
-        ref, rand_engine, num_brows, num_bcols, blk_sz, false, false);
+        ref, num_brows, num_bcols, blk_sz, false, false, rand_engine);
     auto rand_omp = Mtx::create(omp);
     rand_omp->copy_from(gko::lend(mtx_ref));
     gko::size_type ref_max_nnz{}, omp_max_nnz{};

diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp
@@ -412,6 +412,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 namespace {
 
+
 template <int mat_blk_sz, typename ValueType, typename IndexType>
 void sort_by_column_index_impl(
     syn::value_list<int, mat_blk_sz>,
@@ -447,8 +448,10 @@ void sort_by_column_index_impl(
 GKO_ENABLE_IMPLEMENTATION_SELECTION(select_sort_col_idx,
                                     sort_by_column_index_impl);
 
+
 }  // namespace
 
+
 template <typename ValueType, typename IndexType>
 void sort_by_column_index(const std::shared_ptr<const ReferenceExecutor> exec,
                           matrix::Fbcsr<ValueType, IndexType>* const to_sort)