Fix the automatical strategy when shared.
The automatical strategy has an issue when shared: it behaves as the
wrong strategy when the matrices sharing it have different properties.
This commit fixes the issue purely by extending the current interface,
without changing any existing function. In effect:
+ Add a new virtual `copy` function to the strategies in order to create
  a new shared pointer polymorphically (sketched below).
+ The approach used is to always copy the strategy whenever a CSR matrix
  is instantiated or copied. Thankfully, the strategies are currently
  lightweight objects, so this should add little overhead.
+ Add tests which ensure that matrices with different properties work
  correctly with the automatical strategy.

Fixes #426
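
Below is a minimal, self-contained sketch of the polymorphic copy
("virtual clone") pattern described in the first bullet above. The names
`strategy`, `my_strategy`, and `holder` are illustrative stand-ins, not
the actual Ginkgo types:

    #include <memory>

    // The base class exposes a virtual copy() so that code holding a
    // std::shared_ptr<strategy> can request an independent clone without
    // knowing the concrete type.
    class strategy {
    public:
        virtual ~strategy() = default;
        virtual std::shared_ptr<strategy> copy() = 0;
    };

    class my_strategy : public strategy {
    public:
        explicit my_strategy(int threshold) : threshold_(threshold) {}

        // Each derived class clones itself, carrying over its own state.
        std::shared_ptr<strategy> copy() override
        {
            return std::make_shared<my_strategy>(threshold_);
        }

    private:
        int threshold_;
    };

    // A matrix-like holder copies the strategy it is given, so two holders
    // constructed from the same shared_ptr never mutate a common instance.
    class holder {
    public:
        explicit holder(std::shared_ptr<strategy> s) : strategy_(s->copy()) {}

    private:
        std::shared_ptr<strategy> strategy_;
    };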
tcojean committed Jun 5, 2020
1 parent 36cb424 commit f768210
Showing 4 changed files with 124 additions and 15 deletions.
8 changes: 6 additions & 2 deletions core/test/matrix/csr_builder.cpp
@@ -90,11 +90,15 @@ TYPED_TEST(CsrBuilder, UpdatesSrowOnDestruction)
     using index_type = typename TestFixture::index_type;
     struct mock_strategy : public Mtx::strategy_type {
         virtual void process(const gko::Array<index_type> &,
-                             gko::Array<index_type> *)
+                             gko::Array<index_type> *) override
         {
             *was_called = true;
         }
-        virtual int64_t clac_size(const int64_t nnz) { return 0; }
+        virtual int64_t clac_size(const int64_t nnz) override { return 0; }
+        virtual std::shared_ptr<typename Mtx::strategy_type> copy() override
+        {
+            return std::make_shared<mock_strategy>(*was_called);
+        }
 
         mock_strategy(bool &flag) : Mtx::strategy_type(""), was_called(&flag) {}
 
25 changes: 25 additions & 0 deletions cuda/test/matrix/csr_kernels.cpp
@@ -664,4 +664,29 @@ TEST_F(Csr, SortUnsortedMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices)
+{
+    auto row_len_limit = std::max(Mtx::automatical::nvidia_row_len_limit,
+                                  Mtx::automatical::amd_row_len_limit);
+    auto load_balance_mtx = Mtx::create(ref);
+    auto classical_mtx = Mtx::create(ref);
+    load_balance_mtx->copy_from(
+        gen_mtx<Vec>(1, row_len_limit + 1000, row_len_limit + 1));
+    classical_mtx->copy_from(gen_mtx<Vec>(50, 50, 1));
+    auto load_balance_mtx_d = Mtx::create(cuda);
+    auto classical_mtx_d = Mtx::create(cuda);
+    load_balance_mtx_d->copy_from(load_balance_mtx.get());
+    classical_mtx_d->copy_from(classical_mtx.get());
+    auto automatical = std::make_shared<Mtx::automatical>();
+
+    load_balance_mtx_d->set_strategy(automatical);
+    classical_mtx_d->set_strategy(automatical);
+
+    EXPECT_EQ("load_balance", load_balance_mtx_d->get_strategy()->get_name());
+    EXPECT_EQ("classical", classical_mtx_d->get_strategy()->get_name());
+    ASSERT_NE(load_balance_mtx_d->get_strategy().get(),
+              classical_mtx_d->get_strategy().get());
+}
+
+
 }  // namespace
25 changes: 25 additions & 0 deletions hip/test/matrix/csr_kernels.hip.cpp
@@ -638,4 +638,29 @@ TEST_F(Csr, SortUnsortedMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, OneAutomaticalWorksWithDifferentMatrices)
+{
+    auto load_balance_mtx = Mtx::create(ref);
+    auto classical_mtx = Mtx::create(ref);
+    auto row_len_limit = std::max(Mtx::automatical::nvidia_row_len_limit,
+                                  Mtx::automatical::amd_row_len_limit);
+    load_balance_mtx->copy_from(
+        gen_mtx<Vec>(1, row_len_limit + 1000, row_len_limit + 1));
+    classical_mtx->copy_from(gen_mtx<Vec>(50, 50, 1));
+    auto load_balance_mtx_d = Mtx::create(hip);
+    auto classical_mtx_d = Mtx::create(hip);
+    load_balance_mtx_d->copy_from(load_balance_mtx.get());
+    classical_mtx_d->copy_from(classical_mtx.get());
+    auto automatical = std::make_shared<Mtx::automatical>();
+
+    load_balance_mtx_d->set_strategy(automatical);
+    classical_mtx_d->set_strategy(automatical);
+
+    EXPECT_EQ("load_balance", load_balance_mtx_d->get_strategy()->get_name());
+    EXPECT_EQ("classical", classical_mtx_d->get_strategy()->get_name());
+    ASSERT_NE(load_balance_mtx_d->get_strategy().get(),
+              classical_mtx_d->get_strategy().get());
+}
+
+
 }  // namespace
81 changes: 68 additions & 13 deletions include/ginkgo/core/matrix/csr.hpp
@@ -166,6 +166,12 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
          */
         virtual int64_t clac_size(const int64_t nnz) = 0;
 
+        /**
+         * Copy a strategy. This is a workaround until strategies are revamped,
+         * since strategies like `automatical` do not work when actually shared.
+         */
+        virtual std::shared_ptr<strategy_type> copy() = 0;
+
     protected:
         void set_name(std::string name) { name_ = name; }
 
@@ -215,6 +221,11 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
             return max_length_per_row_;
         }
 
+        std::shared_ptr<strategy_type> copy() override
+        {
+            return std::make_shared<classical>();
+        }
+
     private:
         index_type max_length_per_row_;
     };
@@ -236,6 +247,11 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
         {}
 
         int64_t clac_size(const int64_t nnz) override { return 0; }
+
+        std::shared_ptr<strategy_type> copy() override
+        {
+            return std::make_shared<merge_path>();
+        }
     };
 
     /**
@@ -256,6 +272,11 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
         {}
 
         int64_t clac_size(const int64_t nnz) override { return 0; }
+
+        std::shared_ptr<strategy_type> copy() override
+        {
+            return std::make_shared<cusparse>();
+        }
     };
 
     /**
@@ -275,6 +296,11 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
         {}
 
         int64_t clac_size(const int64_t nnz) override { return 0; }
+
+        std::shared_ptr<strategy_type> copy() override
+        {
+            return std::make_shared<sparselib>();
+        }
     };
 
     /**
@@ -406,6 +432,12 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
             }
         }
 
+        std::shared_ptr<strategy_type> copy() override
+        {
+            return std::make_shared<load_balance>(nwarps_, warp_size_,
+                                                  cuda_strategy_);
+        }
+
     private:
         int64_t nwarps_;
         int warp_size_;
@@ -414,16 +446,29 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
 
     class automatical : public strategy_type {
     public:
+        /* Use the imbalance strategy when the maximum number of nonzeros
+         * per row is more than 1024 on NVIDIA hardware */
+        static constexpr index_type nvidia_row_len_limit = 1024;
+        /* Use the imbalance strategy when the matrix has more than 1e6
+         * stored elements on NVIDIA hardware */
+        static constexpr index_type nvidia_nnz_limit = 1e6;
+        /* Use the imbalance strategy when the maximum number of nonzeros
+         * per row is more than 768 on AMD hardware */
+        static constexpr index_type amd_row_len_limit = 768;
+        /* Use the imbalance strategy when the matrix has more than 1e8
+         * stored elements on AMD hardware */
+        static constexpr index_type amd_nnz_limit = 1e8;
+
         /**
-         * Creates a automatical strategy.
+         * Creates an automatical strategy.
          */
         automatical()
             : automatical(std::move(
                   gko::CudaExecutor::create(0, gko::OmpExecutor::create())))
         {}
 
         /**
-         * Creates a automatical strategy with CUDA executor.
+         * Creates an automatical strategy with CUDA executor.
          *
         * @param exec the CUDA executor
          */
@@ -432,7 +477,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
         {}
 
         /**
-         * Creates a automatical strategy with HIP executor.
+         * Creates an automatical strategy with HIP executor.
          *
         * @param exec the HIP executor
          */
@@ -441,7 +486,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
         {}
 
         /**
-         * Creates a automatical strategy with specified parameters
+         * Creates an automatical strategy with specified parameters
          *
          * @param nwarps the number of warps in the executor
          * @param warp_size the warp size of the executor
@@ -466,14 +511,12 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
             // if the number of stored elements is larger than <nnz_limit> or
             // the maximum number of stored elements per row is larger than
             // <row_len_limit>, use load_balance otherwise use classical
-            // CUDA: nnz_limit = 1e6, row_len_limit = 1024
-            // AMD: nnz_limit = 1e8, row_len_limit = 768
-            index_type nnz_limit = 1e6;
-            index_type row_len_limit = 1024;
+            index_type nnz_limit = nvidia_nnz_limit;
+            index_type row_len_limit = nvidia_row_len_limit;
 #if GINKGO_HIP_PLATFORM_HCC
             if (!cuda_strategy_) {
-                nnz_limit = 1e8;
-                row_len_limit = 768;
+                nnz_limit = amd_nnz_limit;
+                row_len_limit = amd_row_len_limit;
             }
 #endif // GINKGO_HIP_PLATFORM_HCC
             auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master();
@@ -539,6 +582,12 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
             return max_length_per_row_;
         }
 
+        std::shared_ptr<strategy_type> copy() override
+        {
+            return std::make_shared<automatical>(nwarps_, warp_size_,
+                                                 cuda_strategy_);
+        }
+
     private:
         int64_t nwarps_;
         int warp_size_;
@@ -549,7 +598,13 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
     void convert_to(Csr<ValueType, IndexType> *result) const override
     {
         bool same_executor = this->get_executor() == result->get_executor();
-        EnableLinOp<Csr>::convert_to(result);
+        // NOTE: as soon as strategies are improved, this can be reverted
+        result->values_ = this->values_;
+        result->col_idxs_ = this->col_idxs_;
+        result->row_ptrs_ = this->row_ptrs_;
+        result->srow_ = this->srow_;
+        result->strategy_ = std::move(this->get_strategy()->copy());
+        // END NOTE
         if (!same_executor) {
             detail::strategy_rebuild_helper(result);
         }
@@ -771,7 +826,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
           col_idxs_(exec, num_nonzeros),
           row_ptrs_(exec, size[0] + 1),
           srow_(exec, strategy->clac_size(num_nonzeros)),
-          strategy_(std::move(strategy))
+          strategy_(std::move(strategy->copy()))
     {}
 
     /**
@@ -804,7 +859,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
           col_idxs_{exec, std::forward<ColIdxsArray>(col_idxs)},
           row_ptrs_{exec, std::forward<RowPtrsArray>(row_ptrs)},
           srow_(exec),
-          strategy_(std::move(strategy))
+          strategy_(std::move(strategy->copy()))
     {
         GKO_ASSERT_EQ(values_.get_num_elems(), col_idxs_.get_num_elems());
         GKO_ASSERT_EQ(this->get_size()[0] + 1, row_ptrs_.get_num_elems());
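
For reference, the selection heuristic that automatical::process applies
(see the csr.hpp hunk above) boils down to the following stand-alone
sketch; `use_load_balance` is a hypothetical helper name, and the
thresholds are the constants introduced by this commit:

    #include <cstdint>

    // Returns true when the heuristic from automatical::process would pick
    // the load_balance strategy; otherwise the classical strategy is used.
    // nvidia_nnz_limit = 1e6, nvidia_row_len_limit = 1024,
    // amd_nnz_limit = 1e8, amd_row_len_limit = 768.
    bool use_load_balance(std::int64_t nnz, std::int64_t max_row_len,
                          bool cuda_strategy)
    {
        const std::int64_t nnz_limit = cuda_strategy ? 1000000 : 100000000;
        const std::int64_t row_len_limit = cuda_strategy ? 1024 : 768;
        return nnz > nnz_limit || max_row_len > row_len_limit;
    }

This is why the new tests expect "load_balance" for the single-row matrix
whose row length exceeds row_len_limit and "classical" for the small
50 x 50 matrix.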
