Merge tuned OpenMP SellP, COO and ELL SpMV kernels for small number of rhs

This adds special cases for small numbers of rhs and blocked operations for larger numbers of rhs
to the OpenMP SpMV kernels. This enables more compiler optimizations and sometimes improves
performance drastically.
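
The core idea, distilled into a hypothetical standalone sketch (the real kernels, spmv2_small_rhs, spmv2_blocked and generic_spmv2, appear in the coo_kernels.cpp diff below): dispatch on the run-time number of rhs to template instantiations with a compile-time rhs count, and fall back to a blocked kernel for larger counts.

#include <cstdio>

// Hypothetical illustration only -- all names here are invented; the real
// kernels are shown in omp/matrix/coo_kernels.cpp below.
template <int num_rhs>
void spmv_fixed_rhs()
{
    // compile-time rhs count: the compiler can fully unroll the rhs loops
    std::printf("fixed-rhs kernel, num_rhs = %d\n", num_rhs);
}

template <int block_size>
void spmv_blocked(int num_rhs)
{
    // rhs processed in blocks of block_size, plus a remainder loop
    std::printf("blocked kernel, %d rhs in blocks of %d\n", num_rhs,
                block_size);
}

void spmv_dispatch(int num_rhs)
{
    switch (num_rhs) {
    case 1: spmv_fixed_rhs<1>(); break;
    case 2: spmv_fixed_rhs<2>(); break;
    case 3: spmv_fixed_rhs<3>(); break;
    case 4: spmv_fixed_rhs<4>(); break;
    default: spmv_blocked<4>(num_rhs); break;
    }
}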

Related PR: #809
upsj committed Oct 2, 2021
2 parents d2542f0 + eea1466 commit b89eb71
Showing 8 changed files with 537 additions and 134 deletions.
41 changes: 41 additions & 0 deletions include/ginkgo/core/base/math.hpp
@@ -459,6 +459,31 @@ struct infinity_impl {
};


/**
* Computes the highest-precision type from a list of arithmetic types.
*/
template <typename T1, typename T2>
struct highest_precision_impl {
using type = decltype(T1{} + T2{});
};

template <typename T1, typename T2>
struct highest_precision_impl<std::complex<T1>, std::complex<T2>> {
using type = std::complex<typename highest_precision_impl<T1, T2>::type>;
};

template <typename Head, typename... Tail>
struct highest_precision_variadic {
using type = typename highest_precision_impl<
Head, typename highest_precision_variadic<Tail...>::type>::type;
};

template <typename Head>
struct highest_precision_variadic<Head> {
using type = Head;
};


} // namespace detail


@@ -483,6 +508,22 @@ template <typename T>
using increase_precision = typename detail::increase_precision_impl<T>::type;


/**
* Obtains the smallest arithmetic type that is able to store elements of all
* template parameter types exactly. All template type parameters need to be
* either real or complex types; mixing them is not possible.
*
* Formally, it computes a right-fold over the type list, with the highest
* precision of a pair of real arithmetic types T1, T2 computed as
* `decltype(T1{} + T2{})`, or
* `std::complex<highest_precision<remove_complex<T1>, remove_complex<T2>>>` for
* complex types.
*/
template <typename... Ts>
using highest_precision =
typename detail::highest_precision_variadic<Ts...>::type;


/**
* Reduces the precision of the input parameter.
*
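
Before the kernel changes, a minimal usage sketch for the new highest_precision alias (not part of the header; it assumes the alias is exposed as gko::highest_precision in namespace gko, like the other aliases in math.hpp):

#include <complex>
#include <type_traits>

// Per the documentation above, the alias right-folds the type list over
// decltype(T1{} + T2{}), recursing into the underlying real types for
// std::complex arguments.
static_assert(std::is_same<gko::highest_precision<float, double>,
                           double>::value,
              "float + double promotes to double");
static_assert(std::is_same<gko::highest_precision<float, float, float>,
                           float>::value,
              "a homogeneous list keeps its type");
static_assert(std::is_same<gko::highest_precision<std::complex<float>,
                                                  std::complex<double>>,
                           std::complex<double>>::value,
              "complex types fold over their real components");
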
227 changes: 191 additions & 36 deletions omp/matrix/coo_kernels.cpp
@@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "core/matrix/coo_kernels.hpp"


#include <array>


#include <omp.h>


@@ -91,15 +94,18 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);


template <typename ValueType, typename IndexType>
void spmv2(std::shared_ptr<const OmpExecutor> exec,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* c)
template <int block_size, typename ValueType, typename IndexType>
void spmv2_blocked(std::shared_ptr<const OmpExecutor> exec,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* c, ValueType scale)
{
GKO_ASSERT(b->get_size()[1] > block_size);
const auto coo_val = a->get_const_values();
const auto coo_col = a->get_const_col_idxs();
const auto coo_row = a->get_const_row_idxs();
const auto num_rhs = b->get_size()[1];
const auto rounded_rhs = num_rhs / block_size * block_size;
const auto sentinel_row = a->get_size()[0] + 1;
const auto nnz = a->get_num_stored_elements();

@@ -115,48 +121,123 @@ void spmv2(std::shared_ptr<const OmpExecutor> exec,
const auto first = begin > 0 ? coo_row[begin - 1] : sentinel_row;
const auto last = end < nnz ? coo_row[end] : sentinel_row;
auto nz = begin;
for (; nz < end && coo_row[nz] == first; nz++) {
const auto row = first;
const auto col = coo_col[nz];
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
atomic_add(c->at(row, rhs), coo_val[nz] * b->at(col, rhs));
std::array<ValueType, block_size> partial_sum;
if (first != sentinel_row) {
for (size_type rhs_base = 0; rhs_base < rounded_rhs;
rhs_base += block_size) {
// handle row overlap with previous thread: block partial
// sums
partial_sum.fill(zero<ValueType>());
for (auto local_nz = nz;
local_nz < end && coo_row[local_nz] == first;
local_nz++) {
const auto col = coo_col[local_nz];
#pragma unroll
for (size_type i = 0; i < block_size; i++) {
const auto rhs = i + rhs_base;
partial_sum[i] +=
scale * coo_val[local_nz] * b->at(col, rhs);
}
}
// handle row overlap with previous thread: block add to
// memory
#pragma unroll
for (size_type i = 0; i < block_size; i++) {
const auto rhs = i + rhs_base;
atomic_add(c->at(first, rhs), partial_sum[i]);
}
}
// handle row overlap with previous thread: remainder partial
// sums
partial_sum.fill(zero<ValueType>());
for (; nz < end && coo_row[nz] == first; nz++) {
const auto row = first;
const auto col = coo_col[nz];
for (size_type rhs = rounded_rhs; rhs < num_rhs; rhs++) {
partial_sum[rhs - rounded_rhs] +=
scale * coo_val[nz] * b->at(col, rhs);
}
}
// handle row overlap with previous thread: remainder add to
// memory
for (size_type rhs = rounded_rhs; rhs < num_rhs; rhs++) {
atomic_add(c->at(first, rhs),
partial_sum[rhs - rounded_rhs]);
}
}
// handle non-overlapping rows
for (; nz < end && coo_row[nz] != last; nz++) {
const auto row = coo_row[nz];
const auto col = coo_col[nz];
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
c->at(row, rhs) += coo_val[nz] * b->at(col, rhs);
for (size_type rhs_base = 0; rhs_base < rounded_rhs;
rhs_base += block_size) {
#pragma unroll
for (size_type i = 0; i < block_size; i++) {
const auto rhs = i + rhs_base;
c->at(row, rhs) +=
scale * coo_val[nz] * b->at(col, rhs);
}
}
for (size_type rhs = rounded_rhs; rhs < num_rhs; rhs++) {
c->at(row, rhs) += scale * coo_val[nz] * b->at(col, rhs);
}
}
for (; nz < end; nz++) {
const auto row = last;
const auto col = coo_col[nz];
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
atomic_add(c->at(row, rhs), coo_val[nz] * b->at(col, rhs));
if (last != sentinel_row) {
for (size_type rhs_base = 0; rhs_base < rounded_rhs;
rhs_base += block_size) {
// handle row overlap with following thread: block partial
// sums
partial_sum.fill(zero<ValueType>());
for (auto local_nz = nz; local_nz < end; local_nz++) {
const auto col = coo_col[local_nz];
#pragma unroll
for (size_type i = 0; i < block_size; i++) {
const auto rhs = i + rhs_base;
partial_sum[i] +=
scale * coo_val[local_nz] * b->at(col, rhs);
}
}
// handle row overlap with following thread: block add to
// memory
#pragma unroll
for (size_type i = 0; i < block_size; i++) {
const auto rhs = i + rhs_base;
const auto row = last;
atomic_add(c->at(row, rhs), partial_sum[i]);
}
}
// handle row overlap with following thread: remainder partial sums
partial_sum.fill(zero<ValueType>());
for (; nz < end; nz++) {
const auto col = coo_col[nz];
for (size_type rhs = rounded_rhs; rhs < num_rhs; rhs++) {
partial_sum[rhs - rounded_rhs] +=
scale * coo_val[nz] * b->at(col, rhs);
}
}
// handle row overlap with following thread: remainder add to memory
for (size_type rhs = rounded_rhs; rhs < num_rhs; rhs++) {
const auto row = last;
atomic_add(c->at(row, rhs), partial_sum[rhs - rounded_rhs]);
}
}
}
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);


template <typename ValueType, typename IndexType>
void advanced_spmv2(std::shared_ptr<const OmpExecutor> exec,
const matrix::Dense<ValueType>* alpha,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* c)
template <int num_rhs, typename ValueType, typename IndexType>
void spmv2_small_rhs(std::shared_ptr<const OmpExecutor> exec,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* c, ValueType scale)
{
GKO_ASSERT(b->get_size()[1] == num_rhs);
const auto coo_val = a->get_const_values();
const auto coo_col = a->get_const_col_idxs();
const auto coo_row = a->get_const_row_idxs();
const auto num_rhs = b->get_size()[1];
const auto sentinel_row = a->get_size()[0] + 1;
const auto nnz = a->get_num_stored_elements();
const auto scale = alpha->at(0, 0);

#pragma omp parallel
{
@@ -170,33 +251,107 @@ void advanced_spmv2(std::shared_ptr<const OmpExecutor> exec,
const auto first = begin > 0 ? coo_row[begin - 1] : sentinel_row;
const auto last = end < nnz ? coo_row[end] : sentinel_row;
auto nz = begin;
for (; nz < end && coo_row[nz] == first; nz++) {
const auto row = first;
const auto col = coo_col[nz];
std::array<ValueType, num_rhs> partial_sum;
if (first != sentinel_row) {
// handle row overlap with previous thread: partial sums
partial_sum.fill(zero<ValueType>());
for (; nz < end && coo_row[nz] == first; nz++) {
const auto col = coo_col[nz];
#pragma unroll
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
partial_sum[rhs] +=
scale * coo_val[nz] * b->at(col, rhs);
}
}
// handle row overlap with previous thread: add to memory
#pragma unroll
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
atomic_add(c->at(row, rhs),
scale * coo_val[nz] * b->at(col, rhs));
atomic_add(c->at(first, rhs), partial_sum[rhs]);
}
}
// handle non-overlapping rows
for (; nz < end && coo_row[nz] != last; nz++) {
const auto row = coo_row[nz];
const auto col = coo_col[nz];
#pragma unroll
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
c->at(row, rhs) += scale * coo_val[nz] * b->at(col, rhs);
}
}
for (; nz < end; nz++) {
const auto row = last;
const auto col = coo_col[nz];
if (last != sentinel_row) {
// handle row overlap with following thread: partial sums
partial_sum.fill(zero<ValueType>());
for (; nz < end; nz++) {
const auto col = coo_col[nz];
#pragma unroll
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
partial_sum[rhs] +=
scale * coo_val[nz] * b->at(col, rhs);
}
}
// handle row overlap with following thread: add to memory
#pragma unroll
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
atomic_add(c->at(row, rhs),
scale * coo_val[nz] * b->at(col, rhs));
const auto row = last;
atomic_add(c->at(row, rhs), partial_sum[rhs]);
}
}
}
}
}


template <typename ValueType, typename IndexType>
void generic_spmv2(std::shared_ptr<const OmpExecutor> exec,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* c, ValueType scale)
{
const auto num_rhs = b->get_size()[1];
if (num_rhs <= 0) {
return;
}
if (num_rhs == 1) {
spmv2_small_rhs<1>(exec, a, b, c, scale);
return;
}
if (num_rhs == 2) {
spmv2_small_rhs<2>(exec, a, b, c, scale);
return;
}
if (num_rhs == 3) {
spmv2_small_rhs<3>(exec, a, b, c, scale);
return;
}
if (num_rhs == 4) {
spmv2_small_rhs<4>(exec, a, b, c, scale);
return;
}
spmv2_blocked<4>(exec, a, b, c, scale);
}


template <typename ValueType, typename IndexType>
void spmv2(std::shared_ptr<const OmpExecutor> exec,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* c)
{
generic_spmv2(exec, a, b, c, one<ValueType>());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);


template <typename ValueType, typename IndexType>
void advanced_spmv2(std::shared_ptr<const OmpExecutor> exec,
const matrix::Dense<ValueType>* alpha,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* c)
{
generic_spmv2(exec, a, b, c, alpha->at(0, 0));
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
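
The essential trick in spmv2_blocked and spmv2_small_rhs above is accumulating a fixed-size block of partial sums in a std::array, so the compiler can keep the accumulators in registers and vectorize the unrolled inner loop instead of updating memory once per nonzero. A minimal standalone sketch of that pattern (hypothetical names, not the library code):

#include <array>
#include <cstddef>

// Hypothetical sketch: accumulate block_size partial results for one row
// in a std::array (register-resident after unrolling), then write each
// result back to memory exactly once.
template <int block_size, typename ValueType, typename IndexType>
void accumulate_row_block(const ValueType* vals, const IndexType* cols,
                          std::size_t begin, std::size_t end,
                          const ValueType* b, std::size_t b_stride,
                          ValueType* out)
{
    std::array<ValueType, block_size> partial_sum;
    partial_sum.fill(ValueType{});
    for (auto nz = begin; nz < end; nz++) {
        const auto col = cols[nz];
#pragma unroll
        for (int i = 0; i < block_size; i++) {
            partial_sum[i] += vals[nz] * b[col * b_stride + i];
        }
    }
#pragma unroll
    for (int i = 0; i < block_size; i++) {
        out[i] += partial_sum[i];  // single write-back per rhs column
    }
}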

