Merge tuned OpenMP SellP, COO and ELL SpMV kernels for small number of rhs

This adds special cases for small numbers of rhs and blocked operations for larger numbers of rhs
to the OpenMP SpMV kernels. This enables more compiler optimizations and sometimes improves
performance drastically.
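
The core idea, distilled into a hypothetical standalone sketch (the real kernels, spmv2_small_rhs, spmv2_blocked and generic_spmv2, appear in the coo_kernels.cpp diff below): dispatch on the run-time number of rhs to template instantiations with a compile-time rhs count, and fall back to a blocked kernel for larger counts.

#include <cstdio>

// Hypothetical illustration only -- all names here are invented; the real
// kernels are shown in omp/matrix/coo_kernels.cpp below.
template <int num_rhs>
void spmv_fixed_rhs()
{
    // compile-time rhs count: the compiler can fully unroll the rhs loops
    std::printf("fixed-rhs kernel, num_rhs = %d\n", num_rhs);
}

template <int block_size>
void spmv_blocked(int num_rhs)
{
    // rhs processed in blocks of block_size, plus a remainder loop
    std::printf("blocked kernel, %d rhs in blocks of %d\n", num_rhs,
                block_size);
}

void spmv_dispatch(int num_rhs)
{
    switch (num_rhs) {
    case 1: spmv_fixed_rhs<1>(); break;
    case 2: spmv_fixed_rhs<2>(); break;
    case 3: spmv_fixed_rhs<3>(); break;
    case 4: spmv_fixed_rhs<4>(); break;
    default: spmv_blocked<4>(num_rhs); break;
    }
}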

Related PR: #809
upsj committed Oct 2, 2021
2 parents d2542f0 + eea1466 commit b89eb71
Showing 8 changed files with 537 additions and 134 deletions.
41 changes: 41 additions & 0 deletions include/ginkgo/core/base/math.hpp
@@ -459,6 +459,31 @@ struct infinity_impl {
};


/**
* Computes the highest-precision type from a list of arithmetic types.
*/
template <typename T1, typename T2>
struct highest_precision_impl {
using type = decltype(T1{} + T2{});
};

template <typename T1, typename T2>
struct highest_precision_impl<std::complex<T1>, std::complex<T2>> {
using type = std::complex<typename highest_precision_impl<T1, T2>::type>;
};

template <typename Head, typename... Tail>
struct highest_precision_variadic {
using type = typename highest_precision_impl<
Head, typename highest_precision_variadic<Tail...>::type>::type;
};

template <typename Head>
struct highest_precision_variadic<Head> {
using type = Head;
};


} // namespace detail


@@ -483,6 +508,22 @@ template <typename T>
using increase_precision = typename detail::increase_precision_impl<T>::type;


/**
* Obtains the smallest arithmetic type that is able to store elements of all
* template parameter types exactly. All template type parameters need to be
* either real or complex types; mixing them is not possible.
*
* Formally, it computes a right-fold over the type list, with the highest
* precision of a pair of real arithmetic types T1, T2 computed as
* `decltype(T1{} + T2{})`, or
* `std::complex<highest_precision<remove_complex<T1>, remove_complex<T2>>>` for
* complex types.
*/
template <typename... Ts>
using highest_precision =
typename detail::highest_precision_variadic<Ts...>::type;


/**
* Reduces the precision of the input parameter.
*
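
Before the kernel changes, a minimal usage sketch for the new highest_precision alias (not part of the header; it assumes the alias is exposed as gko::highest_precision in namespace gko, like the other aliases in math.hpp):

#include <complex>
#include <type_traits>

// Per the documentation above, the alias right-folds the type list over
// decltype(T1{} + T2{}), recursing into the underlying real types for
// std::complex arguments.
static_assert(std::is_same<gko::highest_precision<float, double>,
                           double>::value,
              "float + double promotes to double");
static_assert(std::is_same<gko::highest_precision<float, float, float>,
                           float>::value,
              "a homogeneous list keeps its type");
static_assert(std::is_same<gko::highest_precision<std::complex<float>,
                                                  std::complex<double>>,
                           std::complex<double>>::value,
              "complex types fold over their real components");
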
227 changes: 191 additions & 36 deletions omp/matrix/coo_kernels.cpp
@@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "core/matrix/coo_kernels.hpp"


#include <array>


#include <omp.h>


@@ -91,15 +94,18 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_COO_ADVANCED_SPMV_KERNEL);


template <typename ValueType, typename IndexType>
void spmv2(std::shared_ptr<const OmpExecutor> exec,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* c)
template <int block_size, typename ValueType, typename IndexType>
void spmv2_blocked(std::shared_ptr<const OmpExecutor> exec,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* c, ValueType scale)
{
GKO_ASSERT(b->get_size()[1] > block_size);
const auto coo_val = a->get_const_values();
const auto coo_col = a->get_const_col_idxs();
const auto coo_row = a->get_const_row_idxs();
const auto num_rhs = b->get_size()[1];
const auto rounded_rhs = num_rhs / block_size * block_size;
const auto sentinel_row = a->get_size()[0] + 1;
const auto nnz = a->get_num_stored_elements();

@@ -115,48 +121,123 @@ void spmv2(std::shared_ptr<const OmpExecutor> exec,
const auto first = begin > 0 ? coo_row[begin - 1] : sentinel_row;
const auto last = end < nnz ? coo_row[end] : sentinel_row;
auto nz = begin;
for (; nz < end && coo_row[nz] == first; nz++) {
const auto row = first;
const auto col = coo_col[nz];
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
atomic_add(c->at(row, rhs), coo_val[nz] * b->at(col, rhs));
std::array<ValueType, block_size> partial_sum;
if (first != sentinel_row) {
for (size_type rhs_base = 0; rhs_base < rounded_rhs;
rhs_base += block_size) {
// handle row overlap with previous thread: block partial
// sums
partial_sum.fill(zero<ValueType>());
for (auto local_nz = nz;
local_nz < end && coo_row[local_nz] == first;
local_nz++) {
const auto col = coo_col[local_nz];
#pragma unroll
for (size_type i = 0; i < block_size; i++) {
const auto rhs = i + rhs_base;
partial_sum[i] +=
scale * coo_val[local_nz] * b->at(col, rhs);
}
}
// handle row overlap with previous thread: block add to
// memory
#pragma unroll
for (size_type i = 0; i < block_size; i++) {
const auto rhs = i + rhs_base;
atomic_add(c->at(first, rhs), partial_sum[i]);
}
}
// handle row overlap with previous thread: remainder partial
// sums
partial_sum.fill(zero<ValueType>());
for (; nz < end && coo_row[nz] == first; nz++) {
const auto row = first;
const auto col = coo_col[nz];
for (size_type rhs = rounded_rhs; rhs < num_rhs; rhs++) {
partial_sum[rhs - rounded_rhs] +=
scale * coo_val[nz] * b->at(col, rhs);
}
}
// handle row overlap with previous thread: remainder add to
// memory
for (size_type rhs = rounded_rhs; rhs < num_rhs; rhs++) {
atomic_add(c->at(first, rhs),
partial_sum[rhs - rounded_rhs]);
}
}
// handle non-overlapping rows
for (; nz < end && coo_row[nz] != last; nz++) {
const auto row = coo_row[nz];
const auto col = coo_col[nz];
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
c->at(row, rhs) += coo_val[nz] * b->at(col, rhs);
for (size_type rhs_base = 0; rhs_base < rounded_rhs;
rhs_base += block_size) {
#pragma unroll
for (size_type i = 0; i < block_size; i++) {
const auto rhs = i + rhs_base;
c->at(row, rhs) +=
scale * coo_val[nz] * b->at(col, rhs);
}
}
for (size_type rhs = rounded_rhs; rhs < num_rhs; rhs++) {
c->at(row, rhs) += scale * coo_val[nz] * b->at(col, rhs);
}
}
for (; nz < end; nz++) {
const auto row = last;
const auto col = coo_col[nz];
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
atomic_add(c->at(row, rhs), coo_val[nz] * b->at(col, rhs));
if (last != sentinel_row) {
for (size_type rhs_base = 0; rhs_base < rounded_rhs;
rhs_base += block_size) {
// handle row overlap with following thread: block partial
// sums
partial_sum.fill(zero<ValueType>());
for (auto local_nz = nz; local_nz < end; local_nz++) {
const auto col = coo_col[local_nz];
#pragma unroll
for (size_type i = 0; i < block_size; i++) {
const auto rhs = i + rhs_base;
partial_sum[i] +=
scale * coo_val[local_nz] * b->at(col, rhs);
}
}
// handle row overlap with following thread: block add to
// memory
#pragma unroll
for (size_type i = 0; i < block_size; i++) {
const auto rhs = i + rhs_base;
const auto row = last;
atomic_add(c->at(row, rhs), partial_sum[i]);
}
}
// handle row overlap with following thread: remainder partial sums
partial_sum.fill(zero<ValueType>());
for (; nz < end; nz++) {
const auto col = coo_col[nz];
for (size_type rhs = rounded_rhs; rhs < num_rhs; rhs++) {
partial_sum[rhs - rounded_rhs] +=
scale * coo_val[nz] * b->at(col, rhs);
}
}
// handle row overlap with following thread: remainder add to memory
for (size_type rhs = rounded_rhs; rhs < num_rhs; rhs++) {
const auto row = last;
atomic_add(c->at(row, rhs), partial_sum[rhs - rounded_rhs]);
}
}
}
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);


template <typename ValueType, typename IndexType>
void advanced_spmv2(std::shared_ptr<const OmpExecutor> exec,
const matrix::Dense<ValueType>* alpha,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* c)
template <int num_rhs, typename ValueType, typename IndexType>
void spmv2_small_rhs(std::shared_ptr<const OmpExecutor> exec,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* c, ValueType scale)
{
GKO_ASSERT(b->get_size()[1] == num_rhs);
const auto coo_val = a->get_const_values();
const auto coo_col = a->get_const_col_idxs();
const auto coo_row = a->get_const_row_idxs();
const auto num_rhs = b->get_size()[1];
const auto sentinel_row = a->get_size()[0] + 1;
const auto nnz = a->get_num_stored_elements();
const auto scale = alpha->at(0, 0);

#pragma omp parallel
{
@@ -170,33 +251,107 @@ void advanced_spmv2(std::shared_ptr<const OmpExecutor> exec,
const auto first = begin > 0 ? coo_row[begin - 1] : sentinel_row;
const auto last = end < nnz ? coo_row[end] : sentinel_row;
auto nz = begin;
for (; nz < end && coo_row[nz] == first; nz++) {
const auto row = first;
const auto col = coo_col[nz];
std::array<ValueType, num_rhs> partial_sum;
if (first != sentinel_row) {
// handle row overlap with previous thread: partial sums
partial_sum.fill(zero<ValueType>());
for (; nz < end && coo_row[nz] == first; nz++) {
const auto col = coo_col[nz];
#pragma unroll
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
partial_sum[rhs] +=
scale * coo_val[nz] * b->at(col, rhs);
}
}
// handle row overlap with previous thread: add to memory
#pragma unroll
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
atomic_add(c->at(row, rhs),
scale * coo_val[nz] * b->at(col, rhs));
atomic_add(c->at(first, rhs), partial_sum[rhs]);
}
}
// handle non-overlapping rows
for (; nz < end && coo_row[nz] != last; nz++) {
const auto row = coo_row[nz];
const auto col = coo_col[nz];
#pragma unroll
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
c->at(row, rhs) += scale * coo_val[nz] * b->at(col, rhs);
}
}
for (; nz < end; nz++) {
const auto row = last;
const auto col = coo_col[nz];
if (last != sentinel_row) {
// handle row overlap with following thread: partial sums
partial_sum.fill(zero<ValueType>());
for (; nz < end; nz++) {
const auto col = coo_col[nz];
#pragma unroll
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
partial_sum[rhs] +=
scale * coo_val[nz] * b->at(col, rhs);
}
}
// handle row overlap with following thread: add to memory
#pragma unroll
for (size_type rhs = 0; rhs < num_rhs; rhs++) {
atomic_add(c->at(row, rhs),
scale * coo_val[nz] * b->at(col, rhs));
const auto row = last;
atomic_add(c->at(row, rhs), partial_sum[rhs]);
}
}
}
}
}


template <typename ValueType, typename IndexType>
void generic_spmv2(std::shared_ptr<const OmpExecutor> exec,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* c, ValueType scale)
{
const auto num_rhs = b->get_size()[1];
if (num_rhs <= 0) {
return;
}
if (num_rhs == 1) {
spmv2_small_rhs<1>(exec, a, b, c, scale);
return;
}
if (num_rhs == 2) {
spmv2_small_rhs<2>(exec, a, b, c, scale);
return;
}
if (num_rhs == 3) {
spmv2_small_rhs<3>(exec, a, b, c, scale);
return;
}
if (num_rhs == 4) {
spmv2_small_rhs<4>(exec, a, b, c, scale);
return;
}
spmv2_blocked<4>(exec, a, b, c, scale);
}


template <typename ValueType, typename IndexType>
void spmv2(std::shared_ptr<const OmpExecutor> exec,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* c)
{
generic_spmv2(exec, a, b, c, one<ValueType>());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COO_SPMV2_KERNEL);


template <typename ValueType, typename IndexType>
void advanced_spmv2(std::shared_ptr<const OmpExecutor> exec,
const matrix::Dense<ValueType>* alpha,
const matrix::Coo<ValueType, IndexType>* a,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* c)
{
generic_spmv2(exec, a, b, c, alpha->at(0, 0));
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
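
The essential trick in spmv2_blocked and spmv2_small_rhs above is accumulating a fixed-size block of partial sums in a std::array, so the compiler can keep the accumulators in registers and vectorize the unrolled inner loop instead of updating memory once per nonzero. A minimal standalone sketch of that pattern (hypothetical names, not the library code):

#include <array>
#include <cstddef>

// Hypothetical sketch: accumulate block_size partial results for one row
// in a std::array (register-resident after unrolling), then write each
// result back to memory exactly once.
template <int block_size, typename ValueType, typename IndexType>
void accumulate_row_block(const ValueType* vals, const IndexType* cols,
                          std::size_t begin, std::size_t end,
                          const ValueType* b, std::size_t b_stride,
                          ValueType* out)
{
    std::array<ValueType, block_size> partial_sum;
    partial_sum.fill(ValueType{});
    for (auto nz = begin; nz < end; nz++) {
        const auto col = cols[nz];
#pragma unroll
        for (int i = 0; i < block_size; i++) {
            partial_sum[i] += vals[nz] * b[col * b_stride + i];
        }
    }
#pragma unroll
    for (int i = 0; i < block_size; i++) {
        out[i] += partial_sum[i];  // single write-back per rhs column
    }
}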

