Merge mixed precision ELL
This PR adds a true mixed-precision ELL kernel that can be enabled with the
GINKGO_MIXED_PRECISION CMake flag; otherwise, the default
conversion-based mixed-precision support is used.

Related PR: #717
upsj committed May 6, 2021
2 parents 7aef216 + 31fe679 commit 08913b5
Showing 24 changed files with 1,599 additions and 208 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/osx.yml
@@ -15,8 +15,8 @@ jobs:
fail-fast: false
matrix:
config:
- {shared: "ON", build_type: "Debug", name: "omp/debug/shared"}
- {shared: "OFF", build_type: "Release", name: "omp/release/static"}
- {shared: "ON", build_type: "Debug", name: "omp/debug/shared", "mixed": "OFF"}
- {shared: "OFF", build_type: "Release", name: "omp/release/static", "mixed": "ON"}
name: ${{ matrix.config.name }}
runs-on: [macos-latest]

@@ -40,7 +40,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }}
cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }}
make -j8
ctest -j10 --output-on-failure
6 changes: 3 additions & 3 deletions .github/workflows/windows-msvc-cuda.yml
@@ -15,8 +15,8 @@ jobs:
fail-fast: false
matrix:
config:
- {version: "10.2.89.20191206", name: "cuda102/release/shared"}
- {version: "latest", name: "cuda-latest/release/shared"}
- {version: "10.2.89.20191206", name: "cuda102/release/shared", "mixed": "ON"}
- {version: "latest", name: "cuda-latest/release/shared", "mixed": "OFF"}
name: msvc/${{ matrix.config.name }} (only compile)
runs-on: [windows-latest]

@@ -46,5 +46,5 @@ jobs:
$env:PATH="$env:PATH;$pwd\build\windows_shared_library"
mkdir build
cd build
cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_CUDA_ARCHITECTURES=60 ..
cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_CUDA_ARCHITECTURES=60 ..
cmake --build . -j4 --config Release
6 changes: 3 additions & 3 deletions .github/workflows/windows-msvc-ref.yml
@@ -15,8 +15,8 @@ jobs:
fail-fast: false
matrix:
config:
- {shared: "ON", build_type: "Debug", name: "reference/debug/shared"}
- {shared: "OFF", build_type: "Release", name: "reference/release/static"}
- {shared: "ON", build_type: "Debug", name: "reference/debug/shared", "mixed": "ON"}
- {shared: "OFF", build_type: "Release", name: "reference/release/static", "mixed": "OFF"}
# Debug static needs too much storage
# - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"}
name: msvc/${{ matrix.config.name }}
@@ -35,7 +35,7 @@ jobs:
$env:PATH="$env:PATH;$pwd\build\windows_shared_library"
mkdir build
cd build
cmake -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF ..
cmake -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} ..
cmake --build . -j4 --config ${{ matrix.config.build_type }}
ctest . -C ${{ matrix.config.build_type }} --output-on-failure
41 changes: 41 additions & 0 deletions .gitlab-ci.yml
@@ -32,6 +32,7 @@ include:
BUILD_HWLOC: "ON"
FAST_TESTS: "OFF"
DPCPP_SINGLE_MODE: "OFF"
MIXED_PRECISION: "ON"
RUN_EXAMPLES: "OFF"
CONFIG_LOG: "ON"
CXX_FLAGS: ""
@@ -77,6 +78,7 @@ include:
-DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
-DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
-DGINKGO_FAST_TESTS=${FAST_TESTS}
-DGINKGO_MIXED_PRECISION=${MIXED_PRECISION}
-DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES}
-DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG}
-DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE}
@@ -111,6 +113,7 @@ include:
-DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
-DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
-DGINKGO_FAST_TESTS=${FAST_TESTS}
-DGINKGO_MIXED_PRECISION=${MIXED_PRECISION}
-DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG}
-DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE}
-DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES}
@@ -681,6 +684,44 @@ build/nocuda/intel/omp/release/static:
BUILD_TYPE: "Release"
BUILD_SHARED_LIBS: "OFF"

build/nocuda-nomixed/gcc/omp/release/shared:
<<: *default_build_with_test
extends:
- .quick_test_condition
- .use_gko-nocuda-gnu9-llvm8
variables:
<<: *default_variables
BUILD_OMP: "ON"
BUILD_TYPE: "Release"
MIXED_PRECISION: "OFF"

build/nocuda-nomixed/clang/omp/debug/static:
<<: *default_build_with_test
extends:
- .full_test_condition
- .use_gko-nocuda-gnu9-llvm8
variables:
<<: *default_variables
C_COMPILER: "clang"
CXX_COMPILER: "clang++"
BUILD_OMP: "ON"
BUILD_TYPE: "Debug"
BUILD_SHARED_LIBS: "OFF"
MIXED_PRECISION: "OFF"

build/nocuda-nomixed/intel/omp/release/static:
<<: *default_build_with_test
extends:
- .full_test_condition
- .use_gko-nocuda-gnu9-llvm8-intel
variables:
<<: *default_variables
C_COMPILER: "icc"
CXX_COMPILER: "icpc"
BUILD_OMP: "ON"
BUILD_TYPE: "Release"
BUILD_SHARED_LIBS: "OFF"
MIXED_PRECISION: "OFF"

build/dpcpp/cpu/release/static:
<<: *default_build_with_test
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -31,6 +31,7 @@ option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA})
option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP})
option(GINKGO_BUILD_DOC "Generate documentation" OFF)
option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
option(GINKGO_SKIP_DEPENDENCY_UPDATE
"Do not update dependencies each time the project is rebuilt" ON)
option(GINKGO_EXPORT_BUILD_DIR
4 changes: 4 additions & 0 deletions INSTALL.md
@@ -18,6 +18,10 @@ Ginkgo adds the following additional switches to control what is being built:
* `-DGINKGO_DEVEL_TOOLS={ON, OFF}` sets up the build system for development
(requires clang-format, will also download git-cmake-format),
default is `OFF`.
* `-DGINKGO_MIXED_PRECISION={ON, OFF}` compiles true mixed-precision kernels
instead of converting data on the fly, default is `OFF`.
Enabling this flag increases the library size, but improves performance of
mixed-precision kernels.
* `-DGINKGO_BUILD_TESTS={ON, OFF}` builds Ginkgo's tests
(will download googletest), default is `ON`.
* `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests
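
A note on usage (editor's sketch, not part of this commit): the flag only changes how mixed-precision apply calls are carried out; the user-facing API is the same either way. Assuming the usual Ginkgo API of this release, applying a single-precision Ell matrix to double-precision vectors could look as follows; with -DGINKGO_MIXED_PRECISION=ON this goes through the dedicated kernels, otherwise through conversion with implicit temporary storage.

// Sketch only: mixed-precision apply with Ell (names and values illustrative).
#include <ginkgo/ginkgo.hpp>

int main()
{
    auto exec = gko::ReferenceExecutor::create();

    // matrix stored in single precision
    gko::matrix_data<float, gko::int32> data{gko::dim<2>{2, 2}};
    data.nonzeros.emplace_back(0, 0, 1.0f);
    data.nonzeros.emplace_back(0, 1, 2.0f);
    data.nonzeros.emplace_back(1, 1, 3.0f);
    auto mat = gko::matrix::Ell<float, gko::int32>::create(exec);
    mat->read(data);

    // vectors kept in double precision
    auto b = gko::initialize<gko::matrix::Dense<double>>({1.0, 1.0}, exec);
    auto x = gko::initialize<gko::matrix::Dense<double>>({0.0, 0.0}, exec);

    mat->apply(gko::lend(b), gko::lend(x));  // x = mat * b
}
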
29 changes: 25 additions & 4 deletions benchmark/utils/formats.hpp
@@ -59,7 +59,8 @@ namespace formats {


std::string available_format =
"coo, csr, ell, sellp, hybrid, hybrid0, hybrid25, hybrid33, hybrid40, "
"coo, csr, ell, ell-mixed, sellp, hybrid, hybrid0, hybrid25, hybrid33, "
"hybrid40, "
"hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, "
"hybridminstorage"
#ifdef HAS_CUDA
@@ -90,6 +91,8 @@ std::string format_description =
"csrm: Ginkgo's CSR implementation with merge_path strategy.\n"
"ell: Ellpack format according to Bell and Garland: Efficient Sparse "
"Matrix-Vector Multiplication on CUDA.\n"
"ell-mixed: Mixed Precision Ellpack format according to Bell and Garland: "
"Efficient Sparse Matrix-Vector Multiplication on CUDA.\n"
"sellp: Sliced Ellpack uses a default block size of 32.\n"
"hybrid: Hybrid uses ell and coo to represent the matrix.\n"
"hybrid0, hybrid25, hybrid33, hybrid40, hybrid60, hybrid80: Hybrid uses "
@@ -204,6 +207,23 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
{"csrc", READ_MATRIX(csr, std::make_shared<csr::classical>())},
{"coo", read_matrix_from_data<gko::matrix::Coo<etype>>},
{"ell", read_matrix_from_data<gko::matrix::Ell<etype>>},
{"ell-mixed",
[](std::shared_ptr<const gko::Executor> exec,
const gko::matrix_data<etype> &data) {
gko::matrix_data<gko::next_precision<etype>> conv_data;
conv_data.size = data.size;
conv_data.nonzeros.resize(data.nonzeros.size());
auto it = conv_data.nonzeros.begin();
for (auto &el : data.nonzeros) {
it->row = el.row;
it->column = el.column;
it->value = el.value;
++it;
}
auto mat = gko::matrix::Ell<gko::next_precision<etype>>::create(std::move(exec));
mat->read(conv_data);
return mat;
}},
#ifdef HAS_CUDA
#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
{"cusp_csr", read_matrix_from_data<cusp_csr>},
@@ -212,8 +232,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
{"cusp_hybrid", read_matrix_from_data<cusp_hybrid>},
{"cusp_coo", read_matrix_from_data<cusp_coo>},
{"cusp_ell", read_matrix_from_data<cusp_ell>},
#else // CUDA_VERSION >= 11000
// cusp_csr, cusp_coo use the generic ones from CUDA 11
#else // CUDA_VERSION >= 11000
// cusp_csr, cusp_coo use the generic ones from CUDA 11
{"cusp_csr", read_matrix_from_data<cusp_gcsr>},
{"cusp_coo", read_matrix_from_data<cusp_gcoo>},
#endif
@@ -260,7 +280,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
{"hybridminstorage",
READ_MATRIX(hybrid,
std::make_shared<hybrid::minimal_storage_limit>())},
{"sellp", read_matrix_from_data<gko::matrix::Sellp<etype>>}};
{"sellp", read_matrix_from_data<gko::matrix::Sellp<etype>>}
};
// clang-format on


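For context on the "ell-mixed" entry above (editor's note): it copies the input matrix data into gko::next_precision<etype> before building the Ell matrix, so the SpMV benchmark applies a matrix stored in the other precision than its working precision and exercises the mixed-precision path. Assuming the default two-precision configuration of this release, next_precision simply swaps float and double:

// Illustration only, assuming the default precision cycle (no half precision,
// no GINKGO_DPCPP_SINGLE_MODE adjustments).
#include <complex>
#include <type_traits>

#include <ginkgo/ginkgo.hpp>

static_assert(std::is_same<gko::next_precision<float>, double>::value, "");
static_assert(std::is_same<gko::next_precision<double>, float>::value, "");
static_assert(std::is_same<gko::next_precision<std::complex<double>>,
                           std::complex<float>>::value, "");

int main() {}
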
3 changes: 3 additions & 0 deletions cmake/get_info.cmake
@@ -128,6 +128,9 @@ foreach(log_type ${log_types})
ginkgo_print_module_footer(${${log_type}} " Enabled modules:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP")
ginkgo_print_module_footer(${${log_type}} " Enabled features:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_MIXED_PRECISION")
ginkgo_print_module_footer(${${log_type}} " Tests, benchmarks and examples:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING")
63 changes: 32 additions & 31 deletions common/matrix/ell_kernels.hpp.inc
@@ -34,29 +34,30 @@ namespace kernel {
namespace {


template <int num_thread_per_worker, bool atomic, typename ValueType,
typename IndexType, typename Closure>
template <int num_thread_per_worker, bool atomic, typename b_accessor,
typename a_accessor, typename OutputValueType, typename IndexType,
typename Closure>
__device__ void spmv_kernel(
const size_type num_rows, const int num_worker_per_row,
const ValueType *__restrict__ val, const IndexType *__restrict__ col,
acc::range<a_accessor> val, const IndexType *__restrict__ col,
const size_type stride, const size_type num_stored_elements_per_row,
const ValueType *__restrict__ b, const size_type b_stride,
ValueType *__restrict__ c, const size_type c_stride, Closure op)
acc::range<b_accessor> b, OutputValueType *__restrict__ c,
const size_type c_stride, Closure op)
{
const auto tidx = thread::get_thread_id_flat();
const auto column_id = blockIdx.y;
const decltype(tidx) column_id = blockIdx.y;
if (num_thread_per_worker == 1) {
// Specialize the num_thread_per_worker = 1. It doesn't need the shared
// memory, __syncthreads, and atomic_add
if (tidx < num_rows) {
ValueType temp = zero<ValueType>();
auto temp = zero<OutputValueType>();
for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
const auto ind = tidx + idx * stride;
const auto col_idx = col[ind];
if (col_idx < idx) {
break;
} else {
temp += val[ind] * b[col_idx * b_stride + column_id];
temp += val(ind) * b(col_idx, column_id);
}
}
const auto c_ind = tidx * c_stride + column_id;
@@ -68,14 +69,14 @@ __device__ void spmv_kernel(
const auto x = tidx % num_rows;
const auto worker_id = tidx / num_rows;
const auto step_size = num_worker_per_row * num_thread_per_worker;
__shared__ UninitializedArray<ValueType, default_block_size /
num_thread_per_worker>
__shared__ UninitializedArray<
OutputValueType, default_block_size / num_thread_per_worker>
storage;
if (idx_in_worker == 0) {
storage[threadIdx.x] = 0;
}
__syncthreads();
ValueType temp = zero<ValueType>();
auto temp = zero<OutputValueType>();
for (size_type idx =
worker_id * num_thread_per_worker + idx_in_worker;
idx < num_stored_elements_per_row; idx += step_size) {
@@ -84,7 +85,7 @@ __device__ void spmv_kernel(
if (col_idx < idx) {
break;
} else {
temp += val[ind] * b[col_idx * b_stride + column_id];
temp += val(ind) * b(col_idx, column_id);
}
}
atomic_add(&storage[threadIdx.x], temp);
@@ -102,51 +103,51 @@ __device__ void spmv_kernel(
}


template <int num_thread_per_worker, bool atomic = false, typename ValueType,
typename IndexType>
template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
typename a_accessor, typename OutputValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void spmv(
const size_type num_rows, const int num_worker_per_row,
const ValueType *__restrict__ val, const IndexType *__restrict__ col,
acc::range<a_accessor> val, const IndexType *__restrict__ col,
const size_type stride, const size_type num_stored_elements_per_row,
const ValueType *__restrict__ b, const size_type b_stride,
ValueType *__restrict__ c, const size_type c_stride)
acc::range<b_accessor> b, OutputValueType *__restrict__ c,
const size_type c_stride)
{
spmv_kernel<num_thread_per_worker, atomic>(
num_rows, num_worker_per_row, val, col, stride,
num_stored_elements_per_row, b, b_stride, c, c_stride,
[](const ValueType &x, const ValueType &y) { return x; });
num_stored_elements_per_row, b, c, c_stride,
[](const OutputValueType &x, const OutputValueType &y) { return x; });
}


template <int num_thread_per_worker, bool atomic = false, typename ValueType,
typename IndexType>
template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
typename a_accessor, typename OutputValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void spmv(
const size_type num_rows, const int num_worker_per_row,
const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
acc::range<a_accessor> alpha, acc::range<a_accessor> val,
const IndexType *__restrict__ col, const size_type stride,
const size_type num_stored_elements_per_row,
const ValueType *__restrict__ b, const size_type b_stride,
const ValueType *__restrict__ beta, ValueType *__restrict__ c,
const size_type num_stored_elements_per_row, acc::range<b_accessor> b,
const OutputValueType *__restrict__ beta, OutputValueType *__restrict__ c,
const size_type c_stride)
{
const ValueType alpha_val = alpha[0];
const ValueType beta_val = beta[0];
const OutputValueType alpha_val = alpha(0);
const OutputValueType beta_val = beta[0];
// Because the atomic operation changes the values of c during computation,
// it can not do the right alpha * a * b + beta * c operation.
// Thus, the cuda kernel only computes alpha * a * b when it uses atomic
// operation.
if (atomic) {
spmv_kernel<num_thread_per_worker, atomic>(
num_rows, num_worker_per_row, val, col, stride,
num_stored_elements_per_row, b, b_stride, c, c_stride,
[&alpha_val](const ValueType &x, const ValueType &y) {
num_stored_elements_per_row, b, c, c_stride,
[&alpha_val](const OutputValueType &x, const OutputValueType &y) {
return alpha_val * x;
});
} else {
spmv_kernel<num_thread_per_worker, atomic>(
num_rows, num_worker_per_row, val, col, stride,
num_stored_elements_per_row, b, b_stride, c, c_stride,
[&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
num_stored_elements_per_row, b, c, c_stride,
[&alpha_val, &beta_val](const OutputValueType &x,
const OutputValueType &y) {
return alpha_val * x + beta_val * y;
});
}
(Diff truncated here: the remaining changed files of this commit are not shown.)
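
In the hunks shown above, the raw ValueType pointers for the matrix values and the input vector become accessor ranges, while the accumulator and the result stay in OutputValueType; the accessors hide the storage precision behind operator(). A stripped-down, host-side sketch of that pattern (editor's illustration, not the actual device kernel; names chosen for clarity):

// Sketch of the mixed-precision ELL SpMV pattern: `val` and `b` stand in for
// accessor ranges whose operator() returns values converted from their
// storage precision, while accumulation happens in OutputValueType.
#include <cstddef>

template <typename MatrixAccessor, typename VectorAccessor,
          typename OutputValueType, typename IndexType>
void ell_spmv_sketch(std::size_t num_rows, MatrixAccessor val,
                     const IndexType *col, std::size_t stride,
                     std::size_t num_stored_elements_per_row,
                     VectorAccessor b, OutputValueType *c,
                     std::size_t c_stride, std::size_t column_id)
{
    for (std::size_t row = 0; row < num_rows; ++row) {
        auto temp = OutputValueType{};  // accumulate in the output precision
        for (std::size_t idx = 0; idx < num_stored_elements_per_row; ++idx) {
            const auto ind = row + idx * stride;
            const auto col_idx = col[ind];
            if (col_idx < static_cast<IndexType>(idx)) {
                break;  // early exit for padded entries, as in the kernel above
            }
            // lower-precision reads are promoted to OutputValueType
            // in the multiply-add
            temp += val(ind) * b(col_idx, column_id);
        }
        c[row * c_stride + column_id] = temp;
    }
}

Instantiating this with accessors that read float storage and OutputValueType = double mimics the float-storage / double-arithmetic case the new kernels target.
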
