Merge mixed precision ELL
This PR adds a true mixed-precision ELL kernel that can be enabled with the
GINKGO_MIXED_PRECISION CMake flag; otherwise, the default
conversion-based mixed-precision support is used.

Related PR: #717
upsj committed May 6, 2021
2 parents 7aef216 + 31fe679 commit 08913b5
Showing 24 changed files with 1,599 additions and 208 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/osx.yml
@@ -15,8 +15,8 @@ jobs:
fail-fast: false
matrix:
config:
- {shared: "ON", build_type: "Debug", name: "omp/debug/shared"}
- {shared: "OFF", build_type: "Release", name: "omp/release/static"}
- {shared: "ON", build_type: "Debug", name: "omp/debug/shared", "mixed": "OFF"}
- {shared: "OFF", build_type: "Release", name: "omp/release/static", "mixed": "ON"}
name: ${{ matrix.config.name }}
runs-on: [macos-latest]

@@ -40,7 +40,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }}
cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }}
make -j8
ctest -j10 --output-on-failure
6 changes: 3 additions & 3 deletions .github/workflows/windows-msvc-cuda.yml
@@ -15,8 +15,8 @@ jobs:
fail-fast: false
matrix:
config:
- {version: "10.2.89.20191206", name: "cuda102/release/shared"}
- {version: "latest", name: "cuda-latest/release/shared"}
- {version: "10.2.89.20191206", name: "cuda102/release/shared", "mixed": "ON"}
- {version: "latest", name: "cuda-latest/release/shared", "mixed": "OFF"}
name: msvc/${{ matrix.config.name }} (only compile)
runs-on: [windows-latest]

@@ -46,5 +46,5 @@ jobs:
$env:PATH="$env:PATH;$pwd\build\windows_shared_library"
mkdir build
cd build
cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_CUDA_ARCHITECTURES=60 ..
cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_CUDA_ARCHITECTURES=60 ..
cmake --build . -j4 --config Release
6 changes: 3 additions & 3 deletions .github/workflows/windows-msvc-ref.yml
@@ -15,8 +15,8 @@ jobs:
fail-fast: false
matrix:
config:
- {shared: "ON", build_type: "Debug", name: "reference/debug/shared"}
- {shared: "OFF", build_type: "Release", name: "reference/release/static"}
- {shared: "ON", build_type: "Debug", name: "reference/debug/shared", "mixed": "ON"}
- {shared: "OFF", build_type: "Release", name: "reference/release/static", "mixed": "OFF"}
# Debug static needs too much storage
# - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"}
name: msvc/${{ matrix.config.name }}
@@ -35,7 +35,7 @@ jobs:
$env:PATH="$env:PATH;$pwd\build\windows_shared_library"
mkdir build
cd build
cmake -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF ..
cmake -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} ..
cmake --build . -j4 --config ${{ matrix.config.build_type }}
ctest . -C ${{ matrix.config.build_type }} --output-on-failure
41 changes: 41 additions & 0 deletions .gitlab-ci.yml
@@ -32,6 +32,7 @@ include:
BUILD_HWLOC: "ON"
FAST_TESTS: "OFF"
DPCPP_SINGLE_MODE: "OFF"
MIXED_PRECISION: "ON"
RUN_EXAMPLES: "OFF"
CONFIG_LOG: "ON"
CXX_FLAGS: ""
@@ -77,6 +78,7 @@ include:
-DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
-DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
-DGINKGO_FAST_TESTS=${FAST_TESTS}
-DGINKGO_MIXED_PRECISION=${MIXED_PRECISION}
-DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES}
-DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG}
-DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE}
@@ -111,6 +113,7 @@ include:
-DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
-DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
-DGINKGO_FAST_TESTS=${FAST_TESTS}
-DGINKGO_MIXED_PRECISION=${MIXED_PRECISION}
-DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG}
-DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE}
-DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES}
@@ -681,6 +684,44 @@ build/nocuda/intel/omp/release/static:
BUILD_TYPE: "Release"
BUILD_SHARED_LIBS: "OFF"

build/nocuda-nomixed/gcc/omp/release/shared:
<<: *default_build_with_test
extends:
- .quick_test_condition
- .use_gko-nocuda-gnu9-llvm8
variables:
<<: *default_variables
BUILD_OMP: "ON"
BUILD_TYPE: "Release"
MIXED_PRECISION: "OFF"

build/nocuda-nomixed/clang/omp/debug/static:
<<: *default_build_with_test
extends:
- .full_test_condition
- .use_gko-nocuda-gnu9-llvm8
variables:
<<: *default_variables
C_COMPILER: "clang"
CXX_COMPILER: "clang++"
BUILD_OMP: "ON"
BUILD_TYPE: "Debug"
BUILD_SHARED_LIBS: "OFF"
MIXED_PRECISION: "OFF"

build/nocuda-nomixed/intel/omp/release/static:
<<: *default_build_with_test
extends:
- .full_test_condition
- .use_gko-nocuda-gnu9-llvm8-intel
variables:
<<: *default_variables
C_COMPILER: "icc"
CXX_COMPILER: "icpc"
BUILD_OMP: "ON"
BUILD_TYPE: "Release"
BUILD_SHARED_LIBS: "OFF"
MIXED_PRECISION: "OFF"

build/dpcpp/cpu/release/static:
<<: *default_build_with_test
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -31,6 +31,7 @@ option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA})
option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP})
option(GINKGO_BUILD_DOC "Generate documentation" OFF)
option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
option(GINKGO_SKIP_DEPENDENCY_UPDATE
"Do not update dependencies each time the project is rebuilt" ON)
option(GINKGO_EXPORT_BUILD_DIR
4 changes: 4 additions & 0 deletions INSTALL.md
@@ -18,6 +18,10 @@ Ginkgo adds the following additional switches to control what is being built:
* `-DGINKGO_DEVEL_TOOLS={ON, OFF}` sets up the build system for development
(requires clang-format, will also download git-cmake-format),
default is `OFF`.
* `-DGINKGO_MIXED_PRECISION={ON, OFF}` compiles true mixed-precision kernels
instead of converting data on the fly, default is `OFF`.
Enabling this flag increases the library size, but improves performance of
mixed-precision kernels.
* `-DGINKGO_BUILD_TESTS={ON, OFF}` builds Ginkgo's tests
(will download googletest), default is `ON`.
* `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests
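
A note on usage (editor's sketch, not part of this commit): the flag only changes how mixed-precision apply calls are carried out; the user-facing API is the same either way. Assuming the usual Ginkgo API of this release, applying a single-precision Ell matrix to double-precision vectors could look as follows; with -DGINKGO_MIXED_PRECISION=ON this goes through the dedicated kernels, otherwise through conversion with implicit temporary storage.

// Sketch only: mixed-precision apply with Ell (names and values illustrative).
#include <ginkgo/ginkgo.hpp>

int main()
{
    auto exec = gko::ReferenceExecutor::create();

    // matrix stored in single precision
    gko::matrix_data<float, gko::int32> data{gko::dim<2>{2, 2}};
    data.nonzeros.emplace_back(0, 0, 1.0f);
    data.nonzeros.emplace_back(0, 1, 2.0f);
    data.nonzeros.emplace_back(1, 1, 3.0f);
    auto mat = gko::matrix::Ell<float, gko::int32>::create(exec);
    mat->read(data);

    // vectors kept in double precision
    auto b = gko::initialize<gko::matrix::Dense<double>>({1.0, 1.0}, exec);
    auto x = gko::initialize<gko::matrix::Dense<double>>({0.0, 0.0}, exec);

    mat->apply(gko::lend(b), gko::lend(x));  // x = mat * b
}
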
29 changes: 25 additions & 4 deletions benchmark/utils/formats.hpp
@@ -59,7 +59,8 @@ namespace formats {


std::string available_format =
"coo, csr, ell, sellp, hybrid, hybrid0, hybrid25, hybrid33, hybrid40, "
"coo, csr, ell, ell-mixed, sellp, hybrid, hybrid0, hybrid25, hybrid33, "
"hybrid40, "
"hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, "
"hybridminstorage"
#ifdef HAS_CUDA
@@ -90,6 +91,8 @@ std::string format_description =
"csrm: Ginkgo's CSR implementation with merge_path strategy.\n"
"ell: Ellpack format according to Bell and Garland: Efficient Sparse "
"Matrix-Vector Multiplication on CUDA.\n"
"ell-mixed: Mixed Precision Ellpack format according to Bell and Garland: "
"Efficient Sparse Matrix-Vector Multiplication on CUDA.\n"
"sellp: Sliced Ellpack uses a default block size of 32.\n"
"hybrid: Hybrid uses ell and coo to represent the matrix.\n"
"hybrid0, hybrid25, hybrid33, hybrid40, hybrid60, hybrid80: Hybrid uses "
@@ -204,6 +207,23 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
{"csrc", READ_MATRIX(csr, std::make_shared<csr::classical>())},
{"coo", read_matrix_from_data<gko::matrix::Coo<etype>>},
{"ell", read_matrix_from_data<gko::matrix::Ell<etype>>},
{"ell-mixed",
[](std::shared_ptr<const gko::Executor> exec,
const gko::matrix_data<etype> &data) {
gko::matrix_data<gko::next_precision<etype>> conv_data;
conv_data.size = data.size;
conv_data.nonzeros.resize(data.nonzeros.size());
auto it = conv_data.nonzeros.begin();
for (auto &el : data.nonzeros) {
it->row = el.row;
it->column = el.column;
it->value = el.value;
++it;
}
auto mat = gko::matrix::Ell<gko::next_precision<etype>>::create(std::move(exec));
mat->read(conv_data);
return mat;
}},
#ifdef HAS_CUDA
#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
{"cusp_csr", read_matrix_from_data<cusp_csr>},
@@ -212,8 +232,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
{"cusp_hybrid", read_matrix_from_data<cusp_hybrid>},
{"cusp_coo", read_matrix_from_data<cusp_coo>},
{"cusp_ell", read_matrix_from_data<cusp_ell>},
#else // CUDA_VERSION >= 11000
// cusp_csr, cusp_coo use the generic ones from CUDA 11
#else // CUDA_VERSION >= 11000
// cusp_csr, cusp_coo use the generic ones from CUDA 11
{"cusp_csr", read_matrix_from_data<cusp_gcsr>},
{"cusp_coo", read_matrix_from_data<cusp_gcoo>},
#endif
@@ -260,7 +280,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
{"hybridminstorage",
READ_MATRIX(hybrid,
std::make_shared<hybrid::minimal_storage_limit>())},
{"sellp", read_matrix_from_data<gko::matrix::Sellp<etype>>}};
{"sellp", read_matrix_from_data<gko::matrix::Sellp<etype>>}
};
// clang-format on


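For context on the "ell-mixed" entry above (editor's note): it copies the input matrix data into gko::next_precision<etype> before building the Ell matrix, so the SpMV benchmark applies a matrix stored in the other precision than its working precision and exercises the mixed-precision path. Assuming the default two-precision configuration of this release, next_precision simply swaps float and double:

// Illustration only, assuming the default precision cycle (no half precision,
// no GINKGO_DPCPP_SINGLE_MODE adjustments).
#include <complex>
#include <type_traits>

#include <ginkgo/ginkgo.hpp>

static_assert(std::is_same<gko::next_precision<float>, double>::value, "");
static_assert(std::is_same<gko::next_precision<double>, float>::value, "");
static_assert(std::is_same<gko::next_precision<std::complex<double>>,
                           std::complex<float>>::value, "");

int main() {}
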
3 changes: 3 additions & 0 deletions cmake/get_info.cmake
@@ -128,6 +128,9 @@ foreach(log_type ${log_types})
ginkgo_print_module_footer(${${log_type}} " Enabled modules:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP")
ginkgo_print_module_footer(${${log_type}} " Enabled features:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_MIXED_PRECISION")
ginkgo_print_module_footer(${${log_type}} " Tests, benchmarks and examples:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING")
63 changes: 32 additions & 31 deletions common/matrix/ell_kernels.hpp.inc
@@ -34,29 +34,30 @@ namespace kernel {
namespace {


template <int num_thread_per_worker, bool atomic, typename ValueType,
typename IndexType, typename Closure>
template <int num_thread_per_worker, bool atomic, typename b_accessor,
typename a_accessor, typename OutputValueType, typename IndexType,
typename Closure>
__device__ void spmv_kernel(
const size_type num_rows, const int num_worker_per_row,
const ValueType *__restrict__ val, const IndexType *__restrict__ col,
acc::range<a_accessor> val, const IndexType *__restrict__ col,
const size_type stride, const size_type num_stored_elements_per_row,
const ValueType *__restrict__ b, const size_type b_stride,
ValueType *__restrict__ c, const size_type c_stride, Closure op)
acc::range<b_accessor> b, OutputValueType *__restrict__ c,
const size_type c_stride, Closure op)
{
const auto tidx = thread::get_thread_id_flat();
const auto column_id = blockIdx.y;
const decltype(tidx) column_id = blockIdx.y;
if (num_thread_per_worker == 1) {
// Specialize the num_thread_per_worker = 1. It doesn't need the shared
// memory, __syncthreads, and atomic_add
if (tidx < num_rows) {
ValueType temp = zero<ValueType>();
auto temp = zero<OutputValueType>();
for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
const auto ind = tidx + idx * stride;
const auto col_idx = col[ind];
if (col_idx < idx) {
break;
} else {
temp += val[ind] * b[col_idx * b_stride + column_id];
temp += val(ind) * b(col_idx, column_id);
}
}
const auto c_ind = tidx * c_stride + column_id;
@@ -68,14 +69,14 @@ __device__ void spmv_kernel(
const auto x = tidx % num_rows;
const auto worker_id = tidx / num_rows;
const auto step_size = num_worker_per_row * num_thread_per_worker;
__shared__ UninitializedArray<ValueType, default_block_size /
num_thread_per_worker>
__shared__ UninitializedArray<
OutputValueType, default_block_size / num_thread_per_worker>
storage;
if (idx_in_worker == 0) {
storage[threadIdx.x] = 0;
}
__syncthreads();
ValueType temp = zero<ValueType>();
auto temp = zero<OutputValueType>();
for (size_type idx =
worker_id * num_thread_per_worker + idx_in_worker;
idx < num_stored_elements_per_row; idx += step_size) {
@@ -84,7 +85,7 @@ __device__ void spmv_kernel(
if (col_idx < idx) {
break;
} else {
temp += val[ind] * b[col_idx * b_stride + column_id];
temp += val(ind) * b(col_idx, column_id);
}
}
atomic_add(&storage[threadIdx.x], temp);
@@ -102,51 +103,51 @@ __device__ void spmv_kernel(
}


template <int num_thread_per_worker, bool atomic = false, typename ValueType,
typename IndexType>
template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
typename a_accessor, typename OutputValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void spmv(
const size_type num_rows, const int num_worker_per_row,
const ValueType *__restrict__ val, const IndexType *__restrict__ col,
acc::range<a_accessor> val, const IndexType *__restrict__ col,
const size_type stride, const size_type num_stored_elements_per_row,
const ValueType *__restrict__ b, const size_type b_stride,
ValueType *__restrict__ c, const size_type c_stride)
acc::range<b_accessor> b, OutputValueType *__restrict__ c,
const size_type c_stride)
{
spmv_kernel<num_thread_per_worker, atomic>(
num_rows, num_worker_per_row, val, col, stride,
num_stored_elements_per_row, b, b_stride, c, c_stride,
[](const ValueType &x, const ValueType &y) { return x; });
num_stored_elements_per_row, b, c, c_stride,
[](const OutputValueType &x, const OutputValueType &y) { return x; });
}


template <int num_thread_per_worker, bool atomic = false, typename ValueType,
typename IndexType>
template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
typename a_accessor, typename OutputValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void spmv(
const size_type num_rows, const int num_worker_per_row,
const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
acc::range<a_accessor> alpha, acc::range<a_accessor> val,
const IndexType *__restrict__ col, const size_type stride,
const size_type num_stored_elements_per_row,
const ValueType *__restrict__ b, const size_type b_stride,
const ValueType *__restrict__ beta, ValueType *__restrict__ c,
const size_type num_stored_elements_per_row, acc::range<b_accessor> b,
const OutputValueType *__restrict__ beta, OutputValueType *__restrict__ c,
const size_type c_stride)
{
const ValueType alpha_val = alpha[0];
const ValueType beta_val = beta[0];
const OutputValueType alpha_val = alpha(0);
const OutputValueType beta_val = beta[0];
// Because the atomic operation changes the values of c during computation,
// it can not do the right alpha * a * b + beta * c operation.
// Thus, the cuda kernel only computes alpha * a * b when it uses atomic
// operation.
if (atomic) {
spmv_kernel<num_thread_per_worker, atomic>(
num_rows, num_worker_per_row, val, col, stride,
num_stored_elements_per_row, b, b_stride, c, c_stride,
[&alpha_val](const ValueType &x, const ValueType &y) {
num_stored_elements_per_row, b, c, c_stride,
[&alpha_val](const OutputValueType &x, const OutputValueType &y) {
return alpha_val * x;
});
} else {
spmv_kernel<num_thread_per_worker, atomic>(
num_rows, num_worker_per_row, val, col, stride,
num_stored_elements_per_row, b, b_stride, c, c_stride,
[&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
num_stored_elements_per_row, b, c, c_stride,
[&alpha_val, &beta_val](const OutputValueType &x,
const OutputValueType &y) {
return alpha_val * x + beta_val * y;
});
}
(Diff truncated here: the remaining changed files of this commit are not shown.)
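
In the hunks shown above, the raw ValueType pointers for the matrix values and the input vector become accessor ranges, while the accumulator and the result stay in OutputValueType; the accessors hide the storage precision behind operator(). A stripped-down, host-side sketch of that pattern (editor's illustration, not the actual device kernel; names chosen for clarity):

// Sketch of the mixed-precision ELL SpMV pattern: `val` and `b` stand in for
// accessor ranges whose operator() returns values converted from their
// storage precision, while accumulation happens in OutputValueType.
#include <cstddef>

template <typename MatrixAccessor, typename VectorAccessor,
          typename OutputValueType, typename IndexType>
void ell_spmv_sketch(std::size_t num_rows, MatrixAccessor val,
                     const IndexType *col, std::size_t stride,
                     std::size_t num_stored_elements_per_row,
                     VectorAccessor b, OutputValueType *c,
                     std::size_t c_stride, std::size_t column_id)
{
    for (std::size_t row = 0; row < num_rows; ++row) {
        auto temp = OutputValueType{};  // accumulate in the output precision
        for (std::size_t idx = 0; idx < num_stored_elements_per_row; ++idx) {
            const auto ind = row + idx * stride;
            const auto col_idx = col[ind];
            if (col_idx < static_cast<IndexType>(idx)) {
                break;  // early exit for padded entries, as in the kernel above
            }
            // lower-precision reads are promoted to OutputValueType
            // in the multiply-add
            temp += val(ind) * b(col_idx, column_id);
        }
        c[row * c_stride + column_id] = temp;
    }
}

Instantiating this with accessors that read float storage and OutputValueType = double mimics the float-storage / double-arithmetic case the new kernels target.
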
