From 92970cd43547eca63886fff36f7a525e61d2fae3 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 9 Jun 2021 19:40:57 +0200
Subject: [PATCH 01/11] add the issue test for cuda/hip reset collision

---
 .gitlab-ci.yml                           |  6 +--
 .gitlab/image.yml                        |  3 +-
 cuda/test/base/CMakeLists.txt            |  1 +
 cuda/test/base/cuda_executor_reset.cu    | 65 ++++++++++++++++++++++++
 hip/test/base/CMakeLists.txt             |  1 +
 hip/test/base/hip_executor_reset.hip.cpp | 65 ++++++++++++++++++++++++
 6 files changed, 136 insertions(+), 5 deletions(-)
 create mode 100644 cuda/test/base/cuda_executor_reset.cu
 create mode 100644 hip/test/base/hip_executor_reset.hip.cpp
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 8bfdcf7e8e8..2cfb0faf44f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -315,9 +315,9 @@ build/cuda91/clang/all/release/shared:
 
 # cuda 9.2 and friends
 build/cuda92/gcc/all/release/shared:
-  <<: *default_build
+  <<: *default_build_with_test
   extends:
-    - .full_test_condition
+    - .quick_test_condition
     - .use_gko-cuda92-gnu7-llvm50-intel2017
   variables:
     <<: *default_variables
@@ -325,7 +325,7 @@ build/cuda92/gcc/all/release/shared:
     BUILD_CUDA: "ON"
     BUILD_HIP: "ON"
     BUILD_TYPE: "Release"
-    CUDA_ARCH: 35
+    CUDA_ARCH: 61
 
 # cuda 10.0 and friends
 # Make sure that our jobs run when using self-installed
diff --git a/.gitlab/image.yml b/.gitlab/image.yml
index 5ea3889d8df..a2afe0fcd53 100644
--- a/.gitlab/image.yml
+++ b/.gitlab/image.yml
@@ -30,8 +30,7 @@
   image: ginkgohub/cuda:92-gnu7-llvm50-intel2017
   tags:
     - private_ci
-    - controller
-    - cpu
+    - nvidia-gpu
 
 .use_gko-cuda100-gnu7-llvm60-intel2018:
   image: ginkgohub/cuda:100-gnu7-llvm60-intel2018
diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt
index cedc2d8d328..bc9124a30bf 100644
--- a/cuda/test/base/CMakeLists.txt
+++ b/cuda/test/base/CMakeLists.txt
@@ -1,5 +1,6 @@
 ginkgo_create_cuda_test(array)
 ginkgo_create_cuda_test(cuda_executor)
+ginkgo_create_cuda_test(cuda_executor_reset)
 if(GINKGO_HAVE_HWLOC)
     find_package(NUMA REQUIRED)
     ginkgo_create_cuda_test(cuda_executor_topology NUMA::NUMA)
diff --git a/cuda/test/base/cuda_executor_reset.cu b/cuda/test/base/cuda_executor_reset.cu
new file mode 100644
index 00000000000..b63318b9afc
--- /dev/null
+++ b/cuda/test/base/cuda_executor_reset.cu
@@ -0,0 +1,65 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+namespace {
+
+
+#define GTEST_ASSERT_NO_EXIT(statement) \
+    ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "")
+
+TEST(DeviceReset, HipCuda)
+{
+    GTEST_ASSERT_NO_EXIT({
+        auto ref = gko::ReferenceExecutor::create();
+        auto hip = gko::HipExecutor::create(0, ref, true);
+        auto cuda = gko::CudaExecutor::create(0, ref, true);
+    });
+}
+
+
+TEST(DeviceReset, CudaHip)
+{
+    GTEST_ASSERT_NO_EXIT({
+        auto ref = gko::ReferenceExecutor::create();
+        auto cuda = gko::CudaExecutor::create(0, ref, true);
+        auto hip = gko::HipExecutor::create(0, ref, true);
+    });
+}
+
+
+}  // namespace
diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt
index c2f694fdf8a..14640d2fb2d 100644
--- a/hip/test/base/CMakeLists.txt
+++ b/hip/test/base/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_hip_test(hip_executor)
+ginkgo_create_hip_test(hip_executor_reset)
 if(GINKGO_HAVE_HWLOC)
     find_package(NUMA REQUIRED)
     ginkgo_create_hip_test(hip_executor_topology NUMA::NUMA)
diff --git a/hip/test/base/hip_executor_reset.hip.cpp b/hip/test/base/hip_executor_reset.hip.cpp
new file mode 100644
index 00000000000..b63318b9afc
--- /dev/null
+++ b/hip/test/base/hip_executor_reset.hip.cpp
@@ -0,0 +1,65 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+namespace {
+
+
+#define GTEST_ASSERT_NO_EXIT(statement) \
+    ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "")
+
+TEST(DeviceReset, HipCuda)
+{
+    GTEST_ASSERT_NO_EXIT({
+        auto ref = gko::ReferenceExecutor::create();
+        auto hip = gko::HipExecutor::create(0, ref, true);
+        auto cuda = gko::CudaExecutor::create(0, ref, true);
+    });
+}
+
+
+TEST(DeviceReset, CudaHip)
+{
+    GTEST_ASSERT_NO_EXIT({
+        auto ref = gko::ReferenceExecutor::create();
+        auto cuda = gko::CudaExecutor::create(0, ref, true);
+        auto hip = gko::HipExecutor::create(0, ref, true);
+    });
+}
+
+
+}  // namespace

From be4d94181887a5f3f09e362b754e06c075c0808d Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 9 Jun 2021 21:07:38 +0200
Subject: [PATCH 02/11] add device

---
 core/device_hooks/CMakeLists.txt      |  5 ++
 cuda/CMakeLists.txt                   |  4 +-
 cuda/base/executor.cpp                |  3 +
 devices/CMakeLists.txt                |  1 +
 devices/cuda/executor.cpp             |  6 --
 devices/device.cpp                    | 54 ++++++++++++++++++
 devices/hip/executor.cpp              |  6 --
 dpcpp/CMakeLists.txt                  |  2 +-
 hip/CMakeLists.txt                    |  2 +-
 hip/base/executor.hip.cpp             |  3 +
 include/ginkgo/core/base/device.hpp   | 82 +++++++++++++++++++++++++++
 include/ginkgo/core/base/executor.hpp | 45 +++++++++------
 include/ginkgo/ginkgo.hpp             |  1 +
 omp/CMakeLists.txt                    |  2 +-
 reference/CMakeLists.txt              |  2 +-
 15 files changed, 182 insertions(+), 36 deletions(-)
 create mode 100644 devices/device.cpp
 create mode 100644 include/ginkgo/core/base/device.hpp

diff --git a/core/device_hooks/CMakeLists.txt b/core/device_hooks/CMakeLists.txt
index fcb370a81a0..a3d00014025 100644
--- a/core/device_hooks/CMakeLists.txt
+++ b/core/device_hooks/CMakeLists.txt
@@ -1,6 +1,7 @@
 if(NOT GINKGO_BUILD_CUDA)
     add_library(ginkgo_cuda
         $<TARGET_OBJECTS:ginkgo_cuda_device>
+        $<TARGET_OBJECTS:ginkgo_device_mutex>
         cuda_hooks.cpp)
     target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device)
     ginkgo_compile_features(ginkgo_cuda)
@@ -11,6 +12,7 @@ endif()
 if (NOT GINKGO_BUILD_DPCPP)
     add_library(ginkgo_dpcpp
         $<TARGET_OBJECTS:ginkgo_dpcpp_device>
+        $<TARGET_OBJECTS:ginkgo_device_mutex>
         dpcpp_hooks.cpp)
     target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device)
     ginkgo_compile_features(ginkgo_dpcpp)
@@ -21,6 +23,7 @@ endif()
 if(NOT GINKGO_BUILD_HIP)
     add_library(ginkgo_hip
         $<TARGET_OBJECTS:ginkgo_hip_device>
+        $<TARGET_OBJECTS:ginkgo_device_mutex>
         hip_hooks.cpp)
     target_link_libraries(ginkgo_hip PUBLIC ginkgo_device)
     ginkgo_compile_features(ginkgo_hip)
@@ -31,6 +34,7 @@ endif()
 if (NOT GINKGO_BUILD_OMP)
     add_library(ginkgo_omp
         $<TARGET_OBJECTS:ginkgo_omp_device>
+        $<TARGET_OBJECTS:ginkgo_device_mutex>
         omp_hooks.cpp)
     ginkgo_compile_features(ginkgo_omp)
     target_link_libraries(ginkgo_omp PRIVATE ginkgo_cuda)
@@ -44,6 +48,7 @@ endif()
 if (NOT GINKGO_BUILD_REFERENCE)
     add_library(ginkgo_reference
         $<TARGET_OBJECTS:ginkgo_reference_device>
+        $<TARGET_OBJECTS:ginkgo_device_mutex>
         reference_hooks.cpp)
     target_link_libraries(ginkgo_reference PUBLIC ginkgo_device)
     ginkgo_compile_features(ginkgo_reference)
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 6f89ca45f79..0a45231aae1 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -64,8 +64,8 @@ find_library(CUSPARSE cusparse
 find_library(CURAND curand
     HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
 
-add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> "")
-set(GKO_CUDA_COMMON_SOURCES    
+add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> $<TARGET_OBJECTS:ginkgo_device_mutex> "")
+set(GKO_CUDA_COMMON_SOURCES
     ../common/components/precision_conversion.cpp
     ../common/matrix/dense_kernels.cpp
     ../common/solver/bicg_kernels.cpp
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 0c7bbef8f10..38bffb3aa10 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/config.hpp>
+#include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
@@ -64,6 +65,8 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
                          alloc_mode),
         [device_id](CudaExecutor *exec) {
             auto device_reset = exec->get_device_reset();
+            std::lock_guard<std::recursive_mutex> guard(
+                device_type::mutex[device_id]);
             delete exec;
             if (!CudaExecutor::get_num_execs(device_id) && device_reset) {
                 cuda::device_guard g(device_id);
diff --git a/devices/CMakeLists.txt b/devices/CMakeLists.txt
index 0c0e80febaf..85632345dc2 100644
--- a/devices/CMakeLists.txt
+++ b/devices/CMakeLists.txt
@@ -17,6 +17,7 @@ function(ginkgo_add_library name)
 endfunction()
 
 ginkgo_add_library(ginkgo_device machine_topology.cpp)
+ginkgo_add_object_library(ginkgo_device_mutex device.cpp)
 ginkgo_install_library(ginkgo_device)
 
 add_subdirectory(cuda)
diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp
index c1efe6e7d40..abdc93a4a08 100644
--- a/devices/cuda/executor.cpp
+++ b/devices/cuda/executor.cpp
@@ -64,10 +64,4 @@ bool CudaExecutor::verify_memory_to(const HipExecutor *dest_exec) const
 }
 
 
-unsigned CudaExecutor::num_execs[max_devices];
-
-
-std::mutex CudaExecutor::mutex[max_devices];
-
-
 }  // namespace gko
diff --git a/devices/device.cpp b/devices/device.cpp
new file mode 100644
index 00000000000..b178de0777f
--- /dev/null
+++ b/devices/device.cpp
@@ -0,0 +1,54 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+
+
+#include <ginkgo/core/base/device.hpp>
+
+
+namespace gko {
+
+
+std::recursive_mutex NvidiaDevice::mutex[max_devices];
+
+int NvidiaDevice::num_execs[max_devices];
+
+
+std::recursive_mutex AmdDevice::mutex[max_devices];
+
+int AmdDevice::num_execs[max_devices];
+
+
+}  // namespace gko
diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp
index 31325fcd15f..0e755209f91 100644
--- a/devices/hip/executor.cpp
+++ b/devices/hip/executor.cpp
@@ -61,10 +61,4 @@ bool HipExecutor::verify_memory_to(const CudaExecutor *dest_exec) const
 }
 
 
-int HipExecutor::num_execs[max_devices];
-
-
-std::mutex HipExecutor::mutex[max_devices];
-
-
 }  // namespace gko
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index e2d476164e8..0289fd889e1 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -9,7 +9,7 @@ set(GINKGO_DPCPP_VERSION ${GINKGO_DPCPP_VERSION} PARENT_SCOPE)
 find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}")
 set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE)
 
-add_library(ginkgo_dpcpp $<TARGET_OBJECTS:ginkgo_dpcpp_device> "")
+add_library(ginkgo_dpcpp $<TARGET_OBJECTS:ginkgo_dpcpp_device> $<TARGET_OBJECTS:ginkgo_device_mutex> "")
 target_sources(ginkgo_dpcpp
     PRIVATE
     base/version.dp.cpp
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 61870d03aa7..3d98ec4acd5 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -239,7 +239,7 @@ set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_
 set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS} ${GINKGO_AMD_ARCH_FLAGS})
 
 set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE)
-hip_add_library(ginkgo_hip $<TARGET_OBJECTS:ginkgo_hip_device> ${GINKGO_HIP_SOURCES}
+hip_add_library(ginkgo_hip $<TARGET_OBJECTS:ginkgo_hip_device> $<TARGET_OBJECTS:ginkgo_device_mutex> ${GINKGO_HIP_SOURCES}
     HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS}
     CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS}
     NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS}
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 1e43fe96549..cdb22ce7ee2 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/config.hpp>
+#include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
@@ -63,6 +64,8 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
         new HipExecutor(device_id, std::move(master), device_reset, alloc_mode),
         [device_id](HipExecutor *exec) {
             auto device_reset = exec->get_device_reset();
+            std::lock_guard<std::recursive_mutex> guard(
+                device_type::mutex[device_id]);
             delete exec;
             if (!HipExecutor::get_num_execs(device_id) && device_reset) {
                 hip::device_guard g(device_id);
diff --git a/include/ginkgo/core/base/device.hpp b/include/ginkgo/core/base/device.hpp
new file mode 100644
index 00000000000..f933d05e1be
--- /dev/null
+++ b/include/ginkgo/core/base/device.hpp
@@ -0,0 +1,82 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_BASE_DEVICE_HPP_
+#define GKO_PUBLIC_CORE_BASE_DEVICE_HPP_
+
+
+#include <array>
+#include <cstdint>
+#include <mutex>
+#include <type_traits>
+
+
+#include <ginkgo/config.hpp>
+
+
+namespace gko {
+
+
+class CudaExecutor;
+
+class HipExecutor;
+
+
+class NvidiaDevice {
+    friend class CudaExecutor;
+#if GINKGO_HIP_PLATFORM_NVCC
+    friend class HipExecutor;
+#endif
+
+private:
+    static constexpr int max_devices = 64;
+    static std::recursive_mutex mutex[max_devices];
+    static int num_execs[max_devices];
+};
+
+
+class AmdDevice {
+// to avoid both GINKGO_HIP_PLATFORM_* zero
+#if !GINKGO_HIP_PLATFORM_NVCC
+    friend class HipExecutor;
+#endif
+
+private:
+    static constexpr int max_devices = 64;
+    static std::recursive_mutex mutex[max_devices];
+    static int num_execs[max_devices];
+};
+
+
+}  // namespace gko
+
+#endif  // GKO_PUBLIC_CORE_BASE_DEVICE_HPP_
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 0d96cb4cd66..3e5c47d5278 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <vector>
 
 
+#include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/machine_topology.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/log/logger.hpp>
@@ -1408,6 +1409,8 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     int get_closest_numa() const { return this->get_exec_info().numa_node; }
 
 protected:
+    using device_type = NvidiaDevice;
+
     void set_gpu_property();
 
     void init_handles();
@@ -1450,20 +1453,23 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 
     static void increase_num_execs(unsigned device_id)
     {
-        std::lock_guard<std::mutex> guard(mutex[device_id]);
-        num_execs[device_id]++;
+        std::lock_guard<std::recursive_mutex> guard(
+            device_type::mutex[device_id]);
+        device_type::num_execs[device_id]++;
     }
 
     static void decrease_num_execs(unsigned device_id)
     {
-        std::lock_guard<std::mutex> guard(mutex[device_id]);
-        num_execs[device_id]--;
+        std::lock_guard<std::recursive_mutex> guard(
+            device_type::mutex[device_id]);
+        device_type::num_execs[device_id]--;
     }
 
     static unsigned get_num_execs(unsigned device_id)
     {
-        std::lock_guard<std::mutex> guard(mutex[device_id]);
-        return num_execs[device_id];
+        std::lock_guard<std::recursive_mutex> guard(
+            device_type::mutex[device_id]);
+        return device_type::num_execs[device_id];
     }
 
     void populate_exec_info(const MachineTopology *mach_topo) override;
@@ -1476,9 +1482,6 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     handle_manager<cublasContext> cublas_handle_;
     handle_manager<cusparseContext> cusparse_handle_;
 
-    static constexpr int max_devices = 64;
-    static unsigned num_execs[max_devices];
-    static std::mutex mutex[max_devices];
     allocation_mode alloc_mode_;
 };
 
@@ -1625,6 +1628,12 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     }
 
 protected:
+#if (GINKGO_HIP_PLATFORM_NVCC == 1)
+    using device_type = NvidiaDevice;
+#else
+    using device_type = AmdDevice;
+#endif
+
     void set_gpu_property();
 
     void init_handles();
@@ -1667,20 +1676,23 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
 
     static void increase_num_execs(int device_id)
     {
-        std::lock_guard<std::mutex> guard(mutex[device_id]);
-        num_execs[device_id]++;
+        std::lock_guard<std::recursive_mutex> guard(
+            device_type::mutex[device_id]);
+        device_type::num_execs[device_id]++;
     }
 
     static void decrease_num_execs(int device_id)
     {
-        std::lock_guard<std::mutex> guard(mutex[device_id]);
-        num_execs[device_id]--;
+        std::lock_guard<std::recursive_mutex> guard(
+            device_type::mutex[device_id]);
+        device_type::num_execs[device_id]--;
     }
 
     static int get_num_execs(int device_id)
     {
-        std::lock_guard<std::mutex> guard(mutex[device_id]);
-        return num_execs[device_id];
+        std::lock_guard<std::recursive_mutex> guard(
+            device_type::mutex[device_id]);
+        return device_type::num_execs[device_id];
     }
 
     void populate_exec_info(const MachineTopology *mach_topo) override;
@@ -1693,9 +1705,6 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     handle_manager<hipblasContext> hipblas_handle_;
     handle_manager<hipsparseContext> hipsparse_handle_;
 
-    static constexpr int max_devices = 64;
-    static int num_execs[max_devices];
-    static std::mutex mutex[max_devices];
     allocation_mode alloc_mode_;
 };
 
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index a757a5a6072..a16163edc8a 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/combination.hpp>
 #include <ginkgo/core/base/composition.hpp>
+#include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index c58709bc904..af680f7a39b 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -1,6 +1,6 @@
 find_package(OpenMP 3.0 REQUIRED)
 
-add_library(ginkgo_omp $<TARGET_OBJECTS:ginkgo_omp_device> "")
+add_library(ginkgo_omp $<TARGET_OBJECTS:ginkgo_omp_device> $<TARGET_OBJECTS:ginkgo_device_mutex> "")
 target_sources(ginkgo_omp
     PRIVATE
     base/version.cpp
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index c008ad1a87c..a66c0323d4f 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_library(ginkgo_reference $<TARGET_OBJECTS:ginkgo_reference_device> "")
+add_library(ginkgo_reference $<TARGET_OBJECTS:ginkgo_reference_device> $<TARGET_OBJECTS:ginkgo_device_mutex> "")
 target_sources(ginkgo_reference
     PRIVATE
     base/version.cpp

From 4f58ceff77eb192a48cfcae22f857bf67e51e58e Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 1 Jul 2021 14:44:05 +0200
Subject: [PATCH 03/11] put device in ginkgo_device not device_mutex

---
 core/device_hooks/CMakeLists.txt    |  5 -----
 cuda/CMakeLists.txt                 |  2 +-
 devices/CMakeLists.txt              |  3 +--
 dpcpp/CMakeLists.txt                |  2 +-
 hip/CMakeLists.txt                  |  2 +-
 include/ginkgo/core/base/device.hpp | 10 ++++++++++
 omp/CMakeLists.txt                  |  2 +-
 reference/CMakeLists.txt            |  2 +-
 8 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/core/device_hooks/CMakeLists.txt b/core/device_hooks/CMakeLists.txt
index a3d00014025..fcb370a81a0 100644
--- a/core/device_hooks/CMakeLists.txt
+++ b/core/device_hooks/CMakeLists.txt
@@ -1,7 +1,6 @@
 if(NOT GINKGO_BUILD_CUDA)
     add_library(ginkgo_cuda
         $<TARGET_OBJECTS:ginkgo_cuda_device>
-        $<TARGET_OBJECTS:ginkgo_device_mutex>
         cuda_hooks.cpp)
     target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device)
     ginkgo_compile_features(ginkgo_cuda)
@@ -12,7 +11,6 @@ endif()
 if (NOT GINKGO_BUILD_DPCPP)
     add_library(ginkgo_dpcpp
         $<TARGET_OBJECTS:ginkgo_dpcpp_device>
-        $<TARGET_OBJECTS:ginkgo_device_mutex>
         dpcpp_hooks.cpp)
     target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device)
     ginkgo_compile_features(ginkgo_dpcpp)
@@ -23,7 +21,6 @@ endif()
 if(NOT GINKGO_BUILD_HIP)
     add_library(ginkgo_hip
         $<TARGET_OBJECTS:ginkgo_hip_device>
-        $<TARGET_OBJECTS:ginkgo_device_mutex>
         hip_hooks.cpp)
     target_link_libraries(ginkgo_hip PUBLIC ginkgo_device)
     ginkgo_compile_features(ginkgo_hip)
@@ -34,7 +31,6 @@ endif()
 if (NOT GINKGO_BUILD_OMP)
     add_library(ginkgo_omp
         $<TARGET_OBJECTS:ginkgo_omp_device>
-        $<TARGET_OBJECTS:ginkgo_device_mutex>
         omp_hooks.cpp)
     ginkgo_compile_features(ginkgo_omp)
     target_link_libraries(ginkgo_omp PRIVATE ginkgo_cuda)
@@ -48,7 +44,6 @@ endif()
 if (NOT GINKGO_BUILD_REFERENCE)
     add_library(ginkgo_reference
         $<TARGET_OBJECTS:ginkgo_reference_device>
-        $<TARGET_OBJECTS:ginkgo_device_mutex>
         reference_hooks.cpp)
     target_link_libraries(ginkgo_reference PUBLIC ginkgo_device)
     ginkgo_compile_features(ginkgo_reference)
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 0a45231aae1..2e25435e5c5 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -64,7 +64,7 @@ find_library(CUSPARSE cusparse
 find_library(CURAND curand
     HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
 
-add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> $<TARGET_OBJECTS:ginkgo_device_mutex> "")
+add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> "")
 set(GKO_CUDA_COMMON_SOURCES
     ../common/components/precision_conversion.cpp
     ../common/matrix/dense_kernels.cpp
diff --git a/devices/CMakeLists.txt b/devices/CMakeLists.txt
index 85632345dc2..09797aafe49 100644
--- a/devices/CMakeLists.txt
+++ b/devices/CMakeLists.txt
@@ -16,8 +16,7 @@ function(ginkgo_add_library name)
     set_target_properties(${name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endfunction()
 
-ginkgo_add_library(ginkgo_device machine_topology.cpp)
-ginkgo_add_object_library(ginkgo_device_mutex device.cpp)
+ginkgo_add_library(ginkgo_device machine_topology.cpp device.cpp)
 ginkgo_install_library(ginkgo_device)
 
 add_subdirectory(cuda)
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 0289fd889e1..e2d476164e8 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -9,7 +9,7 @@ set(GINKGO_DPCPP_VERSION ${GINKGO_DPCPP_VERSION} PARENT_SCOPE)
 find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}")
 set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE)
 
-add_library(ginkgo_dpcpp $<TARGET_OBJECTS:ginkgo_dpcpp_device> $<TARGET_OBJECTS:ginkgo_device_mutex> "")
+add_library(ginkgo_dpcpp $<TARGET_OBJECTS:ginkgo_dpcpp_device> "")
 target_sources(ginkgo_dpcpp
     PRIVATE
     base/version.dp.cpp
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 3d98ec4acd5..61870d03aa7 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -239,7 +239,7 @@ set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_
 set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS} ${GINKGO_AMD_ARCH_FLAGS})
 
 set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE)
-hip_add_library(ginkgo_hip $<TARGET_OBJECTS:ginkgo_hip_device> $<TARGET_OBJECTS:ginkgo_device_mutex> ${GINKGO_HIP_SOURCES}
+hip_add_library(ginkgo_hip $<TARGET_OBJECTS:ginkgo_hip_device> ${GINKGO_HIP_SOURCES}
     HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS}
     CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS}
     NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS}
diff --git a/include/ginkgo/core/base/device.hpp b/include/ginkgo/core/base/device.hpp
index f933d05e1be..01f0bd7107d 100644
--- a/include/ginkgo/core/base/device.hpp
+++ b/include/ginkgo/core/base/device.hpp
@@ -51,8 +51,13 @@ class CudaExecutor;
 class HipExecutor;
 
 
+/**
+ * NvidiaDevice handles the number of executor on Nvidia devices and have the
+ * corresponding recursive_mutex.
+ */
 class NvidiaDevice {
     friend class CudaExecutor;
+// If Hip is compiled for NVCC, give NvidiaDevice's permission to HipExecutor
 #if GINKGO_HIP_PLATFORM_NVCC
     friend class HipExecutor;
 #endif
@@ -64,8 +69,13 @@ class NvidiaDevice {
 };
 
 
+/**
+ * AmdDevice handles the number of executor on Amd devices and have the
+ * corresponding recursive_mutex.
+ */
 class AmdDevice {
 // to avoid both GINKGO_HIP_PLATFORM_* zero
+// If Hip isn't compiled for NVCC, give AmdDevice's permission to HipExecutor
 #if !GINKGO_HIP_PLATFORM_NVCC
     friend class HipExecutor;
 #endif
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index af680f7a39b..c58709bc904 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -1,6 +1,6 @@
 find_package(OpenMP 3.0 REQUIRED)
 
-add_library(ginkgo_omp $<TARGET_OBJECTS:ginkgo_omp_device> $<TARGET_OBJECTS:ginkgo_device_mutex> "")
+add_library(ginkgo_omp $<TARGET_OBJECTS:ginkgo_omp_device> "")
 target_sources(ginkgo_omp
     PRIVATE
     base/version.cpp
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index a66c0323d4f..c008ad1a87c 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_library(ginkgo_reference $<TARGET_OBJECTS:ginkgo_reference_device> $<TARGET_OBJECTS:ginkgo_device_mutex> "")
+add_library(ginkgo_reference $<TARGET_OBJECTS:ginkgo_reference_device> "")
 target_sources(ginkgo_reference
     PRIVATE
     base/version.cpp

From 0596b92265c690246c407ae3d9869feb02d497aa Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 1 Jul 2021 18:06:11 +0200
Subject: [PATCH 04/11] use static getter to avoid the static issue in MSVC

---
 cuda/base/executor.cpp                |  2 +-
 devices/CMakeLists.txt                |  2 +-
 devices/device.cpp                    | 54 --------------------------
 hip/base/executor.hip.cpp             |  2 +-
 include/ginkgo/core/base/device.hpp   | 56 +++++++++++++++++++++++++--
 include/ginkgo/core/base/executor.hpp | 24 ++++++------
 6 files changed, 67 insertions(+), 73 deletions(-)
 delete mode 100644 devices/device.cpp

diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 38bffb3aa10..47a712c3eba 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -66,7 +66,7 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
         [device_id](CudaExecutor *exec) {
             auto device_reset = exec->get_device_reset();
             std::lock_guard<std::recursive_mutex> guard(
-                device_type::mutex[device_id]);
+                device_type::get_mutex(device_id));
             delete exec;
             if (!CudaExecutor::get_num_execs(device_id) && device_reset) {
                 cuda::device_guard g(device_id);
diff --git a/devices/CMakeLists.txt b/devices/CMakeLists.txt
index 09797aafe49..0c0e80febaf 100644
--- a/devices/CMakeLists.txt
+++ b/devices/CMakeLists.txt
@@ -16,7 +16,7 @@ function(ginkgo_add_library name)
     set_target_properties(${name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endfunction()
 
-ginkgo_add_library(ginkgo_device machine_topology.cpp device.cpp)
+ginkgo_add_library(ginkgo_device machine_topology.cpp)
 ginkgo_install_library(ginkgo_device)
 
 add_subdirectory(cuda)
diff --git a/devices/device.cpp b/devices/device.cpp
deleted file mode 100644
index b178de0777f..00000000000
--- a/devices/device.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2021, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <atomic>
-#include <memory>
-#include <mutex>
-
-
-#include <ginkgo/core/base/device.hpp>
-
-
-namespace gko {
-
-
-std::recursive_mutex NvidiaDevice::mutex[max_devices];
-
-int NvidiaDevice::num_execs[max_devices];
-
-
-std::recursive_mutex AmdDevice::mutex[max_devices];
-
-int AmdDevice::num_execs[max_devices];
-
-
-}  // namespace gko
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index cdb22ce7ee2..36df8fa7152 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -65,7 +65,7 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
         [device_id](HipExecutor *exec) {
             auto device_reset = exec->get_device_reset();
             std::lock_guard<std::recursive_mutex> guard(
-                device_type::mutex[device_id]);
+                device_type::get_mutex(device_id));
             delete exec;
             if (!HipExecutor::get_num_execs(device_id) && device_reset) {
                 hip::device_guard g(device_id);
diff --git a/include/ginkgo/core/base/device.hpp b/include/ginkgo/core/base/device.hpp
index 01f0bd7107d..f60fecae736 100644
--- a/include/ginkgo/core/base/device.hpp
+++ b/include/ginkgo/core/base/device.hpp
@@ -63,9 +63,33 @@ class NvidiaDevice {
 #endif
 
 private:
+    /**
+     * get_mutex gets the static mutex reference at i.
+     *
+     * @param i  index of mutex
+     *
+     * @return recursive_mutex reference
+     */
+    static std::recursive_mutex &get_mutex(int i)
+    {
+        static std::recursive_mutex mutex[max_devices];
+        return mutex[i];
+    }
+
+    /**
+     * get_num_execs gets the static num_execs reference at i.
+     *
+     * @param i  index of num_execs
+     *
+     * @return int reference
+     */
+    static int &get_num_execs(int i)
+    {
+        static int num_execs[max_devices];
+        return num_execs[i];
+    }
+
     static constexpr int max_devices = 64;
-    static std::recursive_mutex mutex[max_devices];
-    static int num_execs[max_devices];
 };
 
 
@@ -81,9 +105,33 @@ class AmdDevice {
 #endif
 
 private:
+    /**
+     * get_mutex gets the static mutex reference at i.
+     *
+     * @param i  index of mutex
+     *
+     * @return recursive_mutex reference
+     */
+    static std::recursive_mutex &get_mutex(int i)
+    {
+        static std::recursive_mutex mutex[max_devices];
+        return mutex[i];
+    }
+
+    /**
+     * get_num_execs gets the static num_execs reference at i.
+     *
+     * @param i  index of num_execs
+     *
+     * @return int reference
+     */
+    static int &get_num_execs(int i)
+    {
+        static int num_execs[max_devices];
+        return num_execs[i];
+    }
+
     static constexpr int max_devices = 64;
-    static std::recursive_mutex mutex[max_devices];
-    static int num_execs[max_devices];
 };
 
 
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 3e5c47d5278..9e7f12ab74f 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1454,22 +1454,22 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     static void increase_num_execs(unsigned device_id)
     {
         std::lock_guard<std::recursive_mutex> guard(
-            device_type::mutex[device_id]);
-        device_type::num_execs[device_id]++;
+            device_type::get_mutex(device_id));
+        device_type::get_num_execs(device_id)++;
     }
 
     static void decrease_num_execs(unsigned device_id)
     {
         std::lock_guard<std::recursive_mutex> guard(
-            device_type::mutex[device_id]);
-        device_type::num_execs[device_id]--;
+            device_type::get_mutex(device_id));
+        device_type::get_num_execs(device_id)--;
     }
 
     static unsigned get_num_execs(unsigned device_id)
     {
         std::lock_guard<std::recursive_mutex> guard(
-            device_type::mutex[device_id]);
-        return device_type::num_execs[device_id];
+            device_type::get_mutex(device_id));
+        return device_type::get_num_execs(device_id);
     }
 
     void populate_exec_info(const MachineTopology *mach_topo) override;
@@ -1677,22 +1677,22 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     static void increase_num_execs(int device_id)
     {
         std::lock_guard<std::recursive_mutex> guard(
-            device_type::mutex[device_id]);
-        device_type::num_execs[device_id]++;
+            device_type::get_mutex(device_id));
+        device_type::get_num_execs(device_id)++;
     }
 
     static void decrease_num_execs(int device_id)
     {
         std::lock_guard<std::recursive_mutex> guard(
-            device_type::mutex[device_id]);
-        device_type::num_execs[device_id]--;
+            device_type::get_mutex(device_id));
+        device_type::get_num_execs(device_id)--;
     }
 
     static int get_num_execs(int device_id)
     {
         std::lock_guard<std::recursive_mutex> guard(
-            device_type::mutex[device_id]);
-        return device_type::num_execs[device_id];
+            device_type::get_mutex(device_id));
+        return device_type::get_num_execs(device_id);
     }
 
     void populate_exec_info(const MachineTopology *mach_topo) override;

From 9a2c6d1010dc9b398d2e907790179e893a7790b7 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 8 Jul 2021 17:01:45 +0200
Subject: [PATCH 05/11] add cudacuda and hiphip test

Co-authored-by: Pratik Nayak <pratikvn@protonmail.com>
---
 cuda/test/base/cuda_executor_reset.cu    | 33 ++++++++++++++++++++++++
 hip/test/base/hip_executor_reset.hip.cpp | 33 ++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/cuda/test/base/cuda_executor_reset.cu b/cuda/test/base/cuda_executor_reset.cu
index b63318b9afc..4807354bdea 100644
--- a/cuda/test/base/cuda_executor_reset.cu
+++ b/cuda/test/base/cuda_executor_reset.cu
@@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include <thread>
+
+
 #include <gtest/gtest.h>
 
 
@@ -62,4 +65,34 @@ TEST(DeviceReset, CudaHip)
 }
 
 
+template <typename Executor>
+void func()
+{
+    auto ref = gko::ReferenceExecutor::create();
+    auto exec = Executor::create(0, ref, true);
+}
+
+
+TEST(DeviceReset, CudaCuda)
+{
+    GTEST_ASSERT_NO_EXIT({
+        std::thread t1(func<gko::CudaExecutor>);
+        std::thread t2(func<gko::CudaExecutor>);
+        t1.join();
+        t2.join();
+    });
+}
+
+
+TEST(DeviceReset, HipHip)
+{
+    GTEST_ASSERT_NO_EXIT({
+        std::thread t1(func<gko::CudaExecutor>);
+        std::thread t2(func<gko::CudaExecutor>);
+        t1.join();
+        t2.join();
+    });
+}
+
+
 }  // namespace
diff --git a/hip/test/base/hip_executor_reset.hip.cpp b/hip/test/base/hip_executor_reset.hip.cpp
index b63318b9afc..4807354bdea 100644
--- a/hip/test/base/hip_executor_reset.hip.cpp
+++ b/hip/test/base/hip_executor_reset.hip.cpp
@@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include <thread>
+
+
 #include <gtest/gtest.h>
 
 
@@ -62,4 +65,34 @@ TEST(DeviceReset, CudaHip)
 }
 
 
+template <typename Executor>
+void func()
+{
+    auto ref = gko::ReferenceExecutor::create();
+    auto exec = Executor::create(0, ref, true);
+}
+
+
+TEST(DeviceReset, CudaCuda)
+{
+    GTEST_ASSERT_NO_EXIT({
+        std::thread t1(func<gko::CudaExecutor>);
+        std::thread t2(func<gko::CudaExecutor>);
+        t1.join();
+        t2.join();
+    });
+}
+
+
+TEST(DeviceReset, HipHip)
+{
+    GTEST_ASSERT_NO_EXIT({
+        std::thread t1(func<gko::CudaExecutor>);
+        std::thread t2(func<gko::CudaExecutor>);
+        t1.join();
+        t2.join();
+    });
+}
+
+
 }  // namespace

From e08731b1f50412027acfd1e61bcb215ba9a8485c Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 9 Jul 2021 15:07:12 +0200
Subject: [PATCH 06/11] ensure ops affected by Reset after increase_exec

---
 cuda/test/base/CMakeLists.txt                 |  2 +-
 ...cutor_reset.cu => cuda_executor_reset.cpp} | 19 ++++---------------
 hip/test/base/CMakeLists.txt                  |  2 +-
 ...r_reset.hip.cpp => hip_executor_reset.cpp} | 19 ++++---------------
 include/ginkgo/core/base/executor.hpp         | 12 ++++++++++--
 5 files changed, 20 insertions(+), 34 deletions(-)
 rename cuda/test/base/{cuda_executor_reset.cu => cuda_executor_reset.cpp} (87%)
 rename hip/test/base/{hip_executor_reset.hip.cpp => hip_executor_reset.cpp} (87%)

diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt
index bc9124a30bf..8bd6194f134 100644
--- a/cuda/test/base/CMakeLists.txt
+++ b/cuda/test/base/CMakeLists.txt
@@ -1,6 +1,6 @@
 ginkgo_create_cuda_test(array)
 ginkgo_create_cuda_test(cuda_executor)
-ginkgo_create_cuda_test(cuda_executor_reset)
+ginkgo_create_test(cuda_executor_reset)
 if(GINKGO_HAVE_HWLOC)
     find_package(NUMA REQUIRED)
     ginkgo_create_cuda_test(cuda_executor_topology NUMA::NUMA)
diff --git a/cuda/test/base/cuda_executor_reset.cu b/cuda/test/base/cuda_executor_reset.cpp
similarity index 87%
rename from cuda/test/base/cuda_executor_reset.cu
rename to cuda/test/base/cuda_executor_reset.cpp
index 4807354bdea..5d9169be808 100644
--- a/cuda/test/base/cuda_executor_reset.cu
+++ b/cuda/test/base/cuda_executor_reset.cpp
@@ -45,6 +45,7 @@ namespace {
 #define GTEST_ASSERT_NO_EXIT(statement) \
     ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "")
 
+
 TEST(DeviceReset, HipCuda)
 {
     GTEST_ASSERT_NO_EXIT({
@@ -65,30 +66,18 @@ TEST(DeviceReset, CudaHip)
 }
 
 
-template <typename Executor>
 void func()
 {
     auto ref = gko::ReferenceExecutor::create();
-    auto exec = Executor::create(0, ref, true);
+    auto exec = gko::CudaExecutor::create(0, ref, true);
 }
 
 
 TEST(DeviceReset, CudaCuda)
 {
     GTEST_ASSERT_NO_EXIT({
-        std::thread t1(func<gko::CudaExecutor>);
-        std::thread t2(func<gko::CudaExecutor>);
-        t1.join();
-        t2.join();
-    });
-}
-
-
-TEST(DeviceReset, HipHip)
-{
-    GTEST_ASSERT_NO_EXIT({
-        std::thread t1(func<gko::CudaExecutor>);
-        std::thread t2(func<gko::CudaExecutor>);
+        std::thread t1(func);
+        std::thread t2(func);
         t1.join();
         t2.join();
     });
diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt
index 14640d2fb2d..dd91d9e3c5f 100644
--- a/hip/test/base/CMakeLists.txt
+++ b/hip/test/base/CMakeLists.txt
@@ -1,5 +1,5 @@
 ginkgo_create_hip_test(hip_executor)
-ginkgo_create_hip_test(hip_executor_reset)
+ginkgo_create_test(hip_executor_reset)
 if(GINKGO_HAVE_HWLOC)
     find_package(NUMA REQUIRED)
     ginkgo_create_hip_test(hip_executor_topology NUMA::NUMA)
diff --git a/hip/test/base/hip_executor_reset.hip.cpp b/hip/test/base/hip_executor_reset.cpp
similarity index 87%
rename from hip/test/base/hip_executor_reset.hip.cpp
rename to hip/test/base/hip_executor_reset.cpp
index 4807354bdea..33a9c8040c0 100644
--- a/hip/test/base/hip_executor_reset.hip.cpp
+++ b/hip/test/base/hip_executor_reset.cpp
@@ -45,6 +45,7 @@ namespace {
 #define GTEST_ASSERT_NO_EXIT(statement) \
     ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "")
 
+
 TEST(DeviceReset, HipCuda)
 {
     GTEST_ASSERT_NO_EXIT({
@@ -65,30 +66,18 @@ TEST(DeviceReset, CudaHip)
 }
 
 
-template <typename Executor>
 void func()
 {
     auto ref = gko::ReferenceExecutor::create();
-    auto exec = Executor::create(0, ref, true);
-}
-
-
-TEST(DeviceReset, CudaCuda)
-{
-    GTEST_ASSERT_NO_EXIT({
-        std::thread t1(func<gko::CudaExecutor>);
-        std::thread t2(func<gko::CudaExecutor>);
-        t1.join();
-        t2.join();
-    });
+    auto exec = gko::HipExecutor::create(0, ref, true);
 }
 
 
 TEST(DeviceReset, HipHip)
 {
     GTEST_ASSERT_NO_EXIT({
-        std::thread t1(func<gko::CudaExecutor>);
-        std::thread t2(func<gko::CudaExecutor>);
+        std::thread t1(func);
+        std::thread t2(func);
         t1.join();
         t2.join();
     });
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 9e7f12ab74f..fb484871f9b 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1430,9 +1430,13 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
             MachineTopology::get_instance()->bind_to_pus(
                 this->get_closest_pus());
         }
+        // it only gets attribute from device, so it should not be affected by
+        // DeviceReset.
         this->set_gpu_property();
-        this->init_handles();
+        // increase the number of executor before any operations may be affected
+        // by DeviceReset.
         increase_num_execs(this->get_exec_info().device_id);
+        this->init_handles();
     }
 
     void *raw_alloc(size_type size) const override;
@@ -1653,9 +1657,13 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
             MachineTopology::get_instance()->bind_to_pus(
                 this->get_closest_pus());
         }
+        // it only gets attribute from device, so it should not be affected by
+        // DeviceReset.
         this->set_gpu_property();
-        this->init_handles();
+        // increase the number of executor before any operations may be affected
+        // by DeviceReset.
         increase_num_execs(this->get_exec_info().device_id);
+        this->init_handles();
     }
 
     void *raw_alloc(size_type size) const override;

From 23d3197f2dee0f671df564ace05acccef49057e3 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 29 Jul 2021 08:56:36 +0200
Subject: [PATCH 07/11] use device_class and move definition to cpp link thread
 in test

Co-authored-by: Terry Cojean <terry.cojean@kit.edu>
---
 cuda/base/executor.cpp                |  2 +-
 cuda/test/base/CMakeLists.txt         |  2 +-
 devices/CMakeLists.txt                |  2 +-
 devices/device.cpp                    | 68 +++++++++++++++++++++++++++
 hip/base/executor.hip.cpp             |  2 +-
 hip/test/base/CMakeLists.txt          |  2 +-
 include/ginkgo/core/base/device.hpp   | 24 ++--------
 include/ginkgo/core/base/executor.hpp | 30 ++++++------
 8 files changed, 92 insertions(+), 40 deletions(-)
 create mode 100644 devices/device.cpp

diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 47a712c3eba..4091a251aeb 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -66,7 +66,7 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
         [device_id](CudaExecutor *exec) {
             auto device_reset = exec->get_device_reset();
             std::lock_guard<std::recursive_mutex> guard(
-                device_type::get_mutex(device_id));
+                device_class::get_mutex(device_id));
             delete exec;
             if (!CudaExecutor::get_num_execs(device_id) && device_reset) {
                 cuda::device_guard g(device_id);
diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt
index 8bd6194f134..c23efefa8ed 100644
--- a/cuda/test/base/CMakeLists.txt
+++ b/cuda/test/base/CMakeLists.txt
@@ -1,6 +1,6 @@
 ginkgo_create_cuda_test(array)
 ginkgo_create_cuda_test(cuda_executor)
-ginkgo_create_test(cuda_executor_reset)
+ginkgo_create_thread_test(cuda_executor_reset)
 if(GINKGO_HAVE_HWLOC)
     find_package(NUMA REQUIRED)
     ginkgo_create_cuda_test(cuda_executor_topology NUMA::NUMA)
diff --git a/devices/CMakeLists.txt b/devices/CMakeLists.txt
index 0c0e80febaf..09797aafe49 100644
--- a/devices/CMakeLists.txt
+++ b/devices/CMakeLists.txt
@@ -16,7 +16,7 @@ function(ginkgo_add_library name)
     set_target_properties(${name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endfunction()
 
-ginkgo_add_library(ginkgo_device machine_topology.cpp)
+ginkgo_add_library(ginkgo_device machine_topology.cpp device.cpp)
 ginkgo_install_library(ginkgo_device)
 
 add_subdirectory(cuda)
diff --git a/devices/device.cpp b/devices/device.cpp
new file mode 100644
index 00000000000..79f7da20077
--- /dev/null
+++ b/devices/device.cpp
@@ -0,0 +1,68 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <memory>
+#include <mutex>
+
+
+#include <ginkgo/core/base/device.hpp>
+
+
+namespace gko {
+
+
+std::recursive_mutex &NvidiaDevice::get_mutex(int i)
+{
+    static std::recursive_mutex mutex[max_devices];
+    return mutex[i];
+}
+
+int &NvidiaDevice::get_num_execs(int i)
+{
+    static int num_execs[max_devices];
+    return num_execs[i];
+}
+
+std::recursive_mutex &AmdDevice::get_mutex(int i)
+{
+    static std::recursive_mutex mutex[max_devices];
+    return mutex[i];
+}
+
+int &AmdDevice::get_num_execs(int i)
+{
+    static int num_execs[max_devices];
+    return num_execs[i];
+}
+
+
+}  // namespace gko
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 36df8fa7152..d0d64110289 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -65,7 +65,7 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
         [device_id](HipExecutor *exec) {
             auto device_reset = exec->get_device_reset();
             std::lock_guard<std::recursive_mutex> guard(
-                device_type::get_mutex(device_id));
+                device_class::get_mutex(device_id));
             delete exec;
             if (!HipExecutor::get_num_execs(device_id) && device_reset) {
                 hip::device_guard g(device_id);
diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt
index dd91d9e3c5f..4529a40c862 100644
--- a/hip/test/base/CMakeLists.txt
+++ b/hip/test/base/CMakeLists.txt
@@ -1,5 +1,5 @@
 ginkgo_create_hip_test(hip_executor)
-ginkgo_create_test(hip_executor_reset)
+ginkgo_create_thread_test(hip_executor_reset)
 if(GINKGO_HAVE_HWLOC)
     find_package(NUMA REQUIRED)
     ginkgo_create_hip_test(hip_executor_topology NUMA::NUMA)
diff --git a/include/ginkgo/core/base/device.hpp b/include/ginkgo/core/base/device.hpp
index f60fecae736..025927ee367 100644
--- a/include/ginkgo/core/base/device.hpp
+++ b/include/ginkgo/core/base/device.hpp
@@ -70,11 +70,7 @@ class NvidiaDevice {
      *
      * @return recursive_mutex reference
      */
-    static std::recursive_mutex &get_mutex(int i)
-    {
-        static std::recursive_mutex mutex[max_devices];
-        return mutex[i];
-    }
+    static std::recursive_mutex &get_mutex(int i);
 
     /**
      * get_num_execs gets the static num_execs reference at i.
@@ -83,11 +79,7 @@ class NvidiaDevice {
      *
      * @return int reference
      */
-    static int &get_num_execs(int i)
-    {
-        static int num_execs[max_devices];
-        return num_execs[i];
-    }
+    static int &get_num_execs(int i);
 
     static constexpr int max_devices = 64;
 };
@@ -112,11 +104,7 @@ class AmdDevice {
      *
      * @return recursive_mutex reference
      */
-    static std::recursive_mutex &get_mutex(int i)
-    {
-        static std::recursive_mutex mutex[max_devices];
-        return mutex[i];
-    }
+    static std::recursive_mutex &get_mutex(int i);
 
     /**
      * get_num_execs gets the static num_execs reference at i.
@@ -125,11 +113,7 @@ class AmdDevice {
      *
      * @return int reference
      */
-    static int &get_num_execs(int i)
-    {
-        static int num_execs[max_devices];
-        return num_execs[i];
-    }
+    static int &get_num_execs(int i);
 
     static constexpr int max_devices = 64;
 };
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index fb484871f9b..8c172bc4ed9 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1409,7 +1409,7 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     int get_closest_numa() const { return this->get_exec_info().numa_node; }
 
 protected:
-    using device_type = NvidiaDevice;
+    using device_class = NvidiaDevice;
 
     void set_gpu_property();
 
@@ -1458,22 +1458,22 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     static void increase_num_execs(unsigned device_id)
     {
         std::lock_guard<std::recursive_mutex> guard(
-            device_type::get_mutex(device_id));
-        device_type::get_num_execs(device_id)++;
+            device_class::get_mutex(device_id));
+        device_class::get_num_execs(device_id)++;
     }
 
     static void decrease_num_execs(unsigned device_id)
     {
         std::lock_guard<std::recursive_mutex> guard(
-            device_type::get_mutex(device_id));
-        device_type::get_num_execs(device_id)--;
+            device_class::get_mutex(device_id));
+        device_class::get_num_execs(device_id)--;
     }
 
     static unsigned get_num_execs(unsigned device_id)
     {
         std::lock_guard<std::recursive_mutex> guard(
-            device_type::get_mutex(device_id));
-        return device_type::get_num_execs(device_id);
+            device_class::get_mutex(device_id));
+        return device_class::get_num_execs(device_id);
     }
 
     void populate_exec_info(const MachineTopology *mach_topo) override;
@@ -1633,9 +1633,9 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
 
 protected:
 #if (GINKGO_HIP_PLATFORM_NVCC == 1)
-    using device_type = NvidiaDevice;
+    using device_class = NvidiaDevice;
 #else
-    using device_type = AmdDevice;
+    using device_class = AmdDevice;
 #endif
 
     void set_gpu_property();
@@ -1685,22 +1685,22 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     static void increase_num_execs(int device_id)
     {
         std::lock_guard<std::recursive_mutex> guard(
-            device_type::get_mutex(device_id));
-        device_type::get_num_execs(device_id)++;
+            device_class::get_mutex(device_id));
+        device_class::get_num_execs(device_id)++;
     }
 
     static void decrease_num_execs(int device_id)
     {
         std::lock_guard<std::recursive_mutex> guard(
-            device_type::get_mutex(device_id));
-        device_type::get_num_execs(device_id)--;
+            device_class::get_mutex(device_id));
+        device_class::get_num_execs(device_id)--;
     }
 
     static int get_num_execs(int device_id)
     {
         std::lock_guard<std::recursive_mutex> guard(
-            device_type::get_mutex(device_id));
-        return device_type::get_num_execs(device_id);
+            device_class::get_mutex(device_id));
+        return device_class::get_num_execs(device_id);
     }
 
     void populate_exec_info(const MachineTopology *mach_topo) override;

From 2da69cd8592e9437e66cbb73c8f5d27c1bef64c6 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 29 Jul 2021 17:23:30 +0200
Subject: [PATCH 08/11] handle deleter in create not destructor of class it
 makes us keep mutex not recursive_mutex

Co-authored-by: Terry Cojean <terry.cojean@kit.edu>
---
 cuda/base/executor.cpp                |  6 ++++--
 devices/device.cpp                    |  8 ++++----
 hip/base/executor.hip.cpp             |  6 ++++--
 include/ginkgo/core/base/device.hpp   |  4 ++--
 include/ginkgo/core/base/executor.hpp | 22 ++++++----------------
 5 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 4091a251aeb..2898ff3126c 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -65,10 +65,12 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
                          alloc_mode),
         [device_id](CudaExecutor *exec) {
             auto device_reset = exec->get_device_reset();
-            std::lock_guard<std::recursive_mutex> guard(
+            std::lock_guard<std::mutex> guard(
                 device_class::get_mutex(device_id));
             delete exec;
-            if (!CudaExecutor::get_num_execs(device_id) && device_reset) {
+            auto &num_execs = device_class::get_num_execs(device_id);
+            num_execs--;
+            if (!num_execs && device_reset) {
                 cuda::device_guard g(device_id);
                 cudaDeviceReset();
             }
diff --git a/devices/device.cpp b/devices/device.cpp
index 79f7da20077..c70fdaaffb8 100644
--- a/devices/device.cpp
+++ b/devices/device.cpp
@@ -40,9 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace gko {
 
 
-std::recursive_mutex &NvidiaDevice::get_mutex(int i)
+std::mutex &NvidiaDevice::get_mutex(int i)
 {
-    static std::recursive_mutex mutex[max_devices];
+    static std::mutex mutex[max_devices];
     return mutex[i];
 }
 
@@ -52,9 +52,9 @@ int &NvidiaDevice::get_num_execs(int i)
     return num_execs[i];
 }
 
-std::recursive_mutex &AmdDevice::get_mutex(int i)
+std::mutex &AmdDevice::get_mutex(int i)
 {
-    static std::recursive_mutex mutex[max_devices];
+    static std::mutex mutex[max_devices];
     return mutex[i];
 }
 
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index d0d64110289..c5b1c32920b 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -64,10 +64,12 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
         new HipExecutor(device_id, std::move(master), device_reset, alloc_mode),
         [device_id](HipExecutor *exec) {
             auto device_reset = exec->get_device_reset();
-            std::lock_guard<std::recursive_mutex> guard(
+            std::lock_guard<std::mutex> guard(
                 device_class::get_mutex(device_id));
             delete exec;
-            if (!HipExecutor::get_num_execs(device_id) && device_reset) {
+            auto &num_execs = device_class::get_num_execs(device_id);
+            num_execs--;
+            if (!num_execs && device_reset) {
                 hip::device_guard g(device_id);
                 hipDeviceReset();
             }
diff --git a/include/ginkgo/core/base/device.hpp b/include/ginkgo/core/base/device.hpp
index 025927ee367..d40ad7f6eef 100644
--- a/include/ginkgo/core/base/device.hpp
+++ b/include/ginkgo/core/base/device.hpp
@@ -70,7 +70,7 @@ class NvidiaDevice {
      *
      * @return recursive_mutex reference
      */
-    static std::recursive_mutex &get_mutex(int i);
+    static std::mutex &get_mutex(int i);
 
     /**
      * get_num_execs gets the static num_execs reference at i.
@@ -104,7 +104,7 @@ class AmdDevice {
      *
      * @return recursive_mutex reference
      */
-    static std::recursive_mutex &get_mutex(int i);
+    static std::mutex &get_mutex(int i);
 
     /**
      * get_num_execs gets the static num_execs reference at i.
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 8c172bc4ed9..1b9da581f30 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1302,8 +1302,6 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
         bool device_reset = false,
         allocation_mode alloc_mode = default_cuda_alloc_mode);
 
-    ~CudaExecutor() { decrease_num_execs(this->get_device_id()); }
-
     std::shared_ptr<Executor> get_master() noexcept override;
 
     std::shared_ptr<const Executor> get_master() const noexcept override;
@@ -1457,22 +1455,19 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 
     static void increase_num_execs(unsigned device_id)
     {
-        std::lock_guard<std::recursive_mutex> guard(
-            device_class::get_mutex(device_id));
+        std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
         device_class::get_num_execs(device_id)++;
     }
 
     static void decrease_num_execs(unsigned device_id)
     {
-        std::lock_guard<std::recursive_mutex> guard(
-            device_class::get_mutex(device_id));
+        std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
         device_class::get_num_execs(device_id)--;
     }
 
     static unsigned get_num_execs(unsigned device_id)
     {
-        std::lock_guard<std::recursive_mutex> guard(
-            device_class::get_mutex(device_id));
+        std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
         return device_class::get_num_execs(device_id);
     }
 
@@ -1525,8 +1520,6 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
         bool device_reset = false,
         allocation_mode alloc_mode = default_hip_alloc_mode);
 
-    ~HipExecutor() { decrease_num_execs(this->get_device_id()); }
-
     std::shared_ptr<Executor> get_master() noexcept override;
 
     std::shared_ptr<const Executor> get_master() const noexcept override;
@@ -1684,22 +1677,19 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
 
     static void increase_num_execs(int device_id)
     {
-        std::lock_guard<std::recursive_mutex> guard(
-            device_class::get_mutex(device_id));
+        std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
         device_class::get_num_execs(device_id)++;
     }
 
     static void decrease_num_execs(int device_id)
     {
-        std::lock_guard<std::recursive_mutex> guard(
-            device_class::get_mutex(device_id));
+        std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
         device_class::get_num_execs(device_id)--;
     }
 
     static int get_num_execs(int device_id)
     {
-        std::lock_guard<std::recursive_mutex> guard(
-            device_class::get_mutex(device_id));
+        std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
         return device_class::get_num_execs(device_id);
     }
 

From eb37c0ff5cb9692d5f6f0a811d19da9d99dee2e4 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 30 Jul 2021 11:43:01 +0200
Subject: [PATCH 09/11] increase/decrease num_exec when enabling module

---
 devices/device.cpp                    |  3 +++
 include/ginkgo/config.hpp.in          |  8 ++++++++
 include/ginkgo/core/base/executor.hpp | 12 ++++++++++++
 3 files changed, 23 insertions(+)

diff --git a/devices/device.cpp b/devices/device.cpp
index c70fdaaffb8..3080044f675 100644
--- a/devices/device.cpp
+++ b/devices/device.cpp
@@ -46,18 +46,21 @@ std::mutex &NvidiaDevice::get_mutex(int i)
     return mutex[i];
 }
 
+
 int &NvidiaDevice::get_num_execs(int i)
 {
     static int num_execs[max_devices];
     return num_execs[i];
 }
 
+
 std::mutex &AmdDevice::get_mutex(int i)
 {
     static std::mutex mutex[max_devices];
     return mutex[i];
 }
 
+
 int &AmdDevice::get_num_execs(int i)
 {
     static int num_execs[max_devices];
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index 1c6a31ea481..c232f481fac 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -67,6 +67,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #cmakedefine GINKGO_MIXED_PRECISION
 
 
+/* Should we compile cuda kenrels for Ginkgo? */
+#cmakedefine GINKGO_BUILD_CUDA
+
+
+/* Should we compile hip kenrels for Ginkgo? */
+#cmakedefine GINKGO_BUILD_HIP
+
+
 /* What is HIP compiled for, hcc or nvcc? */
 // clang-format off
 #define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 1b9da581f30..45d42f0b0f1 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1455,14 +1455,20 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 
     static void increase_num_execs(unsigned device_id)
     {
+#ifdef GINKGO_BUILD_CUDA
+        // increase the Cuda Device count only when ginkgo build cuda
         std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
         device_class::get_num_execs(device_id)++;
+#endif  // GINKGO_BUILD_CUDA
     }
 
     static void decrease_num_execs(unsigned device_id)
     {
+#ifdef GINKGO_BUILD_CUDA
+        // increase the Cuda Device count only when ginkgo build cuda
         std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
         device_class::get_num_execs(device_id)--;
+#endif  // GINKGO_BUILD_CUDA
     }
 
     static unsigned get_num_execs(unsigned device_id)
@@ -1677,14 +1683,20 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
 
     static void increase_num_execs(int device_id)
     {
+#ifdef GINKGO_BUILD_HIP
+        // increase the HIP Device count only when ginkgo build hip
         std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
         device_class::get_num_execs(device_id)++;
+#endif  // GINKGO_BUILD_HIP
     }
 
     static void decrease_num_execs(int device_id)
     {
+#ifdef GINKGO_BUILD_HIP
+        // increase the HIP Device count only when ginkgo build hip
         std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
         device_class::get_num_execs(device_id)--;
+#endif  // GINKGO_BUILD_HIP
     }
 
     static int get_num_execs(int device_id)

From 78d0c00c753f77a79be27b7631e05bc9e1c5e458 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Sun, 1 Aug 2021 18:10:30 +0200
Subject: [PATCH 10/11] move all related to cmake into cpp not hpp

Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
---
 cuda/base/executor.cpp                |  4 +-
 devices/cuda/executor.cpp             | 27 +++++++++++++
 devices/hip/executor.cpp              | 34 ++++++++++++++++
 hip/base/executor.hip.cpp             | 11 +++++-
 include/ginkgo/config.hpp.in          |  4 +-
 include/ginkgo/core/base/device.hpp   |  7 ----
 include/ginkgo/core/base/executor.hpp | 56 +++------------------------
 7 files changed, 80 insertions(+), 63 deletions(-)

diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 2898ff3126c..1f179dc1224 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -66,9 +66,9 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
         [device_id](CudaExecutor *exec) {
             auto device_reset = exec->get_device_reset();
             std::lock_guard<std::mutex> guard(
-                device_class::get_mutex(device_id));
+                NvidiaDevice::get_mutex(device_id));
             delete exec;
-            auto &num_execs = device_class::get_num_execs(device_id);
+            auto &num_execs = NvidiaDevice::get_num_execs(device_id);
             num_execs--;
             if (!num_execs && device_reset) {
                 cuda::device_guard g(device_id);
diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp
index abdc93a4a08..708fe02dcd6 100644
--- a/devices/cuda/executor.cpp
+++ b/devices/cuda/executor.cpp
@@ -64,4 +64,31 @@ bool CudaExecutor::verify_memory_to(const HipExecutor *dest_exec) const
 }
 
 
+void CudaExecutor::increase_num_execs(unsigned device_id)
+{
+#ifdef GINKGO_BUILD_CUDA
+    // increase the Cuda Device count only when ginkgo build cuda
+    std::lock_guard<std::mutex> guard(NvidiaDevice::get_mutex(device_id));
+    NvidiaDevice::get_num_execs(device_id)++;
+#endif  // GINKGO_BUILD_CUDA
+}
+
+
+void CudaExecutor::decrease_num_execs(unsigned device_id)
+{
+#ifdef GINKGO_BUILD_CUDA
+    // increase the Cuda Device count only when ginkgo build cuda
+    std::lock_guard<std::mutex> guard(NvidiaDevice::get_mutex(device_id));
+    NvidiaDevice::get_num_execs(device_id)--;
+#endif  // GINKGO_BUILD_CUDA
+}
+
+
+unsigned CudaExecutor::get_num_execs(unsigned device_id)
+{
+    std::lock_guard<std::mutex> guard(NvidiaDevice::get_mutex(device_id));
+    return NvidiaDevice::get_num_execs(device_id);
+}
+
+
 }  // namespace gko
diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp
index 0e755209f91..daf8c14ec58 100644
--- a/devices/hip/executor.cpp
+++ b/devices/hip/executor.cpp
@@ -61,4 +61,38 @@ bool HipExecutor::verify_memory_to(const CudaExecutor *dest_exec) const
 }
 
 
+#if (GINKGO_HIP_PLATFORM_NVCC == 1)
+using hip_device_class = NvidiaDevice;
+#else
+using hip_device_class = AmdDevice;
+#endif
+
+
+void HipExecutor::increase_num_execs(int device_id)
+{
+#ifdef GINKGO_BUILD_HIP
+    // increase the HIP Device count only when ginkgo build hip
+    std::lock_guard<std::mutex> guard(hip_device_class::get_mutex(device_id));
+    hip_device_class::get_num_execs(device_id)++;
+#endif  // GINKGO_BUILD_HIP
+}
+
+
+void HipExecutor::decrease_num_execs(int device_id)
+{
+#ifdef GINKGO_BUILD_HIP
+    // increase the HIP Device count only when ginkgo build hip
+    std::lock_guard<std::mutex> guard(hip_device_class::get_mutex(device_id));
+    hip_device_class::get_num_execs(device_id)--;
+#endif  // GINKGO_BUILD_HIP
+}
+
+
+int HipExecutor::get_num_execs(int device_id)
+{
+    std::lock_guard<std::mutex> guard(hip_device_class::get_mutex(device_id));
+    return hip_device_class::get_num_execs(device_id);
+}
+
+
 }  // namespace gko
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index c5b1c32920b..11ea4ab8ead 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -56,6 +56,13 @@ namespace gko {
 #include "common/base/executor.hpp.inc"
 
 
+#if (GINKGO_HIP_PLATFORM_NVCC == 1)
+using hip_device_class = NvidiaDevice;
+#else
+using hip_device_class = AmdDevice;
+#endif
+
+
 std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode)
@@ -65,9 +72,9 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
         [device_id](HipExecutor *exec) {
             auto device_reset = exec->get_device_reset();
             std::lock_guard<std::mutex> guard(
-                device_class::get_mutex(device_id));
+                hip_device_class::get_mutex(device_id));
             delete exec;
-            auto &num_execs = device_class::get_num_execs(device_id);
+            auto &num_execs = hip_device_class::get_num_execs(device_id);
             num_execs--;
             if (!num_execs && device_reset) {
                 hip::device_guard g(device_id);
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index c232f481fac..42f2c56f44d 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -67,11 +67,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #cmakedefine GINKGO_MIXED_PRECISION
 
 
-/* Should we compile cuda kenrels for Ginkgo? */
+/* Should we compile cuda kernels for Ginkgo? */
 #cmakedefine GINKGO_BUILD_CUDA
 
 
-/* Should we compile hip kenrels for Ginkgo? */
+/* Should we compile hip kernels for Ginkgo? */
 #cmakedefine GINKGO_BUILD_HIP
 
 
diff --git a/include/ginkgo/core/base/device.hpp b/include/ginkgo/core/base/device.hpp
index d40ad7f6eef..981191ba73c 100644
--- a/include/ginkgo/core/base/device.hpp
+++ b/include/ginkgo/core/base/device.hpp
@@ -57,10 +57,7 @@ class HipExecutor;
  */
 class NvidiaDevice {
     friend class CudaExecutor;
-// If Hip is compiled for NVCC, give NvidiaDevice's permission to HipExecutor
-#if GINKGO_HIP_PLATFORM_NVCC
     friend class HipExecutor;
-#endif
 
 private:
     /**
@@ -90,11 +87,7 @@ class NvidiaDevice {
  * corresponding recursive_mutex.
  */
 class AmdDevice {
-// to avoid both GINKGO_HIP_PLATFORM_* zero
-// If Hip isn't compiled for NVCC, give AmdDevice's permission to HipExecutor
-#if !GINKGO_HIP_PLATFORM_NVCC
     friend class HipExecutor;
-#endif
 
 private:
     /**
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 45d42f0b0f1..59876d620cc 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1407,8 +1407,6 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     int get_closest_numa() const { return this->get_exec_info().numa_node; }
 
 protected:
-    using device_class = NvidiaDevice;
-
     void set_gpu_property();
 
     void init_handles();
@@ -1453,29 +1451,11 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 
     bool verify_memory_to(const CudaExecutor *dest_exec) const override;
 
-    static void increase_num_execs(unsigned device_id)
-    {
-#ifdef GINKGO_BUILD_CUDA
-        // increase the Cuda Device count only when ginkgo build cuda
-        std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
-        device_class::get_num_execs(device_id)++;
-#endif  // GINKGO_BUILD_CUDA
-    }
+    static void increase_num_execs(unsigned device_id);
 
-    static void decrease_num_execs(unsigned device_id)
-    {
-#ifdef GINKGO_BUILD_CUDA
-        // increase the Cuda Device count only when ginkgo build cuda
-        std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
-        device_class::get_num_execs(device_id)--;
-#endif  // GINKGO_BUILD_CUDA
-    }
+    static void decrease_num_execs(unsigned device_id);
 
-    static unsigned get_num_execs(unsigned device_id)
-    {
-        std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
-        return device_class::get_num_execs(device_id);
-    }
+    static unsigned get_num_execs(unsigned device_id);
 
     void populate_exec_info(const MachineTopology *mach_topo) override;
 
@@ -1631,12 +1611,6 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     }
 
 protected:
-#if (GINKGO_HIP_PLATFORM_NVCC == 1)
-    using device_class = NvidiaDevice;
-#else
-    using device_class = AmdDevice;
-#endif
-
     void set_gpu_property();
 
     void init_handles();
@@ -1681,29 +1655,11 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
 
     bool verify_memory_to(const HipExecutor *dest_exec) const override;
 
-    static void increase_num_execs(int device_id)
-    {
-#ifdef GINKGO_BUILD_HIP
-        // increase the HIP Device count only when ginkgo build hip
-        std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
-        device_class::get_num_execs(device_id)++;
-#endif  // GINKGO_BUILD_HIP
-    }
+    static void increase_num_execs(int device_id);
 
-    static void decrease_num_execs(int device_id)
-    {
-#ifdef GINKGO_BUILD_HIP
-        // increase the HIP Device count only when ginkgo build hip
-        std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
-        device_class::get_num_execs(device_id)--;
-#endif  // GINKGO_BUILD_HIP
-    }
+    static void decrease_num_execs(int device_id);
 
-    static int get_num_execs(int device_id)
-    {
-        std::lock_guard<std::mutex> guard(device_class::get_mutex(device_id));
-        return device_class::get_num_execs(device_id);
-    }
+    static int get_num_execs(int device_id);
 
     void populate_exec_info(const MachineTopology *mach_topo) override;
 

From c22fbb3597cfbd008a0072374fd19e48b73668cd Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 2 Aug 2021 14:19:19 +0200
Subject: [PATCH 11/11] use snake_case and move the compile_definitions

Co-authored-by: Terry Cojean <terry.cojean@kit.edu>
Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
---
 cuda/base/executor.cpp              |  4 ++--
 devices/cuda/CMakeLists.txt         |  3 +++
 devices/cuda/executor.cpp           | 20 ++++++++++----------
 devices/device.cpp                  |  8 ++++----
 devices/hip/CMakeLists.txt          |  3 +++
 devices/hip/executor.cpp            | 12 ++++++------
 hip/base/executor.hip.cpp           |  4 ++--
 include/ginkgo/config.hpp.in        |  8 --------
 include/ginkgo/core/base/device.hpp |  8 ++++----
 9 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 1f179dc1224..1c35af4d390 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -66,9 +66,9 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
         [device_id](CudaExecutor *exec) {
             auto device_reset = exec->get_device_reset();
             std::lock_guard<std::mutex> guard(
-                NvidiaDevice::get_mutex(device_id));
+                nvidia_device::get_mutex(device_id));
             delete exec;
-            auto &num_execs = NvidiaDevice::get_num_execs(device_id);
+            auto &num_execs = nvidia_device::get_num_execs(device_id);
             num_execs--;
             if (!num_execs && device_reset) {
                 cuda::device_guard g(device_id);
diff --git a/devices/cuda/CMakeLists.txt b/devices/cuda/CMakeLists.txt
index acd0455ff2a..b9d1bca187b 100644
--- a/devices/cuda/CMakeLists.txt
+++ b/devices/cuda/CMakeLists.txt
@@ -1,2 +1,5 @@
 ginkgo_add_object_library(ginkgo_cuda_device
     executor.cpp)
+if(GINKGO_BUILD_CUDA)
+    target_compile_definitions(ginkgo_cuda_device PRIVATE GKO_COMPILING_CUDA_DEVICE)
+endif()
diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp
index 708fe02dcd6..06e8d504097 100644
--- a/devices/cuda/executor.cpp
+++ b/devices/cuda/executor.cpp
@@ -66,28 +66,28 @@ bool CudaExecutor::verify_memory_to(const HipExecutor *dest_exec) const
 
 void CudaExecutor::increase_num_execs(unsigned device_id)
 {
-#ifdef GINKGO_BUILD_CUDA
+#ifdef GKO_COMPILING_CUDA_DEVICE
     // increase the Cuda Device count only when ginkgo build cuda
-    std::lock_guard<std::mutex> guard(NvidiaDevice::get_mutex(device_id));
-    NvidiaDevice::get_num_execs(device_id)++;
-#endif  // GINKGO_BUILD_CUDA
+    std::lock_guard<std::mutex> guard(nvidia_device::get_mutex(device_id));
+    nvidia_device::get_num_execs(device_id)++;
+#endif  // GKO_COMPILING_CUDA_DEVICE
 }
 
 
 void CudaExecutor::decrease_num_execs(unsigned device_id)
 {
-#ifdef GINKGO_BUILD_CUDA
+#ifdef GKO_COMPILING_CUDA_DEVICE
     // increase the Cuda Device count only when ginkgo build cuda
-    std::lock_guard<std::mutex> guard(NvidiaDevice::get_mutex(device_id));
-    NvidiaDevice::get_num_execs(device_id)--;
-#endif  // GINKGO_BUILD_CUDA
+    std::lock_guard<std::mutex> guard(nvidia_device::get_mutex(device_id));
+    nvidia_device::get_num_execs(device_id)--;
+#endif  // GKO_COMPILING_CUDA_DEVICE
 }
 
 
 unsigned CudaExecutor::get_num_execs(unsigned device_id)
 {
-    std::lock_guard<std::mutex> guard(NvidiaDevice::get_mutex(device_id));
-    return NvidiaDevice::get_num_execs(device_id);
+    std::lock_guard<std::mutex> guard(nvidia_device::get_mutex(device_id));
+    return nvidia_device::get_num_execs(device_id);
 }
 
 
diff --git a/devices/device.cpp b/devices/device.cpp
index 3080044f675..ca6597d652d 100644
--- a/devices/device.cpp
+++ b/devices/device.cpp
@@ -40,28 +40,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace gko {
 
 
-std::mutex &NvidiaDevice::get_mutex(int i)
+std::mutex &nvidia_device::get_mutex(int i)
 {
     static std::mutex mutex[max_devices];
     return mutex[i];
 }
 
 
-int &NvidiaDevice::get_num_execs(int i)
+int &nvidia_device::get_num_execs(int i)
 {
     static int num_execs[max_devices];
     return num_execs[i];
 }
 
 
-std::mutex &AmdDevice::get_mutex(int i)
+std::mutex &amd_device::get_mutex(int i)
 {
     static std::mutex mutex[max_devices];
     return mutex[i];
 }
 
 
-int &AmdDevice::get_num_execs(int i)
+int &amd_device::get_num_execs(int i)
 {
     static int num_execs[max_devices];
     return num_execs[i];
diff --git a/devices/hip/CMakeLists.txt b/devices/hip/CMakeLists.txt
index 9995e51c5e7..d05db433984 100644
--- a/devices/hip/CMakeLists.txt
+++ b/devices/hip/CMakeLists.txt
@@ -1,2 +1,5 @@
 ginkgo_add_object_library(ginkgo_hip_device
     executor.cpp)
+if(GINKGO_BUILD_HIP)
+    target_compile_definitions(ginkgo_hip_device PRIVATE GKO_COMPILING_HIP_DEVICE)
+endif()
diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp
index daf8c14ec58..53da0db72c2 100644
--- a/devices/hip/executor.cpp
+++ b/devices/hip/executor.cpp
@@ -62,29 +62,29 @@ bool HipExecutor::verify_memory_to(const CudaExecutor *dest_exec) const
 
 
 #if (GINKGO_HIP_PLATFORM_NVCC == 1)
-using hip_device_class = NvidiaDevice;
+using hip_device_class = nvidia_device;
 #else
-using hip_device_class = AmdDevice;
+using hip_device_class = amd_device;
 #endif
 
 
 void HipExecutor::increase_num_execs(int device_id)
 {
-#ifdef GINKGO_BUILD_HIP
+#ifdef GKO_COMPILING_HIP_DEVICE
     // increase the HIP Device count only when ginkgo build hip
     std::lock_guard<std::mutex> guard(hip_device_class::get_mutex(device_id));
     hip_device_class::get_num_execs(device_id)++;
-#endif  // GINKGO_BUILD_HIP
+#endif  // GKO_COMPILING_HIP_DEVICE
 }
 
 
 void HipExecutor::decrease_num_execs(int device_id)
 {
-#ifdef GINKGO_BUILD_HIP
+#ifdef GKO_COMPILING_HIP_DEVICE
     // increase the HIP Device count only when ginkgo build hip
     std::lock_guard<std::mutex> guard(hip_device_class::get_mutex(device_id));
     hip_device_class::get_num_execs(device_id)--;
-#endif  // GINKGO_BUILD_HIP
+#endif  // GKO_COMPILING_HIP_DEVICE
 }
 
 
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 11ea4ab8ead..3875ad95910 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -57,9 +57,9 @@ namespace gko {
 
 
 #if (GINKGO_HIP_PLATFORM_NVCC == 1)
-using hip_device_class = NvidiaDevice;
+using hip_device_class = nvidia_device;
 #else
-using hip_device_class = AmdDevice;
+using hip_device_class = amd_device;
 #endif
 
 
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index 42f2c56f44d..1c6a31ea481 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -67,14 +67,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #cmakedefine GINKGO_MIXED_PRECISION
 
 
-/* Should we compile cuda kernels for Ginkgo? */
-#cmakedefine GINKGO_BUILD_CUDA
-
-
-/* Should we compile hip kernels for Ginkgo? */
-#cmakedefine GINKGO_BUILD_HIP
-
-
 /* What is HIP compiled for, hcc or nvcc? */
 // clang-format off
 #define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@
diff --git a/include/ginkgo/core/base/device.hpp b/include/ginkgo/core/base/device.hpp
index 981191ba73c..5eaf892789d 100644
--- a/include/ginkgo/core/base/device.hpp
+++ b/include/ginkgo/core/base/device.hpp
@@ -52,10 +52,10 @@ class HipExecutor;
 
 
 /**
- * NvidiaDevice handles the number of executor on Nvidia devices and have the
+ * nvidia_device handles the number of executor on Nvidia devices and have the
  * corresponding recursive_mutex.
  */
-class NvidiaDevice {
+class nvidia_device {
     friend class CudaExecutor;
     friend class HipExecutor;
 
@@ -83,10 +83,10 @@ class NvidiaDevice {
 
 
 /**
- * AmdDevice handles the number of executor on Amd devices and have the
+ * amd_device handles the number of executor on Amd devices and have the
  * corresponding recursive_mutex.
  */
-class AmdDevice {
+class amd_device {
     friend class HipExecutor;
 
 private: