From 6cd265f3ae885f1a985874bcbaefd8b982244730 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Sun, 7 May 2023 10:18:08 +0200
Subject: [PATCH 01/22] remove HIP CUDA support from source

---
 common/cuda_hip/components/atomic.hpp.inc | 17 ++-------
 core/device_hooks/hip_hooks.cpp           | 16 ++++-----
 core/log/profiler_hook.cpp                |  2 --
 core/log/profiler_hook.hpp                |  4 ---
 core/test/base/executor.cpp               |  7 ----
 cuda/base/executor.cpp                    | 10 ------
 devices/cuda/executor.cpp                 |  4 ---
 devices/hip/executor.cpp                  |  4 ---
 hip/base/config.hip.hpp                   |  8 -----
 hip/base/device.hip.cpp                   |  2 +-
 hip/base/device.hpp                       |  2 +-
 hip/base/executor.hip.cpp                 | 17 ---------
 hip/base/stream.hip.cpp                   |  2 +-
 hip/base/thrust.hip.hpp                   | 10 +-----
 hip/components/cooperative_groups.hip.hpp | 44 ++---------------------
 hip/components/format_conversion.hip.hpp  | 13 -------
 hip/preconditioner/jacobi_kernels.hip.cpp |  6 +---
 hip/test/base/hip_executor.hip.cpp        | 12 -------
 include/ginkgo/config.hpp.in              |  4 +--
 include/ginkgo/core/base/executor.hpp     | 35 +++---------------
 include/ginkgo/core/base/fwd_decls.hpp    |  7 ----
 include/ginkgo/core/base/math.hpp         |  4 +--
 include/ginkgo/core/base/memory.hpp       | 16 ++++-----
 include/ginkgo/core/base/stream.hpp       |  4 +--
 include/ginkgo/core/base/timer.hpp        |  4 +--
 include/ginkgo/core/matrix/csr.hpp        |  4 ---
 test/utils/executor.hpp                   |  2 +-
 27 files changed, 38 insertions(+), 222 deletions(-)

diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc
index 3d76cfdcb79..00fed2db4a6 100644
--- a/common/cuda_hip/components/atomic.hpp.inc
+++ b/common/cuda_hip/components/atomic.hpp.inc
@@ -119,7 +119,7 @@ GKO_BIND_ATOMIC_ADD(unsigned long long int);
 GKO_BIND_ATOMIC_ADD(float);
 
 // AMD
-#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
+#if defined(__HIPCC__)
 
 
 // the double atomicAdd is added after 4.3
@@ -157,7 +157,7 @@ GKO_BIND_ATOMIC_ADD(__half2);
         // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
 
 
-#endif  // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
+#endif  // defined(__HIPCC__)
 
 
 #undef GKO_BIND_ATOMIC_ADD
@@ -179,20 +179,7 @@ __forceinline__ __device__ T atomic_max(T* __restrict__ addr, T val)
 
 GKO_BIND_ATOMIC_MAX(int);
 GKO_BIND_ATOMIC_MAX(unsigned int);
-
-#if !defined(__HIPCC__) || \
-    (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC)
-
-
-#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__)
-// Only Compute Capability 3.5 and higher supports 64-bit atomicMax
-GKO_BIND_ATOMIC_MAX(unsigned long long int);
-#endif
-
-#else   // Is HIP platform & on AMD hardware
 GKO_BIND_ATOMIC_MAX(unsigned long long int);
-#endif  // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) &&
-        // GINKGO_HIP_PLATFORM_HCC)
 
 
 #undef GKO_BIND_ATOMIC_MAX
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index a90691e1af4..cb85019e542 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -33,7 +33,7 @@ void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
 void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
 
 
-HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream)
+HipAsyncAllocator::HipAsyncAllocator(ihipStream_t* stream)
     GKO_NOT_COMPILED(hip);
 
 
@@ -44,7 +44,7 @@ void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
 
 
 bool HipAsyncAllocator::check_environment(int device_id,
-                                          GKO_HIP_STREAM_STRUCT* stream) const
+                                          ihipStream_t* stream) const
     GKO_NOT_COMPILED(hip);
 
 
@@ -62,7 +62,7 @@ void HipUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
 
 
 bool HipUnifiedAllocator::check_environment(int device_id,
-                                            GKO_HIP_STREAM_STRUCT* stream) const
+                                            ihipStream_t* stream) const
     GKO_NOT_COMPILED(hip);
 
 
@@ -76,13 +76,13 @@ void HipHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
 
 
 bool HipHostAllocator::check_environment(int device_id,
-                                         GKO_HIP_STREAM_STRUCT* stream) const
+                                         ihipStream_t* stream) const
     GKO_NOT_COMPILED(hip);
 
 
 std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
-    allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream)
+    allocation_mode alloc_mode, ihipStream_t* stream)
 {
     return std::shared_ptr<HipExecutor>(
         new HipExecutor(device_id, std::move(master),
@@ -92,7 +92,7 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
 
 std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master,
-    std::shared_ptr<HipAllocatorBase> alloc, GKO_HIP_STREAM_STRUCT* stream)
+    std::shared_ptr<HipAllocatorBase> alloc, ihipStream_t* stream)
 {
     return std::shared_ptr<HipExecutor>(
         new HipExecutor(device_id, std::move(master), alloc, stream));
@@ -204,7 +204,7 @@ hip_stream::~hip_stream() {}
 hip_stream::hip_stream(hip_stream&&) GKO_NOT_COMPILED(hip);
 
 
-GKO_HIP_STREAM_STRUCT* hip_stream::get() const GKO_NOT_COMPILED(hip);
+ihipStream_t* hip_stream::get() const GKO_NOT_COMPILED(hip);
 
 
 HipTimer::HipTimer(std::shared_ptr<const HipExecutor> exec)
@@ -232,7 +232,7 @@ namespace hip {
 void reset_device(int device_id) GKO_NOT_COMPILED(hip);
 
 
-void destroy_event(GKO_HIP_EVENT_STRUCT* event) GKO_NOT_COMPILED(hip);
+void destroy_event(ihipEvent_t* event) GKO_NOT_COMPILED(hip);
 
 
 }  // namespace hip
diff --git a/core/log/profiler_hook.cpp b/core/log/profiler_hook.cpp
index a8eef7668f2..8b5d84b2f0e 100644
--- a/core/log/profiler_hook.cpp
+++ b/core/log/profiler_hook.cpp
@@ -391,11 +391,9 @@ std::shared_ptr<ProfilerHook> ProfilerHook::create_for_executor(
     if (std::dynamic_pointer_cast<const CudaExecutor>(exec)) {
         return create_nvtx();
     }
-#if (GINKGO_HIP_PLATFORM_NVCC == 0)
     if (std::dynamic_pointer_cast<const HipExecutor>(exec)) {
         return create_roctx();
     }
-#endif
     if (std::dynamic_pointer_cast<const DpcppExecutor>(exec)) {
         return create_vtune();
     }
diff --git a/core/log/profiler_hook.hpp b/core/log/profiler_hook.hpp
index 3f4baf80db1..b6a88c1d471 100644
--- a/core/log/profiler_hook.hpp
+++ b/core/log/profiler_hook.hpp
@@ -110,7 +110,6 @@ class profiling_scope_guard : log::profiling_scope_guard {
 namespace hip {
 
 
-#if (GINKGO_HIP_PLATFORM_NVCC == 0)
 class profiling_scope_guard : log::profiling_scope_guard {
 public:
     profiling_scope_guard(const char* name)
@@ -119,9 +118,6 @@ class profiling_scope_guard : log::profiling_scope_guard {
                                      log::begin_roctx, log::end_nvtx}
     {}
 };
-#else
-using profiling_scope_guard = log::default_profiling_scope_guard;
-#endif
 
 
 }  // namespace hip
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index dc4ea5aad63..989a1137b14 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -336,17 +336,10 @@ TEST(Executor, CanVerifyMemory)
         ASSERT_EQ(false, gpu_dpcpp->memory_accessible(gpu_dpcpp_dup));
         ASSERT_EQ(false, gpu_dpcpp_dup->memory_accessible(gpu_dpcpp));
     }
-#if GINKGO_HIP_PLATFORM_NVCC
-    ASSERT_EQ(true, hip->memory_accessible(cuda));
-    ASSERT_EQ(true, cuda->memory_accessible(hip));
-    ASSERT_EQ(true, hip_1->memory_accessible(cuda_1));
-    ASSERT_EQ(true, cuda_1->memory_accessible(hip_1));
-#else
     ASSERT_EQ(false, hip->memory_accessible(cuda));
     ASSERT_EQ(false, cuda->memory_accessible(hip));
     ASSERT_EQ(false, hip_1->memory_accessible(cuda_1));
     ASSERT_EQ(false, cuda_1->memory_accessible(hip_1));
-#endif
     ASSERT_EQ(true, omp->memory_accessible(omp2));
     ASSERT_EQ(true, hip->memory_accessible(hip2));
     ASSERT_EQ(true, cuda->memory_accessible(cuda2));
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 52a92132689..82bc56792e5 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -135,17 +135,7 @@ void CudaExecutor::raw_copy_to(const OmpExecutor*, size_type num_bytes,
 void CudaExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes,
                                const void* src_ptr, void* dest_ptr) const
 {
-#if GINKGO_HIP_PLATFORM_NVCC == 1
-    if (num_bytes > 0) {
-        detail::cuda_scoped_device_id_guard g(this->get_device_id());
-        GKO_ASSERT_NO_CUDA_ERRORS(cudaMemcpyPeerAsync(
-            dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(),
-            num_bytes, this->get_stream()));
-        this->synchronize();
-    }
-#else
     GKO_NOT_SUPPORTED(dest);
-#endif
 }
 
 
diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp
index 58261c318fb..f16225c07ee 100644
--- a/devices/cuda/executor.cpp
+++ b/devices/cuda/executor.cpp
@@ -28,11 +28,7 @@ bool CudaExecutor::verify_memory_to(const CudaExecutor* dest_exec) const
 
 bool CudaExecutor::verify_memory_to(const HipExecutor* dest_exec) const
 {
-#if GINKGO_HIP_PLATFORM_NVCC
-    return this->get_device_id() == dest_exec->get_device_id();
-#else
     return false;
-#endif
 }
 
 
diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp
index 6954e31b24b..a8bab47dd61 100644
--- a/devices/hip/executor.cpp
+++ b/devices/hip/executor.cpp
@@ -25,11 +25,7 @@ bool HipExecutor::verify_memory_to(const HipExecutor* dest_exec) const
 
 bool HipExecutor::verify_memory_to(const CudaExecutor* dest_exec) const
 {
-#if GINKGO_HIP_PLATFORM_NVCC
-    return this->get_device_id() == dest_exec->get_device_id();
-#else
     return false;
-#endif
 }
 
 
diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp
index fbad841fd0f..e0fb2d73210 100644
--- a/hip/base/config.hip.hpp
+++ b/hip/base/config.hip.hpp
@@ -27,21 +27,13 @@ struct config {
     /**
      * The type containing a bitmask over all lanes of a warp.
      */
-#if GINKGO_HIP_PLATFORM_HCC
     using lane_mask_type = uint64;
-#else  // GINKGO_HIP_PLATFORM_NVCC
-    using lane_mask_type = uint32;
-#endif
 
     /**
      * The number of threads within a HIP warp. Here, we use the definition from
      * `device_functions.h`.
      */
-#if GINKGO_HIP_PLATFORM_HCC
     static constexpr uint32 warp_size = warpSize;
-#else  // GINKGO_HIP_PLATFORM_NVCC
-    static constexpr uint32 warp_size = 32;
-#endif
 
     /**
      * The bitmask of the entire warp.
diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp
index 58376c2175b..2eaa92e8a66 100644
--- a/hip/base/device.hip.cpp
+++ b/hip/base/device.hip.cpp
@@ -28,7 +28,7 @@ void reset_device(int device_id)
 }
 
 
-void destroy_event(GKO_HIP_EVENT_STRUCT* event)
+void destroy_event(ihipEvent_t* event)
 {
     GKO_ASSERT_NO_HIP_ERRORS(hipEventDestroy(event));
 }
diff --git a/hip/base/device.hpp b/hip/base/device.hpp
index f0ceae0dc2b..f00e75851ca 100644
--- a/hip/base/device.hpp
+++ b/hip/base/device.hpp
@@ -18,7 +18,7 @@ void reset_device(int device_id);
 
 
 /** calls hipEventDestroy on the given event. */
-void destroy_event(GKO_HIP_EVENT_STRUCT* event);
+void destroy_event(ihipEvent_t* event);
 
 
 /** returns hipDeviceProp.name for the given device */
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 2694ce4177f..22d1ce0c1e2 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -131,17 +131,7 @@ void HipExecutor::raw_copy_to(const OmpExecutor*, size_type num_bytes,
 void HipExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes,
                               const void* src_ptr, void* dest_ptr) const
 {
-#if GINKGO_HIP_PLATFORM_NVCC == 1
-    if (num_bytes > 0) {
-        detail::hip_scoped_device_id_guard g(this->get_device_id());
-        GKO_ASSERT_NO_HIP_ERRORS(hipMemcpyPeerAsync(
-            dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(),
-            num_bytes, this->get_stream()));
-        this->synchronize();
-    }
-#else
     GKO_NOT_SUPPORTED(dest);
-#endif
 }
 
 
@@ -222,16 +212,9 @@ void HipExecutor::set_gpu_property()
             this->get_device_id()));
         this->get_exec_info().max_workgroup_size = max_threads_per_block;
         this->get_exec_info().max_workitem_sizes = max_threads_per_block_dim;
-#if GINKGO_HIP_PLATFORM_NVCC
-        this->get_exec_info().num_pu_per_cu =
-            convert_sm_ver_to_cores(this->get_exec_info().major,
-                                    this->get_exec_info().minor) /
-            kernels::hip::config::warp_size;
-#else
         // In GCN (Graphics Core Next), each multiprocessor has 4 SIMD
         // Reference: https://en.wikipedia.org/wiki/Graphics_Core_Next
         this->get_exec_info().num_pu_per_cu = 4;
-#endif  // GINKGO_HIP_PLATFORM_NVCC
         this->get_exec_info().max_subgroup_size =
             kernels::hip::config::warp_size;
     }
diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp
index 93c1fc008d9..47f4092cc82 100644
--- a/hip/base/stream.hip.cpp
+++ b/hip/base/stream.hip.cpp
@@ -44,7 +44,7 @@ hip_stream::hip_stream(hip_stream&& other)
 {}
 
 
-GKO_HIP_STREAM_STRUCT* hip_stream::get() const { return stream_; }
+ihipStream_t* hip_stream::get() const { return stream_; }
 
 
 }  // namespace gko
diff --git a/hip/base/thrust.hip.hpp b/hip/base/thrust.hip.hpp
index 2c0412fb67d..008f1e0645b 100644
--- a/hip/base/thrust.hip.hpp
+++ b/hip/base/thrust.hip.hpp
@@ -7,15 +7,11 @@
 
 
 #include <thrust/execution_policy.h>
+#include <thrust/system/hip/detail/execution_policy.h>
 
 
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/executor.hpp>
-#if GINKGO_HIP_PLATFORM_HCC
-#include <thrust/system/hip/detail/execution_policy.h>
-#else
-#include <thrust/system/cuda/detail/execution_policy.h>
-#endif
 
 
 namespace gko {
@@ -25,11 +21,7 @@ namespace hip {
 
 inline auto thrust_policy(std::shared_ptr<const HipExecutor> exec)
 {
-#if GINKGO_HIP_PLATFORM_HCC
     return thrust::hip::par.on(exec->get_stream());
-#else
-    return thrust::cuda::par.on(exec->get_stream());
-#endif
 }
 
 
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index 247218a1457..2e5723366e6 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -167,14 +167,8 @@ class thread_block_tile {
 
     __device__ __forceinline__ unsigned size() const noexcept { return Size; }
 
-    __device__ __forceinline__ void sync() const noexcept
-    {
-#if GINKGO_HIP_PLATFORM_NVCC
-        __syncwarp(data_.mask);
-#endif  // GINKGO_HIP_PLATFORM_NVCC
-    }
+    __device__ __forceinline__ void sync() const noexcept {}
 
-#if GINKGO_HIP_PLATFORM_HCC
 #define GKO_BIND_SHFL(ShflOp, ValueType, SelectorType)                       \
     __device__ __forceinline__ ValueType ShflOp(                             \
         ValueType var, SelectorType selector) const noexcept                 \
@@ -184,17 +178,6 @@ class thread_block_tile {
     static_assert(true,                                                      \
                   "This assert is used to counter the false positive extra " \
                   "semi-colon warnings")
-#else
-#define GKO_BIND_SHFL(ShflOp, ValueType, SelectorType)                       \
-    __device__ __forceinline__ ValueType ShflOp(                             \
-        ValueType var, SelectorType selector) const noexcept                 \
-    {                                                                        \
-        return __##ShflOp##_sync(data_.mask, var, selector, Size);           \
-    }                                                                        \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-#endif
 
     GKO_BIND_SHFL(shfl, int32, int32);
     GKO_BIND_SHFL(shfl, float, int32);
@@ -222,15 +205,11 @@ class thread_block_tile {
      */
     __device__ __forceinline__ int any(int predicate) const noexcept
     {
-#if GINKGO_HIP_PLATFORM_HCC
         if (Size == config::warp_size) {
             return __any(predicate);
         } else {
             return (__ballot(predicate) & data_.mask) != 0;
         }
-#else
-        return __any_sync(data_.mask, predicate);
-#endif
     }
 
     /**
@@ -239,15 +218,11 @@ class thread_block_tile {
      */
     __device__ __forceinline__ int all(int predicate) const noexcept
     {
-#if GINKGO_HIP_PLATFORM_HCC
         if (Size == config::warp_size) {
             return __all(predicate);
         } else {
             return (__ballot(predicate) & data_.mask) == data_.mask;
         }
-#else
-        return __all_sync(data_.mask, predicate);
-#endif
     }
 
     /**
@@ -260,19 +235,11 @@ class thread_block_tile {
     __device__ __forceinline__ config::lane_mask_type ballot(
         int predicate) const noexcept
     {
-#if GINKGO_HIP_PLATFORM_HCC
         if (Size == config::warp_size) {
             return __ballot(predicate);
         } else {
             return (__ballot(predicate) & data_.mask) >> data_.lane_offset;
         }
-#else
-        if (Size == config::warp_size) {
-            return __ballot_sync(data_.mask, predicate);
-        } else {
-            return __ballot_sync(data_.mask, predicate) >> data_.lane_offset;
-        }
-#endif
     }
 
 private:
@@ -345,14 +312,9 @@ class enable_extended_shuffle : public Group {
 }  // namespace detail
 
 
-// Implementing this as a using directive messes up with SFINAE for some reason,
-// probably a bug in NVCC. If it is a complete type, everything works fine.
 template <unsigned Size>
-struct thread_block_tile
-    : detail::enable_extended_shuffle<detail::thread_block_tile<Size>> {
-    using detail::enable_extended_shuffle<
-        detail::thread_block_tile<Size>>::enable_extended_shuffle;
-};
+using thread_block_tile =
+    detail::enable_extended_shuffle<detail::thread_block_tile<Size>>;
 
 
 // Only support tile_partition with 1, 2, 4, 8, 16, 32, 64 (hip).
diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp
index 59c0405a874..a5b84533ddd 100644
--- a/hip/components/format_conversion.hip.hpp
+++ b/hip/components/format_conversion.hip.hpp
@@ -80,25 +80,12 @@ __host__ size_type calculate_nwarps(std::shared_ptr<const HipExecutor> exec,
     size_type nwarps_in_hip = exec->get_num_multiprocessor() *
                               exec->get_num_warps_per_sm() * config::warp_size /
                               subwarp_size;
-#if GINKGO_HIP_PLATFORM_NVCC
-    size_type multiple = 8;
-    if (nnz >= 2e8) {
-        multiple = 2048;
-    } else if (nnz >= 2e7) {
-        multiple = 512;
-    } else if (nnz >= 2e6) {
-        multiple = 128;
-    } else if (nnz >= 2e5) {
-        multiple = 32;
-    }
-#else
     size_type multiple = 2;
     if (nnz >= 1e7) {
         multiple = 32;
     } else if (nnz >= 1e5) {
         multiple = 8;
     }
-#endif  // GINKGO_HIP_PLATFORM_NVCC
 #ifdef GINKGO_BENCHMARK_ENABLE_TUNING
     if (_tuning_flag) {
         multiple = _tuned_value;
diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp
index 1646a7fb376..83af45132c4 100644
--- a/hip/preconditioner/jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_kernels.hip.cpp
@@ -33,12 +33,8 @@ namespace hip {
 namespace jacobi {
 
 
-// a total of 32/16 warps (1024 threads)
-#if GINKGO_HIP_PLATFORM_HCC
+// a total of 16 warps (1024 threads)
 constexpr int default_num_warps = 16;
-#else  // GINKGO_HIP_PLATFORM_NVCC
-constexpr int default_num_warps = 32;
-#endif
 // with current architectures, at most 32 warps can be scheduled per SM (and
 // current GPUs have at most 84 SMs)
 constexpr int default_grid_size = 32 * 32 * 128;
diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp
index cfdfc3122fd..908f1c06c3e 100644
--- a/hip/test/base/hip_executor.hip.cpp
+++ b/hip/test/base/hip_executor.hip.cpp
@@ -164,11 +164,7 @@ TEST_F(HipExecutor, FailsWhenOverallocating)
 __global__ void check_data(int* data)
 {
     if (data[0] != 3 || data[1] != 8) {
-#if GINKGO_HIP_PLATFORM_HCC
         asm("s_trap 0x02;");
-#else  // GINKGO_HIP_PLATFORM_NVCC
-        asm("trap;");
-#endif
     }
 }
 
@@ -188,11 +184,7 @@ TEST_F(HipExecutor, CopiesDataToHip)
 __global__ void check_data2(int* data)
 {
     if (data[0] != 4 || data[1] != 8) {
-#if GINKGO_HIP_PLATFORM_HCC
         asm("s_trap 0x02;");
-#else  // GINKGO_HIP_PLATFORM_NVCC
-        asm("trap;");
-#endif
     }
 }
 
@@ -316,11 +308,7 @@ TEST_F(HipExecutor, ExecInfoSetsCorrectProperties)
         &max_threads_per_block, hipDeviceAttributeMaxThreadsPerBlock, dev_id));
     GKO_ASSERT_NO_HIP_ERRORS(
         hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, dev_id));
-#if GINKGO_HIP_PLATFORM_NVCC
-    auto num_cores = convert_sm_ver_to_cores(major, minor);
-#else
     auto num_cores = warp_size * 4;
-#endif
 
     ASSERT_EQ(hip->get_major_version(), major);
     ASSERT_EQ(hip->get_minor_version(), minor);
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index 329918399d6..34d06d8d63c 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -41,10 +41,10 @@
 
 /* What is HIP compiled for, hcc or nvcc? */
 // clang-format off
-#define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@
+#define GINKGO_HIP_PLATFORM_HCC 1
 
 
-#define GINKGO_HIP_PLATFORM_NVCC @GINKGO_HIP_PLATFORM_NVCC@
+#define GINKGO_HIP_PLATFORM_NVCC 0
 // clang-format on
 
 
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index d7db35d2e3c..6b3f6d3755d 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -63,34 +63,10 @@ enum class log_propagation_mode {
 enum class allocation_mode { device, unified_global, unified_host };
 
 
-#ifdef NDEBUG
-
-// When in release, prefer device allocations
 constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
 
 constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
 
-#else
-
-// When in debug, always UM allocations.
-constexpr allocation_mode default_cuda_alloc_mode =
-    allocation_mode::unified_global;
-
-#if (GINKGO_HIP_PLATFORM_HCC == 1)
-
-// HIP on AMD GPUs does not support UM, so always prefer device allocations.
-constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
-
-#else
-
-// HIP on NVIDIA GPUs supports UM, so prefer UM allocations.
-constexpr allocation_mode default_hip_alloc_mode =
-    allocation_mode::unified_global;
-
-#endif
-
-#endif
-
 
 }  // namespace gko
 
@@ -1718,13 +1694,13 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     static std::shared_ptr<HipExecutor> create(
         int device_id, std::shared_ptr<Executor> master, bool device_reset,
         allocation_mode alloc_mode = default_hip_alloc_mode,
-        GKO_HIP_STREAM_STRUCT* stream = nullptr);
+        ihipStream_t* stream = nullptr);
 
     static std::shared_ptr<HipExecutor> create(
         int device_id, std::shared_ptr<Executor> master,
         std::shared_ptr<HipAllocatorBase> alloc =
             std::make_shared<HipAllocator>(),
-        GKO_HIP_STREAM_STRUCT* stream = nullptr);
+        ihipStream_t* stream = nullptr);
 
     std::shared_ptr<Executor> get_master() noexcept override;
 
@@ -1830,7 +1806,7 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
         return this->get_exec_info().closest_pu_ids;
     }
 
-    GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }
+    ihipStream_t* get_stream() const { return stream_; }
 
 protected:
     void set_gpu_property();
@@ -1838,8 +1814,7 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     void init_handles();
 
     HipExecutor(int device_id, std::shared_ptr<Executor> master,
-                std::shared_ptr<HipAllocatorBase> alloc,
-                GKO_HIP_STREAM_STRUCT* stream)
+                std::shared_ptr<HipAllocatorBase> alloc, ihipStream_t* stream)
         : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
     {
         this->get_exec_info().device_id = device_id;
@@ -1876,7 +1851,7 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     handle_manager<hipblasContext> hipblas_handle_;
     handle_manager<hipsparseContext> hipsparse_handle_;
     std::shared_ptr<HipAllocatorBase> alloc_;
-    GKO_HIP_STREAM_STRUCT* stream_;
+    ihipStream_t* stream_;
 };
 
 
diff --git a/include/ginkgo/core/base/fwd_decls.hpp b/include/ginkgo/core/base/fwd_decls.hpp
index f7e446d7bf2..606328f45a4 100644
--- a/include/ginkgo/core/base/fwd_decls.hpp
+++ b/include/ginkgo/core/base/fwd_decls.hpp
@@ -21,15 +21,8 @@ struct hipblasContext;
 
 struct hipsparseContext;
 
-#if GINKGO_HIP_PLATFORM_HCC
 struct ihipStream_t;
 struct ihipEvent_t;
-#define GKO_HIP_STREAM_STRUCT ihipStream_t
-#define GKO_HIP_EVENT_STRUCT ihipEvent_t
-#else
-#define GKO_HIP_STREAM_STRUCT CUstream_st
-#define GKO_HIP_EVENT_STRUCT CUevent_st
-#endif
 
 
 // after intel/llvm September'22 release, which uses major version 6, they
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index 30b0da475d0..b81bea5e145 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -616,7 +616,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr int64 ceildiv(int64 num, int64 den)
 }
 
 
-#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
+#if defined(__HIPCC__)
 
 
 /**
@@ -794,7 +794,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T&)
 }
 
 
-#endif  // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
+#endif  // defined(__HIPCC__)
 
 
 #undef GKO_BIND_ZERO_ONE
diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp
index 7f8da044fd9..78fe2cee829 100644
--- a/include/ginkgo/core/base/memory.hpp
+++ b/include/ginkgo/core/base/memory.hpp
@@ -77,8 +77,7 @@ class HipAllocatorBase : public Allocator {
      * @return true if and only if the allocator can be used by HipExecutor in
      *         the given environment.
      */
-    virtual bool check_environment(int device_id,
-                                   GKO_HIP_STREAM_STRUCT* stream) const
+    virtual bool check_environment(int device_id, ihipStream_t* stream) const
     {
         return true;
     }
@@ -186,14 +185,13 @@ class HipAsyncAllocator : public HipAllocatorBase {
 
     void deallocate(void* ptr) override;
 
-    HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream);
+    HipAsyncAllocator(ihipStream_t* stream);
 
 protected:
-    bool check_environment(int device_id,
-                           GKO_HIP_STREAM_STRUCT* stream) const override;
+    bool check_environment(int device_id, ihipStream_t* stream) const override;
 
 private:
-    GKO_HIP_STREAM_STRUCT* stream_;
+    ihipStream_t* stream_;
 };
 
 
@@ -211,8 +209,7 @@ class HipUnifiedAllocator : public HipAllocatorBase, public CpuAllocatorBase {
     HipUnifiedAllocator(int device_id, unsigned int flags);
 
 protected:
-    bool check_environment(int device_id,
-                           GKO_HIP_STREAM_STRUCT* stream) const override;
+    bool check_environment(int device_id, ihipStream_t* stream) const override;
 
 private:
     int device_id_;
@@ -232,8 +229,7 @@ class HipHostAllocator : public HipAllocatorBase, public CpuAllocatorBase {
     HipHostAllocator(int device_id);
 
 protected:
-    bool check_environment(int device_id,
-                           GKO_HIP_STREAM_STRUCT* stream) const override;
+    bool check_environment(int device_id, ihipStream_t* stream) const override;
 
 private:
     int device_id_;
diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp
index 22af70ac14b..f95d8971a3a 100644
--- a/include/ginkgo/core/base/stream.hpp
+++ b/include/ginkgo/core/base/stream.hpp
@@ -89,10 +89,10 @@ class hip_stream {
      * Returns the native HIP stream handle.
      * In an empty hip_stream, this will return nullptr.
      */
-    GKO_HIP_STREAM_STRUCT* get() const;
+    ihipStream_t* get() const;
 
 private:
-    GKO_HIP_STREAM_STRUCT* stream_;
+    ihipStream_t* stream_;
 
     int device_id_;
 };
diff --git a/include/ginkgo/core/base/timer.hpp b/include/ginkgo/core/base/timer.hpp
index 8008cecfb94..7b5d2aed5b3 100644
--- a/include/ginkgo/core/base/timer.hpp
+++ b/include/ginkgo/core/base/timer.hpp
@@ -54,7 +54,7 @@ class time_point {
     type type_;
     union data_union {
         CUevent_st* cuda_event;
-        GKO_HIP_EVENT_STRUCT* hip_event;
+        ihipEvent_t* hip_event;
         sycl::event* dpcpp_event;
         std::chrono::steady_clock::time_point chrono;
 
@@ -206,7 +206,7 @@ class HipTimer : public Timer {
 
 private:
     int device_id_;
-    GKO_HIP_STREAM_STRUCT* stream_;
+    ihipStream_t* stream_;
 };
 
 
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index f27fe12a934..153ebbd1730 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -478,7 +478,6 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
                         multiple = 32;
                     }
                 }
-#if GINKGO_HIP_PLATFORM_HCC
                 if (!cuda_strategy_) {
                     multiple = 8;
                     if (nnz >= static_cast<int64_t>(1e7)) {
@@ -487,7 +486,6 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
                         multiple = 16;
                     }
                 }
-#endif  // GINKGO_HIP_PLATFORM_HCC
 
                 auto nwarps = nwarps_ * multiple;
                 return min(ceildiv(nnz, warp_size_), nwarps);
@@ -605,12 +603,10 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
                 nnz_limit = intel_nnz_limit;
                 row_len_limit = intel_row_len_limit;
             }
-#if GINKGO_HIP_PLATFORM_HCC
             if (!cuda_strategy_) {
                 nnz_limit = amd_nnz_limit;
                 row_len_limit = amd_row_len_limit;
             }
-#endif  // GINKGO_HIP_PLATFORM_HCC
             auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master();
             const bool is_mtx_on_host{host_mtx_exec ==
                                       mtx_row_ptrs.get_executor()};
diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp
index 21c40a70c0a..3899f2fefb7 100644
--- a/test/utils/executor.hpp
+++ b/test/utils/executor.hpp
@@ -63,7 +63,7 @@ inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
 
 inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
                           std::shared_ptr<gko::HipExecutor>& exec,
-                          GKO_HIP_STREAM_STRUCT* stream = nullptr)
+                          ihipStream_t* stream = nullptr)
 {
     if (gko::HipExecutor::get_num_devices() == 0) {
         throw std::runtime_error{"No suitable HIP devices"};

From b611c0b25167a10a40cb017f156e5ff34e0d31d9 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Sun, 7 May 2023 12:54:48 +0200
Subject: [PATCH 02/22] modernize HIP setup

---
 CMakeLists.txt                                |  14 ---
 INSTALL.md                                    |  13 +--
 benchmark/CMakeLists.txt                      |  13 +--
 cmake/GinkgoConfig.cmake.in                   |  16 +--
 cmake/build_helpers.cmake                     |  11 +-
 cmake/create_test.cmake                       |  54 ++-------
 cmake/hip.cmake                               | 107 +-----------------
 cmake/hip_path.cmake                          |   5 -
 hip/CMakeLists.txt                            |  52 ++-------
 hip/get_info.cmake                            |  15 +--
 hip/test/base/CMakeLists.txt                  |   8 +-
 test/test_install/CMakeLists.txt              |  27 +----
 .../identify_stream_usage/CMakeLists.txt      |  27 +++--
 13 files changed, 57 insertions(+), 305 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4b89f2dbf05..0549c7b77d1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,14 +52,6 @@ set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING
     "A list of target NVIDIA GPU architectures. See README.md for more detail.")
 # the details of fine/coarse grain memory and unsafe atomic are available https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#floating-point-fp-atomic-operations-and-coarse-fine-grained-memory-allocations
 option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic (only for AMD GPU and ROCM >= 5). Default is ON because we use hipMalloc, which is always on coarse grain. Must turn off when allocating memory on fine grain" ON)
-set(GINKGO_HIP_COMPILER_FLAGS "" CACHE STRING
-    "Set the required HIP compiler flags. Current default is an empty string.")
-set(GINKGO_HIP_NVCC_COMPILER_FLAGS "" CACHE STRING
-    "Set the required HIP nvcc compiler flags. Current default is an empty string.")
-set(GINKGO_HIP_CLANG_COMPILER_FLAGS "" CACHE STRING
-    "Set the required HIP CLANG compiler flags. Current default is an empty string.")
-set(GINKGO_HIP_AMDGPU "" CACHE STRING
-    "The amdgpu_target(s) variable passed to hipcc. The default is none (auto).")
 option(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS "Split template instantiations for slow-to-compile files. This improves parallel build performance" ON)
 mark_as_advanced(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS)
 option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF)
@@ -144,12 +136,6 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
 endif()
 
-if(BUILD_SHARED_LIBS)
-    set(GINKGO_STATIC_OR_SHARED SHARED)
-else()
-    set(GINKGO_STATIC_OR_SHARED STATIC)
-endif()
-
 # Ensure we have a debug postfix
 if(NOT DEFINED CMAKE_DEBUG_POSTFIX)
     set(CMAKE_DEBUG_POSTFIX "d")
diff --git a/INSTALL.md b/INSTALL.md
index 045d5f93a09..54889f7e335 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -51,9 +51,9 @@ Ginkgo adds the following additional switches to control what is being built:
 *   `-DGINKGO_BUILD_HIP={ON, OFF}` builds optimized HIP versions of the kernels
     (requires HIP), default is `ON` if an installation of HIP could be detected,
     `OFF` otherwise.
-*   `-DGINKGO_HIP_AMDGPU="gpuarch1;gpuarch2"` the amdgpu_target(s) variable
-    passed to hipcc for the `hcc` HIP backend. The default is none (auto).
-*   `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. Default is `OFF`.
+*   `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. If system HWLOC
+    is not found, Ginkgo will try to build it. Default is `ON` on Linux. Ginkgo
+    does not support HWLOC on Windows/MacOS, so the default is `OFF` on Windows/MacOS.
 *   `-DGINKGO_BUILD_DOC={ON, OFF}` creates an HTML version of Ginkgo's documentation
     from inline comments in the code. The default is `OFF`.
 *   `-DGINKGO_DOC_GENERATE_EXAMPLES={ON, OFF}` generates the documentation of examples
@@ -191,13 +191,6 @@ environment variable `HIP_PLATFORM` like so:
 export HIP_PLATFORM=nvcc # or nvidia for ROCM >= 4.1
 ```
 
-#### Setting platform specific compilation flags
-Platform specific compilation flags can be given through the following CMake
-variables:
-+ `-DGINKGO_HIP_COMPILER_FLAGS=`: compilation flags given to all platforms.
-+ `-DGINKGO_HIP_NVCC_COMPILER_FLAGS=`: compilation flags given to NVIDIA platforms.
-+ `-DGINKGO_HIP_CLANG_COMPILER_FLAGS=`: compilation flags given to AMD clang compiler.
-
 
 ### Third party libraries and packages
 
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 6b8ce0687d6..ca209e65057 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -25,14 +25,9 @@ endfunction()
 
 function(ginkgo_benchmark_hipsparse_linops type def)
     add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp)
+    set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES LANGUAGE HIP)
     target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def})
-    EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS)
-    set_target_properties(hipsparse_linops_${type} PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS})
-    # use Thrust C++ device just for compilation, we don't use thrust::complex in the benchmarks
-    target_compile_definitions(hipsparse_linops_${type} PUBLIC -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP)
-    target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE
-        ${HSA_HEADER} ${HIP_INCLUDE_DIRS}
-        ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
+    target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
     target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES})
 endfunction()
 
@@ -126,10 +121,8 @@ if (GINKGO_BUILD_HIP)
     ginkgo_benchmark_hipsparse_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION)
     ginkgo_benchmark_hipsparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION)
     ginkgo_benchmark_hipsparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
+    set_source_files_properties(utils/hip_timer.hip.cpp PROPERTIES LANGUAGE HIP)
     add_library(hip_timer utils/hip_timer.hip.cpp)
-    EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS)
-    set_target_properties(hip_timer PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS})
-    target_include_directories(hip_timer SYSTEM PRIVATE ${HSA_HEADER} ${HIP_INCLUDE_DIRS})
     target_link_libraries(hip_timer ginkgo)
 endif()
 
diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in
index 0776801aa99..4b4e8d77338 100644
--- a/cmake/GinkgoConfig.cmake.in
+++ b/cmake/GinkgoConfig.cmake.in
@@ -63,16 +63,10 @@ set(GINKGO_JACOBI_FULL_OPTIMIZATIONS @GINKGO_JACOBI_FULL_OPTIMIZATIONS@)
 set(GINKGO_CUDA_ARCHITECTURES "@CMAKE_CUDA_ARCHITECTURES@")
 set(GINKGO_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@")
 
-set(GINKGO_HIP_COMPILER_FLAGS "@GINKGO_HIP_COMPILER_FLAGS@")
-set(GINKGO_HIP_HCC_COMPILER_FLAGS "@GINKGO_HIP_HCC_COMPILER_FLAGS@")
-set(GINKGO_HIP_NVCC_COMPILER_FLAGS "@GINKGO_HIP_NVCC_COMPILER_FLAGS@")
-set(GINKGO_HIP_CLANG_COMPILER_FLAGS "@GINKGO_HIP_CLANG_COMPILER_FLAGS@")
-set(GINKGO_HIP_PLATFORM @GINKGO_HIP_PLATFORM@)
-set(GINKGO_HIP_PLATFORM_AMD_REGEX "@HIP_PLATFORM_AMD_REGEX@")
-set(GINKGO_HIP_PLATFORM_NVIDIA_REGEX "@HIP_PLATFORM_NVIDIA_REGEX@")
-set(GINKGO_HIP_AMDGPU "@GINKGO_HIP_AMDGPU@")
-set(GINKGO_HIP_VERSION @GINKGO_HIP_VERSION@)
-set(GINKGO_AMD_ARCH_FLAGS "@GINKGO_AMD_ARCH_FLAGS@")
+set(GINKGO_HIP_COMPILER_FLAGS "@CMAKE_HIP_COMPILER_FLAGS@")
+set(GINKGO_HIP_PLATFORM "@GINKGO_HIP_PLATFORM@")
+set(GINKGO_HIP_VERSION "@GINKGO_HIP_VERSION@")
+set(GINKGO_HIP_ARCHITECTURES "@CMAKE_HIP_ARCHITECTURES@")
 
 set(GINKGO_DPCPP_VERSION @GINKGO_DPCPP_VERSION@)
 set(GINKGO_DPCPP_MAJOR_VERSION @GINKGO_DPCPP_MAJOR_VERSION@)
@@ -175,7 +169,7 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_CUDA)
 endif()
 
 if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP)
-    find_dependency(HIP)
+    enable_language(HIP)
     find_dependency(hipblas)
     find_dependency(hipfft)
     find_dependency(hiprand)
diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake
index 34189a09450..65c987b3490 100644
--- a/cmake/build_helpers.cmake
+++ b/cmake/build_helpers.cmake
@@ -93,17 +93,12 @@ function(ginkgo_check_headers target defines)
         list(APPEND HIP_SOURCES "${HEADER_SOURCEFILE}")
     endforeach()
     if(HIP_SOURCES)
-        set_source_files_properties(${HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE)
-        hip_add_library(${target}_headers_hip ${HIP_SOURCES}) # the compiler options get set by linking to ginkgo_hip
+        set_source_files_properties(${HIP_SOURCES} PROPERTIES LANGUAGE HIP)
+        add_library(${target}_headers_hip ${HIP_SOURCES}) # the compiler options get set by linking to ginkgo_hip
         target_link_libraries(${target}_headers_hip PRIVATE ${target} roc::hipblas roc::hipsparse hip::hiprand roc::rocrand)
         target_include_directories(${target}_headers_hip
             PRIVATE
-            "${CMAKE_CURRENT_SOURCE_DIR}"
-            "${GINKGO_HIP_THRUST_PATH}"
-            "${HIPBLAS_INCLUDE_DIRS}"
-            "${hiprand_INCLUDE_DIRS}"
-            "${HIPSPARSE_INCLUDE_DIRS}"
-            "${ROCPRIM_INCLUDE_DIRS}")
+            "${CMAKE_CURRENT_SOURCE_DIR}")
     endif()
 endfunction()
 
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index c2f5c1fb94f..8214afc50ba 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -167,55 +167,14 @@ endfunction(ginkgo_create_cuda_test_internal)
 ## Test compiled with HIP
 function(ginkgo_create_hip_test test_name)
     ginkgo_build_test_name(${test_name} test_target_name)
-    ginkgo_create_hip_test_internal(${test_name} ${test_name}.hip.cpp ${test_target_name} "" ${ARGN})
+    ginkgo_create_hip_test_internal(${test_name} ${test_name}.hip.cpp ${test_target_name} ${ARGN})
 endfunction(ginkgo_create_hip_test)
 
 ## Internal function allowing separate filename, test name and test target name.
-function(ginkgo_create_hip_test_internal test_name filename test_target_name additional_flags)
-    set_source_files_properties(${filename} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE)
-    set(GINKGO_TEST_HIP_DEFINES -DGKO_COMPILING_HIP ${additional_flags})
-    if (GINKGO_FAST_TESTS)
-        list(APPEND GINKGO_TEST_HIP_DEFINES -DGINKGO_FAST_TESTS)
-    endif()
-    if (GINKGO_TEST_NONDEFAULT_STREAM)
-        list(APPEND GINKGO_TEST_HIP_DEFINES -DGKO_TEST_NONDEFAULT_STREAM)
-    endif()
-
-    # NOTE: With how HIP works, passing the flags `HIPCC_OPTIONS` etc. here
-    # creates a redefinition of all flags. This creates some issues with `nvcc`,
-    # but `clang` seems fine with the redefinitions.
-    if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
-        hip_add_executable(${test_target_name} ${filename}
-            # If `FindHIP.cmake`, namely `HIP_PARSE_HIPCC_OPTIONS` macro and
-            # call gets fixed, uncomment this.
-            HIPCC_OPTIONS ${GINKGO_TEST_HIP_DEFINES} # ${GINKGO_HIPCC_OPTIONS}
-            # NVCC_OPTIONS  ${GINKGO_TEST_HIP_DEFINES} ${GINKGO_HIP_NVCC_OPTIONS}
-            # CLANG_OPTIONS ${GINKGO_TEST_HIP_DEFINES} ${GINKGO_HIP_CLANG_OPTIONS}
-            --expt-relaxed-constexpr --expt-extended-lambda
-            )
-    else() # hcc/clang
-        hip_add_executable(${test_target_name} ${filename}
-            HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} ${GINKGO_TEST_HIP_DEFINES}
-            NVCC_OPTIONS  ${GINKGO_HIP_NVCC_OPTIONS}
-            CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS}
-            )
-    endif()
-
-    # Let's use a normal compiler for linking
-    set_target_properties(${test_target_name} PROPERTIES LINKER_LANGUAGE CXX)
-
-    target_include_directories(${test_target_name}
-        PRIVATE
-        # Only `math` requires it so far, but it's much easier
-        # to put these this way.
-        ${GINKGO_HIP_THRUST_PATH}
-        # Only `exception_helpers` requires these so far, but it's much easier
-        # to put these this way.
-        ${HIPBLAS_INCLUDE_DIRS}
-        ${HIPFFT_INCLUDE_DIRS}
-        ${hiprand_INCLUDE_DIRS}
-        ${HIPSPARSE_INCLUDE_DIRS}
-        )
+function(ginkgo_create_hip_test_internal test_name filename test_target_name)
+    set_source_files_properties(${filename} PROPERTIES LANGUAGE HIP)
+    add_executable(${test_target_name} ${filename})
+    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP)
     ginkgo_set_test_target_properties(${test_target_name} "_hip" ${ARGN})
     ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE hipgpu)
 endfunction(ginkgo_create_hip_test_internal)
@@ -317,7 +276,8 @@ function(ginkgo_create_common_device_test test_name)
     if(GINKGO_BUILD_HIP)
         # need to make a separate file for this, since we can't set conflicting properties on the same file
         configure_file(${test_name}.cpp ${test_name}.hip.cpp COPYONLY)
-        ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip "-std=c++14;-DEXEC_TYPE=HipExecutor;-DEXEC_NAMESPACE=hip" ${ARGN})
+        ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip ${ARGN})
+        target_compile_definitions(${test_target_name}_hip PRIVATE EXEC_TYPE=HipExecutor EXEC_NAMESPACE=hip)
     endif()
 endfunction(ginkgo_create_common_device_test)
 
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 72a7a3a86d8..9a4ed90b712 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -1,26 +1,4 @@
-if(DEFINED ENV{HIP_PLATFORM})
-    set(GINKGO_HIP_PLATFORM "$ENV{HIP_PLATFORM}")
-elseif(GINKGO_HIPCONFIG_PATH)
-    execute_process(COMMAND ${GINKGO_HIPCONFIG_PATH}
-        --platform OUTPUT_VARIABLE GINKGO_HIP_PLATFORM)
-else()
-    message(FATAL_ERROR "No platform could be found for HIP. "
-            "Set and export the environment variable HIP_PLATFORM.")
-endif()
-message(STATUS "HIP platform set to ${GINKGO_HIP_PLATFORM}")
-set(HIP_PLATFORM_AMD_REGEX "hcc|amd")
-set(HIP_PLATFORM_NVIDIA_REGEX "nvcc|nvidia")
-
-if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
-    set(GINKGO_HIP_PLATFORM_HCC 1)
-elseif (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
-    enable_language(CUDA)
-    set(GINKGO_HIP_PLATFORM_NVCC 1)
-endif()
-
-if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.21)
-    set(CMAKE_HIP_ARCHITECTURES OFF)
-endif()
+enable_language(HIP)
 
 if(NOT DEFINED ROCM_PATH)
     if(DEFINED ENV{ROCM_PATH})
@@ -88,24 +66,6 @@ if(NOT DEFINED ROCTRACER_PATH)
     endif()
 endif()
 
-# Find HIPCC_CMAKE_LINKER_HELPER executable
-find_program(
-    HIP_HIPCC_CMAKE_LINKER_HELPER
-    NAMES hipcc_cmake_linker_helper
-    PATHS
-    "${HIP_ROOT_DIR}"
-    ENV ROCM_PATH
-    ENV HIP_PATH
-    /opt/rocm
-    /opt/rocm/hip
-    PATH_SUFFIXES bin
-    NO_DEFAULT_PATH
-)
-if(NOT HIP_HIPCC_CMAKE_LINKER_HELPER)
-    # Now search in default paths
-    find_program(HIP_HIPCC_CMAKE_LINKER_HELPER hipcc_cmake_linker_helper)
-endif()
-
 find_program(
     HIP_HIPCONFIG_EXECUTABLE
     NAMES hipconfig
@@ -130,17 +90,6 @@ execute_process(
             ERROR_STRIP_TRAILING_WHITESPACE
             )
 
-if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
-    # ensure ENV{CUDA_PATH} is set by the user
-    if (NOT DEFINED ENV{CUDA_PATH})
-        find_path(GINKGO_HIP_DEFAULT_CUDA_PATH "cuda.h" PATH /usr/local/cuda/include NO_DEFAULT_PATH)
-        if (NOT GINKGO_HIP_DEFAULT_CUDA_PATH)
-            message(FATAL_ERROR "HIP nvidia backend was requested but CUDA could not be "
-                "located. Set and export the environment variable CUDA_PATH.")
-         endif()
-     endif()
-endif()
-
 ## Setup all CMAKE variables to find HIP and its dependencies
 set(GINKGO_HIP_MODULE_PATH "${HIP_PATH}/cmake")
 list(APPEND CMAKE_MODULE_PATH "${GINKGO_HIP_MODULE_PATH}")
@@ -155,15 +104,6 @@ list(APPEND CMAKE_PREFIX_PATH
     "${ROCRAND_PATH}/lib/cmake"
 )
 
-# NOTE: without this, HIP jacobi build takes a *very* long time. The reason for
-# that is that these variables are seemingly empty by default, thus there is no
-# proper optimization applied to the HIP builds otherwise.
-set(HIP_HIPCC_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING "Flags used by the HIPCC compiler during DEBUG builds")
-set(HIP_HIPCC_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}" CACHE STRING "Flags used by the HIPCC compiler during MINSIZEREL builds")
-set(HIP_HIPCC_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "Flags used by the HIPCC compiler during RELEASE builds")
-set(HIP_HIPCC_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}" CACHE STRING "Flags used by the HIPCC compiler during RELWITHDEBINFO builds")
-
-find_package(HIP REQUIRED)
 find_package(hipblas REQUIRED)
 find_package(hipfft) # optional dependency
 find_package(hiprand REQUIRED)
@@ -171,47 +111,8 @@ find_package(hipsparse REQUIRED)
 # At the moment, for hiprand to work also rocrand is required.
 find_package(rocrand REQUIRED)
 find_package(ROCTX)
-find_path(GINKGO_HIP_THRUST_PATH "thrust/complex.h"
-    PATHS "${HIP_PATH}/../include"
-    ENV HIP_THRUST_PATH)
-if (NOT GINKGO_HIP_THRUST_PATH)
-    message(FATAL_ERROR "Could not find the ROCm header thrust/complex.h which is required by Ginkgo HIP.")
-endif()
-
-set(GINKGO_HIP_NVCC_ARCH "")
-if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
-    if (CMAKE_CUDA_HOST_COMPILER)
-        list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS "-ccbin=${CMAKE_CUDA_HOST_COMPILER}")
-    endif()
-
-    # Remove false positive CUDA warnings when calling one<T>() and zero<T>()
-    list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS --expt-relaxed-constexpr --expt-extended-lambda)
-
-    # select GPU architecture    
-    include(cmake/Modules/CudaArchitectureSelector.cmake)
-    cas_variable_cuda_architectures(GINKGO_HIP_NVCC_ARCH
-        ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES}
-        UNSUPPORTED "20" "21")
-endif()
-
-# `target_compile_options` do not work with hip_add_library
-# Thus, we need to pass the flags to `hip_add_library` itself
-if(GINKGO_HIP_AMDGPU)
-    foreach(target ${GINKGO_HIP_AMDGPU})
-        list(APPEND GINKGO_AMD_ARCH_FLAGS --amdgpu-target=${target})
-    endforeach()
-endif()
-
 
-set(GINKGO_HIPCC_OPTIONS ${GINKGO_HIP_COMPILER_FLAGS} "-std=c++14 -DGKO_COMPILING_HIP")
-set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_ARCH} ${GINKGO_HIP_NVCC_ADDITIONAL_FLAGS})
-set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS} ${GINKGO_AMD_ARCH_FLAGS})
-if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND HIP_VERSION VERSION_GREATER_EQUAL 5)
-    list(APPEND GINKGO_HIP_CLANG_OPTIONS "-munsafe-fp-atomics -Wno-unused-command-line-argument")
+if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL 5)
+    set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -munsafe-fp-atomics -Wno-unused-command-line-argument")
 endif()
-# HIP's cmake support secretly carries around global state to remember
-# whether we created any shared libraries, and sets PIC flags accordingly.
-# CMake's scoping rules means that this makes the hip_add_* calls order- and
-# scope-dependent, which is not good. Let's set the flags ourselves instead.
-list(APPEND GINKGO_HIP_CLANG_OPTIONS "-fPIC")
-list(APPEND GINKGO_HIP_NVCC_OPTIONS "--shared -Xcompiler '-fPIC'")
+set(CMAKE_HIP_STANDARD 14)
diff --git a/cmake/hip_path.cmake b/cmake/hip_path.cmake
index 58fcd3db447..269f8403fd3 100644
--- a/cmake/hip_path.cmake
+++ b/cmake/hip_path.cmake
@@ -11,8 +11,3 @@ find_program(GINKGO_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin")
 if(GINKGO_HIPCONFIG_PATH)
     message(STATUS "Found hipconfig: ${GINKGO_HIPCONFIG_PATH}")
 endif()
-
-# We keep using NVCC/HCC for consistency with previous releases even if AMD
-# updated everything to use NVIDIA/AMD in ROCM 4.1
-set(GINKGO_HIP_PLATFORM_NVCC 0)
-set(GINKGO_HIP_PLATFORM_HCC 0)
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index ad106e123bc..56bcfec462a 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -1,3 +1,4 @@
+cmake_minimum_required(VERSION 3.21)
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE)
 add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE)
@@ -70,11 +71,7 @@ else()
     list(APPEND GINKGO_HIP_SOURCES matrix/fft_kernels_stub.hip.cpp)
 endif()
 
-if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
-    set(GKO_HIP_JACOBI_MAX_BLOCK_SIZE 32)
-else()
-    set(GKO_HIP_JACOBI_MAX_BLOCK_SIZE 64)
-endif()
+set(GKO_HIP_JACOBI_MAX_BLOCK_SIZE 64)
 if(GINKGO_JACOBI_FULL_OPTIMIZATIONS)
     set(GKO_HIP_JACOBI_BLOCK_SIZES)
     foreach(blocksize RANGE 1 ${GKO_HIP_JACOBI_MAX_BLOCK_SIZE})
@@ -102,24 +99,14 @@ endforeach()
 string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}")
 configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hip.hpp)
 
-set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE)
-hip_add_library(ginkgo_hip $<TARGET_OBJECTS:ginkgo_hip_device> ${GINKGO_HIP_SOURCES}
-    HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS}
-    CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS}
-    NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS}
-    ${GINKGO_STATIC_OR_SHARED})
+set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES LANGUAGE HIP)
+add_library(ginkgo_hip $<TARGET_OBJECTS:ginkgo_hip_device> ${GINKGO_HIP_SOURCES})
 
 target_include_directories(ginkgo_hip
-    PUBLIC
-        ${HIP_INCLUDE_DIRS}
     PRIVATE
         ${CMAKE_CURRENT_BINARY_DIR}/.. # for generated headers like jacobi_common.hip.hpp
-        ${GINKGO_HIP_THRUST_PATH}
-        ${HIPBLAS_INCLUDE_DIRS}
-        ${HIPFFT_INCLUDE_DIRS}
-        ${hiprand_INCLUDE_DIRS}
-        ${HIPSPARSE_INCLUDE_DIRS}
-        $<BUILD_INTERFACE:${ROCPRIM_INCLUDE_DIRS}>)
+        )
+target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP)
 
 target_link_libraries(ginkgo_hip PUBLIC ginkgo_device)
 target_link_libraries(ginkgo_hip PRIVATE roc::hipblas roc::hipsparse hip::hiprand roc::rocrand)
@@ -130,34 +117,11 @@ if (GINKGO_HAVE_ROCTX)
     target_link_libraries(ginkgo_hip PRIVATE roc::roctx)
 endif()
 
-if(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
-    find_package(hip REQUIRED)
-    # To save a bit of pain, we directly link against the `library` instead of
-    # linking against the target.
-    if (CMAKE_BUILD_TYPE)
-        # Check if our configuration is available first
-        string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPER_BUILD_TYPE)
-        get_target_property(HIP_LIBAMDHIP64_LIBRARIES hip::amdhip64 IMPORTED_LOCATION_${UPPER_BUILD_TYPE})
-    endif()
-    if (NOT HIP_LIBAMDHIP64_LIBRARIES)
-        # Fall back to anything
-        get_target_property(HIP_LIBAMDHIP64_LIBRARIES hip::amdhip64 LOCATION)
-    endif()
-    target_link_libraries(ginkgo_hip PUBLIC ${HIP_LIBAMDHIP64_LIBRARIES})
-elseif(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
-    find_package(CUDA 10.1 REQUIRED)
-    target_link_libraries(ginkgo_hip PUBLIC ${CUDA_LIBRARIES})
-endif()
+target_compile_options(ginkgo_hip PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${GINKGO_COMPILER_FLAGS}>)
 
 # Try to find everything in /opt/rocm/lib first.
 set(GKO_HIP_RPATH "${ROCM_PATH}/lib" )
-if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
-    list(GET CUDA_LIBRARIES 0 CUDA_FIRST_LIB)
-    get_filename_component(GKO_CUDA_LIBDIR "${CUDA_FIRST_LIB}" DIRECTORY)
-    list(APPEND GKO_HIP_RPATH "${GKO_CUDA_LIBDIR}")
-else()
-    list(APPEND GKO_HIP_RPATH "${HIP_PATH}/lib")
-endif()
+list(APPEND GKO_HIP_RPATH "${HIP_PATH}/lib")
 list(APPEND GKO_HIP_RPATH "${HIPBLAS_PATH}/lib" "${HIPRAND_PATH}/lib"
     "${HIPSPARSE_PATH}/lib" "${ROCRAND_PATH}/lib")
 
diff --git a/hip/get_info.cmake b/hip/get_info.cmake
index 1610ac0eee4..234f65642cc 100644
--- a/hip/get_info.cmake
+++ b/hip/get_info.cmake
@@ -1,15 +1,12 @@
 ginkgo_print_module_header(${detailed_log} "HIP")
 ginkgo_print_foreach_variable(${detailed_log}
-    "GINKGO_HIPCONFIG_PATH;GINKGO_HIP_AMDGPU"
-    "GINKGO_HIP_CLANG_COMPILER_FLAGS;GINKGO_HIP_NVCC_COMPILER_FLAGS"
-    "GINKGO_HIP_THRUST_PATH;GINKGO_AMD_ARCH_FLAGS")
+    "GINKGO_HIPCONFIG_PATH")
 ginkgo_print_module_footer(${detailed_log} "HIP variables:")
 ginkgo_print_foreach_variable(${detailed_log}
-    "HIP_VERSION;HIP_COMPILER;HIP_PATH;ROCM_PATH"
+    "HIP_VERSION;HIP_PATH;ROCM_PATH"
     "HIP_PLATFORM;HIP_ROOT_DIR;HIP_RUNTIME;HIPBLAS_PATH;HIPSPARSE_PATH"
-    "HIPRAND_PATH;ROCRAND_PATH;HIP_CLANG_INCLUDE_PATH;HIP_CLANG_PATH"
-    "HIP_HIPCC_EXECUTABLE;HIP_HIPCONFIG_EXECUTABLE;HIP_HOST_COMPILATION_CPP")
-ginkgo_print_flags(${detailed_log} "HIP_HIPCC_FLAGS")
-ginkgo_print_flags(${detailed_log} "HIP_NVCC_FLAGS")
-ginkgo_print_flags(${detailed_log} "HIP_CLANG_FLAGS")
+    "HIPRAND_PATH;ROCRAND_PATH;HIP_CLANG_INCLUDE_PATH"
+    "HIP_HIPCONFIG_EXECUTABLE")
+ginkgo_print_flags(${detailed_log} "CMAKE_HIP_FLAGS")
+ginkgo_print_flags(${detailed_log} "CMAKE_HIP_COMPILER")
 ginkgo_print_module_footer(${detailed_log} "")
diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt
index 11b0e209aeb..57c6f64bfd8 100644
--- a/hip/test/base/CMakeLists.txt
+++ b/hip/test/base/CMakeLists.txt
@@ -9,14 +9,8 @@ if(GINKGO_HAVE_HWLOC)
     endif()
 endif()
 ginkgo_create_hip_test(kernel_launch)
-# correct flags for kernel_launch.hpp are set in GINKGO_HIPCC_OPTIONS
 ginkgo_create_test(lin_op RESOURCE_TYPE hipgpu)
 ginkgo_create_hip_test(math)
 ginkgo_create_test(memory RESOURCE_TYPE hipgpu)
-# Only hcc needs the libraries. nvcc only requires the headers.
-if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
-    ginkgo_create_hip_test(exception_helpers ADDITIONAL_LIBRARIES roc::hipblas roc::hipsparse hip::hiprand roc::rocrand)
-else()
-    ginkgo_create_hip_test(exception_helpers)
-endif()
+ginkgo_create_hip_test(exception_helpers)
 ginkgo_create_hip_test(scoped_device_id)
diff --git a/test/test_install/CMakeLists.txt b/test/test_install/CMakeLists.txt
index 513af67e923..fef09b19021 100644
--- a/test/test_install/CMakeLists.txt
+++ b/test/test_install/CMakeLists.txt
@@ -44,33 +44,14 @@ if(GINKGO_BUILD_CUDA)
 endif()
 
 if(GINKGO_BUILD_HIP)
-    find_package(HIP REQUIRED)
-
-    set_source_files_properties(test_install.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE)
-    # The library was compiled with `-fPIC` when in shared library mode
-    if (GINKGO_BUILD_SHARED_LIBS)
-        set (GINKGO_PIC_OPTION "-fPIC")
-        set (GINKGO_CUDA_PIC_OPTION "-Xcompiler '-fPIC'")
-    else()
-        set (GINKGO_PIC_OPTION "$<$<CONFIG:Debug>:-fPIC>")
-    endif()
-    if (CMAKE_CUDA_HOST_COMPILER)
-        set(TESTINSTALL_CUDA_HOST_COMPILER "-ccbin=${CMAKE_CUDA_HOST_COMPILER}")
-    endif()
-    hip_add_executable(test_install_hip test_install.cpp
-        HIPCC_OPTIONS "-std=c++14"
-        CLANG_OPTIONS "${GINKGO_PIC_OPTION}"
-        NVCC_OPTIONS "${GINKGO_CUDA_PIC_OPTION}" "${TESTINSTALL_CUDA_HOST_COMPILER}")
+    enable_language(HIP)
+    configure_file(test_install.cpp test_install.hip.cpp COPYONLY)
+    set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/test_install.hip.cpp PROPERTIES LANGUAGE HIP)
+    add_executable(test_install_hip ${CMAKE_CURRENT_BINARY_DIR}/test_install.hip.cpp)
 
     target_link_libraries(test_install_hip PRIVATE Ginkgo::ginkgo)
     target_compile_definitions(test_install_hip PRIVATE HAS_HIP=1)
     target_compile_definitions(test_install_hip PRIVATE HAS_REFERENCE=${HAS_REFERENCE})
-
-    # If we always link with CXX there is no RPATH issue
-    set_target_properties(test_install_hip PROPERTIES LINKER_LANGUAGE CXX)
-
-    # Instead of using CXX, it's possible to instead use
-    # target_link_libraries(test_install_hip PRIVATE ${GINKGO_INSTALL_RPATH_FOR_HIP})
 endif()
 
 # Installation step
diff --git a/third_party/identify_stream_usage/CMakeLists.txt b/third_party/identify_stream_usage/CMakeLists.txt
index c20fe43278b..42036baede1 100644
--- a/third_party/identify_stream_usage/CMakeLists.txt
+++ b/third_party/identify_stream_usage/CMakeLists.txt
@@ -27,22 +27,21 @@ if(GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
     find_package(hip REQUIRED)
     set_source_files_properties(identify_stream_usage.hip.cpp
                                 test_default_stream_identification.hip.cpp
-                                PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE)
-    hip_add_library(identify_stream_usage_hip
-                    identify_stream_usage.hip.cpp
-                    HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS}
-                    CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS}
-                    NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS}
-                    SHARED)
-    target_link_libraries(identify_stream_usage_hip PUBLIC hip::amdhip64 ${CMAKE_DL_LIBS})
+                                PROPERTIES LANGUAGE HIP)
+    add_library(identify_stream_usage_hip SHARED identify_stream_usage.hip.cpp)
+    target_link_libraries(identify_stream_usage_hip ${CMAKE_DL_LIBS})
+    set_target_properties(
+        identify_stream_usage_hip
+        PROPERTIES # set target compile options
+                  CXX_STANDARD 17
+                  CXX_STANDARD_REQUIRED ON
+                  POSITION_INDEPENDENT_CODE ON
+    )
   
-    hip_add_executable(test_stream_identification_hip
-                       test_default_stream_identification.hip.cpp
-                       HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS}
-                       NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS}
-                       CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS})
+    add_executable(test_stream_identification_hip
+                   test_default_stream_identification.hip.cpp)
     add_test(NAME default_stream_identification_hip COMMAND test_stream_identification_hip)
     
     set_tests_properties(default_stream_identification_hip PROPERTIES
                          ENVIRONMENT LD_PRELOAD=$<TARGET_FILE:identify_stream_usage_hip>)
-endif()
\ No newline at end of file
+endif()

From 46ddec40a182270c347a2982f65b40e81a4c1d2e Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 15 Feb 2024 15:08:01 +0100
Subject: [PATCH 03/22] work around ROCm compiler bug

---
 hip/CMakeLists.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 56bcfec462a..8bcf2fb9e7d 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -91,6 +91,16 @@ foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES)
     configure_file(
         preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
         preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
+    # The 3D indexing used in Jacobi kernel triggers an instruction selection bug in Debug builds
+    # Probably the same as https://github.com/llvm/llvm-project/issues/67574
+    # Fixed in ROCm 6.0 https://github.com/ROCm/llvm-project/commit/cd7f574a1fd1d3f3e8b9c1cae61fa8133a51de5f
+    # and in LLVM trunk https://github.com/llvm/llvm-project/commit/cc3d2533cc2e4ea06981b86ede5087fbf801e789
+    set_source_files_properties(
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        PROPERTIES
+        COMPILE_OPTIONS $<$<CONFIG:Debug>:-O2>)
     list(APPEND GINKGO_HIP_SOURCES
         ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
         ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp

From e90f8d74cd346b0d9c4abdfec5a7b5b85aa19f1f Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Thu, 15 Feb 2024 15:25:19 +0100
Subject: [PATCH 04/22] autodetect based on HIP language

---
 cmake/autodetect_executors.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/autodetect_executors.cmake b/cmake/autodetect_executors.cmake
index 757262f1ea1..d3ad2e3a6a1 100644
--- a/cmake/autodetect_executors.cmake
+++ b/cmake/autodetect_executors.cmake
@@ -31,7 +31,8 @@ if (NOT DEFINED GINKGO_BUILD_CUDA)
 endif()
 
 if (NOT DEFINED GINKGO_BUILD_HIP)
-    if(GINKGO_HIPCONFIG_PATH)
+    check_language(HIP)
+    if(CMAKE_HIP_COMPILER)
         message(STATUS "Enabling HIP executor")
         set(GINKGO_HAS_HIP ON)
     endif()

From 7419034716d9564c9c9eac1c03ee22a1d8562bee Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 23 Feb 2024 15:15:14 +0100
Subject: [PATCH 05/22] fix exception_helpers includes

---
 hip/test/base/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt
index 57c6f64bfd8..ae29eb782f1 100644
--- a/hip/test/base/CMakeLists.txt
+++ b/hip/test/base/CMakeLists.txt
@@ -12,5 +12,9 @@ ginkgo_create_hip_test(kernel_launch)
 ginkgo_create_test(lin_op RESOURCE_TYPE hipgpu)
 ginkgo_create_hip_test(math)
 ginkgo_create_test(memory RESOURCE_TYPE hipgpu)
-ginkgo_create_hip_test(exception_helpers)
+set(additional_libs)
+if(hipfft_FOUND)
+    set(additional_libs hip::hipfft)
+endif()
+ginkgo_create_hip_test(exception_helpers ADDITIONAL_LIBRARIES roc::hipblas roc::hipsparse hip::hiprand roc::rocrand ${additional_libs})
 ginkgo_create_hip_test(scoped_device_id)

From 8ac6658b5042ca29e317ccff8af8c3a7fc9a609b Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 8 Apr 2024 15:44:03 +0200
Subject: [PATCH 06/22] Revert "remove HIP CUDA support from source"

This reverts commit c8ea7f47d2d350973ffb31628d887d259ffb662a.
---
 common/cuda_hip/components/atomic.hpp.inc | 17 +++++++--
 core/device_hooks/hip_hooks.cpp           | 16 ++++-----
 core/log/profiler_hook.cpp                |  2 ++
 core/log/profiler_hook.hpp                |  4 +++
 core/test/base/executor.cpp               |  7 ++++
 cuda/base/executor.cpp                    | 10 ++++++
 devices/cuda/executor.cpp                 |  4 +++
 devices/hip/executor.cpp                  |  4 +++
 hip/base/config.hip.hpp                   |  8 +++++
 hip/base/device.hip.cpp                   |  2 +-
 hip/base/device.hpp                       |  2 +-
 hip/base/executor.hip.cpp                 | 17 +++++++++
 hip/base/stream.hip.cpp                   |  2 +-
 hip/base/thrust.hip.hpp                   | 10 +++++-
 hip/components/cooperative_groups.hip.hpp | 44 +++++++++++++++++++++--
 hip/components/format_conversion.hip.hpp  | 13 +++++++
 hip/preconditioner/jacobi_kernels.hip.cpp |  6 +++-
 hip/test/base/hip_executor.hip.cpp        | 12 +++++++
 include/ginkgo/config.hpp.in              |  4 +--
 include/ginkgo/core/base/executor.hpp     | 35 +++++++++++++++---
 include/ginkgo/core/base/fwd_decls.hpp    |  7 ++++
 include/ginkgo/core/base/math.hpp         |  4 +--
 include/ginkgo/core/base/memory.hpp       | 16 +++++----
 include/ginkgo/core/base/stream.hpp       |  4 +--
 include/ginkgo/core/base/timer.hpp        |  4 +--
 include/ginkgo/core/matrix/csr.hpp        |  4 +++
 test/utils/executor.hpp                   |  2 +-
 27 files changed, 222 insertions(+), 38 deletions(-)

diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc
index 00fed2db4a6..3d76cfdcb79 100644
--- a/common/cuda_hip/components/atomic.hpp.inc
+++ b/common/cuda_hip/components/atomic.hpp.inc
@@ -119,7 +119,7 @@ GKO_BIND_ATOMIC_ADD(unsigned long long int);
 GKO_BIND_ATOMIC_ADD(float);
 
 // AMD
-#if defined(__HIPCC__)
+#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
 
 
 // the double atomicAdd is added after 4.3
@@ -157,7 +157,7 @@ GKO_BIND_ATOMIC_ADD(__half2);
         // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
 
 
-#endif  // defined(__HIPCC__)
+#endif  // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
 
 
 #undef GKO_BIND_ATOMIC_ADD
@@ -179,7 +179,20 @@ __forceinline__ __device__ T atomic_max(T* __restrict__ addr, T val)
 
 GKO_BIND_ATOMIC_MAX(int);
 GKO_BIND_ATOMIC_MAX(unsigned int);
+
+#if !defined(__HIPCC__) || \
+    (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC)
+
+
+#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__)
+// Only Compute Capability 3.5 and higher supports 64-bit atomicMax
+GKO_BIND_ATOMIC_MAX(unsigned long long int);
+#endif
+
+#else   // Is HIP platform & on AMD hardware
 GKO_BIND_ATOMIC_MAX(unsigned long long int);
+#endif  // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) &&
+        // GINKGO_HIP_PLATFORM_HCC)
 
 
 #undef GKO_BIND_ATOMIC_MAX
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index cb85019e542..a90691e1af4 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -33,7 +33,7 @@ void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
 void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
 
 
-HipAsyncAllocator::HipAsyncAllocator(ihipStream_t* stream)
+HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream)
     GKO_NOT_COMPILED(hip);
 
 
@@ -44,7 +44,7 @@ void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
 
 
 bool HipAsyncAllocator::check_environment(int device_id,
-                                          ihipStream_t* stream) const
+                                          GKO_HIP_STREAM_STRUCT* stream) const
     GKO_NOT_COMPILED(hip);
 
 
@@ -62,7 +62,7 @@ void HipUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
 
 
 bool HipUnifiedAllocator::check_environment(int device_id,
-                                            ihipStream_t* stream) const
+                                            GKO_HIP_STREAM_STRUCT* stream) const
     GKO_NOT_COMPILED(hip);
 
 
@@ -76,13 +76,13 @@ void HipHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
 
 
 bool HipHostAllocator::check_environment(int device_id,
-                                         ihipStream_t* stream) const
+                                         GKO_HIP_STREAM_STRUCT* stream) const
     GKO_NOT_COMPILED(hip);
 
 
 std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
-    allocation_mode alloc_mode, ihipStream_t* stream)
+    allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream)
 {
     return std::shared_ptr<HipExecutor>(
         new HipExecutor(device_id, std::move(master),
@@ -92,7 +92,7 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
 
 std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master,
-    std::shared_ptr<HipAllocatorBase> alloc, ihipStream_t* stream)
+    std::shared_ptr<HipAllocatorBase> alloc, GKO_HIP_STREAM_STRUCT* stream)
 {
     return std::shared_ptr<HipExecutor>(
         new HipExecutor(device_id, std::move(master), alloc, stream));
@@ -204,7 +204,7 @@ hip_stream::~hip_stream() {}
 hip_stream::hip_stream(hip_stream&&) GKO_NOT_COMPILED(hip);
 
 
-ihipStream_t* hip_stream::get() const GKO_NOT_COMPILED(hip);
+GKO_HIP_STREAM_STRUCT* hip_stream::get() const GKO_NOT_COMPILED(hip);
 
 
 HipTimer::HipTimer(std::shared_ptr<const HipExecutor> exec)
@@ -232,7 +232,7 @@ namespace hip {
 void reset_device(int device_id) GKO_NOT_COMPILED(hip);
 
 
-void destroy_event(ihipEvent_t* event) GKO_NOT_COMPILED(hip);
+void destroy_event(GKO_HIP_EVENT_STRUCT* event) GKO_NOT_COMPILED(hip);
 
 
 }  // namespace hip
diff --git a/core/log/profiler_hook.cpp b/core/log/profiler_hook.cpp
index 8b5d84b2f0e..a8eef7668f2 100644
--- a/core/log/profiler_hook.cpp
+++ b/core/log/profiler_hook.cpp
@@ -391,9 +391,11 @@ std::shared_ptr<ProfilerHook> ProfilerHook::create_for_executor(
     if (std::dynamic_pointer_cast<const CudaExecutor>(exec)) {
         return create_nvtx();
     }
+#if (GINKGO_HIP_PLATFORM_NVCC == 0)
     if (std::dynamic_pointer_cast<const HipExecutor>(exec)) {
         return create_roctx();
     }
+#endif
     if (std::dynamic_pointer_cast<const DpcppExecutor>(exec)) {
         return create_vtune();
     }
diff --git a/core/log/profiler_hook.hpp b/core/log/profiler_hook.hpp
index b6a88c1d471..3f4baf80db1 100644
--- a/core/log/profiler_hook.hpp
+++ b/core/log/profiler_hook.hpp
@@ -110,6 +110,7 @@ class profiling_scope_guard : log::profiling_scope_guard {
 namespace hip {
 
 
+#if (GINKGO_HIP_PLATFORM_NVCC == 0)
 class profiling_scope_guard : log::profiling_scope_guard {
 public:
     profiling_scope_guard(const char* name)
@@ -118,6 +119,9 @@ class profiling_scope_guard : log::profiling_scope_guard {
                                      log::begin_roctx, log::end_nvtx}
     {}
 };
+#else
+using profiling_scope_guard = log::default_profiling_scope_guard;
+#endif
 
 
 }  // namespace hip
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 989a1137b14..dc4ea5aad63 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -336,10 +336,17 @@ TEST(Executor, CanVerifyMemory)
         ASSERT_EQ(false, gpu_dpcpp->memory_accessible(gpu_dpcpp_dup));
         ASSERT_EQ(false, gpu_dpcpp_dup->memory_accessible(gpu_dpcpp));
     }
+#if GINKGO_HIP_PLATFORM_NVCC
+    ASSERT_EQ(true, hip->memory_accessible(cuda));
+    ASSERT_EQ(true, cuda->memory_accessible(hip));
+    ASSERT_EQ(true, hip_1->memory_accessible(cuda_1));
+    ASSERT_EQ(true, cuda_1->memory_accessible(hip_1));
+#else
     ASSERT_EQ(false, hip->memory_accessible(cuda));
     ASSERT_EQ(false, cuda->memory_accessible(hip));
     ASSERT_EQ(false, hip_1->memory_accessible(cuda_1));
     ASSERT_EQ(false, cuda_1->memory_accessible(hip_1));
+#endif
     ASSERT_EQ(true, omp->memory_accessible(omp2));
     ASSERT_EQ(true, hip->memory_accessible(hip2));
     ASSERT_EQ(true, cuda->memory_accessible(cuda2));
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 82bc56792e5..52a92132689 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -135,7 +135,17 @@ void CudaExecutor::raw_copy_to(const OmpExecutor*, size_type num_bytes,
 void CudaExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes,
                                const void* src_ptr, void* dest_ptr) const
 {
+#if GINKGO_HIP_PLATFORM_NVCC == 1
+    if (num_bytes > 0) {
+        detail::cuda_scoped_device_id_guard g(this->get_device_id());
+        GKO_ASSERT_NO_CUDA_ERRORS(cudaMemcpyPeerAsync(
+            dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(),
+            num_bytes, this->get_stream()));
+        this->synchronize();
+    }
+#else
     GKO_NOT_SUPPORTED(dest);
+#endif
 }
 
 
diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp
index f16225c07ee..58261c318fb 100644
--- a/devices/cuda/executor.cpp
+++ b/devices/cuda/executor.cpp
@@ -28,7 +28,11 @@ bool CudaExecutor::verify_memory_to(const CudaExecutor* dest_exec) const
 
 bool CudaExecutor::verify_memory_to(const HipExecutor* dest_exec) const
 {
+#if GINKGO_HIP_PLATFORM_NVCC
+    return this->get_device_id() == dest_exec->get_device_id();
+#else
     return false;
+#endif
 }
 
 
diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp
index a8bab47dd61..6954e31b24b 100644
--- a/devices/hip/executor.cpp
+++ b/devices/hip/executor.cpp
@@ -25,7 +25,11 @@ bool HipExecutor::verify_memory_to(const HipExecutor* dest_exec) const
 
 bool HipExecutor::verify_memory_to(const CudaExecutor* dest_exec) const
 {
+#if GINKGO_HIP_PLATFORM_NVCC
+    return this->get_device_id() == dest_exec->get_device_id();
+#else
     return false;
+#endif
 }
 
 
diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp
index e0fb2d73210..fbad841fd0f 100644
--- a/hip/base/config.hip.hpp
+++ b/hip/base/config.hip.hpp
@@ -27,13 +27,21 @@ struct config {
     /**
      * The type containing a bitmask over all lanes of a warp.
      */
+#if GINKGO_HIP_PLATFORM_HCC
     using lane_mask_type = uint64;
+#else  // GINKGO_HIP_PLATFORM_NVCC
+    using lane_mask_type = uint32;
+#endif
 
     /**
      * The number of threads within a HIP warp. Here, we use the definition from
      * `device_functions.h`.
      */
+#if GINKGO_HIP_PLATFORM_HCC
     static constexpr uint32 warp_size = warpSize;
+#else  // GINKGO_HIP_PLATFORM_NVCC
+    static constexpr uint32 warp_size = 32;
+#endif
 
     /**
      * The bitmask of the entire warp.
diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp
index 2eaa92e8a66..58376c2175b 100644
--- a/hip/base/device.hip.cpp
+++ b/hip/base/device.hip.cpp
@@ -28,7 +28,7 @@ void reset_device(int device_id)
 }
 
 
-void destroy_event(ihipEvent_t* event)
+void destroy_event(GKO_HIP_EVENT_STRUCT* event)
 {
     GKO_ASSERT_NO_HIP_ERRORS(hipEventDestroy(event));
 }
diff --git a/hip/base/device.hpp b/hip/base/device.hpp
index f00e75851ca..f0ceae0dc2b 100644
--- a/hip/base/device.hpp
+++ b/hip/base/device.hpp
@@ -18,7 +18,7 @@ void reset_device(int device_id);
 
 
 /** calls hipEventDestroy on the given event. */
-void destroy_event(ihipEvent_t* event);
+void destroy_event(GKO_HIP_EVENT_STRUCT* event);
 
 
 /** returns hipDeviceProp.name for the given device */
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 22d1ce0c1e2..2694ce4177f 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -131,7 +131,17 @@ void HipExecutor::raw_copy_to(const OmpExecutor*, size_type num_bytes,
 void HipExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes,
                               const void* src_ptr, void* dest_ptr) const
 {
+#if GINKGO_HIP_PLATFORM_NVCC == 1
+    if (num_bytes > 0) {
+        detail::hip_scoped_device_id_guard g(this->get_device_id());
+        GKO_ASSERT_NO_HIP_ERRORS(hipMemcpyPeerAsync(
+            dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(),
+            num_bytes, this->get_stream()));
+        this->synchronize();
+    }
+#else
     GKO_NOT_SUPPORTED(dest);
+#endif
 }
 
 
@@ -212,9 +222,16 @@ void HipExecutor::set_gpu_property()
             this->get_device_id()));
         this->get_exec_info().max_workgroup_size = max_threads_per_block;
         this->get_exec_info().max_workitem_sizes = max_threads_per_block_dim;
+#if GINKGO_HIP_PLATFORM_NVCC
+        this->get_exec_info().num_pu_per_cu =
+            convert_sm_ver_to_cores(this->get_exec_info().major,
+                                    this->get_exec_info().minor) /
+            kernels::hip::config::warp_size;
+#else
         // In GCN (Graphics Core Next), each multiprocessor has 4 SIMD
         // Reference: https://en.wikipedia.org/wiki/Graphics_Core_Next
         this->get_exec_info().num_pu_per_cu = 4;
+#endif  // GINKGO_HIP_PLATFORM_NVCC
         this->get_exec_info().max_subgroup_size =
             kernels::hip::config::warp_size;
     }
diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp
index 47f4092cc82..93c1fc008d9 100644
--- a/hip/base/stream.hip.cpp
+++ b/hip/base/stream.hip.cpp
@@ -44,7 +44,7 @@ hip_stream::hip_stream(hip_stream&& other)
 {}
 
 
-ihipStream_t* hip_stream::get() const { return stream_; }
+GKO_HIP_STREAM_STRUCT* hip_stream::get() const { return stream_; }
 
 
 }  // namespace gko
diff --git a/hip/base/thrust.hip.hpp b/hip/base/thrust.hip.hpp
index 008f1e0645b..2c0412fb67d 100644
--- a/hip/base/thrust.hip.hpp
+++ b/hip/base/thrust.hip.hpp
@@ -7,11 +7,15 @@
 
 
 #include <thrust/execution_policy.h>
-#include <thrust/system/hip/detail/execution_policy.h>
 
 
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#if GINKGO_HIP_PLATFORM_HCC
+#include <thrust/system/hip/detail/execution_policy.h>
+#else
+#include <thrust/system/cuda/detail/execution_policy.h>
+#endif
 
 
 namespace gko {
@@ -21,7 +25,11 @@ namespace hip {
 
 inline auto thrust_policy(std::shared_ptr<const HipExecutor> exec)
 {
+#if GINKGO_HIP_PLATFORM_HCC
     return thrust::hip::par.on(exec->get_stream());
+#else
+    return thrust::cuda::par.on(exec->get_stream());
+#endif
 }
 
 
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index 2e5723366e6..247218a1457 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -167,8 +167,14 @@ class thread_block_tile {
 
     __device__ __forceinline__ unsigned size() const noexcept { return Size; }
 
-    __device__ __forceinline__ void sync() const noexcept {}
+    __device__ __forceinline__ void sync() const noexcept
+    {
+#if GINKGO_HIP_PLATFORM_NVCC
+        __syncwarp(data_.mask);
+#endif  // GINKGO_HIP_PLATFORM_NVCC
+    }
 
+#if GINKGO_HIP_PLATFORM_HCC
 #define GKO_BIND_SHFL(ShflOp, ValueType, SelectorType)                       \
     __device__ __forceinline__ ValueType ShflOp(                             \
         ValueType var, SelectorType selector) const noexcept                 \
@@ -178,6 +184,17 @@ class thread_block_tile {
     static_assert(true,                                                      \
                   "This assert is used to counter the false positive extra " \
                   "semi-colon warnings")
+#else
+#define GKO_BIND_SHFL(ShflOp, ValueType, SelectorType)                       \
+    __device__ __forceinline__ ValueType ShflOp(                             \
+        ValueType var, SelectorType selector) const noexcept                 \
+    {                                                                        \
+        return __##ShflOp##_sync(data_.mask, var, selector, Size);           \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+#endif
 
     GKO_BIND_SHFL(shfl, int32, int32);
     GKO_BIND_SHFL(shfl, float, int32);
@@ -205,11 +222,15 @@ class thread_block_tile {
      */
     __device__ __forceinline__ int any(int predicate) const noexcept
     {
+#if GINKGO_HIP_PLATFORM_HCC
         if (Size == config::warp_size) {
             return __any(predicate);
         } else {
             return (__ballot(predicate) & data_.mask) != 0;
         }
+#else
+        return __any_sync(data_.mask, predicate);
+#endif
     }
 
     /**
@@ -218,11 +239,15 @@ class thread_block_tile {
      */
     __device__ __forceinline__ int all(int predicate) const noexcept
     {
+#if GINKGO_HIP_PLATFORM_HCC
         if (Size == config::warp_size) {
             return __all(predicate);
         } else {
             return (__ballot(predicate) & data_.mask) == data_.mask;
         }
+#else
+        return __all_sync(data_.mask, predicate);
+#endif
     }
 
     /**
@@ -235,11 +260,19 @@ class thread_block_tile {
     __device__ __forceinline__ config::lane_mask_type ballot(
         int predicate) const noexcept
     {
+#if GINKGO_HIP_PLATFORM_HCC
         if (Size == config::warp_size) {
             return __ballot(predicate);
         } else {
             return (__ballot(predicate) & data_.mask) >> data_.lane_offset;
         }
+#else
+        if (Size == config::warp_size) {
+            return __ballot_sync(data_.mask, predicate);
+        } else {
+            return __ballot_sync(data_.mask, predicate) >> data_.lane_offset;
+        }
+#endif
     }
 
 private:
@@ -312,9 +345,14 @@ class enable_extended_shuffle : public Group {
 }  // namespace detail
 
 
+// Implementing this as a using directive messes up with SFINAE for some reason,
+// probably a bug in NVCC. If it is a complete type, everything works fine.
 template <unsigned Size>
-using thread_block_tile =
-    detail::enable_extended_shuffle<detail::thread_block_tile<Size>>;
+struct thread_block_tile
+    : detail::enable_extended_shuffle<detail::thread_block_tile<Size>> {
+    using detail::enable_extended_shuffle<
+        detail::thread_block_tile<Size>>::enable_extended_shuffle;
+};
 
 
 // Only support tile_partition with 1, 2, 4, 8, 16, 32, 64 (hip).
diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp
index a5b84533ddd..59c0405a874 100644
--- a/hip/components/format_conversion.hip.hpp
+++ b/hip/components/format_conversion.hip.hpp
@@ -80,12 +80,25 @@ __host__ size_type calculate_nwarps(std::shared_ptr<const HipExecutor> exec,
     size_type nwarps_in_hip = exec->get_num_multiprocessor() *
                               exec->get_num_warps_per_sm() * config::warp_size /
                               subwarp_size;
+#if GINKGO_HIP_PLATFORM_NVCC
+    size_type multiple = 8;
+    if (nnz >= 2e8) {
+        multiple = 2048;
+    } else if (nnz >= 2e7) {
+        multiple = 512;
+    } else if (nnz >= 2e6) {
+        multiple = 128;
+    } else if (nnz >= 2e5) {
+        multiple = 32;
+    }
+#else
     size_type multiple = 2;
     if (nnz >= 1e7) {
         multiple = 32;
     } else if (nnz >= 1e5) {
         multiple = 8;
     }
+#endif  // GINKGO_HIP_PLATFORM_NVCC
 #ifdef GINKGO_BENCHMARK_ENABLE_TUNING
     if (_tuning_flag) {
         multiple = _tuned_value;
diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp
index 83af45132c4..1646a7fb376 100644
--- a/hip/preconditioner/jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_kernels.hip.cpp
@@ -33,8 +33,12 @@ namespace hip {
 namespace jacobi {
 
 
-// a total of 16 warps (1024 threads)
+// a total of 32/16 warps (1024 threads)
+#if GINKGO_HIP_PLATFORM_HCC
 constexpr int default_num_warps = 16;
+#else  // GINKGO_HIP_PLATFORM_NVCC
+constexpr int default_num_warps = 32;
+#endif
 // with current architectures, at most 32 warps can be scheduled per SM (and
 // current GPUs have at most 84 SMs)
 constexpr int default_grid_size = 32 * 32 * 128;
diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp
index 908f1c06c3e..cfdfc3122fd 100644
--- a/hip/test/base/hip_executor.hip.cpp
+++ b/hip/test/base/hip_executor.hip.cpp
@@ -164,7 +164,11 @@ TEST_F(HipExecutor, FailsWhenOverallocating)
 __global__ void check_data(int* data)
 {
     if (data[0] != 3 || data[1] != 8) {
+#if GINKGO_HIP_PLATFORM_HCC
         asm("s_trap 0x02;");
+#else  // GINKGO_HIP_PLATFORM_NVCC
+        asm("trap;");
+#endif
     }
 }
 
@@ -184,7 +188,11 @@ TEST_F(HipExecutor, CopiesDataToHip)
 __global__ void check_data2(int* data)
 {
     if (data[0] != 4 || data[1] != 8) {
+#if GINKGO_HIP_PLATFORM_HCC
         asm("s_trap 0x02;");
+#else  // GINKGO_HIP_PLATFORM_NVCC
+        asm("trap;");
+#endif
     }
 }
 
@@ -308,7 +316,11 @@ TEST_F(HipExecutor, ExecInfoSetsCorrectProperties)
         &max_threads_per_block, hipDeviceAttributeMaxThreadsPerBlock, dev_id));
     GKO_ASSERT_NO_HIP_ERRORS(
         hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, dev_id));
+#if GINKGO_HIP_PLATFORM_NVCC
+    auto num_cores = convert_sm_ver_to_cores(major, minor);
+#else
     auto num_cores = warp_size * 4;
+#endif
 
     ASSERT_EQ(hip->get_major_version(), major);
     ASSERT_EQ(hip->get_minor_version(), minor);
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index 34d06d8d63c..329918399d6 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -41,10 +41,10 @@
 
 /* What is HIP compiled for, hcc or nvcc? */
 // clang-format off
-#define GINKGO_HIP_PLATFORM_HCC 1
+#define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@
 
 
-#define GINKGO_HIP_PLATFORM_NVCC 0
+#define GINKGO_HIP_PLATFORM_NVCC @GINKGO_HIP_PLATFORM_NVCC@
 // clang-format on
 
 
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 6b3f6d3755d..d7db35d2e3c 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -63,10 +63,34 @@ enum class log_propagation_mode {
 enum class allocation_mode { device, unified_global, unified_host };
 
 
+#ifdef NDEBUG
+
+// When in release, prefer device allocations
 constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
 
 constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
 
+#else
+
+// When in debug, always UM allocations.
+constexpr allocation_mode default_cuda_alloc_mode =
+    allocation_mode::unified_global;
+
+#if (GINKGO_HIP_PLATFORM_HCC == 1)
+
+// HIP on AMD GPUs does not support UM, so always prefer device allocations.
+constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
+
+#else
+
+// HIP on NVIDIA GPUs supports UM, so prefer UM allocations.
+constexpr allocation_mode default_hip_alloc_mode =
+    allocation_mode::unified_global;
+
+#endif
+
+#endif
+
 
 }  // namespace gko
 
@@ -1694,13 +1718,13 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     static std::shared_ptr<HipExecutor> create(
         int device_id, std::shared_ptr<Executor> master, bool device_reset,
         allocation_mode alloc_mode = default_hip_alloc_mode,
-        ihipStream_t* stream = nullptr);
+        GKO_HIP_STREAM_STRUCT* stream = nullptr);
 
     static std::shared_ptr<HipExecutor> create(
         int device_id, std::shared_ptr<Executor> master,
         std::shared_ptr<HipAllocatorBase> alloc =
             std::make_shared<HipAllocator>(),
-        ihipStream_t* stream = nullptr);
+        GKO_HIP_STREAM_STRUCT* stream = nullptr);
 
     std::shared_ptr<Executor> get_master() noexcept override;
 
@@ -1806,7 +1830,7 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
         return this->get_exec_info().closest_pu_ids;
     }
 
-    ihipStream_t* get_stream() const { return stream_; }
+    GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; }
 
 protected:
     void set_gpu_property();
@@ -1814,7 +1838,8 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     void init_handles();
 
     HipExecutor(int device_id, std::shared_ptr<Executor> master,
-                std::shared_ptr<HipAllocatorBase> alloc, ihipStream_t* stream)
+                std::shared_ptr<HipAllocatorBase> alloc,
+                GKO_HIP_STREAM_STRUCT* stream)
         : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
     {
         this->get_exec_info().device_id = device_id;
@@ -1851,7 +1876,7 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     handle_manager<hipblasContext> hipblas_handle_;
     handle_manager<hipsparseContext> hipsparse_handle_;
     std::shared_ptr<HipAllocatorBase> alloc_;
-    ihipStream_t* stream_;
+    GKO_HIP_STREAM_STRUCT* stream_;
 };
 
 
diff --git a/include/ginkgo/core/base/fwd_decls.hpp b/include/ginkgo/core/base/fwd_decls.hpp
index 606328f45a4..f7e446d7bf2 100644
--- a/include/ginkgo/core/base/fwd_decls.hpp
+++ b/include/ginkgo/core/base/fwd_decls.hpp
@@ -21,8 +21,15 @@ struct hipblasContext;
 
 struct hipsparseContext;
 
+#if GINKGO_HIP_PLATFORM_HCC
 struct ihipStream_t;
 struct ihipEvent_t;
+#define GKO_HIP_STREAM_STRUCT ihipStream_t
+#define GKO_HIP_EVENT_STRUCT ihipEvent_t
+#else
+#define GKO_HIP_STREAM_STRUCT CUstream_st
+#define GKO_HIP_EVENT_STRUCT CUevent_st
+#endif
 
 
 // after intel/llvm September'22 release, which uses major version 6, they
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index b81bea5e145..30b0da475d0 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -616,7 +616,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr int64 ceildiv(int64 num, int64 den)
 }
 
 
-#if defined(__HIPCC__)
+#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
 
 
 /**
@@ -794,7 +794,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T&)
 }
 
 
-#endif  // defined(__HIPCC__)
+#endif  // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
 
 
 #undef GKO_BIND_ZERO_ONE
diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp
index 78fe2cee829..7f8da044fd9 100644
--- a/include/ginkgo/core/base/memory.hpp
+++ b/include/ginkgo/core/base/memory.hpp
@@ -77,7 +77,8 @@ class HipAllocatorBase : public Allocator {
      * @return true if and only if the allocator can be used by HipExecutor in
      *         the given environment.
      */
-    virtual bool check_environment(int device_id, ihipStream_t* stream) const
+    virtual bool check_environment(int device_id,
+                                   GKO_HIP_STREAM_STRUCT* stream) const
     {
         return true;
     }
@@ -185,13 +186,14 @@ class HipAsyncAllocator : public HipAllocatorBase {
 
     void deallocate(void* ptr) override;
 
-    HipAsyncAllocator(ihipStream_t* stream);
+    HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream);
 
 protected:
-    bool check_environment(int device_id, ihipStream_t* stream) const override;
+    bool check_environment(int device_id,
+                           GKO_HIP_STREAM_STRUCT* stream) const override;
 
 private:
-    ihipStream_t* stream_;
+    GKO_HIP_STREAM_STRUCT* stream_;
 };
 
 
@@ -209,7 +211,8 @@ class HipUnifiedAllocator : public HipAllocatorBase, public CpuAllocatorBase {
     HipUnifiedAllocator(int device_id, unsigned int flags);
 
 protected:
-    bool check_environment(int device_id, ihipStream_t* stream) const override;
+    bool check_environment(int device_id,
+                           GKO_HIP_STREAM_STRUCT* stream) const override;
 
 private:
     int device_id_;
@@ -229,7 +232,8 @@ class HipHostAllocator : public HipAllocatorBase, public CpuAllocatorBase {
     HipHostAllocator(int device_id);
 
 protected:
-    bool check_environment(int device_id, ihipStream_t* stream) const override;
+    bool check_environment(int device_id,
+                           GKO_HIP_STREAM_STRUCT* stream) const override;
 
 private:
     int device_id_;
diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp
index f95d8971a3a..22af70ac14b 100644
--- a/include/ginkgo/core/base/stream.hpp
+++ b/include/ginkgo/core/base/stream.hpp
@@ -89,10 +89,10 @@ class hip_stream {
      * Returns the native HIP stream handle.
      * In an empty hip_stream, this will return nullptr.
      */
-    ihipStream_t* get() const;
+    GKO_HIP_STREAM_STRUCT* get() const;
 
 private:
-    ihipStream_t* stream_;
+    GKO_HIP_STREAM_STRUCT* stream_;
 
     int device_id_;
 };
diff --git a/include/ginkgo/core/base/timer.hpp b/include/ginkgo/core/base/timer.hpp
index 7b5d2aed5b3..8008cecfb94 100644
--- a/include/ginkgo/core/base/timer.hpp
+++ b/include/ginkgo/core/base/timer.hpp
@@ -54,7 +54,7 @@ class time_point {
     type type_;
     union data_union {
         CUevent_st* cuda_event;
-        ihipEvent_t* hip_event;
+        GKO_HIP_EVENT_STRUCT* hip_event;
         sycl::event* dpcpp_event;
         std::chrono::steady_clock::time_point chrono;
 
@@ -206,7 +206,7 @@ class HipTimer : public Timer {
 
 private:
     int device_id_;
-    ihipStream_t* stream_;
+    GKO_HIP_STREAM_STRUCT* stream_;
 };
 
 
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index 153ebbd1730..f27fe12a934 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -478,6 +478,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
                         multiple = 32;
                     }
                 }
+#if GINKGO_HIP_PLATFORM_HCC
                 if (!cuda_strategy_) {
                     multiple = 8;
                     if (nnz >= static_cast<int64_t>(1e7)) {
@@ -486,6 +487,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
                         multiple = 16;
                     }
                 }
+#endif  // GINKGO_HIP_PLATFORM_HCC
 
                 auto nwarps = nwarps_ * multiple;
                 return min(ceildiv(nnz, warp_size_), nwarps);
@@ -603,10 +605,12 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
                 nnz_limit = intel_nnz_limit;
                 row_len_limit = intel_row_len_limit;
             }
+#if GINKGO_HIP_PLATFORM_HCC
             if (!cuda_strategy_) {
                 nnz_limit = amd_nnz_limit;
                 row_len_limit = amd_row_len_limit;
             }
+#endif  // GINKGO_HIP_PLATFORM_HCC
             auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master();
             const bool is_mtx_on_host{host_mtx_exec ==
                                       mtx_row_ptrs.get_executor()};
diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp
index 3899f2fefb7..21c40a70c0a 100644
--- a/test/utils/executor.hpp
+++ b/test/utils/executor.hpp
@@ -63,7 +63,7 @@ inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
 
 inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
                           std::shared_ptr<gko::HipExecutor>& exec,
-                          ihipStream_t* stream = nullptr)
+                          GKO_HIP_STREAM_STRUCT* stream = nullptr)
 {
     if (gko::HipExecutor::get_num_devices() == 0) {
         throw std::runtime_error{"No suitable HIP devices"};

From d972798093921fb4a33684c5096c4073be4a2a28 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 8 Apr 2024 15:46:57 +0200
Subject: [PATCH 07/22] fix INSTALL.md documentation

Co-authored-by: Yuhsiang M. Tsai <yhmtsai@gmail.com>
---
 INSTALL.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/INSTALL.md b/INSTALL.md
index 54889f7e335..087b4000f82 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -51,9 +51,9 @@ Ginkgo adds the following additional switches to control what is being built:
 *   `-DGINKGO_BUILD_HIP={ON, OFF}` builds optimized HIP versions of the kernels
     (requires HIP), default is `ON` if an installation of HIP could be detected,
     `OFF` otherwise.
-*   `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. If system HWLOC
-    is not found, Ginkgo will try to build it. Default is `ON` on Linux. Ginkgo
-    does not support HWLOC on Windows/MacOS, so the default is `OFF` on Windows/MacOS.
+*   `-DGINKGO_HIP_AMDGPU="gpuarch1;gpuarch2"` the amdgpu_target(s) variable
+    passed to hipcc for the `hcc` HIP backend. The default is none (auto).
+*   `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. Default is `OFF`.
 *   `-DGINKGO_BUILD_DOC={ON, OFF}` creates an HTML version of Ginkgo's documentation
     from inline comments in the code. The default is `OFF`.
 *   `-DGINKGO_DOC_GENERATE_EXAMPLES={ON, OFF}` generates the documentation of examples

From 2c80e27d0bcda5a7fbb18884c320541f90d839c8 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 8 Apr 2024 16:28:36 +0200
Subject: [PATCH 08/22] improve usage of GINKGO_HIP_PLATFORM

---
 INSTALL.md                                       | 16 +++++++---------
 cmake/build_type_helpers.cmake                   |  2 +-
 cmake/create_test.cmake                          |  2 +-
 cmake/hip.cmake                                  | 12 +++++++++++-
 core/CMakeLists.txt                              |  2 +-
 third_party/identify_stream_usage/CMakeLists.txt |  2 +-
 6 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/INSTALL.md b/INSTALL.md
index 087b4000f82..9719bdfb920 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -51,8 +51,8 @@ Ginkgo adds the following additional switches to control what is being built:
 *   `-DGINKGO_BUILD_HIP={ON, OFF}` builds optimized HIP versions of the kernels
     (requires HIP), default is `ON` if an installation of HIP could be detected,
     `OFF` otherwise.
-*   `-DGINKGO_HIP_AMDGPU="gpuarch1;gpuarch2"` the amdgpu_target(s) variable
-    passed to hipcc for the `hcc` HIP backend. The default is none (auto).
+*   `-DCMAKE_HIP_ARCHITECTURES="gpuarch1;gpuarch2"` the AMDGPU targets to be passed to the compiler.
+    If empty, compiler chooses based on the available GPUs.
 *   `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. Default is `OFF`.
 *   `-DGINKGO_BUILD_DOC={ON, OFF}` creates an HTML version of Ginkgo's documentation
     from inline comments in the code. The default is `OFF`.
@@ -181,15 +181,13 @@ imposed by the `HIP` tool suite. The variables are the following:
 
 
 #### HIP platform detection of AMD and NVIDIA
-By default, Ginkgo uses the output of `/opt/rocm/hip/bin/hipconfig --platform`
-to select the backend. The accepted values are either `hcc` (`amd` with ROCM >=
-4.1) or `nvcc` (`nvidia` with ROCM >= 4.1). When on an AMD or NVIDIA system,
-this should output the correct platform by default. When on a system without
-GPUs, this should output `hcc` by default. To change this value, export the
-environment variable `HIP_PLATFORM` like so:
+Ginkgo relies on CMake to decide which compiler to use for HIP.
+To choose `nvcc` instead of the default ROCm `clang++`, set the corresponding
+environment variable:
 ```bash
-export HIP_PLATFORM=nvcc # or nvidia for ROCM >= 4.1
+export HIPCXX=nvcc
 ```
+Note that this option is currently not being tested in our CI pipelines.
 
 
 ### Third party libraries and packages
diff --git a/cmake/build_type_helpers.cmake b/cmake/build_type_helpers.cmake
index 09fc4a7dc9b..0c6b386bb21 100644
--- a/cmake/build_type_helpers.cmake
+++ b/cmake/build_type_helpers.cmake
@@ -93,7 +93,7 @@ foreach(_LANG IN LISTS ENABLED_LANGUAGES ITEMS "HIP")
             set(${PROJECT_NAME}_${_LANG}_${_TYPE}_SUPPORTED FALSE)
         endif()
         if(${PROJECT_NAME}_${_LANG}_${_TYPE}_SUPPORTED)
-            if(_LANG STREQUAL "HIP" AND GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
+            if(_LANG STREQUAL "HIP" AND GINKGO_HIP_PLATFORM_NVIDIA)
                 set(CMAKE_${_LANG}_FLAGS_${_TYPE}
                     ${${PROJECT_NAME}_NVCC_${_TYPE}_COMPILER_FLAGS}
                     CACHE STRING "Flags used by the ${_LANG} compiler during ${_TYPE} builds." FORCE
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index 8214afc50ba..4a349485579 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -104,7 +104,7 @@ function(ginkgo_add_test test_name test_target_name)
     if (GINKGO_TEST_NONDEFAULT_STREAM AND GINKGO_BUILD_CUDA)
         set(test_preload $<TARGET_FILE:identify_stream_usage_cuda>:${test_preload})
     endif()
-    if (GINKGO_TEST_NONDEFAULT_STREAM AND GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
+    if (GINKGO_TEST_NONDEFAULT_STREAM AND GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM_AMD)
         set(test_preload $<TARGET_FILE:identify_stream_usage_hip>:${test_preload})
     endif()
     if(test_preload)
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 9a4ed90b712..bcfa0b5a4db 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -1,4 +1,14 @@
 enable_language(HIP)
+if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
+    set(GINKGO_HIP_PLATFORM "nvidia")
+    set(GINKGO_HIP_PLATFORM_NVIDIA ON)
+    set(GINKGO_HIP_PLATFORM_AMD OFF)
+else()
+    set(GINKGO_HIP_PLATFORM "amd")
+    set(GINKGO_HIP_PLATFORM_NVIDIA OFF)
+    set(GINKGO_HIP_PLATFORM_AMD ON)
+endif()
+
 
 if(NOT DEFINED ROCM_PATH)
     if(DEFINED ENV{ROCM_PATH})
@@ -93,7 +103,7 @@ execute_process(
 ## Setup all CMAKE variables to find HIP and its dependencies
 set(GINKGO_HIP_MODULE_PATH "${HIP_PATH}/cmake")
 list(APPEND CMAKE_MODULE_PATH "${GINKGO_HIP_MODULE_PATH}")
-if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
+if (GINKGO_HIP_PLATFORM_AND)
     list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake")
 endif()
 list(APPEND CMAKE_PREFIX_PATH
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 449b8da4584..f44e49338fd 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -142,7 +142,7 @@ endif()
 
 # Since we have a public dependency on HIP, this dependency appears
 # here as well
-if(GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
+if(GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM_AMD)
     list(APPEND GKO_RPATH_ADDITIONS "${HIP_PATH}/lib")
 endif()
 
diff --git a/third_party/identify_stream_usage/CMakeLists.txt b/third_party/identify_stream_usage/CMakeLists.txt
index 42036baede1..cb316205953 100644
--- a/third_party/identify_stream_usage/CMakeLists.txt
+++ b/third_party/identify_stream_usage/CMakeLists.txt
@@ -23,7 +23,7 @@ if(GINKGO_BUILD_CUDA)
                          ENVIRONMENT LD_PRELOAD=$<TARGET_FILE:identify_stream_usage_cuda>)
 endif()
 
-if(GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
+if(GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM_AMD)
     find_package(hip REQUIRED)
     set_source_files_properties(identify_stream_usage.hip.cpp
                                 test_default_stream_identification.hip.cpp

From 60e363ae9f34df71b9c0bc0d447f87a3e883fdef Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 8 Apr 2024 16:30:14 +0200
Subject: [PATCH 09/22] disable simultaneous cuda/hip-cuda compilation in CI

---
 .gitlab-ci.yml | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index fa8135e0763..f1238d4b7fd 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -106,7 +106,6 @@ build/cuda101/nompi/clang/cuda_wo_omp/release/shared:
   variables:
     CXX_COMPILER: "clang++"
     BUILD_CUDA: "ON"
-    BUILD_HIP: "ON"
     BUILD_HWLOC: "OFF"
     BUILD_TYPE: "Release"
     CUDA_ARCH: 35
@@ -139,7 +138,6 @@ build/cuda101/nompi/clang/all/release/static:
     CXX_COMPILER: "clang++"
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_HIP: "ON"
     BUILD_TYPE: "Release"
     BUILD_SHARED_LIBS: "OFF"
     CUDA_ARCH: 35
@@ -190,7 +188,6 @@ build/cuda102/nompi/gcc/all/debug/shared:
   variables:
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_HIP: "ON"
     BUILD_TYPE: "Debug"
     FAST_TESTS: "ON"
     BUILD_HWLOC: "OFF"
@@ -207,7 +204,6 @@ build/cuda102/nompi/clang/all/release/static:
     CXX_COMPILER: "clang++"
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_HIP: "ON"
     BUILD_TYPE: "Release"
     BUILD_SHARED_LIBS: "OFF"
     CUDA_ARCH: 35
@@ -686,7 +682,6 @@ warnings:
   variables:
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_HIP: "ON"
     CXX_FLAGS: "-Werror=pedantic;-pedantic-errors"
   allow_failure: yes
 
@@ -701,7 +696,6 @@ no-circular-deps:
   variables:
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_HIP: "ON"
     EXTRA_CMAKE_FLAGS: '-DGINKGO_CHECK_CIRCULAR_DEPS=on'
   allow_failure: no
 
@@ -729,7 +723,6 @@ clang-tidy:
   variables:
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
-    BUILD_HIP: "ON"
     EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_CLANG_TIDY=ON'
   allow_failure: yes
 

From d1fb989c8fbf3849ba9ab64b2e1a5b51cc8559d2 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 8 Apr 2024 16:34:45 +0200
Subject: [PATCH 10/22] add CMake minimum requirement for HIP

---
 README.md       | 1 +
 cmake/hip.cmake | 1 +
 2 files changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 49d5f23f300..f34582539a1 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,7 @@ The Ginkgo HIP module has the following __additional__ requirements:
     * _AMD_ backend (using the `clang` compiler)
     * _10.1 <= CUDA < 11_ backend
 * if the hipFFT package is available, it is used to implement the FFT LinOps.
+* _cmake 3.21+_
 
 The Ginkgo DPC++(SYCL) module has the following __additional__ requirements:
 
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index bcfa0b5a4db..a42407a0673 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -1,3 +1,4 @@
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 enable_language(HIP)
 if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
     set(GINKGO_HIP_PLATFORM "nvidia")

From a8c859e428e128a13c0b47d979fe3b0a0fae9b5c Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 8 Apr 2024 19:58:52 +0200
Subject: [PATCH 11/22] fix HIP definitions

---
 cmake/hip.cmake      | 2 ++
 cmake/hip_path.cmake | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index a42407a0673..83abf468e26 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -4,10 +4,12 @@ if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
     set(GINKGO_HIP_PLATFORM "nvidia")
     set(GINKGO_HIP_PLATFORM_NVIDIA ON)
     set(GINKGO_HIP_PLATFORM_AMD OFF)
+    set(GINKGO_HIP_PLATFORM_NVCC 1)
 else()
     set(GINKGO_HIP_PLATFORM "amd")
     set(GINKGO_HIP_PLATFORM_NVIDIA OFF)
     set(GINKGO_HIP_PLATFORM_AMD ON)
+    set(GINKGO_HIP_PLATFORM_HCC 1)
 endif()
 
 
diff --git a/cmake/hip_path.cmake b/cmake/hip_path.cmake
index 269f8403fd3..58fcd3db447 100644
--- a/cmake/hip_path.cmake
+++ b/cmake/hip_path.cmake
@@ -11,3 +11,8 @@ find_program(GINKGO_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin")
 if(GINKGO_HIPCONFIG_PATH)
     message(STATUS "Found hipconfig: ${GINKGO_HIPCONFIG_PATH}")
 endif()
+
+# We keep using NVCC/HCC for consistency with previous releases even if AMD
+# updated everything to use NVIDIA/AMD in ROCM 4.1
+set(GINKGO_HIP_PLATFORM_NVCC 0)
+set(GINKGO_HIP_PLATFORM_HCC 0)

From 4bfb545c9322442f224cd812db3734517b0f07fb Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 8 Apr 2024 20:48:09 +0200
Subject: [PATCH 12/22] disable HIP in last job

---
 .gitlab-ci.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f1238d4b7fd..3521d5f1b2e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -123,7 +123,6 @@ build/cuda101/openmpi/gcc/all/debug/shared:
     BUILD_CUDA: "ON"
     BUILD_MPI: "ON"
     MPI_AS_ROOT: "ON"
-    BUILD_HIP: "ON"
     BUILD_TYPE: "Debug"
     BUILD_PAPI_SDE: "ON"
     CUDA_ARCH: 35

From 270cf5300a01acaf158e1add30828735732e77b6 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 9 Apr 2024 15:13:48 +0200
Subject: [PATCH 13/22] review updates

- fix incorrect CMake version requirement
- remove explicit RPATH additions
- clean up configure log

Co-authored-by: Yuhsiang M. Tsai <yhmtsai@gmail.com>
Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 cmake/GinkgoConfig.cmake.in |  3 +++
 cmake/hip.cmake             |  2 +-
 core/CMakeLists.txt         | 12 +-----------
 hip/CMakeLists.txt          |  7 +------
 hip/get_info.cmake          |  6 ++----
 5 files changed, 8 insertions(+), 22 deletions(-)

diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in
index 4b4e8d77338..23b1d25adc1 100644
--- a/cmake/GinkgoConfig.cmake.in
+++ b/cmake/GinkgoConfig.cmake.in
@@ -211,6 +211,9 @@ if(GINKGO_BUILD_CUDA)
     _ginkgo_check_compiler(CUDA)
     _ginkgo_check_compiler(CUDA_HOST)
 endif()
+if(GINKGO_BUILD_HIP)
+    _ginkgo_check_compiler(HIP)
+endif()
 
 include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake)
 
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 83abf468e26..327375bfe76 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
 enable_language(HIP)
 if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
     set(GINKGO_HIP_PLATFORM "nvidia")
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index f44e49338fd..1b5f9237612 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -116,10 +116,6 @@ ginkgo_compile_features(ginkgo)
 add_library(Ginkgo::ginkgo ALIAS ginkgo)
 target_link_libraries(ginkgo
     PUBLIC ginkgo_device ginkgo_omp ginkgo_cuda ginkgo_reference ginkgo_hip ginkgo_dpcpp)
-
-# The PAPI dependency needs to be exposed to the user.
-set(GKO_RPATH_ADDITIONS "")
-
 if(GINKGO_HAVE_PAPI_SDE)
     target_link_libraries(ginkgo PUBLIC PAPI::PAPI_SDE)
 endif()
@@ -140,14 +136,8 @@ if(GINKGO_BUILD_MPI)
     target_link_libraries(ginkgo PUBLIC MPI::MPI_CXX)
 endif()
 
-# Since we have a public dependency on HIP, this dependency appears
-# here as well
-if(GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM_AMD)
-    list(APPEND GKO_RPATH_ADDITIONS "${HIP_PATH}/lib")
-endif()
-
 ginkgo_default_includes(ginkgo)
-ginkgo_install_library(ginkgo "${GKO_RPATH_ADDITIONS}")
+ginkgo_install_library(ginkgo)
 
 if(GINKGO_CHECK_CIRCULAR_DEPS)
     ginkgo_check_headers(ginkgo "")
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 8bcf2fb9e7d..738d9d09862 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -129,15 +129,10 @@ endif()
 
 target_compile_options(ginkgo_hip PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${GINKGO_COMPILER_FLAGS}>)
 
-# Try to find everything in /opt/rocm/lib first.
-set(GKO_HIP_RPATH "${ROCM_PATH}/lib" )
-list(APPEND GKO_HIP_RPATH "${HIP_PATH}/lib")
-list(APPEND GKO_HIP_RPATH "${HIPBLAS_PATH}/lib" "${HIPRAND_PATH}/lib"
-    "${HIPSPARSE_PATH}/lib" "${ROCRAND_PATH}/lib")
 
 ginkgo_compile_features(ginkgo_hip)
 ginkgo_default_includes(ginkgo_hip)
-ginkgo_install_library(ginkgo_hip "${GKO_HIP_RPATH}")
+ginkgo_install_library(ginkgo_hip)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
     ginkgo_check_headers(ginkgo_hip GKO_COMPILING_HIP)
diff --git a/hip/get_info.cmake b/hip/get_info.cmake
index 234f65642cc..dc3068e4f81 100644
--- a/hip/get_info.cmake
+++ b/hip/get_info.cmake
@@ -1,12 +1,10 @@
 ginkgo_print_module_header(${detailed_log} "HIP")
-ginkgo_print_foreach_variable(${detailed_log}
-    "GINKGO_HIPCONFIG_PATH")
 ginkgo_print_module_footer(${detailed_log} "HIP variables:")
+ginkgo_print_flags(${detailed_log} "CMAKE_HIP_FLAGS")
+ginkgo_print_flags(${detailed_log} "CMAKE_HIP_COMPILER")
 ginkgo_print_foreach_variable(${detailed_log}
     "HIP_VERSION;HIP_PATH;ROCM_PATH"
     "HIP_PLATFORM;HIP_ROOT_DIR;HIP_RUNTIME;HIPBLAS_PATH;HIPSPARSE_PATH"
     "HIPRAND_PATH;ROCRAND_PATH;HIP_CLANG_INCLUDE_PATH"
     "HIP_HIPCONFIG_EXECUTABLE")
-ginkgo_print_flags(${detailed_log} "CMAKE_HIP_FLAGS")
-ginkgo_print_flags(${detailed_log} "CMAKE_HIP_COMPILER")
 ginkgo_print_module_footer(${detailed_log} "")

From 382e1f0633a07ff976ac2a470368a66cf5b1d9e1 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 9 Apr 2024 15:28:28 +0200
Subject: [PATCH 14/22] set CUDA and HIP standard explicitly

---
 cmake/build_helpers.cmake        | 4 ++++
 cmake/create_test.cmake          | 4 ++++
 test/test_install/CMakeLists.txt | 2 ++
 3 files changed, 10 insertions(+)

diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake
index 65c987b3490..a1a1735f84e 100644
--- a/cmake/build_helpers.cmake
+++ b/cmake/build_helpers.cmake
@@ -19,6 +19,10 @@ endfunction()
 
 function(ginkgo_compile_features name)
     target_compile_features("${name}" PUBLIC cxx_std_14)
+    # we set these properties regardless of the enabled backends,
+    # because unknown properties are ignored
+    set_target_properties("${name}" PROPERTIES HIP_STANDARD 14)
+    set_target_properties("${name}" PROPERTIES CUDA_STANDARD 14)
     if(GINKGO_WITH_CLANG_TIDY AND GINKGO_CLANG_TIDY_PATH)
         set_property(TARGET "${name}" PROPERTY CXX_CLANG_TIDY "${GINKGO_CLANG_TIDY_PATH};-checks=*")
     endif()
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index 4a349485579..ecb75d5da39 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -33,6 +33,10 @@ function(ginkgo_set_test_target_properties test_target_name test_library_suffix)
         target_link_libraries(${test_target_name} PRIVATE ginkgo_gtest_main${test_library_suffix})
     endif()
     target_compile_features(${test_target_name} PUBLIC cxx_std_14)
+    # we set these properties regardless of the enabled backends,
+    # because unknown properties are ignored
+    set_target_properties(${test_target_name} PROPERTIES HIP_STANDARD 14)
+    set_target_properties(${test_target_name} PROPERTIES CUDA_STANDARD 14)
     target_include_directories(${test_target_name} PRIVATE ${Ginkgo_BINARY_DIR} ${set_properties_ADDITIONAL_INCLUDES})
     target_link_libraries(${test_target_name} PRIVATE ginkgo GTest::GTest ${set_properties_ADDITIONAL_LIBRARIES})
 endfunction()
diff --git a/test/test_install/CMakeLists.txt b/test/test_install/CMakeLists.txt
index fef09b19021..ee19b8d030e 100644
--- a/test/test_install/CMakeLists.txt
+++ b/test/test_install/CMakeLists.txt
@@ -38,6 +38,7 @@ if(GINKGO_BUILD_CUDA)
     enable_language(CUDA)
     configure_file(test_install.cpp test_install.cu COPYONLY)
     add_executable(test_install_cuda ${CMAKE_CURRENT_BINARY_DIR}/test_install.cu)
+    set_target_properties(test_install_cuda PROPERTIES CUDA_STANDARD 14)
     target_compile_definitions(test_install_cuda PRIVATE HAS_CUDA=1)
     target_compile_definitions(test_install_cuda PRIVATE HAS_REFERENCE=${HAS_REFERENCE})
     target_link_libraries(test_install_cuda PRIVATE Ginkgo::ginkgo)
@@ -48,6 +49,7 @@ if(GINKGO_BUILD_HIP)
     configure_file(test_install.cpp test_install.hip.cpp COPYONLY)
     set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/test_install.hip.cpp PROPERTIES LANGUAGE HIP)
     add_executable(test_install_hip ${CMAKE_CURRENT_BINARY_DIR}/test_install.hip.cpp)
+    set_target_properties(test_install_hip PROPERTIES HIP_STANDARD 14)
 
     target_link_libraries(test_install_hip PRIVATE Ginkgo::ginkgo)
     target_compile_definitions(test_install_hip PRIVATE HAS_HIP=1)

From 5a0a4760d8bb43814c522c2fcbd7af8e3c6b082e Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Tue, 9 Apr 2024 18:33:41 +0200
Subject: [PATCH 15/22] remove hipconfig detection

---
 cmake/hip_path.cmake | 5 -----
 hip/get_info.cmake   | 6 +-----
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/cmake/hip_path.cmake b/cmake/hip_path.cmake
index 58fcd3db447..a9f418cb3bd 100644
--- a/cmake/hip_path.cmake
+++ b/cmake/hip_path.cmake
@@ -7,11 +7,6 @@ if(NOT DEFINED HIP_PATH)
     endif()
 endif()
 
-find_program(GINKGO_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin")
-if(GINKGO_HIPCONFIG_PATH)
-    message(STATUS "Found hipconfig: ${GINKGO_HIPCONFIG_PATH}")
-endif()
-
 # We keep using NVCC/HCC for consistency with previous releases even if AMD
 # updated everything to use NVIDIA/AMD in ROCM 4.1
 set(GINKGO_HIP_PLATFORM_NVCC 0)
diff --git a/hip/get_info.cmake b/hip/get_info.cmake
index dc3068e4f81..14a770234fa 100644
--- a/hip/get_info.cmake
+++ b/hip/get_info.cmake
@@ -2,9 +2,5 @@ ginkgo_print_module_header(${detailed_log} "HIP")
 ginkgo_print_module_footer(${detailed_log} "HIP variables:")
 ginkgo_print_flags(${detailed_log} "CMAKE_HIP_FLAGS")
 ginkgo_print_flags(${detailed_log} "CMAKE_HIP_COMPILER")
-ginkgo_print_foreach_variable(${detailed_log}
-    "HIP_VERSION;HIP_PATH;ROCM_PATH"
-    "HIP_PLATFORM;HIP_ROOT_DIR;HIP_RUNTIME;HIPBLAS_PATH;HIPSPARSE_PATH"
-    "HIPRAND_PATH;ROCRAND_PATH;HIP_CLANG_INCLUDE_PATH"
-    "HIP_HIPCONFIG_EXECUTABLE")
+ginkgo_print_variable(${detailed_log} "CMAKE_HIP_COMPILER_VERSION")
 ginkgo_print_module_footer(${detailed_log} "")

From 6422245670b1658284054bc04831cf3876768363 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 10 Apr 2024 13:54:41 +0200
Subject: [PATCH 16/22] fix HIP static builds

---
 hip/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 738d9d09862..f82df149b0e 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -119,7 +119,7 @@ target_include_directories(ginkgo_hip
 target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP)
 
 target_link_libraries(ginkgo_hip PUBLIC ginkgo_device)
-target_link_libraries(ginkgo_hip PRIVATE roc::hipblas roc::hipsparse hip::hiprand roc::rocrand)
+target_link_libraries(ginkgo_hip PRIVATE hip::host roc::hipblas roc::hipsparse hip::hiprand roc::rocrand)
 if (hipfft_FOUND)
     target_link_libraries(ginkgo_hip PRIVATE hip::hipfft)
 endif()

From 55417bea1a3ef0ab0b058af1d1e316b123b79604 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 10 Apr 2024 19:17:41 +0200
Subject: [PATCH 17/22] move to ROCm 5.1.4

---
 .gitlab-ci.yml    | 14 +++++++-------
 .gitlab/image.yml |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3521d5f1b2e..c7985bdf11e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -420,25 +420,25 @@ build/amd/nompi/clang/rocm45/debug/shared:
     BUILD_HIP: "ON"
     BUILD_TYPE: "Debug"
 
-# ROCm 5.0.2 and friends
-build/amd/nompi/gcc/rocm502/debug/static:
+# ROCm 5.1.4 and friends
+build/amd/nompi/gcc/rocm514/debug/static:
   extends:
     - .build_and_test_template
     - .default_variables
     - .full_test_condition
-    - .use_gko-rocm502-nompi-gnu11-llvm11
+    - .use_gko-rocm514-nompi-gnu11-llvm11
   variables:
     BUILD_OMP: "ON"
     BUILD_HIP: "ON"
     BUILD_TYPE: "Debug"
     BUILD_SHARED_LIBS: "OFF"
 
-build/amd/nompi/clang/rocm502/release/shared:
+build/amd/nompi/clang/rocm514/release/shared:
   extends:
     - .build_and_test_template
     - .default_variables
     - .quick_test_condition
-    - .use_gko-rocm502-nompi-gnu11-llvm11
+    - .use_gko-rocm514-nompi-gnu11-llvm11
   variables:
     CXX_COMPILER: "clang++"
     BUILD_OMP: "ON"
@@ -446,12 +446,12 @@ build/amd/nompi/clang/rocm502/release/shared:
     BUILD_TYPE: "Release"
 
 # without omp
-build/amd/nompi/gcc/rocm502_wo_omp/release/shared:
+build/amd/nompi/gcc/rocm514_wo_omp/release/shared:
   extends:
     - .build_and_test_template
     - .default_variables
     - .full_test_condition
-    - .use_gko-rocm502-nompi-gnu11-llvm11
+    - .use_gko-rocm514-nompi-gnu11-llvm11
   variables:
     BUILD_OMP: "OFF"
     BUILD_MPI: "OFF"
diff --git a/.gitlab/image.yml b/.gitlab/image.yml
index eb1ab5128af..2be565b6b10 100644
--- a/.gitlab/image.yml
+++ b/.gitlab/image.yml
@@ -74,8 +74,8 @@
     - private_ci
     - amd-gpu
 
-.use_gko-rocm502-nompi-gnu11-llvm11:
-  image: ginkgohub/rocm:502-openmpi-gnu11-llvm11
+.use_gko-rocm514-nompi-gnu11-llvm11:
+  image: ginkgohub/rocm:514-openmpi-gnu11-llvm11
   tags:
     - private_ci
     - amd-gpu

From 18e89462cc0846790c3a5ea062e4c2810cb4b693 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 10 Apr 2024 20:23:56 +0200
Subject: [PATCH 18/22] disable buggy hipFFT tests

---
 test/matrix/fft_kernels.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp
index d5ba741a064..ed186b1df60 100644
--- a/test/matrix/fft_kernels.cpp
+++ b/test/matrix/fft_kernels.cpp
@@ -149,6 +149,9 @@ TYPED_TEST(Fft, ApplyStrided1DInverseIsEqualToReference)
 
 TYPED_TEST(Fft, Apply2DIsEqualToReference)
 {
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+    GTEST_SKIP() << "rocFFT 5.1 has a bug related to 2D FFT";
+#endif
     using T = typename TestFixture::value_type;
 
     this->fft2->apply(this->data, this->out);
@@ -160,6 +163,9 @@ TYPED_TEST(Fft, Apply2DIsEqualToReference)
 
 TYPED_TEST(Fft, ApplyStrided2DIsEqualToReference)
 {
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+    GTEST_SKIP() << "rocFFT 5.1 has a bug related to 2D FFT";
+#endif
     using T = typename TestFixture::value_type;
 
     this->fft2->apply(this->data_strided, this->out_strided);
@@ -171,6 +177,9 @@ TYPED_TEST(Fft, ApplyStrided2DIsEqualToReference)
 
 TYPED_TEST(Fft, Apply2DInverseIsEqualToReference)
 {
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+    GTEST_SKIP() << "rocFFT 5.1 has a bug related to 2D FFT";
+#endif
     using T = typename TestFixture::value_type;
 
     this->ifft2->apply(this->data, this->out);
@@ -182,6 +191,9 @@ TYPED_TEST(Fft, Apply2DInverseIsEqualToReference)
 
 TYPED_TEST(Fft, ApplyStrided2DInverseIsEqualToReference)
 {
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+    GTEST_SKIP() << "rocFFT 5.1 has a bug related to 2D FFT";
+#endif
     using T = typename TestFixture::value_type;
 
     this->ifft2->apply(this->data_strided, this->out_strided);

From d5153f42f8c7ae14bdf5062593af48933e71a95e Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Wed, 10 Apr 2024 22:44:02 +0200
Subject: [PATCH 19/22] fix compile def propagation for pkg-config

---
 cmake/information_helpers.cmake | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmake/information_helpers.cmake b/cmake/information_helpers.cmake
index 04687dfae5b..0e667a90f02 100644
--- a/cmake/information_helpers.cmake
+++ b/cmake/information_helpers.cmake
@@ -76,7 +76,9 @@ macro(ginkgo_interface_libraries_recursively INTERFACE_LIBS)
                 # Populate the compiler options and definitions if needed
                 get_target_property(GINKGO_LIBS_INTERFACE_DEFS "${_lib}" INTERFACE_COMPILE_DEFINITIONS)
                 if (GINKGO_LIBS_INTERFACE_DEFS)
-                    list(APPEND GINKGO_INTERFACE_CFLAGS_FOUND "${GINKGO_LIBS_INTERFACE_DEFS}")
+                    foreach(def IN LISTS GINKGO_LIBS_INTERFACE_DEFS)
+                        list(APPEND GINKGO_INTERFACE_CFLAGS_FOUND "-D${def}")
+                    endforeach()
                 endif()
                 unset(GINKGO_LIBS_INTERFACE_DEFS)
 

From 52563818bca166707ba4eb5df73b792dd54a21e0 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 12 Apr 2024 12:26:49 +0200
Subject: [PATCH 20/22] create pkg-config file for shared libs only

The static builds involve changing the linker for HIP,
which can't really be represented in pkg-config
---
 cmake/install_helpers.cmake | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/cmake/install_helpers.cmake b/cmake/install_helpers.cmake
index 1e48c991e8a..66efde51dde 100644
--- a/cmake/install_helpers.cmake
+++ b/cmake/install_helpers.cmake
@@ -59,19 +59,21 @@ function(ginkgo_install_library name)
 endfunction()
 
 function(ginkgo_install)
-    # generate pkg-config file, a three-step process is necessary to include the correct install prefix
-    # Step 1: substitute project variables in the generation script
-    configure_file("${Ginkgo_SOURCE_DIR}/cmake/generate_pkg.cmake.in"
-                   "${Ginkgo_BINARY_DIR}/cmake/generate_pkg.cmake"
-                   @ONLY)
-    # Step 2: substitute generator expressions
-    file(GENERATE OUTPUT ${Ginkgo_BINARY_DIR}/cmake/generate_pkg_$<CONFIG>.cmake
-         INPUT ${Ginkgo_BINARY_DIR}/cmake/generate_pkg.cmake)
-    # Step 3: at install time, call the generation script which has all variables
-    #         except the install prefix already replaced. Use the install prefix
-    #         that is specified at install time
-    install(SCRIPT "${Ginkgo_BINARY_DIR}/cmake/generate_pkg_$<CONFIG>.cmake"
-        COMPONENT Ginkgo_Development)
+    if (BUILD_SHARED_LIBS)
+        # generate pkg-config file, a three-step process is necessary to include the correct install prefix
+        # Step 1: substitute project variables in the generation script
+        configure_file("${Ginkgo_SOURCE_DIR}/cmake/generate_pkg.cmake.in"
+                    "${Ginkgo_BINARY_DIR}/cmake/generate_pkg.cmake"
+                    @ONLY)
+        # Step 2: substitute generator expressions
+        file(GENERATE OUTPUT ${Ginkgo_BINARY_DIR}/cmake/generate_pkg_$<CONFIG>.cmake
+            INPUT ${Ginkgo_BINARY_DIR}/cmake/generate_pkg.cmake)
+        # Step 3: at install time, call the generation script which has all variables
+        #         except the install prefix already replaced. Use the install prefix
+        #         that is specified at install time
+        install(SCRIPT "${Ginkgo_BINARY_DIR}/cmake/generate_pkg_$<CONFIG>.cmake"
+            COMPONENT Ginkgo_Development)
+    endif()
 
     # install the public header files
     install(DIRECTORY "${Ginkgo_SOURCE_DIR}/include/"

From 2494442269df80eed84e6640e43159cdcd3ddeb7 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 12 Apr 2024 13:27:20 +0200
Subject: [PATCH 21/22] make test_pkgconfig optional

---
 .gitlab/scripts.yml |  8 ++++++--
 CMakeLists.txt      | 34 ++++++++++++++++++----------------
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml
index b58d21de261..4205de667ca 100644
--- a/.gitlab/scripts.yml
+++ b/.gitlab/scripts.yml
@@ -106,7 +106,9 @@
     - popd
     - if [ -n "${SYCL_DEVICE_TYPE}" ]; then unset SYCL_DEVICE_TYPE; fi
     - if [ -n "${SYCL_DEVICE_FILTER}" ]; then unset SYCL_DEVICE_FILTER; fi
-    - PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig:$PKG_CONFIG_PATH LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH ninja test_pkgconfig
+    - if [[ "${BUILD_SHARED_LIBS}" == "ON" ]]; then
+      PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig:$PKG_CONFIG_PATH LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH ninja test_pkgconfig;
+      fi
   dependencies: []
 
 
@@ -136,7 +138,9 @@
     - pushd test/test_install
     - ninja install
     - popd
-    - PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig:$PKG_CONFIG_PATH LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH ninja test_pkgconfig
+    - if [[ "${BUILD_SHARED_LIBS}" == "ON" ]]; then
+      PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig:$PKG_CONFIG_PATH LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH ninja test_pkgconfig;
+      fi
   cache: []
 
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0549c7b77d1..e2173be3b70 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -467,22 +467,24 @@ add_custom_target(test_exportbuild
     COMMAND ${GINKGO_TEST_EXPORTBUILD_CMD}
     COMMENT "Running a test on Ginkgo's exported build directory.")
 
-add_custom_target(test_pkgconfig
-    COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} ${TOOLSET}
-    -S${GINKGO_TEST_PKGCONFIG_SRC_DIR}
-    -B${GINKGO_TEST_PKGCONFIG_BIN_DIR}
-    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}
-    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    # `--config cfg` is ignored by single-configuration generator.
-    # `$<CONFIG>` is always be the same as `CMAKE_BUILD_TYPE` in
-    # single-configuration generator.
-    COMMAND ${CMAKE_COMMAND}
-    --build ${GINKGO_TEST_PKGCONFIG_BIN_DIR}
-    --config $<CONFIG>
-    COMMAND ${GINKGO_TEST_PKGCONFIG_CMD}
-    COMMENT "Running a test on Ginkgo's PkgConfig"
-    "This requires installing Ginkgo first")
+if (BUILD_SHARED_LIBS)
+    add_custom_target(test_pkgconfig
+        COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} ${TOOLSET}
+        -S${GINKGO_TEST_PKGCONFIG_SRC_DIR}
+        -B${GINKGO_TEST_PKGCONFIG_BIN_DIR}
+        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+        -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}
+        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+        # `--config cfg` is ignored by single-configuration generator.
+        # `$<CONFIG>` is always be the same as `CMAKE_BUILD_TYPE` in
+        # single-configuration generator.
+        COMMAND ${CMAKE_COMMAND}
+        --build ${GINKGO_TEST_PKGCONFIG_BIN_DIR}
+        --config $<CONFIG>
+        COMMAND ${GINKGO_TEST_PKGCONFIG_CMD}
+        COMMENT "Running a test on Ginkgo's PkgConfig"
+        "This requires installing Ginkgo first")
+endif()
 
 
 # Setup CPack

From a8a407f35887c480af66500c20dc53a41d2f7604 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 12 Apr 2024 16:45:47 +0200
Subject: [PATCH 22/22] disable pkg-config only for HIP

---
 .gitlab/scripts.yml         | 4 ++--
 CMakeLists.txt              | 4 +++-
 cmake/install_helpers.cmake | 4 +++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml
index 4205de667ca..dda127ff535 100644
--- a/.gitlab/scripts.yml
+++ b/.gitlab/scripts.yml
@@ -106,7 +106,7 @@
     - popd
     - if [ -n "${SYCL_DEVICE_TYPE}" ]; then unset SYCL_DEVICE_TYPE; fi
     - if [ -n "${SYCL_DEVICE_FILTER}" ]; then unset SYCL_DEVICE_FILTER; fi
-    - if [[ "${BUILD_SHARED_LIBS}" == "ON" ]]; then
+    - if [[ "${BUILD_SHARED_LIBS}" == "ON" || "${BUILD_HIP}" != "ON" ]]; then
       PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig:$PKG_CONFIG_PATH LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH ninja test_pkgconfig;
       fi
   dependencies: []
@@ -138,7 +138,7 @@
     - pushd test/test_install
     - ninja install
     - popd
-    - if [[ "${BUILD_SHARED_LIBS}" == "ON" ]]; then
+    - if [[ "${BUILD_SHARED_LIBS}" == "ON" || "${BUILD_HIP}" != "ON" ]]; then
       PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig:$PKG_CONFIG_PATH LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH ninja test_pkgconfig;
       fi
   cache: []
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e2173be3b70..3e17446854e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -467,7 +467,9 @@ add_custom_target(test_exportbuild
     COMMAND ${GINKGO_TEST_EXPORTBUILD_CMD}
     COMMENT "Running a test on Ginkgo's exported build directory.")
 
-if (BUILD_SHARED_LIBS)
+# static linking with pkg-config is not possible with HIP, since
+# some linker information cannot be expressed in pkg-config files
+if (BUILD_SHARED_LIBS OR NOT GINKGO_BUILD_HIP)
     add_custom_target(test_pkgconfig
         COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} ${TOOLSET}
         -S${GINKGO_TEST_PKGCONFIG_SRC_DIR}
diff --git a/cmake/install_helpers.cmake b/cmake/install_helpers.cmake
index 66efde51dde..898988142c9 100644
--- a/cmake/install_helpers.cmake
+++ b/cmake/install_helpers.cmake
@@ -59,7 +59,9 @@ function(ginkgo_install_library name)
 endfunction()
 
 function(ginkgo_install)
-    if (BUILD_SHARED_LIBS)
+    # static linking with pkg-config is not possible with HIP, since
+    # some linker information cannot be expressed in pkg-config files
+    if (BUILD_SHARED_LIBS OR NOT GINKGO_BUILD_HIP)
         # generate pkg-config file, a three-step process is necessary to include the correct install prefix
         # Step 1: substitute project variables in the generation script
         configure_file("${Ginkgo_SOURCE_DIR}/cmake/generate_pkg.cmake.in"