From 6cd265f3ae885f1a985874bcbaefd8b982244730 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 7 May 2023 10:18:08 +0200 Subject: [PATCH 01/22] remove HIP CUDA support from source --- common/cuda_hip/components/atomic.hpp.inc | 17 ++------- core/device_hooks/hip_hooks.cpp | 16 ++++----- core/log/profiler_hook.cpp | 2 -- core/log/profiler_hook.hpp | 4 --- core/test/base/executor.cpp | 7 ---- cuda/base/executor.cpp | 10 ------ devices/cuda/executor.cpp | 4 --- devices/hip/executor.cpp | 4 --- hip/base/config.hip.hpp | 8 ----- hip/base/device.hip.cpp | 2 +- hip/base/device.hpp | 2 +- hip/base/executor.hip.cpp | 17 --------- hip/base/stream.hip.cpp | 2 +- hip/base/thrust.hip.hpp | 10 +----- hip/components/cooperative_groups.hip.hpp | 44 ++--------------------- hip/components/format_conversion.hip.hpp | 13 ------- hip/preconditioner/jacobi_kernels.hip.cpp | 6 +--- hip/test/base/hip_executor.hip.cpp | 12 ------- include/ginkgo/config.hpp.in | 4 +-- include/ginkgo/core/base/executor.hpp | 35 +++--------------- include/ginkgo/core/base/fwd_decls.hpp | 7 ---- include/ginkgo/core/base/math.hpp | 4 +-- include/ginkgo/core/base/memory.hpp | 16 ++++----- include/ginkgo/core/base/stream.hpp | 4 +-- include/ginkgo/core/base/timer.hpp | 4 +-- include/ginkgo/core/matrix/csr.hpp | 4 --- test/utils/executor.hpp | 2 +- 27 files changed, 38 insertions(+), 222 deletions(-) diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc index 3d76cfdcb79..00fed2db4a6 100644 --- a/common/cuda_hip/components/atomic.hpp.inc +++ b/common/cuda_hip/components/atomic.hpp.inc @@ -119,7 +119,7 @@ GKO_BIND_ATOMIC_ADD(unsigned long long int); GKO_BIND_ATOMIC_ADD(float); // AMD -#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC +#if defined(__HIPCC__) // the double atomicAdd is added after 4.3 @@ -157,7 +157,7 @@ GKO_BIND_ATOMIC_ADD(__half2); // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) -#endif // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC +#endif // defined(__HIPCC__) #undef GKO_BIND_ATOMIC_ADD @@ -179,20 +179,7 @@ __forceinline__ __device__ T atomic_max(T* __restrict__ addr, T val) GKO_BIND_ATOMIC_MAX(int); GKO_BIND_ATOMIC_MAX(unsigned int); - -#if !defined(__HIPCC__) || \ - (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC) - - -#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__) -// Only Compute Capability 3.5 and higher supports 64-bit atomicMax -GKO_BIND_ATOMIC_MAX(unsigned long long int); -#endif - -#else // Is HIP platform & on AMD hardware GKO_BIND_ATOMIC_MAX(unsigned long long int); -#endif // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) && - // GINKGO_HIP_PLATFORM_HCC) #undef GKO_BIND_ATOMIC_MAX diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index a90691e1af4..cb85019e542 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -33,7 +33,7 @@ void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); -HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream) +HipAsyncAllocator::HipAsyncAllocator(ihipStream_t* stream) GKO_NOT_COMPILED(hip); @@ -44,7 +44,7 @@ void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); bool HipAsyncAllocator::check_environment(int device_id, - GKO_HIP_STREAM_STRUCT* stream) const + ihipStream_t* stream) const GKO_NOT_COMPILED(hip); @@ -62,7 +62,7 @@ void HipUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); bool HipUnifiedAllocator::check_environment(int device_id, - GKO_HIP_STREAM_STRUCT* stream) const + ihipStream_t* stream) const GKO_NOT_COMPILED(hip); @@ -76,13 +76,13 @@ void HipHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); bool HipHostAllocator::check_environment(int device_id, - GKO_HIP_STREAM_STRUCT* stream) const + ihipStream_t* stream) const GKO_NOT_COMPILED(hip); std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, bool device_reset, - allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream) + allocation_mode alloc_mode, ihipStream_t* stream) { return std::shared_ptr( new HipExecutor(device_id, std::move(master), @@ -92,7 +92,7 @@ std::shared_ptr HipExecutor::create( std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, - std::shared_ptr alloc, GKO_HIP_STREAM_STRUCT* stream) + std::shared_ptr alloc, ihipStream_t* stream) { return std::shared_ptr( new HipExecutor(device_id, std::move(master), alloc, stream)); @@ -204,7 +204,7 @@ hip_stream::~hip_stream() {} hip_stream::hip_stream(hip_stream&&) GKO_NOT_COMPILED(hip); -GKO_HIP_STREAM_STRUCT* hip_stream::get() const GKO_NOT_COMPILED(hip); +ihipStream_t* hip_stream::get() const GKO_NOT_COMPILED(hip); HipTimer::HipTimer(std::shared_ptr exec) @@ -232,7 +232,7 @@ namespace hip { void reset_device(int device_id) GKO_NOT_COMPILED(hip); -void destroy_event(GKO_HIP_EVENT_STRUCT* event) GKO_NOT_COMPILED(hip); +void destroy_event(ihipEvent_t* event) GKO_NOT_COMPILED(hip); } // namespace hip diff --git a/core/log/profiler_hook.cpp b/core/log/profiler_hook.cpp index a8eef7668f2..8b5d84b2f0e 100644 --- a/core/log/profiler_hook.cpp +++ b/core/log/profiler_hook.cpp @@ -391,11 +391,9 @@ std::shared_ptr ProfilerHook::create_for_executor( if (std::dynamic_pointer_cast(exec)) { return create_nvtx(); } -#if (GINKGO_HIP_PLATFORM_NVCC == 0) if (std::dynamic_pointer_cast(exec)) { return create_roctx(); } -#endif if (std::dynamic_pointer_cast(exec)) { return create_vtune(); } diff --git a/core/log/profiler_hook.hpp b/core/log/profiler_hook.hpp index 3f4baf80db1..b6a88c1d471 100644 --- a/core/log/profiler_hook.hpp +++ b/core/log/profiler_hook.hpp @@ -110,7 +110,6 @@ class profiling_scope_guard : log::profiling_scope_guard { namespace hip { -#if (GINKGO_HIP_PLATFORM_NVCC == 0) class profiling_scope_guard : log::profiling_scope_guard { public: profiling_scope_guard(const char* name) @@ -119,9 +118,6 @@ class profiling_scope_guard : log::profiling_scope_guard { log::begin_roctx, log::end_nvtx} {} }; -#else -using profiling_scope_guard = log::default_profiling_scope_guard; -#endif } // namespace hip diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index dc4ea5aad63..989a1137b14 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -336,17 +336,10 @@ TEST(Executor, CanVerifyMemory) ASSERT_EQ(false, gpu_dpcpp->memory_accessible(gpu_dpcpp_dup)); ASSERT_EQ(false, gpu_dpcpp_dup->memory_accessible(gpu_dpcpp)); } -#if GINKGO_HIP_PLATFORM_NVCC - ASSERT_EQ(true, hip->memory_accessible(cuda)); - ASSERT_EQ(true, cuda->memory_accessible(hip)); - ASSERT_EQ(true, hip_1->memory_accessible(cuda_1)); - ASSERT_EQ(true, cuda_1->memory_accessible(hip_1)); -#else ASSERT_EQ(false, hip->memory_accessible(cuda)); ASSERT_EQ(false, cuda->memory_accessible(hip)); ASSERT_EQ(false, hip_1->memory_accessible(cuda_1)); ASSERT_EQ(false, cuda_1->memory_accessible(hip_1)); -#endif ASSERT_EQ(true, omp->memory_accessible(omp2)); ASSERT_EQ(true, hip->memory_accessible(hip2)); ASSERT_EQ(true, cuda->memory_accessible(cuda2)); diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index 52a92132689..82bc56792e5 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -135,17 +135,7 @@ void CudaExecutor::raw_copy_to(const OmpExecutor*, size_type num_bytes, void CudaExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { -#if GINKGO_HIP_PLATFORM_NVCC == 1 - if (num_bytes > 0) { - detail::cuda_scoped_device_id_guard g(this->get_device_id()); - GKO_ASSERT_NO_CUDA_ERRORS(cudaMemcpyPeerAsync( - dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(), - num_bytes, this->get_stream())); - this->synchronize(); - } -#else GKO_NOT_SUPPORTED(dest); -#endif } diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp index 58261c318fb..f16225c07ee 100644 --- a/devices/cuda/executor.cpp +++ b/devices/cuda/executor.cpp @@ -28,11 +28,7 @@ bool CudaExecutor::verify_memory_to(const CudaExecutor* dest_exec) const bool CudaExecutor::verify_memory_to(const HipExecutor* dest_exec) const { -#if GINKGO_HIP_PLATFORM_NVCC - return this->get_device_id() == dest_exec->get_device_id(); -#else return false; -#endif } diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp index 6954e31b24b..a8bab47dd61 100644 --- a/devices/hip/executor.cpp +++ b/devices/hip/executor.cpp @@ -25,11 +25,7 @@ bool HipExecutor::verify_memory_to(const HipExecutor* dest_exec) const bool HipExecutor::verify_memory_to(const CudaExecutor* dest_exec) const { -#if GINKGO_HIP_PLATFORM_NVCC - return this->get_device_id() == dest_exec->get_device_id(); -#else return false; -#endif } diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp index fbad841fd0f..e0fb2d73210 100644 --- a/hip/base/config.hip.hpp +++ b/hip/base/config.hip.hpp @@ -27,21 +27,13 @@ struct config { /** * The type containing a bitmask over all lanes of a warp. */ -#if GINKGO_HIP_PLATFORM_HCC using lane_mask_type = uint64; -#else // GINKGO_HIP_PLATFORM_NVCC - using lane_mask_type = uint32; -#endif /** * The number of threads within a HIP warp. Here, we use the definition from * `device_functions.h`. */ -#if GINKGO_HIP_PLATFORM_HCC static constexpr uint32 warp_size = warpSize; -#else // GINKGO_HIP_PLATFORM_NVCC - static constexpr uint32 warp_size = 32; -#endif /** * The bitmask of the entire warp. diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp index 58376c2175b..2eaa92e8a66 100644 --- a/hip/base/device.hip.cpp +++ b/hip/base/device.hip.cpp @@ -28,7 +28,7 @@ void reset_device(int device_id) } -void destroy_event(GKO_HIP_EVENT_STRUCT* event) +void destroy_event(ihipEvent_t* event) { GKO_ASSERT_NO_HIP_ERRORS(hipEventDestroy(event)); } diff --git a/hip/base/device.hpp b/hip/base/device.hpp index f0ceae0dc2b..f00e75851ca 100644 --- a/hip/base/device.hpp +++ b/hip/base/device.hpp @@ -18,7 +18,7 @@ void reset_device(int device_id); /** calls hipEventDestroy on the given event. */ -void destroy_event(GKO_HIP_EVENT_STRUCT* event); +void destroy_event(ihipEvent_t* event); /** returns hipDeviceProp.name for the given device */ diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 2694ce4177f..22d1ce0c1e2 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -131,17 +131,7 @@ void HipExecutor::raw_copy_to(const OmpExecutor*, size_type num_bytes, void HipExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { -#if GINKGO_HIP_PLATFORM_NVCC == 1 - if (num_bytes > 0) { - detail::hip_scoped_device_id_guard g(this->get_device_id()); - GKO_ASSERT_NO_HIP_ERRORS(hipMemcpyPeerAsync( - dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(), - num_bytes, this->get_stream())); - this->synchronize(); - } -#else GKO_NOT_SUPPORTED(dest); -#endif } @@ -222,16 +212,9 @@ void HipExecutor::set_gpu_property() this->get_device_id())); this->get_exec_info().max_workgroup_size = max_threads_per_block; this->get_exec_info().max_workitem_sizes = max_threads_per_block_dim; -#if GINKGO_HIP_PLATFORM_NVCC - this->get_exec_info().num_pu_per_cu = - convert_sm_ver_to_cores(this->get_exec_info().major, - this->get_exec_info().minor) / - kernels::hip::config::warp_size; -#else // In GCN (Graphics Core Next), each multiprocessor has 4 SIMD // Reference: https://en.wikipedia.org/wiki/Graphics_Core_Next this->get_exec_info().num_pu_per_cu = 4; -#endif // GINKGO_HIP_PLATFORM_NVCC this->get_exec_info().max_subgroup_size = kernels::hip::config::warp_size; } diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp index 93c1fc008d9..47f4092cc82 100644 --- a/hip/base/stream.hip.cpp +++ b/hip/base/stream.hip.cpp @@ -44,7 +44,7 @@ hip_stream::hip_stream(hip_stream&& other) {} -GKO_HIP_STREAM_STRUCT* hip_stream::get() const { return stream_; } +ihipStream_t* hip_stream::get() const { return stream_; } } // namespace gko diff --git a/hip/base/thrust.hip.hpp b/hip/base/thrust.hip.hpp index 2c0412fb67d..008f1e0645b 100644 --- a/hip/base/thrust.hip.hpp +++ b/hip/base/thrust.hip.hpp @@ -7,15 +7,11 @@ #include +#include #include #include -#if GINKGO_HIP_PLATFORM_HCC -#include -#else -#include -#endif namespace gko { @@ -25,11 +21,7 @@ namespace hip { inline auto thrust_policy(std::shared_ptr exec) { -#if GINKGO_HIP_PLATFORM_HCC return thrust::hip::par.on(exec->get_stream()); -#else - return thrust::cuda::par.on(exec->get_stream()); -#endif } diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index 247218a1457..2e5723366e6 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -167,14 +167,8 @@ class thread_block_tile { __device__ __forceinline__ unsigned size() const noexcept { return Size; } - __device__ __forceinline__ void sync() const noexcept - { -#if GINKGO_HIP_PLATFORM_NVCC - __syncwarp(data_.mask); -#endif // GINKGO_HIP_PLATFORM_NVCC - } + __device__ __forceinline__ void sync() const noexcept {} -#if GINKGO_HIP_PLATFORM_HCC #define GKO_BIND_SHFL(ShflOp, ValueType, SelectorType) \ __device__ __forceinline__ ValueType ShflOp( \ ValueType var, SelectorType selector) const noexcept \ @@ -184,17 +178,6 @@ class thread_block_tile { static_assert(true, \ "This assert is used to counter the false positive extra " \ "semi-colon warnings") -#else -#define GKO_BIND_SHFL(ShflOp, ValueType, SelectorType) \ - __device__ __forceinline__ ValueType ShflOp( \ - ValueType var, SelectorType selector) const noexcept \ - { \ - return __##ShflOp##_sync(data_.mask, var, selector, Size); \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") -#endif GKO_BIND_SHFL(shfl, int32, int32); GKO_BIND_SHFL(shfl, float, int32); @@ -222,15 +205,11 @@ class thread_block_tile { */ __device__ __forceinline__ int any(int predicate) const noexcept { -#if GINKGO_HIP_PLATFORM_HCC if (Size == config::warp_size) { return __any(predicate); } else { return (__ballot(predicate) & data_.mask) != 0; } -#else - return __any_sync(data_.mask, predicate); -#endif } /** @@ -239,15 +218,11 @@ class thread_block_tile { */ __device__ __forceinline__ int all(int predicate) const noexcept { -#if GINKGO_HIP_PLATFORM_HCC if (Size == config::warp_size) { return __all(predicate); } else { return (__ballot(predicate) & data_.mask) == data_.mask; } -#else - return __all_sync(data_.mask, predicate); -#endif } /** @@ -260,19 +235,11 @@ class thread_block_tile { __device__ __forceinline__ config::lane_mask_type ballot( int predicate) const noexcept { -#if GINKGO_HIP_PLATFORM_HCC if (Size == config::warp_size) { return __ballot(predicate); } else { return (__ballot(predicate) & data_.mask) >> data_.lane_offset; } -#else - if (Size == config::warp_size) { - return __ballot_sync(data_.mask, predicate); - } else { - return __ballot_sync(data_.mask, predicate) >> data_.lane_offset; - } -#endif } private: @@ -345,14 +312,9 @@ class enable_extended_shuffle : public Group { } // namespace detail -// Implementing this as a using directive messes up with SFINAE for some reason, -// probably a bug in NVCC. If it is a complete type, everything works fine. template -struct thread_block_tile - : detail::enable_extended_shuffle> { - using detail::enable_extended_shuffle< - detail::thread_block_tile>::enable_extended_shuffle; -}; +using thread_block_tile = + detail::enable_extended_shuffle>; // Only support tile_partition with 1, 2, 4, 8, 16, 32, 64 (hip). diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp index 59c0405a874..a5b84533ddd 100644 --- a/hip/components/format_conversion.hip.hpp +++ b/hip/components/format_conversion.hip.hpp @@ -80,25 +80,12 @@ __host__ size_type calculate_nwarps(std::shared_ptr exec, size_type nwarps_in_hip = exec->get_num_multiprocessor() * exec->get_num_warps_per_sm() * config::warp_size / subwarp_size; -#if GINKGO_HIP_PLATFORM_NVCC - size_type multiple = 8; - if (nnz >= 2e8) { - multiple = 2048; - } else if (nnz >= 2e7) { - multiple = 512; - } else if (nnz >= 2e6) { - multiple = 128; - } else if (nnz >= 2e5) { - multiple = 32; - } -#else size_type multiple = 2; if (nnz >= 1e7) { multiple = 32; } else if (nnz >= 1e5) { multiple = 8; } -#endif // GINKGO_HIP_PLATFORM_NVCC #ifdef GINKGO_BENCHMARK_ENABLE_TUNING if (_tuning_flag) { multiple = _tuned_value; diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index 1646a7fb376..83af45132c4 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -33,12 +33,8 @@ namespace hip { namespace jacobi { -// a total of 32/16 warps (1024 threads) -#if GINKGO_HIP_PLATFORM_HCC +// a total of 16 warps (1024 threads) constexpr int default_num_warps = 16; -#else // GINKGO_HIP_PLATFORM_NVCC -constexpr int default_num_warps = 32; -#endif // with current architectures, at most 32 warps can be scheduled per SM (and // current GPUs have at most 84 SMs) constexpr int default_grid_size = 32 * 32 * 128; diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp index cfdfc3122fd..908f1c06c3e 100644 --- a/hip/test/base/hip_executor.hip.cpp +++ b/hip/test/base/hip_executor.hip.cpp @@ -164,11 +164,7 @@ TEST_F(HipExecutor, FailsWhenOverallocating) __global__ void check_data(int* data) { if (data[0] != 3 || data[1] != 8) { -#if GINKGO_HIP_PLATFORM_HCC asm("s_trap 0x02;"); -#else // GINKGO_HIP_PLATFORM_NVCC - asm("trap;"); -#endif } } @@ -188,11 +184,7 @@ TEST_F(HipExecutor, CopiesDataToHip) __global__ void check_data2(int* data) { if (data[0] != 4 || data[1] != 8) { -#if GINKGO_HIP_PLATFORM_HCC asm("s_trap 0x02;"); -#else // GINKGO_HIP_PLATFORM_NVCC - asm("trap;"); -#endif } } @@ -316,11 +308,7 @@ TEST_F(HipExecutor, ExecInfoSetsCorrectProperties) &max_threads_per_block, hipDeviceAttributeMaxThreadsPerBlock, dev_id)); GKO_ASSERT_NO_HIP_ERRORS( hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, dev_id)); -#if GINKGO_HIP_PLATFORM_NVCC - auto num_cores = convert_sm_ver_to_cores(major, minor); -#else auto num_cores = warp_size * 4; -#endif ASSERT_EQ(hip->get_major_version(), major); ASSERT_EQ(hip->get_minor_version(), minor); diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 329918399d6..34d06d8d63c 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -41,10 +41,10 @@ /* What is HIP compiled for, hcc or nvcc? */ // clang-format off -#define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@ +#define GINKGO_HIP_PLATFORM_HCC 1 -#define GINKGO_HIP_PLATFORM_NVCC @GINKGO_HIP_PLATFORM_NVCC@ +#define GINKGO_HIP_PLATFORM_NVCC 0 // clang-format on diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index d7db35d2e3c..6b3f6d3755d 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -63,34 +63,10 @@ enum class log_propagation_mode { enum class allocation_mode { device, unified_global, unified_host }; -#ifdef NDEBUG - -// When in release, prefer device allocations constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device; constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device; -#else - -// When in debug, always UM allocations. -constexpr allocation_mode default_cuda_alloc_mode = - allocation_mode::unified_global; - -#if (GINKGO_HIP_PLATFORM_HCC == 1) - -// HIP on AMD GPUs does not support UM, so always prefer device allocations. -constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device; - -#else - -// HIP on NVIDIA GPUs supports UM, so prefer UM allocations. -constexpr allocation_mode default_hip_alloc_mode = - allocation_mode::unified_global; - -#endif - -#endif - } // namespace gko @@ -1718,13 +1694,13 @@ class HipExecutor : public detail::ExecutorBase, static std::shared_ptr create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode = default_hip_alloc_mode, - GKO_HIP_STREAM_STRUCT* stream = nullptr); + ihipStream_t* stream = nullptr); static std::shared_ptr create( int device_id, std::shared_ptr master, std::shared_ptr alloc = std::make_shared(), - GKO_HIP_STREAM_STRUCT* stream = nullptr); + ihipStream_t* stream = nullptr); std::shared_ptr get_master() noexcept override; @@ -1830,7 +1806,7 @@ class HipExecutor : public detail::ExecutorBase, return this->get_exec_info().closest_pu_ids; } - GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; } + ihipStream_t* get_stream() const { return stream_; } protected: void set_gpu_property(); @@ -1838,8 +1814,7 @@ class HipExecutor : public detail::ExecutorBase, void init_handles(); HipExecutor(int device_id, std::shared_ptr master, - std::shared_ptr alloc, - GKO_HIP_STREAM_STRUCT* stream) + std::shared_ptr alloc, ihipStream_t* stream) : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream} { this->get_exec_info().device_id = device_id; @@ -1876,7 +1851,7 @@ class HipExecutor : public detail::ExecutorBase, handle_manager hipblas_handle_; handle_manager hipsparse_handle_; std::shared_ptr alloc_; - GKO_HIP_STREAM_STRUCT* stream_; + ihipStream_t* stream_; }; diff --git a/include/ginkgo/core/base/fwd_decls.hpp b/include/ginkgo/core/base/fwd_decls.hpp index f7e446d7bf2..606328f45a4 100644 --- a/include/ginkgo/core/base/fwd_decls.hpp +++ b/include/ginkgo/core/base/fwd_decls.hpp @@ -21,15 +21,8 @@ struct hipblasContext; struct hipsparseContext; -#if GINKGO_HIP_PLATFORM_HCC struct ihipStream_t; struct ihipEvent_t; -#define GKO_HIP_STREAM_STRUCT ihipStream_t -#define GKO_HIP_EVENT_STRUCT ihipEvent_t -#else -#define GKO_HIP_STREAM_STRUCT CUstream_st -#define GKO_HIP_EVENT_STRUCT CUevent_st -#endif // after intel/llvm September'22 release, which uses major version 6, they diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 30b0da475d0..b81bea5e145 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -616,7 +616,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr int64 ceildiv(int64 num, int64 den) } -#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC +#if defined(__HIPCC__) /** @@ -794,7 +794,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T&) } -#endif // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC +#endif // defined(__HIPCC__) #undef GKO_BIND_ZERO_ONE diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp index 7f8da044fd9..78fe2cee829 100644 --- a/include/ginkgo/core/base/memory.hpp +++ b/include/ginkgo/core/base/memory.hpp @@ -77,8 +77,7 @@ class HipAllocatorBase : public Allocator { * @return true if and only if the allocator can be used by HipExecutor in * the given environment. */ - virtual bool check_environment(int device_id, - GKO_HIP_STREAM_STRUCT* stream) const + virtual bool check_environment(int device_id, ihipStream_t* stream) const { return true; } @@ -186,14 +185,13 @@ class HipAsyncAllocator : public HipAllocatorBase { void deallocate(void* ptr) override; - HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream); + HipAsyncAllocator(ihipStream_t* stream); protected: - bool check_environment(int device_id, - GKO_HIP_STREAM_STRUCT* stream) const override; + bool check_environment(int device_id, ihipStream_t* stream) const override; private: - GKO_HIP_STREAM_STRUCT* stream_; + ihipStream_t* stream_; }; @@ -211,8 +209,7 @@ class HipUnifiedAllocator : public HipAllocatorBase, public CpuAllocatorBase { HipUnifiedAllocator(int device_id, unsigned int flags); protected: - bool check_environment(int device_id, - GKO_HIP_STREAM_STRUCT* stream) const override; + bool check_environment(int device_id, ihipStream_t* stream) const override; private: int device_id_; @@ -232,8 +229,7 @@ class HipHostAllocator : public HipAllocatorBase, public CpuAllocatorBase { HipHostAllocator(int device_id); protected: - bool check_environment(int device_id, - GKO_HIP_STREAM_STRUCT* stream) const override; + bool check_environment(int device_id, ihipStream_t* stream) const override; private: int device_id_; diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp index 22af70ac14b..f95d8971a3a 100644 --- a/include/ginkgo/core/base/stream.hpp +++ b/include/ginkgo/core/base/stream.hpp @@ -89,10 +89,10 @@ class hip_stream { * Returns the native HIP stream handle. * In an empty hip_stream, this will return nullptr. */ - GKO_HIP_STREAM_STRUCT* get() const; + ihipStream_t* get() const; private: - GKO_HIP_STREAM_STRUCT* stream_; + ihipStream_t* stream_; int device_id_; }; diff --git a/include/ginkgo/core/base/timer.hpp b/include/ginkgo/core/base/timer.hpp index 8008cecfb94..7b5d2aed5b3 100644 --- a/include/ginkgo/core/base/timer.hpp +++ b/include/ginkgo/core/base/timer.hpp @@ -54,7 +54,7 @@ class time_point { type type_; union data_union { CUevent_st* cuda_event; - GKO_HIP_EVENT_STRUCT* hip_event; + ihipEvent_t* hip_event; sycl::event* dpcpp_event; std::chrono::steady_clock::time_point chrono; @@ -206,7 +206,7 @@ class HipTimer : public Timer { private: int device_id_; - GKO_HIP_STREAM_STRUCT* stream_; + ihipStream_t* stream_; }; diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index f27fe12a934..153ebbd1730 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -478,7 +478,6 @@ class Csr : public EnableLinOp>, multiple = 32; } } -#if GINKGO_HIP_PLATFORM_HCC if (!cuda_strategy_) { multiple = 8; if (nnz >= static_cast(1e7)) { @@ -487,7 +486,6 @@ class Csr : public EnableLinOp>, multiple = 16; } } -#endif // GINKGO_HIP_PLATFORM_HCC auto nwarps = nwarps_ * multiple; return min(ceildiv(nnz, warp_size_), nwarps); @@ -605,12 +603,10 @@ class Csr : public EnableLinOp>, nnz_limit = intel_nnz_limit; row_len_limit = intel_row_len_limit; } -#if GINKGO_HIP_PLATFORM_HCC if (!cuda_strategy_) { nnz_limit = amd_nnz_limit; row_len_limit = amd_row_len_limit; } -#endif // GINKGO_HIP_PLATFORM_HCC auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master(); const bool is_mtx_on_host{host_mtx_exec == mtx_row_ptrs.get_executor()}; diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 21c40a70c0a..3899f2fefb7 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -63,7 +63,7 @@ inline void init_executor(std::shared_ptr ref, inline void init_executor(std::shared_ptr ref, std::shared_ptr& exec, - GKO_HIP_STREAM_STRUCT* stream = nullptr) + ihipStream_t* stream = nullptr) { if (gko::HipExecutor::get_num_devices() == 0) { throw std::runtime_error{"No suitable HIP devices"}; From b611c0b25167a10a40cb017f156e5ff34e0d31d9 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 7 May 2023 12:54:48 +0200 Subject: [PATCH 02/22] modernize HIP setup --- CMakeLists.txt | 14 --- INSTALL.md | 13 +-- benchmark/CMakeLists.txt | 13 +-- cmake/GinkgoConfig.cmake.in | 16 +-- cmake/build_helpers.cmake | 11 +- cmake/create_test.cmake | 54 ++------- cmake/hip.cmake | 107 +----------------- cmake/hip_path.cmake | 5 - hip/CMakeLists.txt | 52 ++------- hip/get_info.cmake | 15 +-- hip/test/base/CMakeLists.txt | 8 +- test/test_install/CMakeLists.txt | 27 +---- .../identify_stream_usage/CMakeLists.txt | 27 +++-- 13 files changed, 57 insertions(+), 305 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b89f2dbf05..0549c7b77d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,14 +52,6 @@ set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING "A list of target NVIDIA GPU architectures. See README.md for more detail.") # the details of fine/coarse grain memory and unsafe atomic are available https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#floating-point-fp-atomic-operations-and-coarse-fine-grained-memory-allocations option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic (only for AMD GPU and ROCM >= 5). Default is ON because we use hipMalloc, which is always on coarse grain. Must turn off when allocating memory on fine grain" ON) -set(GINKGO_HIP_COMPILER_FLAGS "" CACHE STRING - "Set the required HIP compiler flags. Current default is an empty string.") -set(GINKGO_HIP_NVCC_COMPILER_FLAGS "" CACHE STRING - "Set the required HIP nvcc compiler flags. Current default is an empty string.") -set(GINKGO_HIP_CLANG_COMPILER_FLAGS "" CACHE STRING - "Set the required HIP CLANG compiler flags. Current default is an empty string.") -set(GINKGO_HIP_AMDGPU "" CACHE STRING - "The amdgpu_target(s) variable passed to hipcc. The default is none (auto).") option(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS "Split template instantiations for slow-to-compile files. This improves parallel build performance" ON) mark_as_advanced(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS) option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF) @@ -144,12 +136,6 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) endif() -if(BUILD_SHARED_LIBS) - set(GINKGO_STATIC_OR_SHARED SHARED) -else() - set(GINKGO_STATIC_OR_SHARED STATIC) -endif() - # Ensure we have a debug postfix if(NOT DEFINED CMAKE_DEBUG_POSTFIX) set(CMAKE_DEBUG_POSTFIX "d") diff --git a/INSTALL.md b/INSTALL.md index 045d5f93a09..54889f7e335 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -51,9 +51,9 @@ Ginkgo adds the following additional switches to control what is being built: * `-DGINKGO_BUILD_HIP={ON, OFF}` builds optimized HIP versions of the kernels (requires HIP), default is `ON` if an installation of HIP could be detected, `OFF` otherwise. -* `-DGINKGO_HIP_AMDGPU="gpuarch1;gpuarch2"` the amdgpu_target(s) variable - passed to hipcc for the `hcc` HIP backend. The default is none (auto). -* `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. Default is `OFF`. +* `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. If system HWLOC + is not found, Ginkgo will try to build it. Default is `ON` on Linux. Ginkgo + does not support HWLOC on Windows/MacOS, so the default is `OFF` on Windows/MacOS. * `-DGINKGO_BUILD_DOC={ON, OFF}` creates an HTML version of Ginkgo's documentation from inline comments in the code. The default is `OFF`. * `-DGINKGO_DOC_GENERATE_EXAMPLES={ON, OFF}` generates the documentation of examples @@ -191,13 +191,6 @@ environment variable `HIP_PLATFORM` like so: export HIP_PLATFORM=nvcc # or nvidia for ROCM >= 4.1 ``` -#### Setting platform specific compilation flags -Platform specific compilation flags can be given through the following CMake -variables: -+ `-DGINKGO_HIP_COMPILER_FLAGS=`: compilation flags given to all platforms. -+ `-DGINKGO_HIP_NVCC_COMPILER_FLAGS=`: compilation flags given to NVIDIA platforms. -+ `-DGINKGO_HIP_CLANG_COMPILER_FLAGS=`: compilation flags given to AMD clang compiler. - ### Third party libraries and packages diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 6b8ce0687d6..ca209e65057 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -25,14 +25,9 @@ endfunction() function(ginkgo_benchmark_hipsparse_linops type def) add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp) + set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES LANGUAGE HIP) target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def}) - EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS) - set_target_properties(hipsparse_linops_${type} PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS}) - # use Thrust C++ device just for compilation, we don't use thrust::complex in the benchmarks - target_compile_definitions(hipsparse_linops_${type} PUBLIC -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP) - target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE - ${HSA_HEADER} ${HIP_INCLUDE_DIRS} - ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS}) + target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS}) target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES}) endfunction() @@ -126,10 +121,8 @@ if (GINKGO_BUILD_HIP) ginkgo_benchmark_hipsparse_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION) ginkgo_benchmark_hipsparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) ginkgo_benchmark_hipsparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) + set_source_files_properties(utils/hip_timer.hip.cpp PROPERTIES LANGUAGE HIP) add_library(hip_timer utils/hip_timer.hip.cpp) - EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS) - set_target_properties(hip_timer PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS}) - target_include_directories(hip_timer SYSTEM PRIVATE ${HSA_HEADER} ${HIP_INCLUDE_DIRS}) target_link_libraries(hip_timer ginkgo) endif() diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 0776801aa99..4b4e8d77338 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -63,16 +63,10 @@ set(GINKGO_JACOBI_FULL_OPTIMIZATIONS @GINKGO_JACOBI_FULL_OPTIMIZATIONS@) set(GINKGO_CUDA_ARCHITECTURES "@CMAKE_CUDA_ARCHITECTURES@") set(GINKGO_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@") -set(GINKGO_HIP_COMPILER_FLAGS "@GINKGO_HIP_COMPILER_FLAGS@") -set(GINKGO_HIP_HCC_COMPILER_FLAGS "@GINKGO_HIP_HCC_COMPILER_FLAGS@") -set(GINKGO_HIP_NVCC_COMPILER_FLAGS "@GINKGO_HIP_NVCC_COMPILER_FLAGS@") -set(GINKGO_HIP_CLANG_COMPILER_FLAGS "@GINKGO_HIP_CLANG_COMPILER_FLAGS@") -set(GINKGO_HIP_PLATFORM @GINKGO_HIP_PLATFORM@) -set(GINKGO_HIP_PLATFORM_AMD_REGEX "@HIP_PLATFORM_AMD_REGEX@") -set(GINKGO_HIP_PLATFORM_NVIDIA_REGEX "@HIP_PLATFORM_NVIDIA_REGEX@") -set(GINKGO_HIP_AMDGPU "@GINKGO_HIP_AMDGPU@") -set(GINKGO_HIP_VERSION @GINKGO_HIP_VERSION@) -set(GINKGO_AMD_ARCH_FLAGS "@GINKGO_AMD_ARCH_FLAGS@") +set(GINKGO_HIP_COMPILER_FLAGS "@CMAKE_HIP_COMPILER_FLAGS@") +set(GINKGO_HIP_PLATFORM "@GINKGO_HIP_PLATFORM@") +set(GINKGO_HIP_VERSION "@GINKGO_HIP_VERSION@") +set(GINKGO_HIP_ARCHITECTURES "@CMAKE_HIP_ARCHITECTURES@") set(GINKGO_DPCPP_VERSION @GINKGO_DPCPP_VERSION@) set(GINKGO_DPCPP_MAJOR_VERSION @GINKGO_DPCPP_MAJOR_VERSION@) @@ -175,7 +169,7 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_CUDA) endif() if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP) - find_dependency(HIP) + enable_language(HIP) find_dependency(hipblas) find_dependency(hipfft) find_dependency(hiprand) diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake index 34189a09450..65c987b3490 100644 --- a/cmake/build_helpers.cmake +++ b/cmake/build_helpers.cmake @@ -93,17 +93,12 @@ function(ginkgo_check_headers target defines) list(APPEND HIP_SOURCES "${HEADER_SOURCEFILE}") endforeach() if(HIP_SOURCES) - set_source_files_properties(${HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE) - hip_add_library(${target}_headers_hip ${HIP_SOURCES}) # the compiler options get set by linking to ginkgo_hip + set_source_files_properties(${HIP_SOURCES} PROPERTIES LANGUAGE HIP) + add_library(${target}_headers_hip ${HIP_SOURCES}) # the compiler options get set by linking to ginkgo_hip target_link_libraries(${target}_headers_hip PRIVATE ${target} roc::hipblas roc::hipsparse hip::hiprand roc::rocrand) target_include_directories(${target}_headers_hip PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}" - "${GINKGO_HIP_THRUST_PATH}" - "${HIPBLAS_INCLUDE_DIRS}" - "${hiprand_INCLUDE_DIRS}" - "${HIPSPARSE_INCLUDE_DIRS}" - "${ROCPRIM_INCLUDE_DIRS}") + "${CMAKE_CURRENT_SOURCE_DIR}") endif() endfunction() diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index c2f5c1fb94f..8214afc50ba 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -167,55 +167,14 @@ endfunction(ginkgo_create_cuda_test_internal) ## Test compiled with HIP function(ginkgo_create_hip_test test_name) ginkgo_build_test_name(${test_name} test_target_name) - ginkgo_create_hip_test_internal(${test_name} ${test_name}.hip.cpp ${test_target_name} "" ${ARGN}) + ginkgo_create_hip_test_internal(${test_name} ${test_name}.hip.cpp ${test_target_name} ${ARGN}) endfunction(ginkgo_create_hip_test) ## Internal function allowing separate filename, test name and test target name. -function(ginkgo_create_hip_test_internal test_name filename test_target_name additional_flags) - set_source_files_properties(${filename} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE) - set(GINKGO_TEST_HIP_DEFINES -DGKO_COMPILING_HIP ${additional_flags}) - if (GINKGO_FAST_TESTS) - list(APPEND GINKGO_TEST_HIP_DEFINES -DGINKGO_FAST_TESTS) - endif() - if (GINKGO_TEST_NONDEFAULT_STREAM) - list(APPEND GINKGO_TEST_HIP_DEFINES -DGKO_TEST_NONDEFAULT_STREAM) - endif() - - # NOTE: With how HIP works, passing the flags `HIPCC_OPTIONS` etc. here - # creates a redefinition of all flags. This creates some issues with `nvcc`, - # but `clang` seems fine with the redefinitions. - if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") - hip_add_executable(${test_target_name} ${filename} - # If `FindHIP.cmake`, namely `HIP_PARSE_HIPCC_OPTIONS` macro and - # call gets fixed, uncomment this. - HIPCC_OPTIONS ${GINKGO_TEST_HIP_DEFINES} # ${GINKGO_HIPCC_OPTIONS} - # NVCC_OPTIONS ${GINKGO_TEST_HIP_DEFINES} ${GINKGO_HIP_NVCC_OPTIONS} - # CLANG_OPTIONS ${GINKGO_TEST_HIP_DEFINES} ${GINKGO_HIP_CLANG_OPTIONS} - --expt-relaxed-constexpr --expt-extended-lambda - ) - else() # hcc/clang - hip_add_executable(${test_target_name} ${filename} - HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} ${GINKGO_TEST_HIP_DEFINES} - NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} - CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} - ) - endif() - - # Let's use a normal compiler for linking - set_target_properties(${test_target_name} PROPERTIES LINKER_LANGUAGE CXX) - - target_include_directories(${test_target_name} - PRIVATE - # Only `math` requires it so far, but it's much easier - # to put these this way. - ${GINKGO_HIP_THRUST_PATH} - # Only `exception_helpers` requires these so far, but it's much easier - # to put these this way. - ${HIPBLAS_INCLUDE_DIRS} - ${HIPFFT_INCLUDE_DIRS} - ${hiprand_INCLUDE_DIRS} - ${HIPSPARSE_INCLUDE_DIRS} - ) +function(ginkgo_create_hip_test_internal test_name filename test_target_name) + set_source_files_properties(${filename} PROPERTIES LANGUAGE HIP) + add_executable(${test_target_name} ${filename}) + target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP) ginkgo_set_test_target_properties(${test_target_name} "_hip" ${ARGN}) ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE hipgpu) endfunction(ginkgo_create_hip_test_internal) @@ -317,7 +276,8 @@ function(ginkgo_create_common_device_test test_name) if(GINKGO_BUILD_HIP) # need to make a separate file for this, since we can't set conflicting properties on the same file configure_file(${test_name}.cpp ${test_name}.hip.cpp COPYONLY) - ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip "-std=c++14;-DEXEC_TYPE=HipExecutor;-DEXEC_NAMESPACE=hip" ${ARGN}) + ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip ${ARGN}) + target_compile_definitions(${test_target_name}_hip PRIVATE EXEC_TYPE=HipExecutor EXEC_NAMESPACE=hip) endif() endfunction(ginkgo_create_common_device_test) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 72a7a3a86d8..9a4ed90b712 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -1,26 +1,4 @@ -if(DEFINED ENV{HIP_PLATFORM}) - set(GINKGO_HIP_PLATFORM "$ENV{HIP_PLATFORM}") -elseif(GINKGO_HIPCONFIG_PATH) - execute_process(COMMAND ${GINKGO_HIPCONFIG_PATH} - --platform OUTPUT_VARIABLE GINKGO_HIP_PLATFORM) -else() - message(FATAL_ERROR "No platform could be found for HIP. " - "Set and export the environment variable HIP_PLATFORM.") -endif() -message(STATUS "HIP platform set to ${GINKGO_HIP_PLATFORM}") -set(HIP_PLATFORM_AMD_REGEX "hcc|amd") -set(HIP_PLATFORM_NVIDIA_REGEX "nvcc|nvidia") - -if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") - set(GINKGO_HIP_PLATFORM_HCC 1) -elseif (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") - enable_language(CUDA) - set(GINKGO_HIP_PLATFORM_NVCC 1) -endif() - -if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.21) - set(CMAKE_HIP_ARCHITECTURES OFF) -endif() +enable_language(HIP) if(NOT DEFINED ROCM_PATH) if(DEFINED ENV{ROCM_PATH}) @@ -88,24 +66,6 @@ if(NOT DEFINED ROCTRACER_PATH) endif() endif() -# Find HIPCC_CMAKE_LINKER_HELPER executable -find_program( - HIP_HIPCC_CMAKE_LINKER_HELPER - NAMES hipcc_cmake_linker_helper - PATHS - "${HIP_ROOT_DIR}" - ENV ROCM_PATH - ENV HIP_PATH - /opt/rocm - /opt/rocm/hip - PATH_SUFFIXES bin - NO_DEFAULT_PATH -) -if(NOT HIP_HIPCC_CMAKE_LINKER_HELPER) - # Now search in default paths - find_program(HIP_HIPCC_CMAKE_LINKER_HELPER hipcc_cmake_linker_helper) -endif() - find_program( HIP_HIPCONFIG_EXECUTABLE NAMES hipconfig @@ -130,17 +90,6 @@ execute_process( ERROR_STRIP_TRAILING_WHITESPACE ) -if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") - # ensure ENV{CUDA_PATH} is set by the user - if (NOT DEFINED ENV{CUDA_PATH}) - find_path(GINKGO_HIP_DEFAULT_CUDA_PATH "cuda.h" PATH /usr/local/cuda/include NO_DEFAULT_PATH) - if (NOT GINKGO_HIP_DEFAULT_CUDA_PATH) - message(FATAL_ERROR "HIP nvidia backend was requested but CUDA could not be " - "located. Set and export the environment variable CUDA_PATH.") - endif() - endif() -endif() - ## Setup all CMAKE variables to find HIP and its dependencies set(GINKGO_HIP_MODULE_PATH "${HIP_PATH}/cmake") list(APPEND CMAKE_MODULE_PATH "${GINKGO_HIP_MODULE_PATH}") @@ -155,15 +104,6 @@ list(APPEND CMAKE_PREFIX_PATH "${ROCRAND_PATH}/lib/cmake" ) -# NOTE: without this, HIP jacobi build takes a *very* long time. The reason for -# that is that these variables are seemingly empty by default, thus there is no -# proper optimization applied to the HIP builds otherwise. -set(HIP_HIPCC_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}" CACHE STRING "Flags used by the HIPCC compiler during DEBUG builds") -set(HIP_HIPCC_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}" CACHE STRING "Flags used by the HIPCC compiler during MINSIZEREL builds") -set(HIP_HIPCC_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "Flags used by the HIPCC compiler during RELEASE builds") -set(HIP_HIPCC_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}" CACHE STRING "Flags used by the HIPCC compiler during RELWITHDEBINFO builds") - -find_package(HIP REQUIRED) find_package(hipblas REQUIRED) find_package(hipfft) # optional dependency find_package(hiprand REQUIRED) @@ -171,47 +111,8 @@ find_package(hipsparse REQUIRED) # At the moment, for hiprand to work also rocrand is required. find_package(rocrand REQUIRED) find_package(ROCTX) -find_path(GINKGO_HIP_THRUST_PATH "thrust/complex.h" - PATHS "${HIP_PATH}/../include" - ENV HIP_THRUST_PATH) -if (NOT GINKGO_HIP_THRUST_PATH) - message(FATAL_ERROR "Could not find the ROCm header thrust/complex.h which is required by Ginkgo HIP.") -endif() - -set(GINKGO_HIP_NVCC_ARCH "") -if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") - if (CMAKE_CUDA_HOST_COMPILER) - list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS "-ccbin=${CMAKE_CUDA_HOST_COMPILER}") - endif() - - # Remove false positive CUDA warnings when calling one() and zero() - list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS --expt-relaxed-constexpr --expt-extended-lambda) - - # select GPU architecture - include(cmake/Modules/CudaArchitectureSelector.cmake) - cas_variable_cuda_architectures(GINKGO_HIP_NVCC_ARCH - ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES} - UNSUPPORTED "20" "21") -endif() - -# `target_compile_options` do not work with hip_add_library -# Thus, we need to pass the flags to `hip_add_library` itself -if(GINKGO_HIP_AMDGPU) - foreach(target ${GINKGO_HIP_AMDGPU}) - list(APPEND GINKGO_AMD_ARCH_FLAGS --amdgpu-target=${target}) - endforeach() -endif() - -set(GINKGO_HIPCC_OPTIONS ${GINKGO_HIP_COMPILER_FLAGS} "-std=c++14 -DGKO_COMPILING_HIP") -set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_ARCH} ${GINKGO_HIP_NVCC_ADDITIONAL_FLAGS}) -set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS} ${GINKGO_AMD_ARCH_FLAGS}) -if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND HIP_VERSION VERSION_GREATER_EQUAL 5) - list(APPEND GINKGO_HIP_CLANG_OPTIONS "-munsafe-fp-atomics -Wno-unused-command-line-argument") +if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND GINKGO_HIP_VERSION VERSION_GREATER_EQUAL 5) + set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -munsafe-fp-atomics -Wno-unused-command-line-argument") endif() -# HIP's cmake support secretly carries around global state to remember -# whether we created any shared libraries, and sets PIC flags accordingly. -# CMake's scoping rules means that this makes the hip_add_* calls order- and -# scope-dependent, which is not good. Let's set the flags ourselves instead. -list(APPEND GINKGO_HIP_CLANG_OPTIONS "-fPIC") -list(APPEND GINKGO_HIP_NVCC_OPTIONS "--shared -Xcompiler '-fPIC'") +set(CMAKE_HIP_STANDARD 14) diff --git a/cmake/hip_path.cmake b/cmake/hip_path.cmake index 58fcd3db447..269f8403fd3 100644 --- a/cmake/hip_path.cmake +++ b/cmake/hip_path.cmake @@ -11,8 +11,3 @@ find_program(GINKGO_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin") if(GINKGO_HIPCONFIG_PATH) message(STATUS "Found hipconfig: ${GINKGO_HIPCONFIG_PATH}") endif() - -# We keep using NVCC/HCC for consistency with previous releases even if AMD -# updated everything to use NVIDIA/AMD in ROCM 4.1 -set(GINKGO_HIP_PLATFORM_NVCC 0) -set(GINKGO_HIP_PLATFORM_HCC 0) diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index ad106e123bc..56bcfec462a 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -1,3 +1,4 @@ +cmake_minimum_required(VERSION 3.21) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE) add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE) @@ -70,11 +71,7 @@ else() list(APPEND GINKGO_HIP_SOURCES matrix/fft_kernels_stub.hip.cpp) endif() -if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") - set(GKO_HIP_JACOBI_MAX_BLOCK_SIZE 32) -else() - set(GKO_HIP_JACOBI_MAX_BLOCK_SIZE 64) -endif() +set(GKO_HIP_JACOBI_MAX_BLOCK_SIZE 64) if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) set(GKO_HIP_JACOBI_BLOCK_SIZES) foreach(blocksize RANGE 1 ${GKO_HIP_JACOBI_MAX_BLOCK_SIZE}) @@ -102,24 +99,14 @@ endforeach() string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}") configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hip.hpp) -set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE) -hip_add_library(ginkgo_hip $ ${GINKGO_HIP_SOURCES} - HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} - CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} - NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} - ${GINKGO_STATIC_OR_SHARED}) +set_source_files_properties(${GINKGO_HIP_SOURCES} PROPERTIES LANGUAGE HIP) +add_library(ginkgo_hip $ ${GINKGO_HIP_SOURCES}) target_include_directories(ginkgo_hip - PUBLIC - ${HIP_INCLUDE_DIRS} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/.. # for generated headers like jacobi_common.hip.hpp - ${GINKGO_HIP_THRUST_PATH} - ${HIPBLAS_INCLUDE_DIRS} - ${HIPFFT_INCLUDE_DIRS} - ${hiprand_INCLUDE_DIRS} - ${HIPSPARSE_INCLUDE_DIRS} - $) + ) +target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP) target_link_libraries(ginkgo_hip PUBLIC ginkgo_device) target_link_libraries(ginkgo_hip PRIVATE roc::hipblas roc::hipsparse hip::hiprand roc::rocrand) @@ -130,34 +117,11 @@ if (GINKGO_HAVE_ROCTX) target_link_libraries(ginkgo_hip PRIVATE roc::roctx) endif() -if(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") - find_package(hip REQUIRED) - # To save a bit of pain, we directly link against the `library` instead of - # linking against the target. - if (CMAKE_BUILD_TYPE) - # Check if our configuration is available first - string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPER_BUILD_TYPE) - get_target_property(HIP_LIBAMDHIP64_LIBRARIES hip::amdhip64 IMPORTED_LOCATION_${UPPER_BUILD_TYPE}) - endif() - if (NOT HIP_LIBAMDHIP64_LIBRARIES) - # Fall back to anything - get_target_property(HIP_LIBAMDHIP64_LIBRARIES hip::amdhip64 LOCATION) - endif() - target_link_libraries(ginkgo_hip PUBLIC ${HIP_LIBAMDHIP64_LIBRARIES}) -elseif(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") - find_package(CUDA 10.1 REQUIRED) - target_link_libraries(ginkgo_hip PUBLIC ${CUDA_LIBRARIES}) -endif() +target_compile_options(ginkgo_hip PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) # Try to find everything in /opt/rocm/lib first. set(GKO_HIP_RPATH "${ROCM_PATH}/lib" ) -if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") - list(GET CUDA_LIBRARIES 0 CUDA_FIRST_LIB) - get_filename_component(GKO_CUDA_LIBDIR "${CUDA_FIRST_LIB}" DIRECTORY) - list(APPEND GKO_HIP_RPATH "${GKO_CUDA_LIBDIR}") -else() - list(APPEND GKO_HIP_RPATH "${HIP_PATH}/lib") -endif() +list(APPEND GKO_HIP_RPATH "${HIP_PATH}/lib") list(APPEND GKO_HIP_RPATH "${HIPBLAS_PATH}/lib" "${HIPRAND_PATH}/lib" "${HIPSPARSE_PATH}/lib" "${ROCRAND_PATH}/lib") diff --git a/hip/get_info.cmake b/hip/get_info.cmake index 1610ac0eee4..234f65642cc 100644 --- a/hip/get_info.cmake +++ b/hip/get_info.cmake @@ -1,15 +1,12 @@ ginkgo_print_module_header(${detailed_log} "HIP") ginkgo_print_foreach_variable(${detailed_log} - "GINKGO_HIPCONFIG_PATH;GINKGO_HIP_AMDGPU" - "GINKGO_HIP_CLANG_COMPILER_FLAGS;GINKGO_HIP_NVCC_COMPILER_FLAGS" - "GINKGO_HIP_THRUST_PATH;GINKGO_AMD_ARCH_FLAGS") + "GINKGO_HIPCONFIG_PATH") ginkgo_print_module_footer(${detailed_log} "HIP variables:") ginkgo_print_foreach_variable(${detailed_log} - "HIP_VERSION;HIP_COMPILER;HIP_PATH;ROCM_PATH" + "HIP_VERSION;HIP_PATH;ROCM_PATH" "HIP_PLATFORM;HIP_ROOT_DIR;HIP_RUNTIME;HIPBLAS_PATH;HIPSPARSE_PATH" - "HIPRAND_PATH;ROCRAND_PATH;HIP_CLANG_INCLUDE_PATH;HIP_CLANG_PATH" - "HIP_HIPCC_EXECUTABLE;HIP_HIPCONFIG_EXECUTABLE;HIP_HOST_COMPILATION_CPP") -ginkgo_print_flags(${detailed_log} "HIP_HIPCC_FLAGS") -ginkgo_print_flags(${detailed_log} "HIP_NVCC_FLAGS") -ginkgo_print_flags(${detailed_log} "HIP_CLANG_FLAGS") + "HIPRAND_PATH;ROCRAND_PATH;HIP_CLANG_INCLUDE_PATH" + "HIP_HIPCONFIG_EXECUTABLE") +ginkgo_print_flags(${detailed_log} "CMAKE_HIP_FLAGS") +ginkgo_print_flags(${detailed_log} "CMAKE_HIP_COMPILER") ginkgo_print_module_footer(${detailed_log} "") diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt index 11b0e209aeb..57c6f64bfd8 100644 --- a/hip/test/base/CMakeLists.txt +++ b/hip/test/base/CMakeLists.txt @@ -9,14 +9,8 @@ if(GINKGO_HAVE_HWLOC) endif() endif() ginkgo_create_hip_test(kernel_launch) -# correct flags for kernel_launch.hpp are set in GINKGO_HIPCC_OPTIONS ginkgo_create_test(lin_op RESOURCE_TYPE hipgpu) ginkgo_create_hip_test(math) ginkgo_create_test(memory RESOURCE_TYPE hipgpu) -# Only hcc needs the libraries. nvcc only requires the headers. -if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") - ginkgo_create_hip_test(exception_helpers ADDITIONAL_LIBRARIES roc::hipblas roc::hipsparse hip::hiprand roc::rocrand) -else() - ginkgo_create_hip_test(exception_helpers) -endif() +ginkgo_create_hip_test(exception_helpers) ginkgo_create_hip_test(scoped_device_id) diff --git a/test/test_install/CMakeLists.txt b/test/test_install/CMakeLists.txt index 513af67e923..fef09b19021 100644 --- a/test/test_install/CMakeLists.txt +++ b/test/test_install/CMakeLists.txt @@ -44,33 +44,14 @@ if(GINKGO_BUILD_CUDA) endif() if(GINKGO_BUILD_HIP) - find_package(HIP REQUIRED) - - set_source_files_properties(test_install.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE) - # The library was compiled with `-fPIC` when in shared library mode - if (GINKGO_BUILD_SHARED_LIBS) - set (GINKGO_PIC_OPTION "-fPIC") - set (GINKGO_CUDA_PIC_OPTION "-Xcompiler '-fPIC'") - else() - set (GINKGO_PIC_OPTION "$<$:-fPIC>") - endif() - if (CMAKE_CUDA_HOST_COMPILER) - set(TESTINSTALL_CUDA_HOST_COMPILER "-ccbin=${CMAKE_CUDA_HOST_COMPILER}") - endif() - hip_add_executable(test_install_hip test_install.cpp - HIPCC_OPTIONS "-std=c++14" - CLANG_OPTIONS "${GINKGO_PIC_OPTION}" - NVCC_OPTIONS "${GINKGO_CUDA_PIC_OPTION}" "${TESTINSTALL_CUDA_HOST_COMPILER}") + enable_language(HIP) + configure_file(test_install.cpp test_install.hip.cpp COPYONLY) + set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/test_install.hip.cpp PROPERTIES LANGUAGE HIP) + add_executable(test_install_hip ${CMAKE_CURRENT_BINARY_DIR}/test_install.hip.cpp) target_link_libraries(test_install_hip PRIVATE Ginkgo::ginkgo) target_compile_definitions(test_install_hip PRIVATE HAS_HIP=1) target_compile_definitions(test_install_hip PRIVATE HAS_REFERENCE=${HAS_REFERENCE}) - - # If we always link with CXX there is no RPATH issue - set_target_properties(test_install_hip PROPERTIES LINKER_LANGUAGE CXX) - - # Instead of using CXX, it's possible to instead use - # target_link_libraries(test_install_hip PRIVATE ${GINKGO_INSTALL_RPATH_FOR_HIP}) endif() # Installation step diff --git a/third_party/identify_stream_usage/CMakeLists.txt b/third_party/identify_stream_usage/CMakeLists.txt index c20fe43278b..42036baede1 100644 --- a/third_party/identify_stream_usage/CMakeLists.txt +++ b/third_party/identify_stream_usage/CMakeLists.txt @@ -27,22 +27,21 @@ if(GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") find_package(hip REQUIRED) set_source_files_properties(identify_stream_usage.hip.cpp test_default_stream_identification.hip.cpp - PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE) - hip_add_library(identify_stream_usage_hip - identify_stream_usage.hip.cpp - HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} - CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} - NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} - SHARED) - target_link_libraries(identify_stream_usage_hip PUBLIC hip::amdhip64 ${CMAKE_DL_LIBS}) + PROPERTIES LANGUAGE HIP) + add_library(identify_stream_usage_hip SHARED identify_stream_usage.hip.cpp) + target_link_libraries(identify_stream_usage_hip ${CMAKE_DL_LIBS}) + set_target_properties( + identify_stream_usage_hip + PROPERTIES # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + ) - hip_add_executable(test_stream_identification_hip - test_default_stream_identification.hip.cpp - HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} - NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS} - CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS}) + add_executable(test_stream_identification_hip + test_default_stream_identification.hip.cpp) add_test(NAME default_stream_identification_hip COMMAND test_stream_identification_hip) set_tests_properties(default_stream_identification_hip PROPERTIES ENVIRONMENT LD_PRELOAD=$) -endif() \ No newline at end of file +endif() From 46ddec40a182270c347a2982f65b40e81a4c1d2e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 15 Feb 2024 15:08:01 +0100 Subject: [PATCH 03/22] work around ROCm compiler bug --- hip/CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 56bcfec462a..8bcf2fb9e7d 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -91,6 +91,16 @@ foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES) configure_file( preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) + # The 3D indexing used in Jacobi kernel triggers an instruction selection bug in Debug builds + # Probably the same as https://github.com/llvm/llvm-project/issues/67574 + # Fixed in ROCm 6.0 https://github.com/ROCm/llvm-project/commit/cd7f574a1fd1d3f3e8b9c1cae61fa8133a51de5f + # and in LLVM trunk https://github.com/llvm/llvm-project/commit/cc3d2533cc2e4ea06981b86ede5087fbf801e789 + set_source_files_properties( + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + PROPERTIES + COMPILE_OPTIONS $<$:-O2>) list(APPEND GINKGO_HIP_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp From e90f8d74cd346b0d9c4abdfec5a7b5b85aa19f1f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 15 Feb 2024 15:25:19 +0100 Subject: [PATCH 04/22] autodetect based on HIP language --- cmake/autodetect_executors.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/autodetect_executors.cmake b/cmake/autodetect_executors.cmake index 757262f1ea1..d3ad2e3a6a1 100644 --- a/cmake/autodetect_executors.cmake +++ b/cmake/autodetect_executors.cmake @@ -31,7 +31,8 @@ if (NOT DEFINED GINKGO_BUILD_CUDA) endif() if (NOT DEFINED GINKGO_BUILD_HIP) - if(GINKGO_HIPCONFIG_PATH) + check_language(HIP) + if(CMAKE_HIP_COMPILER) message(STATUS "Enabling HIP executor") set(GINKGO_HAS_HIP ON) endif() From 7419034716d9564c9c9eac1c03ee22a1d8562bee Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 23 Feb 2024 15:15:14 +0100 Subject: [PATCH 05/22] fix exception_helpers includes --- hip/test/base/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt index 57c6f64bfd8..ae29eb782f1 100644 --- a/hip/test/base/CMakeLists.txt +++ b/hip/test/base/CMakeLists.txt @@ -12,5 +12,9 @@ ginkgo_create_hip_test(kernel_launch) ginkgo_create_test(lin_op RESOURCE_TYPE hipgpu) ginkgo_create_hip_test(math) ginkgo_create_test(memory RESOURCE_TYPE hipgpu) -ginkgo_create_hip_test(exception_helpers) +set(additional_libs) +if(hipfft_FOUND) + set(additional_libs hip::hipfft) +endif() +ginkgo_create_hip_test(exception_helpers ADDITIONAL_LIBRARIES roc::hipblas roc::hipsparse hip::hiprand roc::rocrand ${additional_libs}) ginkgo_create_hip_test(scoped_device_id) From 8ac6658b5042ca29e317ccff8af8c3a7fc9a609b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 8 Apr 2024 15:44:03 +0200 Subject: [PATCH 06/22] Revert "remove HIP CUDA support from source" This reverts commit c8ea7f47d2d350973ffb31628d887d259ffb662a. --- common/cuda_hip/components/atomic.hpp.inc | 17 +++++++-- core/device_hooks/hip_hooks.cpp | 16 ++++----- core/log/profiler_hook.cpp | 2 ++ core/log/profiler_hook.hpp | 4 +++ core/test/base/executor.cpp | 7 ++++ cuda/base/executor.cpp | 10 ++++++ devices/cuda/executor.cpp | 4 +++ devices/hip/executor.cpp | 4 +++ hip/base/config.hip.hpp | 8 +++++ hip/base/device.hip.cpp | 2 +- hip/base/device.hpp | 2 +- hip/base/executor.hip.cpp | 17 +++++++++ hip/base/stream.hip.cpp | 2 +- hip/base/thrust.hip.hpp | 10 +++++- hip/components/cooperative_groups.hip.hpp | 44 +++++++++++++++++++++-- hip/components/format_conversion.hip.hpp | 13 +++++++ hip/preconditioner/jacobi_kernels.hip.cpp | 6 +++- hip/test/base/hip_executor.hip.cpp | 12 +++++++ include/ginkgo/config.hpp.in | 4 +-- include/ginkgo/core/base/executor.hpp | 35 +++++++++++++++--- include/ginkgo/core/base/fwd_decls.hpp | 7 ++++ include/ginkgo/core/base/math.hpp | 4 +-- include/ginkgo/core/base/memory.hpp | 16 +++++---- include/ginkgo/core/base/stream.hpp | 4 +-- include/ginkgo/core/base/timer.hpp | 4 +-- include/ginkgo/core/matrix/csr.hpp | 4 +++ test/utils/executor.hpp | 2 +- 27 files changed, 222 insertions(+), 38 deletions(-) diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc index 00fed2db4a6..3d76cfdcb79 100644 --- a/common/cuda_hip/components/atomic.hpp.inc +++ b/common/cuda_hip/components/atomic.hpp.inc @@ -119,7 +119,7 @@ GKO_BIND_ATOMIC_ADD(unsigned long long int); GKO_BIND_ATOMIC_ADD(float); // AMD -#if defined(__HIPCC__) +#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC // the double atomicAdd is added after 4.3 @@ -157,7 +157,7 @@ GKO_BIND_ATOMIC_ADD(__half2); // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) -#endif // defined(__HIPCC__) +#endif // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC #undef GKO_BIND_ATOMIC_ADD @@ -179,7 +179,20 @@ __forceinline__ __device__ T atomic_max(T* __restrict__ addr, T val) GKO_BIND_ATOMIC_MAX(int); GKO_BIND_ATOMIC_MAX(unsigned int); + +#if !defined(__HIPCC__) || \ + (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC) + + +#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__) +// Only Compute Capability 3.5 and higher supports 64-bit atomicMax +GKO_BIND_ATOMIC_MAX(unsigned long long int); +#endif + +#else // Is HIP platform & on AMD hardware GKO_BIND_ATOMIC_MAX(unsigned long long int); +#endif // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) && + // GINKGO_HIP_PLATFORM_HCC) #undef GKO_BIND_ATOMIC_MAX diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index cb85019e542..a90691e1af4 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -33,7 +33,7 @@ void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); -HipAsyncAllocator::HipAsyncAllocator(ihipStream_t* stream) +HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream) GKO_NOT_COMPILED(hip); @@ -44,7 +44,7 @@ void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); bool HipAsyncAllocator::check_environment(int device_id, - ihipStream_t* stream) const + GKO_HIP_STREAM_STRUCT* stream) const GKO_NOT_COMPILED(hip); @@ -62,7 +62,7 @@ void HipUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); bool HipUnifiedAllocator::check_environment(int device_id, - ihipStream_t* stream) const + GKO_HIP_STREAM_STRUCT* stream) const GKO_NOT_COMPILED(hip); @@ -76,13 +76,13 @@ void HipHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); bool HipHostAllocator::check_environment(int device_id, - ihipStream_t* stream) const + GKO_HIP_STREAM_STRUCT* stream) const GKO_NOT_COMPILED(hip); std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, bool device_reset, - allocation_mode alloc_mode, ihipStream_t* stream) + allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream) { return std::shared_ptr( new HipExecutor(device_id, std::move(master), @@ -92,7 +92,7 @@ std::shared_ptr HipExecutor::create( std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, - std::shared_ptr alloc, ihipStream_t* stream) + std::shared_ptr alloc, GKO_HIP_STREAM_STRUCT* stream) { return std::shared_ptr( new HipExecutor(device_id, std::move(master), alloc, stream)); @@ -204,7 +204,7 @@ hip_stream::~hip_stream() {} hip_stream::hip_stream(hip_stream&&) GKO_NOT_COMPILED(hip); -ihipStream_t* hip_stream::get() const GKO_NOT_COMPILED(hip); +GKO_HIP_STREAM_STRUCT* hip_stream::get() const GKO_NOT_COMPILED(hip); HipTimer::HipTimer(std::shared_ptr exec) @@ -232,7 +232,7 @@ namespace hip { void reset_device(int device_id) GKO_NOT_COMPILED(hip); -void destroy_event(ihipEvent_t* event) GKO_NOT_COMPILED(hip); +void destroy_event(GKO_HIP_EVENT_STRUCT* event) GKO_NOT_COMPILED(hip); } // namespace hip diff --git a/core/log/profiler_hook.cpp b/core/log/profiler_hook.cpp index 8b5d84b2f0e..a8eef7668f2 100644 --- a/core/log/profiler_hook.cpp +++ b/core/log/profiler_hook.cpp @@ -391,9 +391,11 @@ std::shared_ptr ProfilerHook::create_for_executor( if (std::dynamic_pointer_cast(exec)) { return create_nvtx(); } +#if (GINKGO_HIP_PLATFORM_NVCC == 0) if (std::dynamic_pointer_cast(exec)) { return create_roctx(); } +#endif if (std::dynamic_pointer_cast(exec)) { return create_vtune(); } diff --git a/core/log/profiler_hook.hpp b/core/log/profiler_hook.hpp index b6a88c1d471..3f4baf80db1 100644 --- a/core/log/profiler_hook.hpp +++ b/core/log/profiler_hook.hpp @@ -110,6 +110,7 @@ class profiling_scope_guard : log::profiling_scope_guard { namespace hip { +#if (GINKGO_HIP_PLATFORM_NVCC == 0) class profiling_scope_guard : log::profiling_scope_guard { public: profiling_scope_guard(const char* name) @@ -118,6 +119,9 @@ class profiling_scope_guard : log::profiling_scope_guard { log::begin_roctx, log::end_nvtx} {} }; +#else +using profiling_scope_guard = log::default_profiling_scope_guard; +#endif } // namespace hip diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index 989a1137b14..dc4ea5aad63 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -336,10 +336,17 @@ TEST(Executor, CanVerifyMemory) ASSERT_EQ(false, gpu_dpcpp->memory_accessible(gpu_dpcpp_dup)); ASSERT_EQ(false, gpu_dpcpp_dup->memory_accessible(gpu_dpcpp)); } +#if GINKGO_HIP_PLATFORM_NVCC + ASSERT_EQ(true, hip->memory_accessible(cuda)); + ASSERT_EQ(true, cuda->memory_accessible(hip)); + ASSERT_EQ(true, hip_1->memory_accessible(cuda_1)); + ASSERT_EQ(true, cuda_1->memory_accessible(hip_1)); +#else ASSERT_EQ(false, hip->memory_accessible(cuda)); ASSERT_EQ(false, cuda->memory_accessible(hip)); ASSERT_EQ(false, hip_1->memory_accessible(cuda_1)); ASSERT_EQ(false, cuda_1->memory_accessible(hip_1)); +#endif ASSERT_EQ(true, omp->memory_accessible(omp2)); ASSERT_EQ(true, hip->memory_accessible(hip2)); ASSERT_EQ(true, cuda->memory_accessible(cuda2)); diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index 82bc56792e5..52a92132689 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -135,7 +135,17 @@ void CudaExecutor::raw_copy_to(const OmpExecutor*, size_type num_bytes, void CudaExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { +#if GINKGO_HIP_PLATFORM_NVCC == 1 + if (num_bytes > 0) { + detail::cuda_scoped_device_id_guard g(this->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS(cudaMemcpyPeerAsync( + dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(), + num_bytes, this->get_stream())); + this->synchronize(); + } +#else GKO_NOT_SUPPORTED(dest); +#endif } diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp index f16225c07ee..58261c318fb 100644 --- a/devices/cuda/executor.cpp +++ b/devices/cuda/executor.cpp @@ -28,7 +28,11 @@ bool CudaExecutor::verify_memory_to(const CudaExecutor* dest_exec) const bool CudaExecutor::verify_memory_to(const HipExecutor* dest_exec) const { +#if GINKGO_HIP_PLATFORM_NVCC + return this->get_device_id() == dest_exec->get_device_id(); +#else return false; +#endif } diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp index a8bab47dd61..6954e31b24b 100644 --- a/devices/hip/executor.cpp +++ b/devices/hip/executor.cpp @@ -25,7 +25,11 @@ bool HipExecutor::verify_memory_to(const HipExecutor* dest_exec) const bool HipExecutor::verify_memory_to(const CudaExecutor* dest_exec) const { +#if GINKGO_HIP_PLATFORM_NVCC + return this->get_device_id() == dest_exec->get_device_id(); +#else return false; +#endif } diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp index e0fb2d73210..fbad841fd0f 100644 --- a/hip/base/config.hip.hpp +++ b/hip/base/config.hip.hpp @@ -27,13 +27,21 @@ struct config { /** * The type containing a bitmask over all lanes of a warp. */ +#if GINKGO_HIP_PLATFORM_HCC using lane_mask_type = uint64; +#else // GINKGO_HIP_PLATFORM_NVCC + using lane_mask_type = uint32; +#endif /** * The number of threads within a HIP warp. Here, we use the definition from * `device_functions.h`. */ +#if GINKGO_HIP_PLATFORM_HCC static constexpr uint32 warp_size = warpSize; +#else // GINKGO_HIP_PLATFORM_NVCC + static constexpr uint32 warp_size = 32; +#endif /** * The bitmask of the entire warp. diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp index 2eaa92e8a66..58376c2175b 100644 --- a/hip/base/device.hip.cpp +++ b/hip/base/device.hip.cpp @@ -28,7 +28,7 @@ void reset_device(int device_id) } -void destroy_event(ihipEvent_t* event) +void destroy_event(GKO_HIP_EVENT_STRUCT* event) { GKO_ASSERT_NO_HIP_ERRORS(hipEventDestroy(event)); } diff --git a/hip/base/device.hpp b/hip/base/device.hpp index f00e75851ca..f0ceae0dc2b 100644 --- a/hip/base/device.hpp +++ b/hip/base/device.hpp @@ -18,7 +18,7 @@ void reset_device(int device_id); /** calls hipEventDestroy on the given event. */ -void destroy_event(ihipEvent_t* event); +void destroy_event(GKO_HIP_EVENT_STRUCT* event); /** returns hipDeviceProp.name for the given device */ diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 22d1ce0c1e2..2694ce4177f 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -131,7 +131,17 @@ void HipExecutor::raw_copy_to(const OmpExecutor*, size_type num_bytes, void HipExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { +#if GINKGO_HIP_PLATFORM_NVCC == 1 + if (num_bytes > 0) { + detail::hip_scoped_device_id_guard g(this->get_device_id()); + GKO_ASSERT_NO_HIP_ERRORS(hipMemcpyPeerAsync( + dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(), + num_bytes, this->get_stream())); + this->synchronize(); + } +#else GKO_NOT_SUPPORTED(dest); +#endif } @@ -212,9 +222,16 @@ void HipExecutor::set_gpu_property() this->get_device_id())); this->get_exec_info().max_workgroup_size = max_threads_per_block; this->get_exec_info().max_workitem_sizes = max_threads_per_block_dim; +#if GINKGO_HIP_PLATFORM_NVCC + this->get_exec_info().num_pu_per_cu = + convert_sm_ver_to_cores(this->get_exec_info().major, + this->get_exec_info().minor) / + kernels::hip::config::warp_size; +#else // In GCN (Graphics Core Next), each multiprocessor has 4 SIMD // Reference: https://en.wikipedia.org/wiki/Graphics_Core_Next this->get_exec_info().num_pu_per_cu = 4; +#endif // GINKGO_HIP_PLATFORM_NVCC this->get_exec_info().max_subgroup_size = kernels::hip::config::warp_size; } diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp index 47f4092cc82..93c1fc008d9 100644 --- a/hip/base/stream.hip.cpp +++ b/hip/base/stream.hip.cpp @@ -44,7 +44,7 @@ hip_stream::hip_stream(hip_stream&& other) {} -ihipStream_t* hip_stream::get() const { return stream_; } +GKO_HIP_STREAM_STRUCT* hip_stream::get() const { return stream_; } } // namespace gko diff --git a/hip/base/thrust.hip.hpp b/hip/base/thrust.hip.hpp index 008f1e0645b..2c0412fb67d 100644 --- a/hip/base/thrust.hip.hpp +++ b/hip/base/thrust.hip.hpp @@ -7,11 +7,15 @@ #include -#include #include #include +#if GINKGO_HIP_PLATFORM_HCC +#include +#else +#include +#endif namespace gko { @@ -21,7 +25,11 @@ namespace hip { inline auto thrust_policy(std::shared_ptr exec) { +#if GINKGO_HIP_PLATFORM_HCC return thrust::hip::par.on(exec->get_stream()); +#else + return thrust::cuda::par.on(exec->get_stream()); +#endif } diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index 2e5723366e6..247218a1457 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -167,8 +167,14 @@ class thread_block_tile { __device__ __forceinline__ unsigned size() const noexcept { return Size; } - __device__ __forceinline__ void sync() const noexcept {} + __device__ __forceinline__ void sync() const noexcept + { +#if GINKGO_HIP_PLATFORM_NVCC + __syncwarp(data_.mask); +#endif // GINKGO_HIP_PLATFORM_NVCC + } +#if GINKGO_HIP_PLATFORM_HCC #define GKO_BIND_SHFL(ShflOp, ValueType, SelectorType) \ __device__ __forceinline__ ValueType ShflOp( \ ValueType var, SelectorType selector) const noexcept \ @@ -178,6 +184,17 @@ class thread_block_tile { static_assert(true, \ "This assert is used to counter the false positive extra " \ "semi-colon warnings") +#else +#define GKO_BIND_SHFL(ShflOp, ValueType, SelectorType) \ + __device__ __forceinline__ ValueType ShflOp( \ + ValueType var, SelectorType selector) const noexcept \ + { \ + return __##ShflOp##_sync(data_.mask, var, selector, Size); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") +#endif GKO_BIND_SHFL(shfl, int32, int32); GKO_BIND_SHFL(shfl, float, int32); @@ -205,11 +222,15 @@ class thread_block_tile { */ __device__ __forceinline__ int any(int predicate) const noexcept { +#if GINKGO_HIP_PLATFORM_HCC if (Size == config::warp_size) { return __any(predicate); } else { return (__ballot(predicate) & data_.mask) != 0; } +#else + return __any_sync(data_.mask, predicate); +#endif } /** @@ -218,11 +239,15 @@ class thread_block_tile { */ __device__ __forceinline__ int all(int predicate) const noexcept { +#if GINKGO_HIP_PLATFORM_HCC if (Size == config::warp_size) { return __all(predicate); } else { return (__ballot(predicate) & data_.mask) == data_.mask; } +#else + return __all_sync(data_.mask, predicate); +#endif } /** @@ -235,11 +260,19 @@ class thread_block_tile { __device__ __forceinline__ config::lane_mask_type ballot( int predicate) const noexcept { +#if GINKGO_HIP_PLATFORM_HCC if (Size == config::warp_size) { return __ballot(predicate); } else { return (__ballot(predicate) & data_.mask) >> data_.lane_offset; } +#else + if (Size == config::warp_size) { + return __ballot_sync(data_.mask, predicate); + } else { + return __ballot_sync(data_.mask, predicate) >> data_.lane_offset; + } +#endif } private: @@ -312,9 +345,14 @@ class enable_extended_shuffle : public Group { } // namespace detail +// Implementing this as a using directive messes up with SFINAE for some reason, +// probably a bug in NVCC. If it is a complete type, everything works fine. template -using thread_block_tile = - detail::enable_extended_shuffle>; +struct thread_block_tile + : detail::enable_extended_shuffle> { + using detail::enable_extended_shuffle< + detail::thread_block_tile>::enable_extended_shuffle; +}; // Only support tile_partition with 1, 2, 4, 8, 16, 32, 64 (hip). diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp index a5b84533ddd..59c0405a874 100644 --- a/hip/components/format_conversion.hip.hpp +++ b/hip/components/format_conversion.hip.hpp @@ -80,12 +80,25 @@ __host__ size_type calculate_nwarps(std::shared_ptr exec, size_type nwarps_in_hip = exec->get_num_multiprocessor() * exec->get_num_warps_per_sm() * config::warp_size / subwarp_size; +#if GINKGO_HIP_PLATFORM_NVCC + size_type multiple = 8; + if (nnz >= 2e8) { + multiple = 2048; + } else if (nnz >= 2e7) { + multiple = 512; + } else if (nnz >= 2e6) { + multiple = 128; + } else if (nnz >= 2e5) { + multiple = 32; + } +#else size_type multiple = 2; if (nnz >= 1e7) { multiple = 32; } else if (nnz >= 1e5) { multiple = 8; } +#endif // GINKGO_HIP_PLATFORM_NVCC #ifdef GINKGO_BENCHMARK_ENABLE_TUNING if (_tuning_flag) { multiple = _tuned_value; diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index 83af45132c4..1646a7fb376 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -33,8 +33,12 @@ namespace hip { namespace jacobi { -// a total of 16 warps (1024 threads) +// a total of 32/16 warps (1024 threads) +#if GINKGO_HIP_PLATFORM_HCC constexpr int default_num_warps = 16; +#else // GINKGO_HIP_PLATFORM_NVCC +constexpr int default_num_warps = 32; +#endif // with current architectures, at most 32 warps can be scheduled per SM (and // current GPUs have at most 84 SMs) constexpr int default_grid_size = 32 * 32 * 128; diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp index 908f1c06c3e..cfdfc3122fd 100644 --- a/hip/test/base/hip_executor.hip.cpp +++ b/hip/test/base/hip_executor.hip.cpp @@ -164,7 +164,11 @@ TEST_F(HipExecutor, FailsWhenOverallocating) __global__ void check_data(int* data) { if (data[0] != 3 || data[1] != 8) { +#if GINKGO_HIP_PLATFORM_HCC asm("s_trap 0x02;"); +#else // GINKGO_HIP_PLATFORM_NVCC + asm("trap;"); +#endif } } @@ -184,7 +188,11 @@ TEST_F(HipExecutor, CopiesDataToHip) __global__ void check_data2(int* data) { if (data[0] != 4 || data[1] != 8) { +#if GINKGO_HIP_PLATFORM_HCC asm("s_trap 0x02;"); +#else // GINKGO_HIP_PLATFORM_NVCC + asm("trap;"); +#endif } } @@ -308,7 +316,11 @@ TEST_F(HipExecutor, ExecInfoSetsCorrectProperties) &max_threads_per_block, hipDeviceAttributeMaxThreadsPerBlock, dev_id)); GKO_ASSERT_NO_HIP_ERRORS( hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, dev_id)); +#if GINKGO_HIP_PLATFORM_NVCC + auto num_cores = convert_sm_ver_to_cores(major, minor); +#else auto num_cores = warp_size * 4; +#endif ASSERT_EQ(hip->get_major_version(), major); ASSERT_EQ(hip->get_minor_version(), minor); diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 34d06d8d63c..329918399d6 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -41,10 +41,10 @@ /* What is HIP compiled for, hcc or nvcc? */ // clang-format off -#define GINKGO_HIP_PLATFORM_HCC 1 +#define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@ -#define GINKGO_HIP_PLATFORM_NVCC 0 +#define GINKGO_HIP_PLATFORM_NVCC @GINKGO_HIP_PLATFORM_NVCC@ // clang-format on diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 6b3f6d3755d..d7db35d2e3c 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -63,10 +63,34 @@ enum class log_propagation_mode { enum class allocation_mode { device, unified_global, unified_host }; +#ifdef NDEBUG + +// When in release, prefer device allocations constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device; constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device; +#else + +// When in debug, always UM allocations. +constexpr allocation_mode default_cuda_alloc_mode = + allocation_mode::unified_global; + +#if (GINKGO_HIP_PLATFORM_HCC == 1) + +// HIP on AMD GPUs does not support UM, so always prefer device allocations. +constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device; + +#else + +// HIP on NVIDIA GPUs supports UM, so prefer UM allocations. +constexpr allocation_mode default_hip_alloc_mode = + allocation_mode::unified_global; + +#endif + +#endif + } // namespace gko @@ -1694,13 +1718,13 @@ class HipExecutor : public detail::ExecutorBase, static std::shared_ptr create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode = default_hip_alloc_mode, - ihipStream_t* stream = nullptr); + GKO_HIP_STREAM_STRUCT* stream = nullptr); static std::shared_ptr create( int device_id, std::shared_ptr master, std::shared_ptr alloc = std::make_shared(), - ihipStream_t* stream = nullptr); + GKO_HIP_STREAM_STRUCT* stream = nullptr); std::shared_ptr get_master() noexcept override; @@ -1806,7 +1830,7 @@ class HipExecutor : public detail::ExecutorBase, return this->get_exec_info().closest_pu_ids; } - ihipStream_t* get_stream() const { return stream_; } + GKO_HIP_STREAM_STRUCT* get_stream() const { return stream_; } protected: void set_gpu_property(); @@ -1814,7 +1838,8 @@ class HipExecutor : public detail::ExecutorBase, void init_handles(); HipExecutor(int device_id, std::shared_ptr master, - std::shared_ptr alloc, ihipStream_t* stream) + std::shared_ptr alloc, + GKO_HIP_STREAM_STRUCT* stream) : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream} { this->get_exec_info().device_id = device_id; @@ -1851,7 +1876,7 @@ class HipExecutor : public detail::ExecutorBase, handle_manager hipblas_handle_; handle_manager hipsparse_handle_; std::shared_ptr alloc_; - ihipStream_t* stream_; + GKO_HIP_STREAM_STRUCT* stream_; }; diff --git a/include/ginkgo/core/base/fwd_decls.hpp b/include/ginkgo/core/base/fwd_decls.hpp index 606328f45a4..f7e446d7bf2 100644 --- a/include/ginkgo/core/base/fwd_decls.hpp +++ b/include/ginkgo/core/base/fwd_decls.hpp @@ -21,8 +21,15 @@ struct hipblasContext; struct hipsparseContext; +#if GINKGO_HIP_PLATFORM_HCC struct ihipStream_t; struct ihipEvent_t; +#define GKO_HIP_STREAM_STRUCT ihipStream_t +#define GKO_HIP_EVENT_STRUCT ihipEvent_t +#else +#define GKO_HIP_STREAM_STRUCT CUstream_st +#define GKO_HIP_EVENT_STRUCT CUevent_st +#endif // after intel/llvm September'22 release, which uses major version 6, they diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index b81bea5e145..30b0da475d0 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -616,7 +616,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr int64 ceildiv(int64 num, int64 den) } -#if defined(__HIPCC__) +#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC /** @@ -794,7 +794,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T one(const T&) } -#endif // defined(__HIPCC__) +#endif // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC #undef GKO_BIND_ZERO_ONE diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp index 78fe2cee829..7f8da044fd9 100644 --- a/include/ginkgo/core/base/memory.hpp +++ b/include/ginkgo/core/base/memory.hpp @@ -77,7 +77,8 @@ class HipAllocatorBase : public Allocator { * @return true if and only if the allocator can be used by HipExecutor in * the given environment. */ - virtual bool check_environment(int device_id, ihipStream_t* stream) const + virtual bool check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const { return true; } @@ -185,13 +186,14 @@ class HipAsyncAllocator : public HipAllocatorBase { void deallocate(void* ptr) override; - HipAsyncAllocator(ihipStream_t* stream); + HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream); protected: - bool check_environment(int device_id, ihipStream_t* stream) const override; + bool check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const override; private: - ihipStream_t* stream_; + GKO_HIP_STREAM_STRUCT* stream_; }; @@ -209,7 +211,8 @@ class HipUnifiedAllocator : public HipAllocatorBase, public CpuAllocatorBase { HipUnifiedAllocator(int device_id, unsigned int flags); protected: - bool check_environment(int device_id, ihipStream_t* stream) const override; + bool check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const override; private: int device_id_; @@ -229,7 +232,8 @@ class HipHostAllocator : public HipAllocatorBase, public CpuAllocatorBase { HipHostAllocator(int device_id); protected: - bool check_environment(int device_id, ihipStream_t* stream) const override; + bool check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const override; private: int device_id_; diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp index f95d8971a3a..22af70ac14b 100644 --- a/include/ginkgo/core/base/stream.hpp +++ b/include/ginkgo/core/base/stream.hpp @@ -89,10 +89,10 @@ class hip_stream { * Returns the native HIP stream handle. * In an empty hip_stream, this will return nullptr. */ - ihipStream_t* get() const; + GKO_HIP_STREAM_STRUCT* get() const; private: - ihipStream_t* stream_; + GKO_HIP_STREAM_STRUCT* stream_; int device_id_; }; diff --git a/include/ginkgo/core/base/timer.hpp b/include/ginkgo/core/base/timer.hpp index 7b5d2aed5b3..8008cecfb94 100644 --- a/include/ginkgo/core/base/timer.hpp +++ b/include/ginkgo/core/base/timer.hpp @@ -54,7 +54,7 @@ class time_point { type type_; union data_union { CUevent_st* cuda_event; - ihipEvent_t* hip_event; + GKO_HIP_EVENT_STRUCT* hip_event; sycl::event* dpcpp_event; std::chrono::steady_clock::time_point chrono; @@ -206,7 +206,7 @@ class HipTimer : public Timer { private: int device_id_; - ihipStream_t* stream_; + GKO_HIP_STREAM_STRUCT* stream_; }; diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 153ebbd1730..f27fe12a934 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -478,6 +478,7 @@ class Csr : public EnableLinOp>, multiple = 32; } } +#if GINKGO_HIP_PLATFORM_HCC if (!cuda_strategy_) { multiple = 8; if (nnz >= static_cast(1e7)) { @@ -486,6 +487,7 @@ class Csr : public EnableLinOp>, multiple = 16; } } +#endif // GINKGO_HIP_PLATFORM_HCC auto nwarps = nwarps_ * multiple; return min(ceildiv(nnz, warp_size_), nwarps); @@ -603,10 +605,12 @@ class Csr : public EnableLinOp>, nnz_limit = intel_nnz_limit; row_len_limit = intel_row_len_limit; } +#if GINKGO_HIP_PLATFORM_HCC if (!cuda_strategy_) { nnz_limit = amd_nnz_limit; row_len_limit = amd_row_len_limit; } +#endif // GINKGO_HIP_PLATFORM_HCC auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master(); const bool is_mtx_on_host{host_mtx_exec == mtx_row_ptrs.get_executor()}; diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 3899f2fefb7..21c40a70c0a 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -63,7 +63,7 @@ inline void init_executor(std::shared_ptr ref, inline void init_executor(std::shared_ptr ref, std::shared_ptr& exec, - ihipStream_t* stream = nullptr) + GKO_HIP_STREAM_STRUCT* stream = nullptr) { if (gko::HipExecutor::get_num_devices() == 0) { throw std::runtime_error{"No suitable HIP devices"}; From d972798093921fb4a33684c5096c4073be4a2a28 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 8 Apr 2024 15:46:57 +0200 Subject: [PATCH 07/22] fix INSTALL.md documentation Co-authored-by: Yuhsiang M. Tsai --- INSTALL.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 54889f7e335..087b4000f82 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -51,9 +51,9 @@ Ginkgo adds the following additional switches to control what is being built: * `-DGINKGO_BUILD_HIP={ON, OFF}` builds optimized HIP versions of the kernels (requires HIP), default is `ON` if an installation of HIP could be detected, `OFF` otherwise. -* `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. If system HWLOC - is not found, Ginkgo will try to build it. Default is `ON` on Linux. Ginkgo - does not support HWLOC on Windows/MacOS, so the default is `OFF` on Windows/MacOS. +* `-DGINKGO_HIP_AMDGPU="gpuarch1;gpuarch2"` the amdgpu_target(s) variable + passed to hipcc for the `hcc` HIP backend. The default is none (auto). +* `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. Default is `OFF`. * `-DGINKGO_BUILD_DOC={ON, OFF}` creates an HTML version of Ginkgo's documentation from inline comments in the code. The default is `OFF`. * `-DGINKGO_DOC_GENERATE_EXAMPLES={ON, OFF}` generates the documentation of examples From 2c80e27d0bcda5a7fbb18884c320541f90d839c8 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 8 Apr 2024 16:28:36 +0200 Subject: [PATCH 08/22] improve usage of GINKGO_HIP_PLATFORM --- INSTALL.md | 16 +++++++--------- cmake/build_type_helpers.cmake | 2 +- cmake/create_test.cmake | 2 +- cmake/hip.cmake | 12 +++++++++++- core/CMakeLists.txt | 2 +- third_party/identify_stream_usage/CMakeLists.txt | 2 +- 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 087b4000f82..9719bdfb920 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -51,8 +51,8 @@ Ginkgo adds the following additional switches to control what is being built: * `-DGINKGO_BUILD_HIP={ON, OFF}` builds optimized HIP versions of the kernels (requires HIP), default is `ON` if an installation of HIP could be detected, `OFF` otherwise. -* `-DGINKGO_HIP_AMDGPU="gpuarch1;gpuarch2"` the amdgpu_target(s) variable - passed to hipcc for the `hcc` HIP backend. The default is none (auto). +* `-DCMAKE_HIP_ARCHITECTURES="gpuarch1;gpuarch2"` the AMDGPU targets to be passed to the compiler. + If empty, compiler chooses based on the available GPUs. * `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. Default is `OFF`. * `-DGINKGO_BUILD_DOC={ON, OFF}` creates an HTML version of Ginkgo's documentation from inline comments in the code. The default is `OFF`. @@ -181,15 +181,13 @@ imposed by the `HIP` tool suite. The variables are the following: #### HIP platform detection of AMD and NVIDIA -By default, Ginkgo uses the output of `/opt/rocm/hip/bin/hipconfig --platform` -to select the backend. The accepted values are either `hcc` (`amd` with ROCM >= -4.1) or `nvcc` (`nvidia` with ROCM >= 4.1). When on an AMD or NVIDIA system, -this should output the correct platform by default. When on a system without -GPUs, this should output `hcc` by default. To change this value, export the -environment variable `HIP_PLATFORM` like so: +Ginkgo relies on CMake to decide which compiler to use for HIP. +To choose `nvcc` instead of the default ROCm `clang++`, set the corresponding +environment variable: ```bash -export HIP_PLATFORM=nvcc # or nvidia for ROCM >= 4.1 +export HIPCXX=nvcc ``` +Note that this option is currently not being tested in our CI pipelines. ### Third party libraries and packages diff --git a/cmake/build_type_helpers.cmake b/cmake/build_type_helpers.cmake index 09fc4a7dc9b..0c6b386bb21 100644 --- a/cmake/build_type_helpers.cmake +++ b/cmake/build_type_helpers.cmake @@ -93,7 +93,7 @@ foreach(_LANG IN LISTS ENABLED_LANGUAGES ITEMS "HIP") set(${PROJECT_NAME}_${_LANG}_${_TYPE}_SUPPORTED FALSE) endif() if(${PROJECT_NAME}_${_LANG}_${_TYPE}_SUPPORTED) - if(_LANG STREQUAL "HIP" AND GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") + if(_LANG STREQUAL "HIP" AND GINKGO_HIP_PLATFORM_NVIDIA) set(CMAKE_${_LANG}_FLAGS_${_TYPE} ${${PROJECT_NAME}_NVCC_${_TYPE}_COMPILER_FLAGS} CACHE STRING "Flags used by the ${_LANG} compiler during ${_TYPE} builds." FORCE diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 8214afc50ba..4a349485579 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -104,7 +104,7 @@ function(ginkgo_add_test test_name test_target_name) if (GINKGO_TEST_NONDEFAULT_STREAM AND GINKGO_BUILD_CUDA) set(test_preload $:${test_preload}) endif() - if (GINKGO_TEST_NONDEFAULT_STREAM AND GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") + if (GINKGO_TEST_NONDEFAULT_STREAM AND GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM_AMD) set(test_preload $:${test_preload}) endif() if(test_preload) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 9a4ed90b712..bcfa0b5a4db 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -1,4 +1,14 @@ enable_language(HIP) +if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA") + set(GINKGO_HIP_PLATFORM "nvidia") + set(GINKGO_HIP_PLATFORM_NVIDIA ON) + set(GINKGO_HIP_PLATFORM_AMD OFF) +else() + set(GINKGO_HIP_PLATFORM "amd") + set(GINKGO_HIP_PLATFORM_NVIDIA OFF) + set(GINKGO_HIP_PLATFORM_AMD ON) +endif() + if(NOT DEFINED ROCM_PATH) if(DEFINED ENV{ROCM_PATH}) @@ -93,7 +103,7 @@ execute_process( ## Setup all CMAKE variables to find HIP and its dependencies set(GINKGO_HIP_MODULE_PATH "${HIP_PATH}/cmake") list(APPEND CMAKE_MODULE_PATH "${GINKGO_HIP_MODULE_PATH}") -if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") +if (GINKGO_HIP_PLATFORM_AND) list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake") endif() list(APPEND CMAKE_PREFIX_PATH diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 449b8da4584..f44e49338fd 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -142,7 +142,7 @@ endif() # Since we have a public dependency on HIP, this dependency appears # here as well -if(GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") +if(GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM_AMD) list(APPEND GKO_RPATH_ADDITIONS "${HIP_PATH}/lib") endif() diff --git a/third_party/identify_stream_usage/CMakeLists.txt b/third_party/identify_stream_usage/CMakeLists.txt index 42036baede1..cb316205953 100644 --- a/third_party/identify_stream_usage/CMakeLists.txt +++ b/third_party/identify_stream_usage/CMakeLists.txt @@ -23,7 +23,7 @@ if(GINKGO_BUILD_CUDA) ENVIRONMENT LD_PRELOAD=$) endif() -if(GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") +if(GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM_AMD) find_package(hip REQUIRED) set_source_files_properties(identify_stream_usage.hip.cpp test_default_stream_identification.hip.cpp From 60e363ae9f34df71b9c0bc0d447f87a3e883fdef Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 8 Apr 2024 16:30:14 +0200 Subject: [PATCH 09/22] disable simultaneous cuda/hip-cuda compilation in CI --- .gitlab-ci.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fa8135e0763..f1238d4b7fd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -106,7 +106,6 @@ build/cuda101/nompi/clang/cuda_wo_omp/release/shared: variables: CXX_COMPILER: "clang++" BUILD_CUDA: "ON" - BUILD_HIP: "ON" BUILD_HWLOC: "OFF" BUILD_TYPE: "Release" CUDA_ARCH: 35 @@ -139,7 +138,6 @@ build/cuda101/nompi/clang/all/release/static: CXX_COMPILER: "clang++" BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_HIP: "ON" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "OFF" CUDA_ARCH: 35 @@ -190,7 +188,6 @@ build/cuda102/nompi/gcc/all/debug/shared: variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_HIP: "ON" BUILD_TYPE: "Debug" FAST_TESTS: "ON" BUILD_HWLOC: "OFF" @@ -207,7 +204,6 @@ build/cuda102/nompi/clang/all/release/static: CXX_COMPILER: "clang++" BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_HIP: "ON" BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "OFF" CUDA_ARCH: 35 @@ -686,7 +682,6 @@ warnings: variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_HIP: "ON" CXX_FLAGS: "-Werror=pedantic;-pedantic-errors" allow_failure: yes @@ -701,7 +696,6 @@ no-circular-deps: variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_HIP: "ON" EXTRA_CMAKE_FLAGS: '-DGINKGO_CHECK_CIRCULAR_DEPS=on' allow_failure: no @@ -729,7 +723,6 @@ clang-tidy: variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_HIP: "ON" EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_CLANG_TIDY=ON' allow_failure: yes From d1fb989c8fbf3849ba9ab64b2e1a5b51cc8559d2 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 8 Apr 2024 16:34:45 +0200 Subject: [PATCH 10/22] add CMake minimum requirement for HIP --- README.md | 1 + cmake/hip.cmake | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 49d5f23f300..f34582539a1 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,7 @@ The Ginkgo HIP module has the following __additional__ requirements: * _AMD_ backend (using the `clang` compiler) * _10.1 <= CUDA < 11_ backend * if the hipFFT package is available, it is used to implement the FFT LinOps. +* _cmake 3.21+_ The Ginkgo DPC++(SYCL) module has the following __additional__ requirements: diff --git a/cmake/hip.cmake b/cmake/hip.cmake index bcfa0b5a4db..a42407a0673 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -1,3 +1,4 @@ +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) enable_language(HIP) if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA") set(GINKGO_HIP_PLATFORM "nvidia") From a8c859e428e128a13c0b47d979fe3b0a0fae9b5c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 8 Apr 2024 19:58:52 +0200 Subject: [PATCH 11/22] fix HIP definitions --- cmake/hip.cmake | 2 ++ cmake/hip_path.cmake | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index a42407a0673..83abf468e26 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -4,10 +4,12 @@ if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA") set(GINKGO_HIP_PLATFORM "nvidia") set(GINKGO_HIP_PLATFORM_NVIDIA ON) set(GINKGO_HIP_PLATFORM_AMD OFF) + set(GINKGO_HIP_PLATFORM_NVCC 1) else() set(GINKGO_HIP_PLATFORM "amd") set(GINKGO_HIP_PLATFORM_NVIDIA OFF) set(GINKGO_HIP_PLATFORM_AMD ON) + set(GINKGO_HIP_PLATFORM_HCC 1) endif() diff --git a/cmake/hip_path.cmake b/cmake/hip_path.cmake index 269f8403fd3..58fcd3db447 100644 --- a/cmake/hip_path.cmake +++ b/cmake/hip_path.cmake @@ -11,3 +11,8 @@ find_program(GINKGO_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin") if(GINKGO_HIPCONFIG_PATH) message(STATUS "Found hipconfig: ${GINKGO_HIPCONFIG_PATH}") endif() + +# We keep using NVCC/HCC for consistency with previous releases even if AMD +# updated everything to use NVIDIA/AMD in ROCM 4.1 +set(GINKGO_HIP_PLATFORM_NVCC 0) +set(GINKGO_HIP_PLATFORM_HCC 0) From 4bfb545c9322442f224cd812db3734517b0f07fb Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 8 Apr 2024 20:48:09 +0200 Subject: [PATCH 12/22] disable HIP in last job --- .gitlab-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f1238d4b7fd..3521d5f1b2e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -123,7 +123,6 @@ build/cuda101/openmpi/gcc/all/debug/shared: BUILD_CUDA: "ON" BUILD_MPI: "ON" MPI_AS_ROOT: "ON" - BUILD_HIP: "ON" BUILD_TYPE: "Debug" BUILD_PAPI_SDE: "ON" CUDA_ARCH: 35 From 270cf5300a01acaf158e1add30828735732e77b6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 9 Apr 2024 15:13:48 +0200 Subject: [PATCH 13/22] review updates - fix incorrect CMake version requirement - remove explicit RPATH additions - clean up configure log Co-authored-by: Yuhsiang M. Tsai Co-authored-by: Marcel Koch --- cmake/GinkgoConfig.cmake.in | 3 +++ cmake/hip.cmake | 2 +- core/CMakeLists.txt | 12 +----------- hip/CMakeLists.txt | 7 +------ hip/get_info.cmake | 6 ++---- 5 files changed, 8 insertions(+), 22 deletions(-) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 4b4e8d77338..23b1d25adc1 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -211,6 +211,9 @@ if(GINKGO_BUILD_CUDA) _ginkgo_check_compiler(CUDA) _ginkgo_check_compiler(CUDA_HOST) endif() +if(GINKGO_BUILD_HIP) + _ginkgo_check_compiler(HIP) +endif() include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 83abf468e26..327375bfe76 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) enable_language(HIP) if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA") set(GINKGO_HIP_PLATFORM "nvidia") diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index f44e49338fd..1b5f9237612 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -116,10 +116,6 @@ ginkgo_compile_features(ginkgo) add_library(Ginkgo::ginkgo ALIAS ginkgo) target_link_libraries(ginkgo PUBLIC ginkgo_device ginkgo_omp ginkgo_cuda ginkgo_reference ginkgo_hip ginkgo_dpcpp) - -# The PAPI dependency needs to be exposed to the user. -set(GKO_RPATH_ADDITIONS "") - if(GINKGO_HAVE_PAPI_SDE) target_link_libraries(ginkgo PUBLIC PAPI::PAPI_SDE) endif() @@ -140,14 +136,8 @@ if(GINKGO_BUILD_MPI) target_link_libraries(ginkgo PUBLIC MPI::MPI_CXX) endif() -# Since we have a public dependency on HIP, this dependency appears -# here as well -if(GINKGO_BUILD_HIP AND GINKGO_HIP_PLATFORM_AMD) - list(APPEND GKO_RPATH_ADDITIONS "${HIP_PATH}/lib") -endif() - ginkgo_default_includes(ginkgo) -ginkgo_install_library(ginkgo "${GKO_RPATH_ADDITIONS}") +ginkgo_install_library(ginkgo) if(GINKGO_CHECK_CIRCULAR_DEPS) ginkgo_check_headers(ginkgo "") diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 8bcf2fb9e7d..738d9d09862 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -129,15 +129,10 @@ endif() target_compile_options(ginkgo_hip PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) -# Try to find everything in /opt/rocm/lib first. -set(GKO_HIP_RPATH "${ROCM_PATH}/lib" ) -list(APPEND GKO_HIP_RPATH "${HIP_PATH}/lib") -list(APPEND GKO_HIP_RPATH "${HIPBLAS_PATH}/lib" "${HIPRAND_PATH}/lib" - "${HIPSPARSE_PATH}/lib" "${ROCRAND_PATH}/lib") ginkgo_compile_features(ginkgo_hip) ginkgo_default_includes(ginkgo_hip) -ginkgo_install_library(ginkgo_hip "${GKO_HIP_RPATH}") +ginkgo_install_library(ginkgo_hip) if (GINKGO_CHECK_CIRCULAR_DEPS) ginkgo_check_headers(ginkgo_hip GKO_COMPILING_HIP) diff --git a/hip/get_info.cmake b/hip/get_info.cmake index 234f65642cc..dc3068e4f81 100644 --- a/hip/get_info.cmake +++ b/hip/get_info.cmake @@ -1,12 +1,10 @@ ginkgo_print_module_header(${detailed_log} "HIP") -ginkgo_print_foreach_variable(${detailed_log} - "GINKGO_HIPCONFIG_PATH") ginkgo_print_module_footer(${detailed_log} "HIP variables:") +ginkgo_print_flags(${detailed_log} "CMAKE_HIP_FLAGS") +ginkgo_print_flags(${detailed_log} "CMAKE_HIP_COMPILER") ginkgo_print_foreach_variable(${detailed_log} "HIP_VERSION;HIP_PATH;ROCM_PATH" "HIP_PLATFORM;HIP_ROOT_DIR;HIP_RUNTIME;HIPBLAS_PATH;HIPSPARSE_PATH" "HIPRAND_PATH;ROCRAND_PATH;HIP_CLANG_INCLUDE_PATH" "HIP_HIPCONFIG_EXECUTABLE") -ginkgo_print_flags(${detailed_log} "CMAKE_HIP_FLAGS") -ginkgo_print_flags(${detailed_log} "CMAKE_HIP_COMPILER") ginkgo_print_module_footer(${detailed_log} "") From 382e1f0633a07ff976ac2a470368a66cf5b1d9e1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 9 Apr 2024 15:28:28 +0200 Subject: [PATCH 14/22] set CUDA and HIP standard explicitly --- cmake/build_helpers.cmake | 4 ++++ cmake/create_test.cmake | 4 ++++ test/test_install/CMakeLists.txt | 2 ++ 3 files changed, 10 insertions(+) diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake index 65c987b3490..a1a1735f84e 100644 --- a/cmake/build_helpers.cmake +++ b/cmake/build_helpers.cmake @@ -19,6 +19,10 @@ endfunction() function(ginkgo_compile_features name) target_compile_features("${name}" PUBLIC cxx_std_14) + # we set these properties regardless of the enabled backends, + # because unknown properties are ignored + set_target_properties("${name}" PROPERTIES HIP_STANDARD 14) + set_target_properties("${name}" PROPERTIES CUDA_STANDARD 14) if(GINKGO_WITH_CLANG_TIDY AND GINKGO_CLANG_TIDY_PATH) set_property(TARGET "${name}" PROPERTY CXX_CLANG_TIDY "${GINKGO_CLANG_TIDY_PATH};-checks=*") endif() diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 4a349485579..ecb75d5da39 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -33,6 +33,10 @@ function(ginkgo_set_test_target_properties test_target_name test_library_suffix) target_link_libraries(${test_target_name} PRIVATE ginkgo_gtest_main${test_library_suffix}) endif() target_compile_features(${test_target_name} PUBLIC cxx_std_14) + # we set these properties regardless of the enabled backends, + # because unknown properties are ignored + set_target_properties(${test_target_name} PROPERTIES HIP_STANDARD 14) + set_target_properties(${test_target_name} PROPERTIES CUDA_STANDARD 14) target_include_directories(${test_target_name} PRIVATE ${Ginkgo_BINARY_DIR} ${set_properties_ADDITIONAL_INCLUDES}) target_link_libraries(${test_target_name} PRIVATE ginkgo GTest::GTest ${set_properties_ADDITIONAL_LIBRARIES}) endfunction() diff --git a/test/test_install/CMakeLists.txt b/test/test_install/CMakeLists.txt index fef09b19021..ee19b8d030e 100644 --- a/test/test_install/CMakeLists.txt +++ b/test/test_install/CMakeLists.txt @@ -38,6 +38,7 @@ if(GINKGO_BUILD_CUDA) enable_language(CUDA) configure_file(test_install.cpp test_install.cu COPYONLY) add_executable(test_install_cuda ${CMAKE_CURRENT_BINARY_DIR}/test_install.cu) + set_target_properties(test_install_cuda PROPERTIES CUDA_STANDARD 14) target_compile_definitions(test_install_cuda PRIVATE HAS_CUDA=1) target_compile_definitions(test_install_cuda PRIVATE HAS_REFERENCE=${HAS_REFERENCE}) target_link_libraries(test_install_cuda PRIVATE Ginkgo::ginkgo) @@ -48,6 +49,7 @@ if(GINKGO_BUILD_HIP) configure_file(test_install.cpp test_install.hip.cpp COPYONLY) set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/test_install.hip.cpp PROPERTIES LANGUAGE HIP) add_executable(test_install_hip ${CMAKE_CURRENT_BINARY_DIR}/test_install.hip.cpp) + set_target_properties(test_install_hip PROPERTIES HIP_STANDARD 14) target_link_libraries(test_install_hip PRIVATE Ginkgo::ginkgo) target_compile_definitions(test_install_hip PRIVATE HAS_HIP=1) From 5a0a4760d8bb43814c522c2fcbd7af8e3c6b082e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 9 Apr 2024 18:33:41 +0200 Subject: [PATCH 15/22] remove hipconfig detection --- cmake/hip_path.cmake | 5 ----- hip/get_info.cmake | 6 +----- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/cmake/hip_path.cmake b/cmake/hip_path.cmake index 58fcd3db447..a9f418cb3bd 100644 --- a/cmake/hip_path.cmake +++ b/cmake/hip_path.cmake @@ -7,11 +7,6 @@ if(NOT DEFINED HIP_PATH) endif() endif() -find_program(GINKGO_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin") -if(GINKGO_HIPCONFIG_PATH) - message(STATUS "Found hipconfig: ${GINKGO_HIPCONFIG_PATH}") -endif() - # We keep using NVCC/HCC for consistency with previous releases even if AMD # updated everything to use NVIDIA/AMD in ROCM 4.1 set(GINKGO_HIP_PLATFORM_NVCC 0) diff --git a/hip/get_info.cmake b/hip/get_info.cmake index dc3068e4f81..14a770234fa 100644 --- a/hip/get_info.cmake +++ b/hip/get_info.cmake @@ -2,9 +2,5 @@ ginkgo_print_module_header(${detailed_log} "HIP") ginkgo_print_module_footer(${detailed_log} "HIP variables:") ginkgo_print_flags(${detailed_log} "CMAKE_HIP_FLAGS") ginkgo_print_flags(${detailed_log} "CMAKE_HIP_COMPILER") -ginkgo_print_foreach_variable(${detailed_log} - "HIP_VERSION;HIP_PATH;ROCM_PATH" - "HIP_PLATFORM;HIP_ROOT_DIR;HIP_RUNTIME;HIPBLAS_PATH;HIPSPARSE_PATH" - "HIPRAND_PATH;ROCRAND_PATH;HIP_CLANG_INCLUDE_PATH" - "HIP_HIPCONFIG_EXECUTABLE") +ginkgo_print_variable(${detailed_log} "CMAKE_HIP_COMPILER_VERSION") ginkgo_print_module_footer(${detailed_log} "") From 6422245670b1658284054bc04831cf3876768363 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 10 Apr 2024 13:54:41 +0200 Subject: [PATCH 16/22] fix HIP static builds --- hip/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 738d9d09862..f82df149b0e 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -119,7 +119,7 @@ target_include_directories(ginkgo_hip target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP) target_link_libraries(ginkgo_hip PUBLIC ginkgo_device) -target_link_libraries(ginkgo_hip PRIVATE roc::hipblas roc::hipsparse hip::hiprand roc::rocrand) +target_link_libraries(ginkgo_hip PRIVATE hip::host roc::hipblas roc::hipsparse hip::hiprand roc::rocrand) if (hipfft_FOUND) target_link_libraries(ginkgo_hip PRIVATE hip::hipfft) endif() From 55417bea1a3ef0ab0b058af1d1e316b123b79604 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 10 Apr 2024 19:17:41 +0200 Subject: [PATCH 17/22] move to ROCm 5.1.4 --- .gitlab-ci.yml | 14 +++++++------- .gitlab/image.yml | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3521d5f1b2e..c7985bdf11e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -420,25 +420,25 @@ build/amd/nompi/clang/rocm45/debug/shared: BUILD_HIP: "ON" BUILD_TYPE: "Debug" -# ROCm 5.0.2 and friends -build/amd/nompi/gcc/rocm502/debug/static: +# ROCm 5.1.4 and friends +build/amd/nompi/gcc/rocm514/debug/static: extends: - .build_and_test_template - .default_variables - .full_test_condition - - .use_gko-rocm502-nompi-gnu11-llvm11 + - .use_gko-rocm514-nompi-gnu11-llvm11 variables: BUILD_OMP: "ON" BUILD_HIP: "ON" BUILD_TYPE: "Debug" BUILD_SHARED_LIBS: "OFF" -build/amd/nompi/clang/rocm502/release/shared: +build/amd/nompi/clang/rocm514/release/shared: extends: - .build_and_test_template - .default_variables - .quick_test_condition - - .use_gko-rocm502-nompi-gnu11-llvm11 + - .use_gko-rocm514-nompi-gnu11-llvm11 variables: CXX_COMPILER: "clang++" BUILD_OMP: "ON" @@ -446,12 +446,12 @@ build/amd/nompi/clang/rocm502/release/shared: BUILD_TYPE: "Release" # without omp -build/amd/nompi/gcc/rocm502_wo_omp/release/shared: +build/amd/nompi/gcc/rocm514_wo_omp/release/shared: extends: - .build_and_test_template - .default_variables - .full_test_condition - - .use_gko-rocm502-nompi-gnu11-llvm11 + - .use_gko-rocm514-nompi-gnu11-llvm11 variables: BUILD_OMP: "OFF" BUILD_MPI: "OFF" diff --git a/.gitlab/image.yml b/.gitlab/image.yml index eb1ab5128af..2be565b6b10 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -74,8 +74,8 @@ - private_ci - amd-gpu -.use_gko-rocm502-nompi-gnu11-llvm11: - image: ginkgohub/rocm:502-openmpi-gnu11-llvm11 +.use_gko-rocm514-nompi-gnu11-llvm11: + image: ginkgohub/rocm:514-openmpi-gnu11-llvm11 tags: - private_ci - amd-gpu From 18e89462cc0846790c3a5ea062e4c2810cb4b693 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 10 Apr 2024 20:23:56 +0200 Subject: [PATCH 18/22] disable buggy hipFFT tests --- test/matrix/fft_kernels.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp index d5ba741a064..ed186b1df60 100644 --- a/test/matrix/fft_kernels.cpp +++ b/test/matrix/fft_kernels.cpp @@ -149,6 +149,9 @@ TYPED_TEST(Fft, ApplyStrided1DInverseIsEqualToReference) TYPED_TEST(Fft, Apply2DIsEqualToReference) { +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC + GTEST_SKIP() << "rocFFT 5.1 has a bug related to 2D FFT"; +#endif using T = typename TestFixture::value_type; this->fft2->apply(this->data, this->out); @@ -160,6 +163,9 @@ TYPED_TEST(Fft, Apply2DIsEqualToReference) TYPED_TEST(Fft, ApplyStrided2DIsEqualToReference) { +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC + GTEST_SKIP() << "rocFFT 5.1 has a bug related to 2D FFT"; +#endif using T = typename TestFixture::value_type; this->fft2->apply(this->data_strided, this->out_strided); @@ -171,6 +177,9 @@ TYPED_TEST(Fft, ApplyStrided2DIsEqualToReference) TYPED_TEST(Fft, Apply2DInverseIsEqualToReference) { +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC + GTEST_SKIP() << "rocFFT 5.1 has a bug related to 2D FFT"; +#endif using T = typename TestFixture::value_type; this->ifft2->apply(this->data, this->out); @@ -182,6 +191,9 @@ TYPED_TEST(Fft, Apply2DInverseIsEqualToReference) TYPED_TEST(Fft, ApplyStrided2DInverseIsEqualToReference) { +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC + GTEST_SKIP() << "rocFFT 5.1 has a bug related to 2D FFT"; +#endif using T = typename TestFixture::value_type; this->ifft2->apply(this->data_strided, this->out_strided); From d5153f42f8c7ae14bdf5062593af48933e71a95e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 10 Apr 2024 22:44:02 +0200 Subject: [PATCH 19/22] fix compile def propagation for pkg-config --- cmake/information_helpers.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/information_helpers.cmake b/cmake/information_helpers.cmake index 04687dfae5b..0e667a90f02 100644 --- a/cmake/information_helpers.cmake +++ b/cmake/information_helpers.cmake @@ -76,7 +76,9 @@ macro(ginkgo_interface_libraries_recursively INTERFACE_LIBS) # Populate the compiler options and definitions if needed get_target_property(GINKGO_LIBS_INTERFACE_DEFS "${_lib}" INTERFACE_COMPILE_DEFINITIONS) if (GINKGO_LIBS_INTERFACE_DEFS) - list(APPEND GINKGO_INTERFACE_CFLAGS_FOUND "${GINKGO_LIBS_INTERFACE_DEFS}") + foreach(def IN LISTS GINKGO_LIBS_INTERFACE_DEFS) + list(APPEND GINKGO_INTERFACE_CFLAGS_FOUND "-D${def}") + endforeach() endif() unset(GINKGO_LIBS_INTERFACE_DEFS) From 52563818bca166707ba4eb5df73b792dd54a21e0 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 12 Apr 2024 12:26:49 +0200 Subject: [PATCH 20/22] create pkg-config file for shared libs only The static builds involve changing the linker for HIP, which can't really be represented in pkg-config --- cmake/install_helpers.cmake | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/cmake/install_helpers.cmake b/cmake/install_helpers.cmake index 1e48c991e8a..66efde51dde 100644 --- a/cmake/install_helpers.cmake +++ b/cmake/install_helpers.cmake @@ -59,19 +59,21 @@ function(ginkgo_install_library name) endfunction() function(ginkgo_install) - # generate pkg-config file, a three-step process is necessary to include the correct install prefix - # Step 1: substitute project variables in the generation script - configure_file("${Ginkgo_SOURCE_DIR}/cmake/generate_pkg.cmake.in" - "${Ginkgo_BINARY_DIR}/cmake/generate_pkg.cmake" - @ONLY) - # Step 2: substitute generator expressions - file(GENERATE OUTPUT ${Ginkgo_BINARY_DIR}/cmake/generate_pkg_$.cmake - INPUT ${Ginkgo_BINARY_DIR}/cmake/generate_pkg.cmake) - # Step 3: at install time, call the generation script which has all variables - # except the install prefix already replaced. Use the install prefix - # that is specified at install time - install(SCRIPT "${Ginkgo_BINARY_DIR}/cmake/generate_pkg_$.cmake" - COMPONENT Ginkgo_Development) + if (BUILD_SHARED_LIBS) + # generate pkg-config file, a three-step process is necessary to include the correct install prefix + # Step 1: substitute project variables in the generation script + configure_file("${Ginkgo_SOURCE_DIR}/cmake/generate_pkg.cmake.in" + "${Ginkgo_BINARY_DIR}/cmake/generate_pkg.cmake" + @ONLY) + # Step 2: substitute generator expressions + file(GENERATE OUTPUT ${Ginkgo_BINARY_DIR}/cmake/generate_pkg_$.cmake + INPUT ${Ginkgo_BINARY_DIR}/cmake/generate_pkg.cmake) + # Step 3: at install time, call the generation script which has all variables + # except the install prefix already replaced. Use the install prefix + # that is specified at install time + install(SCRIPT "${Ginkgo_BINARY_DIR}/cmake/generate_pkg_$.cmake" + COMPONENT Ginkgo_Development) + endif() # install the public header files install(DIRECTORY "${Ginkgo_SOURCE_DIR}/include/" From 2494442269df80eed84e6640e43159cdcd3ddeb7 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 12 Apr 2024 13:27:20 +0200 Subject: [PATCH 21/22] make test_pkgconfig optional --- .gitlab/scripts.yml | 8 ++++++-- CMakeLists.txt | 34 ++++++++++++++++++---------------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml index b58d21de261..4205de667ca 100644 --- a/.gitlab/scripts.yml +++ b/.gitlab/scripts.yml @@ -106,7 +106,9 @@ - popd - if [ -n "${SYCL_DEVICE_TYPE}" ]; then unset SYCL_DEVICE_TYPE; fi - if [ -n "${SYCL_DEVICE_FILTER}" ]; then unset SYCL_DEVICE_FILTER; fi - - PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig:$PKG_CONFIG_PATH LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH ninja test_pkgconfig + - if [[ "${BUILD_SHARED_LIBS}" == "ON" ]]; then + PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig:$PKG_CONFIG_PATH LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH ninja test_pkgconfig; + fi dependencies: [] @@ -136,7 +138,9 @@ - pushd test/test_install - ninja install - popd - - PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig:$PKG_CONFIG_PATH LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH ninja test_pkgconfig + - if [[ "${BUILD_SHARED_LIBS}" == "ON" ]]; then + PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig:$PKG_CONFIG_PATH LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH ninja test_pkgconfig; + fi cache: [] diff --git a/CMakeLists.txt b/CMakeLists.txt index 0549c7b77d1..e2173be3b70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -467,22 +467,24 @@ add_custom_target(test_exportbuild COMMAND ${GINKGO_TEST_EXPORTBUILD_CMD} COMMENT "Running a test on Ginkgo's exported build directory.") -add_custom_target(test_pkgconfig - COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} ${TOOLSET} - -S${GINKGO_TEST_PKGCONFIG_SRC_DIR} - -B${GINKGO_TEST_PKGCONFIG_BIN_DIR} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - # `--config cfg` is ignored by single-configuration generator. - # `$` is always be the same as `CMAKE_BUILD_TYPE` in - # single-configuration generator. - COMMAND ${CMAKE_COMMAND} - --build ${GINKGO_TEST_PKGCONFIG_BIN_DIR} - --config $ - COMMAND ${GINKGO_TEST_PKGCONFIG_CMD} - COMMENT "Running a test on Ginkgo's PkgConfig" - "This requires installing Ginkgo first") +if (BUILD_SHARED_LIBS) + add_custom_target(test_pkgconfig + COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} ${TOOLSET} + -S${GINKGO_TEST_PKGCONFIG_SRC_DIR} + -B${GINKGO_TEST_PKGCONFIG_BIN_DIR} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + # `--config cfg` is ignored by single-configuration generator. + # `$` is always be the same as `CMAKE_BUILD_TYPE` in + # single-configuration generator. + COMMAND ${CMAKE_COMMAND} + --build ${GINKGO_TEST_PKGCONFIG_BIN_DIR} + --config $ + COMMAND ${GINKGO_TEST_PKGCONFIG_CMD} + COMMENT "Running a test on Ginkgo's PkgConfig" + "This requires installing Ginkgo first") +endif() # Setup CPack From a8a407f35887c480af66500c20dc53a41d2f7604 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 12 Apr 2024 16:45:47 +0200 Subject: [PATCH 22/22] disable pkg-config only for HIP --- .gitlab/scripts.yml | 4 ++-- CMakeLists.txt | 4 +++- cmake/install_helpers.cmake | 4 +++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml index 4205de667ca..dda127ff535 100644 --- a/.gitlab/scripts.yml +++ b/.gitlab/scripts.yml @@ -106,7 +106,7 @@ - popd - if [ -n "${SYCL_DEVICE_TYPE}" ]; then unset SYCL_DEVICE_TYPE; fi - if [ -n "${SYCL_DEVICE_FILTER}" ]; then unset SYCL_DEVICE_FILTER; fi - - if [[ "${BUILD_SHARED_LIBS}" == "ON" ]]; then + - if [[ "${BUILD_SHARED_LIBS}" == "ON" || "${BUILD_HIP}" != "ON" ]]; then PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig:$PKG_CONFIG_PATH LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH ninja test_pkgconfig; fi dependencies: [] @@ -138,7 +138,7 @@ - pushd test/test_install - ninja install - popd - - if [[ "${BUILD_SHARED_LIBS}" == "ON" ]]; then + - if [[ "${BUILD_SHARED_LIBS}" == "ON" || "${BUILD_HIP}" != "ON" ]]; then PKG_CONFIG_PATH=${INSTALL_PREFIX}/lib/pkgconfig:$PKG_CONFIG_PATH LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH ninja test_pkgconfig; fi cache: [] diff --git a/CMakeLists.txt b/CMakeLists.txt index e2173be3b70..3e17446854e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -467,7 +467,9 @@ add_custom_target(test_exportbuild COMMAND ${GINKGO_TEST_EXPORTBUILD_CMD} COMMENT "Running a test on Ginkgo's exported build directory.") -if (BUILD_SHARED_LIBS) +# static linking with pkg-config is not possible with HIP, since +# some linker information cannot be expressed in pkg-config files +if (BUILD_SHARED_LIBS OR NOT GINKGO_BUILD_HIP) add_custom_target(test_pkgconfig COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} ${TOOLSET} -S${GINKGO_TEST_PKGCONFIG_SRC_DIR} diff --git a/cmake/install_helpers.cmake b/cmake/install_helpers.cmake index 66efde51dde..898988142c9 100644 --- a/cmake/install_helpers.cmake +++ b/cmake/install_helpers.cmake @@ -59,7 +59,9 @@ function(ginkgo_install_library name) endfunction() function(ginkgo_install) - if (BUILD_SHARED_LIBS) + # static linking with pkg-config is not possible with HIP, since + # some linker information cannot be expressed in pkg-config files + if (BUILD_SHARED_LIBS OR NOT GINKGO_BUILD_HIP) # generate pkg-config file, a three-step process is necessary to include the correct install prefix # Step 1: substitute project variables in the generation script configure_file("${Ginkgo_SOURCE_DIR}/cmake/generate_pkg.cmake.in"