From be97d49e4601545c66c4b86b09df820ee30d257e Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 9 Aug 2022 15:36:06 -0400 Subject: [PATCH 1/3] use unsafe atomic --- CMakeLists.txt | 1 + cmake/hip.cmake | 4 ++++ common/cuda_hip/components/atomic.hpp.inc | 11 +++++++++++ 3 files changed, 16 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ebc6627094e..d02bcdfd386 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,7 @@ set(GINKGO_CUDA_COMPILER_FLAGS "" CACHE STRING set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING "A list of target NVIDIA GPU achitectures. See README.md for more detail.") option(GINKGO_CUDA_DEFAULT_HOST_COMPILER "Tell Ginkgo to not automatically set the CUDA host compiler" OFF) +option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic (only for AMD GPU). Default is ON because we use hipMalloc. Must turn off when allocating memory on fine grain" ON) set(GINKGO_HIP_COMPILER_FLAGS "" CACHE STRING "Set the required HIP compiler flags. Current default is an empty string.") set(GINKGO_HIP_NVCC_COMPILER_FLAGS "" CACHE STRING diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 855094eb1d1..95d1d29555e 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -212,9 +212,13 @@ if(GINKGO_HIP_AMDGPU) endforeach() endif() + set(GINKGO_HIPCC_OPTIONS ${GINKGO_HIP_COMPILER_FLAGS} "-std=c++14 -DGKO_COMPILING_HIP") set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_ARCH} ${GINKGO_HIP_NVCC_ADDITIONAL_FLAGS}) set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS} ${GINKGO_AMD_ARCH_FLAGS}) +if(GINKGO_HIP_AMD_UNSAFE_ATOMIC) +set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} "-munsafe-fp-atomics") +endif() # HIP's cmake support secretly carries around global state to remember # whether we created any shared libraries, and sets PIC flags accordingly. # CMake's scoping rules means that this makes the hip_add_* calls order- and diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc index c858f19ded1..e95b0a1f56f 100644 --- a/common/cuda_hip/components/atomic.hpp.inc +++ b/common/cuda_hip/components/atomic.hpp.inc @@ -147,6 +147,17 @@ GKO_BIND_ATOMIC_ADD(unsigned int); GKO_BIND_ATOMIC_ADD(unsigned long long int); GKO_BIND_ATOMIC_ADD(float); +#if defined(__HIPCC__) && defined(__HIP_DEVICE_COMPILE__) && \ + GINKGO_HIP_PLATFORM_HCC + + +// the double atomicAdd is added after 4.3 +GKO_BIND_ATOMIC_ADD(double); + + +#endif // defined(__HIPCC__) && defined(__HIP_DEVICE_COMPILE__) && + // GINKGO_HIP_PLATFORM_HCC + #if !defined(__HIPCC__) || \ (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC) From 4a2834d6038ec09bfb8487fafcd10d3e34438065 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 15 Aug 2022 13:52:22 -0400 Subject: [PATCH 2/3] unsafe float is only availble on rocm >= 5 --- CMakeLists.txt | 2 +- cmake/hip.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d02bcdfd386..1cbacd95e8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,7 +67,7 @@ set(GINKGO_CUDA_COMPILER_FLAGS "" CACHE STRING set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING "A list of target NVIDIA GPU achitectures. See README.md for more detail.") option(GINKGO_CUDA_DEFAULT_HOST_COMPILER "Tell Ginkgo to not automatically set the CUDA host compiler" OFF) -option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic (only for AMD GPU). Default is ON because we use hipMalloc. Must turn off when allocating memory on fine grain" ON) +option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic (only for AMD GPU and ROCM >= 5). Default is ON because we use hipMalloc, which is always on coarse grain. Must turn off when allocating memory on fine grain" ON) set(GINKGO_HIP_COMPILER_FLAGS "" CACHE STRING "Set the required HIP compiler flags. Current default is an empty string.") set(GINKGO_HIP_NVCC_COMPILER_FLAGS "" CACHE STRING diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 95d1d29555e..5a98cda2f71 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -216,7 +216,7 @@ endif() set(GINKGO_HIPCC_OPTIONS ${GINKGO_HIP_COMPILER_FLAGS} "-std=c++14 -DGKO_COMPILING_HIP") set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_ARCH} ${GINKGO_HIP_NVCC_ADDITIONAL_FLAGS}) set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS} ${GINKGO_AMD_ARCH_FLAGS}) -if(GINKGO_HIP_AMD_UNSAFE_ATOMIC) +if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND HIP_VERSION VERSION_GREATER_EQUAL 5) set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} "-munsafe-fp-atomics") endif() # HIP's cmake support secretly carries around global state to remember From 30a6f482ee815bad632e4fcf0afb48ab3f87515d Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 18 Aug 2022 10:17:26 -0400 Subject: [PATCH 3/3] simplify cond. add unsafe atomic, fine/coarse ref Co-authored-by: Tobias Ribizel --- CMakeLists.txt | 1 + cmake/hip.cmake | 2 +- common/cuda_hip/components/atomic.hpp.inc | 14 +++++--------- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1cbacd95e8c..9376cef03aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,7 @@ set(GINKGO_CUDA_COMPILER_FLAGS "" CACHE STRING set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING "A list of target NVIDIA GPU achitectures. See README.md for more detail.") option(GINKGO_CUDA_DEFAULT_HOST_COMPILER "Tell Ginkgo to not automatically set the CUDA host compiler" OFF) +# the details of fine/coarse grain memory and unsafe atomic are available https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#floating-point-fp-atomic-operations-and-coarse-fine-grained-memory-allocations option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic (only for AMD GPU and ROCM >= 5). Default is ON because we use hipMalloc, which is always on coarse grain. Must turn off when allocating memory on fine grain" ON) set(GINKGO_HIP_COMPILER_FLAGS "" CACHE STRING "Set the required HIP compiler flags. Current default is an empty string.") diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 5a98cda2f71..bd1e33b3cfc 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -217,7 +217,7 @@ set(GINKGO_HIPCC_OPTIONS ${GINKGO_HIP_COMPILER_FLAGS} "-std=c++14 -DGKO_COMPILIN set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_ARCH} ${GINKGO_HIP_NVCC_ADDITIONAL_FLAGS}) set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS} ${GINKGO_AMD_ARCH_FLAGS}) if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND HIP_VERSION VERSION_GREATER_EQUAL 5) -set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} "-munsafe-fp-atomics") + list(APPEND GINKGO_HIP_CLANG_OPTIONS -munsafe-fp-atomics) endif() # HIP's cmake support secretly carries around global state to remember # whether we created any shared libraries, and sets PIC flags accordingly. diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc index e95b0a1f56f..b7e212a2da1 100644 --- a/common/cuda_hip/components/atomic.hpp.inc +++ b/common/cuda_hip/components/atomic.hpp.inc @@ -147,19 +147,15 @@ GKO_BIND_ATOMIC_ADD(unsigned int); GKO_BIND_ATOMIC_ADD(unsigned long long int); GKO_BIND_ATOMIC_ADD(float); -#if defined(__HIPCC__) && defined(__HIP_DEVICE_COMPILE__) && \ - GINKGO_HIP_PLATFORM_HCC +// AMD +#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC // the double atomicAdd is added after 4.3 GKO_BIND_ATOMIC_ADD(double); -#endif // defined(__HIPCC__) && defined(__HIP_DEVICE_COMPILE__) && - // GINKGO_HIP_PLATFORM_HCC - -#if !defined(__HIPCC__) || \ - (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC) +#else // NVIDIA #if !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \ @@ -190,12 +186,12 @@ GKO_BIND_ATOMIC_ADD(__half2); // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) -#endif // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) && - // GINKGO_HIP_PLATFORM_HCC) +#endif // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC #undef GKO_BIND_ATOMIC_ADD + template __forceinline__ __device__ T atomic_max(T* __restrict__ addr, T val) {