diff --git a/CMakeLists.txt b/CMakeLists.txt index ebc6627094e..9376cef03aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,8 @@ set(GINKGO_CUDA_COMPILER_FLAGS "" CACHE STRING set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING "A list of target NVIDIA GPU achitectures. See README.md for more detail.") option(GINKGO_CUDA_DEFAULT_HOST_COMPILER "Tell Ginkgo to not automatically set the CUDA host compiler" OFF) +# the details of fine/coarse grain memory and unsafe atomic are available https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#floating-point-fp-atomic-operations-and-coarse-fine-grained-memory-allocations +option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic (only for AMD GPU and ROCM >= 5). Default is ON because we use hipMalloc, which is always on coarse grain. Must turn off when allocating memory on fine grain" ON) set(GINKGO_HIP_COMPILER_FLAGS "" CACHE STRING "Set the required HIP compiler flags. Current default is an empty string.") set(GINKGO_HIP_NVCC_COMPILER_FLAGS "" CACHE STRING diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 855094eb1d1..bd1e33b3cfc 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -212,9 +212,13 @@ if(GINKGO_HIP_AMDGPU) endforeach() endif() + set(GINKGO_HIPCC_OPTIONS ${GINKGO_HIP_COMPILER_FLAGS} "-std=c++14 -DGKO_COMPILING_HIP") set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_ARCH} ${GINKGO_HIP_NVCC_ADDITIONAL_FLAGS}) set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS} ${GINKGO_AMD_ARCH_FLAGS}) +if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND HIP_VERSION VERSION_GREATER_EQUAL 5) + list(APPEND GINKGO_HIP_CLANG_OPTIONS -munsafe-fp-atomics) +endif() # HIP's cmake support secretly carries around global state to remember # whether we created any shared libraries, and sets PIC flags accordingly. # CMake's scoping rules means that this makes the hip_add_* calls order- and diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc index c858f19ded1..b7e212a2da1 100644 --- a/common/cuda_hip/components/atomic.hpp.inc +++ b/common/cuda_hip/components/atomic.hpp.inc @@ -147,8 +147,15 @@ GKO_BIND_ATOMIC_ADD(unsigned int); GKO_BIND_ATOMIC_ADD(unsigned long long int); GKO_BIND_ATOMIC_ADD(float); -#if !defined(__HIPCC__) || \ - (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC) +// AMD +#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC + + +// the double atomicAdd is added after 4.3 +GKO_BIND_ATOMIC_ADD(double); + + +#else // NVIDIA #if !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \ @@ -179,12 +186,12 @@ GKO_BIND_ATOMIC_ADD(__half2); // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) -#endif // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) && - // GINKGO_HIP_PLATFORM_HCC) +#endif // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC #undef GKO_BIND_ATOMIC_ADD + template __forceinline__ __device__ T atomic_max(T* __restrict__ addr, T val) {