ginkgo-project · yhmtsai · Aug 19, 2022 · Aug 9, 2022 · Aug 15, 2022 · Aug 18, 2022
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -67,6 +67,8 @@ set(GINKGO_CUDA_COMPILER_FLAGS "" CACHE STRING
 set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING
     "A list of target NVIDIA GPU achitectures. See README.md for more detail.")
 option(GINKGO_CUDA_DEFAULT_HOST_COMPILER "Tell Ginkgo to not automatically set the CUDA host compiler" OFF)
+# the details of fine/coarse grain memory and unsafe atomic are available https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#floating-point-fp-atomic-operations-and-coarse-fine-grained-memory-allocations
+option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic (only for AMD GPU and ROCM >= 5). Default is ON because we use hipMalloc, which is always on coarse grain. Must turn off when allocating memory on fine grain" ON)
 set(GINKGO_HIP_COMPILER_FLAGS "" CACHE STRING
     "Set the required HIP compiler flags. Current default is an empty string.")
 set(GINKGO_HIP_NVCC_COMPILER_FLAGS "" CACHE STRING

diff --git a/cmake/hip.cmake b/cmake/hip.cmake
@@ -212,9 +212,13 @@ if(GINKGO_HIP_AMDGPU)
     endforeach()
 endif()
 
+
 set(GINKGO_HIPCC_OPTIONS ${GINKGO_HIP_COMPILER_FLAGS} "-std=c++14 -DGKO_COMPILING_HIP")
 set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_ARCH} ${GINKGO_HIP_NVCC_ADDITIONAL_FLAGS})
 set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS} ${GINKGO_AMD_ARCH_FLAGS})
+if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND HIP_VERSION VERSION_GREATER_EQUAL 5)
+    list(APPEND GINKGO_HIP_CLANG_OPTIONS -munsafe-fp-atomics)
+endif()
 # HIP's cmake support secretly carries around global state to remember
 # whether we created any shared libraries, and sets PIC flags accordingly.
 # CMake's scoping rules means that this makes the hip_add_* calls order- and

diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc
@@ -147,8 +147,15 @@ GKO_BIND_ATOMIC_ADD(unsigned int);
 GKO_BIND_ATOMIC_ADD(unsigned long long int);
 GKO_BIND_ATOMIC_ADD(float);
 
-#if !defined(__HIPCC__) || \
-    (defined(__HIP_DEVICE_COMPILE__) && GINKGO_HIP_PLATFORM_NVCC)
+// AMD
+#if defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
+
+
+// the double atomicAdd is added after 4.3
+GKO_BIND_ATOMIC_ADD(double);
+
+
+#else  // NVIDIA
 
 
 #if !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || \
@@ -179,12 +186,12 @@ GKO_BIND_ATOMIC_ADD(__half2);
         // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)))
 
 
-#endif  // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) &&
-        // GINKGO_HIP_PLATFORM_HCC)
+#endif  // defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC
 
 
 #undef GKO_BIND_ATOMIC_ADD
 
+
 template <typename T>
 __forceinline__ __device__ T atomic_max(T* __restrict__ addr, T val)
 {