Skip to content

Commit

Permalink
SYCL: Add support for arbitrary size atomics
Browse files Browse the repository at this point in the history
  • Loading branch information
masterleinad committed Jan 23, 2023
1 parent c9929fc commit e4b3c82
Show file tree
Hide file tree
Showing 13 changed files with 440 additions and 16 deletions.
1 change: 1 addition & 0 deletions cmake/KokkosCore_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#cmakedefine KOKKOS_ENABLE_MEMKIND
#cmakedefine KOKKOS_ENABLE_LIBRT
#cmakedefine KOKKOS_ENABLE_SYCL
#cmakedefine KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED

/* General Settings */
#cmakedefine KOKKOS_ENABLE_CXX17
Expand Down
29 changes: 29 additions & 0 deletions cmake/kokkos_arch.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,35 @@ IF (KOKKOS_ENABLE_SYCL)
)
ENDIF()

# Check support for device_global variables
# FIXME_SYCL Once the feature test macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL is
# available, use that instead.
IF(KOKKOS_ENABLE_SYCL)
INCLUDE(CheckCXXSourceCompiles)
STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}")
CHECK_CXX_SOURCE_COMPILES("
#include <sycl/sycl.hpp>
using namespace sycl::ext::oneapi::experimental;
using namespace sycl;
SYCL_EXTERNAL device_global<int, decltype(properties(device_image_scope))> Foo;
void bar(queue q) {
q.single_task([=] {
Foo = 42;
});
}
int main(){ return 0; }
"
KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED)

IF(KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED)
COMPILER_SPECIFIC_FLAGS(
DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED
)
ENDIF()
ENDIF()

SET(CUDA_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_CUDA_ARCH ARCH FLAG)
Expand Down
16 changes: 15 additions & 1 deletion core/src/SYCL/Kokkos_SYCL_Instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,14 @@ void SYCLInternal::initialize(const sycl::queue& q) {
Kokkos::Impl::throw_runtime_exception(msg.str());
}

#ifdef KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED
// Init the array for used for arbitrarily sized atomics
if (this == &singleton()) {
desul::Impl::init_lock_arrays();
desul::Impl::init_lock_arrays_sycl(*m_queue);
}
#endif

m_team_scratch_current_size = 0;
m_team_scratch_ptr = nullptr;
}
Expand Down Expand Up @@ -160,7 +168,13 @@ void SYCLInternal::finalize() {

// The global_unique_token_locks array is static and should only be
// deallocated once by the defualt instance
if (this == &singleton()) Impl::sycl_global_unique_token_locks(true);
if (this == &singleton()) {
Impl::sycl_global_unique_token_locks(true);
#ifdef KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED
desul::Impl::finalize_lock_arrays();
desul::Impl::finalize_lock_arrays_sycl(*m_queue);
#endif
}

using RecordSYCL = Kokkos::Impl::SharedAllocationRecord<SYCLDeviceUSMSpace>;
if (nullptr != m_scratchSpace)
Expand Down
5 changes: 0 additions & 5 deletions core/unit_test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -742,11 +742,6 @@ if(Kokkos_ENABLE_HIP)
endif()

if(Kokkos_ENABLE_SYCL)
list(REMOVE_ITEM SYCL_SOURCES1A
# FIXME_SYCL atomic_fetch_oper for large types to be implemented
${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_complexdouble.cpp
)

list(REMOVE_ITEM SYCL_SOURCES2A
${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_WorkGraph.cpp
)
Expand Down
4 changes: 4 additions & 0 deletions core/unit_test/TestAtomicOperations_complexdouble.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@

namespace Test {
TEST(TEST_CATEGORY, atomic_operations_complexdouble) {
#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED)
if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>)
GTEST_SKIP() << "skipping since device_global variables are not available";
#endif
const int start = 1; // Avoid zero for division.
const int end = 11;
for (int i = start; i < end; ++i) {
Expand Down
8 changes: 5 additions & 3 deletions core/unit_test/TestAtomics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -510,8 +510,11 @@ TEST(TEST_CATEGORY, atomics) {
ASSERT_TRUE(
(TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(100, 3)));

// FIXME_SYCL atomics for large types to be implemented
#ifndef KOKKOS_ENABLE_SYCL
// FIXME_SYCL Replace macro by SYCL_EXT_ONEAPI_DEVICE_GLOBAL or remove
// condition alltogether when possible.
#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_SYCL_DEVICE_GLOBAL_SUPPORTED)
if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::SYCL>) return;
#endif
ASSERT_TRUE(
(TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 1)));
ASSERT_TRUE(
Expand All @@ -536,7 +539,6 @@ TEST(TEST_CATEGORY, atomics) {
(TestAtomic::Loop<TestAtomic::SuperScalar<4>, TEST_EXECSPACE>(100, 3)));
#endif
#endif
#endif
}

} // namespace Test
10 changes: 10 additions & 0 deletions tpls/desul/include/desul/atomics/Adapt_SYCL.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,16 @@ using sycl_atomic_ref = sycl::atomic_ref<T,
sycl::access::address_space::generic_space>;
#endif

// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead
#ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED
// FIXME_SYCL The compiler forces us to use device_image_scope. Drop this when possible.
template <class T>
using sycl_device_global = sycl::ext::oneapi::experimental::device_global<
T,
decltype(sycl::ext::oneapi::experimental::properties(
sycl::ext::oneapi::experimental::device_image_scope))>;
#endif

} // namespace Impl
} // namespace desul

Expand Down
59 changes: 53 additions & 6 deletions tpls/desul/include/desul/atomics/Compare_Exchange_SYCL.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ SPDX-License-Identifier: (BSD-3-Clause)

#include <desul/atomics/Adapt_SYCL.hpp>
#include <desul/atomics/Common.hpp>
#include <desul/atomics/Lock_Array_SYCL.hpp>
#include <desul/atomics/Thread_Fence_SYCL.hpp>

// FIXME_SYCL SYCL2020 dictates that <sycl/sycl.hpp> is the header to include
Expand Down Expand Up @@ -78,16 +79,62 @@ std::enable_if_t<sizeof(T) == 8, T> device_atomic_exchange(T* const dest,
template <class T, class MemoryOrder, class MemoryScope>
std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T>
device_atomic_compare_exchange(
T* const /*dest*/, T compare, T /*value*/, MemoryOrder, MemoryScope) {
assert(false); // FIXME_SYCL not implemented
return compare;
T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) {
// This is a way to avoid deadlock in a subgroup
T return_val;
int done = 0;
auto sg = sycl::ext::oneapi::experimental::this_sub_group();
using sycl::ext::oneapi::group_ballot;
using sycl::ext::oneapi::sub_group_mask;
sub_group_mask active = group_ballot(sg, 1);
sub_group_mask done_active = group_ballot(sg, 0);
while (active != done_active) {
if (!done) {
if (lock_address_sycl((void*)dest, scope)) {
if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
atomic_thread_fence(MemoryOrderRelease(), scope);
atomic_thread_fence(MemoryOrderAcquire(), scope);
return_val = *dest;
if (return_val == compare) {
*dest = value;
device_atomic_thread_fence(MemoryOrderRelease(), scope);
}
unlock_address_sycl((void*)dest, scope);
done = 1;
}
}
done_active = group_ballot(sg, done);
}
return return_val;
}

template <class T, class MemoryOrder, class MemoryScope>
std::enable_if_t<(sizeof(T) != 8) && (sizeof(T) != 4), T> device_atomic_exchange(
T* const /*dest*/, T value, MemoryOrder, MemoryScope) {
assert(false); // FIXME_SYCL not implemented
return value;
T* const dest, T value, MemoryOrder, MemoryScope scope) {
// This is a way to avoid deadlock in a subgroup
T return_val;
int done = 0;
auto sg = sycl::ext::oneapi::experimental::this_sub_group();
using sycl::ext::oneapi::group_ballot;
using sycl::ext::oneapi::sub_group_mask;
sub_group_mask active = group_ballot(sg, 1);
sub_group_mask done_active = group_ballot(sg, 0);
while (active != done_active) {
if (!done) {
if (lock_address_sycl((void*)dest, scope)) {
if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
atomic_thread_fence(MemoryOrderRelease(), scope);
device_atomic_thread_fence(MemoryOrderAcquire(), scope);
return_val = *dest;
*dest = value;
device_atomic_thread_fence(MemoryOrderRelease(), scope);
unlock_address_sycl((void*)dest, scope);
done = 1;
}
}
done_active = group_ballot(sg, done);
}
return return_val;
}

} // namespace Impl
Expand Down
3 changes: 3 additions & 0 deletions tpls/desul/include/desul/atomics/Lock_Array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ SPDX-License-Identifier: (BSD-3-Clause)
#ifdef DESUL_HAVE_HIP_ATOMICS
#include <desul/atomics/Lock_Array_HIP.hpp>
#endif
#ifdef DESUL_HAVE_SYCL_ATOMICS
#include <desul/atomics/Lock_Array_SYCL.hpp>
#endif

namespace desul {
namespace Impl {
Expand Down
147 changes: 147 additions & 0 deletions tpls/desul/include/desul/atomics/Lock_Array_SYCL.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
/*
Copyright (c) 2019, Lawrence Livermore National Security, LLC
and DESUL project contributors. See the COPYRIGHT file for details.
Source: https://github.com/desul/desul
SPDX-License-Identifier: (BSD-3-Clause)
*/

#ifndef DESUL_ATOMICS_LOCK_ARRAY_SYCL_HPP_
#define DESUL_ATOMICS_LOCK_ARRAY_SYCL_HPP_

#include <cstdint>

#include "desul/atomics/Adapt_SYCL.hpp"
#include "desul/atomics/Common.hpp"
#include "desul/atomics/Macros.hpp"

// FIXME_SYCL
#if __has_include(<sycl/sycl.hpp>)
#include <sycl/sycl.hpp>
#else
#include <CL/sycl.hpp>
#endif

namespace desul {
namespace Impl {

// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead
#ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED

/**
* \brief This global variable in Host space is the central definition of these
* arrays.
*/
extern int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h;
extern int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h;

/// \brief After this call, the lock arrays used in [un]lock_address_sycl
/// are initialized and ready to be used.
///
/// This call is idempotent.
/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
/// snapshotted version while also linking against pure Desul
template <typename /*AlwaysInt*/ = int>
void init_lock_arrays_sycl(sycl::queue q);

/// \brief After this call, the lock arrays used in [un]lock_address_sycl
/// are freed and can't be used anymore.
///
/// This call is idempotent.
/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
/// snapshotted version while also linking against pure Desul
template <typename /*AlwaysInt*/ = int>
void finalize_lock_arrays_sycl(sycl::queue q);

/**
* \brief This global variable in SYCL space is what kernels use to get access
* to the lock arrays.
*
* There is only one single instance of this global variable for the entire
* executable, whose definition will be in Kokkos_SYCL_Locks.cpp (and whose
* declaration here must be extern). This one instance will be initialized
* by initialize_host_sycl_lock_arrays and need not be modified afterwards.
*/
SYCL_EXTERNAL extern sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_DEVICE;

SYCL_EXTERNAL extern sycl_device_global<int32_t*> SYCL_SPACE_ATOMIC_LOCKS_NODE;

#define SYCL_SPACE_ATOMIC_MASK 0x1FFFF

/// \brief Acquire a lock for the address
///
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
inline bool lock_address_sycl(void* ptr, MemoryScopeDevice) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & SYCL_SPACE_ATOMIC_MASK;
sycl::atomic_ref<int32_t,
sycl::memory_order::relaxed,
sycl::memory_scope::device,
sycl::access::address_space::global_space>
lock_device_ref(SYCL_SPACE_ATOMIC_LOCKS_DEVICE[offset]);
return (0 == lock_device_ref.exchange(1));
}

inline bool lock_address_sycl(void* ptr, MemoryScopeNode) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & SYCL_SPACE_ATOMIC_MASK;
sycl::atomic_ref<int32_t,
sycl::memory_order::relaxed,
sycl::memory_scope::system,
sycl::access::address_space::global_space>
lock_node_ref(SYCL_SPACE_ATOMIC_LOCKS_NODE[offset]);
return (0 == lock_node_ref.exchange(1));
}

/**
* \brief Release lock for the address
*
* This function releases the lock for the hash value derived from the provided
* ptr. This function should only be called after previously successfully
* acquiring a lock with lock_address.
*/
inline void unlock_address_sycl(void* ptr, MemoryScopeDevice) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & SYCL_SPACE_ATOMIC_MASK;
sycl::atomic_ref<int32_t,
sycl::memory_order::relaxed,
sycl::memory_scope::device,
sycl::access::address_space::global_space>
lock_device_ref(SYCL_SPACE_ATOMIC_LOCKS_DEVICE[offset]);
lock_device_ref.exchange(0);
}

inline void unlock_address_sycl(void* ptr, MemoryScopeNode) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & SYCL_SPACE_ATOMIC_MASK;
sycl::atomic_ref<int32_t,
sycl::memory_order::relaxed,
sycl::memory_scope::system,
sycl::access::address_space::global_space>
lock_node_ref(SYCL_SPACE_ATOMIC_LOCKS_NODE[offset]);
lock_node_ref.exchange(0);
}
#else
inline bool lock_address_sycl(void*, MemoryScopeDevice) {
assert(false);
return true;
}

inline bool lock_address_sycl(void*, MemoryScopeNode) {
assert(false);
return true;
}

inline void unlock_address_sycl(void*, MemoryScopeDevice) { assert(false); }

inline void unlock_address_sycl(void*, MemoryScopeNode) { assert(false); }
#endif
} // namespace Impl
} // namespace desul
#endif
2 changes: 1 addition & 1 deletion tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ SPDX-License-Identifier: (BSD-3-Clause)
#include <desul/atomics/Lock_Based_Fetch_Op_HIP.hpp>
#endif
#ifdef DESUL_HAVE_SYCL_ATOMICS
#include <desul/atomics/Lock_Based_Fetch_Op_Unimplemented.hpp>
#include <desul/atomics/Lock_Based_Fetch_Op_SYCL.hpp>
#endif

#include <desul/atomics/Lock_Based_Fetch_Op_Host.hpp>
Expand Down
Loading

0 comments on commit e4b3c82

Please sign in to comment.