diff --git a/core/src/Kokkos_OpenMPTarget.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp
similarity index 98%
rename from core/src/Kokkos_OpenMPTarget.hpp
rename to core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp
index 4bcfed90e3..adf972dd08 100644
--- a/core/src/Kokkos_OpenMPTarget.hpp
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp
@@ -30,7 +30,7 @@ static_assert(false,
 #include <cstddef>
 #include <iosfwd>
-#include <Kokkos_OpenMPTargetSpace.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
@@ -141,7 +141,6 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> {
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
-#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp>
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
index f30abb0c87..de8e629831 100644
--- a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
@@ -33,8 +33,8 @@
 #include <iostream>
 #include <sstream>
-#include <Kokkos_OpenMPTarget.hpp>
-#include <Kokkos_OpenMPTargetSpace.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_MemorySpace.hpp>
diff --git a/core/src/Kokkos_OpenMPTargetSpace.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp
similarity index 100%
rename from core/src/Kokkos_OpenMPTargetSpace.hpp
rename to core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
deleted file mode 100644
index 6d62a3c7e4..0000000000
--- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
+++ /dev/null
@@ -1,1929 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//@HEADER
-
-#ifndef KOKKOS_OPENMPTARGETEXEC_HPP
-#define KOKKOS_OPENMPTARGETEXEC_HPP
-
-#include <impl/Kokkos_Traits.hpp>
-#include <Kokkos_Atomic.hpp>
-
-#include <omp.h>
-#include "Kokkos_OpenMPTarget_Abort.hpp"
-
-// FIXME_OPENMPTARGET - Using this macro to implement a workaround for
-// hierarchical reducers. It avoids hitting the code path which we wanted to
-// write but doesn't work. undef'ed at the end.
-// Intel compilers prefer the non-workaround version.
-#ifndef KOKKOS_ARCH_INTEL_GPU
-#define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
-#endif
-
-// FIXME_OPENMPTARGET - Using this macro to implement a workaround for
-// hierarchical scan. It avoids hitting the code path which we wanted to
-// write but doesn't work. undef'ed at the end.
-#ifndef KOKKOS_ARCH_INTEL_GPU
-#define KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
-#endif
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <class Reducer>
-struct OpenMPTargetReducerWrapper {
-  using value_type = typename Reducer::value_type;
-
-  // Using a generic unknown Reducer for the OpenMPTarget backend is not
-  // implemented.
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type&, const value_type&) = delete;
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type&, const volatile value_type&) = delete;
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type&) = delete;
-};
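For readers unfamiliar with how a join/init pair like the wrappers below plugs into an OpenMP user-defined reduction, here is a minimal, self-contained host-side sketch. It is illustrative only and not part of the patch; MinMaxPair, its identity values, and the reduction name are hypothetical stand-ins.

    #include <cstdio>

    struct MinMaxPair {
      double min_val;
      double max_val;
    };

    // join: the combiner the reduction applies pairwise (cf. Wrapper::join)
    static void join(MinMaxPair& dest, const MinMaxPair& src) {
      if (src.min_val < dest.min_val) dest.min_val = src.min_val;
      if (src.max_val > dest.max_val) dest.max_val = src.max_val;
    }

    // init: the identity each private copy starts from (cf. Wrapper::init)
    static void init(MinMaxPair& v) {
      v.min_val = 1e300;   // stand-in for reduction_identity<double>::min()
      v.max_val = -1e300;  // stand-in for reduction_identity<double>::max()
    }

    #pragma omp declare reduction(minmax : MinMaxPair : join(omp_out, omp_in)) \
        initializer(init(omp_priv))

    int main() {
      MinMaxPair r;
      init(r);
    #pragma omp parallel for reduction(minmax : r)
      for (int i = 0; i < 1000; ++i) {
        const double x = 0.001 * i;
        if (x < r.min_val) r.min_val = x;
        if (x > r.max_val) r.max_val = x;
      }
      std::printf("min %g max %g\n", r.min_val, r.max_val);  // min 0 max 0.999
    }

The specializations that follow provide exactly these two hooks, per built-in reducer, so the backend can emit a matching declare-reduction.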
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) { dest += src; }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest += src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::sum();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) { dest *= src; }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest *= src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::prod();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src < dest) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src < dest) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::min();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src > dest) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src > dest) dest = src;
-  }
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::max();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest = dest && src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest = dest && src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::land();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  using result_view_type = Kokkos::View<value_type, Space>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest = dest || src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest = dest || src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::lor();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest = dest & src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest = dest & src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::band();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
- public:
-  // Required
-  using value_type = std::remove_cv_t<Scalar>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest = dest | src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest = dest | src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val = reduction_identity<value_type>::bor();
-  }
-};
-
-template <class Scalar, class Index, class Space>
-struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-  using index_type  = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = ValLocScalar<scalar_type, index_type>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src.val < dest.val) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src.val < dest.val) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.val = reduction_identity<scalar_type>::min();
-    val.loc = reduction_identity<index_type>::min();
-  }
-};
-
-template <class Scalar, class Index, class Space>
-struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-  using index_type  = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = ValLocScalar<scalar_type, index_type>;
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src.val > dest.val) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src.val > dest.val) dest = src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.val = reduction_identity<scalar_type>::max();
-    val.loc = reduction_identity<index_type>::min();
-  }
-};
-
-template <class Scalar, class Space>
-struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-
- public:
-  // Required
-  using value_type = MinMaxScalar<scalar_type>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-    }
-    if (src.max_val > dest.max_val) {
-      dest.max_val = src.max_val;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-    }
-    if (src.max_val > dest.max_val) {
-      dest.max_val = src.max_val;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.max_val = reduction_identity<scalar_type>::max();
-    val.min_val = reduction_identity<scalar_type>::min();
-  }
-};
-
-template <class Scalar, class Index, class Space>
-struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-  using index_type  = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = MinMaxLocScalar<scalar_type, index_type>;
-
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-      dest.min_loc = src.min_loc;
-    }
-    if (src.max_val > dest.max_val) {
-      dest.max_val = src.max_val;
-      dest.max_loc = src.max_loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-      dest.min_loc = src.min_loc;
-    }
-    if (src.max_val > dest.max_val) {
-      dest.max_val = src.max_val;
-      dest.max_loc = src.max_loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.max_val = reduction_identity<scalar_type>::max();
-    val.min_val = reduction_identity<scalar_type>::min();
-    val.max_loc = reduction_identity<index_type>::min();
-    val.min_loc = reduction_identity<index_type>::min();
-  }
-};
-
-//
-// specialize for MaxFirstLoc
-//
-template <class Scalar, class Index, class Space>
-struct OpenMPTargetReducerWrapper<MaxFirstLoc<Scalar, Index, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-  using index_type  = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = ValLocScalar<scalar_type, index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (dest.val < src.val) {
-      dest = src;
-    } else if (!(src.val < dest.val)) {
-      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (dest.val < src.val) {
-      dest = src;
-    } else if (!(src.val < dest.val)) {
-      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.val = reduction_identity<scalar_type>::max();
-    val.loc = reduction_identity<index_type>::min();
-  }
-#pragma omp end declare target
-};
-
-//
-// specialize for MinFirstLoc
-//
-template <class Scalar, class Index, class Space>
-struct OpenMPTargetReducerWrapper<MinFirstLoc<Scalar, Index, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-  using index_type  = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = ValLocScalar<scalar_type, index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src.val < dest.val) {
-      dest = src;
-    } else if (!(dest.val < src.val)) {
-      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src.val < dest.val) {
-      dest = src;
-    } else if (!(dest.val < src.val)) {
-      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.val = reduction_identity<scalar_type>::min();
-    val.loc = reduction_identity<index_type>::min();
-  }
-#pragma omp end declare target
-};
-
-//
-// specialize for MinMaxFirstLastLoc
-//
-template <class Scalar, class Index, class Space>
-struct OpenMPTargetReducerWrapper<MinMaxFirstLastLoc<Scalar, Index, Space>> {
- private:
-  using scalar_type = std::remove_cv_t<Scalar>;
-  using index_type  = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = MinMaxLocScalar<scalar_type, index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-      dest.min_loc = src.min_loc;
-    } else if (!(dest.min_val < src.min_val)) {
-      dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc;
-    }
-
-    if (dest.max_val < src.max_val) {
-      dest.max_val = src.max_val;
-      dest.max_loc = src.max_loc;
-    } else if (!(src.max_val < dest.max_val)) {
-      dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    if (src.min_val < dest.min_val) {
-      dest.min_val = src.min_val;
-      dest.min_loc = src.min_loc;
-    } else if (!(dest.min_val < src.min_val)) {
-      dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc;
-    }
-
-    if (dest.max_val < src.max_val) {
-      dest.max_val = src.max_val;
-      dest.max_loc = src.max_loc;
-    } else if (!(src.max_val < dest.max_val)) {
-      dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc;
-    }
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.max_val = reduction_identity<scalar_type>::max();
-    val.min_val = reduction_identity<scalar_type>::min();
-    val.max_loc = reduction_identity<index_type>::max();
-    val.min_loc = reduction_identity<index_type>::min();
-  }
-#pragma omp end declare target
-};
-
-//
-// specialize for FirstLoc
-//
-template <class Index, class Space>
-struct OpenMPTargetReducerWrapper<FirstLoc<Index, Space>> {
- private:
-  using index_type = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = FirstLocScalar<index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest.min_loc_true = (src.min_loc_true < dest.min_loc_true)
-                            ? src.min_loc_true
-                            : dest.min_loc_true;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest.min_loc_true = (src.min_loc_true < dest.min_loc_true)
-                            ? src.min_loc_true
-                            : dest.min_loc_true;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.min_loc_true = reduction_identity<index_type>::min();
-  }
-#pragma omp end declare target
-};
-
-//
-// specialize for LastLoc
-//
-template <class Index, class Space>
-struct OpenMPTargetReducerWrapper<LastLoc<Index, Space>> {
- private:
-  using index_type = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = LastLocScalar<index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest.max_loc_true = (src.max_loc_true > dest.max_loc_true)
-                            ? src.max_loc_true
-                            : dest.max_loc_true;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest.max_loc_true = (src.max_loc_true > dest.max_loc_true)
-                            ? src.max_loc_true
-                            : dest.max_loc_true;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.max_loc_true = reduction_identity<index_type>::max();
-  }
-#pragma omp end declare target
-};
-
-//
-// specialize for StdIsPartitioned
-//
-template <class Index, class Space>
-struct OpenMPTargetReducerWrapper<StdIsPartitioned<Index, Space>> {
- private:
-  using index_type = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = StdIsPartScalar<index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest.max_loc_true = (dest.max_loc_true < src.max_loc_true)
-                            ? src.max_loc_true
-                            : dest.max_loc_true;
-
-    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
-                             ? dest.min_loc_false
-                             : src.min_loc_false;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest.max_loc_true = (dest.max_loc_true < src.max_loc_true)
-                            ? src.max_loc_true
-                            : dest.max_loc_true;
-
-    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
-                             ? dest.min_loc_false
-                             : src.min_loc_false;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.max_loc_true  = ::Kokkos::reduction_identity<index_type>::max();
-    val.min_loc_false = ::Kokkos::reduction_identity<index_type>::min();
-  }
-#pragma omp end declare target
-};
-
-//
-// specialize for StdPartitionPoint
-//
-template <class Index, class Space>
-struct OpenMPTargetReducerWrapper<StdPartitionPoint<Index, Space>> {
- private:
-  using index_type = std::remove_cv_t<Index>;
-
- public:
-  // Required
-  using value_type = StdPartPointScalar<index_type>;
-
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
-  // Required
-  KOKKOS_INLINE_FUNCTION
-  static void join(value_type& dest, const value_type& src) {
-    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
-                             ? dest.min_loc_false
-                             : src.min_loc_false;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void join(volatile value_type& dest, const volatile value_type& src) {
-    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
-                             ? dest.min_loc_false
-                             : src.min_loc_false;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  static void init(value_type& val) {
-    val.min_loc_false = ::Kokkos::reduction_identity<index_type>::min();
-  }
-#pragma omp end declare target
-};
-
-/*
-template <class ReducerType>
-class OpenMPTargetReducerWrapper {
- public:
-  const ReducerType& reducer;
-  using value_type = typename ReducerType::value_type;
-  value_type& value;
-
-  KOKKOS_INLINE_FUNCTION
-  void join(const value_type& upd) {
-    reducer.join(value,upd);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void init(const value_type& upd) {
-    reducer.init(value,upd);
-  }
-};*/
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-/** \brief Data for OpenMPTarget thread execution */
-
-class OpenMPTargetExec {
- public:
-  // FIXME_OPENMPTARGET - Currently the maximum number of
-  // teams possible is calculated based on NVIDIA's Volta GPU. In
-  // future this value should be based on the chosen architecture for the
-  // OpenMPTarget backend.
-  static constexpr int MAX_ACTIVE_THREADS = 2080 * 80;
-  static constexpr int MAX_ACTIVE_TEAMS   = MAX_ACTIVE_THREADS / 32;
-
- private:
-  static void* scratch_ptr;
-
- public:
-  static void verify_is_process(const char* const);
-  static void verify_initialized(const char* const);
-
-  static int* get_lock_array(int num_teams);
-  static void* get_scratch_ptr();
-  static void clear_scratch();
-  static void clear_lock_array();
-  static void resize_scratch(int64_t team_reduce_bytes,
-                             int64_t team_shared_bytes,
-                             int64_t thread_local_bytes, int64_t league_size);
-
-  static void* m_scratch_ptr;
-  static int64_t m_scratch_size;
-  static int* m_lock_array;
-  static int64_t m_lock_size;
-  static uint32_t* m_uniquetoken_ptr;
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-class OpenMPTargetExecTeamMember {
- public:
-  static constexpr int TEAM_REDUCE_SIZE = 512;
-
-  using execution_space      = Kokkos::Experimental::OpenMPTarget;
-  using scratch_memory_space = execution_space::scratch_memory_space;
-  using team_handle          = OpenMPTargetExecTeamMember;
-
-  scratch_memory_space m_team_shared;
-  size_t m_team_scratch_size[2];
-  int m_team_rank;
-  int m_team_size;
-  int m_league_rank;
-  int m_league_size;
-  int m_vector_length;
-  int m_vector_lane;
-  int m_shmem_block_index;
-  void* m_glb_scratch;
-  void* m_reduce_scratch;
-
- public:
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& team_shmem() const {
-    return m_team_shared.set_team_thread_mode(0, 1, 0);
-  }
-
-  // set_team_thread_mode routine parameters for future understanding:
-  // first parameter - scratch level.
-  // second parameter - size multiplier for advancing scratch ptr after a
-  // request was serviced. third parameter - offset size multiplier from current
-  // scratch ptr when returning a ptr for a request.
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& team_scratch(int level) const {
-    return m_team_shared.set_team_thread_mode(level, 1, 0);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& thread_scratch(int level) const {
-    return m_team_shared.set_team_thread_mode(level, team_size(), team_rank());
-  }
-
-  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
-  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
-  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; }
-  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
-  KOKKOS_INLINE_FUNCTION void* impl_reduce_scratch() const {
-    return m_reduce_scratch;
-  }
-
-  KOKKOS_INLINE_FUNCTION void team_barrier() const {
-#pragma omp barrier
-  }
-
-  template <class ValueType>
-  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value,
-                                             int thread_id) const {
-    // Make sure there is enough scratch space:
-    using type = std::conditional_t<(sizeof(ValueType) < TEAM_REDUCE_SIZE),
-                                    ValueType, void>;
-    type* team_scratch =
-        reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
-                                TEAM_REDUCE_SIZE * omp_get_team_num());
-#pragma omp barrier
-    if (team_rank() == thread_id) *team_scratch = value;
-#pragma omp barrier
-    value = *team_scratch;
-  }
-
-  template <class Closure, class ValueType>
-  KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure& f, ValueType& value,
-                                             const int& thread_id) const {
-    f(value);
-    team_broadcast(value, thread_id);
-  }
-
-  // FIXME_OPENMPTARGET this function has the wrong interface and currently
-  // ignores the reducer passed.
-  template <class ValueType, class JoinOp>
-  KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value,
-                                               const JoinOp&) const {
-#pragma omp barrier
-
-    using value_type = ValueType;
-    // const JoinLambdaAdapter<value_type, JoinOp> op(op_in);
-
-    // Make sure there is enough scratch space:
-    using type = std::conditional_t<(sizeof(value_type) < TEAM_REDUCE_SIZE),
-                                    value_type, void>;
-
-    const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type);
-    type* team_scratch =
-        reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
-                                TEAM_REDUCE_SIZE * omp_get_team_num());
-    for (int i = m_team_rank; i < n_values; i += m_team_size) {
-      team_scratch[i] = value_type();
-    }
-
-#pragma omp barrier
-
-    for (int k = 0; k < m_team_size; k += n_values) {
-      if ((k <= m_team_rank) && (k + n_values > m_team_rank))
-        team_scratch[m_team_rank % n_values] += value;
-#pragma omp barrier
-    }
-
-    for (int d = 1; d < n_values; d *= 2) {
-      if ((m_team_rank + d < n_values) && (m_team_rank % (2 * d) == 0)) {
-        team_scratch[m_team_rank] += team_scratch[m_team_rank + d];
-      }
-#pragma omp barrier
-    }
-    return team_scratch[0];
-  }
-
-  /** \brief Intra-team exclusive prefix sum with team_rank() ordering
-   *         with intra-team non-deterministic ordering accumulation.
-   *
-   *  The global inter-team accumulation value will, at the end of the
-   *  league's parallel execution, be the scan's total.
-   *  Parallel execution ordering of the league's teams is non-deterministic.
-   *  As such the base value for each team's scan operation is similarly
-   *  non-deterministic.
-   */
-  template <typename ArgType>
-  KOKKOS_INLINE_FUNCTION ArgType
-  team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const {
-    // FIXME_OPENMPTARGET
-    /*  // Make sure there is enough scratch space:
-      using type =
-        std::conditional_t<(sizeof(ArgType) < TEAM_REDUCE_SIZE), ArgType, void>;
-
-      volatile type * const work_value  = ((type*) m_exec.scratch_thread());
-
-      *work_value = value ;
-
-      memory_fence();
-
-      if ( team_fan_in() ) {
-        // The last thread to synchronize returns true, all other threads wait
-        for team_fan_out()
-        // m_team_base[0]                 == highest ranking team member
-        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
-        //
-        // 1) copy from lower to higher rank, initialize lowest rank to zero
-        // 2) prefix sum from lowest to highest rank, skipping lowest rank
-
-        type accum = 0 ;
-
-        if ( global_accum ) {
-          for ( int i = m_team_size ; i-- ; ) {
-            type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i
-      )->scratch_thread()); accum += val ;
-          }
-          accum = atomic_fetch_add( global_accum , accum );
-        }
-
-        for ( int i = m_team_size ; i-- ; ) {
-          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i
-      )->scratch_thread()); const type offset = accum ; accum += val ; val =
-      offset ;
-        }
-
-        memory_fence();
-      }
-
-      team_fan_out();
-
-      return *work_value ;*/
-    return ArgType();
-  }
-
-  /** \brief Intra-team exclusive prefix sum with team_rank() ordering.
-   *
-   *  The highest rank thread can compute the reduction total as
-   *    reduction_total = dev.team_scan( value ) + value ;
-   */
-  template <typename Type>
-  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const {
-    return this->template team_scan<Type>(value, 0);
-  }
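The scan contract documented above — the operator contributes its term whether or not `final` is set, and only reads the exclusive prefix on the `final == true` pass — is easy to see in a plain serial sketch (illustrative only; array names are hypothetical):

    #include <cstdio>

    int main() {
      double data[5] = {1, 2, 3, 4, 5};
      double out[5];
      double val = 0;  // running exclusive prefix
      auto scan_op = [&](int i, double& v, bool final) {
        if (final) out[i] = v;  // on the final pass v is the prefix sum
        v += data[i];           // the i-th contribution is added either way
      };
      // serial execution: the operator is called once, with final == true
      for (int i = 0; i < 5; ++i) scan_op(i, val, true);
      for (double v : out) std::printf("%g ", v);  // prints: 0 1 3 6 10
      std::printf("\n");
    }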
-
-  //----------------------------------------
-  // Private for the driver
-
- private:
-  using space = execution_space::scratch_memory_space;
-
- public:
-  // FIXME_OPENMPTARGET - 512(16*32) bytes at the begining of the scratch space
-  // for each league is saved for reduction. It should actually be based on the
-  // ValueType of the reduction variable.
-  inline OpenMPTargetExecTeamMember(
-      const int league_rank, const int league_size, const int team_size,
-      const int vector_length  // const TeamPolicyInternal< OpenMPTarget,
-                               // Properties ...> & team
-      ,
-      void* const glb_scratch, const int shmem_block_index,
-      const size_t shmem_size_L0, const size_t shmem_size_L1)
-      : m_team_scratch_size{shmem_size_L0, shmem_size_L1},
-        m_team_rank(0),
-        m_team_size(team_size),
-        m_league_rank(league_rank),
-        m_league_size(league_size),
-        m_vector_length(vector_length),
-        m_shmem_block_index(shmem_block_index),
-        m_glb_scratch(glb_scratch) {
-    const int omp_tid = omp_get_thread_num();
-
-    // The scratch memory allocated is a sum of TEAM_REDUCE_SIZE, L0 shmem size
-    // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for
-    // hierarchical reduction. There is an additional 10% of the requested
-    // scratch memory allocated per team as padding. Hence the product with 0.1.
-    const int reduce_offset =
-        m_shmem_block_index *
-        (shmem_size_L0 + shmem_size_L1 +
-         ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE);
-    const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE;
-    const int l1_offset = l0_offset + shmem_size_L0;
-    m_team_shared = scratch_memory_space(
-        (static_cast<char*>(glb_scratch) + l0_offset), shmem_size_L0,
-        static_cast<char*>(glb_scratch) + l1_offset, shmem_size_L1);
-    m_reduce_scratch = static_cast<char*>(glb_scratch) + reduce_offset;
-    m_league_rank    = league_rank;
-    m_team_rank      = omp_tid;
-    m_vector_lane    = 0;
-  }
-
-  static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; }
-};
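To make the offset arithmetic in the constructor above concrete, here is a standalone sketch (illustrative only; the sizes and block index are hypothetical, and the cast placement is an assumption — the original relies on an implicit double-to-int conversion):

    #include <cstdio>

    int main() {
      constexpr int TEAM_REDUCE_SIZE = 512;
      const int shmem_size_L0 = 1024, shmem_size_L1 = 4096;
      const int block = 7;  // hypothetical shmem_block_index of this team
      // per-team stride: reduction buffer + L0 + L1 + 10% padding
      const int stride = static_cast<int>(
          shmem_size_L0 + shmem_size_L1 +
          (shmem_size_L0 + shmem_size_L1) * 0.1 + TEAM_REDUCE_SIZE);
      const int reduce_offset = block * stride;                    // 43008
      const int l0_offset     = reduce_offset + TEAM_REDUCE_SIZE;  // 43520
      const int l1_offset     = l0_offset + shmem_size_L0;         // 44544
      std::printf("reduce %d, L0 %d, L1 %d\n", reduce_offset, l0_offset,
                  l1_offset);
    }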
-
-template <class... Properties>
-class TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...>
-    : public PolicyTraits<Properties...> {
- public:
-  //! Tag this class as a kokkos execution policy
-  using execution_policy = TeamPolicyInternal;
-
-  using traits = PolicyTraits<Properties...>;
-
-  //----------------------------------------
-
-  template <class FunctorType>
-  inline static int team_size_max(const FunctorType&, const ParallelForTag&) {
-    return 256;
-  }
-
-  template <class FunctorType>
-  inline static int team_size_max(const FunctorType&,
-                                  const ParallelReduceTag&) {
-    return 256;
-  }
-
-  template <class FunctorType, class ReducerType>
-  inline static int team_size_max(const FunctorType&, const ReducerType&,
-                                  const ParallelReduceTag&) {
-    return 256;
-  }
-
-  template <class FunctorType>
-  inline static int team_size_recommended(const FunctorType&,
-                                          const ParallelForTag&) {
-    return 128;
-  }
-
-  template <class FunctorType>
-  inline static int team_size_recommended(const FunctorType&,
-                                          const ParallelReduceTag&) {
-    return 128;
-  }
-
-  template <class FunctorType, class ReducerType>
-  inline static int team_size_recommended(const FunctorType&,
-                                          const ReducerType&,
-                                          const ParallelReduceTag&) {
-    return 128;
-  }
-
-  //----------------------------------------
-
- private:
-  int m_league_size;
-  int m_team_size;
-  int m_vector_length;
-  int m_team_alloc;
-  int m_team_iter;
-  std::array<size_t, 2> m_team_scratch_size;
-  std::array<size_t, 2> m_thread_scratch_size;
-  bool m_tune_team_size;
-  bool m_tune_vector_length;
-  constexpr const static size_t default_team_size = 256;
-  int m_chunk_size;
-
-  inline void init(const int league_size_request, const int team_size_request,
-                   const int vector_length_request) {
-    m_league_size = league_size_request;
-
-    // Minimum team size should be 32 for OpenMPTarget backend.
-    if (team_size_request < 32) {
-      Kokkos::Impl::OpenMPTarget_abort(
-          "OpenMPTarget backend requires a minimum of 32 threads per team.\n");
-    } else
-      m_team_size = team_size_request;
-
-    m_vector_length = vector_length_request;
-    set_auto_chunk_size();
-  }
-
-  template <typename ExecSpace, typename... OtherProperties>
-  friend class TeamPolicyInternal;
-
- public:
-  // FIXME_OPENMPTARGET : Currently this routine is a copy of the Cuda
-  // implementation, but this has to be tailored to be architecture specific.
-  inline static int scratch_size_max(int level) {
-    return (
-        level == 0 ? 1024 * 40 :  // 48kB is the max for CUDA, but we need some
-                                  // for team_member.reduce etc.
-            20 * 1024 *
-                1024);  // arbitrarily setting this to 20MB, for a Volta V100
-                        // that would give us about 3.2GB for 2 teams per SM
-  }
-  inline bool impl_auto_team_size() const { return m_tune_team_size; }
-  inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
-  inline void impl_set_team_size(const size_t size) { m_team_size = size; }
-  inline void impl_set_vector_length(const size_t length) {
-    m_tune_vector_length = length;
-  }
-  inline int impl_vector_length() const { return m_vector_length; }
-  inline int team_size() const { return m_team_size; }
-  inline int league_size() const { return m_league_size; }
-  inline size_t scratch_size(const int& level, int team_size_ = -1) const {
-    if (team_size_ < 0) team_size_ = m_team_size;
-    return m_team_scratch_size[level] +
-           team_size_ * m_thread_scratch_size[level];
-  }
-
-  inline Kokkos::Experimental::OpenMPTarget space() const {
-    return Kokkos::Experimental::OpenMPTarget();
-  }
-
-  template <class... OtherProperties>
-  TeamPolicyInternal(const TeamPolicyInternal<
-                     Kokkos::Experimental::OpenMPTarget, OtherProperties...>& p)
-      : m_league_size(p.m_league_size),
-        m_team_size(p.m_team_size),
-        m_vector_length(p.m_vector_length),
-        m_team_alloc(p.m_team_alloc),
-        m_team_iter(p.m_team_iter),
-        m_team_scratch_size(p.m_team_scratch_size),
-        m_thread_scratch_size(p.m_thread_scratch_size),
-        m_tune_team_size(p.m_tune_team_size),
-        m_tune_vector_length(p.m_tune_vector_length),
-        m_chunk_size(p.m_chunk_size) {}
-
-  /** \brief Specify league size, request team size */
-  TeamPolicyInternal(const typename traits::execution_space&,
-                     int league_size_request, int team_size_request,
-                     int vector_length_request = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(false),
-        m_tune_vector_length(false),
-        m_chunk_size(0) {
-    init(league_size_request, team_size_request, vector_length_request);
-  }
-
-  TeamPolicyInternal(const typename traits::execution_space&,
-                     int league_size_request,
-                     const Kokkos::AUTO_t& /* team_size_request */
-                     ,
-                     int vector_length_request = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(true),
-        m_tune_vector_length(false),
-        m_chunk_size(0) {
-    init(league_size_request, default_team_size / vector_length_request,
-         vector_length_request);
-  }
-
-  TeamPolicyInternal(const typename traits::execution_space&,
-                     int league_size_request,
-                     const Kokkos::AUTO_t& /* team_size_request */
-                     ,
-                     const Kokkos::AUTO_t& /* vector_length_request */)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(true),
-        m_tune_vector_length(true),
-        m_chunk_size(0) {
-    init(league_size_request, default_team_size, 1);
-  }
-  TeamPolicyInternal(const typename traits::execution_space&,
-                     int league_size_request, int team_size_request,
-                     const Kokkos::AUTO_t& /* vector_length_request */)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(false),
-        m_tune_vector_length(true),
-        m_chunk_size(0) {
-    init(league_size_request, team_size_request, 1);
-  }
-
-  TeamPolicyInternal(int league_size_request, int team_size_request,
-                     int vector_length_request = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(false),
-        m_tune_vector_length(false),
-        m_chunk_size(0) {
-    init(league_size_request, team_size_request, vector_length_request);
-  }
-
-  TeamPolicyInternal(int league_size_request,
-                     const Kokkos::AUTO_t& /* team_size_request */
-                     ,
-                     int vector_length_request = 1)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(true),
-        m_tune_vector_length(false),
-        m_chunk_size(0) {
-    init(league_size_request, default_team_size / vector_length_request,
-         vector_length_request);
-  }
-
-  TeamPolicyInternal(int league_size_request,
-                     const Kokkos::AUTO_t& /* team_size_request */
-                     ,
-                     const Kokkos::AUTO_t& /* vector_length_request */)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(true),
-        m_tune_vector_length(true),
-        m_chunk_size(0) {
-    init(league_size_request, default_team_size, 1);
-  }
-  TeamPolicyInternal(int league_size_request, int team_size_request,
-                     const Kokkos::AUTO_t& /* vector_length_request */)
-      : m_team_scratch_size{0, 0},
-        m_thread_scratch_size{0, 0},
-        m_tune_team_size(false),
-        m_tune_vector_length(true),
-        m_chunk_size(0) {
-    init(league_size_request, team_size_request, 1);
-  }
-  inline static size_t vector_length_max() {
-    return 32; /* TODO: this is bad. Need logic that is compiler and backend
-                  aware */
-  }
-  inline int team_alloc() const { return m_team_alloc; }
-  inline int team_iter() const { return m_team_iter; }
-
-  inline int chunk_size() const { return m_chunk_size; }
-
-  /** \brief set chunk_size to a discrete value*/
-  inline TeamPolicyInternal& set_chunk_size(
-      typename traits::index_type chunk_size_) {
-    m_chunk_size = chunk_size_;
-    return *this;
-  }
-
-  /** \brief set per team scratch size for a specific level of the scratch
-   * hierarchy */
-  inline TeamPolicyInternal& set_scratch_size(const int& level,
-                                              const PerTeamValue& per_team) {
-    m_team_scratch_size[level] = per_team.value;
-    return *this;
-  }
-
-  /** \brief set per thread scratch size for a specific level of the scratch
-   * hierarchy */
-  inline TeamPolicyInternal& set_scratch_size(
-      const int& level, const PerThreadValue& per_thread) {
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-
-  /** \brief set per thread and per team scratch size for a specific level of
-   * the scratch hierarchy */
-  inline TeamPolicyInternal& set_scratch_size(
-      const int& level, const PerTeamValue& per_team,
-      const PerThreadValue& per_thread) {
-    m_team_scratch_size[level]   = per_team.value;
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-
- private:
-  /** \brief finalize chunk_size if it was set to AUTO*/
-  inline void set_auto_chunk_size() {
-    int concurrency = 2048 * 128;
-
-    if (concurrency == 0) concurrency = 1;
-
-    if (m_chunk_size > 0) {
-      if (!Impl::is_integral_power_of_two(m_chunk_size))
-        Kokkos::abort("TeamPolicy blocking granularity must be power of two");
-    }
-
-    int new_chunk_size = 1;
-    while (new_chunk_size * 100 * concurrency < m_league_size)
-      new_chunk_size *= 2;
-    if (new_chunk_size < 128) {
-      new_chunk_size = 1;
-      while ((new_chunk_size * 40 * concurrency < m_league_size) &&
-             (new_chunk_size < 128))
-        new_chunk_size *= 2;
-    }
-    m_chunk_size = new_chunk_size;
-  }
-
- public:
-  using member_type = Impl::OpenMPTargetExecTeamMember;
-};
-}  // namespace Impl
-
-}  // namespace Kokkos
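From the user's side, the policy above is reached through Kokkos::TeamPolicy. A minimal usage sketch (illustrative only; assumes a Kokkos build with the OpenMPTarget backend enabled, and note init() enforces a team size of at least 32):

    #include <Kokkos_Core.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        using policy_t = Kokkos::TeamPolicy<Kokkos::Experimental::OpenMPTarget>;
        // 1024 teams, team size chosen by the backend (AUTO),
        // 512 bytes of level-0 scratch per team plus 32 bytes per thread
        policy_t policy(1024, Kokkos::AUTO);
        policy.set_scratch_size(0, Kokkos::PerTeam(512), Kokkos::PerThread(32));
        Kokkos::parallel_for(
            "example", policy,
            KOKKOS_LAMBDA(const policy_t::member_type& team) {
              // each team sees league_rank() in [0, 1024)
              (void)team.league_rank();
            });
      }
      Kokkos::finalize();
    }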
-
-namespace Kokkos {
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
-TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                const iType& count) {
-  return Impl::TeamThreadRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
-}
-
-template <typename iType1, typename iType2>
-KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
-    std::common_type_t<iType1, iType2>, Impl::OpenMPTargetExecTeamMember>
-TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                const iType1& begin, const iType2& end) {
-  using iType = std::common_type_t<iType1, iType2>;
-  return Impl::TeamThreadRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(begin),
-                                               iType(end));
-}
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
-ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                  const iType& count) {
-  return Impl::ThreadVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
-}
-
-template <typename iType1, typename iType2>
-KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
-    std::common_type_t<iType1, iType2>, Impl::OpenMPTargetExecTeamMember>
-ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                  const iType1& arg_begin, const iType2& arg_end) {
-  using iType = std::common_type_t<iType1, iType2>;
-  return Impl::ThreadVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
-                                               iType(arg_end));
-}
-
-template <typename iType>
-KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
-    iType, Impl::OpenMPTargetExecTeamMember>
-TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                const iType& count) {
-  return Impl::TeamVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, count);
-}
-
-template <typename iType1, typename iType2>
-KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
-    std::common_type_t<iType1, iType2>, Impl::OpenMPTargetExecTeamMember>
-TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread,
-                const iType1& arg_begin, const iType2& arg_end) {
-  using iType = std::common_type_t<iType1, iType2>;
-  return Impl::TeamVectorRangeBoundariesStruct<
-      iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin),
-                                               iType(arg_end));
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> PerTeam(
-    const Impl::OpenMPTargetExecTeamMember& thread) {
-  return Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread);
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> PerThread(
-    const Impl::OpenMPTargetExecTeamMember& thread) {
-  return Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread);
-}
-}  // namespace Kokkos
-
-namespace Kokkos {
-
-/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each
- * i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the the calling thread team.
- */
-template <typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION void parallel_for(
-    const Impl::TeamThreadRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda) {
-#pragma omp for nowait schedule(static, 1)
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
-}
-
-/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i,
- * ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the the calling thread team
- * and a summation of val is performed and put into result.
- */
-
-template <typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<!Kokkos::is_reducer<ValueType>::value>
-parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-                const Lambda& lambda, ValueType& result) {
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-  ValueType* TeamThread_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-#pragma omp barrier
-  TeamThread_scratch[0] = ValueType();
-#pragma omp barrier
-
-  if constexpr (std::is_arithmetic<ValueType>::value) {
-#pragma omp for reduction(+ : TeamThread_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      TeamThread_scratch[0] += tmp;
-    }
-  } else {
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
-
-#pragma omp for reduction(custom : TeamThread_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      TeamThread_scratch[0] += tmp;
-    }
-  }
-
-  result = TeamThread_scratch[0];
-}
-
-#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
-// For some reason the actual version we wanted to write doesn't work
-// and crashes. We should try this with every new compiler
-// This is the variant we actually wanted to write
-template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
-parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-                const Lambda& lambda, ReducerType result) {
-  using ValueType = typename ReducerType::value_type;
-
-#pragma omp declare reduction(                                               \
-    custominner:ValueType                                                    \
-    : Impl::OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in))  \
-    initializer(                                                             \
-        Impl::OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv))
-
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-  ValueType* TeamThread_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-#pragma omp barrier
-  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamThread_scratch[0]);
-#pragma omp barrier
-
-#pragma omp for reduction(custominner : TeamThread_scratch[:1])
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    lambda(i, TeamThread_scratch[0]);
-  }
-  result.reference() = TeamThread_scratch[0];
-}
-#else
-template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
-parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
-                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-                const Lambda& lambda, ReducerType result) {
-  using ValueType = typename ReducerType::value_type;
-
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-  ValueType* TeamThread_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-#pragma omp declare reduction(                                               \
-    omp_red_teamthread_reducer:ValueType                                     \
-    : Impl::OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in))  \
-    initializer(                                                             \
-        Impl::OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv))
-
-#pragma omp barrier
-  ValueType tmp;
-  result.init(tmp);
-  TeamThread_scratch[0] = tmp;
-#pragma omp barrier
-
-  iType team_size = iType(omp_get_num_threads());
-#pragma omp for reduction(omp_red_teamthread_reducer \
-                          : TeamThread_scratch[:1]) schedule(static, 1)
-  for (iType t = 0; t < team_size; t++) {
-    ValueType tmp2;
-    result.init(tmp2);
-
-    for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
-         i += team_size) {
-      lambda(i, tmp2);
-    }
-
-    // FIXME_OPENMPTARGET: Join should work but doesn't. Every threads gets a
-    // private TeamThread_scratch[0] and at the end of the for-loop the `join`
-    // operation is performed by OpenMP itself and hence the simple assignment
-    // works.
-    //    result.join(TeamThread_scratch[0], tmp2);
-    TeamThread_scratch[0] = tmp2;
-  }
-
-  result.reference() = TeamThread_scratch[0];
-}
-#endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
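The shape of the workaround above — one worksharing iteration per thread, a private partial accumulated over a strided sub-range, and the join left to OpenMP's reduction clause — can be reproduced in a standalone host program (illustrative only; the sum and bounds are hypothetical):

    #include <omp.h>
    #include <cstdio>

    int main() {
      double result = 0.0;
    #pragma omp parallel
      {
        const int team_size = omp_get_num_threads();
        // one iteration per thread; OpenMP privatizes and later joins 'result'
    #pragma omp for reduction(+ : result) schedule(static, 1)
        for (int t = 0; t < team_size; ++t) {
          double tmp = 0.0;  // thread-private partial result
          for (int i = t; i < 1000; i += team_size) tmp += 0.5 * i;
          result += tmp;  // the reduction clause performs the final join
        }
      }
      std::printf("%g\n", result);  // 249750
    }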
-
-/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i,
- * ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
- * and a reduction of val is performed using JoinType(ValueType& val, const
- * ValueType& update) and put into init_result. The input value of init_result
- * is used as initializer for temporary variables of ValueType. Therefore the
- * input value should be the neutral element with respect to the join operation
- * (e.g. '0 for +-' or '1 for *').
- */
-template <typename iType, class Lambda, typename ValueType, class JoinType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::TeamThreadRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda, const JoinType& join, ValueType& init_result) {
-  ValueType* TeamThread_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-  // FIXME_OPENMPTARGET: Still need to figure out how to get value_count here.
-  const int value_count = 1;
-
-#pragma omp barrier
-  TeamThread_scratch[0] = init_result;
-#pragma omp barrier
-
-#pragma omp for
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    lambda(i, TeamThread_scratch[omp_get_num_threads() * value_count]);
-  }
-
-  // Reduce all partial results within a team.
-  const int team_size      = omp_get_num_threads();
-  int tree_neighbor_offset = 1;
-  do {
-#pragma omp for
-    for (int i = 0; i < team_size - tree_neighbor_offset;
-         i += 2 * tree_neighbor_offset) {
-      const int neighbor = i + tree_neighbor_offset;
-      join(lambda, &TeamThread_scratch[i * value_count],
-           &TeamThread_scratch[neighbor * value_count]);
-    }
-    tree_neighbor_offset *= 2;
-  } while (tree_neighbor_offset < team_size);
-  init_result = TeamThread_scratch[0];
-}
-
-// This is largely the same code as in HIP and CUDA except for the member name
-template <typename iType, typename FunctorType>
-KOKKOS_INLINE_FUNCTION void parallel_scan(
-    const Impl::TeamThreadRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds,
-    const FunctorType& lambda) {
-  using Analysis   = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
-                                         TeamPolicy<Experimental::OpenMPTarget>,
-                                         FunctorType>;
-  using value_type = typename Analysis::value_type;
-
-  const auto start = loop_bounds.start;
-  const auto end   = loop_bounds.end;
-  // Note this thing is called .member in the CUDA specialization of
-  // TeamThreadRangeBoundariesStruct
-  auto& member         = loop_bounds.team;
-  const auto team_rank = member.team_rank();
-
-#if defined(KOKKOS_IMPL_TEAM_SCAN_WORKAROUND)
-  value_type scan_val = value_type();
-
-  if (team_rank == 0) {
-    for (iType i = start; i < end; ++i) {
-      lambda(i, scan_val, true);
-    }
-  }
-#pragma omp barrier
-#else
-  const auto team_size = member.team_size();
-  const auto nchunk    = (end - start + team_size - 1) / team_size;
-  value_type accum     = 0;
-  // each team has to process one or
-  //   more chunks of the prefix scan
-  for (iType i = 0; i < nchunk; ++i) {
-    auto ii = start + i * team_size + team_rank;
-    // local accumulation for this chunk
-    value_type local_accum = 0;
-    // user updates value with prefix value
-    if (ii < loop_bounds.end) lambda(ii, local_accum, false);
-    // perform team scan
-    local_accum = member.team_scan(local_accum);
-    // add this blocks accum to total accumulation
-    auto val = accum + local_accum;
-    // user updates their data with total accumulation
-    if (ii < loop_bounds.end) lambda(ii, val, true);
-    // the last value needs to be propogated to next chunk
-    if (team_rank == team_size - 1) accum = val;
-    // broadcast last value to rest of the team
-    member.team_broadcast(accum, team_size - 1);
-  }
-#endif
-}
-
-}  // namespace Kokkos
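The non-workaround branch above is a chunked scan with a carried total: each chunk is scanned locally and the running carry seeds the next chunk. A plain serial rendering of the same idea (illustrative only; chunk width stands in for team_size):

    #include <cstdio>

    int main() {
      const int n = 10, chunk = 4;
      double in[10], out[10];
      for (int i = 0; i < n; ++i) in[i] = 1.0;
      double accum = 0.0;  // carry propagated between chunks
      for (int base = 0; base < n; base += chunk) {
        double local = 0.0;
        for (int i = base; i < base + chunk && i < n; ++i) {
          out[i] = accum + local;  // exclusive prefix for index i
          local += in[i];          // local scan within the chunk
        }
        accum += local;  // this chunk's total seeds the next chunk
      }
      for (int i = 0; i < n; ++i) std::printf("%g ", out[i]);  // 0 1 2 ... 9
      std::printf("\n");
    }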
-
-namespace Kokkos {
-/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each
- * i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
- */
-template <typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION void parallel_for(
-    const Impl::ThreadVectorRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda) {
-#pragma omp simd
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
-}
-
-/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i,
- * ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
- * and a summation of val is performed and put into result.
- */
-template <typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::ThreadVectorRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda, ValueType& result) {
-  ValueType vector_reduce = ValueType();
-
-  if constexpr (std::is_arithmetic<ValueType>::value) {
-#pragma omp simd reduction(+ : vector_reduce)
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      vector_reduce += tmp;
-    }
-  } else {
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
-
-#pragma omp simd reduction(custom : vector_reduce)
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      lambda(i, vector_reduce);
-    }
-  }
-
-  result = vector_reduce;
-}
-
-template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
-parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
-                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-                const Lambda& lambda, ReducerType const& result) {
-  using ValueType = typename ReducerType::value_type;
-
-#pragma omp declare reduction(                                              \
-    custom:ValueType                                                        \
-    : Impl::OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \
-    initializer(                                                            \
-        Impl::OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv))
-
-  ValueType vector_reduce;
-  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(vector_reduce);
-
-#pragma omp simd reduction(custom : vector_reduce)
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    lambda(i, vector_reduce);
-  }
-
-  result.reference() = vector_reduce;
-}
-
-/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i,
- * ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread
- * and a reduction of val is performed using JoinType(ValueType& val, const
- * ValueType& update) and put into init_result. The input value of init_result
- * is used as initializer for temporary variables of ValueType. Therefore the
- * input value should be the neutral element with respect to the join operation
- * (e.g. '0 for +-' or '1 for *').
- */
-template <typename iType, class Lambda, typename ValueType, class JoinType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::ThreadVectorRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda, const JoinType& join, ValueType& init_result) {
-  ValueType result = init_result;
-
-  // FIXME_OPENMPTARGET think about omp simd
-  // join does not work with omp reduction clause
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    ValueType tmp = ValueType();
-    lambda(i, tmp);
-    join(result, tmp);
-  }
-
-  init_result = result;
-}
-
-/** \brief Intra-thread vector parallel exclusive prefix sum. Executes
- * lambda(iType i, ValueType & val, bool final) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan
- * operation is performed. Depending on the target execution space the operator
- * might be called twice: once with final=false and once with final=true. When
- * final==true val contains the prefix sum value. The contribution of this "i"
- * needs to be added to val no matter whether final==true or not. In a serial
- * execution (i.e. team_size==1) the operator is only called once with
- * final==true. Scan_val will be set to the final sum value over all vector
- * lanes.
- */
-template <typename iType, class FunctorType>
-KOKKOS_INLINE_FUNCTION void parallel_scan(
-    const Impl::ThreadVectorRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const FunctorType& lambda) {
-  using Analysis   = Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
-                                         TeamPolicy<Experimental::OpenMPTarget>,
-                                         FunctorType>;
-  using value_type = typename Analysis::value_type;
-
-  value_type scan_val = value_type();
-
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
-    lambda(i, scan_val, true);
-  }
-}
-
-}  // namespace Kokkos
-
-#ifdef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
-#undef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
-#endif
-
-namespace Kokkos {
-/** \brief Intra-team vector parallel_for. Executes lambda(iType i) for each
- * i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling team.
- */
-template <typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION void parallel_for(
-    const Impl::TeamVectorRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda) {
-#pragma omp for simd nowait schedule(static, 1)
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
-}
-
-/** \brief Intra-team vector parallel_reduce. Executes lambda(iType i,
- * ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling team
- * and a summation of val is performed and put into result.
- */
-template <typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION void parallel_reduce(
-    const Impl::TeamVectorRangeBoundariesStruct<
-        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-    const Lambda& lambda, ValueType& result) {
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-  ValueType* TeamVector_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-#pragma omp barrier
-  TeamVector_scratch[0] = ValueType();
-#pragma omp barrier
-
-  if constexpr (std::is_arithmetic<ValueType>::value) {
-#pragma omp for simd reduction(+ : TeamVector_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      TeamVector_scratch[0] += tmp;
-    }
-  } else {
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
-
-#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      TeamVector_scratch[0] += tmp;
-    }
-  }
-
-  result = TeamVector_scratch[0];
-}
-
-#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
-template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
-parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
-                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-                const Lambda& lambda, ReducerType const& result) {
-  using ValueType = typename ReducerType::value_type;
-
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-#pragma omp declare reduction(                                              \
-    custom:ValueType                                                        \
-    : Impl::OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \
-    initializer(                                                            \
-        Impl::OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv))
-
-  ValueType* TeamVector_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-#pragma omp barrier
-  Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamVector_scratch[0]);
-#pragma omp barrier
-
-#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    lambda(i, TeamVector_scratch[0]);
-  }
-
-  result.reference() = TeamVector_scratch[0];
-}
-#else
-template <typename iType, class Lambda, typename ReducerType>
-KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
-parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
-                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
-                const Lambda& lambda, ReducerType const& result) {
-  using ValueType = typename ReducerType::value_type;
-
-  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
-  // elements in the array <= 32. For reduction we allocate, 16 bytes per
-  // element in the scratch space, hence, 16*32 = 512.
-  static_assert(sizeof(ValueType) <=
-                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
-
-  ValueType* TeamVector_scratch =
-      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
-
-#pragma omp declare reduction(                                              \
-    omp_red_teamthread_reducer:ValueType                                    \
-    : Impl::OpenMPTargetReducerWrapper<ReducerType>::join(omp_out, omp_in)) \
-    initializer(                                                            \
-        Impl::OpenMPTargetReducerWrapper<ReducerType>::init(omp_priv))
-
-#pragma omp barrier
-  ValueType tmp;
-  result.init(tmp);
-  TeamVector_scratch[0] = tmp;
-#pragma omp barrier
-
-  iType team_size = iType(omp_get_num_threads());
-#pragma omp for simd reduction(omp_red_teamthread_reducer \
-                               : TeamVector_scratch[:1]) schedule(static, 1)
-  for (iType t = 0; t < team_size; t++) {
-    ValueType tmp2;
-    result.init(tmp2);
-
-    for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
-         i += team_size) {
-      lambda(i, tmp2);
-    }
-    TeamVector_scratch[0] = tmp2;
-  }
-
-  result.reference() = TeamVector_scratch[0];
-}
-#endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
-}  // namespace Kokkos
-
-#ifdef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
-#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
-#endif
-
-namespace Kokkos {
-
-template <class FunctorType>
-KOKKOS_INLINE_FUNCTION void single(
-    const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>&
-    /*single_struct*/,
-    const FunctorType& lambda) {
-  lambda();
-}
-
-template <class FunctorType>
-KOKKOS_INLINE_FUNCTION void single(
-    const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>&
-        single_struct,
-    const FunctorType& lambda) {
-  if (single_struct.team_member.team_rank() == 0) lambda();
-}
-
-template <class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION void single(
-    const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>&
-    /*single_struct*/,
-    const FunctorType& lambda, ValueType& val) {
-  lambda(val);
-}
-
-template <class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION void single(
-    const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>&
-        single_struct,
-    const FunctorType& lambda, ValueType& val) {
-  if (single_struct.team_member.team_rank() == 0) {
-    lambda(val);
-  }
-  single_struct.team_member.team_broadcast(val, 0);
-}
-}  // namespace Kokkos
-
-#endif /* #ifndef KOKKOS_OPENMPTARGETEXEC_HPP */
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index 4a33961205..564f299ab5 100644
--- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -27,7 +27,7 @@
// constructor. undef'ed at the end #define KOKKOS_IMPL_OPENMPTARGET_WORKAROUND -#include +#include #include #include #include diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index 71ce4b18f2..5e898727f1 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -20,1253 +20,648 @@ #include #include #include -#include +#include +#include + +#include +#include "Kokkos_OpenMPTarget_Abort.hpp" + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- namespace Kokkos { namespace Impl { -template -class ParallelFor, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; - - const FunctorType m_functor; - const Policy m_policy; +class OpenMPTargetExecTeamMember { + public: + static constexpr int TEAM_REDUCE_SIZE = 512; + + using execution_space = Kokkos::Experimental::OpenMPTarget; + using scratch_memory_space = execution_space::scratch_memory_space; + using team_handle = OpenMPTargetExecTeamMember; + + scratch_memory_space m_team_shared; + size_t m_team_scratch_size[2]; + int m_team_rank; + int m_team_size; + int m_league_rank; + int m_league_size; + int m_vector_length; + int m_vector_lane; + int m_shmem_block_index; + void* m_glb_scratch; + void* m_reduce_scratch; public: - void execute() const { execute_impl(); } - - template - void execute_impl() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const auto begin = m_policy.begin(); - const auto end = m_policy.end(); - - if (end <= begin) return; - - FunctorType a_functor(m_functor); - -#pragma omp target teams distribute parallel for map(to : a_functor) - for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { - a_functor(i); - } else { - a_functor(TagType(), i); - } - } + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& team_shmem() const { + return m_team_shared.set_team_thread_mode(0, 1, 0); } - ParallelFor(const FunctorType& arg_functor, Policy arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} -}; + // set_team_thread_mode routine parameters for future understanding: + // first parameter - scratch level. + // second parameter - size multiplier for advancing scratch ptr after a + // request was serviced. third parameter - offset size multiplier from current + // scratch ptr when returning a ptr for a request. 
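// A minimal standalone sketch of the bookkeeping the three
// set_team_thread_mode parameters imply (names below are hypothetical, not
// the actual ScratchMemorySpace implementation): team mode (1, 0) hands every
// thread the same slice, thread mode (team_size, team_rank) hands each thread
// a disjoint slice.
#include <cstddef>

struct ScratchCursorSketch {
  std::byte* base;        // current scratch pointer at the chosen level
  int size_multiplier;    // advance factor once a request is serviced
  int offset_multiplier;  // offset factor applied to the returned pointer

  void* get_shmem_sketch(std::size_t bytes) {
    void* result = base + offset_multiplier * bytes;
    base += size_multiplier * bytes;
    return result;
  }
};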
+ KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& team_scratch(int level) const { + return m_team_shared.set_team_thread_mode(level, 1, 0); + } -} // namespace Impl -} // namespace Kokkos + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& thread_scratch(int level) const { + return m_team_shared.set_team_thread_mode(level, team_size(), team_rank()); + } -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- + KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; } + KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; } + KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; } + KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; } + KOKKOS_INLINE_FUNCTION void* impl_reduce_scratch() const { + return m_reduce_scratch; + } -namespace Kokkos { -namespace Impl { + KOKKOS_INLINE_FUNCTION void team_barrier() const { +#pragma omp barrier + } -// This class has the memcpy routine that is commonly used by ParallelReduce -// over RangePolicy and TeamPolicy. -template -struct ParallelReduceCommon { - // Copy the result back to device if the view is on the device. - static void memcpy_result(PointerType dest, PointerType src, size_t size, - bool ptr_on_device) { - if (ptr_on_device) { - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(dest, src, size, 0, 0, - omp_get_default_device(), - omp_get_initial_device())); - } else { - *dest = *src; - } + template + KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value, + int thread_id) const { + // Make sure there is enough scratch space: + using type = std::conditional_t<(sizeof(ValueType) < TEAM_REDUCE_SIZE), + ValueType, void>; + type* team_scratch = + reinterpret_cast(static_cast(m_glb_scratch) + + TEAM_REDUCE_SIZE * omp_get_team_num()); +#pragma omp barrier + if (team_rank() == thread_id) *team_scratch = value; +#pragma omp barrier + value = *team_scratch; } -}; -template -struct ParallelReduceSpecialize { - inline static void execute(const FunctorType& /*f*/, const PolicyType& /*p*/, - PointerType /*result_ptr*/) { - constexpr int FunctorHasJoin = - Impl::FunctorAnalysis::has_join_member_function; - constexpr int UseReducerType = is_reducer::value; - - std::stringstream error_message; - error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' ' - << UseReducerType << '\n'; - // FIXME_OPENMPTARGET - OpenMPTarget_abort(error_message.str().c_str()); + template + KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure& f, ValueType& value, + const int& thread_id) const { + f(value); + team_broadcast(value, thread_id); } -}; -template -struct ParallelReduceSpecialize, - ReducerType, PointerType, ValueType> { - using PolicyType = Kokkos::RangePolicy; - using TagType = typename PolicyType::work_tag; - using ReducerTypeFwd = - std::conditional_t::value, - FunctorType, ReducerType>; - using Analysis = Impl::FunctorAnalysis; - using ReferenceType = typename Analysis::reference_type; - - using ParReduceCommon = ParallelReduceCommon; - - static void execute_reducer(const FunctorType& f, const PolicyType& p, - PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const auto begin = p.begin(); - const auto end = p.end(); - - ValueType result; - 
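// Self-contained sketch of the broadcast-through-scratch pattern used by
// team_broadcast above, runnable on the host with any OpenMP compiler (the
// real code uses a per-team slice of the device scratch buffer instead of a
// static variable).
#include <omp.h>
#include <cstdio>

int broadcast_sketch(int my_value, int root) {
  static int shared_slot;  // stands in for the team's TEAM_REDUCE_SIZE slot
#pragma omp barrier        // everyone arrives before the root writes
  if (omp_get_thread_num() == root) shared_slot = my_value;
#pragma omp barrier        // the write is visible before anyone reads
  return shared_slot;
}

int main() {
#pragma omp parallel num_threads(4)
  {
    const int got = broadcast_sketch(10 * omp_get_thread_num(), /*root=*/2);
#pragma omp critical
    std::printf("thread %d sees %d\n", omp_get_thread_num(), got);  // all 20
  }
}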
OpenMPTargetReducerWrapper::init(result); - - // Initialize and copy back the result even if it is a zero length - // reduction. - if (end <= begin) { - ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - return; - } + // FIXME_OPENMPTARGET this function has the wrong interface and currently + // ignores the reducer passed. + template + KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value, + const JoinOp&) const { +#pragma omp barrier -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) - for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { - f(i, result); - } else { - f(TagType(), i, result); - } + using value_type = ValueType; + // const JoinLambdaAdapter op(op_in); + + // Make sure there is enough scratch space: + using type = std::conditional_t<(sizeof(value_type) < TEAM_REDUCE_SIZE), + value_type, void>; + + const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type); + type* team_scratch = + reinterpret_cast(static_cast(m_glb_scratch) + + TEAM_REDUCE_SIZE * omp_get_team_num()); + for (int i = m_team_rank; i < n_values; i += m_team_size) { + team_scratch[i] = value_type(); } - ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - } +#pragma omp barrier - template - static void execute_array(const FunctorType& f, const PolicyType& p, - PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const auto begin = p.begin(); - const auto end = p.end(); - - // Enter the loop if the reduction is on a scalar type. - if constexpr (NumReductions == 1) { - ValueType result = ValueType(); - - // Initialize and copy back the result even if it is a zero length - // reduction. - if (end <= begin) { - ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - return; - } - // Case where reduction is on a native data type. - if constexpr (std::is_arithmetic::value) { -#pragma omp target teams distribute parallel for \ - map(to:f) reduction(+: result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void::value) { - f(i, result); - } else { - f(TagType(), i, result); - } - } else { -#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void::value) { - f(i, result); - } else { - f(TagType(), i, result); - } - } + for (int k = 0; k < m_team_size; k += n_values) { + if ((k <= m_team_rank) && (k + n_values > m_team_rank)) + team_scratch[m_team_rank % n_values] += value; +#pragma omp barrier + } - ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - } else { - ValueType result[NumReductions] = {}; - - // Initialize and copy back the result even if it is a zero length - // reduction. 
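// What the two halves of team_reduce compute, as a sequential host sketch:
// with only n_values scratch slots for team_size threads, each block of
// n_values threads first folds its contributions into slot (rank % n_values)
// (the k-loop above), then a binary tree combines the slots into slot 0 (the
// d-loop that follows). Hypothetical standalone version for operator+:
#include <cstddef>
#include <vector>

double team_reduce_sketch(const std::vector<double>& per_thread_values,
                          int n_values) {
  std::vector<double> scratch(n_values, 0.0);
  for (std::size_t rank = 0; rank < per_thread_values.size(); ++rank)
    scratch[rank % n_values] += per_thread_values[rank];  // block fold
  for (int d = 1; d < n_values; d *= 2)                   // tree combine
    for (int i = 0; i + d < n_values; i += 2 * d) scratch[i] += scratch[i + d];
  return scratch[0];
}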
- if (end <= begin) { - ParReduceCommon::memcpy_result(result_ptr, result, - NumReductions * sizeof(ValueType), - ptr_on_device); - return; + for (int d = 1; d < n_values; d *= 2) { + if ((m_team_rank + d < n_values) && (m_team_rank % (2 * d) == 0)) { + team_scratch[m_team_rank] += team_scratch[m_team_rank + d]; } -#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions]) - for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { - f(i, result); - } else { - f(TagType(), i, result); - } - } - - ParReduceCommon::memcpy_result( - result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device); +#pragma omp barrier } + return team_scratch[0]; } + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template + KOKKOS_INLINE_FUNCTION ArgType + team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { + // FIXME_OPENMPTARGET + /* // Make sure there is enough scratch space: + using type = + std::conditional_t<(sizeof(ArgType) < TEAM_REDUCE_SIZE), ArgType, void>; - static void execute_init_join(const FunctorType& f, const PolicyType& p, - PointerType ptr, const bool ptr_on_device) { - const auto begin = p.begin(); - const auto end = p.end(); - - using FunctorAnalysis = - Impl::FunctorAnalysis; - constexpr int HasInit = FunctorAnalysis::has_init_member_function; - - // Initialize the result pointer. - - const auto size = end - begin; - - // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently - // based on NVIDIA-V100 and should be modifid to be based on the - // architecture in the future. - const int max_team_threads = 32; - const int max_teams = - OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads; - // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); - - // Allocate scratch per active thread. Achieved by setting the first - // parameter of `resize_scratch=1`. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - std::numeric_limits::max()); - ValueType* scratch_ptr = - static_cast(OpenMPTargetExec::get_scratch_ptr()); - -#pragma omp target map(to : f) is_device_ptr(scratch_ptr) - { - typename FunctorAnalysis::Reducer final_reducer(&f); - // Enter this loop if the functor has an `init` - if constexpr (HasInit) { - // The `init` routine needs to be called on the device since it might - // need device members. - final_reducer.init(scratch_ptr); - final_reducer.final(scratch_ptr); - } else { - for (int i = 0; i < value_count; ++i) { - static_cast(scratch_ptr)[i] = ValueType(); - } + volatile type * const work_value = ((type*) m_exec.scratch_thread()); - final_reducer.final(scratch_ptr); - } - } + *work_value = value ; - if (end <= begin) { - // If there is no work to be done, copy back the initialized values and - // exit. 
- if (!ptr_on_device) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_initial_device(), omp_get_default_device())); - else - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_default_device(), omp_get_default_device())); - - return; - } + memory_fence(); + + if ( team_fan_in() ) { + // The last thread to synchronize returns true, all other threads wait + for team_fan_out() + // m_team_base[0] == highest ranking team member + // m_team_base[ m_team_size - 1 ] == lowest ranking team member + // + // 1) copy from lower to higher rank, initialize lowest rank to zero + // 2) prefix sum from lowest to highest rank, skipping lowest rank + + type accum = 0 ; -#pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \ - map(to \ - : f) is_device_ptr(scratch_ptr) - { - typename FunctorAnalysis::Reducer final_reducer(&f); -#pragma omp parallel - { - const int team_num = omp_get_team_num(); - const int num_teams = omp_get_num_teams(); - const auto chunk_size = size / num_teams; - const auto team_begin = begin + team_num * chunk_size; - const auto team_end = - (team_num == num_teams - 1) ? end : (team_begin + chunk_size); - ValueType* team_scratch = - scratch_ptr + team_num * max_team_threads * value_count; - ReferenceType result = final_reducer.init( - &team_scratch[omp_get_thread_num() * value_count]); - - // Accumulate partial results in thread specific storage. -#pragma omp for simd - for (auto i = team_begin; i < team_end; ++i) { - if constexpr (std::is_void::value) { - f(i, result); - } else { - f(TagType(), i, result); + if ( global_accum ) { + for ( int i = m_team_size ; i-- ; ) { + type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i + )->scratch_thread()); accum += val ; } + accum = atomic_fetch_add( global_accum , accum ); } - // Reduce all paritial results within a team. - const int team_size = max_team_threads; - int tree_neighbor_offset = 1; - do { -#pragma omp for simd - for (int i = 0; i < team_size - tree_neighbor_offset; - i += 2 * tree_neighbor_offset) { - const int neighbor = i + tree_neighbor_offset; - final_reducer.join(&team_scratch[i * value_count], - &team_scratch[neighbor * value_count]); - } - tree_neighbor_offset *= 2; - } while (tree_neighbor_offset < team_size); - } // end parallel - } // end target - - int tree_neighbor_offset = 1; - do { -#pragma omp target teams distribute parallel for simd map(to \ - : f) \ - is_device_ptr(scratch_ptr) - for (int i = 0; i < max_teams - tree_neighbor_offset; - i += 2 * tree_neighbor_offset) { - typename FunctorAnalysis::Reducer final_reducer(&f); - ValueType* team_scratch = scratch_ptr; - const int team_offset = max_team_threads * value_count; - final_reducer.join( - &team_scratch[i * team_offset], - &team_scratch[(i + tree_neighbor_offset) * team_offset]); - - // If `final` is provided by the functor. - // Do the final only once at the end. - if (tree_neighbor_offset * 2 >= max_teams && omp_get_team_num() == 0 && - omp_get_thread_num() == 0) { - final_reducer.final(scratch_ptr); + for ( int i = m_team_size ; i-- ; ) { + type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i + )->scratch_thread()); const type offset = accum ; accum += val ; val = + offset ; } + + memory_fence(); } - tree_neighbor_offset *= 2; - } while (tree_neighbor_offset < max_teams); - - // If the result view is on the host, copy back the values via memcpy. 
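// For reference, the exclusive prefix sum that team_scan documents (and the
// commented-out fan-in above was meant to produce): thread i receives the sum
// over threads 0..i-1, and when global_accum is given the scan total is
// atomically folded into it to give each team its base offset. Sequential
// sketch:
#include <cstddef>
#include <vector>

std::vector<long> exclusive_scan_sketch(const std::vector<long>& values,
                                        long* global_accum = nullptr) {
  std::vector<long> out(values.size());
  long accum = 0;
  for (std::size_t i = 0; i < values.size(); ++i) {
    out[i] = accum;  // value *before* this thread's contribution
    accum += values[i];
  }
  if (global_accum) {                 // inter-team base, order nondeterministic
    const long base = *global_accum;  // atomic_fetch_add in the real code
    *global_accum += accum;
    for (auto& x : out) x += base;
  }
  return out;
}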
- if (!ptr_on_device) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_initial_device(), omp_get_default_device())); - else - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_default_device(), omp_get_default_device())); - } -}; -template -class ParallelReduce, ReducerType, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::RangePolicy; - - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - - using ReducerTypeFwd = - std::conditional_t::value, - FunctorType, ReducerType>; - using Analysis = Impl::FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - - static constexpr int HasJoin = - Impl::FunctorAnalysis::has_join_member_function; - static constexpr int UseReducer = is_reducer::value; - static constexpr int IsArray = std::is_pointer::value; - - using ParReduceSpecialize = - ParallelReduceSpecialize; - - const FunctorType m_functor; - const Policy m_policy; - const ReducerType m_reducer; - const pointer_type m_result_ptr; - bool m_result_ptr_on_device; - const int m_result_ptr_num_elems; - using TagType = typename Policy::work_tag; + team_fan_out(); - public: - void execute() const { - if constexpr (HasJoin) { - // Enter this loop if the Functor has a init-join. - ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr, - m_result_ptr_on_device); - } else if constexpr (UseReducer) { - // Enter this loop if the Functor is a reducer type. - ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr, - m_result_ptr_on_device); - } else if constexpr (IsArray) { - // Enter this loop if the reduction is on an array and the routine is - // templated over the size of the array. - if (m_result_ptr_num_elems <= 2) { - ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 4) { - ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 8) { - ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 16) { - ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 32) { - ParReduceSpecialize::template execute_array( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else { - Kokkos::abort("array reduction length must be <= 32"); - } - } else { - // This loop handles the basic scalar reduction. 
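// Sketch of the dispatch pattern above: the partial-result buffer
// `ValueType result[NumReductions]` must have a compile-time extent, so the
// runtime element count is rounded up to the next supported bucket and routed
// to the matching template instantiation. `RunArray` is a hypothetical
// functor with a templated call operator, not taken from the diff.
#include <stdexcept>

template <class RunArray>
void dispatch_array_length_sketch(int num_elems, RunArray run) {
  if (num_elems <= 2)
    run.template operator()<2>();
  else if (num_elems <= 4)
    run.template operator()<4>();
  else if (num_elems <= 8)
    run.template operator()<8>();
  else if (num_elems <= 16)
    run.template operator()<16>();
  else if (num_elems <= 32)
    run.template operator()<32>();
  else
    throw std::runtime_error("array reduction length must be <= 32");
}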
-        ParReduceSpecialize::template execute_array(
-            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
-    }
+    return *work_value ;*/
+    return ArgType();
  }

-  template
-  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
-                 const ViewType& arg_result_view,
-                 std::enable_if_t::value &&
-                                      !Kokkos::is_reducer::value,
-                                  void*> = nullptr)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()),
-        m_result_ptr_on_device(
-            MemorySpaceAccess::accessible),
-        m_result_ptr_num_elems(arg_result_view.size()) {}
-
-  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
-                 const ReducerType& reducer)
-      : m_functor(arg_functor),
-        m_policy(arg_policy),
-        m_reducer(reducer),
-        m_result_ptr(reducer.view().data()),
-        m_result_ptr_on_device(
-            MemorySpaceAccess::accessible),
-        m_result_ptr_num_elems(reducer.view().size()) {}
-};
-
-}  // namespace Impl
-}  // namespace Kokkos

+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template
+  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const {
+    return this->template team_scan(value, 0);
+  }

-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
+  //----------------------------------------
+  // Private for the driver

-namespace Kokkos {
-namespace Impl {
+ private:
+  using space = execution_space::scratch_memory_space;

-template
-class ParallelScan,
-                   Kokkos::Experimental::OpenMPTarget> {
- protected:
-  using Policy = Kokkos::RangePolicy;
+ public:
+  // FIXME_OPENMPTARGET - 512(16*32) bytes at the beginning of the scratch space
+  // for each league is saved for reduction. It should actually be based on the
+  // ValueType of the reduction variable.
+  inline OpenMPTargetExecTeamMember(
+      const int league_rank, const int league_size, const int team_size,
+      const int vector_length  // const TeamPolicyInternal< OpenMPTarget,
+                               // Properties ...> & team
+      ,
+      void* const glb_scratch, const int shmem_block_index,
+      const size_t shmem_size_L0, const size_t shmem_size_L1)
+      : m_team_scratch_size{shmem_size_L0, shmem_size_L1},
+        m_team_rank(0),
+        m_team_size(team_size),
+        m_league_rank(league_rank),
+        m_league_size(league_size),
+        m_vector_length(vector_length),
+        m_shmem_block_index(shmem_block_index),
+        m_glb_scratch(glb_scratch) {
+    const int omp_tid = omp_get_thread_num();
+
+    // The scratch memory allocated is a sum of TEAM_REDUCE_SIZE, L0 shmem size
+    // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for
+    // hierarchical reduction. There is an additional 10% of the requested
+    // scratch memory allocated per team as padding. Hence the product with 0.1.
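// Worked example of the per-team block layout implemented just below
// (assuming, say, shmem_size_L0 = 1024, shmem_size_L1 = 2048 and
// TEAM_REDUCE_SIZE = 512): each block spans
//   512 + 1024 + 2048 + 0.1 * (1024 + 2048)  ~= 3891 bytes,
// laid out as [ reduce | L0 | L1 | padding ], so for block index b
//   reduce_offset = b * 3891, l0_offset = reduce_offset + 512,
//   l1_offset = l0_offset + 1024.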
+ const int reduce_offset = + m_shmem_block_index * + (shmem_size_L0 + shmem_size_L1 + + ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE); + const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE; + const int l1_offset = l0_offset + shmem_size_L0; + m_team_shared = scratch_memory_space( + (static_cast(glb_scratch) + l0_offset), shmem_size_L0, + static_cast(glb_scratch) + l1_offset, shmem_size_L1); + m_reduce_scratch = static_cast(glb_scratch) + reduce_offset; + m_league_rank = league_rank; + m_team_rank = omp_tid; + m_vector_lane = 0; + } - using WorkTag = typename Policy::work_tag; - using WorkRange = typename Policy::WorkRange; - using Member = typename Policy::member_type; - using idx_type = typename Policy::index_type; + static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; } +}; - using Analysis = Impl::FunctorAnalysis; +template +class TeamPolicyInternal + : public PolicyTraits { + public: + //! Tag this class as a kokkos execution policy + using execution_policy = TeamPolicyInternal; - using value_type = typename Analysis::value_type; - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; + using traits = PolicyTraits; - const FunctorType m_functor; - const Policy m_policy; + //---------------------------------------- - value_type* m_result_ptr; - const bool m_result_ptr_device_accessible; + template + inline static int team_size_max(const FunctorType&, const ParallelForTag&) { + return 256; + } - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(idx, val, is_final); + template + inline static int team_size_max(const FunctorType&, + const ParallelReduceTag&) { + return 256; } - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(WorkTag(), idx, val, is_final); + + template + inline static int team_size_max(const FunctorType&, const ReducerType&, + const ParallelReduceTag&) { + return 256; } - public: - void impl_execute( - Kokkos::View - element_values, - Kokkos::View - chunk_values, - Kokkos::View count) - const { - const idx_type N = m_policy.end() - m_policy.begin(); - const idx_type chunk_size = 128; - const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; - idx_type nteams = n_chunks > 512 ? 
512 : n_chunks; - idx_type team_size = 128; - - FunctorType a_functor(m_functor); -#pragma omp target teams distribute map(to \ - : a_functor) num_teams(nteams) \ - thread_limit(team_size) - for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { - typename Analysis::Reducer final_reducer(&a_functor); -#pragma omp parallel num_threads(team_size) - { - const idx_type local_offset = team_id * chunk_size; - -#pragma omp for - for (idx_type i = 0; i < chunk_size; ++i) { - const idx_type idx = local_offset + i; - value_type val; - final_reducer.init(&val); - if (idx < N) call_with_tag(a_functor, idx, val, false); - element_values(team_id, i) = val; - } -#pragma omp barrier - if (omp_get_thread_num() == 0) { - value_type sum; - final_reducer.init(&sum); - for (idx_type i = 0; i < chunk_size; ++i) { - final_reducer.join(&sum, &element_values(team_id, i)); - element_values(team_id, i) = sum; - } - chunk_values(team_id) = sum; - } -#pragma omp barrier - if (omp_get_thread_num() == 0) { - if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) { - value_type sum; - final_reducer.init(&sum); - for (idx_type i = 0; i < n_chunks; ++i) { - final_reducer.join(&sum, &chunk_values(i)); - chunk_values(i) = sum; - } - } - } - } - } + template + inline static int team_size_recommended(const FunctorType&, + const ParallelForTag&) { + return 128; + } -#pragma omp target teams distribute map(to \ - : a_functor) num_teams(nteams) \ - thread_limit(team_size) - for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { - typename Analysis::Reducer final_reducer(&a_functor); -#pragma omp parallel num_threads(team_size) - { - const idx_type local_offset = team_id * chunk_size; - value_type offset_value; - if (team_id > 0) - offset_value = chunk_values(team_id - 1); - else - final_reducer.init(&offset_value); - -#pragma omp for - for (idx_type i = 0; i < chunk_size; ++i) { - const idx_type idx = local_offset + i; - value_type local_offset_value; - if (i > 0) { - local_offset_value = element_values(team_id, i - 1); - // FIXME_OPENMPTARGET We seem to access memory illegaly on AMD GPUs -#ifdef KOKKOS_ARCH_VEGA - if constexpr (Analysis::has_join_member_function) { - if constexpr (std::is_void_v) - a_functor.join(local_offset_value, offset_value); - else - a_functor.join(WorkTag{}, local_offset_value, offset_value); - } else - local_offset_value += offset_value; -#else - final_reducer.join(&local_offset_value, &offset_value); -#endif - } else - local_offset_value = offset_value; - if (idx < N) - call_with_tag(a_functor, idx, local_offset_value, true); - if (idx == N - 1 && m_result_ptr_device_accessible) - *m_result_ptr = local_offset_value; - } - } - } + template + inline static int team_size_recommended(const FunctorType&, + const ParallelReduceTag&) { + return 128; } - void execute() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const idx_type N = m_policy.end() - m_policy.begin(); - const idx_type chunk_size = 128; - const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; - - // This could be scratch memory per team - Kokkos::View - element_values("element_values", n_chunks, chunk_size); - Kokkos::View - chunk_values("chunk_values", n_chunks); - Kokkos::View count( - "Count"); - - impl_execute(element_values, chunk_values, count); + template + inline static int team_size_recommended(const FunctorType&, + const ReducerType&, + const ParallelReduceTag&) { + 
return 128; } //---------------------------------------- - ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy, - pointer_type arg_result_ptr = nullptr, - bool arg_result_ptr_device_accessible = false) - : m_functor(arg_functor), - m_policy(arg_policy), - m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} - - //---------------------------------------- -}; + private: + int m_league_size; + int m_team_size; + int m_vector_length; + int m_team_alloc; + int m_team_iter; + std::array m_team_scratch_size; + std::array m_thread_scratch_size; + bool m_tune_team_size; + bool m_tune_vector_length; + constexpr const static size_t default_team_size = 256; + int m_chunk_size; + + inline void init(const int league_size_request, const int team_size_request, + const int vector_length_request) { + m_league_size = league_size_request; + + // Minimum team size should be 32 for OpenMPTarget backend. + if (team_size_request < 32) { + Kokkos::Impl::OpenMPTarget_abort( + "OpenMPTarget backend requires a minimum of 32 threads per team.\n"); + } else + m_team_size = team_size_request; + + m_vector_length = vector_length_request; + set_auto_chunk_size(); + } -template -class ParallelScanWithTotal, - ReturnType, Kokkos::Experimental::OpenMPTarget> - : public ParallelScan, - Kokkos::Experimental::OpenMPTarget> { - using base_t = ParallelScan, - Kokkos::Experimental::OpenMPTarget>; - using value_type = typename base_t::value_type; + template + friend class TeamPolicyInternal; public: - void execute() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); - const int chunk_size = 128; - const int64_t n_chunks = (N + chunk_size - 1) / chunk_size; - - if (N > 0) { - // This could be scratch memory per team - Kokkos::View - element_values("element_values", n_chunks, chunk_size); - Kokkos::View - chunk_values("chunk_values", n_chunks); - Kokkos::View count( - "Count"); - - base_t::impl_execute(element_values, chunk_values, count); - - if (!base_t::m_result_ptr_device_accessible) { - const int size = base_t::Analysis::value_size(base_t::m_functor); - DeepCopy( - base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size); - } - } else if (!base_t::m_result_ptr_device_accessible) { - *base_t::m_result_ptr = 0; - } + // FIXME_OPENMPTARGET : Currently this routine is a copy of the Cuda + // implementation, but this has to be tailored to be architecture specific. + inline static int scratch_size_max(int level) { + return ( + level == 0 ? 1024 * 40 : // 48kB is the max for CUDA, but we need some + // for team_member.reduce etc. 
+ 20 * 1024 * + 1024); // arbitrarily setting this to 20MB, for a Volta V100 + // that would give us about 3.2GB for 2 teams per SM } - - template - ParallelScanWithTotal(const FunctorType& arg_functor, - const typename base_t::Policy& arg_policy, - const ViewType& arg_result_view) - : base_t(arg_functor, arg_policy, arg_result_view.data(), - MemorySpaceAccess::accessible) { + inline bool impl_auto_team_size() const { return m_tune_team_size; } + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline void impl_set_team_size(const size_t size) { m_team_size = size; } + inline void impl_set_vector_length(const size_t length) { + m_tune_vector_length = length; + } + inline int impl_vector_length() const { return m_vector_length; } + inline int team_size() const { return m_team_size; } + inline int league_size() const { return m_league_size; } + inline size_t scratch_size(const int& level, int team_size_ = -1) const { + if (team_size_ < 0) team_size_ = m_team_size; + return m_team_scratch_size[level] + + team_size_ * m_thread_scratch_size[level]; } -}; -} // namespace Impl -} // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- + inline Kokkos::Experimental::OpenMPTarget space() const { + return Kokkos::Experimental::OpenMPTarget(); + } -namespace Kokkos { -namespace Impl { + template + TeamPolicyInternal(const TeamPolicyInternal& p) + : m_league_size(p.m_league_size), + m_team_size(p.m_team_size), + m_vector_length(p.m_vector_length), + m_team_alloc(p.m_team_alloc), + m_team_iter(p.m_team_iter), + m_team_scratch_size(p.m_team_scratch_size), + m_thread_scratch_size(p.m_thread_scratch_size), + m_tune_team_size(p.m_tune_team_size), + m_tune_vector_length(p.m_tune_vector_length), + m_chunk_size(p.m_chunk_size) {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, int team_size_request, + int vector_length_request = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(false), + m_tune_vector_length(false), + m_chunk_size(0) { + init(league_size_request, team_size_request, vector_length_request); + } -template -class ParallelFor, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = - Kokkos::Impl::TeamPolicyInternal; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + int vector_length_request = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(true), + m_tune_vector_length(false), + m_chunk_size(0) { + init(league_size_request, default_team_size / vector_length_request, + vector_length_request); + } - const FunctorType m_functor; - const Policy m_policy; - const size_t m_shmem_size; + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(true), + m_tune_vector_length(true), + m_chunk_size(0) { + init(league_size_request, default_team_size, 1); + } + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, int team_size_request, + const 
Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(false), + m_tune_vector_length(true), + m_chunk_size(0) { + init(league_size_request, team_size_request, 1); + } - public: - void execute() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - execute_impl(); + TeamPolicyInternal(int league_size_request, int team_size_request, + int vector_length_request = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(false), + m_tune_vector_length(false), + m_chunk_size(0) { + init(league_size_request, team_size_request, vector_length_request); } - private: - template - void execute_impl() const { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const auto league_size = m_policy.league_size(); - const auto team_size = m_policy.team_size(); - const auto vector_length = m_policy.impl_vector_length(); - - const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); - const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1, - league_size); - - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - FunctorType a_functor(m_functor); - - // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the - // scratch implementation does not work in the Release or RelWithDebugInfo - // mode but works in the Debug mode. - - // Maximum active teams possible. - int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; - // nteams should not exceed the maximum in-flight teams possible. - const auto nteams = - league_size < max_active_teams ? league_size : max_active_teams; - - // If the league size is <=0, do not launch the kernel. - if (nteams <= 0) return; - -// Performing our own scheduling of teams to avoid separation of code between -// teams-distribute and parallel. Gave a 2x performance boost in test cases with -// the clang compiler. atomic_compare_exchange can be avoided since the standard -// guarantees that the number of teams specified in the `num_teams` clause is -// always less than or equal to the maximum concurrently running teams. 
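// Reduced sketch of the league-striding pattern described above, assuming an
// OpenMP 4.5+ offload compiler; the functor invocation is elided.
#include <omp.h>

void league_stride_sketch(int league_size, int nteams, int team_size) {
#pragma omp target teams num_teams(nteams) thread_limit(team_size)
#pragma omp parallel
  {
    const int blockIdx = omp_get_team_num();
    const int gridDim = omp_get_num_teams();
    // Each in-flight team walks the league with stride gridDim, so every
    // league index is visited even when gridDim < league_size.
    for (int league_id = blockIdx; league_id < league_size;
         league_id += gridDim) {
      // ... build the team member handle and call the functor here ...
    }
  }
}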
-#pragma omp target teams num_teams(nteams) thread_limit(team_size) \ - map(to \ - : a_functor) is_device_ptr(scratch_ptr) -#pragma omp parallel - { - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Iterate through the number of teams until league_size and assign the - // league_id accordingly - // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename Policy::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - m_functor(team); - else - m_functor(TagType(), team); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); - } + TeamPolicyInternal(int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + int vector_length_request = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(true), + m_tune_vector_length(false), + m_chunk_size(0) { + init(league_size_request, default_team_size / vector_length_request, + vector_length_request); } - public: - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), - m_policy(arg_policy), - m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, arg_policy.team_size())) {} -}; + TeamPolicyInternal(int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(true), + m_tune_vector_length(true), + m_chunk_size(0) { + init(league_size_request, default_team_size, 1); + } + TeamPolicyInternal(int league_size_request, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(false), + m_tune_vector_length(true), + m_chunk_size(0) { + init(league_size_request, team_size_request, 1); + } + inline static size_t vector_length_max() { + return 32; /* TODO: this is bad. 
Need logic that is compiler and backend + aware */ + } + inline int team_alloc() const { return m_team_alloc; } + inline int team_iter() const { return m_team_iter; } -template -struct ParallelReduceSpecialize, - ReducerType, PointerType, ValueType> { - using PolicyType = TeamPolicyInternal; - using TagType = typename PolicyType::work_tag; - using ReducerTypeFwd = - std::conditional_t::value, - FunctorType, ReducerType>; - using Analysis = Impl::FunctorAnalysis; - - using ReferenceType = typename Analysis::reference_type; - - using ParReduceCommon = ParallelReduceCommon; - - static void execute_reducer(const FunctorType& f, const PolicyType& p, - PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - - const int league_size = p.league_size(); - const int team_size = p.team_size(); - const int vector_length = p.impl_vector_length(); - - const size_t shmem_size_L0 = p.scratch_size(0, team_size); - const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - - ValueType result = ValueType(); - - // Maximum active teams possible. - int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; - const auto nteams = - league_size < max_active_teams ? league_size : max_active_teams; - - // If the league size is <=0, do not launch the kernel. - if (nteams <= 0) return; - -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) - -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(custom \ - : result) -#pragma omp parallel reduction(custom : result) - { - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); - } + inline int chunk_size() const { return m_chunk_size; } - // Copy results back to device if `parallel_reduce` is on a device view. 
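// Sketch of the memcpy_result helper invoked below: if the destination view
// lives in device memory, omp_target_memcpy moves the host-side result to it;
// otherwise a plain assignment suffices.
#include <omp.h>

template <class T>
void memcpy_result_sketch(T* dest, T* src, bool dest_on_device) {
  if (dest_on_device)
    omp_target_memcpy(dest, src, sizeof(T), /*dst_offset=*/0,
                      /*src_offset=*/0, omp_get_default_device(),
                      omp_get_initial_device());
  else
    *dest = *src;
}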
- ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); + /** \brief set chunk_size to a discrete value*/ + inline TeamPolicyInternal& set_chunk_size( + typename traits::index_type chunk_size_) { + m_chunk_size = chunk_size_; + return *this; } - template - static void execute_array(const FunctorType& f, const PolicyType& p, - PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - - const int league_size = p.league_size(); - const int team_size = p.team_size(); - const int vector_length = p.impl_vector_length(); - - const size_t shmem_size_L0 = p.scratch_size(0, team_size); - const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - - // Maximum active teams possible. - int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; - const auto nteams = - league_size < max_active_teams ? league_size : max_active_teams; - - // If the league size is <=0, do not launch the kernel. - if (nteams <= 0) return; - - // Case where the number of reduction items is 1. - if constexpr (NumReductions == 1) { - ValueType result = ValueType(); - - // Case where reduction is on a native data type. - if constexpr (std::is_arithmetic::value) { -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+: result) -#pragma omp parallel reduction(+ : result) - { - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); - } - } else { - // Case where the reduction is on a non-native data type. 
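// Sketch of the "non-native data type" case handled below: any type with an
// operator+= can be registered with the OpenMP runtime as a user-defined
// reduction. Vec2 is a hypothetical value type, not taken from the diff.
struct Vec2 {
  double x = 0.0, y = 0.0;
  Vec2& operator+=(const Vec2& o) {
    x += o.x;
    y += o.y;
    return *this;
  }
};
#pragma omp declare reduction(custom : Vec2 : omp_out += omp_in) \
    initializer(omp_priv = Vec2{})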
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(custom \ - : result) -#pragma omp parallel reduction(custom : result) - { - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); - } - } + /** \brief set per team scratch size for a specific level of the scratch + * hierarchy */ + inline TeamPolicyInternal& set_scratch_size(const int& level, + const PerTeamValue& per_team) { + m_team_scratch_size[level] = per_team.value; + return *this; + } - // Copy results back to device if `parallel_reduce` is on a device view. - ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - } else { - ValueType result[NumReductions] = {}; - // Case where the reduction is on an array. -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions]) -#pragma omp parallel reduction(+ : result[:NumReductions]) - { - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); - } + /** \brief set per thread scratch size for a specific level of the scratch + * hierarchy */ + inline TeamPolicyInternal& set_scratch_size( + const int& level, const PerThreadValue& per_thread) { + m_thread_scratch_size[level] = per_thread.value; + return *this; + } - // Copy results back to device if `parallel_reduce` is on a device view. - ParReduceCommon::memcpy_result( - result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device); - } + /** \brief set per thread and per team scratch size for a specific level of + * the scratch hierarchy */ + inline TeamPolicyInternal& set_scratch_size( + const int& level, const PerTeamValue& per_team, + const PerThreadValue& per_thread) { + m_team_scratch_size[level] = per_team.value; + m_thread_scratch_size[level] = per_thread.value; + return *this; } - // FIXME_OPENMPTARGET : This routine is a copy from `parallel_reduce` over - // RangePolicy. Need a new implementation. 
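// Host-side sketch of the tree combine execute_init_join performs over
// per-team partials: the neighbor offset doubles each pass until slot 0
// holds the joined total. `Reducer` stands in for the
// FunctorAnalysis::Reducer interface (init/join/final) and is assumed, not
// quoted from the diff.
#include <vector>

template <class Reducer, class T>
void tree_join_sketch(const Reducer& reducer, std::vector<T>& partials) {
  const int n = static_cast<int>(partials.size());
  for (int offset = 1; offset < n; offset *= 2)
    for (int i = 0; i + offset < n; i += 2 * offset)
      reducer.join(&partials[i], &partials[i + offset]);  // fold neighbor in
}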
- static void execute_init_join(const FunctorType& f, const PolicyType& p, - PointerType ptr, const bool ptr_on_device) { - using FunctorAnalysis = - Impl::FunctorAnalysis; - constexpr int HasInit = FunctorAnalysis::has_init_member_function; - - const int league_size = p.league_size(); - const int team_size = p.team_size(); - const int vector_length = p.impl_vector_length(); - - auto begin = 0; - auto end = league_size * team_size + team_size * vector_length; - - const size_t shmem_size_L0 = p.scratch_size(0, team_size); - const size_t shmem_size_L1 = p.scratch_size(1, team_size); - - // FIXME_OPENMPTARGET: This would oversubscribe scratch memory since we are - // already using the available scratch memory to create temporaries for each - // thread. - if ((shmem_size_L0 + shmem_size_L1) > 0) { - Kokkos::abort( - "OpenMPTarget: Scratch memory is not supported in `parallel_reduce` " - "over functors with init/join."); - } + private: + /** \brief finalize chunk_size if it was set to AUTO*/ + inline void set_auto_chunk_size() { + int concurrency = 2048 * 128; - const auto nteams = league_size; - - // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); - - // Allocate scratch per active thread. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - - // Enter this loop if the functor has an `init` - if constexpr (HasInit) { - // The `init` routine needs to be called on the device since it might need - // device members. -#pragma omp target map(to : f) is_device_ptr(scratch_ptr) - { - typename FunctorAnalysis::Reducer final_reducer(&f); - final_reducer.init(scratch_ptr); - final_reducer.final(scratch_ptr); - } - } else { -#pragma omp target map(to : f) is_device_ptr(scratch_ptr) - { - for (int i = 0; i < value_count; ++i) { - static_cast(scratch_ptr)[i] = ValueType(); - } + if (concurrency == 0) concurrency = 1; - typename FunctorAnalysis::Reducer final_reducer(&f); - final_reducer.final(static_cast(scratch_ptr)); - } + if (m_chunk_size > 0) { + if (!Impl::is_integral_power_of_two(m_chunk_size)) + Kokkos::abort("TeamPolicy blocking granularity must be power of two"); } - if (end <= begin) { - // If there is no work to be done, copy back the initialized values and - // exit. 
- if (!ptr_on_device) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_initial_device(), omp_get_default_device())); - else - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_default_device(), omp_get_default_device())); - - return; + int new_chunk_size = 1; + while (new_chunk_size * 100 * concurrency < m_league_size) + new_chunk_size *= 2; + if (new_chunk_size < 128) { + new_chunk_size = 1; + while ((new_chunk_size * 40 * concurrency < m_league_size) && + (new_chunk_size < 128)) + new_chunk_size *= 2; } - -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) - { -#pragma omp parallel - { - const int team_num = omp_get_team_num(); - const int num_teams = omp_get_num_teams(); - ValueType* team_scratch = static_cast(scratch_ptr) + - team_num * team_size * value_count; - typename FunctorAnalysis::Reducer final_reducer(&f); - ReferenceType result = final_reducer.init(&team_scratch[0]); - - for (int league_id = team_num; league_id < league_size; - league_id += num_teams) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - team_num, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) { - f(team, result); - } else { - f(TagType(), team, result); - } - } - } // end parallel - } // end target - - int tree_neighbor_offset = 1; - do { -#pragma omp target teams distribute parallel for simd map(to \ - : f) \ - is_device_ptr(scratch_ptr) - for (int i = 0; i < nteams - tree_neighbor_offset; - i += 2 * tree_neighbor_offset) { - ValueType* team_scratch = static_cast(scratch_ptr); - const int team_offset = team_size * value_count; - typename FunctorAnalysis::Reducer final_reducer(&f); - final_reducer.join( - &team_scratch[i * team_offset], - &team_scratch[(i + tree_neighbor_offset) * team_offset]); - - // If `final` is provided by the functor. - // Do the final only once at the end. - if (tree_neighbor_offset * 2 >= nteams && omp_get_team_num() == 0 && - omp_get_thread_num() == 0) { - final_reducer.final(scratch_ptr); - } - } - tree_neighbor_offset *= 2; - } while (tree_neighbor_offset < nteams); - - // If the result view is on the host, copy back the values via memcpy. 
- if (!ptr_on_device) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_initial_device(), omp_get_default_device())); - else - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_default_device(), omp_get_default_device())); + m_chunk_size = new_chunk_size; } -}; - -template -class ParallelReduce, - ReducerType, Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = - Kokkos::Impl::TeamPolicyInternal; - - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using ReducerTypeFwd = - std::conditional_t::value, - FunctorType, ReducerType>; - using WorkTagFwd = - std::conditional_t::value, WorkTag, - void>; - using Analysis = Impl::FunctorAnalysis; - - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - using value_type = typename Analysis::value_type; - - bool m_result_ptr_on_device; - const int m_result_ptr_num_elems; - - static constexpr int HasJoin = - Impl::FunctorAnalysis::has_join_member_function; - static constexpr int UseReducer = is_reducer::value; - static constexpr int IsArray = std::is_pointer::value; - - using ParReduceSpecialize = - ParallelReduceSpecialize; - - const FunctorType m_functor; - const Policy m_policy; - const ReducerType m_reducer; - const pointer_type m_result_ptr; - const size_t m_shmem_size; public: - void execute() const { - if constexpr (HasJoin) { - ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr, - m_result_ptr_on_device); - } else if constexpr (UseReducer) { - ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr, - m_result_ptr_on_device); - } else if constexpr (IsArray) { - if (m_result_ptr_num_elems <= 2) { - ParReduceSpecialize::template execute_array<2>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 4) { - ParReduceSpecialize::template execute_array<4>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 8) { - ParReduceSpecialize::template execute_array<8>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 16) { - ParReduceSpecialize::template execute_array<16>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 32) { - ParReduceSpecialize::template execute_array<32>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else { - Kokkos::abort("array reduction length must be <= 32"); - } - } else { - ParReduceSpecialize::template execute_array<1>( - m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } - } - - template - ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, - const ViewType& arg_result, - std::enable_if_t::value && - !Kokkos::is_reducer::value, - void*> = nullptr) - : m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result.size()), - m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(InvalidType()), - m_result_ptr(arg_result.data()), - m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, arg_policy.team_size())) {} - - ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy, - const ReducerType& reducer) - : m_result_ptr_on_device( - MemorySpaceAccess::accessible), - 
m_result_ptr_num_elems(reducer.view().size()), - m_functor(arg_functor), - m_policy(arg_policy), - m_reducer(reducer), - m_result_ptr(reducer.view().data()), - m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, arg_policy.team_size())) {} + using member_type = Impl::OpenMPTargetExecTeamMember; }; } // namespace Impl } // namespace Kokkos +namespace Kokkos { + +template +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember> +TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType& count) { + return Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, count); +} + +template +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + std::common_type_t, Impl::OpenMPTargetExecTeamMember> +TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType1& begin, const iType2& end) { + using iType = std::common_type_t; + return Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(begin), + iType(end)); +} + +template +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember> +ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType& count) { + return Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, count); +} + +template +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + std::common_type_t, Impl::OpenMPTargetExecTeamMember> +ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType1& arg_begin, const iType2& arg_end) { + using iType = std::common_type_t; + return Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin), + iType(arg_end)); +} + +template +KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember> +TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType& count) { + return Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, count); +} + +template +KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct< + std::common_type_t, Impl::OpenMPTargetExecTeamMember> +TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType1& arg_begin, const iType2& arg_end) { + using iType = std::common_type_t; + return Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin), + iType(arg_end)); +} + +KOKKOS_INLINE_FUNCTION +Impl::ThreadSingleStruct PerTeam( + const Impl::OpenMPTargetExecTeamMember& thread) { + return Impl::ThreadSingleStruct(thread); +} + +KOKKOS_INLINE_FUNCTION +Impl::VectorSingleStruct PerThread( + const Impl::OpenMPTargetExecTeamMember& thread) { + return Impl::VectorSingleStruct(thread); +} +} // namespace Kokkos + +namespace Kokkos { + +template +KOKKOS_INLINE_FUNCTION void single( + const Impl::VectorSingleStruct& + /*single_struct*/, + const FunctorType& lambda) { + lambda(); +} + +template +KOKKOS_INLINE_FUNCTION void single( + const Impl::ThreadSingleStruct& + single_struct, + const FunctorType& lambda) { + if (single_struct.team_member.team_rank() == 0) lambda(); +} + +template +KOKKOS_INLINE_FUNCTION void single( + const Impl::VectorSingleStruct& + /*single_struct*/, + const FunctorType& lambda, ValueType& val) { + lambda(val); +} + +template 
+KOKKOS_INLINE_FUNCTION void single( + const Impl::ThreadSingleStruct& + single_struct, + const FunctorType& lambda, ValueType& val) { + if (single_struct.team_member.team_rank() == 0) { + lambda(val); + } + single_struct.team_member.team_broadcast(val, 0); +} +} // namespace Kokkos + namespace Kokkos { namespace Impl { @@ -1320,5 +715,44 @@ struct TeamVectorRangeBoundariesStruct { } // namespace Kokkos //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +/** \brief Data for OpenMPTarget thread execution */ + +class OpenMPTargetExec { + public: + // FIXME_OPENMPTARGET - Currently the maximum number of + // teams possible is calculated based on NVIDIA's Volta GPU. In + // future this value should be based on the chosen architecture for the + // OpenMPTarget backend. + static constexpr int MAX_ACTIVE_THREADS = 2080 * 80; + static constexpr int MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS / 32; + + private: + static void* scratch_ptr; + + public: + static void verify_is_process(const char* const); + static void verify_initialized(const char* const); + + static int* get_lock_array(int num_teams); + static void* get_scratch_ptr(); + static void clear_scratch(); + static void clear_lock_array(); + static void resize_scratch(int64_t team_reduce_bytes, + int64_t team_shared_bytes, + int64_t thread_local_bytes, int64_t league_size); + + static void* m_scratch_ptr; + static int64_t m_scratch_size; + static int* m_lock_array; + static int64_t m_lock_size; + static uint32_t* m_uniquetoken_ptr; +}; + +} // namespace Impl +} // namespace Kokkos #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp new file mode 100644 index 0000000000..fcf168e9c9 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp @@ -0,0 +1,72 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLEL_FOR_RANGE_HPP
+#define KOKKOS_OPENMPTARGET_PARALLEL_FOR_RANGE_HPP
+
+#include
+#include
+#include
+
+namespace Kokkos {
+namespace Impl {
+
+template
+class ParallelFor,
+                  Kokkos::Experimental::OpenMPTarget> {
+ private:
+  using Policy    = Kokkos::RangePolicy;
+  using WorkTag   = typename Policy::work_tag;
+  using WorkRange = typename Policy::WorkRange;
+  using Member    = typename Policy::member_type;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+ public:
+  void execute() const { execute_impl(); }
+
+  template
+  void execute_impl() const {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    const auto begin = m_policy.begin();
+    const auto end   = m_policy.end();
+
+    if (end <= begin) return;
+
+    FunctorType a_functor(m_functor);
+
+#pragma omp target teams distribute parallel for map(to : a_functor)
+    for (auto i = begin; i < end; ++i) {
+      if constexpr (std::is_void::value) {
+        a_functor(i);
+      } else {
+        a_functor(TagType(), i);
+      }
+    }
+  }
+
+  ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp
new file mode 100644
index 0000000000..12de3423f8
--- /dev/null
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp
@@ -0,0 +1,170 @@
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 4.0
+// Copyright (2022) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLEL_FOR_TEAM_HPP
+#define KOKKOS_OPENMPTARGET_PARALLEL_FOR_TEAM_HPP
+
+#include
+#include
+#include
+#include
+
+namespace Kokkos {
+
+/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each
+ * i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team.
+ */
+template
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda) {
+#pragma omp for nowait schedule(static, 1)
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
+}
+
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each
+ * i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
+ */
+template
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda) {
+#pragma omp simd
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i);
+}
+
+/** \brief  Intra-team vector parallel_for. Executes lambda(iType i) for each
+ * i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling team.
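+ *
+ * A minimal usage sketch (assuming a team member `team` and a hypothetical
+ * rank-1 view `v` of length n):
+ *
+ *   Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n),
+ *                        [&](const int i) { v(i) = 2 * v(i); });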
+ */ +template +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda) { +#pragma omp for simd nowait schedule(static, 1) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); +} + +namespace Impl { + +template +class ParallelFor, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = + Kokkos::Impl::TeamPolicyInternal; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + + const FunctorType m_functor; + const Policy m_policy; + const size_t m_shmem_size; + + public: + void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + execute_impl(); + } + + private: + template + void execute_impl() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const auto league_size = m_policy.league_size(); + const auto team_size = m_policy.team_size(); + const auto vector_length = m_policy.impl_vector_length(); + + const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); + const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); + OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1, + league_size); + + void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + FunctorType a_functor(m_functor); + + // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the + // scratch implementation does not work in the Release or RelWithDebugInfo + // mode but works in the Debug mode. + + // Maximum active teams possible. + int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; + // nteams should not exceed the maximum in-flight teams possible. + const auto nteams = + league_size < max_active_teams ? league_size : max_active_teams; + + // If the league size is <=0, do not launch the kernel. + if (nteams <= 0) return; + +// Performing our own scheduling of teams to avoid separation of code between +// teams-distribute and parallel. Gave a 2x performance boost in test cases with +// the clang compiler. atomic_compare_exchange can be avoided since the standard +// guarantees that the number of teams specified in the `num_teams` clause is +// always less than or equal to the maximum concurrently running teams. 
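+// As a sketch, with gridDim teams in flight each team processes league ids
+//   blockIdx, blockIdx + gridDim, blockIdx + 2 * gridDim, ...
+// i.e. the same block-cyclic mapping a CUDA-style grid-stride loop uses.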
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) \
+    map(to                                                         \
+        : a_functor) is_device_ptr(scratch_ptr)
+#pragma omp parallel
+    {
+      const int blockIdx = omp_get_team_num();
+      const int gridDim  = omp_get_num_teams();
+
+      // Iterate through the number of teams until league_size and assign the
+      // league_id accordingly.
+      // Guarantee that the compilers respect the `num_teams` clause.
+      if (gridDim <= nteams) {
+        for (int league_id = blockIdx; league_id < league_size;
+             league_id += gridDim) {
+          typename Policy::member_type team(
+              league_id, league_size, team_size, vector_length, scratch_ptr,
+              blockIdx, shmem_size_L0, shmem_size_L1);
+          if constexpr (std::is_void::value)
+            m_functor(team);
+          else
+            m_functor(TagType(), team);
+        }
+      } else
+        Kokkos::abort("`num_teams` clause was not respected.\n");
+    }
+  }
+
+ public:
+  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
+                     FunctorTeamShmemSize::value(
+                         arg_functor, arg_policy.team_size())) {}
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp
new file mode 100644
index 0000000000..1ac46b9919
--- /dev/null
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp
@@ -0,0 +1,133 @@
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 4.0
+// Copyright (2022) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_RANGE_HPP
+#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_RANGE_HPP
+
+#include
+#include
+#include
+#include
+
+namespace Kokkos {
+namespace Impl {
+
+template
+class ParallelReduce, ReducerType,
+                     Kokkos::Experimental::OpenMPTarget> {
+ private:
+  using Policy = Kokkos::RangePolicy;
+
+  using WorkTag   = typename Policy::work_tag;
+  using WorkRange = typename Policy::WorkRange;
+
+  using ReducerTypeFwd =
+      std::conditional_t::value,
+                         FunctorType, ReducerType>;
+  using Analysis = Impl::FunctorAnalysis;
+
+  using pointer_type   = typename Analysis::pointer_type;
+  using reference_type = typename Analysis::reference_type;
+
+  static constexpr int HasJoin =
+      Impl::FunctorAnalysis::has_join_member_function;
+  static constexpr int UseReducer = is_reducer::value;
+  static constexpr int IsArray    = std::is_pointer::value;
+
+  using ParReduceSpecialize =
+      ParallelReduceSpecialize;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  bool m_result_ptr_on_device;
+  const int m_result_ptr_num_elems;
+  using TagType = typename Policy::work_tag;
+
+ public:
+  void execute() const {
+    if constexpr (HasJoin) {
+      // Enter this loop if the Functor has an init-join.
+      ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr,
+                                             m_result_ptr_on_device);
+    } else if constexpr (UseReducer) {
+      // Enter this loop if the Functor is a reducer type.
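+      // (A reducer type here means e.g. Kokkos::Sum<double> or
+      // Kokkos::Max<int> passed in place of a plain result view.)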
+      ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr,
+                                           m_result_ptr_on_device);
+    } else if constexpr (IsArray) {
+      // Enter this loop if the reduction is on an array and the routine is
+      // templated over the size of the array.
+      if (m_result_ptr_num_elems <= 2) {
+        ParReduceSpecialize::template execute_array<2>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 4) {
+        ParReduceSpecialize::template execute_array<4>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 8) {
+        ParReduceSpecialize::template execute_array<8>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 16) {
+        ParReduceSpecialize::template execute_array<16>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 32) {
+        ParReduceSpecialize::template execute_array<32>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else {
+        Kokkos::abort("array reduction length must be <= 32");
+      }
+    } else {
+      // This loop handles the basic scalar reduction.
+      ParReduceSpecialize::template execute_array<1>(
+          m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+    }
+  }
+
+  template
+  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
+                 const ViewType& arg_result_view,
+                 std::enable_if_t::value &&
+                                      !Kokkos::is_reducer::value,
+                                  void*> = nullptr)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result_view.data()),
+        m_result_ptr_on_device(
+            MemorySpaceAccess::accessible),
+        m_result_ptr_num_elems(arg_result_view.size()) {}
+
+  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
+                 const ReducerType& reducer)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_on_device(
+            MemorySpaceAccess::accessible),
+        m_result_ptr_num_elems(reducer.view().size()) {}
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp
new file mode 100644
index 0000000000..236c6d6f7a
--- /dev/null
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp
@@ -0,0 +1,551 @@
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 4.0
+// Copyright (2022) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_TEAM_HPP
+#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_TEAM_HPP
+
+#include
+#include
+#include
+#include
+#include
+
+// FIXME_OPENMPTARGET - Using this macro to implement a workaround for
+// hierarchical reducers. It avoids hitting the code path which we wanted to
+// write but doesn't work. undef'ed at the end.
+// Intel compilers prefer the non-workaround version.
+#ifndef KOKKOS_ARCH_INTEL_GPU
+#define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+#endif
+
+namespace Kokkos {
+
+/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team
+ * and a summation of val is performed and put into result.
+ */
+
+template
+KOKKOS_INLINE_FUNCTION std::enable_if_t::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ValueType& result) {
+  // FIXME_OPENMPTARGET - Make sure that if it's an array reduction, the number
+  // of elements in the array is <= 32. For reductions we allocate 16 bytes per
+  // element in the scratch space; hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamThread_scratch =
+      static_cast(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp barrier
+  TeamThread_scratch[0] = ValueType();
+#pragma omp barrier
+
+  if constexpr (std::is_arithmetic::value) {
+#pragma omp for reduction(+ : TeamThread_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamThread_scratch[0] += tmp;
+    }
+  } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+
+#pragma omp for reduction(custom : TeamThread_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamThread_scratch[0] += tmp;
+    }
+  }
+
+  result = TeamThread_scratch[0];
+}
+
+#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
+// For some reason the actual version we wanted to write doesn't work
+// and crashes. We should try this with every new compiler.
+// This is the variant we actually wanted to write.
+template
+KOKKOS_INLINE_FUNCTION std::enable_if_t::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType result) {
+  using ValueType = typename ReducerType::value_type;
+
+#pragma omp declare reduction(                                  \
+    custominner:ValueType                                       \
+    : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \
+    initializer(                                                \
+        Impl::OpenMPTargetReducerWrapper ::init(omp_priv))
+
+  // FIXME_OPENMPTARGET - Make sure that if it's an array reduction, the number
+  // of elements in the array is <= 32. For reductions we allocate 16 bytes per
+  // element in the scratch space; hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamThread_scratch =
+      static_cast(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp barrier
+  Impl::OpenMPTargetReducerWrapper::init(TeamThread_scratch[0]);
+#pragma omp barrier
+
+#pragma omp for reduction(custominner : TeamThread_scratch[:1])
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, TeamThread_scratch[0]);
+  }
+  result.reference() = TeamThread_scratch[0];
+}
+#else
+template
+KOKKOS_INLINE_FUNCTION std::enable_if_t::value>
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType result) {
+  using ValueType = typename ReducerType::value_type;
+
+  // FIXME_OPENMPTARGET - Make sure that if it's an array reduction, the number
+  // of elements in the array is <= 32. For reductions we allocate 16 bytes per
+  // element in the scratch space; hence, 16*32 = 512.
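+  // For instance, a Kokkos::Sum<double> reducer (8-byte value) easily
+  // satisfies the assertion below, while a 64-element double[] payload
+  // (512 bytes) sits exactly at the TEAM_REDUCE_SIZE limit.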
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamThread_scratch =
+      static_cast(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp declare reduction(                                  \
+    omp_red_teamthread_reducer:ValueType                        \
+    : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \
+    initializer(                                                \
+        Impl::OpenMPTargetReducerWrapper ::init(omp_priv))
+
+#pragma omp barrier
+  ValueType tmp;
+  result.init(tmp);
+  TeamThread_scratch[0] = tmp;
+#pragma omp barrier
+
+  iType team_size = iType(omp_get_num_threads());
+#pragma omp for reduction(omp_red_teamthread_reducer \
+                          : TeamThread_scratch[:1]) schedule(static, 1)
+  for (iType t = 0; t < team_size; t++) {
+    ValueType tmp2;
+    result.init(tmp2);
+
+    for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
+         i += team_size) {
+      lambda(i, tmp2);
+    }
+
+    // FIXME_OPENMPTARGET: Join should work but doesn't. Every thread gets a
+    // private TeamThread_scratch[0] and at the end of the for-loop the `join`
+    // operation is performed by OpenMP itself and hence the simple assignment
+    // works.
+    // result.join(TeamThread_scratch[0], tmp2);
+    TeamThread_scratch[0] = tmp2;
+  }
+
+  result.reference() = TeamThread_scratch[0];
+}
+#endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+
+/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team
+ * and a reduction of val is performed using JoinType(ValueType& val, const
+ * ValueType& update) and put into init_result. The input value of init_result
+ * is used as initializer for temporary variables of ValueType. Therefore the
+ * input value should be the neutral element with respect to the join operation
+ * (e.g. '0 for +-' or '1 for *').
+ */
+template
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda, const JoinType& join, ValueType& init_result) {
+  ValueType* TeamThread_scratch =
+      static_cast(loop_boundaries.team.impl_reduce_scratch());
+
+  // FIXME_OPENMPTARGET - Make sure that if it's an array reduction, the number
+  // of elements in the array is <= 32. For reductions we allocate 16 bytes per
+  // element in the scratch space; hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  // FIXME_OPENMPTARGET: Still need to figure out how to get value_count here.
+  const int value_count = 1;
+
+#pragma omp barrier
+  // Seed each thread's slot with the (neutral) initial value.
+  for (int t = 0; t < omp_get_num_threads(); ++t)
+    TeamThread_scratch[t * value_count] = init_result;
+#pragma omp barrier
+
+#pragma omp for
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    // Each thread accumulates into its own slot of the reduce scratch.
+    lambda(i, TeamThread_scratch[omp_get_thread_num() * value_count]);
+  }
+
+  // Reduce all partial results within a team.
+  const int team_size      = omp_get_num_threads();
+  int tree_neighbor_offset = 1;
+  do {
+#pragma omp for
+    for (int i = 0; i < team_size - tree_neighbor_offset;
+         i += 2 * tree_neighbor_offset) {
+      const int neighbor = i + tree_neighbor_offset;
+      join(lambda, &TeamThread_scratch[i * value_count],
+           &TeamThread_scratch[neighbor * value_count]);
+    }
+    tree_neighbor_offset *= 2;
+  } while (tree_neighbor_offset < team_size);
+  init_result = TeamThread_scratch[0];
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
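+ *
+ * A minimal usage sketch (assuming a team member `team` and a hypothetical
+ * rank-1 view `v` of length n):
+ *
+ *   double sum = 0;
+ *   Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, n),
+ *                           [&](const int i, double& val) { val += v(i); },
+ *                           sum);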
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread
+ * and a summation of val is performed and put into result.
+ */
+template
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda, ValueType& result) {
+  ValueType vector_reduce = ValueType();
+
+  if constexpr (std::is_arithmetic::value) {
+#pragma omp simd reduction(+ : vector_reduce)
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      vector_reduce += tmp;
+    }
+  } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+
+#pragma omp simd reduction(custom : vector_reduce)
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      lambda(i, vector_reduce);
+    }
+  }
+
+  result = vector_reduce;
+}
+
+template
+KOKKOS_INLINE_FUNCTION std::enable_if_t::value>
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType const& result) {
+  using ValueType = typename ReducerType::value_type;
+
+#pragma omp declare reduction(                                  \
+    custom:ValueType                                            \
+    : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \
+    initializer(                                                \
+        Impl::OpenMPTargetReducerWrapper ::init(omp_priv))
+
+  ValueType vector_reduce;
+  Impl::OpenMPTargetReducerWrapper::init(vector_reduce);
+
+#pragma omp simd reduction(custom : vector_reduce)
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, vector_reduce);
+  }
+
+  result.reference() = vector_reduce;
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread
+ * and a reduction of val is performed using JoinType(ValueType& val, const
+ * ValueType& update) and put into init_result. The input value of init_result
+ * is used as initializer for temporary variables of ValueType. Therefore the
+ * input value should be the neutral element with respect to the join operation
+ * (e.g. '0 for +-' or '1 for *').
+ */
+template
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda, const JoinType& join, ValueType& init_result) {
+  ValueType result = init_result;
+
+  // FIXME_OPENMPTARGET think about omp simd
+  // join does not work with omp reduction clause
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    ValueType tmp = ValueType();
+    lambda(i, tmp);
+    join(result, tmp);
+  }
+
+  init_result = result;
+}
+
+/** \brief  Intra-team vector parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling team
+ * and a summation of val is performed and put into result.
+ */
+template
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::TeamVectorRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+    const Lambda& lambda, ValueType& result) {
+  // FIXME_OPENMPTARGET - Make sure that if it's an array reduction, the number
+  // of elements in the array is <= 32. For reductions we allocate 16 bytes per
+  // element in the scratch space; hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamVector_scratch =
+      static_cast(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp barrier
+  TeamVector_scratch[0] = ValueType();
+#pragma omp barrier
+
+  if constexpr (std::is_arithmetic::value) {
+#pragma omp for simd reduction(+ : TeamVector_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamVector_scratch[0] += tmp;
+    }
+  } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+
+#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamVector_scratch[0] += tmp;
+    }
+  }
+
+  result = TeamVector_scratch[0];
+}
+
+#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
+template
+KOKKOS_INLINE_FUNCTION std::enable_if_t::value>
+parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType const& result) {
+  using ValueType = typename ReducerType::value_type;
+
+  // FIXME_OPENMPTARGET - Make sure that if it's an array reduction, the number
+  // of elements in the array is <= 32. For reductions we allocate 16 bytes per
+  // element in the scratch space; hence, 16*32 = 512.
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+#pragma omp declare reduction(                                  \
+    custom:ValueType                                            \
+    : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \
+    initializer(                                                \
+        Impl::OpenMPTargetReducerWrapper ::init(omp_priv))
+
+  ValueType* TeamVector_scratch =
+      static_cast(loop_boundaries.team.impl_reduce_scratch());
+
+#pragma omp barrier
+  Impl::OpenMPTargetReducerWrapper::init(TeamVector_scratch[0]);
+#pragma omp barrier
+
+#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, TeamVector_scratch[0]);
+  }
+
+  result.reference() = TeamVector_scratch[0];
+}
+#else
+template
+KOKKOS_INLINE_FUNCTION std::enable_if_t::value>
+parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
+                    iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                const Lambda& lambda, ReducerType const& result) {
+  using ValueType = typename ReducerType::value_type;
+
+  // FIXME_OPENMPTARGET - Make sure that if it's an array reduction, the number
+  // of elements in the array is <= 32. For reductions we allocate 16 bytes per
+  // element in the scratch space; hence, 16*32 = 512.
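+  // Sketch of what the `declare reduction` below gives OpenMP: each thread
+  // gets a private copy initialized via init(omp_priv), accumulates into it,
+  // and pairs of copies are then combined with join(omp_out, omp_in).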
+ static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + + ValueType* TeamVector_scratch = + static_cast(loop_boundaries.team.impl_reduce_scratch()); + +#pragma omp declare reduction( \ + omp_red_teamthread_reducer:ValueType \ + : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer( \ + Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp barrier + ValueType tmp; + result.init(tmp); + TeamVector_scratch[0] = tmp; +#pragma omp barrier + + iType team_size = iType(omp_get_num_threads()); +#pragma omp for simd reduction(omp_red_teamthread_reducer \ + : TeamVector_scratch[:1]) schedule(static, 1) + for (iType t = 0; t < team_size; t++) { + ValueType tmp2; + result.init(tmp2); + + for (iType i = loop_boundaries.start + t; i < loop_boundaries.end; + i += team_size) { + lambda(i, tmp2); + } + TeamVector_scratch[0] = tmp2; + } + + result.reference() = TeamVector_scratch[0]; +} +#endif // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND + +namespace Impl { + +template +class ParallelReduce, + ReducerType, Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = + Kokkos::Impl::TeamPolicyInternal; + + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using ReducerTypeFwd = + std::conditional_t::value, + FunctorType, ReducerType>; + using WorkTagFwd = + std::conditional_t::value, WorkTag, + void>; + using Analysis = Impl::FunctorAnalysis; + + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + using value_type = typename Analysis::value_type; + + bool m_result_ptr_on_device; + const int m_result_ptr_num_elems; + + static constexpr int HasJoin = + Impl::FunctorAnalysis::has_join_member_function; + static constexpr int UseReducer = is_reducer::value; + static constexpr int IsArray = std::is_pointer::value; + + using ParReduceSpecialize = + ParallelReduceSpecialize; + + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + const size_t m_shmem_size; + + public: + void execute() const { + if constexpr (HasJoin) { + ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr, + m_result_ptr_on_device); + } else if constexpr (UseReducer) { + ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr, + m_result_ptr_on_device); + } else if constexpr (IsArray) { + if (m_result_ptr_num_elems <= 2) { + ParReduceSpecialize::template execute_array<2>( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else if (m_result_ptr_num_elems <= 4) { + ParReduceSpecialize::template execute_array<4>( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else if (m_result_ptr_num_elems <= 8) { + ParReduceSpecialize::template execute_array<8>( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else if (m_result_ptr_num_elems <= 16) { + ParReduceSpecialize::template execute_array<16>( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else if (m_result_ptr_num_elems <= 32) { + ParReduceSpecialize::template execute_array<32>( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } else { + Kokkos::abort("array reduction length must be <= 32"); + } + } else { + ParReduceSpecialize::template execute_array<1>( + m_functor, m_policy, m_result_ptr, m_result_ptr_on_device); + } + } + + template + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, + const ViewType& arg_result, 
+ std::enable_if_t::value && + !Kokkos::is_reducer::value, + void*> = nullptr) + : m_result_ptr_on_device( + MemorySpaceAccess::accessible), + m_result_ptr_num_elems(arg_result.size()), + m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(InvalidType()), + m_result_ptr(arg_result.data()), + m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize::value( + arg_functor, arg_policy.team_size())) {} + + ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy, + const ReducerType& reducer) + : m_result_ptr_on_device( + MemorySpaceAccess::accessible), + m_result_ptr_num_elems(reducer.view().size()), + m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()), + m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize::value( + arg_functor, arg_policy.team_size())) {} +}; + +} // namespace Impl + +#ifdef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND +#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND +#endif +} // namespace Kokkos + +#endif diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp new file mode 100644 index 0000000000..72eefe5683 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -0,0 +1,252 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_PARALLELSCAN_RANGE_HPP +#define KOKKOS_OPENMPTARGET_PARALLELSCAN_RANGE_HPP + +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelScan, + Kokkos::Experimental::OpenMPTarget> { + protected: + using Policy = Kokkos::RangePolicy; + + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + using idx_type = typename Policy::index_type; + + using Analysis = Impl::FunctorAnalysis; + + using value_type = typename Analysis::value_type; + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + + const FunctorType m_functor; + const Policy m_policy; + + value_type* m_result_ptr; + const bool m_result_ptr_device_accessible; + + template + std::enable_if_t::value> call_with_tag( + const FunctorType& f, const idx_type& idx, value_type& val, + const bool& is_final) const { + f(idx, val, is_final); + } + template + std::enable_if_t::value> call_with_tag( + const FunctorType& f, const idx_type& idx, value_type& val, + const bool& is_final) const { + f(WorkTag(), idx, val, is_final); + } + + public: + void impl_execute( + Kokkos::View + element_values, + Kokkos::View + chunk_values, + Kokkos::View count) + const { + const idx_type N = m_policy.end() - m_policy.begin(); + const idx_type chunk_size = 128; + const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; + idx_type nteams = n_chunks > 512 ? 
512 : n_chunks;
+    idx_type team_size = 128;
+
+    FunctorType a_functor(m_functor);
+#pragma omp target teams distribute map(to : a_functor) num_teams(nteams)
+    for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
+      typename Analysis::Reducer final_reducer(&a_functor);
+#pragma omp parallel num_threads(team_size)
+      {
+        const idx_type local_offset = team_id * chunk_size;
+
+#pragma omp for
+        for (idx_type i = 0; i < chunk_size; ++i) {
+          const idx_type idx = local_offset + i;
+          value_type val;
+          final_reducer.init(&val);
+          if (idx < N) call_with_tag(a_functor, idx, val, false);
+          element_values(team_id, i) = val;
+        }
+#pragma omp barrier
+        if (omp_get_thread_num() == 0) {
+          value_type sum;
+          final_reducer.init(&sum);
+          for (idx_type i = 0; i < chunk_size; ++i) {
+            final_reducer.join(&sum, &element_values(team_id, i));
+            element_values(team_id, i) = sum;
+          }
+          chunk_values(team_id) = sum;
+        }
+#pragma omp barrier
+        if (omp_get_thread_num() == 0) {
+          if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) {
+            value_type sum;
+            final_reducer.init(&sum);
+            for (idx_type i = 0; i < n_chunks; ++i) {
+              final_reducer.join(&sum, &chunk_values(i));
+              chunk_values(i) = sum;
+            }
+          }
+        }
+      }
+    }
+
+#pragma omp target teams distribute map(to                             \
+                                        : a_functor) num_teams(nteams) \
+    thread_limit(team_size)
+    for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
+      typename Analysis::Reducer final_reducer(&a_functor);
+#pragma omp parallel num_threads(team_size)
+      {
+        const idx_type local_offset = team_id * chunk_size;
+        value_type offset_value;
+        if (team_id > 0)
+          offset_value = chunk_values(team_id - 1);
+        else
+          final_reducer.init(&offset_value);
+
+#pragma omp for
+        for (idx_type i = 0; i < chunk_size; ++i) {
+          const idx_type idx = local_offset + i;
+          value_type local_offset_value;
+          if (i > 0) {
+            local_offset_value = element_values(team_id, i - 1);
+            // FIXME_OPENMPTARGET We seem to access memory illegally on AMD
+            // GPUs
+#ifdef KOKKOS_ARCH_VEGA
+            if constexpr (Analysis::has_join_member_function) {
+              if constexpr (std::is_void_v)
+                a_functor.join(local_offset_value, offset_value);
+              else
+                a_functor.join(WorkTag{}, local_offset_value, offset_value);
+            } else
+              local_offset_value += offset_value;
+#else
+            final_reducer.join(&local_offset_value, &offset_value);
+#endif
+          } else
+            local_offset_value = offset_value;
+          if (idx < N) call_with_tag(a_functor, idx, local_offset_value, true);
+          if (idx == N - 1 && m_result_ptr_device_accessible)
+            *m_result_ptr = local_offset_value;
+        }
+      }
+    }
+  }
+
+  void execute() const {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget parallel_for");
+    const idx_type N          = m_policy.end() - m_policy.begin();
+    const idx_type chunk_size = 128;
+    const idx_type n_chunks   = (N + chunk_size - 1) / chunk_size;
+
+    // This could be scratch memory per team
+    Kokkos::View
+        element_values("element_values", n_chunks, chunk_size);
+    Kokkos::View
+        chunk_values("chunk_values", n_chunks);
+    Kokkos::View count(
+        "Count");
+
+    impl_execute(element_values, chunk_values, count);
+  }
+
+  //----------------------------------------
+
+  ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy,
+               pointer_type arg_result_ptr           = nullptr,
+               bool arg_result_ptr_device_accessible = false)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_result_ptr(arg_result_ptr),
+        m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {}
+
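+  // In outline, impl_execute() performs a classic chunked scan:
+  //   1. each team scans its 128-element chunk in place and records the
+  //      chunk total,
+  //   2. the last team to finish scans the chunk totals, and
+  //   3. a second pass re-invokes the functor with is_final = true, offset
+  //      by the totals of all preceding chunks.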
//---------------------------------------- +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template +class ParallelScanWithTotal, + ReturnType, Kokkos::Experimental::OpenMPTarget> + : public ParallelScan, + Kokkos::Experimental::OpenMPTarget> { + using base_t = ParallelScan, + Kokkos::Experimental::OpenMPTarget>; + using value_type = typename base_t::value_type; + + public: + void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); + const int chunk_size = 128; + const int64_t n_chunks = (N + chunk_size - 1) / chunk_size; + + if (N > 0) { + // This could be scratch memory per team + Kokkos::View + element_values("element_values", n_chunks, chunk_size); + Kokkos::View + chunk_values("chunk_values", n_chunks); + Kokkos::View count( + "Count"); + + base_t::impl_execute(element_values, chunk_values, count); + + if (!base_t::m_result_ptr_device_accessible) { + const int size = base_t::Analysis::value_size(base_t::m_functor); + DeepCopy( + base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size); + } + } else if (!base_t::m_result_ptr_device_accessible) { + *base_t::m_result_ptr = 0; + } + } + + template + ParallelScanWithTotal(const FunctorType& arg_functor, + const typename base_t::Policy& arg_policy, + const ViewType& arg_result_view) + : base_t(arg_functor, arg_policy, arg_result_view.data(), + MemorySpaceAccess::accessible) { + } +}; +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp new file mode 100644 index 0000000000..65002c1830 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp @@ -0,0 +1,129 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_PARALLELSCAN_TEAM_HPP +#define KOKKOS_OPENMPTARGET_PARALLELSCAN_TEAM_HPP + +#include +#include +#include +#include + +// FIXME_OPENMPTARGET - Using this macro to implement a workaround for +// hierarchical scan. It avoids hitting the code path which we wanted to +// write but doesn't work. undef'ed at the end. 
+#ifndef KOKKOS_ARCH_INTEL_GPU
+#define KOKKOS_IMPL_TEAM_SCAN_WORKAROUND
+#endif
+
+namespace Kokkos {
+
+// This is largely the same code as in HIP and CUDA except for the member name
+template
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds,
+    const FunctorType& lambda) {
+  using Analysis   = Impl::FunctorAnalysis,
+                                          FunctorType>;
+  using value_type = typename Analysis::value_type;
+
+  const auto start = loop_bounds.start;
+  const auto end   = loop_bounds.end;
+  // Note this thing is called .member in the CUDA specialization of
+  // TeamThreadRangeBoundariesStruct
+  auto& member         = loop_bounds.team;
+  const auto team_rank = member.team_rank();
+
+#if defined(KOKKOS_IMPL_TEAM_SCAN_WORKAROUND)
+  value_type scan_val = value_type();
+
+  if (team_rank == 0) {
+    for (iType i = start; i < end; ++i) {
+      lambda(i, scan_val, true);
+    }
+  }
+#pragma omp barrier
+#else
+  const auto team_size = member.team_size();
+  const auto nchunk    = (end - start + team_size - 1) / team_size;
+  value_type accum     = 0;
+  // each team has to process one or
+  // more chunks of the prefix scan
+  for (iType i = 0; i < nchunk; ++i) {
+    auto ii = start + i * team_size + team_rank;
+    // local accumulation for this chunk
+    value_type local_accum = 0;
+    // user updates value with prefix value
+    if (ii < loop_bounds.end) lambda(ii, local_accum, false);
+    // perform team scan
+    local_accum = member.team_scan(local_accum);
+    // add this block's accumulated value to the total accumulation
+    auto val = accum + local_accum;
+    // user updates their data with total accumulation
+    if (ii < loop_bounds.end) lambda(ii, val, true);
+    // the last value needs to be propagated to the next chunk
+    if (team_rank == team_size - 1) accum = val;
+    // broadcast last value to rest of the team
+    member.team_broadcast(accum, team_size - 1);
+  }
+#endif
+}
+
+}  // namespace Kokkos
+
+namespace Kokkos {
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes
+ * lambda(iType i, ValueType & val, bool final) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan
+ * operation is performed. Depending on the target execution space the operator
+ * might be called twice: once with final=false and once with final=true. When
+ * final==true val contains the prefix sum value. The contribution of this "i"
+ * needs to be added to val no matter whether final==true or not. In a serial
+ * execution (i.e. team_size==1) the operator is only called once with
+ * final==true. Scan_val will be set to the final sum value over all vector
+ * lanes.
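+ *
+ * A minimal usage sketch (assuming a team member `team` and hypothetical
+ * rank-1 views `in` and `out` of length n):
+ *
+ *   Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, n),
+ *                         [&](const int i, double& val, const bool final) {
+ *                           if (final) out(i) = val;  // exclusive prefix
+ *                           val += in(i);  // contribute regardless of final
+ *                         });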
+ */ +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const FunctorType& lambda) { + using Analysis = Impl::FunctorAnalysis, + FunctorType>; + using value_type = typename Analysis::value_type; + + value_type scan_val = value_type(); + +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) { + lambda(i, scan_val, true); + } +} + +} // namespace Kokkos + +#ifdef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND +#undef KOKKOS_IMPL_TEAM_SCAN_WORKAROUND +#endif + +#endif diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp new file mode 100644 index 0000000000..75b17b7235 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -0,0 +1,675 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_PARALLEL_COMMON_HPP +#define KOKKOS_OPENMPTARGET_PARALLEL_COMMON_HPP + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +// This class has the memcpy routine that is commonly used by ParallelReduce +// over RangePolicy and TeamPolicy. +template +struct ParallelReduceCopy { + // Copy the result back to device if the view is on the device. 
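+  // E.g. memcpy_result(dst, src, sizeof(double), true) issues an
+  // omp_target_memcpy from the host to the default device, while
+  // ptr_on_device == false degenerates to a plain host-side assignment.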
+ static void memcpy_result(PointerType dest, PointerType src, size_t size, + bool ptr_on_device) { + if (ptr_on_device) { + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(dest, src, size, 0, 0, + omp_get_default_device(), + omp_get_initial_device())); + } else { + *dest = *src; + } + } +}; + +// template +template +struct ParallelReduceSpecialize { + inline static void execute(const FunctorType& /*f*/, const PolicyType& /*p*/, + PointerType /*result_ptr*/) { + constexpr int FunctorHasJoin = + Impl::FunctorAnalysis::has_join_member_function; + constexpr int UseReducerType = is_reducer::value; + + std::stringstream error_message; + error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' ' + << UseReducerType << '\n'; + // FIXME_OPENMPTARGET + OpenMPTarget_abort(error_message.str().c_str()); + } +}; + +template +struct ParallelReduceSpecialize, + ReducerType, PointerType, ValueType> { + using PolicyType = Kokkos::RangePolicy; + using TagType = typename PolicyType::work_tag; + using ReducerTypeFwd = + std::conditional_t::value, + FunctorType, ReducerType>; + using Analysis = Impl::FunctorAnalysis; + using ReferenceType = typename Analysis::reference_type; + + using ParReduceCopy = ParallelReduceCopy; + + static void execute_reducer(const FunctorType& f, const PolicyType& p, + PointerType result_ptr, bool ptr_on_device) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const auto begin = p.begin(); + const auto end = p.end(); + + ValueType result; + OpenMPTargetReducerWrapper::init(result); + + // Initialize and copy back the result even if it is a zero length + // reduction. + if (end <= begin) { + ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), + ptr_on_device); + return; + } + +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams distribute parallel for map(to \ + : f) reduction(custom \ + : result) + for (auto i = begin; i < end; ++i) { + if constexpr (std::is_void::value) { + f(i, result); + } else { + f(TagType(), i, result); + } + } + + ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), + ptr_on_device); + } + + template + static void execute_array(const FunctorType& f, const PolicyType& p, + PointerType result_ptr, bool ptr_on_device) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const auto begin = p.begin(); + const auto end = p.end(); + + // Enter the loop if the reduction is on a scalar type. + if constexpr (NumReductions == 1) { + ValueType result = ValueType(); + + // Initialize and copy back the result even if it is a zero length + // reduction. + if (end <= begin) { + ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), + ptr_on_device); + return; + } + // Case where reduction is on a native data type. 
+      if constexpr (std::is_arithmetic::value) {
+#pragma omp target teams distribute parallel for \
+    map(to:f) reduction(+: result)
+        for (auto i = begin; i < end; ++i)
+
+          if constexpr (std::is_void::value) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+      } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+#pragma omp target teams distribute parallel for map(to                    \
+                                                     : f) reduction(custom \
+                                                                    : result)
+        for (auto i = begin; i < end; ++i)
+
+          if constexpr (std::is_void::value) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+      }
+
+      ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
+    } else {
+      ValueType result[NumReductions] = {};
+
+      // Initialize and copy back the result even if it is a zero length
+      // reduction.
+      if (end <= begin) {
+        ParReduceCopy::memcpy_result(result_ptr, result,
+                                     NumReductions * sizeof(ValueType),
+                                     ptr_on_device);
+        return;
+      }
+#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions])
+      for (auto i = begin; i < end; ++i) {
+        if constexpr (std::is_void::value) {
+          f(i, result);
+        } else {
+          f(TagType(), i, result);
+        }
+      }
+
+      ParReduceCopy::memcpy_result(
+          result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device);
+    }
+  }
+
+  static void execute_init_join(const FunctorType& f, const PolicyType& p,
+                                PointerType ptr, const bool ptr_on_device) {
+    const auto begin = p.begin();
+    const auto end   = p.end();
+
+    using FunctorAnalysis =
+        Impl::FunctorAnalysis;
+    constexpr int HasInit = FunctorAnalysis::has_init_member_function;
+
+    // Initialize the result pointer.
+
+    const auto size = end - begin;
+
+    // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently
+    // based on NVIDIA-V100 and should be modified to be based on the
+    // architecture in the future.
+    const int max_team_threads = 32;
+    const int max_teams =
+        OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads;
+    // Number of elements in the reduction
+    const auto value_count = FunctorAnalysis::value_count(f);
+
+    // Allocate scratch per active thread, achieved by setting the first
+    // argument of `resize_scratch` to 1.
+    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType),
+                                     std::numeric_limits::max());
+    ValueType* scratch_ptr =
+        static_cast(OpenMPTargetExec::get_scratch_ptr());
+
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
+    {
+      typename FunctorAnalysis::Reducer final_reducer(&f);
+      // Enter this loop if the functor has an `init`
+      if constexpr (HasInit) {
+        // The `init` routine needs to be called on the device since it might
+        // need device members.
+        final_reducer.init(scratch_ptr);
+        final_reducer.final(scratch_ptr);
+      } else {
+        for (int i = 0; i < value_count; ++i) {
+          static_cast(scratch_ptr)[i] = ValueType();
+        }
+
+        final_reducer.final(scratch_ptr);
+      }
+    }
+
+    if (end <= begin) {
+      // If there is no work to be done, copy back the initialized values and
+      // exit.
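+      // (In omp_target_memcpy the destination device argument precedes the
+      // source device: omp_get_initial_device() selects the host,
+      // omp_get_default_device() the offload device.)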
+      if (!ptr_on_device)
+        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_initial_device(), omp_get_default_device()));
+      else
+        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_default_device(), omp_get_default_device()));
+
+      return;
+    }
+
+#pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \
+    map(to                                                                   \
+        : f) is_device_ptr(scratch_ptr)
+    {
+      typename FunctorAnalysis::Reducer final_reducer(&f);
+#pragma omp parallel
+      {
+        const int team_num    = omp_get_team_num();
+        const int num_teams   = omp_get_num_teams();
+        const auto chunk_size = size / num_teams;
+        const auto team_begin = begin + team_num * chunk_size;
+        const auto team_end =
+            (team_num == num_teams - 1) ? end : (team_begin + chunk_size);
+        ValueType* team_scratch =
+            scratch_ptr + team_num * max_team_threads * value_count;
+        ReferenceType result = final_reducer.init(
+            &team_scratch[omp_get_thread_num() * value_count]);
+
+        // Accumulate partial results in thread-specific storage.
+#pragma omp for simd
+        for (auto i = team_begin; i < team_end; ++i) {
+          if constexpr (std::is_void::value) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+        }
+
+        // Reduce all partial results within a team.
+        const int team_size      = max_team_threads;
+        int tree_neighbor_offset = 1;
+        do {
+#pragma omp for simd
+          for (int i = 0; i < team_size - tree_neighbor_offset;
+               i += 2 * tree_neighbor_offset) {
+            const int neighbor = i + tree_neighbor_offset;
+            final_reducer.join(&team_scratch[i * value_count],
+                               &team_scratch[neighbor * value_count]);
+          }
+          tree_neighbor_offset *= 2;
+        } while (tree_neighbor_offset < team_size);
+      }  // end parallel
+    }    // end target
+
+    int tree_neighbor_offset = 1;
+    do {
+#pragma omp target teams distribute parallel for simd map(to   \
+                                                          : f) \
+    is_device_ptr(scratch_ptr)
+      for (int i = 0; i < max_teams - tree_neighbor_offset;
+           i += 2 * tree_neighbor_offset) {
+        typename FunctorAnalysis::Reducer final_reducer(&f);
+        ValueType* team_scratch = scratch_ptr;
+        const int team_offset   = max_team_threads * value_count;
+        final_reducer.join(
+            &team_scratch[i * team_offset],
+            &team_scratch[(i + tree_neighbor_offset) * team_offset]);
+
+        // If `final` is provided by the functor, do the final only once at
+        // the end.
+        if (tree_neighbor_offset * 2 >= max_teams && omp_get_team_num() == 0 &&
+            omp_get_thread_num() == 0) {
+          final_reducer.final(scratch_ptr);
+        }
+      }
+      tree_neighbor_offset *= 2;
+    } while (tree_neighbor_offset < max_teams);
+
+    // If the result view is on the host, copy back the values via memcpy.
+ if (!ptr_on_device) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, + omp_get_initial_device(), omp_get_default_device())); + else + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, + omp_get_default_device(), omp_get_default_device())); + } +}; + +template +struct ParallelReduceSpecialize, + ReducerType, PointerType, ValueType> { + using PolicyType = TeamPolicyInternal; + using TagType = typename PolicyType::work_tag; + using ReducerTypeFwd = + std::conditional_t::value, + FunctorType, ReducerType>; + using Analysis = Impl::FunctorAnalysis; + + using ReferenceType = typename Analysis::reference_type; + + using ParReduceCopy = ParallelReduceCopy; + + static void execute_reducer(const FunctorType& f, const PolicyType& p, + PointerType result_ptr, bool ptr_on_device) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + + const int league_size = p.league_size(); + const int team_size = p.team_size(); + const int vector_length = p.impl_vector_length(); + + const size_t shmem_size_L0 = p.scratch_size(0, team_size); + const size_t shmem_size_L1 = p.scratch_size(1, team_size); + OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, + shmem_size_L0, shmem_size_L1, league_size); + void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + + ValueType result = ValueType(); + + // Maximum active teams possible. + int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; + const auto nteams = + league_size < max_active_teams ? league_size : max_active_teams; + + // If the league size is <=0, do not launch the kernel. + if (nteams <= 0) return; + +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) + +#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ + : f) \ + is_device_ptr(scratch_ptr) reduction(custom \ + : result) +#pragma omp parallel reduction(custom : result) + { + const int blockIdx = omp_get_team_num(); + const int gridDim = omp_get_num_teams(); + + // Guarantee that the compilers respect the `num_teams` clause + if (gridDim <= nteams) { + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void::value) + f(team, result); + else + f(TagType(), team, result); + } + } else + Kokkos::abort("`num_teams` clause was not respected.\n"); + } + + // Copy results back to device if `parallel_reduce` is on a device view. 
+    // Copy results back to the device if the `parallel_reduce` result is in
+    // a device-resident view.
+    ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                 ptr_on_device);
+  }
+
+  template <int NumReductions>
+  static void execute_array(const FunctorType& f, const PolicyType& p,
+                            PointerType result_ptr, bool ptr_on_device) {
+    OpenMPTargetExec::verify_is_process(
+        "Kokkos::Experimental::OpenMPTarget parallel_reduce");
+    OpenMPTargetExec::verify_initialized(
+        "Kokkos::Experimental::OpenMPTarget parallel_reduce");
+
+    const int league_size   = p.league_size();
+    const int team_size     = p.team_size();
+    const int vector_length = p.impl_vector_length();
+
+    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
+    OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE,
+                                     shmem_size_L0, shmem_size_L1,
+                                     league_size);
+    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+    // Maximum number of teams that can be active at once.
+    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
+    const auto nteams =
+        league_size < max_active_teams ? league_size : max_active_teams;
+
+    // If the league size is <= 0, do not launch the kernel.
+    if (nteams <= 0) return;
+
+    // Case where the number of reduction items is 1.
+    if constexpr (NumReductions == 1) {
+      ValueType result = ValueType();
+
+      // Case where the reduction is on a native data type.
+      if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to    \
+                                                                        : f) \
+    is_device_ptr(scratch_ptr) reduction(+ : result)
+#pragma omp parallel reduction(+ : result)
+        {
+          const int blockIdx = omp_get_team_num();
+          const int gridDim  = omp_get_num_teams();
+
+          // Guarantee that the compilers respect the `num_teams` clause.
+          if (gridDim <= nteams) {
+            for (int league_id = blockIdx; league_id < league_size;
+                 league_id += gridDim) {
+              typename PolicyType::member_type team(
+                  league_id, league_size, team_size, vector_length,
+                  scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1);
+              if constexpr (std::is_void<TagType>::value)
+                f(team, result);
+              else
+                f(TagType(), team, result);
+            }
+          } else
+            Kokkos::abort("`num_teams` clause was not respected.\n");
+        }
+      } else {
+        // Case where the reduction is on a non-native data type.
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to    \
+                                                                        : f) \
+    is_device_ptr(scratch_ptr) reduction(custom                              \
+                                         : result)
+#pragma omp parallel reduction(custom : result)
+        {
+          const int blockIdx = omp_get_team_num();
+          const int gridDim  = omp_get_num_teams();
+
+          // Guarantee that the compilers respect the `num_teams` clause.
+          if (gridDim <= nteams) {
+            for (int league_id = blockIdx; league_id < league_size;
+                 league_id += gridDim) {
+              typename PolicyType::member_type team(
+                  league_id, league_size, team_size, vector_length,
+                  scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1);
+              if constexpr (std::is_void<TagType>::value)
+                f(team, result);
+              else
+                f(TagType(), team, result);
+            }
+          } else
+            Kokkos::abort("`num_teams` clause was not respected.\n");
+        }
+      }
+
+      // Copy results back to the device if the `parallel_reduce` result is
+      // in a device-resident view.
+      ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
+    } else {
+      ValueType result[NumReductions] = {};
+      // Case where the reduction is on an array.
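+      // OpenMP array sections in the reduction clause, i.e.
+      // `reduction(+ : result[:NumReductions])`, reduce every element
+      // independently, so this behaves like NumReductions scalar sum
+      // reductions performed in a single pass.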
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to    \
+                                                                        : f) \
+    is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions])
+#pragma omp parallel reduction(+ : result[:NumReductions])
+      {
+        const int blockIdx = omp_get_team_num();
+        const int gridDim  = omp_get_num_teams();
+
+        // Guarantee that the compilers respect the `num_teams` clause.
+        if (gridDim <= nteams) {
+          for (int league_id = blockIdx; league_id < league_size;
+               league_id += gridDim) {
+            typename PolicyType::member_type team(
+                league_id, league_size, team_size, vector_length, scratch_ptr,
+                blockIdx, shmem_size_L0, shmem_size_L1);
+            if constexpr (std::is_void<TagType>::value)
+              f(team, result);
+            else
+              f(TagType(), team, result);
+          }
+        } else
+          Kokkos::abort("`num_teams` clause was not respected.\n");
+      }
+
+      // Copy results back to the device if the `parallel_reduce` result is
+      // in a device-resident view.
+      ParReduceCopy::memcpy_result(result_ptr, result,
+                                   NumReductions * sizeof(ValueType),
+                                   ptr_on_device);
+    }
+  }
+
+  // FIXME_OPENMPTARGET : This routine is a copy of the `parallel_reduce`
+  // implementation over RangePolicy. It needs a new implementation.
+  static void execute_init_join(const FunctorType& f, const PolicyType& p,
+                                PointerType ptr, const bool ptr_on_device) {
+    using FunctorAnalysis =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                              PolicyType, FunctorType>;
+    constexpr int HasInit = FunctorAnalysis::has_init_member_function;
+
+    const int league_size   = p.league_size();
+    const int team_size     = p.team_size();
+    const int vector_length = p.impl_vector_length();
+
+    auto begin = 0;
+    auto end   = league_size * team_size + team_size * vector_length;
+
+    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
+
+    // FIXME_OPENMPTARGET: This would oversubscribe scratch memory since we
+    // are already using the available scratch memory to create temporaries
+    // for each thread.
+    if ((shmem_size_L0 + shmem_size_L1) > 0) {
+      Kokkos::abort(
+          "OpenMPTarget: Scratch memory is not supported in `parallel_reduce` "
+          "over functors with init/join.");
+    }
+
+    const auto nteams = league_size;
+
+    // Number of elements in the reduction.
+    const auto value_count = FunctorAnalysis::value_count(f);
+
+    // Allocate scratch per active thread.
+    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType),
+                                     league_size);
+    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+    // Take this branch if the functor provides an `init` member.
+    if constexpr (HasInit) {
+      // The `init` routine needs to be called on the device since it might
+      // need device members.
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
+      {
+        typename FunctorAnalysis::Reducer final_reducer(&f);
+        final_reducer.init(static_cast<ValueType*>(scratch_ptr));
+        final_reducer.final(static_cast<ValueType*>(scratch_ptr));
+      }
+    } else {
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
+      {
+        for (int i = 0; i < value_count; ++i) {
+          static_cast<ValueType*>(scratch_ptr)[i] = ValueType();
+        }
+
+        typename FunctorAnalysis::Reducer final_reducer(&f);
+        final_reducer.final(static_cast<ValueType*>(scratch_ptr));
+      }
+    }
+
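+    // The branch above serves functors that provide their own `init` (and
+    // optionally `final`) members. A hypothetical functor taking this path
+    // could look like:
+    //
+    //   struct MyTeamReduce {
+    //     KOKKOS_FUNCTION void operator()(const team_member_t& t,
+    //                                     double& v) const;
+    //     KOKKOS_FUNCTION void init(double& v) const { v = 0.0; }
+    //     KOKKOS_FUNCTION void join(double& d, const double& s) const {
+    //       d += s;
+    //     }
+    //     KOKKOS_FUNCTION void final(double&) const {}
+    //   };
+    //
+    // (`MyTeamReduce` and `team_member_t` are illustrative names only.)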
+    if (end <= begin) {
+      // If there is no work to be done, copy back the initialized values and
+      // exit.
+      if (!ptr_on_device)
+        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_initial_device(), omp_get_default_device()));
+      else
+        KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_default_device(), omp_get_default_device()));
+
+      return;
+    }
+
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to    \
+                                                                        : f) \
+    is_device_ptr(scratch_ptr)
+    {
+#pragma omp parallel
+      {
+        const int team_num  = omp_get_team_num();
+        const int num_teams = omp_get_num_teams();
+        ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr) +
+                                  team_num * team_size * value_count;
+        typename FunctorAnalysis::Reducer final_reducer(&f);
+        ReferenceType result = final_reducer.init(&team_scratch[0]);
+
+        for (int league_id = team_num; league_id < league_size;
+             league_id += num_teams) {
+          typename PolicyType::member_type team(
+              league_id, league_size, team_size, vector_length, scratch_ptr,
+              team_num, shmem_size_L0, shmem_size_L1);
+          if constexpr (std::is_void<TagType>::value) {
+            f(team, result);
+          } else {
+            f(TagType(), team, result);
+          }
+        }
+      }  // end parallel
+    }    // end target
+
+    int tree_neighbor_offset = 1;
+    do {
+#pragma omp target teams distribute parallel for simd map(to    \
+                                                           : f) \
+    is_device_ptr(scratch_ptr)
+      for (int i = 0; i < nteams - tree_neighbor_offset;
+           i += 2 * tree_neighbor_offset) {
+        ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr);
+        const int team_offset   = team_size * value_count;
+        typename FunctorAnalysis::Reducer final_reducer(&f);
+        final_reducer.join(
+            &team_scratch[i * team_offset],
+            &team_scratch[(i + tree_neighbor_offset) * team_offset]);
+
+        // If the functor provides a `final`, do it only once at the end.
+        if (tree_neighbor_offset * 2 >= nteams && omp_get_team_num() == 0 &&
+            omp_get_thread_num() == 0) {
+          final_reducer.final(static_cast<ValueType*>(scratch_ptr));
+        }
+      }
+      tree_neighbor_offset *= 2;
+    } while (tree_neighbor_offset < nteams);
+
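+    // Note the omp_target_memcpy argument order: (dst, src, length,
+    // dst_offset, src_offset, dst_device, src_device), where the host is
+    // addressed as omp_get_initial_device(). The first branch below is thus
+    // a device-to-host copy; the second keeps the result on the device.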
+    // If the result view is on the host, copy back the values via memcpy.
+    if (!ptr_on_device)
+      KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_initial_device(), omp_get_default_device()));
+    else
+      KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_default_device(), omp_get_default_device()));
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
index 21bdb67e34..251ca20b44 100644
--- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
@@ -19,7 +19,8 @@
 #include
 #include
-#include
+#include
+#include
 
 // WORKAROUND OPENMPTARGET: sometimes tile sizes don't make it correctly,
 // this was tracked down to a bug in clang with regards of mapping structs
@@ -437,7 +438,7 @@ class ParallelReduce, ReducerType,
   const Policy m_policy;
   const ReducerType m_reducer;
 
-  using ParReduceCommon = ParallelReduceCommon<PointerType>;
+  using ParReduceCopy = ParallelReduceCopy<PointerType>;
 
   bool m_result_ptr_on_device;
 
@@ -518,8 +519,8 @@ reduction(+:result)
       }
     }
 
-    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
-                                   m_result_ptr_on_device);
+    ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType),
+                                 m_result_ptr_on_device);
   }
 
   template
@@ -573,8 +574,8 @@ reduction(+:result)
       }
     }
 
-    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
-                                   m_result_ptr_on_device);
+    ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType),
+                                 m_result_ptr_on_device);
   }
 
   template
@@ -636,8 +637,8 @@ reduction(+:result)
      }
     }
 
-    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
-                                   m_result_ptr_on_device);
+    ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType),
+                                 m_result_ptr_on_device);
   }
 
   template
@@ -707,8 +708,8 @@ reduction(+:result)
       }
     }
 
-    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
-                                   m_result_ptr_on_device);
+    ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType),
+                                 m_result_ptr_on_device);
   }
 
   template
@@ -784,8 +785,8 @@ reduction(+:result)
       }
     }
 
-    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
-                                   m_result_ptr_on_device);
+    ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType),
+                                 m_result_ptr_on_device);
   }
 
   template
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp
new file mode 100644
index 0000000000..672271ed6b
--- /dev/null
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp
@@ -0,0 +1,694 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGETREDUCER_HPP
+#define KOKKOS_OPENMPTARGETREDUCER_HPP
+
+#include
+#include
+
+#include
+#include "Kokkos_OpenMPTarget_Abort.hpp"
+
+namespace Kokkos {
+namespace Impl {
+
+template <class Reducer>
+struct OpenMPTargetReducerWrapper {
+  using value_type = typename Reducer::value_type;
+
+  // Using a generic unknown Reducer for the OpenMPTarget backend is not
+  // implemented.
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type&, const value_type&) = delete;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type&, const volatile value_type&) = delete;
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type&) = delete;
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) { dest += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::sum();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) { dest *= src; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest *= src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::prod();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src < dest) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src < dest) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::min();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src > dest) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src > dest) dest = src;
+  }
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::max();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest = dest && src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest = dest && src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::land();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  using result_view_type = Kokkos::View<value_type, Space>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest = dest || src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest = dest || src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::lor();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest = dest & src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest = dest & src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::band();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
+ public:
+  // Required
+  using value_type = std::remove_cv_t<Scalar>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest = dest | src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest = dest | src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val = reduction_identity<value_type>::bor();
+  }
+};
+
+template <class Scalar, class Index, class Space>
+struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = ValLocScalar<scalar_type, index_type>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src.val < dest.val) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src.val < dest.val) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.val = reduction_identity<scalar_type>::min();
+    val.loc = reduction_identity<index_type>::min();
+  }
+};
+
+template <class Scalar, class Index, class Space>
+struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = ValLocScalar<scalar_type, index_type>;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src.val > dest.val) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src.val > dest.val) dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.val = reduction_identity<scalar_type>::max();
+    val.loc = reduction_identity<index_type>::min();
+  }
+};
+
+template <class Scalar, class Space>
+struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+
+ public:
+  // Required
+  using value_type = MinMaxScalar<scalar_type>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src.min_val < dest.min_val) {
+      dest.min_val = src.min_val;
+    }
+    if (src.max_val > dest.max_val) {
+      dest.max_val = src.max_val;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src.min_val < dest.min_val) {
+      dest.min_val = src.min_val;
+    }
+    if (src.max_val > dest.max_val) {
+      dest.max_val = src.max_val;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.max_val = reduction_identity<scalar_type>::max();
+    val.min_val = reduction_identity<scalar_type>::min();
+  }
+};
+
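+// As an illustration of the val/loc pairs handled above: a MinLoc<double,
+// int> reduction starts from {reduction_identity<double>::min(),
+// reduction_identity<int>::min()} and join keeps the pair with the smaller
+// val, so joining {3.0, 7} with {5.0, 2} yields {3.0, 7}; only the values
+// are compared, the index simply travels with its value.
+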
+template <class Scalar, class Index, class Space>
+struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = MinMaxLocScalar<scalar_type, index_type>;
+
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src.min_val < dest.min_val) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    }
+    if (src.max_val > dest.max_val) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src.min_val < dest.min_val) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    }
+    if (src.max_val > dest.max_val) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.max_val = reduction_identity<scalar_type>::max();
+    val.min_val = reduction_identity<scalar_type>::min();
+    val.max_loc = reduction_identity<index_type>::min();
+    val.min_loc = reduction_identity<index_type>::min();
+  }
+};
+
+//
+// specialize for MaxFirstLoc
+//
+template <class Scalar, class Index, class Space>
+struct OpenMPTargetReducerWrapper<MaxFirstLoc<Scalar, Index, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = ValLocScalar<scalar_type, index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but the Intel
+// compiler fails without it
+#pragma omp declare target
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (dest.val < src.val) {
+      dest = src;
+    } else if (!(src.val < dest.val)) {
+      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (dest.val < src.val) {
+      dest = src;
+    } else if (!(src.val < dest.val)) {
+      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.val = reduction_identity<scalar_type>::max();
+    val.loc = reduction_identity<index_type>::min();
+  }
+#pragma omp end declare target
+};
+
+//
+// specialize for MinFirstLoc
+//
+template <class Scalar, class Index, class Space>
+struct OpenMPTargetReducerWrapper<MinFirstLoc<Scalar, Index, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = ValLocScalar<scalar_type, index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but the Intel
+// compiler fails without it
+#pragma omp declare target
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src.val < dest.val) {
+      dest = src;
+    } else if (!(dest.val < src.val)) {
+      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src.val < dest.val) {
+      dest = src;
+    } else if (!(dest.val < src.val)) {
+      dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.val = reduction_identity<scalar_type>::min();
+    val.loc = reduction_identity<index_type>::min();
+  }
+#pragma omp end declare target
+};
+
+//
+// specialize for MinMaxFirstLastLoc
+//
+template <class Scalar, class Index, class Space>
+struct OpenMPTargetReducerWrapper<MinMaxFirstLastLoc<Scalar, Index, Space>> {
+ private:
+  using scalar_type = std::remove_cv_t<Scalar>;
+  using index_type  = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = MinMaxLocScalar<scalar_type, index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but the Intel
+// compiler fails without it
+#pragma omp declare target
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    if (src.min_val < dest.min_val) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    } else if (!(dest.min_val < src.min_val)) {
+      dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc;
+    }
+
+    if (dest.max_val < src.max_val) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    } else if (!(src.max_val < dest.max_val)) {
+      dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    if (src.min_val < dest.min_val) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    } else if (!(dest.min_val < src.min_val)) {
+      dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc;
+    }
+
+    if (dest.max_val < src.max_val) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    } else if (!(src.max_val < dest.max_val)) {
+      dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.max_val = reduction_identity<scalar_type>::max();
+    val.min_val = reduction_identity<scalar_type>::min();
+    val.max_loc = reduction_identity<index_type>::max();
+    val.min_loc = reduction_identity<index_type>::min();
+  }
+#pragma omp end declare target
+};
+
+//
+// specialize for FirstLoc
+//
+template <class Index, class Space>
+struct OpenMPTargetReducerWrapper<FirstLoc<Index, Space>> {
+ private:
+  using index_type = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = FirstLocScalar<index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but the Intel
+// compiler fails without it
+#pragma omp declare target
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest.min_loc_true = (src.min_loc_true < dest.min_loc_true)
+                            ? src.min_loc_true
+                            : dest.min_loc_true;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest.min_loc_true = (src.min_loc_true < dest.min_loc_true)
+                            ? src.min_loc_true
+                            : dest.min_loc_true;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.min_loc_true = reduction_identity<index_type>::min();
+  }
+#pragma omp end declare target
+};
+
+//
+// specialize for LastLoc
+//
+template <class Index, class Space>
+struct OpenMPTargetReducerWrapper<LastLoc<Index, Space>> {
+ private:
+  using index_type = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = LastLocScalar<index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but the Intel
+// compiler fails without it
+#pragma omp declare target
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest.max_loc_true = (src.max_loc_true > dest.max_loc_true)
+                            ? src.max_loc_true
+                            : dest.max_loc_true;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest.max_loc_true = (src.max_loc_true > dest.max_loc_true)
+                            ? src.max_loc_true
+                            : dest.max_loc_true;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.max_loc_true = reduction_identity<index_type>::max();
+  }
+#pragma omp end declare target
+};
+
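+// The Std* wrappers below track a partition boundary: max_loc_true is the
+// largest index where the predicate held, min_loc_false the smallest index
+// where it failed. For example, joining the results for [true, true, false]
+// yields {max_loc_true = 1, min_loc_false = 2}; the range is partitioned
+// exactly when max_loc_true < min_loc_false.
+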
+//
+// specialize for StdIsPartitioned
+//
+template <class Index, class Space>
+struct OpenMPTargetReducerWrapper<StdIsPartitioned<Index, Space>> {
+ private:
+  using index_type = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = StdIsPartScalar<index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but the Intel
+// compiler fails without it
+#pragma omp declare target
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest.max_loc_true = (dest.max_loc_true < src.max_loc_true)
+                            ? src.max_loc_true
+                            : dest.max_loc_true;
+
+    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
+                             ? dest.min_loc_false
+                             : src.min_loc_false;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest.max_loc_true = (dest.max_loc_true < src.max_loc_true)
+                            ? src.max_loc_true
+                            : dest.max_loc_true;
+
+    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
+                             ? dest.min_loc_false
+                             : src.min_loc_false;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.max_loc_true  = ::Kokkos::reduction_identity<index_type>::max();
+    val.min_loc_false = ::Kokkos::reduction_identity<index_type>::min();
+  }
+#pragma omp end declare target
+};
+
+//
+// specialize for StdPartitionPoint
+//
+template <class Index, class Space>
+struct OpenMPTargetReducerWrapper<StdPartitionPoint<Index, Space>> {
+ private:
+  using index_type = std::remove_cv_t<Index>;
+
+ public:
+  // Required
+  using value_type = StdPartPointScalar<index_type>;
+
+// WORKAROUND OPENMPTARGET
+// This pragma omp declare target should not be necessary, but the Intel
+// compiler fails without it
+#pragma omp declare target
+  // Required
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type& dest, const value_type& src) {
+    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
+                             ? dest.min_loc_false
+                             : src.min_loc_false;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join(volatile value_type& dest, const volatile value_type& src) {
+    dest.min_loc_false = (dest.min_loc_false < src.min_loc_false)
+                             ? dest.min_loc_false
+                             : src.min_loc_false;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type& val) {
+    val.min_loc_false = ::Kokkos::reduction_identity<index_type>::min();
+  }
+#pragma omp end declare target
+};
+
+/*
+template <class ReducerType>
+class OpenMPTargetReducerWrapper {
+ public:
+  const ReducerType& reducer;
+  using value_type = typename ReducerType::value_type;
+  value_type& value;
+
+  KOKKOS_INLINE_FUNCTION
+  void join(const value_type& upd) {
+    reducer.join(value, upd);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(const value_type& upd) {
+    reducer.init(value, upd);
+  }
+};
+*/
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp
index c7f146871b..d9ea555055 100644
--- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp
@@ -20,7 +20,7 @@
 #include
 
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
-#include <Kokkos_OpenMPTarget.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget.hpp>
 #include
 #include
 #include
diff --git a/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp b/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp
index 0bd89ef4cf..6bde8f59d8 100644
--- a/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp
+++ b/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp
@@ -18,10 +18,17 @@
 #define KOKKOS_DECLARE_OPENMPTARGET_HPP
 
 #if defined(KOKKOS_ENABLE_OPENMPTARGET)
-#include <Kokkos_OpenMPTarget.hpp>
-#include <Kokkos_OpenMPTargetSpace.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp>
+#include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
 #endif
 
 #endif