diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp index 45b8f42f17..02905572e1 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -67,6 +67,8 @@ void OpenMPTargetExec::verify_initialized(const char* const label) { msg.append(" ERROR: not initialized"); Kokkos::Impl::throw_runtime_exception(msg); } + OpenMPTargetExec::MAX_ACTIVE_THREADS = + Kokkos::Experimental::OpenMPTarget().concurrency(); } void* OpenMPTargetExec::m_scratch_ptr = nullptr; @@ -74,6 +76,7 @@ int64_t OpenMPTargetExec::m_scratch_size = 0; int* OpenMPTargetExec::m_lock_array = nullptr; uint64_t OpenMPTargetExec::m_lock_size = 0; uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; +int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; void OpenMPTargetExec::clear_scratch() { Kokkos::Experimental::OpenMPTargetSpace space; @@ -100,11 +103,23 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, const int64_t shmem_size = shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. const int64_t padding = shmem_size * 10 / 100; // Padding per team. + + // Maximum active teams possible. + // The number should not exceed the maximum in-flight teams possible or the + // league_size. + int max_active_teams = + std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + + // max_active_teams is the number of active teams on the given hardware. + // We set the number of teams to be twice the number of max_active_teams for + // the compiler to pick the right number in its case. + omp_set_num_teams(max_active_teams * 2); + // Total amount of scratch memory allocated is depenedent // on the maximum number of in-flight teams possible. int64_t total_size = (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * - std::min(MAX_ACTIVE_THREADS / team_size, league_size); + max_active_teams * 2; if (total_size > m_scratch_size) { space.deallocate(m_scratch_ptr, m_scratch_size); diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 02f42ee2a6..3999920517 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -66,13 +67,40 @@ void OpenMPTargetInternal::fence(const std::string& name, } } int OpenMPTargetInternal::concurrency() const { - return 128000; // FIXME_OPENMPTARGET + int max_threads = 2048 * 80; +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + int max_threads_sm = 2048; +#if defined(KOKKOS_ARCH_AMPERE86) + max_threads = max_threads_sm * 84; +#elif defined(KOKKOS_ARCH_AMPERE80) + max_threads = max_threads_sm * 108; +#elif defined(KOKKOS_ARCH_VOLTA72) + max_threads = max_threads_sm * 84; +#elif defined(KOKKOS_ARCH_VOLTA70) + max_threads = max_threads_sm * 80; +#elif defined(KOKKOS_ARCH_PASCAL60) || defined(KOKKOS_ARCH_PASCAL61) + max_threads = max_threads_sm * 60; +#endif +#elif defined(KOKKOS_ARCH_INTEL_GPU) +#pragma omp target map(max_threads) + { max_threads = omp_get_num_procs(); } + + // Multiply the number of processors with the SIMD length. + max_threads *= 32; +#endif + + return max_threads; } const char* OpenMPTargetInternal::name() { return "OpenMPTarget"; } void OpenMPTargetInternal::print_configuration(std::ostream& os, bool /*verbose*/) const { // FIXME_OPENMPTARGET os << "Using OpenMPTarget\n"; +#if defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) + os << "Defined KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU: Workaround " + "for " + "hierarchical parallelism for Intel GPUs."; +#endif } void OpenMPTargetInternal::impl_finalize() { diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index 50167e297b..9767d8e53e 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -26,6 +26,12 @@ #include #include "Kokkos_OpenMPTarget_Abort.hpp" +// Intel architectures prefer the classical hierarchical parallelism that relies +// on OpenMP. +#if defined(KOKKOS_ARCH_INTEL_GPU) +#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -727,8 +733,7 @@ class OpenMPTargetExec { // teams possible is calculated based on NVIDIA's Volta GPU. In // future this value should be based on the chosen architecture for the // OpenMPTarget backend. - static constexpr int MAX_ACTIVE_THREADS = 2080 * 80; - static constexpr int MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS / 32; + static int MAX_ACTIVE_THREADS; private: static void* scratch_ptr; diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp index 12de3423f8..4aefbc96cd 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp @@ -115,44 +115,68 @@ class ParallelFor, // mode but works in the Debug mode. // Maximum active teams possible. - int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; - // nteams should not exceed the maximum in-flight teams possible. - const auto nteams = - league_size < max_active_teams ? league_size : max_active_teams; + int max_active_teams = omp_get_max_teams(); + + // FIXME_OPENMPTARGET: Although the maximum number of teams is set using the + // omp_set_num_teams in the resize_scratch routine, the call is not + // respected. Hence we need to use `num_teams` routine to restrict the + // number of teams generated to max_active_teams. Hopefully we can avoid the + // num_teams clause in the future and let compiler pick the right number of + // teams. This is not true for Intel architectures. // If the league size is <=0, do not launch the kernel. - if (nteams <= 0) return; + if (max_active_teams <= 0) return; // Performing our own scheduling of teams to avoid separation of code between // teams-distribute and parallel. Gave a 2x performance boost in test cases with // the clang compiler. atomic_compare_exchange can be avoided since the standard // guarantees that the number of teams specified in the `num_teams` clause is // always less than or equal to the maximum concurrently running teams. -#pragma omp target teams num_teams(nteams) thread_limit(team_size) \ - map(to \ - : a_functor) is_device_ptr(scratch_ptr) +#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) +#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \ + num_teams(max_active_teams) is_device_ptr(scratch_ptr) #pragma omp parallel { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + const int blockIdx = omp_get_team_num(); const int gridDim = omp_get_num_teams(); // Iterate through the number of teams until league_size and assign the // league_id accordingly // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename Policy::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - m_functor(team); - else - m_functor(TagType(), team); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename Policy::member_type team(league_id, league_size, team_size, + vector_length, scratch_ptr, blockIdx, + shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + m_functor(team); + else + m_functor(TagType(), team); + } } +#else +#pragma omp target teams distribute firstprivate(a_functor) \ + is_device_ptr(scratch_ptr) num_teams(max_active_teams) \ + thread_limit(team_size) + for (int i = 0; i < league_size; i++) { +#pragma omp parallel + { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + + typename Policy::member_type team(i, league_size, team_size, + vector_length, scratch_ptr, i, + shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + m_functor(team); + else + m_functor(TagType(), team); + } + } +#endif } public: diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp index 2ce25f9ffd..ceb1337c58 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -106,7 +106,7 @@ struct ParallelReduceSpecialize, : f) reduction(custom \ : result) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { + if constexpr (std::is_void_v) { f(i, result); } else { f(TagType(), i, result); @@ -138,13 +138,14 @@ struct ParallelReduceSpecialize, ptr_on_device); return; } + // Case where reduction is on a native data type. if constexpr (std::is_arithmetic::value) { #pragma omp target teams distribute parallel for \ map(to:f) reduction(+: result) for (auto i = begin; i < end; ++i) - if constexpr (std::is_void::value) { + if constexpr (std::is_void_v) { f(i, result); } else { f(TagType(), i, result); @@ -156,7 +157,7 @@ struct ParallelReduceSpecialize, : result) for (auto i = begin; i < end; ++i) - if constexpr (std::is_void::value) { + if constexpr (std::is_void_v) { f(i, result); } else { f(TagType(), i, result); @@ -178,7 +179,7 @@ struct ParallelReduceSpecialize, } #pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions]) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { + if constexpr (std::is_void_v) { f(i, result); } else { f(TagType(), i, result); @@ -261,7 +262,7 @@ struct ParallelReduceSpecialize, // Accumulate partial results in thread specific storage. #pragma omp for simd for (auto i = team_begin; i < team_end; ++i) { - if constexpr (std::is_void::value) { + if constexpr (std::is_void_v) { f(i, result); } else { f(TagType(), i, result); @@ -355,42 +356,60 @@ struct ParallelReduceSpecialize, ValueType result = ValueType(); // Maximum active teams possible. - int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; - const auto nteams = - league_size < max_active_teams ? league_size : max_active_teams; + int max_active_teams = omp_get_max_teams(); // If the league size is <=0, do not launch the kernel. - if (nteams <= 0) return; + if (max_active_teams <= 0) return; #pragma omp declare reduction( \ custom:ValueType \ : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(custom \ - : result) +#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ + firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom \ + : result) #pragma omp parallel reduction(custom : result) { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + const int blockIdx = omp_get_team_num(); const int gridDim = omp_get_num_teams(); // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + f(team, result); + else + f(TagType(), team, result); + } + } +#else +#pragma omp target teams distribute firstprivate(f) is_device_ptr(scratch_ptr) \ + num_teams(max_active_teams) thread_limit(team_size) reduction(custom \ + : result) + for (int i = 0; i < league_size; i++) { +#pragma omp parallel reduction(custom : result) + { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + + typename PolicyType::member_type team(i, league_size, team_size, + vector_length, scratch_ptr, i, + shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + f(team, result); + else + f(TagType(), team, result); + } } +#endif // Copy results back to device if `parallel_reduce` is on a device view. ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), @@ -416,12 +435,10 @@ struct ParallelReduceSpecialize, void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); // Maximum active teams possible. - int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; - const auto nteams = - league_size < max_active_teams ? league_size : max_active_teams; + int max_active_teams = omp_get_max_teams(); // If the league size is <=0, do not launch the kernel. - if (nteams <= 0) return; + if (max_active_teams <= 0) return; // Case where the number of reduction items is 1. if constexpr (NumReductions == 1) { @@ -429,55 +446,55 @@ struct ParallelReduceSpecialize, // Case where reduction is on a native data type. if constexpr (std::is_arithmetic::value) { -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ : f) \ is_device_ptr(scratch_ptr) reduction(+: result) #pragma omp parallel reduction(+ : result) { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + const int blockIdx = omp_get_team_num(); const int gridDim = omp_get_num_teams(); // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + f(team, result); + else + f(TagType(), team, result); + } } } else { // Case where the reduction is on a non-native data type. #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(custom \ - : result) +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ + map(to \ + : f) is_device_ptr(scratch_ptr) reduction(custom \ + : result) #pragma omp parallel reduction(custom : result) { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + const int blockIdx = omp_get_team_num(); const int gridDim = omp_get_num_teams(); // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + f(team, result); + else + f(TagType(), team, result); + } } } @@ -487,28 +504,28 @@ struct ParallelReduceSpecialize, } else { ValueType result[NumReductions] = {}; // Case where the reduction is on an array. -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ : f) \ is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions]) #pragma omp parallel reduction(+ : result[:NumReductions]) { + if (omp_get_num_teams() > max_active_teams) + Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); + const int blockIdx = omp_get_team_num(); const int gridDim = omp_get_num_teams(); // Guarantee that the compilers respect the `num_teams` clause - if (gridDim <= nteams) { - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) - f(team, result); - else - f(TagType(), team, result); - } - } else - Kokkos::abort("`num_teams` clause was not respected.\n"); + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_void_v) + f(team, result); + else + f(TagType(), team, result); + } } // Copy results back to device if `parallel_reduce` is on a device view. @@ -593,7 +610,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, team_num, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void::value) { + if constexpr (std::is_void_v) { f(team, result); } else { f(TagType(), team, result);