Don't call the functor's destructor on the device for Serial and Cuda
masterleinad committed Feb 28, 2023
1 parent d177f61 commit f93e48a
Showing 2 changed files with 54 additions and 36 deletions.
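The change is the same in both backends: the ParallelScan and ParallelScanWithTotal drivers stop storing the bare functor (const FunctorType m_functor) and instead store a CombinedFunctorReducer<FunctorType, typename Analysis::Reducer> built once on the host. Device code reaches the functor and reducer through get_functor()/get_reducer(), which hand back references, so no temporary reducer holding a copy of the functor is ever constructed and destroyed inside a kernel. A minimal sketch of the bundling pattern, assuming only the interface visible in the diff below (the real Kokkos::Impl::CombinedFunctorReducer is more involved):

// Hypothetical sketch, not the actual Kokkos implementation: bundle the
// user functor with the reducer derived from it behind reference-returning
// accessors, so callers never materialize temporaries of either type.
template <class FunctorType, class ReducerType>
class CombinedFunctorReducer {
 public:
  CombinedFunctorReducer(const FunctorType& functor, const ReducerType& reducer)
      : m_functor(functor), m_reducer(reducer) {}

  // Accessors used throughout the diff below.
  const FunctorType& get_functor() const { return m_functor; }
  const ReducerType& get_reducer() const { return m_reducer; }

 private:
  FunctorType m_functor;
  ReducerType m_reducer;
};

This also lets call sites that previously computed Analysis::value_size(m_functor) ask the stored reducer directly, via get_reducer().value_size().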
59 changes: 36 additions & 23 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -421,7 +421,8 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
   // (c) gridDim.x <= blockDim.y * blockDim.y
   // (d) gridDim.y == gridDim.z == 1
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>
+      m_functor_reducer;
   const Policy m_policy;
   word_size_type* m_scratch_space;
   size_type* m_scratch_flags;
@@ -433,23 +434,25 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
   template <class TagType>
   __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update, const bool final_result) const {
-    m_functor(i, update, final_result);
+    m_functor_reducer.get_functor()(i, update, final_result);
   }
 
   template <class TagType>
   __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update, const bool final_result) const {
-    m_functor(TagType(), i, update, final_result);
+    m_functor_reducer.get_functor()(TagType(), i, update, final_result);
   }
 
   //----------------------------------------
 
   __device__ inline void initial() const {
-    typename Analysis::Reducer final_reducer(m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
 
     const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
                                                         sizeof(word_size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
+        word_count(Analysis::value_size(m_functor_reducer.get_functor()) /
+                   sizeof(word_size_type));
 
     word_size_type* const shared_value =
         kokkos_impl_cuda_shared_memory<word_size_type>() +
@@ -485,11 +488,13 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
   //----------------------------------------
 
   __device__ inline void final() const {
-    typename Analysis::Reducer final_reducer(m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
 
     const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
                                                         sizeof(word_size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
+        word_count(Analysis::value_size(m_functor_reducer.get_functor()) /
+                   sizeof(word_size_type));
 
     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
     // value[2] , ... }
@@ -619,7 +624,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
     if (nwork) {
       constexpr int GridMaxComputeCapability_2x = 0x0ffff;
 
-      const int block_size = local_block_size(m_functor);
+      const int block_size = local_block_size(m_functor_reducer.get_functor());
       KOKKOS_ASSERT(block_size > 0);
 
       const int grid_max =
@@ -639,13 +644,15 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 
       m_scratch_space =
           reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
-              m_policy.space(), Analysis::value_size(m_functor) * grid_x));
+              m_policy.space(),
+              Analysis::value_size(m_functor_reducer.get_functor()) * grid_x));
       m_scratch_flags =
           cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
 
       dim3 grid(grid_x, 1, 1);
       dim3 block(1, block_size, 1);  // REQUIRED DIMENSIONS ( 1 , N , 1 )
-      const int shmem = Analysis::value_size(m_functor) * (block_size + 2);
+      const int shmem = Analysis::value_size(m_functor_reducer.get_functor()) *
+                        (block_size + 2);
 
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
       if (m_run_serial) {
@@ -670,7 +677,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
   }
 
   ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor),
+      : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}),
         m_policy(arg_policy),
         m_scratch_space(nullptr),
         m_scratch_flags(nullptr),
@@ -728,7 +735,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   // (c) gridDim.x <= blockDim.y * blockDim.y
   // (d) gridDim.y == gridDim.z == 1
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>
+      m_functor_reducer;
   const Policy m_policy;
   word_size_type* m_scratch_space;
   size_type* m_scratch_flags;
@@ -743,23 +751,25 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   template <class TagType>
   __device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update, const bool final_result) const {
-    m_functor(i, update, final_result);
+    m_functor_reducer.get_functor()(i, update, final_result);
   }
 
   template <class TagType>
   __device__ inline std::enable_if_t<!std::is_void<TagType>::value> exec_range(
       const Member& i, reference_type update, const bool final_result) const {
-    m_functor(TagType(), i, update, final_result);
+    m_functor_reducer.get_functor()(TagType(), i, update, final_result);
   }
 
   //----------------------------------------
 
   __device__ inline void initial() const {
-    typename Analysis::Reducer final_reducer(m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
 
     const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
                                                         sizeof(word_size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
+        word_count(Analysis::value_size(m_functor_reducer.get_functor()) /
+                   sizeof(word_size_type));
 
     word_size_type* const shared_value =
         kokkos_impl_cuda_shared_memory<word_size_type>() +
@@ -795,11 +805,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   //----------------------------------------
 
   __device__ inline void final() const {
-    typename Analysis::Reducer final_reducer(m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
 
     const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
                                                         sizeof(word_size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
+        word_count(final_reducer.value_size() / sizeof(word_size_type));
 
     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
     // value[2] , ... }
@@ -935,7 +946,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     if (nwork) {
       enum { GridMaxComputeCapability_2x = 0x0ffff };
 
-      const int block_size = local_block_size(m_functor);
+      const int block_size = local_block_size(m_functor_reducer.get_functor());
       KOKKOS_ASSERT(block_size > 0);
 
       const int grid_max =
@@ -953,15 +964,17 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
       // How many block are really needed for this much work:
       const int grid_x = (nwork + work_per_block - 1) / work_per_block;
 
+      const typename Analysis::Reducer& final_reducer =
+          m_functor_reducer.get_reducer();
       m_scratch_space =
           reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
-              m_policy.space(), Analysis::value_size(m_functor) * grid_x));
+              m_policy.space(), final_reducer.value_size() * grid_x));
       m_scratch_flags =
           cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
 
       dim3 grid(grid_x, 1, 1);
       dim3 block(1, block_size, 1);  // REQUIRED DIMENSIONS ( 1 , N , 1 )
-      const int shmem = Analysis::value_size(m_functor) * (block_size + 2);
+      const int shmem = final_reducer.value_size() * (block_size + 2);
 
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
       if (m_run_serial) {
@@ -982,7 +995,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
             m_policy.space()
                 .impl_internal_space_instance());  // copy to device and execute
 
-      const int size = Analysis::value_size(m_functor);
+      const int size = final_reducer.value_size();
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
       if (m_run_serial)
         DeepCopy<HostSpace, CudaSpace, Cuda>(m_policy.space(), &m_returnvalue,
@@ -1003,7 +1016,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   ParallelScanWithTotal(const FunctorType& arg_functor,
                         const Policy& arg_policy,
                         const ViewType& arg_result_view)
-      : m_functor(arg_functor),
+      : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}),
         m_policy(arg_policy),
         m_scratch_space(nullptr),
         m_scratch_flags(nullptr),
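The device-side effect is easiest to see in the initial() and final() bodies above: the old code constructed a local typename Analysis::Reducer from m_functor inside the kernel, so the reducer's copy of the functor was destroyed on the device when that local went out of scope; the new code merely binds a reference to the reducer already stored in m_functor_reducer. A host-only model of the difference, with stand-in types invented for illustration:

#include <cstdio>

// Stand-ins for the real types; only the copy/destroy behavior matters here.
struct Functor {
  Functor() = default;
  Functor(const Functor&) = default;
  ~Functor() { std::printf("~Functor()\n"); }
};

struct Reducer {
  explicit Reducer(const Functor& f) : m_functor(f) {}
  Functor m_functor;  // the reducer keeps a copy of the functor
};

int main() {
  Functor functor;
  Reducer stored(functor);  // built once, outside the "kernel"

  {  // old pattern: construct a local Reducer inside the kernel body
    Reducer final_reducer(functor);
  }  // ~Reducer -> ~Functor runs here; on Cuda this happened on the device

  {  // new pattern: bind a reference to the reducer stored in the driver
    const Reducer& final_reducer = stored;
    (void)final_reducer;
  }  // nothing is destroyed here

  std::printf("done\n");
  return 0;
}  // destructors for functor and stored run here, on the host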
31 changes: 18 additions & 13 deletions core/src/Serial/Kokkos_Serial_Parallel_Range.hpp
@@ -154,15 +154,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>
+      m_functor_reducer;
   const Policy m_policy;
 
   template <class TagType>
   inline std::enable_if_t<std::is_void<TagType>::value> exec(
       reference_type update) const {
     const typename Policy::member_type e = m_policy.end();
     for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(i, update, true);
+      m_functor_reducer.get_functor()(i, update, true);
     }
   }
 
@@ -172,13 +173,15 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
     const TagType t{};
     const typename Policy::member_type e = m_policy.end();
     for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(t, i, update, true);
+      m_functor_reducer.get_functor()(t, i, update, true);
     }
   }
 
  public:
   inline void execute() const {
-    const size_t pool_reduce_size = Analysis::value_size(m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
+    const size_t pool_reduce_size = final_reducer.value_size();
     const size_t team_reduce_size = 0;  // Never shrinks
     const size_t team_shared_size = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
@@ -191,16 +194,15 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
         pool_reduce_size, team_reduce_size, team_shared_size,
         thread_local_size);
 
-    typename Analysis::Reducer final_reducer(m_functor);
-
     reference_type update = final_reducer.init(pointer_type(
         internal_instance->m_thread_team_data.pool_reduce_local()));
 
     this->template exec<WorkTag>(update);
   }
 
   inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
+      : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}),
+        m_policy(arg_policy) {}
 };
 
 /*--------------------------------------------------------------------------*/
@@ -218,7 +220,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
 
-  const FunctorType m_functor;
+  const CombinedFunctorReducer<FunctorType, typename Analysis::Reducer>
+      m_functor_reducer;
   const Policy m_policy;
   const pointer_type m_result_ptr;
 
@@ -227,7 +230,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
       reference_type update) const {
     const typename Policy::member_type e = m_policy.end();
     for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(i, update, true);
+      m_functor_reducer.get_functor()(i, update, true);
     }
   }
 
@@ -237,13 +240,14 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     const TagType t{};
     const typename Policy::member_type e = m_policy.end();
     for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
-      m_functor(t, i, update, true);
+      m_functor_reducer.get_functor()(t, i, update, true);
     }
   }
 
  public:
   inline void execute() {
-    const size_t pool_reduce_size = Analysis::value_size(m_functor);
+    const size_t pool_reduce_size =
+        m_functor_reducer.get_reducer().value_size();
     const size_t team_reduce_size = 0;  // Never shrinks
     const size_t team_shared_size = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
@@ -256,7 +260,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
         pool_reduce_size, team_reduce_size, team_shared_size,
         thread_local_size);
 
-    typename Analysis::Reducer final_reducer(m_functor);
+    const typename Analysis::Reducer& final_reducer =
+        m_functor_reducer.get_reducer();
 
     reference_type update = final_reducer.init(pointer_type(
         internal_instance->m_thread_team_data.pool_reduce_local()));
@@ -271,7 +276,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   ParallelScanWithTotal(const FunctorType& arg_functor,
                         const Policy& arg_policy,
                         const ViewType& arg_result_view)
-      : m_functor(arg_functor),
+      : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}),
         m_policy(arg_policy),
         m_result_ptr(arg_result_view.data()) {
     static_assert(
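None of this changes the user-facing API; a parallel_scan like the following generic sketch (not part of this commit) still routes through the ParallelScanWithTotal specializations rewritten above when a total is requested:

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int n = 16;
    Kokkos::View<long*> prefix("prefix", n);
    long total = 0;  // requesting a total selects the ParallelScanWithTotal path

    Kokkos::parallel_scan(
        "inclusive_prefix_sum", Kokkos::RangePolicy<>(0, n),
        KOKKOS_LAMBDA(const int i, long& update, const bool is_final) {
          update += i;                    // accumulate the running sum
          if (is_final) prefix(i) = update;  // write result on the final pass
        },
        total);
  }
  Kokkos::finalize();
  return 0;
}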
