Further update to CUDA occupancy calculation (#5739)
* Update the occupancy calculation to reflect the maximum number of registers per SM, subject to the warp allocation granularity of 4 warps per SM

* Addressed PR comments

* Propagated warp allocation constraints to occupancy control functions

* Reduce occupancy-related code duplication in CUDA Parallel MDRange

* Comment cleanup

* clang-format

* Check per-kernel perf, not end-to-end...

* Update core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp

Co-authored-by: Damien L-G <dalg24+github@gmail.com>

* Apply suggestions from code review

Co-authored-by: Damien L-G <dalg24+github@gmail.com>

* Clarified comments, fixed compile error in suggestion that I missed

* clang-format

---------

Co-authored-by: Damien L-G <dalg24+github@gmail.com>
weinbe2 and dalg24 committed Jan 30, 2023
1 parent 22380c7 commit 04e3437
Showing 3 changed files with 106 additions and 27 deletions.
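
For orientation, the bound described in the commit message can be reproduced with a few lines of standalone C++. The inputs below (64 registers per thread, a 65536-register budget, warp size 32, allocation granularity 4) are hypothetical examples rather than values taken from this commit:

// Minimal sketch of the register-limited warp count per SM, rounded to the
// warp allocation granularity. All inputs are hypothetical examples.
#include <iostream>

int main() {
  int const regs_per_thread  = 64;     // hypothetical kernel
  int const warp_size        = 32;
  int const regs_budget      = 65536;  // hypothetical per-SM register budget
  int const warp_granularity = 4;      // granularity on most architectures

  // Registers are allocated in chunks of 256 per warp -> round up.
  int const regs_per_warp           = regs_per_thread * warp_size;          // 2048
  int const allocated_regs_per_warp = 256 * ((regs_per_warp + 255) / 256);  // 2048

  // Warps are handed out in groups of warp_granularity -> round down.
  int const max_warps =
      warp_granularity * (regs_budget / (allocated_regs_per_warp * warp_granularity));

  std::cout << "register-limited warps per SM: " << max_warps << '\n';  // prints 32
  return 0;
}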
66 changes: 63 additions & 3 deletions core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -25,17 +25,67 @@
namespace Kokkos {
namespace Impl {

inline int cuda_warp_per_sm_allocation_granularity(
cudaDeviceProp const& properties) {
// Allocation granularity of warps in each sm
switch (properties.major) {
case 3:
case 5:
case 7:
case 8:
case 9: return 4;
case 6: return (properties.minor == 0 ? 2 : 4);
default:
throw_runtime_exception(
"Unknown device in cuda warp per sm allocation granularity");
return 0;
}
}

inline int cuda_max_warps_per_sm_registers(
cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
// Maximum number of warps per sm as a function of register counts,
// subject to the constraint that warps are allocated with a fixed granularity
int const max_regs_per_block = properties.regsPerBlock;
int const regs_per_warp = attributes.numRegs * properties.warpSize;
int const warp_granularity =
cuda_warp_per_sm_allocation_granularity(properties);
// The granularity of register allocation is chunks of 256 registers per warp,
// which implies a need to over-allocate, so we round up
int const allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256);

// The maximum number of warps per SM is constrained from above by register
// allocation. To satisfy the constraint that warps per SM is allocated at a
// finite granularity, we need to round down.
int const max_warps_per_sm =
warp_granularity *
(max_regs_per_block / (allocated_regs_per_warp * warp_granularity));

return max_warps_per_sm;
}

inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) {
- // Limits due do registers/SM
+ // Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- int const max_blocks_regs =
-     regs_per_sm / (allocated_regs_per_thread * block_size);
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);

// Compute the maximum number of warps as a function of the number of
// registers
int const max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(properties, attributes);

// Correct the number of blocks to respect the maximum number of warps per
// SM, which is constrained to be a multiple of the warp allocation
// granularity defined in `cuda_warp_per_sm_allocation_granularity`.
while ((max_blocks_regs * block_size / properties.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;

// Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@@ -179,6 +229,16 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}

// Thin version of cuda_get_opt_block_size for cases where there is no shared
// memory
template <class LaunchBounds>
int cuda_get_opt_block_size_no_shmem(const cudaDeviceProp& prop,
const cudaFuncAttributes& attr,
LaunchBounds) {
return cuda_deduce_block_size(
false, prop, attr, [](int /*block_size*/) { return 0; }, LaunchBounds{});
}

} // namespace Impl
} // namespace Kokkos

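A minimal restatement of the correction loop added above, with a hypothetical helper name and sample numbers (neither is part of the commit):

// Decrement the register-limited block count until its warp footprint fits
// under the granularity-constrained warp cap, mirroring the loop added in
// cuda_max_active_blocks_per_sm.
int correct_block_count(int max_blocks_regs, int block_size, int warp_size,
                        int max_warps_per_sm_registers) {
  while ((max_blocks_regs * block_size / warp_size) > max_warps_per_sm_registers)
    --max_blocks_regs;
  return max_blocks_regs;
}

// Example: nine 128-thread blocks need 36 warps; with a 32-warp cap the loop
// drops one block, so correct_block_count(9, 128, 32, 32) returns 8.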
14 changes: 13 additions & 1 deletion core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -160,9 +160,21 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- const size_t max_blocks_regs =
+ size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);

// Compute the maximum number of warps as a function of the number of
// registers
const size_t max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(device_props, func_attr);

// Correct the number of blocks to respect the maximum number of warps per
// SM, which is constrained to be a multiple of the warp allocation
// granularity defined in `cuda_warp_per_sm_allocation_granularity`.
while ((max_blocks_regs * block_size / device_props.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;

// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
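The trailing context above notes that threads are only allocated in multiples of the warp size; a plausible sketch of that rounding, with hypothetical names and values:

// Round a thread count down to a whole number of warps.
size_t round_down_to_warp_multiple(size_t threads, size_t warp_size) {
  return (threads / warp_size) * warp_size;
}

// Example: round_down_to_warp_multiple(2000, 32) == 1984 (62 full warps).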
53 changes: 30 additions & 23 deletions core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -39,6 +39,34 @@
namespace Kokkos {
namespace Impl {

template <typename ParallelType, typename Policy, typename LaunchBounds>
int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelType,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();

// Limits due to registers/SM, MDRange doesn't have
// shared memory constraints
int const optimal_block_size =
cuda_get_opt_block_size_no_shmem(prop, attr, LaunchBounds{});

// Compute how many blocks of this size we can launch, based on warp
// constraints
int const max_warps_per_sm_registers =
Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
int const max_num_threads_from_warps =
max_warps_per_sm_registers * prop.warpSize;
int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;

// Compute the total number of threads
int const max_threads_per_sm = optimal_block_size * max_num_blocks;

return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
}

template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
@@ -57,18 +85,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
-     CudaParallelLaunch<ParallelFor,
-                        LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due to registers/SM, MDRange doesn't have
- // shared memory constraints
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
-     max_threads_per_sm,
-     static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {
@@ -230,17 +247,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
-     CudaParallelLaunch<ParallelReduce,
-                        LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due do registers/SM
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
-     max_threads_per_sm,
-     static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {
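The arithmetic inside max_tile_size_product_helper can be followed end to end with made-up inputs; the real values come from cudaFuncAttributes and cudaDeviceProp at run time, and the cap passed here is illustrative only:

#include <algorithm>

// Sketch of the tile-size bound: warps allowed by registers -> threads ->
// whole blocks of the optimal size -> capped by the hierarchical-parallelism
// limit.
int tile_size_bound(int optimal_block_size, int max_warps_per_sm_registers,
                    int warp_size, int max_hierarchical_parallelism) {
  int const max_num_threads_from_warps = max_warps_per_sm_registers * warp_size;
  int const max_num_blocks     = max_num_threads_from_warps / optimal_block_size;
  int const max_threads_per_sm = optimal_block_size * max_num_blocks;
  return std::min(max_threads_per_sm, max_hierarchical_parallelism);
}

// Example: tile_size_bound(256, 32, 32, 1048576) == 1024, i.e. 32 warps give
// 1024 threads, which fit exactly into four 256-thread blocks.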
