Further update to CUDA occupancy calculation (#5739)
* Update the occupancy calculation to reflect the maximum number of registers per SM, subject to the warp allocation granularity of 4 warps per SM

* Addressed PR comments

* Propagated warp allocation constraints to occupancy control functions

* Reduce occupancy-related code duplication in CUDA Parallel MDRange

* Comment cleanup

* clang-format

* Check per-kernel perf, not end-to-end...

* Update core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp

Co-authored-by: Damien L-G <dalg24+github@gmail.com>

* Apply suggestions from code review

Co-authored-by: Damien L-G <dalg24+github@gmail.com>

* Clarified comments, fixed compile error in suggestion that I missed

* clang-format

---------

Co-authored-by: Damien L-G <dalg24+github@gmail.com>
weinbe2 and dalg24 committed Jan 30, 2023
1 parent 22380c7 commit 04e3437
Showing 3 changed files with 106 additions and 27 deletions.
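
For orientation, the bound described in the commit message can be reproduced with a few lines of standalone C++. The inputs below (64 registers per thread, a 65536-register budget, warp size 32, allocation granularity 4) are hypothetical examples rather than values taken from this commit:

// Minimal sketch of the register-limited warp count per SM, rounded to the
// warp allocation granularity. All inputs are hypothetical examples.
#include <iostream>

int main() {
  int const regs_per_thread  = 64;     // hypothetical kernel
  int const warp_size        = 32;
  int const regs_budget      = 65536;  // hypothetical per-SM register budget
  int const warp_granularity = 4;      // granularity on most architectures

  // Registers are allocated in chunks of 256 per warp -> round up.
  int const regs_per_warp           = regs_per_thread * warp_size;          // 2048
  int const allocated_regs_per_warp = 256 * ((regs_per_warp + 255) / 256);  // 2048

  // Warps are handed out in groups of warp_granularity -> round down.
  int const max_warps =
      warp_granularity * (regs_budget / (allocated_regs_per_warp * warp_granularity));

  std::cout << "register-limited warps per SM: " << max_warps << '\n';  // prints 32
  return 0;
}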
66 changes: 63 additions & 3 deletions core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -25,17 +25,67 @@
namespace Kokkos {
namespace Impl {

inline int cuda_warp_per_sm_allocation_granularity(
cudaDeviceProp const& properties) {
// Allocation granularity of warps in each sm
switch (properties.major) {
case 3:
case 5:
case 7:
case 8:
case 9: return 4;
case 6: return (properties.minor == 0 ? 2 : 4);
default:
throw_runtime_exception(
"Unknown device in cuda warp per sm allocation granularity");
return 0;
}
}

inline int cuda_max_warps_per_sm_registers(
cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
// Maximum number of warps per sm as a function of register counts,
// subject to the constraint that warps are allocated with a fixed granularity
int const max_regs_per_block = properties.regsPerBlock;
int const regs_per_warp = attributes.numRegs * properties.warpSize;
int const warp_granularity =
cuda_warp_per_sm_allocation_granularity(properties);
// The granularity of register allocation is chunks of 256 registers per warp,
// which implies a need to over-allocate, so we round up
int const allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256);

// The maximum number of warps per SM is constrained from above by register
// allocation. To satisfy the constraint that warps per SM is allocated at a
// finite granularity, we need to round down.
int const max_warps_per_sm =
warp_granularity *
(max_regs_per_block / (allocated_regs_per_warp * warp_granularity));

return max_warps_per_sm;
}

inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) {
- // Limits due do registers/SM
+ // Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- int const max_blocks_regs =
-     regs_per_sm / (allocated_regs_per_thread * block_size);
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);

// Compute the maximum number of warps as a function of the number of
// registers
int const max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(properties, attributes);

// Correct the number of blocks to respect the maximum number of warps per
// SM, which is constrained to be a multiple of the warp allocation
// granularity defined in `cuda_warp_per_sm_allocation_granularity`.
while ((max_blocks_regs * block_size / properties.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;

// Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@@ -179,6 +229,16 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}

// Thin version of cuda_get_opt_block_size for cases where there is no shared
// memory
template <class LaunchBounds>
int cuda_get_opt_block_size_no_shmem(const cudaDeviceProp& prop,
const cudaFuncAttributes& attr,
LaunchBounds) {
return cuda_deduce_block_size(
false, prop, attr, [](int /*block_size*/) { return 0; }, LaunchBounds{});
}

} // namespace Impl
} // namespace Kokkos

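A minimal restatement of the correction loop added above, with a hypothetical helper name and sample numbers (neither is part of the commit):

// Decrement the register-limited block count until its warp footprint fits
// under the granularity-constrained warp cap, mirroring the loop added in
// cuda_max_active_blocks_per_sm.
int correct_block_count(int max_blocks_regs, int block_size, int warp_size,
                        int max_warps_per_sm_registers) {
  while ((max_blocks_regs * block_size / warp_size) > max_warps_per_sm_registers)
    --max_blocks_regs;
  return max_blocks_regs;
}

// Example: nine 128-thread blocks need 36 warps; with a 32-warp cap the loop
// drops one block, so correct_block_count(9, 128, 32, 32) returns 8.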
14 changes: 13 additions & 1 deletion core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -160,9 +160,21 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- const size_t max_blocks_regs =
+ size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);

// Compute the maximum number of warps as a function of the number of
// registers
const size_t max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(device_props, func_attr);

// Correct the number of blocks to respect the maximum number of warps per
// SM, which is constrained to be a multiple of the warp allocation
// granularity defined in `cuda_warp_per_sm_allocation_granularity`.
while ((max_blocks_regs * block_size / device_props.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;

// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
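The trailing context above notes that threads are only allocated in multiples of the warp size; a plausible sketch of that rounding, with hypothetical names and values:

// Round a thread count down to a whole number of warps.
size_t round_down_to_warp_multiple(size_t threads, size_t warp_size) {
  return (threads / warp_size) * warp_size;
}

// Example: round_down_to_warp_multiple(2000, 32) == 1984 (62 full warps).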
53 changes: 30 additions & 23 deletions core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -39,6 +39,34 @@
namespace Kokkos {
namespace Impl {

template <typename ParallelType, typename Policy, typename LaunchBounds>
int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelType,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();

// Limits due to registers/SM, MDRange doesn't have
// shared memory constraints
int const optimal_block_size =
cuda_get_opt_block_size_no_shmem(prop, attr, LaunchBounds{});

// Compute how many blocks of this size we can launch, based on warp
// constraints
int const max_warps_per_sm_registers =
Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
int const max_num_threads_from_warps =
max_warps_per_sm_registers * prop.warpSize;
int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;

// Compute the total number of threads
int const max_threads_per_sm = optimal_block_size * max_num_blocks;

return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
}

template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
@@ -57,18 +85,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
-     CudaParallelLaunch<ParallelFor,
-                        LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due to registers/SM, MDRange doesn't have
- // shared memory constraints
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
-     max_threads_per_sm,
-     static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {
@@ -230,17 +247,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
-     CudaParallelLaunch<ParallelReduce,
-                        LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due do registers/SM
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
-     max_threads_per_sm,
-     static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {
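The arithmetic inside max_tile_size_product_helper can be followed end to end with made-up inputs; the real values come from cudaFuncAttributes and cudaDeviceProp at run time, and the cap passed here is illustrative only:

#include <algorithm>

// Sketch of the tile-size bound: warps allowed by registers -> threads ->
// whole blocks of the optimal size -> capped by the hierarchical-parallelism
// limit.
int tile_size_bound(int optimal_block_size, int max_warps_per_sm_registers,
                    int warp_size, int max_hierarchical_parallelism) {
  int const max_num_threads_from_warps = max_warps_per_sm_registers * warp_size;
  int const max_num_blocks     = max_num_threads_from_warps / optimal_block_size;
  int const max_threads_per_sm = optimal_block_size * max_num_blocks;
  return std::min(max_threads_per_sm, max_hierarchical_parallelism);
}

// Example: tile_size_bound(256, 32, 32, 1048576) == 1024, i.e. 32 warps give
// 1024 threads, which fit exactly into four 256-thread blocks.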
