Skip to content

Commit

Permalink
Fix kernel launch issues for certain CUDA versions
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcelKoch committed Nov 29, 2021
1 parent 42b25bc commit 810d86c
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 64 deletions.
62 changes: 34 additions & 28 deletions common/cuda_hip/distributed/partition_kernels.hpp.inc
Original file line number Diff line number Diff line change
// Computes, for each range, the starting index of that range inside its owning
// part, fills part_sizes with the number of local indices per part, and sets
// num_empty_parts to the number of parts that received no indices.
// NOTE(review): this scrape elides the middle parameters of the signature —
// the body references range_offsets, range_parts, num_ranges, num_parts and
// num_empty_parts; restore them from the upstream file before compiling.
void build_starting_indices(std::shared_ptr<const DefaultExecutor> exec,
                            LocalIndexType* starting_indices,
                            LocalIndexType* part_sizes)
{
    // Guard the whole computation: with num_ranges == 0 the kernel launches
    // and Thrust calls below would run over empty device buffers, which is
    // an invalid launch configuration on some CUDA versions.
    if (num_ranges > 0) {
        Array<LocalIndexType> range_sizes{exec, num_ranges};
        // num_parts sentinel at the end
        Array<comm_index_type> tmp_part_ids{exec, num_ranges + 1};
        Array<GlobalIndexType> permutation{exec, num_ranges};
        // set part_sizes to 0 in case of empty parts
        components::fill_array(exec, part_sizes, num_parts, LocalIndexType{});

        kernel::setup_sizes_ids_permutation(
            exec, num_ranges, num_parts, range_offsets, range_parts,
            range_sizes, tmp_part_ids, permutation);

        auto tmp_part_id_ptr =
            thrust::device_pointer_cast(tmp_part_ids.get_data());
        auto range_sizes_ptr =
            thrust::device_pointer_cast(range_sizes.get_data());
        auto permutation_ptr =
            thrust::device_pointer_cast(permutation.get_data());
        auto value_it = thrust::make_zip_iterator(
            thrust::make_tuple(range_sizes_ptr, permutation_ptr));
        // group range_sizes by part ID
        thrust::stable_sort_by_key(thrust::device, tmp_part_id_ptr,
                                   tmp_part_id_ptr + num_ranges, value_it);
        // compute inclusive prefix sum for each part
        thrust::inclusive_scan_by_key(thrust::device, tmp_part_id_ptr,
                                      tmp_part_id_ptr + num_ranges,
                                      range_sizes_ptr, range_sizes_ptr);
        // write back the results
        kernel::compute_part_sizes_and_starting_indices(
            exec, num_ranges, range_sizes, tmp_part_ids, permutation,
            starting_indices, part_sizes);
        num_empty_parts = thrust::count(thrust::device, part_sizes,
                                        part_sizes + num_parts, 0);
    } else {
        // no ranges at all: every part is empty
        num_empty_parts = num_parts;
    }
}

GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
Expand Down
26 changes: 15 additions & 11 deletions cuda/base/kernel_launch.cuh
Original file line number Diff line number Diff line change
// Launches generic_kernel_1d over the flat index range [0, size) with the
// default block size.
// The size > 0 guard is the point of this commit: launching with a zero-block
// grid is an invalid launch configuration on some CUDA versions, so an empty
// range must skip the launch entirely.
template <typename KernelFunction, typename... KernelArgs>
void run_kernel(std::shared_ptr<const CudaExecutor> exec, KernelFunction fn,
                size_type size, KernelArgs&&... args)
{
    if (size > 0) {
        gko::cuda::device_guard guard{exec->get_device_id()};
        constexpr auto block_size = default_block_size;
        auto num_blocks = ceildiv(size, block_size);
        generic_kernel_1d<<<num_blocks, block_size>>>(
            static_cast<int64>(size), fn, map_to_device(args)...);
    }
}

// Launches generic_kernel_2d over a size[0] x size[1] index domain, flattened
// to size[0] * size[1] work items with the default block size.
// Skips the launch when either dimension is zero (size[0] * size[1] == 0):
// a zero-block grid is an invalid launch configuration on some CUDA versions.
template <typename KernelFunction, typename... KernelArgs>
void run_kernel(std::shared_ptr<const CudaExecutor> exec, KernelFunction fn,
                dim<2> size, KernelArgs&&... args)
{
    if (size[0] * size[1] > 0) {
        gko::cuda::device_guard guard{exec->get_device_id()};
        constexpr auto block_size = default_block_size;
        auto num_blocks = ceildiv(size[0] * size[1], block_size);
        generic_kernel_2d<<<num_blocks, block_size>>>(
            static_cast<int64>(size[0]), static_cast<int64>(size[1]), fn,
            map_to_device(args)...);
    }
}


Expand Down
14 changes: 8 additions & 6 deletions cuda/base/kernel_launch_solver.cuh
Original file line number Diff line number Diff line change
// Launches generic_kernel_2d_solver over a size[0] x size[1] domain, passing
// default_stride through to the kernel for row addressing.
// Skips the launch entirely for an empty domain — a zero-block grid is an
// invalid launch configuration on some CUDA versions.
// NOTE(review): the template header line was collapsed out of this scrape; it
// is reconstructed here from the sibling run_kernel overloads — confirm
// against the upstream file.
template <typename KernelFunction, typename... KernelArgs>
void run_kernel_solver(std::shared_ptr<const CudaExecutor> exec,
                       KernelFunction fn, dim<2> size, size_type default_stride,
                       KernelArgs&&... args)
{
    if (size[0] * size[1] > 0) {
        gko::cuda::device_guard guard{exec->get_device_id()};
        constexpr auto block_size = default_block_size;
        auto num_blocks = ceildiv(size[0] * size[1], block_size);
        generic_kernel_2d_solver<<<num_blocks, block_size>>>(
            static_cast<int64>(size[0]), static_cast<int64>(size[1]),
            static_cast<int64>(default_stride), fn, map_to_device(args)...);
    }
}


Expand Down
28 changes: 17 additions & 11 deletions hip/base/kernel_launch.hip.hpp
Original file line number Diff line number Diff line change
// HIP counterpart of the 1D run_kernel: launches generic_kernel_1d via
// hipLaunchKernelGGL over the flat index range [0, size).
// Skips the launch for size == 0 — a zero-block grid is an invalid launch
// configuration on some CUDA (and matching HIP) versions.
template <typename KernelFunction, typename... KernelArgs>
void run_kernel(std::shared_ptr<const HipExecutor> exec, KernelFunction fn,
                size_type size, KernelArgs&&... args)
{
    if (size > 0) {
        gko::hip::device_guard guard{exec->get_device_id()};
        constexpr auto block_size = default_block_size;
        auto num_blocks = ceildiv(size, block_size);
        hipLaunchKernelGGL(generic_kernel_1d, num_blocks, block_size, 0, 0,
                           static_cast<int64>(size), fn,
                           map_to_device(args)...);
    }
}

// HIP counterpart of the 2D run_kernel: launches generic_kernel_2d via
// hipLaunchKernelGGL over a size[0] x size[1] domain flattened to
// size[0] * size[1] work items.
// Skips the launch when the domain is empty — a zero-block grid is an
// invalid launch configuration on some CUDA (and matching HIP) versions.
template <typename KernelFunction, typename... KernelArgs>
void run_kernel(std::shared_ptr<const HipExecutor> exec, KernelFunction fn,
                dim<2> size, KernelArgs&&... args)
{
    if (size[0] * size[1] > 0) {
        gko::hip::device_guard guard{exec->get_device_id()};
        constexpr auto block_size = default_block_size;
        auto num_blocks = ceildiv(size[0] * size[1], block_size);
        hipLaunchKernelGGL(generic_kernel_2d, num_blocks, block_size, 0, 0,
                           static_cast<int64>(size[0]),
                           static_cast<int64>(size[1]), fn,
                           map_to_device(args)...);
    }
}


Expand Down
18 changes: 10 additions & 8 deletions hip/base/kernel_launch_solver.hip.hpp
Original file line number Diff line number Diff line change
// HIP counterpart of run_kernel_solver: launches
// kernels::hip::generic_kernel_2d_solver over a size[0] x size[1] domain,
// forwarding default_stride for row addressing.
// Skips the launch entirely for an empty domain — a zero-block grid is an
// invalid launch configuration on some CUDA (and matching HIP) versions.
// NOTE(review): the template header line was collapsed out of this scrape; it
// is reconstructed here from the sibling run_kernel overloads — confirm
// against the upstream file.
template <typename KernelFunction, typename... KernelArgs>
void run_kernel_solver(std::shared_ptr<const HipExecutor> exec,
                       KernelFunction fn, dim<2> size, size_type default_stride,
                       KernelArgs&&... args)
{
    if (size[0] * size[1] > 0) {
        gko::hip::device_guard guard{exec->get_device_id()};
        constexpr auto block_size = kernels::hip::default_block_size;
        auto num_blocks = ceildiv(size[0] * size[1], block_size);
        hipLaunchKernelGGL(kernels::hip::generic_kernel_2d_solver, num_blocks,
                           block_size, 0, 0, static_cast<int64>(size[0]),
                           static_cast<int64>(size[1]),
                           static_cast<int64>(default_stride), fn,
                           kernels::hip::map_to_device(args)...);
    }
}


Expand Down

0 comments on commit 810d86c

Please sign in to comment.