Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
upsj committed Jan 21, 2020
1 parent 453bde4 commit ffe5b91
Show file tree
Hide file tree
Showing 11 changed files with 499 additions and 450 deletions.
108 changes: 86 additions & 22 deletions common/factorization/par_ilut_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -247,13 +247,13 @@ template <typename IndexType, typename ValueType, typename BeginCallback,
typename StepCallback, typename FinishCallback>
__device__ void abstract_threshold_filter(
const IndexType *row_ptrs, const ValueType *vals, IndexType num_rows,
remove_complex<ValueType> threshold, bool is_lower, BeginCallback begin_cb,
remove_complex<ValueType> threshold, BeginCallback begin_cb,
StepCallback step_cb, FinishCallback finish_cb)
{
auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
auto row = tidx / config::warp_size;
auto lane = threadIdx.x % config::warp_size;
auto warp = group::thread_block_tile<config::warp_size>();
auto row = tidx / warp.size();
auto lane = warp.thread_rank();
using lanemask = decltype(warp.ballot(true));
auto lane_prefix_mask = (lanemask(1) << lane) - 1;
if (row >= num_rows) {
Expand All @@ -264,7 +264,7 @@ __device__ void abstract_threshold_filter(
auto begin = row_ptrs[row];
auto end = row_ptrs[row + 1];
begin_cb(row);
auto diag_idx = is_lower ? end - 1 : begin;
auto diag_idx = end - 1;
auto num_steps = ceildiv(end - begin, config::warp_size);
for (auto step = 0; step < num_steps; ++step) {
auto idx = begin + lane + step * config::warp_size;
Expand All @@ -273,7 +273,8 @@ __device__ void abstract_threshold_filter(
}
auto keep = idx < end && (abs(val) >= threshold || idx == diag_idx);
auto mask = warp.ballot(keep);
step_cb(idx, keep, popcnt(mask), popcnt(mask & lane_prefix_mask));
step_cb(row, idx, val, keep, popcnt(mask),
popcnt(mask & lane_prefix_mask));
}
finish_cb(row, lane);
}
Expand All @@ -282,18 +283,18 @@ __device__ void abstract_threshold_filter(
template <typename IndexType, typename ValueType>
__global__ __launch_bounds__(default_block_size) void threshold_filter_nnz(
const IndexType *row_ptrs, const ValueType *vals, IndexType num_rows,
remove_complex<ValueType> threshold, IndexType *nnz, bool is_lower)
remove_complex<ValueType> threshold, IndexType *nnz)
{
IndexType count{};
abstract_threshold_filter(row_ptrs, vals, num_rows, threshold, is_lower,
[](IndexType) {},
[&](IndexType, bool, IndexType warp_count,
IndexType) { count += warp_count; },
[&](IndexType row, IndexType lane) {
if (row < num_rows && lane == 0) {
nnz[row] = count;
}
});
abstract_threshold_filter(
row_ptrs, vals, num_rows, threshold, [&](IndexType) { count = 0; },
[&](IndexType, IndexType, ValueType, bool, IndexType warp_count,
IndexType) { count += warp_count; },
[&](IndexType row, IndexType lane) {
if (row < num_rows && lane == 0) {
nnz[row] = count;
}
});
}


Expand All @@ -302,20 +303,23 @@ __global__ __launch_bounds__(default_block_size) void threshold_filter(
const IndexType *old_row_ptrs, const IndexType *old_col_idxs,
const ValueType *old_vals, IndexType num_rows,
remove_complex<ValueType> threshold, const IndexType *new_row_ptrs,
IndexType *new_col_idxs, ValueType *new_vals, bool is_lower)
IndexType *new_row_idxs, IndexType *new_col_idxs, ValueType *new_vals)
{
IndexType count{};
IndexType new_offset{};
abstract_threshold_filter(
old_row_ptrs, old_vals, num_rows, threshold, is_lower,
[&](IndexType row) { new_offset = new_row_ptrs[row]; },
[&](IndexType idx, bool keep, IndexType warp_count,
IndexType warp_prefix_sum) {
old_row_ptrs, old_vals, num_rows, threshold,
[&](IndexType row) {
new_offset = new_row_ptrs[row];
count = 0;
},
[&](IndexType row, IndexType idx, ValueType val, bool keep,
IndexType warp_count, IndexType warp_prefix_sum) {
if (keep) {
auto new_idx = new_offset + warp_prefix_sum + count;
new_row_idxs[new_idx] = row;
new_col_idxs[new_idx] = old_col_idxs[idx];
// hopefully the compiler is able to remove this duplicate load
new_vals[new_idx] = old_vals[idx];
new_vals[new_idx] = val;
}
count += warp_count;
},
Expand Down Expand Up @@ -452,4 +456,64 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init(
}


template <int subwarp_size, typename IndexType, typename ValueType>
__global__ __launch_bounds__(default_block_size) void sweep(
const IndexType *a_row_ptrs, const IndexType *a_col_idxs,
const ValueType *a_vals, const IndexType *l_row_ptrs,
const IndexType *l_row_idxs, const IndexType *l_col_idxs, ValueType *l_vals,
IndexType l_nnz, const IndexType *u_col_ptrs, const IndexType *u_row_idxs,
const IndexType *u_col_idxs, ValueType *u_vals, IndexType u_nnz)
{
auto tidx = (threadIdx.x + blockIdx.x * blockDim.x) / subwarp_size;
if (tidx >= l_nnz + u_nnz) {
return;
}
auto row = tidx < l_nnz ? l_row_idxs[tidx] : u_row_idxs[tidx - l_nnz];
auto col = tidx < l_nnz ? l_col_idxs[tidx] : u_col_idxs[tidx - l_nnz];
if (tidx < l_nnz && row == col) {
// don't update the diagonal twice
return;
}
auto subwarp =
group::tiled_partition<subwarp_size>(group::this_thread_block());
auto a_row_begin = a_row_ptrs[row];
auto a_row_size = a_row_ptrs[row + 1] - a_row_begin;
auto a_idx =
group_wide_search(a_row_begin, a_row_size, subwarp,
[&](IndexType i) { return a_col_idxs[i] >= col; });
auto a_val = a_col_idxs[a_idx] == col ? a_vals[a_idx] : zero<ValueType>();
auto l_row_begin = l_row_ptrs[row];
auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
auto u_col_begin = u_col_ptrs[row];
auto u_col_size = u_col_ptrs[row + 1] - u_col_begin;
ValueType sum{};
ValueType last_product{};
IndexType l_out_idx{};
IndexType u_out_idx{};
group_merge(l_col_idxs, l_row_begin, l_row_size, u_row_idxs, u_col_begin,
u_col_size,
[&](IndexType l_idx, ValueType l_col, IndexType u_idx,
ValueType u_row, IndexType) {
if (l_col == u_row) {
l_out_idx = l_idx;
u_out_idx = u_idx;
last_product = l_vals[l_idx] * u_vals[u_idx];
sum += last_product;
}
});

if (row > col) {
auto to_write = sum / u_vals[u_col_ptrs[col + 1] - 1];
if (::gko::isfinite(to_write)) {
l_vals[l_out_idx] = to_write;
}
} else {
auto to_write = sum;
if (::gko::isfinite(to_write)) {
u_vals[u_out_idx] = to_write;
}
}
}


} // namespace kernel
34 changes: 22 additions & 12 deletions core/factorization/par_ilut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,14 @@ struct ParIlutState {
std::unique_ptr<CooMatrix> l_coo;
// transposed upper factor U currently being updated
std::unique_ptr<CooMatrix> u_transp_coo;
// temporary array for threshold selection
Array<ValueType> selection_tmp;
// temporary array for threshold selection
Array<remove_complex<ValueType>> selection_tmp2;
// strategy to be used by the lower factor
std::shared_ptr<typename CsrMatrix::strategy_type> l_strategy;
// strategy to be used by the upper factor
std::shared_ptr<typename CsrMatrix::strategy_type> u_strategy;

ParIlutState(std::shared_ptr<const Executor> exec_in,
const CsrMatrix *system_matrix_in,
Expand All @@ -131,8 +139,8 @@ struct ParIlutState {
system_matrix{system_matrix_in},
l{std::move(l_in)},
u{std::move(u_in)},
l_row_idxs_array{exec},
u_col_idxs_array{exec}
selection_tmp{exec},
selection_tmp2{exec}
{
auto mtx_size = system_matrix->get_size();
auto u_nnz = u->get_num_stored_elements();
Expand Down Expand Up @@ -178,7 +186,7 @@ ParIlut<ValueType, IndexType>::generate_l_u(
->convert_to(csr_system_matrix_unique_ptr.get());
csr_system_matrix = csr_system_matrix_unique_ptr.get();
}
if (!skip_sorting) {
if (!parameters_.skip_sorting) {
if (csr_system_matrix_unique_ptr == nullptr) {
csr_system_matrix_unique_ptr = CsrMatrix::create(exec);
csr_system_matrix_unique_ptr->copy_from(csr_system_matrix);
Expand All @@ -203,12 +211,12 @@ ParIlut<ValueType, IndexType>::generate_l_u(
auto l_nnz = static_cast<size_type>(l_nnz_it);
auto u_nnz = static_cast<size_type>(u_nnz_it);

auto l =
Csr::create(exec, Array<ValueType>{exec, l_nnz},
Array<IndexType>{exec, l_nnz}, std::move(l_row_ptrs_array));
auto u =
Csr::create(exec, Array<ValueType>{exec, u_nnz},
Array<IndexType>{exec, u_nnz}, std::move(u_row_ptrs_array));
auto l = CsrMatrix::create(exec, Array<ValueType>{exec, l_nnz},
Array<IndexType>{exec, l_nnz},
std::move(l_row_ptrs_array));
auto u = CsrMatrix::create(exec, Array<ValueType>{exec, u_nnz},
Array<IndexType>{exec, u_nnz},
std::move(u_row_ptrs_array));

// initialize L and U
exec->run(make_initialize_l_u(csr_system_matrix, l.get(), u.get()));
Expand Down Expand Up @@ -276,10 +284,12 @@ void ParIlutState<ValueType, IndexType>::iterate()
} else {
// select threshold to remove smallest candidates
remove_complex<ValueType> l_threshold{};
exec->run(make_threshold_select(l_new.get(), l_nnz_limit, l_threshold));
exec->run(make_threshold_select(l_new.get(), l_nnz_limit, l_threshold,
selection_tmp, selection_tmp2));
remove_complex<ValueType> u_threshold{};
exec->run(
make_threshold_select(u_new_csc.get(), u_nnz_limit, u_threshold));
exec->run(make_threshold_select(u_new_csc.get(), u_nnz_limit,
u_threshold, selection_tmp,
selection_tmp2));

// remove smallest candidates
exec->run(make_threshold_filter(l_new.get(), l_threshold, l.get(),
Expand Down
3 changes: 2 additions & 1 deletion core/factorization/par_ilut_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ namespace kernels {
#define GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL(ValueType, IndexType) \
void threshold_select(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<ValueType, IndexType> *m, \
IndexType rank, \
IndexType rank, Array<ValueType> &tmp, \
Array<remove_complex<ValueType>> &tmp2, \
remove_complex<ValueType> &threshold)
#define GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL(ValueType, IndexType) \
void threshold_filter(std::shared_ptr<const DefaultExecutor> exec, \
Expand Down
Loading

0 comments on commit ffe5b91

Please sign in to comment.