Skip to content

Commit

Permalink
add LU factorization kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
upsj committed Jul 19, 2022
1 parent 3aba0f7 commit b4be3c5
Show file tree
Hide file tree
Showing 28 changed files with 517,124 additions and 42 deletions.
70 changes: 70 additions & 0 deletions common/cuda_hip/components/volatile.hpp.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2022, the Ginkgo authors
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

template <typename ValueType, typename IndexType>
__device__ __forceinline__
std::enable_if_t<!is_complex_s<ValueType>::value, ValueType>
load(const ValueType* values, IndexType index)
{
const volatile ValueType* val = values + index;
return *val;
}

template <typename ValueType, typename IndexType>
__device__ __forceinline__ std::enable_if_t<
std::is_floating_point<ValueType>::value, thrust::complex<ValueType>>
load(const thrust::complex<ValueType>* values, IndexType index)
{
auto real = reinterpret_cast<const ValueType*>(values);
auto imag = real + 1;
return {load(real, 2 * index), load(imag, 2 * index)};
}

template <typename ValueType, typename IndexType>
__device__ __forceinline__ void store(
ValueType* values, IndexType index,
std::enable_if_t<!is_complex_s<ValueType>::value, ValueType> value)
{
volatile ValueType* val = values + index;
*val = value;
}

template <typename ValueType, typename IndexType>
__device__ __forceinline__ void store(thrust::complex<ValueType>* values,
IndexType index,
thrust::complex<ValueType> value)
{
auto real = reinterpret_cast<ValueType*>(values);
auto imag = real + 1;
store(real, 2 * index, value.real());
store(imag, 2 * index, value.imag());
}
206 changes: 206 additions & 0 deletions common/cuda_hip/factorization/lu_kernels.hpp.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2022, the Ginkgo authors
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

namespace kernel {


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void initialize(
const IndexType* __restrict__ mtx_row_ptrs,
const IndexType* __restrict__ mtx_cols,
const ValueType* __restrict__ mtx_vals,
const IndexType* __restrict__ factor_row_ptrs,
const IndexType* __restrict__ factor_cols,
const IndexType* __restrict__ factor_storage_offsets,
const int32* __restrict__ factor_storage,
const int64* __restrict__ factor_row_descs,
ValueType* __restrict__ factor_vals, IndexType* __restrict__ diag_idxs,
size_type num_rows)
{
const auto row = thread::get_subwarp_id_flat<config::warp_size>();
if (row >= num_rows) {
return;
}
const auto warp =
group::tiled_partition<config::warp_size>(group::this_thread_block());
// first zero out this row of the factor
const auto factor_begin = factor_row_ptrs[row];
const auto factor_end = factor_row_ptrs[row + 1];
const auto lane = static_cast<int>(warp.thread_rank());
for (auto nz = factor_begin + lane; nz < factor_end;
nz += config::warp_size) {
factor_vals[nz] = zero<ValueType>();
}
warp.sync();
// then fill in the values from mtx
gko::matrix::csr::device_sparsity_lookup<IndexType> lookup{
factor_row_ptrs, factor_cols, factor_storage_offsets,
factor_storage, factor_row_descs, row};
const auto row_begin = mtx_row_ptrs[row];
const auto row_end = mtx_row_ptrs[row + 1];
for (auto nz = row_begin + lane; nz < row_end; nz += config::warp_size) {
const auto col = mtx_cols[nz];
const auto val = mtx_vals[nz];
factor_vals[lookup.lookup_unsafe(col) + factor_begin] = val;
}
if (lane == 0) {
diag_idxs[row] = lookup.lookup_unsafe(row) + factor_begin;
}
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void factorize(
const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols,
const IndexType* __restrict__ storage_offsets,
const int32* __restrict__ storage, const int64* __restrict__ row_descs,
const IndexType* __restrict__ diag_idxs, ValueType* __restrict__ vals,
int* __restrict__ ready, int* __restrict__ block_counter,
size_type num_rows)
{
__shared__ int threadblock_offset;
__shared__ int sh_ready[default_block_size / config::warp_size];
if (threadIdx.x == 0) {
threadblock_offset = atomic_add(block_counter, 1) * default_block_size;
}
if (threadIdx.x < default_block_size / config::warp_size) {
sh_ready[threadIdx.x] = 0;
}
__syncthreads();
const auto row = (threadblock_offset + static_cast<int>(threadIdx.x)) /
config::warp_size;
if (row >= num_rows) {
return;
}
const auto block = threadblock_offset / default_block_size;
const auto warp =
group::tiled_partition<config::warp_size>(group::this_thread_block());
const auto lane = warp.thread_rank();
const auto row_begin = row_ptrs[row];
const auto row_diag = diag_idxs[row];
const auto row_end = row_ptrs[row + 1];
const auto row_local = row % (default_block_size / config::warp_size);
gko::matrix::csr::device_sparsity_lookup<IndexType> lookup{
row_ptrs, cols, storage_offsets, storage, row_descs, row};
// for each lower triangular entry: eliminate with corresponding row
for (auto lower_nz = row_begin; lower_nz < row_diag; lower_nz++) {
const auto dep = cols[lower_nz];
const auto dep_block = dep / (default_block_size / config::warp_size);
const auto dep_local = dep % (default_block_size / config::warp_size);
auto val = vals[lower_nz];
const auto diag_idx = diag_idxs[dep];
const auto dep_end = row_ptrs[dep + 1];
assert(dep < row);
if (dep_block == block) {
// wait for a local dependency
while (!load(sh_ready, dep_local)) {
__threadfence();
}
} else {
// wait for a global dependency
while (!load(ready, dep)) {
__threadfence();
}
}
__threadfence();
const auto diag = vals[diag_idx];
const auto scale = val / diag;
if (lane == 0) {
vals[lower_nz] = scale;
}
// subtract all entries past the diagonal
for (auto upper_nz = diag_idx + 1 + lane; upper_nz < dep_end;
upper_nz += config::warp_size) {
const auto upper_col = cols[upper_nz];
const auto upper_val = vals[upper_nz];
const auto output_pos = lookup.lookup_unsafe(upper_col) + row_begin;
vals[output_pos] -= scale * upper_val;
}
}
__threadfence();
// notify local warps
store(sh_ready, row_local, 1);
// notify other blocks
store(ready, row, 1);
}


} // namespace kernel


template <typename ValueType, typename IndexType>
void initialize(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* mtx,
const IndexType* factor_lookup_offsets,
const int64* factor_lookup_descs,
const int32* factor_lookup_storage, IndexType* diag_idxs,
matrix::Csr<ValueType, IndexType>* factors)
{
const auto num_rows = mtx->get_size()[0];
if (num_rows > 0) {
const auto num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
kernel::initialize<<<num_blocks, default_block_size>>>(
mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
as_device_type(mtx->get_const_values()),
factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
factor_lookup_offsets, factor_lookup_storage, factor_lookup_descs,
as_device_type(factors->get_values()), diag_idxs, num_rows);
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);


template <typename ValueType, typename IndexType>
void factorize(std::shared_ptr<const DefaultExecutor> exec,
const IndexType* lookup_offsets, const int64* lookup_descs,
const int32* lookup_storage, const IndexType* diag_idxs,
matrix::Csr<ValueType, IndexType>* factors,
array<int>& tmp_storage)
{
const auto num_rows = factors->get_size()[0];
if (num_rows > 0) {
tmp_storage.resize_and_reset(num_rows + 1);
components::fill_array(exec, tmp_storage.get_data(), num_rows + 1,
int{});
const auto num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
kernel::factorize<<<num_blocks, default_block_size>>>(
factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
lookup_offsets, lookup_storage, lookup_descs, diag_idxs,
as_device_type(factors->get_values()), tmp_storage.get_data(),
tmp_storage.get_data() + num_rows, num_rows);
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
13 changes: 13 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "core/factorization/factorization_kernels.hpp"
#include "core/factorization/ic_kernels.hpp"
#include "core/factorization/ilu_kernels.hpp"
#include "core/factorization/lu_kernels.hpp"
#include "core/factorization/par_ic_kernels.hpp"
#include "core/factorization/par_ict_kernels.hpp"
#include "core/factorization/par_ilu_kernels.hpp"
Expand Down Expand Up @@ -502,6 +503,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
GKO_STUB_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL);
GKO_STUB_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType)
Expand Down Expand Up @@ -694,6 +697,16 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
} // namespace ilu_factorization


namespace lu_factorization {


GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);


} // namespace lu_factorization


namespace par_ic_factorization {


Expand Down
87 changes: 87 additions & 0 deletions core/factorization/lu_kernels.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2022, the Ginkgo authors
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

#ifndef GKO_CORE_FACTORIZATION_LU_KERNELS_HPP_
#define GKO_CORE_FACTORIZATION_LU_KERNELS_HPP_


#include <memory>


#include <ginkgo/core/base/executor.hpp>
#include <ginkgo/core/base/types.hpp>
#include <ginkgo/core/matrix/csr.hpp>


#include "core/base/kernel_declaration.hpp"


namespace gko {
namespace kernels {


#define GKO_DECLARE_LU_INITIALIZE(ValueType, IndexType) \
void initialize(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<ValueType, IndexType>* mtx, \
const IndexType* factor_lookup_offsets, \
const int64* factor_lookup_descs, \
const int32* factor_lookup_storage, IndexType* diag_idxs, \
matrix::Csr<ValueType, IndexType>* factors)


#define GKO_DECLARE_LU_FACTORIZE(ValueType, IndexType) \
void factorize( \
std::shared_ptr<const DefaultExecutor> exec, \
const IndexType* lookup_storage_offsets, const int64* lookup_descs, \
const int32* lookup_storage, const IndexType* diag_idxs, \
matrix::Csr<ValueType, IndexType>* factors, array<int>& tmp_storage)


#define GKO_DECLARE_ALL_AS_TEMPLATES \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_LU_INITIALIZE(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_LU_FACTORIZE(ValueType, IndexType)


GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(lu_factorization,
GKO_DECLARE_ALL_AS_TEMPLATES);


#undef GKO_DECLARE_ALL_AS_TEMPLATES


} // namespace kernels
} // namespace gko


#endif // GKO_CORE_FACTORIZATION_LU_KERNELS_HPP_
Loading

0 comments on commit b4be3c5

Please sign in to comment.