Skip to content

Commit

Permalink
optimization for batchcg medium kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
phu0ngng committed May 4, 2023
1 parent a4fe096 commit 2808f12
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 46 deletions.
4 changes: 2 additions & 2 deletions core/solver/batch_cg_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,11 +158,11 @@ StorageConfig compute_shared_storage(const int shared_mem_per_blk,
{
using real_type = remove_complex<ValueType>;
const int vec_size = num_rows * num_rhs * sizeof(ValueType);
const int num_priority_vecs = 4;
const int num_priority_vecs = 5;
const int prec_storage =
Prectype::dynamic_work_size(num_rows, num_nz) * sizeof(ValueType);
int rem_shared = shared_mem_per_blk;
const int num_cg_vecs{6};
const int num_cg_vecs{5};
StorageConfig sconf{false, 0, num_cg_vecs, 0, num_rows};
if (rem_shared <= 0) {
set_gmem_stride_bytes<align_bytes>(sconf, vec_size, prec_storage);
Expand Down
3 changes: 0 additions & 3 deletions dpcpp/matrix/batch_vector_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,7 @@ __dpct_inline__ void compute_norm2_kernel(

val = sycl::reduce_over_group(group, val, sycl::plus<>());

// if (sg_tid == 0) {
result = sqrt(val);
// }
// sg.barrier();
}

/*
Expand Down
6 changes: 4 additions & 2 deletions dpcpp/solver/batch_cg_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ class KernelCaller {
auto workspace = gko::Array<ValueType>(
exec_, sconf.gmem_stride_bytes * num_batches / sizeof(ValueType));
assert(sconf.gmem_stride_bytes % sizeof(ValueType) == 0);
// std::cout << "HERE: " << sconf.n_shared << " " <<
// sconf.prec_shared << std::endl;

ValueType* const workspace_data = workspace.get_data();
auto b_values = b.values;
Expand All @@ -129,7 +131,7 @@ class KernelCaller {
sycl::accessor<real_type, 1, sycl::access_mode::read_write,
sycl::access::target::local>
slm_reals(sycl::range<1>(2), cgh);
if (nrows < 64) {
if (nrows <= 32) {
cgh.parallel_for(
sycl_nd_range(grid, block),
[=](sycl::nd_item<3> item_ct1)
Expand All @@ -156,7 +158,7 @@ class KernelCaller {
nrows, a.num_nnz, slm_values_ptr, slm_reals_ptr,
item_ct1, workspace_data);
});
} else if (nrows < 512) {
} else if (nrows <= 256 && sconf.n_global == 0) {
cgh.parallel_for(
sycl_nd_range(grid, block),
[=](sycl::nd_item<3> item_ct1)
Expand Down
66 changes: 32 additions & 34 deletions dpcpp/solver/batch_cg_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ __dpct_inline__ void update_p(const int num_rows,

template <typename ValueType>
__dpct_inline__ void update_x_and_r_small(
const int num_rows, const ValueType& rho_old_shared_entry,
const int num_rows, const ValueType rho_old_shared_entry,
const ValueType* const p_shared_entry,
const ValueType* const Ap_shared_entry, ValueType& alpha_shared_entry,
ValueType* const x_shared_entry, ValueType* const r_shared_entry,
Expand All @@ -169,7 +169,7 @@ __dpct_inline__ void update_x_and_r_small(

template <typename ValueType>
__dpct_inline__ void update_x_and_r(const int num_rows,
const ValueType& rho_old_shared_entry,
const ValueType rho_old_shared_entry,
const ValueType* const p_shared_entry,
const ValueType* const Ap_shared_entry,
ValueType* const x_shared_entry,
Expand Down Expand Up @@ -252,15 +252,14 @@ void small_apply_kernel(

// stopping criterion object
StopType stop(tol, norms_rhs_sh);
norms_res_sh[0] = sqrt(abs(rho_old_sh[0]));
item_ct1.barrier(sycl::access::fence_space::local_space);
if (stop.check_converged(norms_res_sh)) {
return;
}

int iter = 0;
for (; iter < max_iter; iter++) {
norms_res_sh[0] = sqrt(abs(rho_old_sh[0]));
item_ct1.barrier(sycl::access::fence_space::local_space);
if (stop.check_converged(norms_res_sh)) {
break;
}

// Ap = A * p
single_matvec_kernel(A_global_entry, p_sh, Ap_sh, item_ct1);
item_ct1.barrier(sycl::access::fence_space::local_space);
Expand All @@ -282,6 +281,11 @@ void small_apply_kernel(
item_ct1);
}
item_ct1.barrier(sycl::access::fence_space::local_space);
norms_res_sh[0] = sqrt(abs(rho_new_sh[0]));
item_ct1.barrier(sycl::access::fence_space::local_space);
if (stop.check_converged(norms_res_sh)) {
return;
}

// beta = rho_new / rho_old
// p = z + beta * p
Expand Down Expand Up @@ -362,15 +366,13 @@ void apply_medium_kernel(const gko::kernels::batch_cg::StorageConfig sconf,

// stopping criterion object
StopType stop(tol, &norms_rhs_sh);
norms_res_sh = sqrt(abs(rho_old_sh));
if (stop.check_converged(&norms_res_sh)) {
return;
}

int iter = 0;
for (; iter < max_iter; iter++) {
norms_res_sh = sqrt(abs(rho_old_sh));
item_ct1.barrier(sycl::access::fence_space::local_space);
if (stop.check_converged(&norms_res_sh)) {
break;
}

// Ap = A * p
single_matvec_kernel(A_global_entry, p_sh, Ap_sh, item_ct1);
item_ct1.barrier(sycl::access::fence_space::local_space);
Expand All @@ -381,19 +383,23 @@ void apply_medium_kernel(const gko::kernels::batch_cg::StorageConfig sconf,
update_x_and_r(nrows, rho_old_sh, p_sh, Ap_sh, x_sh, r_sh, item_ct1);
item_ct1.barrier(sycl::access::fence_space::local_space);


// z = precond * r
prec_shared.apply(nrows, r_sh, z_sh, item_ct1);
item_ct1.barrier(sycl::access::fence_space::local_space);

// rho_new = (r)' * (z)
compute_dot_product_kernel(nrows, r_sh, z_sh, rho_new_sh, item_ct1);
norms_res_sh = sqrt(abs(rho_new_sh));
if (stop.check_converged(&norms_res_sh)) {
break;
}

// beta = rho_new / rho_old
// p = z + beta * p
update_p(nrows, rho_new_sh, rho_old_sh, z_sh, p_sh, item_ct1);
item_ct1.barrier(sycl::access::fence_space::local_space);

rho_old_sh = rho_new_sh;
// item_ct1.barrier(sycl::access::fence_space::local_space);
}

logger.log_iteration(ibatch, iter, norms_res_sh);
Expand Down Expand Up @@ -440,17 +446,7 @@ void apply_large_kernel(const gko::kernels::batch_cg::StorageConfig sconf,
ValueType* Ap_sh;
ValueType* x_sh;
ValueType* prec_work_sh;
/*
if (sconf.n_global == 0){
r_sh = slm_values;
z_sh = r_sh + sconf.padded_vec_len;
p_sh = z_sh + sconf.padded_vec_len;
Ap_sh = p_sh + sconf.padded_vec_len;
prec_work_sh = Ap_sh + sconf.padded_vec_len;
x_sh = prec_work_sh + PrecType::dynamic_work_size(nrows, nnz);
}
else {
*/
//
if (sconf.n_shared >= 1) {
r_sh = slm_values;
} else {
Expand Down Expand Up @@ -481,7 +477,6 @@ void apply_large_kernel(const gko::kernels::batch_cg::StorageConfig sconf,
} else {
x_sh = prec_work_sh + PrecType::dynamic_work_size(nrows, nnz);
}
//}
// generate preconditioner
prec_shared.generate(ibatch, A_global_entry, prec_work_sh, item_ct1);

Expand All @@ -498,15 +493,14 @@ void apply_large_kernel(const gko::kernels::batch_cg::StorageConfig sconf,

// stopping criterion object
StopType stop(tol, &norms_rhs_sh);
norms_res_sh = sqrt(abs(rho_old_sh));
item_ct1.barrier(sycl::access::fence_space::local_space);
if (stop.check_converged(&norms_res_sh)) {
return;
}

int iter = 0;
for (; iter < max_iter; iter++) {
norms_res_sh = sqrt(abs(rho_old_sh));
item_ct1.barrier(sycl::access::fence_space::local_space);
if (stop.check_converged(&norms_res_sh)) {
break;
}

// Ap = A * p
single_matvec_kernel(A_global_entry, p_sh, Ap_sh, item_ct1);
item_ct1.barrier(sycl::access::fence_space::local_space);
Expand All @@ -523,6 +517,10 @@ void apply_large_kernel(const gko::kernels::batch_cg::StorageConfig sconf,

// rho_new = (r)' * (z)
compute_dot_product_kernel(nrows, r_sh, z_sh, rho_new_sh, item_ct1);
norms_res_sh = sqrt(abs(rho_new_sh));
if (stop.check_converged(&norms_res_sh)) {
break;
}

// beta = rho_new / rho_old
// p = z + beta * p
Expand Down
13 changes: 8 additions & 5 deletions examples/batched-solver/run-all-batch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
#declare -a SOLVERS=("bicgstab" "cg" "direct" "gmres" "idr" "lower_trs" "upper_trs" "richardson")
#declare -a SOLVERS=("cg" "bicgstab" "gmres" "richardson")
declare -a SOLVERS=("cg")
#declare -a BATCH_SIZES=("50" "100" "500" "1000" "5000" "10000" "50000")
declare -a BATCH_SIZES=("1000" "5000" "10000" "50000" "100000")
declare -a MAT_SIZES=("16" "32" "64" "128" "256" "512" "768" "1024" "1280" "1536" "1792" "2048")
declare -a BATCH_SIZES=("128" "256" "512" "1024" "2048" "4096" "8192" "16384" "32768" "65536" "131072" "262144")
#declare -a BATCH_SIZES=("32768" "65536" "131072" "262144")
#declare -a BATCH_SIZES=("131072")
declare -a MAT_SIZES=("16" "32" "64" "128" "256" "512" "1024")

NUM_TILES=1
NUM_TILES=2
EXEC="dpcpp"
VER="opt"
VER="opt3"

BIN_PREFIX_PATH="${HOME}/ginkgo/build/examples/batched-solver"
OUTPUT_PATH="../performance"
Expand All @@ -18,6 +19,8 @@ mkdir -p $DIR

if [ $NUM_TILES -eq 1 ]; then
export ZE_AFFINITY_MASK=0.0
else
unset ZE_AFFINITY_MASK
fi

for SOLVER in "${SOLVERS[@]}"
Expand Down

0 comments on commit 2808f12

Please sign in to comment.