optimization for batchcg medium kernel

ginkgo-project · May 4, 2023 · 2808f12 · 2808f12
1 parent a4fe096
commit 2808f12
Show file tree

Hide file tree

Showing 5 changed files with 46 additions and 46 deletions.
diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp
@@ -158,11 +158,11 @@ StorageConfig compute_shared_storage(const int shared_mem_per_blk,
 {
     using real_type = remove_complex<ValueType>;
     const int vec_size = num_rows * num_rhs * sizeof(ValueType);
-    const int num_priority_vecs = 4;
+    const int num_priority_vecs = 5;
     const int prec_storage =
         Prectype::dynamic_work_size(num_rows, num_nz) * sizeof(ValueType);
     int rem_shared = shared_mem_per_blk;
-    const int num_cg_vecs{6};
+    const int num_cg_vecs{5};
     StorageConfig sconf{false, 0, num_cg_vecs, 0, num_rows};
     if (rem_shared <= 0) {
         set_gmem_stride_bytes<align_bytes>(sconf, vec_size, prec_storage);

diff --git a/dpcpp/matrix/batch_vector_kernels.hpp.inc b/dpcpp/matrix/batch_vector_kernels.hpp.inc
@@ -146,10 +146,7 @@ __dpct_inline__ void compute_norm2_kernel(
 
     val = sycl::reduce_over_group(group, val, sycl::plus<>());
 
-    // if (sg_tid == 0) {
     result = sqrt(val);
-    // }
-    // sg.barrier();
 }
 
 /*

diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -115,6 +115,8 @@ class KernelCaller {
         auto workspace = gko::Array<ValueType>(
             exec_, sconf.gmem_stride_bytes * num_batches / sizeof(ValueType));
         assert(sconf.gmem_stride_bytes % sizeof(ValueType) == 0);
+        //    std::cout << "HERE: " << sconf.n_shared << " " <<
+        //    sconf.prec_shared << std::endl;
 
         ValueType* const workspace_data = workspace.get_data();
         auto b_values = b.values;
@@ -129,7 +131,7 @@ class KernelCaller {
             sycl::accessor<real_type, 1, sycl::access_mode::read_write,
                            sycl::access::target::local>
                 slm_reals(sycl::range<1>(2), cgh);
-            if (nrows < 64) {
+            if (nrows <= 32) {
                 cgh.parallel_for(
                     sycl_nd_range(grid, block),
                     [=](sycl::nd_item<3> item_ct1)
@@ -156,7 +158,7 @@ class KernelCaller {
                                 nrows, a.num_nnz, slm_values_ptr, slm_reals_ptr,
                                 item_ct1, workspace_data);
                         });
-            } else if (nrows < 512) {
+            } else if (nrows <= 256 && sconf.n_global == 0) {
                 cgh.parallel_for(
                     sycl_nd_range(grid, block),
                     [=](sycl::nd_item<3> item_ct1)

diff --git a/dpcpp/solver/batch_cg_kernels.hpp.inc b/dpcpp/solver/batch_cg_kernels.hpp.inc
@@ -146,7 +146,7 @@ __dpct_inline__ void update_p(const int num_rows,
 
 template <typename ValueType>
 __dpct_inline__ void update_x_and_r_small(
-    const int num_rows, const ValueType& rho_old_shared_entry,
+    const int num_rows, const ValueType rho_old_shared_entry,
     const ValueType* const p_shared_entry,
     const ValueType* const Ap_shared_entry, ValueType& alpha_shared_entry,
     ValueType* const x_shared_entry, ValueType* const r_shared_entry,
@@ -169,7 +169,7 @@ __dpct_inline__ void update_x_and_r_small(
 
 template <typename ValueType>
 __dpct_inline__ void update_x_and_r(const int num_rows,
-                                    const ValueType& rho_old_shared_entry,
+                                    const ValueType rho_old_shared_entry,
                                     const ValueType* const p_shared_entry,
                                     const ValueType* const Ap_shared_entry,
                                     ValueType* const x_shared_entry,
@@ -252,15 +252,14 @@ void small_apply_kernel(
 
     // stopping criterion object
     StopType stop(tol, norms_rhs_sh);
+    norms_res_sh[0] = sqrt(abs(rho_old_sh[0]));
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    if (stop.check_converged(norms_res_sh)) {
+        return;
+    }
 
     int iter = 0;
     for (; iter < max_iter; iter++) {
-        norms_res_sh[0] = sqrt(abs(rho_old_sh[0]));
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        if (stop.check_converged(norms_res_sh)) {
-            break;
-        }
-
         // Ap = A * p
         single_matvec_kernel(A_global_entry, p_sh, Ap_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::local_space);
@@ -282,6 +281,11 @@ void small_apply_kernel(
                                           item_ct1);
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);
+        norms_res_sh[0] = sqrt(abs(rho_new_sh[0]));
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        if (stop.check_converged(norms_res_sh)) {
+            return;
+        }
 
         // beta = rho_new / rho_old
         // p = z + beta * p
@@ -362,15 +366,13 @@ void apply_medium_kernel(const gko::kernels::batch_cg::StorageConfig sconf,
 
     // stopping criterion object
     StopType stop(tol, &norms_rhs_sh);
+    norms_res_sh = sqrt(abs(rho_old_sh));
+    if (stop.check_converged(&norms_res_sh)) {
+        return;
+    }
 
     int iter = 0;
     for (; iter < max_iter; iter++) {
-        norms_res_sh = sqrt(abs(rho_old_sh));
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        if (stop.check_converged(&norms_res_sh)) {
-            break;
-        }
-
         // Ap = A * p
         single_matvec_kernel(A_global_entry, p_sh, Ap_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::local_space);
@@ -381,19 +383,23 @@ void apply_medium_kernel(const gko::kernels::batch_cg::StorageConfig sconf,
         update_x_and_r(nrows, rho_old_sh, p_sh, Ap_sh, x_sh, r_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::local_space);
 
+
         // z = precond * r
         prec_shared.apply(nrows, r_sh, z_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::local_space);
 
         //  rho_new =  (r)' * (z)
         compute_dot_product_kernel(nrows, r_sh, z_sh, rho_new_sh, item_ct1);
+        norms_res_sh = sqrt(abs(rho_new_sh));
+        if (stop.check_converged(&norms_res_sh)) {
+            break;
+        }
 
         // beta = rho_new / rho_old
         // p = z + beta * p
         update_p(nrows, rho_new_sh, rho_old_sh, z_sh, p_sh, item_ct1);
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-
         rho_old_sh = rho_new_sh;
+        // item_ct1.barrier(sycl::access::fence_space::local_space);
     }
 
     logger.log_iteration(ibatch, iter, norms_res_sh);
@@ -440,17 +446,7 @@ void apply_large_kernel(const gko::kernels::batch_cg::StorageConfig sconf,
     ValueType* Ap_sh;
     ValueType* x_sh;
     ValueType* prec_work_sh;
-    /*
-    if (sconf.n_global == 0){
-      r_sh = slm_values;
-      z_sh = r_sh + sconf.padded_vec_len;
-      p_sh = z_sh + sconf.padded_vec_len;
-      Ap_sh = p_sh + sconf.padded_vec_len;
-      prec_work_sh = Ap_sh + sconf.padded_vec_len;
-      x_sh = prec_work_sh + PrecType::dynamic_work_size(nrows, nnz);
-    }
-    else {
-    */
+    //
     if (sconf.n_shared >= 1) {
         r_sh = slm_values;
     } else {
@@ -481,7 +477,6 @@ void apply_large_kernel(const gko::kernels::batch_cg::StorageConfig sconf,
     } else {
         x_sh = prec_work_sh + PrecType::dynamic_work_size(nrows, nnz);
     }
-    //}
     // generate preconditioner
     prec_shared.generate(ibatch, A_global_entry, prec_work_sh, item_ct1);
 
@@ -498,15 +493,14 @@ void apply_large_kernel(const gko::kernels::batch_cg::StorageConfig sconf,
 
     // stopping criterion object
     StopType stop(tol, &norms_rhs_sh);
+    norms_res_sh = sqrt(abs(rho_old_sh));
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    if (stop.check_converged(&norms_res_sh)) {
+        return;
+    }
 
     int iter = 0;
     for (; iter < max_iter; iter++) {
-        norms_res_sh = sqrt(abs(rho_old_sh));
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        if (stop.check_converged(&norms_res_sh)) {
-            break;
-        }
-
         // Ap = A * p
         single_matvec_kernel(A_global_entry, p_sh, Ap_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::local_space);
@@ -523,6 +517,10 @@ void apply_large_kernel(const gko::kernels::batch_cg::StorageConfig sconf,
 
         //  rho_new =  (r)' * (z)
         compute_dot_product_kernel(nrows, r_sh, z_sh, rho_new_sh, item_ct1);
+        norms_res_sh = sqrt(abs(rho_new_sh));
+        if (stop.check_converged(&norms_res_sh)) {
+            break;
+        }
 
         // beta = rho_new / rho_old
         // p = z + beta * p

diff --git a/examples/batched-solver/run-all-batch.sh b/examples/batched-solver/run-all-batch.sh
@@ -3,13 +3,14 @@
 #declare -a SOLVERS=("bicgstab" "cg" "direct" "gmres" "idr" "lower_trs" "upper_trs" "richardson")
 #declare -a SOLVERS=("cg" "bicgstab" "gmres" "richardson")
 declare -a SOLVERS=("cg")
-#declare -a BATCH_SIZES=("50" "100" "500" "1000" "5000" "10000" "50000")
-declare -a BATCH_SIZES=("1000" "5000" "10000" "50000" "100000")
-declare -a MAT_SIZES=("16" "32" "64" "128" "256" "512" "768" "1024" "1280" "1536" "1792" "2048")
+declare -a BATCH_SIZES=("128" "256" "512" "1024" "2048" "4096" "8192" "16384" "32768" "65536" "131072" "262144")
+#declare -a BATCH_SIZES=("32768" "65536" "131072" "262144")
+#declare -a BATCH_SIZES=("131072")
+declare -a MAT_SIZES=("16" "32" "64" "128" "256" "512" "1024")
 
-NUM_TILES=1
+NUM_TILES=2
 EXEC="dpcpp"
-VER="opt"
+VER="opt3"
 
 BIN_PREFIX_PATH="${HOME}/ginkgo/build/examples/batched-solver"
 OUTPUT_PATH="../performance"
@@ -18,6 +19,8 @@ mkdir -p $DIR
 
 if [ $NUM_TILES -eq 1 ];  then
     export ZE_AFFINITY_MASK=0.0
+else
+    unset ZE_AFFINITY_MASK
 fi
 
 for SOLVER in "${SOLVERS[@]}"