Merging batch-dpcpp into batch-develop #1298

Closed: wants to merge 140 commits into batch-develop from batch-dpcpp.

Changes from all commits (140 commits):
653a167
Log dpcpp version info
pratikvn Feb 6, 2023
9e78c4a
Add a batch_struct for dpcpp
pratikvn Feb 6, 2023
68e512d
update headers in dpcpp kernels
pratikvn Feb 6, 2023
a1ce184
implemented spmv and advanced_spmv
phu0ngng Jan 24, 2023
41d649c
spmv compiled, but not batch_scale
phu0ngng Feb 8, 2023
e096ed0
Instantiate for int32 only
pratikvn Feb 9, 2023
916545b
batch_csr compiled but did not pass tests
phu0ngng Feb 9, 2023
2221d99
DiagScale still didn't pass the tests
phu0ngng Feb 10, 2023
3d791e4
add batch_csr kernels, passed the tests
phu0ngng Feb 10, 2023
23e1a37
saved current state
phu0ngng Feb 13, 2023
bb09fff
implemented batch_dense_kernels, all tests passed
phu0ngng Feb 14, 2023
b750f33
implemented batch diagonal kernels
phu0ngng Feb 14, 2023
cec05a8
implemented batch ell kernels
phu0ngng Feb 14, 2023
d14dc0a
Move batch matrix kernels to header files
pratikvn Feb 16, 2023
58f39e0
Add batch preconditioner headers
pratikvn Feb 16, 2023
5a81d3f
Enable batch dpcpp solver dispatch
pratikvn Feb 16, 2023
bda56f6
Add dpcpp batch stop criteria
pratikvn Feb 16, 2023
63158c0
Add dpcpp batch loggers
pratikvn Feb 16, 2023
53541c8
Add template batch_cg kernel header
pratikvn Feb 16, 2023
a6e1673
moved item_ct1 as the last argument of device kernels, rm unused head…
phu0ngng Feb 16, 2023
47f1c89
renamed some device functions
phu0ngng Feb 22, 2023
83e9682
implemented basic preconditioners for batch, did not test them yet
phu0ngng Feb 22, 2023
b6f82a1
implemented batch-cg, did not test them yet
phu0ngng Feb 22, 2023
6c52440
tmp save
phu0ngng Feb 22, 2023
69c9c0e
pushed batch_vector_kernels.hpp
phu0ngng Feb 22, 2023
5b4022b
Move .hpp files to .hpp.inc and remove namespaces
pratikvn Feb 23, 2023
e06776a
Use .hpp.inc in correct namespaces
pratikvn Feb 23, 2023
ca7858d
Fix function signatures and remove exceptions
pratikvn Feb 23, 2023
2102c94
Enable batch_cg dpcpp tests
pratikvn Feb 23, 2023
5c321cb
updated header files
phu0ngng Feb 23, 2023
93135e5
added a new line after licence text
phu0ngng Feb 23, 2023
534b3e4
cg passed 4/7 tests
phu0ngng Feb 27, 2023
23002f6
- `batch_cg` passed 4/7 tests.
phu0ngng Feb 28, 2023
ecba3df
rm old files
phu0ngng Feb 28, 2023
7670da1
batch_cg passed tests
phu0ngng Mar 8, 2023
3d2ca3f
minor changes; these changes need to be redone in the future
phu0ngng Mar 8, 2023
7fe1198
saved current state
phu0ngng Mar 9, 2023
1ec5ba0
batch_cg passed all tests when using slm
phu0ngng Mar 9, 2023
6337228
passing the accessors with their ptrs
phu0ngng Mar 9, 2023
056f002
slm calculation does not involve a matrix_size subtraction
phu0ngng Mar 9, 2023
a164182
Revert "passing the accessors with their ptrs"
phu0ngng Mar 9, 2023
de45f31
Revert "slm calculation does not involve a matrix_size substraction"
phu0ngng Mar 10, 2023
4e35f57
passing accessors via their pointers
phu0ngng Mar 10, 2023
ab0c84c
enforcing subgroup_size for all block_jacobi routines
phu0ngng Mar 13, 2023
50498f3
fixed
phu0ngng Mar 13, 2023
656ba38
fixed
phu0ngng Mar 13, 2023
e8da1d5
- Batch Block Jacobi does not work yet. The issue might come from the
phu0ngng Mar 13, 2023
1488e5c
minor changes for csr single matvec
phu0ngng Mar 13, 2023
415cd02
marked BatchBlockJacobi as NOT_IMPLEMENTED
phu0ngng Mar 13, 2023
63b9280
implemented batch_bicgstab, passed all tests
phu0ngng Mar 13, 2023
15ea037
corrected include path for .hpp.in with cmake preprocessor
phu0ngng Mar 14, 2023
d2888af
inline all 'device' functions
phu0ngng Mar 14, 2023
443d268
Format files
ginkgo-bot Mar 14, 2023
d2ac213
implemented batch_richardson, passed all tests
phu0ngng Mar 15, 2023
489c180
enabled test compilations for dpcpp
phu0ngng Mar 15, 2023
4e136aa
- implemented batch_lower_trs_kernels. For now, the kernel calls the
phu0ngng Mar 15, 2023
4b97df1
implemented batch_upper_trs
phu0ngng Mar 15, 2023
c923247
enabling batch_idr test
phu0ngng Mar 15, 2023
b78895b
wrote onemkl bindings for getrf and getrs
phu0ngng Mar 17, 2023
7f1b5c1
batch_direct is implemented but not compilable yet
phu0ngng Mar 17, 2023
b00cd94
- wrote getrs_batch and getrf_batch in onemkl_bindings.cpp.
phu0ngng Mar 18, 2023
42fc377
tmp saved
phu0ngng Mar 21, 2023
0f17ec5
fixed batch_cg and batch_bicgstab, both kernels should work with larg…
phu0ngng Mar 21, 2023
310dcdf
Add batch_direct tests for dpcpp
pratikvn Mar 24, 2023
df67ec5
Enable dpcpp batch_richardson tests
pratikvn Mar 24, 2023
da5e544
batch_block_jacobi passed test for float and double, but not for cmplx
phu0ngng Mar 29, 2023
5e35df2
added tests for component functions of batch_block_jacobi
phu0ngng Mar 30, 2023
2b83e07
Some fixes for nan output with complex on dpcpp
pratikvn Mar 31, 2023
57756bb
fixed batch_direct, passed all tests now
phu0ngng Apr 3, 2023
afa88d5
implemented batch_gmres, passed 6/7 tests on iris
phu0ngng Apr 3, 2023
ee1e042
fixed slm allocation issue
phu0ngng Apr 4, 2023
225061e
implemented batch_idr, passed 5/6 tests
phu0ngng Apr 4, 2023
4f4e3da
fixed slm allocation in batch_block_jacobi
phu0ngng Apr 5, 2023
400560c
impl batch_ilu, batch_isai, batch_ilu_isai, compilation does not go t…
phu0ngng Apr 11, 2023
902a1fd
added .inc files
phu0ngng Apr 11, 2023
7072b8a
batch_ilu passed all tests. batch_isai and batch_ilu_isai only passed…
phu0ngng Apr 11, 2023
ede82da
batch_isai and batch_ilu_isai passed all tests
phu0ngng Apr 12, 2023
1be2895
Add a run script
pratikvn Apr 12, 2023
392db00
improved performance of batch_cg by ~3x for matsize bigger than 1024
phu0ngng Apr 28, 2023
2584e2e
Remove branching to improve perf for small mats
pratikvn Apr 28, 2023
f694ec4
changes in example: nreps + time in msecs
phu0ngng Apr 28, 2023
2625440
implemented dynamic group_size + clean up
phu0ngng Apr 28, 2023
875d45d
implemented branching for small and big matsize with different group …
phu0ngng May 1, 2023
1cec8c9
fixed sync
phu0ngng May 1, 2023
a913280
implemented kernel for each mat_size range
phu0ngng May 1, 2023
fc65a53
minor changes in order of calculations
phu0ngng May 1, 2023
5be03f0
optimization for batch_cg medium kernel
phu0ngng May 4, 2023
3d79d9e
resolved conflicts
phu0ngng May 31, 2023
06e5fa4
allocated alpha once outside of iteration steps
phu0ngng May 31, 2023
3c509b4
templated vecs_shared_all and sg_kernel_only, using lambda to contain…
phu0ngng Jun 6, 2023
e2c36a9
clean batch_vector_kernels
phu0ngng Jun 6, 2023
bfdcf93
templated batch_cg for all optimization parameters
phu0ngng Jun 6, 2023
31145ea
added icpx for sycl compilation into CMakeLists.txt
phu0ngng Jun 7, 2023
fd32fd4
renamed template parameters, add group_size calculations
phu0ngng Jun 8, 2023
d1cb464
implemented optimizations for batch_gmres and batch_cg, passed all tests
phu0ngng Jun 8, 2023
51eec85
cleanup
phu0ngng Jun 8, 2023
2f047f3
rm a few barriers
phu0ngng Jun 8, 2023
3d590b0
fixed batch_gmres for checking rot and hess storages
phu0ngng Jun 20, 2023
b00d7c9
resolved conflicts
phu0ngng Jun 20, 2023
e0d9281
- Splitting rot storage into individual vectors. Now n_global is 9 (5
phu0ngng Jun 22, 2023
f5dd9ca
adjusted init workgroup size + size range for selecting the kernels
phu0ngng Jun 22, 2023
3d0c0c9
fixed slm overuse issue
phu0ngng Jun 29, 2023
9012f49
- Templated batch_bicgstab and batch_cg.
phu0ngng Jun 30, 2023
ee94478
added batch_ell to example
phu0ngng Jun 30, 2023
1ed08df
templated batch_gmres cuda
Jun 30, 2023
70559bb
added c++17 std
Jul 3, 2023
778a1e4
added cuda templating
Jul 11, 2023
0d86327
rm timer in CMakeLists
phu0ngng Jul 17, 2023
92b0866
fixed CMakeLists so that it includes batched kernels
phu0ngng Jul 18, 2023
f56bcac
undo CMakeLists
phu0ngng Jul 18, 2023
c4b027b
fixed rot vectors' and hess vector's sizes
phu0ngng Jul 18, 2023
bb67f02
make sure num_rot_vecs_shared does not have a negative value
phu0ngng Jul 18, 2023
7e3c151
rm prints in cuda batch gmres
phu0ngng Jul 18, 2023
bddf39b
fixed typos
phu0ngng Jul 18, 2023
2aa47b8
added case 0 for batch_bicgstab template
phu0ngng Jul 25, 2023
d7cbd0c
added case 0 for batch cg
phu0ngng Jul 26, 2023
6a00f2e
rm n_shared templating for batch_gmres
phu0ngng Jul 26, 2023
6e03daf
rm time print
phu0ngng Jul 26, 2023
ca819d3
temp: commented out templating in batch gmres
Jul 27, 2023
d29044e
added large test for batch cg
Jul 27, 2023
cdcd35c
cuda: added n_shared template for batch_cg and batch_bicgstab
Jul 27, 2023
f71c98e
fixed raw_free mem issue
Jul 27, 2023
0affb43
cuda: cleaning
Jul 27, 2023
0abd4e1
add pele/sundials matrices
pratikvn Aug 3, 2023
362445b
enabled tests
phu0ngng Aug 3, 2023
f150acd
changed nreps in batched-solver.cpp
phu0ngng Aug 3, 2023
26ab22a
added check prec_storage != 0
phu0ngng Aug 3, 2023
103143e
changed batched-solver-from-files to print only time in msec
phu0ngng Aug 3, 2023
1602c9d
optimized work-group selection
phu0ngng Aug 10, 2023
6d71332
added test scripts
phu0ngng Aug 14, 2023
292bc07
renamed get_group_size func, and rewrote 1/0 into true/false
phu0ngng Aug 21, 2023
94623a6
formatted
phu0ngng Aug 21, 2023
b39b740
added gmres with n_shared templating
Aug 22, 2023
487d7b1
changed alignment to 2
Aug 22, 2023
1f81ab6
added a large test case for batch_gmres
Aug 22, 2023
a5654c9
Format files
ginkgo-bot Aug 22, 2023
8e01e7f
rebase fixes
pratikvn Oct 15, 2023
2396572
remove sundials mats
pratikvn Oct 15, 2023
ebee316
prefix_sum rename
pratikvn Oct 15, 2023
e3f2bc7
stream and dpcpp fixes
pratikvn Oct 16, 2023
CMakeLists.txt: 28 changes (17 additions & 11 deletions)

@@ -123,11 +123,9 @@ endif()

 # For now, PGI/NVHPC nvc++ compiler doesn't seem to support
 # `#pragma omp declare reduction`
-#
-# The math with optimization level -O2 doesn't follow IEEE standard, so we
-# enable that back as well.
-if (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Kieee")
+if (${CMAKE_CXX_COMPILER_ID} MATCHES "PGI|NVHPC")
+    message(STATUS "OpenMP: Switching to OFF because PGI/NVHPC nvc++ compiler lacks important features.")
+    set(GINKGO_BUILD_OMP OFF)
 endif()

 #Batch Ginkgo options:
@@ -306,8 +304,16 @@ if(MSVC)
 endif()

 if(GINKGO_BUILD_DPCPP)
-    ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_MAJOR_VERSION __LIBSYCL_MAJOR_VERSION)
-    ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION __SYCL_COMPILER_VERSION)
+    if(CMAKE_CXX_COMPILER MATCHES "icpx")
+        set(GINKGO_DPCPP_MAJOR_VERSION "6")
+        set(GINKGO_DPCPP_VERSION "6")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl")
+        # AoT
+        # set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsycl -fsycl-targets=spir64 -Xsycl-target-backend '-device 12.60.7' -fsycl-max-parallel-link-jobs=8 -fsycl-unnamed-lambda")
+    else()
+        ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_MAJOR_VERSION __LIBSYCL_MAJOR_VERSION)
+        ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION __SYCL_COMPILER_VERSION)
+    endif()
 else()
     set(GINKGO_DPCPP_MAJOR_VERSION "0")
 endif()
@@ -453,10 +459,10 @@ if(NOT "${CMAKE_GENERATOR_TOOLSET}" STREQUAL "")
 endif()
 add_custom_target(test_install
     COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} ${TOOLSET}
-        -S${GINKGO_TEST_INSTALL_SRC_DIR}
+        -H${GINKGO_TEST_INSTALL_SRC_DIR}
         -B${GINKGO_TEST_INSTALL_BIN_DIR}
         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-        -DCMAKE_PREFIX_PATH=${CMAKE_INSTALL_PREFIX}
+        -DCMAKE_PREFIX_PATH=${CMAKE_INSTALL_PREFIX}/${GINKGO_INSTALL_CONFIG_DIR}
         -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
         -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
         -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}
@@ -476,7 +482,7 @@ add_custom_target(test_install

 add_custom_target(test_exportbuild
     COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} ${TOOLSET}
-        -S${GINKGO_TEST_EXPORTBUILD_SRC_DIR}
+        -H${GINKGO_TEST_EXPORTBUILD_SRC_DIR}
         -B${GINKGO_TEST_EXPORTBUILD_BIN_DIR}
         -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
         -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
@@ -494,7 +500,7 @@ add_custom_target(test_exportbuild

 add_custom_target(test_pkgconfig
     COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} ${TOOLSET}
-        -S${GINKGO_TEST_PKGCONFIG_SRC_DIR}
+        -H${GINKGO_TEST_PKGCONFIG_SRC_DIR}
         -B${GINKGO_TEST_PKGCONFIG_BIN_DIR}
         -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
         -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
common/cuda_hip/matrix/batch_csr_kernels.hpp.inc: 19 changes (9 additions & 10 deletions)

@@ -296,12 +296,12 @@ __device__ __forceinline__ void csr_advanced_matvec_kernel(

 template <typename ValueType>
 __global__
-__launch_bounds__(default_block_size, sm_multiplier) void advanced_spmv(
-    const batch_dense::UniformBatch<const ValueType> alpha,
-    const gko::batch_csr::UniformBatch<const ValueType> a,
-    const batch_dense::UniformBatch<const ValueType> b,
-    const batch_dense::UniformBatch<const ValueType> beta,
-    const batch_dense::UniformBatch<ValueType> c)
+__launch_bounds__(default_block_size, sm_multiplier) void advanced_spmv(
+    const batch_dense::UniformBatch<const ValueType> alpha,
+    const gko::batch_csr::UniformBatch<const ValueType> a,
+    const batch_dense::UniformBatch<const ValueType> b,
+    const batch_dense::UniformBatch<const ValueType> beta,
+    const batch_dense::UniformBatch<ValueType> c)
 {
     for (size_type ibatch = blockIdx.x; ibatch < a.num_batch;
          ibatch += gridDim.x) {
@@ -466,10 +466,9 @@ __global__ void uniform_convert_to_batch_dense(


 __global__
-__launch_bounds__(default_block_size) void check_all_diagonal_locations(
-    const int min_rows_cols, const int* const __restrict__ row_ptrs,
-    const int* const __restrict__ col_idxs,
-    bool* const __restrict__ all_diags)
+__launch_bounds__(default_block_size) void check_all_diagonal_locations(
+    const int min_rows_cols, const int* const __restrict__ row_ptrs,
+    const int* const __restrict__ col_idxs, bool* const __restrict__ all_diags)
 {
     constexpr auto warp_size = config::warp_size;
     const auto tile =
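
All of the batched kernels in these files share one dispatch pattern: each thread block walks the batch entries in a grid-stride loop and applies a single-entry device kernel to its slice of the batched data, relying on the entries being mutually independent. The following minimal CUDA sketch illustrates that pattern; batch_csr_view and single_spmv are simplified stand-ins for Ginkgo's UniformBatch types and device kernels (double values, int indices, one sparsity pattern shared by the whole batch), not the library's actual API.

// Simplified stand-in for gko::batch_csr::UniformBatch: all entries share
// one sparsity pattern, only the values differ per entry.
struct batch_csr_view {
    int num_batch;
    int num_rows;
    int num_nnz;
    const int* row_ptrs;   // shared across the batch
    const int* col_idxs;   // shared across the batch
    const double* values;  // num_batch * num_nnz values, entry-contiguous
};

// One thread per row of a single entry: y = A_b * x_b.
__device__ void single_spmv(const int num_rows, const int* row_ptrs,
                            const int* col_idxs, const double* values,
                            const double* x, double* y)
{
    for (int row = threadIdx.x; row < num_rows; row += blockDim.x) {
        double acc = 0.0;
        for (int k = row_ptrs[row]; k < row_ptrs[row + 1]; ++k) {
            acc += values[k] * x[col_idxs[k]];
        }
        y[row] = acc;
    }
}

// Grid-stride loop over the batch, mirroring the kernels above: block
// blockIdx.x handles entries ibatch, ibatch + gridDim.x, ... and no
// cross-entry synchronization is ever needed.
__global__ void batch_spmv(const batch_csr_view a, const double* b, double* c)
{
    for (int ibatch = blockIdx.x; ibatch < a.num_batch; ibatch += gridDim.x) {
        single_spmv(a.num_rows, a.row_ptrs, a.col_idxs,
                    a.values + ibatch * a.num_nnz,
                    b + ibatch * a.num_rows, c + ibatch * a.num_rows);
    }
}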
common/cuda_hip/matrix/batch_dense_kernels.hpp.inc: 18 changes (9 additions & 9 deletions)

@@ -119,12 +119,12 @@ __device__ __forceinline__ void single_advanced_matvec_kernel(

 template <typename ValueType>
 __global__
-__launch_bounds__(default_block_size, sm_multiplier) void advanced_mv(
-    const gko::batch_dense::UniformBatch<const ValueType> alpha,
-    const gko::batch_dense::UniformBatch<const ValueType> a,
-    const gko::batch_dense::UniformBatch<const ValueType> b,
-    const gko::batch_dense::UniformBatch<const ValueType> beta,
-    const gko::batch_dense::UniformBatch<ValueType> c)
+__launch_bounds__(default_block_size, sm_multiplier) void advanced_mv(
+    const gko::batch_dense::UniformBatch<const ValueType> alpha,
+    const gko::batch_dense::UniformBatch<const ValueType> a,
+    const gko::batch_dense::UniformBatch<const ValueType> b,
+    const gko::batch_dense::UniformBatch<const ValueType> beta,
+    const gko::batch_dense::UniformBatch<ValueType> c)
 {
     for (size_type ibatch = blockIdx.x; ibatch < a.num_batch;
          ibatch += gridDim.x) {
@@ -308,9 +308,9 @@ __device__ __forceinline__ void compute_norm2(

 template <typename ValueType>
 __global__
-__launch_bounds__(default_block_size, sm_multiplier) void compute_norm2(
-    const gko::batch_dense::UniformBatch<const ValueType> x,
-    const gko::batch_dense::UniformBatch<remove_complex<ValueType>> result)
+__launch_bounds__(default_block_size, sm_multiplier) void compute_norm2(
+    const gko::batch_dense::UniformBatch<const ValueType> x,
+    const gko::batch_dense::UniformBatch<remove_complex<ValueType>> result)
 {
     for (size_type ibatch = blockIdx.x; ibatch < x.num_batch;
          ibatch += gridDim.x) {
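
Reductions such as compute_norm2 use the same one-block-per-entry mapping with a block-local reduction on top. Below is a sketch of the per-entry 2-norm under the same simplifications as before (real double values, power-of-two block size, a plain shared-memory tree reduction instead of Ginkgo's reduction helpers):

// One block reduces one batch entry at a time; launch as, e.g.,
// batch_norm2<256><<<num_blocks, 256>>>(nbatch, nrows, x, result).
template <int block_size>
__global__ void batch_norm2(const int num_batch, const int num_rows,
                            const double* x, double* result)
{
    __shared__ double partial[block_size];
    for (int ibatch = blockIdx.x; ibatch < num_batch; ibatch += gridDim.x) {
        const double* xb = x + ibatch * num_rows;
        double acc = 0.0;
        for (int i = threadIdx.x; i < num_rows; i += blockDim.x) {
            acc += xb[i] * xb[i];  // per-thread partial sum of squares
        }
        partial[threadIdx.x] = acc;
        __syncthreads();
        // Tree reduction in shared memory (block_size is a power of two).
        for (int offset = block_size / 2; offset > 0; offset /= 2) {
            if (threadIdx.x < offset) {
                partial[threadIdx.x] += partial[threadIdx.x + offset];
            }
            __syncthreads();
        }
        if (threadIdx.x == 0) {
            result[ibatch] = sqrt(partial[0]);
        }
        __syncthreads();  // partial[] is reused for the next entry
    }
}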
common/cuda_hip/matrix/batch_ell_kernels.hpp.inc: 12 changes (6 additions & 6 deletions)

@@ -184,12 +184,12 @@ __global__ __launch_bounds__(default_block_size, sm_multiplier) void spmv(

 template <typename ValueType>
 __global__
-__launch_bounds__(default_block_size, sm_multiplier) void advanced_spmv(
-    const batch_dense::UniformBatch<const ValueType> alpha,
-    const gko::batch_ell::UniformBatch<const ValueType> a,
-    const batch_dense::UniformBatch<const ValueType> b,
-    const batch_dense::UniformBatch<const ValueType> beta,
-    const batch_dense::UniformBatch<ValueType> c)
+__launch_bounds__(default_block_size, sm_multiplier) void advanced_spmv(
+    const batch_dense::UniformBatch<const ValueType> alpha,
+    const gko::batch_ell::UniformBatch<const ValueType> a,
+    const batch_dense::UniformBatch<const ValueType> b,
+    const batch_dense::UniformBatch<const ValueType> beta,
+    const batch_dense::UniformBatch<ValueType> c)
 {
     for (size_type ibatch = blockIdx.x; ibatch < a.num_batch;
          ibatch += gridDim.x) {
common/cuda_hip/preconditioner/batch_ilu_kernels.hpp.inc: 26 changes (13 additions & 13 deletions)

@@ -112,12 +112,12 @@ __device__ __forceinline__ void modify_rows_below_curr_row(

 template <typename ValueType>
 __global__
-__launch_bounds__(default_block_size) void generate_exact_ilu0_kernel(
-    const size_type batch_size, const int nrows, const int nnz,
-    const int* const __restrict__ diag_ptrs,
-    const int* const __restrict__ mat_row_ptrs,
-    const int* const __restrict__ mat_col_idxs,
-    ValueType* const __restrict__ mat_values)
+__launch_bounds__(default_block_size) void generate_exact_ilu0_kernel(
+    const size_type batch_size, const int nrows, const int nnz,
+    const int* const __restrict__ diag_ptrs,
+    const int* const __restrict__ mat_row_ptrs,
+    const int* const __restrict__ mat_col_idxs,
+    ValueType* const __restrict__ mat_values)
 {
     for (size_type batch_id = blockIdx.x; batch_id < batch_size;
          batch_id += gridDim.x) {
@@ -211,13 +211,13 @@ __global__ __launch_bounds__(default_block_size) void generate_parilu0_kernel(
 }

 __global__
-__launch_bounds__(default_block_size) void generate_common_pattern_to_fill_L_and_U(
-    const int nrows, const int* const __restrict__ row_ptrs,
-    const int* const __restrict__ col_idxs,
-    const int* const __restrict__ row_ptrs_L,
-    const int* const __restrict__ row_ptrs_U,
-    int* const __restrict__ L_col_holders,
-    int* const __restrict__ U_col_holders)
+__launch_bounds__(default_block_size) void generate_common_pattern_to_fill_L_and_U(
+    const int nrows, const int* const __restrict__ row_ptrs,
+    const int* const __restrict__ col_idxs,
+    const int* const __restrict__ row_ptrs_L,
+    const int* const __restrict__ row_ptrs_U,
+    int* const __restrict__ L_col_holders,
+    int* const __restrict__ U_col_holders)
 {
     constexpr int warp_size = config::warp_size;
     auto tile_grp =
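
The generate_exact_ilu0_kernel above factorizes every batch entry in place on the shared sparsity pattern, with diag_ptrs giving the position of each row's diagonal entry. For reference, this is what each entry's factorization computes, written as a sequential host-side sketch of textbook ILU(0) on one CSR matrix with sorted column indices; this shows the algorithm only, not the batched device code:

// In-place ILU(0): the unit lower factor L and the upper factor U overwrite
// A, restricted to A's own sparsity pattern (no fill-in).
void ilu0_factorize(const int nrows, const int* row_ptrs, const int* col_idxs,
                    const int* diag_ptrs, double* values)
{
    for (int i = 0; i < nrows; ++i) {
        // Eliminate the entries of row i left of the diagonal.
        for (int k = row_ptrs[i]; k < diag_ptrs[i]; ++k) {
            const int col = col_idxs[k];
            values[k] /= values[diag_ptrs[col]];  // L(i, col)
            // Update the rest of row i against the upper part of row `col`,
            // but only where row i already has an entry.
            for (int j = k + 1; j < row_ptrs[i + 1]; ++j) {
                for (int m = diag_ptrs[col] + 1; m < row_ptrs[col + 1]; ++m) {
                    if (col_idxs[m] == col_idxs[j]) {
                        values[j] -= values[k] * values[m];
                        break;
                    }
                }
            }
        }
    }
}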
common/cuda_hip/preconditioner/batch_isai_kernels.hpp.inc: 77 changes (37 additions & 40 deletions)

@@ -32,13 +32,13 @@

 template <int subwarp_size>
 __global__
-__launch_bounds__(default_block_size) void extract_dense_linear_sys_pattern_kernel(
-    const int nrows, const int* const __restrict__ A_row_ptrs,
-    const int* const __restrict__ A_col_idxs,
-    const int* const __restrict__ aiA_row_ptrs,
-    const int* const __restrict__ aiA_col_idxs,
-    int* const dense_mat_pattern, int* const rhs_one_idxs, int* const sizes,
-    int* num_matches_per_row_for_each_csr_sys)
+__launch_bounds__(default_block_size) void extract_dense_linear_sys_pattern_kernel(
+    const int nrows, const int* const __restrict__ A_row_ptrs,
+    const int* const __restrict__ A_col_idxs,
+    const int* const __restrict__ aiA_row_ptrs,
+    const int* const __restrict__ aiA_col_idxs, int* const dense_mat_pattern,
+    int* const rhs_one_idxs, int* const sizes,
+    int* num_matches_per_row_for_each_csr_sys)
 {
     using gko::preconditioner::batch_isai::row_size_limit;
     // assert(subwarp_size >= row_size_limit); //Not required here
@@ -312,16 +312,15 @@

 template <int subwarp_size, typename ValueType>
 __global__
-__launch_bounds__(default_block_size) void fill_values_dense_mat_and_solve_kernel(
-    const int nbatch, const int nrows, const int A_nnz,
-    const ValueType* const A_values, const int aiA_nnz,
-    const int* const __restrict__ aiA_row_ptrs,
-    ValueType* const __restrict__ aiA_values,
-    const int* const __restrict__ dense_mat_pattern,
-    const int* const __restrict__ rhs_one_idxs,
-    const int* const __restrict__ sizes,
-    const enum gko::preconditioner::batch_isai_input_matrix_type
-        matrix_type)
+__launch_bounds__(default_block_size) void fill_values_dense_mat_and_solve_kernel(
+    const int nbatch, const int nrows, const int A_nnz,
+    const ValueType* const A_values, const int aiA_nnz,
+    const int* const __restrict__ aiA_row_ptrs,
+    ValueType* const __restrict__ aiA_values,
+    const int* const __restrict__ dense_mat_pattern,
+    const int* const __restrict__ rhs_one_idxs,
+    const int* const __restrict__ sizes,
+    const enum gko::preconditioner::batch_isai_input_matrix_type matrix_type)
 {
     using gko::preconditioner::batch_isai::row_size_limit;
     static_assert(row_size_limit <= subwarp_size, "incompatible subwarp size");
@@ -374,13 +373,13 @@

     if (matrix_type == gko::preconditioner::batch_isai_input_matrix_type::
             lower_tri)  // input matrix: lower_tri =>
                         // transposed system: upper_tri
     {
         sol = solve_upper_tri_dense_system(subwarpgrp, size, local_row,
                                            rhs_one_idx);
     } else if (matrix_type ==
                gko::preconditioner::batch_isai_input_matrix_type::
                    upper_tri)  // input matrix: upper_tri => transposed
                                // system: lower_tri
     {
         sol = solve_lower_tri_dense_system(subwarpgrp, size, local_row,
@@ -436,16 +435,16 @@

 template <typename ValueType>
 __global__
-__launch_bounds__(default_block_size) void extract_csr_sys_pattern_kernel(
-    const int lin_sys_row, const int* const __restrict__ inv_row_ptrs,
-    const int* const __restrict__ inv_col_idxs,
-    const int* const __restrict__ sys_row_ptrs,
-    const int* const __restrict__ sys_col_idxs,
-    const int* const __restrict__ csr_pattern_row_ptrs,
-    int* const __restrict__ csr_pattern_col_idxs,
-    gko::remove_complex<ValueType>* const __restrict__ csr_pattern_values)
+__launch_bounds__(default_block_size) void extract_csr_sys_pattern_kernel(
+    const int lin_sys_row, const int* const __restrict__ inv_row_ptrs,
+    const int* const __restrict__ inv_col_idxs,
+    const int* const __restrict__ sys_row_ptrs,
+    const int* const __restrict__ sys_col_idxs,
+    const int* const __restrict__ csr_pattern_row_ptrs,
+    int* const __restrict__ csr_pattern_col_idxs,
+    gko::remove_complex<ValueType>* const __restrict__ csr_pattern_values)
 {
     // use one thread per match of the 2 arrays (non-coalesced accesses but
     // data locality)
     const int gid = threadIdx.x + blockIdx.x * blockDim.x;

@@ -470,12 +469,11 @@

 template <typename ValueType>
 __global__
-__launch_bounds__(default_block_size) void fill_batch_csr_system_kernel(
-    const int nbatch, const int csr_nnz,
-    const gko::remove_complex<
-        ValueType>* const __restrict__ csr_pattern_values,
-    const int sys_nnz, const ValueType* const __restrict__ sys_csr_values,
-    ValueType* const __restrict__ batch_csr_mats_values)
+__launch_bounds__(default_block_size) void fill_batch_csr_system_kernel(
+    const int nbatch, const int csr_nnz,
+    const gko::remove_complex<ValueType>* const __restrict__ csr_pattern_values,
+    const int sys_nnz, const ValueType* const __restrict__ sys_csr_values,
+    ValueType* const __restrict__ batch_csr_mats_values)
 {
     const int gid = threadIdx.x + blockDim.x * blockIdx.x;

@@ -494,10 +492,9 @@

 template <typename ValueType>
 __global__
-__launch_bounds__(default_block_size) void initialize_b_and_x_vectors_kernel(
-    const int nbatch, const int size, const int rhs_one_idx,
-    ValueType* const __restrict__ b_vals,
-    ValueType* const __restrict__ x_vals)
+__launch_bounds__(default_block_size) void initialize_b_and_x_vectors_kernel(
+    const int nbatch, const int size, const int rhs_one_idx,
+    ValueType* const __restrict__ b_vals, ValueType* const __restrict__ x_vals)
 {
     const int gid = threadIdx.x + blockDim.x * blockIdx.x;

@@ -515,11 +512,11 @@

 template <typename ValueType>
 __global__
-__launch_bounds__(default_block_size) void write_large_sys_solution_to_inverse_kernel(
-    const int nbatch, const int lin_sys_row, const int size,
-    const ValueType* const __restrict__ x_vals, const int inv_nnz,
-    const int* const __restrict__ inv_row_ptrs,
-    ValueType* const __restrict__ inv_vals)
+__launch_bounds__(default_block_size) void write_large_sys_solution_to_inverse_kernel(
+    const int nbatch, const int lin_sys_row, const int size,
+    const ValueType* const __restrict__ x_vals, const int inv_nnz,
+    const int* const __restrict__ inv_row_ptrs,
+    ValueType* const __restrict__ inv_vals)
 {
     const int gid = threadIdx.x + blockDim.x * blockIdx.x;
     assert(size == inv_row_ptrs[lin_sys_row + 1] - inv_row_ptrs[lin_sys_row]);
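
Taken together, the kernels in this file implement the usual incomplete sparse approximate inverse construction: for each row i of the approximate inverse M, extract the dense submatrix of A restricted to that row's sparsity pattern J, then solve a small system whose right-hand side is a unit vector (rhs_one_idx marks where the 1 sits inside J). Row i of M must satisfy m^T A(J, J) = e^T, so the system actually solved is A(J, J)^T m = e; this is why a lower-triangular input yields an upper-triangular solve, as the comments above note. A dense host-side sketch of that per-row step, illustrative only (the kernels above solve these small systems per subwarp on the device):

#include <vector>

// Solve A(J, J)^T m = e for one row of the approximate inverse, where a_jj
// is the dense extraction of A on the row's pattern J. A is assumed lower
// triangular, so A^T is upper triangular: plain backward substitution.
std::vector<double> isai_row(const std::vector<std::vector<double>>& a_jj,
                             const int rhs_one_idx)
{
    const int n = static_cast<int>(a_jj.size());
    std::vector<double> m(n, 0.0);
    for (int k = n - 1; k >= 0; --k) {
        double rhs = (k == rhs_one_idx) ? 1.0 : 0.0;
        for (int j = k + 1; j < n; ++j) {
            rhs -= a_jj[j][k] * m[j];  // A^T(k, j) == A(j, k)
        }
        m[k] = rhs / a_jj[k][k];
    }
    return m;
}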