diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index fb62494135d..eaf638e04d3 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -56,6 +56,8 @@ stages:
     - |
         (( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1
     - ctest -V
+    - make install
+    - make test_install
   dependencies: []
   except:
       - schedules
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e91de8b135d..32a05209929 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,23 @@ commits. For a comprehensive list, use the following command:
 git log --first-parent
 ```
 
+## Version 1.1.1
+This version of Ginkgo provides a few fixes in Ginkgo's core routines. The
+supported systems and requirements are unchanged from version 1.1.0.
+
+### Fixes
++ Improve Ginkgo's installation and fix the `test_install` step ([#406](https://github.com/ginkgo-project/ginkgo/pull/406)),
++ Fix some documentation issues ([#406](https://github.com/ginkgo-project/ginkgo/pull/406)),
++ Fix multiple code issues reported by sonarqube ([#406](https://github.com/ginkgo-project/ginkgo/pull/406)),
++ Update the git-cmake-format repository ([#399](https://github.com/ginkgo-project/ginkgo/pull/399)),
++ Improve the global update header script ([#390](https://github.com/ginkgo-project/ginkgo/pull/390)),
++ Fix broken bounds checks ([#388](https://github.com/ginkgo-project/ginkgo/pull/388)),
++ Fix CSR strategies and improve performance ([#379](https://github.com/ginkgo-project/ginkgo/pull/379)),
++ Fix a small typo in the stencil examples ([#381](https://github.com/ginkgo-project/ginkgo/pull/381)),
++ Fix ELL error on small matrices ([#375](https://github.com/ginkgo-project/ginkgo/pull/375)),
++ Fix SellP read function ([#374](https://github.com/ginkgo-project/ginkgo/pull/374)),
++ Add factorization support in `create_new_algorithm.sh` ([#371](https://github.com/ginkgo-project/ginkgo/pull/371))
+
 ## Version 1.1.0
 
 The Ginkgo team is proud to announce the new minor release of Ginkgo version
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c94059125f..6e9af2bdd07 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.9)
 
-project(Ginkgo LANGUAGES C CXX VERSION 1.1.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures")
+project(Ginkgo LANGUAGES C CXX VERSION 1.1.1 DESCRIPTION "A numerical linear algebra library targeting many-core architectures")
 set(Ginkgo_VERSION_TAG "master")
 set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG})
 
@@ -160,8 +160,9 @@ if(GINKGO_DEVEL_TOOLS)
 endif()
 
 # Generate the global `ginkgo/ginkgo.hpp` header with every call of make
+# when bash is present and the developer tools are enabled
 find_program(BASH bash)
-if(NOT "${BASH}" STREQUAL "BASH-NOTFOUND")
+if(NOT "${BASH}" STREQUAL "BASH-NOTFOUND" AND GINKGO_DEVEL_TOOLS)
     add_custom_target(generate_ginkgo_header ALL
         COMMAND ${Ginkgo_SOURCE_DIR}/dev_tools/scripts/update_ginkgo_header.sh
         WORKING_DIRECTORY ${Ginkgo_SOURCE_DIR})
@@ -187,6 +188,7 @@ add_custom_target(test_install
     COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} -H${Ginkgo_SOURCE_DIR}/test_install
         -B${Ginkgo_BINARY_DIR}/test_install
         -DCMAKE_PREFIX_PATH=${CMAKE_INSTALL_PREFIX}/${GINKGO_INSTALL_CONFIG_DIR}
+        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
     COMMAND ${CMAKE_COMMAND} --build ${Ginkgo_BINARY_DIR}/test_install
     COMMAND ${Ginkgo_BINARY_DIR}/test_install/test_install
     COMMENT "Running a test on the installed binaries. This requires running `(sudo) make install` first.")
diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in
index 1db5cbbffa4..e944ff6e933 100644
--- a/cmake/GinkgoConfig.cmake.in
+++ b/cmake/GinkgoConfig.cmake.in
@@ -61,6 +61,8 @@ set(GINKGO_JACOBI_FULL_OPTIMIZATIONS @GINKGO_JACOBI_FULL_OPTIMIZATIONS@)
 set(GINKGO_CUDA_ARCHITECTURES @GINKGO_CUDA_ARCHITECTURES@)
 set(GINKGO_CUDA_HOST_COMPILER @CMAKE_CUDA_HOST_COMPILER@)
 
+set(GINKGO_HAVE_PAPI_SDE @GINKGO_HAVE_PAPI_SDE@)
+
 # Ginkgo external package variables
 set(GINKGO_USE_EXTERNAL_CAS "@GINKGO_USE_EXTERNAL_CAS@")
 set(GINKGO_USE_EXTERNAL_GTEST "@GINKGO_USE_EXTERNAL_GTEST@")
@@ -80,10 +82,13 @@ set(TPL_RAPIDJSON_INCLUDE_DIRS "@TPL_RAPIDJSON_INCLUDE_DIRS@")
 # Ginkgo installation configuration
 set(GINKGO_CONFIG_FILE_PATH "${CMAKE_CURRENT_LIST_DIR}")
 string(REPLACE "@GINKGO_INSTALL_CONFIG_DIR@" "" GINKGO_INSTALL_PREFIX "${GINKGO_CONFIG_FILE_PATH}")
-set(GINKGO_INSTALL_INCLUDE_DIR "@GINKGO_INSTALL_INCLUDE_DIR@")
-set(GINKGO_INSTALL_LIBRARY_DIR "@GINKGO_INSTALL_LIBRARY_DIR@")
-set(GINKGO_INSTALL_PKGCONFIG_DIR "@GINKGO_INSTALL_PKGCONFIG_DIR@")
-set(GINKGO_INSTALL_CONFIG_DIR "@GINKGO_INSTALL_CONFIG_DIR@")
+set(GINKGO_INSTALL_INCLUDE_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_INCLUDE_DIR@")
+set(GINKGO_INSTALL_LIBRARY_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_LIBRARY_DIR@")
+set(GINKGO_INSTALL_PKGCONFIG_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_PKGCONFIG_DIR@")
+set(GINKGO_INSTALL_CONFIG_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_CONFIG_DIR@")
+set(GINKGO_INSTALL_MODULE_DIR "${GINKGO_INSTALL_PREFIX}/@GINKGO_INSTALL_MODULE_DIR@")
+# Append (do not overwrite): this file runs in the scope of the project calling find_package.
+list(APPEND CMAKE_MODULE_PATH "${GINKGO_INSTALL_MODULE_DIR}")
 
 set(GINKGO_INTERFACE_LINK_LIBRARIES "@GINKGO_INTERFACE_LINK_LIBRARIES@")
 set(GINKGO_INTERFACE_LINK_FLAGS "@GINKGO_INTERFACE_LINK_FLAGS@")
@@ -111,6 +116,8 @@ set(GINKGO_OPENMP_LIBRARIES @OpenMP_CXX_LIBRARIES@)
 
 set(GINKGO_OPENMP_FLAGS "@OpenMP_CXX_FLAGS@")
 
+# Modulepath configuration
+
 # NOTE: we do not export benchmarks, examples, tests or devel tools
 #     so `third_party` libraries are currently unneeded.
 
diff --git a/cmake/Modules/FindPAPI.cmake b/cmake/Modules/FindPAPI.cmake
index 15f5b5cd38a..3e16af2e125 100644
--- a/cmake/Modules/FindPAPI.cmake
+++ b/cmake/Modules/FindPAPI.cmake
@@ -68,30 +68,31 @@ if(PAPI_INCLUDE_DIR)
 
     if (PAPI_LIBRARY)
         # find the components
-       foreach(component IN LISTS PAPI_FIND_COMPONENTS)
-           file(WRITE "${CMAKE_BINARY_DIR}/papi_${component}_detect.c"
-             "
-             #include <papi.h>
-             int main() {
-              int retval;
-              retval = PAPI_library_init(PAPI_VER_CURRENT);
-                if (retval != PAPI_VER_CURRENT && retval > 0)
-                 return -1;
-                if (PAPI_get_component_index(\"${component}\") < 0)
-                 return 0;
-                return 1;
-             }
-            ")
-        try_run(PAPI_${component}_FOUND
-            gko_result_unused
-            "${CMAKE_BINARY_DIR}"
-            "${CMAKE_BINARY_DIR}/papi_${component}_detect.c"
-            LINK_LIBRARIES ${PAPI_LIBRARY}
-            )
+        enable_language(C)
+        foreach(component IN LISTS PAPI_FIND_COMPONENTS)
+            file(WRITE "${CMAKE_BINARY_DIR}/papi_${component}_detect.c"
+                "
+                #include <papi.h>
+                int main() {
+                 int retval;
+                 retval = PAPI_library_init(PAPI_VER_CURRENT);
+                   if (retval != PAPI_VER_CURRENT && retval > 0)
+                    return -1;
+                   if (PAPI_get_component_index(\"${component}\") < 0)
+                    return 0;
+                   return 1;
+                }"
+                )
+            try_run(PAPI_${component}_FOUND
+                gko_result_unused
+                "${CMAKE_BINARY_DIR}"
+                "${CMAKE_BINARY_DIR}/papi_${component}_detect.c"
+                LINK_LIBRARIES ${PAPI_LIBRARY}
+                )
 
-        if (NOT PAPI_${component}_FOUND EQUAL 1)
-            unset(PAPI_${component}_FOUND)
-        endif()
+            if (NOT PAPI_${component}_FOUND EQUAL 1)
+                unset(PAPI_${component}_FOUND)
+            endif()
         endforeach()
     endif()
 endif()
diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake
index 76125a761a6..8a8ad047d56 100644
--- a/cmake/build_helpers.cmake
+++ b/cmake/build_helpers.cmake
@@ -94,4 +94,4 @@ endmacro()
 
 macro(ginkgo_switch_to_windows_dynamic lang)
     ginkgo_switch_windows_link(${lang} "MT" "MD")
-endmacro()
\ No newline at end of file
+endmacro()
diff --git a/cmake/install_helpers.cmake b/cmake/install_helpers.cmake
index 1b9f73b1c69..fd0c90d383f 100644
--- a/cmake/install_helpers.cmake
+++ b/cmake/install_helpers.cmake
@@ -6,6 +6,7 @@ set(GINKGO_INSTALL_INCLUDE_DIR "include")
 set(GINKGO_INSTALL_LIBRARY_DIR "lib")
 set(GINKGO_INSTALL_PKGCONFIG_DIR "lib/pkgconfig")
 set(GINKGO_INSTALL_CONFIG_DIR "lib/cmake/Ginkgo")
+set(GINKGO_INSTALL_MODULE_DIR "lib/cmake/Ginkgo/Modules")
 
 function(ginkgo_install_library name subdir)
     # install .so and .a files
@@ -33,6 +34,9 @@ function(ginkgo_install)
         install(FILES "${Ginkgo_SOURCE_DIR}/third_party/papi_sde/papi_sde_interface.h"
             DESTINATION "${GINKGO_INSTALL_INCLUDE_DIR}/third_party/papi_sde"
             )
+        install(FILES "${Ginkgo_SOURCE_DIR}/cmake/Modules/FindPAPI.cmake"
+            DESTINATION "${GINKGO_INSTALL_MODULE_DIR}/"
+            )
     endif()
 
     # export targets
diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index 77ed59ca9a0..884b85425b3 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -43,9 +43,9 @@ namespace gko {
 
 version version_info::get_cuda_version() noexcept
 {
-    // We just return 1.1.0 with a special "not compiled" tag in placeholder
-    // modules.
-    return {1, 1, 0, "not compiled"};
+    // We just return the version with a special "not compiled" tag in
+    // placeholder modules.
+    return {GKO_VERSION_STR, "not compiled"};
 }
 
 
diff --git a/core/device_hooks/omp_hooks.cpp b/core/device_hooks/omp_hooks.cpp
index fdf598ab5eb..4fb251758a8 100644
--- a/core/device_hooks/omp_hooks.cpp
+++ b/core/device_hooks/omp_hooks.cpp
@@ -38,9 +38,9 @@ namespace gko {
 
 version version_info::get_omp_version() noexcept
 {
-    // We just return 1.1.0 with a special "not compiled" tag in placeholder
-    // modules.
-    return {1, 1, 0, "not compiled"};
+    // We just return the version with a special "not compiled" tag in
+    // placeholder modules.
+    return {GKO_VERSION_STR, "not compiled"};
 }
 
 
diff --git a/core/device_hooks/reference_hooks.cpp b/core/device_hooks/reference_hooks.cpp
index 31e51fd0d27..7e7ab287ca5 100644
--- a/core/device_hooks/reference_hooks.cpp
+++ b/core/device_hooks/reference_hooks.cpp
@@ -39,9 +39,9 @@ namespace gko {
 
 version version_info::get_reference_version() noexcept
 {
-    // We just return 1.1.0 with a special "not compiled" tag in placeholder
-    // modules.
-    return {1, 1, 0, "not compiled"};
+    // We just return the version with a special "not compiled" tag in
+    // placeholder modules.
+    return {GKO_VERSION_STR, "not compiled"};
 }
 
 
diff --git a/core/devices/cuda/executor.cpp b/core/devices/cuda/executor.cpp
index 0be14ce9fe4..b377b2afa94 100644
--- a/core/devices/cuda/executor.cpp
+++ b/core/devices/cuda/executor.cpp
@@ -48,7 +48,7 @@ std::shared_ptr<const Executor> CudaExecutor::get_master() const noexcept
 }
 
 
-int CudaExecutor::num_execs[max_devices];
+unsigned CudaExecutor::num_execs[max_devices];
 
 
 std::mutex CudaExecutor::mutex[max_devices];
diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp
index a9dddadf5ab..73d001d6dba 100644
--- a/core/matrix/coo.cpp
+++ b/core/matrix/coo.cpp
@@ -108,10 +108,13 @@ void Coo<ValueType, IndexType>::convert_to(
     Csr<ValueType, IndexType> *result) const
 {
     auto exec = this->get_executor();
-    auto tmp = Csr<ValueType, IndexType>::create(exec, this->get_size());
+    auto tmp = Csr<ValueType, IndexType>::create(
+        exec, this->get_size(), this->get_num_stored_elements(),
+        result->get_strategy());
     tmp->values_ = this->values_;
     tmp->col_idxs_ = this->col_idxs_;
     exec->run(coo::make_convert_to_csr(tmp.get(), this));
+    tmp->make_srow();
     tmp->move_to(result);
 }
 
@@ -120,10 +123,13 @@ template <typename ValueType, typename IndexType>
 void Coo<ValueType, IndexType>::move_to(Csr<ValueType, IndexType> *result)
 {
     auto exec = this->get_executor();
-    auto tmp = Csr<ValueType, IndexType>::create(exec, this->get_size());
+    auto tmp = Csr<ValueType, IndexType>::create(
+        exec, this->get_size(), this->get_num_stored_elements(),
+        result->get_strategy());
     tmp->values_ = std::move(this->values_);
     tmp->col_idxs_ = std::move(this->col_idxs_);
     exec->run(coo::make_convert_to_csr(tmp.get(), this));
+    tmp->make_srow();
     tmp->move_to(result);
 }
 
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index 72c23343857..b284593d21a 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -108,12 +108,21 @@ inline void conversion_helper(Csr<ValueType, IndexType> *result,
 {
     auto exec = source->get_executor();
 
-    size_type num_stored_nonzeros = 0;
-    exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros));
-    auto tmp = Csr<ValueType, IndexType>::create(
-        exec, source->get_size(), num_stored_nonzeros, result->get_strategy());
-    exec->run(op(tmp.get(), source));
-    tmp->move_to(result);
+    if (source->get_size()) {
+        size_type num_stored_nonzeros = 0;
+        exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros));
+        auto tmp = Csr<ValueType, IndexType>::create(exec, source->get_size(),
+                                                     num_stored_nonzeros,
+                                                     result->get_strategy());
+        exec->run(op(tmp.get(), source));
+        tmp->move_to(result);
+    }
+    // If source is empty, there is no need to copy data or to call kernels
+    else {
+        auto tmp =
+            Csr<ValueType, IndexType>::create(exec, result->get_strategy());
+        tmp->move_to(result);
+    }
 }
 
 
diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
index a4b64f5a82c..1d4bed57288 100644
--- a/core/matrix/ell.cpp
+++ b/core/matrix/ell.cpp
@@ -134,10 +134,11 @@ void Ell<ValueType, IndexType>::convert_to(
     size_type num_stored_elements = 0;
     exec->run(ell::make_count_nonzeros(this, &num_stored_elements));
 
-    auto tmp = Csr<ValueType, IndexType>::create(exec, this->get_size(),
-                                                 num_stored_elements);
+    auto tmp = Csr<ValueType, IndexType>::create(
+        exec, this->get_size(), num_stored_elements, result->get_strategy());
     exec->run(ell::make_convert_to_csr(tmp.get(), this));
 
+    tmp->make_srow();
     tmp->move_to(result);
 }
 
diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp
index cf956c9d0d8..40a50a377ea 100644
--- a/core/matrix/hybrid.cpp
+++ b/core/matrix/hybrid.cpp
@@ -135,10 +135,11 @@ void Hybrid<ValueType, IndexType>::convert_to(
     size_type num_stored_elements = 0;
     exec->run(hybrid::make_count_nonzeros(this, &num_stored_elements));
 
-    auto tmp = Csr<ValueType, IndexType>::create(exec, this->get_size(),
-                                                 num_stored_elements);
+    auto tmp = Csr<ValueType, IndexType>::create(
+        exec, this->get_size(), num_stored_elements, result->get_strategy());
     exec->run(hybrid::make_convert_to_csr(tmp.get(), this));
 
+    tmp->make_srow();
     tmp->move_to(result);
 }
 
@@ -161,8 +162,9 @@ void Hybrid<ValueType, IndexType>::read(const mat_data &data)
     get_each_row_nnz(data, row_nnz);
     strategy_->compute_hybrid_config(row_nnz, &ell_lim, &coo_lim);
 
-    auto tmp = Hybrid::create(this->get_executor()->get_master(), data.size,
-                              ell_lim, data.size[0], coo_lim);
+    auto tmp =
+        Hybrid::create(this->get_executor()->get_master(), data.size, ell_lim,
+                       data.size[0], coo_lim, this->get_strategy());
 
     // Get values and column indexes.
     size_type ind = 0;
diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp
index 67552874cf8..880678b3738 100644
--- a/core/matrix/sellp.cpp
+++ b/core/matrix/sellp.cpp
@@ -76,6 +76,12 @@ size_type calculate_total_cols(const matrix_data<ValueType, IndexType> &data,
     IndexType current_slice = 0;
     size_type total_cols = 0;
     for (const auto &elem : data.nonzeros) {
+        if (elem.row != current_row) {
+            current_row = elem.row;
+            slice_lengths[current_slice] =
+                max(slice_lengths[current_slice], nonzeros_per_row);
+            nonzeros_per_row = 0;
+        }
         if (elem.row / slice_size != current_slice) {
             slice_lengths[current_slice] =
                 stride_factor *
@@ -83,12 +89,6 @@ size_type calculate_total_cols(const matrix_data<ValueType, IndexType> &data,
             total_cols += slice_lengths[current_slice];
             current_slice = elem.row / slice_size;
         }
-        if (elem.row != current_row) {
-            current_row = elem.row;
-            slice_lengths[current_slice] =
-                max(slice_lengths[current_slice], nonzeros_per_row);
-            nonzeros_per_row = 0;
-        }
         nonzeros_per_row += (elem.value != zero<ValueType>());
     }
     slice_lengths[current_slice] =
@@ -147,9 +147,10 @@ void Sellp<ValueType, IndexType>::convert_to(
 
     size_type num_stored_nonzeros = 0;
     exec->run(sellp::make_count_nonzeros(this, &num_stored_nonzeros));
-    auto tmp = Csr<ValueType, IndexType>::create(exec, this->get_size(),
-                                                 num_stored_nonzeros);
+    auto tmp = Csr<ValueType, IndexType>::create(
+        exec, this->get_size(), num_stored_nonzeros, result->get_strategy());
     exec->run(sellp::make_convert_to_csr(tmp.get(), this));
+    tmp->make_srow();
     tmp->move_to(result);
 }
 
diff --git a/core/test/base/exception_helpers.cpp b/core/test/base/exception_helpers.cpp
index a6ac533b545..dd013835300 100644
--- a/core/test/base/exception_helpers.cpp
+++ b/core/test/base/exception_helpers.cpp
@@ -55,7 +55,7 @@ TEST(NotCompiled, ThrowsWhenUsed)
 }
 
 
-void does_not_support_int() { throw GKO_NOT_SUPPORTED(int); }
+void does_not_support_int() { GKO_NOT_SUPPORTED(int); }
 
 TEST(NotSupported, ReturnsNotSupportedException)
 {
@@ -131,6 +131,18 @@ TEST(AssertConformant, ThrowsWhenNotConformant)
 }
 
 
+TEST(AssertEqual, DoesNotThrowWhenEqual)
+{
+    ASSERT_NO_THROW(GKO_ASSERT_EQ(1, 1));
+}
+
+
+TEST(AssertEqual, ThrowsWhenNotEqual)
+{
+    ASSERT_THROW(GKO_ASSERT_EQ(0, 1), gko::ValueMismatch);
+}
+
+
 TEST(AssertEqualRows, DoesNotThrowWhenEqualRowSize)
 {
     ASSERT_NO_THROW(
diff --git a/core/test/matrix/csr.cpp b/core/test/matrix/csr.cpp
index bd730edbea6..155f2c8ce21 100644
--- a/core/test/matrix/csr.cpp
+++ b/core/test/matrix/csr.cpp
@@ -127,11 +127,13 @@ TEST_F(Csr, CanBeCreatedFromExistingData)
     double values[] = {1.0, 2.0, 3.0, 4.0};
     gko::int32 col_idxs[] = {0, 1, 1, 0};
     gko::int32 row_ptrs[] = {0, 2, 3, 4};
+
     auto mtx = gko::matrix::Csr<>::create(
         exec, gko::dim<2>{3, 2}, gko::Array<double>::view(exec, 4, values),
         gko::Array<gko::int32>::view(exec, 4, col_idxs),
         gko::Array<gko::int32>::view(exec, 4, row_ptrs),
         std::make_shared<Mtx::load_balance>(2));
+
     ASSERT_EQ(mtx->get_num_srow_elements(), 1);
     ASSERT_EQ(mtx->get_const_values(), values);
     ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs);
@@ -183,6 +185,7 @@ TEST_F(Csr, CanBeCleared)
 TEST_F(Csr, CanBeReadFromMatrixData)
 {
     auto m = Mtx::create(exec, std::make_shared<Mtx::load_balance>(2));
+
     m->read({{2, 3},
              {{0, 0, 1.0},
               {0, 1, 3.0},
diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
index 48561d475a8..d0b46c83c51 100644
--- a/cuda/matrix/csr_kernels.cu
+++ b/cuda/matrix/csr_kernels.cu
@@ -673,6 +673,8 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
                 as_cuda_type(b->get_const_values()),
                 as_cuda_type(b->get_stride()), as_cuda_type(c->get_values()),
                 as_cuda_type(c->get_stride()));
+        } else {
+            GKO_NOT_SUPPORTED(nwarps);
         }
     } else if (a->get_strategy()->get_name() == "merge_path") {
         int items_per_thread =
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index cc82d30a240..76abffe5859 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -85,10 +85,11 @@ constexpr double ratio = 1e-2;
 /**
  * A compile-time list of sub-warp sizes for which the spmv kernels should be
  * compiled.
- * 0 is a special case where it uses a sub-warp size of 32 in
+ * 0 is a special case where it uses a sub-warp size of warp_size in
  * combination with atomic_adds.
  */
-using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
+using compiled_kernels =
+    syn::value_list<int, 0, 1, 2, 4, 8, 16, 32, cuda_config::warp_size>;
 
 
 namespace kernel {
@@ -97,7 +98,7 @@ namespace {
 
 template <int subwarp_size, bool atomic, typename ValueType, typename IndexType,
           typename Closure>
-__device__ void spmv_kernel(const size_type num_rows,
+__device__ void spmv_kernel(const size_type num_rows, const int nwarps_per_row,
                             const ValueType *__restrict__ val,
                             const IndexType *__restrict__ col,
                             const size_type stride,
@@ -108,9 +109,7 @@ __device__ void spmv_kernel(const size_type num_rows,
 {
     const auto tidx =
         static_cast<IndexType>(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto nwarps_per_row =
-        gridDim.x * blockDim.x / num_rows / subwarp_size;
-    const auto x = tidx / subwarp_size / nwarps_per_row;
+    const IndexType x = tidx / subwarp_size / nwarps_per_row;
     const auto warp_id = tidx / subwarp_size % nwarps_per_row;
     const auto y_start = tidx % subwarp_size +
                          num_stored_elements_per_row * warp_id / nwarps_per_row;
@@ -148,24 +147,26 @@ __device__ void spmv_kernel(const size_type num_rows,
 template <int subwarp_size, bool atomic = false, typename ValueType,
           typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const ValueType *__restrict__ val,
-    const IndexType *__restrict__ col, const size_type stride,
-    const size_type num_stored_elements_per_row,
+    const size_type num_rows, const int nwarps_per_row,
+    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const size_type stride, const size_type num_stored_elements_per_row,
     const ValueType *__restrict__ b, const size_type b_stride,
     ValueType *__restrict__ c, const size_type c_stride)
 {
     spmv_kernel<subwarp_size, atomic>(
-        num_rows, val, col, stride, num_stored_elements_per_row, b, b_stride, c,
-        c_stride, [](const ValueType &x, const ValueType &y) { return x; });
+        num_rows, nwarps_per_row, val, col, stride, num_stored_elements_per_row,
+        b, b_stride, c, c_stride,
+        [](const ValueType &x, const ValueType &y) { return x; });
 }
 
 
 template <int subwarp_size, bool atomic = false, typename ValueType,
           typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const ValueType *__restrict__ alpha,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
-    const size_type stride, const size_type num_stored_elements_per_row,
+    const size_type num_rows, const int nwarps_per_row,
+    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
+    const IndexType *__restrict__ col, const size_type stride,
+    const size_type num_stored_elements_per_row,
     const ValueType *__restrict__ b, const size_type b_stride,
     const ValueType *__restrict__ beta, ValueType *__restrict__ c,
     const size_type c_stride)
@@ -178,15 +179,15 @@ __global__ __launch_bounds__(default_block_size) void spmv(
     // operation.
     if (atomic) {
         spmv_kernel<subwarp_size, atomic>(
-            num_rows, val, col, stride, num_stored_elements_per_row, b,
-            b_stride, c, c_stride,
+            num_rows, nwarps_per_row, val, col, stride,
+            num_stored_elements_per_row, b, b_stride, c, c_stride,
             [&alpha_val](const ValueType &x, const ValueType &y) {
                 return alpha_val * x;
             });
     } else {
         spmv_kernel<subwarp_size, atomic>(
-            num_rows, val, col, stride, num_stored_elements_per_row, b,
-            b_stride, c, c_stride,
+            num_rows, nwarps_per_row, val, col, stride,
+            num_stored_elements_per_row, b, b_stride, c, c_stride,
             [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
                 return alpha_val * x + beta_val * y;
             });
@@ -210,7 +211,7 @@ void abstract_spmv(syn::value_list<int, info>, int nwarps_per_row,
                    const matrix::Dense<ValueType> *beta = nullptr)
 {
     const auto nrows = a->get_size()[0];
-    constexpr int subwarp_size = (info == 0) ? 32 : info;
+    constexpr int subwarp_size = (info == 0) ? cuda_config::warp_size : info;
     constexpr bool atomic = (info == 0);
     const dim3 block_size(default_block_size, 1, 1);
     const dim3 grid_size(
@@ -218,13 +219,14 @@ void abstract_spmv(syn::value_list<int, info>, int nwarps_per_row,
         b->get_size()[1], 1);
     if (alpha == nullptr && beta == nullptr) {
         kernel::spmv<subwarp_size, atomic><<<grid_size, block_size, 0, 0>>>(
-            nrows, as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
-            a->get_stride(), a->get_num_stored_elements_per_row(),
+            nrows, nwarps_per_row, as_cuda_type(a->get_const_values()),
+            a->get_const_col_idxs(), a->get_stride(),
+            a->get_num_stored_elements_per_row(),
             as_cuda_type(b->get_const_values()), b->get_stride(),
             as_cuda_type(c->get_values()), c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
         kernel::spmv<subwarp_size, atomic><<<grid_size, block_size, 0, 0>>>(
-            nrows, as_cuda_type(alpha->get_const_values()),
+            nrows, nwarps_per_row, as_cuda_type(alpha->get_const_values()),
             as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
             a->get_stride(), a->get_num_stored_elements_per_row(),
             as_cuda_type(b->get_const_values()), b->get_stride(),
@@ -255,16 +257,17 @@ std::array<int, 3> compute_subwarp_size_and_atomicity(
     // Use multithreads to perform the reduction on each row when the matrix is
     // wide.
     // To make every thread have computation, so pick the value which is the
-    // power of 2 less than 32 and is less than or equal to ell_ncols. If the
-    // subwarp_size is 32 and allow more than one warps to work on the same row,
-    // use atomic add to handle the warps write the value into the same
-    // position. The #warps is decided according to the number of warps allowed
-    // on GPU.
+    // power of 2 that is at most warp_size and at most ell_ncols. If the
+    // subwarp_size is warp_size and more than one warp is allowed to work on
+    // the same row, use atomic add to handle the warps writing their values
+    // into the same position. The #warps is decided according to the number
+    // of warps allowed on GPU.
     if (static_cast<double>(ell_ncols) / nrows > ratio) {
-        while (subwarp_size < 32 && (subwarp_size << 1) <= ell_ncols) {
+        while (subwarp_size < cuda_config::warp_size &&
+               (subwarp_size << 1) <= ell_ncols) {
             subwarp_size <<= 1;
         }
-        if (subwarp_size == 32) {
+        if (subwarp_size == cuda_config::warp_size) {
             nwarps_per_row =
                 std::min(ell_ncols / cuda_config::warp_size, nwarps / nrows);
             nwarps_per_row = std::max(nwarps_per_row, 1);
@@ -292,8 +295,8 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
 
     /**
      * info is the parameter for selecting the cuda kernel.
-     * for info == 0, it uses the kernel by 32 threads with atomic operation
-     * for other value, it uses the kernel without atomic_add
+     * for info == 0, it uses the kernel by warp_size threads with atomic
+     * operation; for other values, it uses the kernel without atomic_add
      */
     const int info = (!atomic) * subwarp_size;
     if (atomic) {
@@ -323,8 +326,8 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
 
     /**
      * info is the parameter for selecting the cuda kernel.
-     * for info == 0, it uses the kernel by 32 threads with atomic operation
-     * for other value, it uses the kernel without atomic_add
+     * for info == 0, it uses the kernel by warp_size threads with atomic
+     * operation; for other values, it uses the kernel without atomic_add
      */
     const int info = (!atomic) * subwarp_size;
     if (atomic) {
diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp
index 82e3d43d232..ff4ae0b8b88 100644
--- a/cuda/test/matrix/ell_kernels.cpp
+++ b/cuda/test/matrix/ell_kernels.cpp
@@ -82,8 +82,8 @@ class Ell : public ::testing::Test {
     }
 
     void set_up_apply_data(int num_rows = 532, int num_cols = 231,
-                           int num_stored_elements_per_row = 0, int stride = 0,
-                           int num_vectors = 1)
+                           int num_vectors = 1,
+                           int num_stored_elements_per_row = 0, int stride = 0)
     {
         mtx = Mtx::create(ref, gko::dim<2>{}, num_stored_elements_per_row,
                           stride);
@@ -148,7 +148,7 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 
 TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 300, 600);
+    set_up_apply_data(532, 231, 1, 300, 600);
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -159,7 +159,7 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 
 TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 300, 600);
+    set_up_apply_data(532, 231, 1, 300, 600);
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
 
@@ -169,7 +169,7 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 
 TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 300, 600, 3);
+    set_up_apply_data(532, 231, 3, 300, 600);
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -180,7 +180,7 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 
 TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 300, 600, 3);
+    set_up_apply_data(532, 231, 3, 300, 600);
 
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
@@ -211,6 +211,72 @@ TEST_F(Ell, AdvancedByAtomicApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, SimpleApplyByAtomicToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(10, 10000, 3);
+
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedByAtomicToDenseMatrixApplyIsEquivalentToRef)
+{
+    set_up_apply_data(10, 10000, 3);
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, SimpleApplyOnSmallMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(1, 10);
+
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedApplyOnSmallMatrixToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(1, 10, 3);
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, SimpleApplyOnSmallMatrixToDenseMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(1, 10, 3);
+
+    mtx->apply(y.get(), expected.get());
+    dmtx->apply(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(1, 10);
+
+    mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
 TEST_F(Ell, ConvertToDenseIsEquivalentToRef)
 {
     set_up_apply_data();
diff --git a/dev_tools/scripts/create_new_algorithm.sh b/dev_tools/scripts/create_new_algorithm.sh
index 819f27c60ea..24b37b475fe 100755
--- a/dev_tools/scripts/create_new_algorithm.sh
+++ b/dev_tools/scripts/create_new_algorithm.sh
@@ -22,7 +22,7 @@ function print_help {
 }
 
 function list_sources {
-    for type in solver preconditioner matrix
+    for type in solver preconditioner matrix factorization
     do
         for i in $(ls $GINKGO_ROOT_DIR/core/$type/*.cpp)
         do
@@ -74,7 +74,7 @@ name=${name,,}
 Name=${name^}
 NAME=${name^^}
 
-if [[ "$name" == "" ]] || ( [[ "$source_type" != "preconditioner" ]] && [[ "$source_type" != "matrix" ]] && [[ "$source_type" != "solver" ]] ) || [[ "$source_name" == "" ]]; then
+if [[ "$name" == "" ]] || ( [[ "$source_type" != "preconditioner" ]] && [[ "$source_type" != "matrix" ]] && [[ "$source_type" != "solver" ]] && [[ "$source_type" != "factorization" ]] ) || [[ "$source_name" == "" ]]; then
     print_help
     exit 1
 fi
diff --git a/dev_tools/scripts/update_ginkgo_header.sh b/dev_tools/scripts/update_ginkgo_header.sh
index d986887aba4..df78ba7794e 100755
--- a/dev_tools/scripts/update_ginkgo_header.sh
+++ b/dev_tools/scripts/update_ginkgo_header.sh
@@ -1,7 +1,17 @@
 #!/usr/bin/env bash
 
+# Note: This script is supposed to support the developer and not to hinder
+#       the development process by setting more restrictions.
+#       This is the reason why every exit code is 0; otherwise, the whole
+#       `make` procedure would fail.
+
 PLACE_HOLDER="#PUBLIC_HEADER_PLACE_HOLDER"
 
+WARNING_PREFIX="[WARNING] ginkgo.hpp update script failed because:"
+
+RM_PARAMETER="-f"
+
+
 THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
 INCLUDE_DIR="${THIS_DIR}/../../include"
 
@@ -17,28 +27,29 @@ HEADER_LIST="global_includes.hpp.tmp"
 # Test if required commands are present on the system:
 command -v find &> /dev/null
 if [ ${?} -ne 0 ]; then
-    echo 'The command `find` is required for this script to work, but not supported by your system.' 1>&2
-    exit 1
+    echo "${WARNING_PREFIX} "'The command `find` is required for this script to work, but not supported by your system.' 1>&2
+    exit 0
 fi
 command -v sort &> /dev/null
 if [ ${?} -ne 0 ]; then
-    echo 'The command `sort` is required for this script to work, but not supported by your system.' 1>&2
-    exit 1
+    echo "${WARNING_PREFIX} "'The command `sort` is required for this script to work, but not supported by your system.' 1>&2
+    exit 0
 fi
 command -v cmp &> /dev/null
 if [ ${?} -ne 0 ]; then
-    echo 'The command `cmp` is required for this script to work, but not supported by your system.' 1>&2
-    exit 1
+    echo "${WARNING_PREFIX} "'The command `cmp` is required for this script to work, but not supported by your system.' 1>&2
+    exit 0
 fi
 
+
 # Put all header files as a list (separated by newlines) in the file ${HEADER_LIST}
 # Requires detected files (including the path) to not contain newlines
 find "${TOP_HEADER_FOLDER}" -name '*.hpp' -type f -print > "${HEADER_LIST}"
 
 if [ ${?} -ne 0 ]; then
-    echo 'Exiting due to an error being returned by `find`!' 1>&2
-    rm "${HEADER_LIST}"
-    exit 1
+    echo "${WARNING_PREFIX} "'The `find` command returned with an error!' 1>&2
+    rm "${RM_PARAMETER}" "${HEADER_LIST}"
+    exit 0
 fi
 
 # It must be a POSIX locale in order to sort according to ASCII
@@ -47,14 +58,20 @@ export LC_ALL=C
 sort -o "${HEADER_LIST}" "${HEADER_LIST}"
 
 if [ ${?} -ne 0 ]; then
-    echo 'Exiting due to an error being returned by `sort`!' 1>&2
-    rm "${HEADER_LIST}"
-    exit 1
+    echo "${WARNING_PREFIX} "'The `sort` command returned with an error!' 1>&2
+    rm "${RM_PARAMETER}" "${HEADER_LIST}"
+    exit 0
+fi
+
+if [ ! -r "${GINKGO_HEADER_TEMPLATE_FILE}" ]; then
+    echo "${WARNING_PREFIX} The file '${GINKGO_HEADER_TEMPLATE_FILE}' can not be read!" 1>&2
+    rm "${RM_PARAMETER}" "${HEADER_LIST}"
+    exit 0
 fi
 
 # Detect the end of line type (CRLF/LF) by ${GINKGO_HEADER_TEMPLATE_FILE}
 END="";
-if [[ "$(file ${GINKGO_HEADER_TEMPLATE_FILE})" == *"CRLF"* ]]; then
+if [[ "$(file "${GINKGO_HEADER_TEMPLATE_FILE}")" == *"CRLF"* ]]; then
     END="\r"
 fi
 
@@ -64,6 +81,18 @@ fi
 # (e.g. benchmarks and examples)
 GINKGO_HEADER_TMP="${GINKGO_HEADER_FILE}.tmp"
 
+# See if we have write permissions to ${GINKGO_HEADER_TMP}
+echo "Test for write permissions" > "${GINKGO_HEADER_TMP}"
+if [ ${?} -ne 0 ]; then
+    echo "${WARNING_PREFIX} No write permissions for temporary file '${GINKGO_HEADER_TMP}'!" 1>&2
+    rm "${RM_PARAMETER}" "${HEADER_LIST}"
+    exit 0
+fi
+# Remove file again, so the test does not corrupt the result
+rm "${RM_PARAMETER}" "${GINKGO_HEADER_TMP}"
+
+
+
 PREVIOUS_FOLDER=""
 # "IFS=''" sets the word delimiters for read.
 # An empty ${IFS} means the given name (after `read`) will be set to the whole line,
@@ -101,14 +130,19 @@ while IFS='' read -r line; do
     fi
 done < "${GINKGO_HEADER_TEMPLATE_FILE}"
 
+rm "${RM_PARAMETER}" "${HEADER_LIST}"
+
 # Use the generated file ONLY when the public header does not exist yet
 # or the generated one is different to the existing one
 if [ ! -f "${GINKGO_HEADER_FILE}" ] || \
    ! cmp -s "${GINKGO_HEADER_TMP}" "${GINKGO_HEADER_FILE}"
 then
     mv "${GINKGO_HEADER_TMP}" "${GINKGO_HEADER_FILE}"
+    if [ ${?} -ne 0 ]; then
+        echo "${WARNING_PREFIX} No permission to replace the header '${GINKGO_HEADER_FILE}'!" 1>&2
+        rm "${RM_PARAMETER}" "${GINKGO_HEADER_TMP}"
+        exit 0
+    fi
 else
-    rm "${GINKGO_HEADER_TMP}"
+    rm "${RM_PARAMETER}" "${GINKGO_HEADER_TMP}"
 fi
-
-rm "${HEADER_LIST}"
diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml
index d873dd9ee2d..4c25a288a38 100644
--- a/doc/DoxygenLayout.xml
+++ b/doc/DoxygenLayout.xml
@@ -3,7 +3,7 @@
   <!-- Navigation index tabs for HTML output -->
   <navindex>
     <tab type="mainpage" visible="yes" title=""/>
-    <tab type="user" visible="yes" title="Tutorial" url="https://github.com/ginkgo-project/ginkgo/wiki/Tutorial:-Building-a-2D-Poisson-Solver" />
+    <tab type="user" visible="yes" title="Tutorial" url="https://github.com/ginkgo-project/ginkgo/wiki/Tutorial:-Building-a-Poisson-Solver" />
     <tab type="user" visible="yes" title="Examples" url="@ref Examples" />
     <tab type="user" visible="yes" title="Using Ginkgo" url="@ref install_ginkgo">
       <tab type="user" visible="yes" title="Installing Ginkgo" url="@ref install_ginkgo" />
diff --git a/doc/headers/jacobi.hpp b/doc/headers/jacobi.hpp
new file mode 100644
index 00000000000..8e406d75fea
--- /dev/null
+++ b/doc/headers/jacobi.hpp
@@ -0,0 +1,40 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2019, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @defgroup jacobi Jacobi Preconditioner
+ *
+ * @brief A module dedicated to the implementation and usage of the
+ * Jacobi Preconditioner in Ginkgo.
+ *
+ * @ingroup precond
+ */
diff --git a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
index a2accaded3e..6600e7291ba 100644
--- a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
+++ b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-/*****************************<DECSRIPTION>***********************************
+/*****************************<DESCRIPTION>***********************************
 This example solves a 2D Poisson equation:
 
     \Omega = (0,1)^2
@@ -67,7 +67,7 @@ additional parameters.
 
 The intention of this is to show how generation of stencil values and the right
 hand side vector changes when increasing the dimension.
-*****************************<DECSRIPTION>**********************************/
+*****************************<DESCRIPTION>**********************************/
 
 #include <array>
 #include <chrono>
diff --git a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
index 7da78bd0f0d..339f4239519 100644
--- a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
+++ b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-/*****************************<DECSRIPTION>***********************************
+/*****************************<DESCRIPTION>***********************************
 This example solves a 1D Poisson equation:
 
     u : [0, 1] -> R
@@ -67,7 +67,7 @@ existing software - the `generate_stencil_matrix`, `generate_rhs`,
 all (i.e. they could have been there before the application developer decided to
 use Ginkgo, and the only part where Ginkgo is introduced is inside the
 `solve_system` function.
-*****************************<DECSRIPTION>**********************************/
+*****************************<DESCRIPTION>**********************************/
 
 #include <ginkgo/ginkgo.hpp>
 #include <iostream>
diff --git a/examples/twentyseven-pt-stencil-solver/twentyseven-pt-stencil-solver.cpp b/examples/twentyseven-pt-stencil-solver/twentyseven-pt-stencil-solver.cpp
index 78e31e4a8e9..f319ed35513 100644
--- a/examples/twentyseven-pt-stencil-solver/twentyseven-pt-stencil-solver.cpp
+++ b/examples/twentyseven-pt-stencil-solver/twentyseven-pt-stencil-solver.cpp
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-/*****************************<DECSRIPTION>***********************************
+/*****************************<DESCRIPTION>***********************************
 This example solves a 3D Poisson equation:
 
     \Omega = (0,1)^3
@@ -68,7 +68,7 @@ changed when passing additional parameters.
 
 The intention of this is to show how generation of stencil values and the right
 hand side vector changes when increasing the dimension.
-*****************************<DECSRIPTION>**********************************/
+*****************************<DESCRIPTION>**********************************/
 
 #include <array>
 #include <chrono>
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index 31ec295d395..aeb8cb30712 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_VERSION_MINOR @Ginkgo_VERSION_MINOR@
 #define GKO_VERSION_PATCH @Ginkgo_VERSION_PATCH@
 #define GKO_VERSION_TAG "@Ginkgo_VERSION_TAG@"
+#define GKO_VERSION_STR @Ginkgo_VERSION_MAJOR@, @Ginkgo_VERSION_MINOR@, @Ginkgo_VERSION_PATCH@
 // clang-format on
 
 /*
diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp
index 3a58e253595..1855a9dc5c7 100644
--- a/include/ginkgo/core/base/exception.hpp
+++ b/include/ginkgo/core/base/exception.hpp
@@ -305,6 +305,31 @@ class BadDimension : public Error {
 };
 
 
+/**
+ * ValueMismatch is thrown if two values are not equal.
+ */
+class ValueMismatch : public Error {
+public:
+    /**
+     * Initializes a value mismatch error.
+     *
+     * @param file The name of the offending source file
+     * @param line The source code line number where the error occurred
+     * @param func The function name where the error occurred
+     * @param val1 The first value to be compared.
+     * @param val2 The second value to be compared.
+     * @param clarification An additional message further describing the error
+     */
+    ValueMismatch(const std::string &file, int line, const std::string &func,
+                  size_type val1, size_type val2,
+                  const std::string &clarification)
+        : Error(file, line,
+                func + ": Value mismatch : " + std::to_string(val1) + " and " +
+                    std::to_string(val2) + " : " + clarification)
+    {}
+};
+
+
 /**
  * AllocationError is thrown if a memory allocation fails.
  */
diff --git a/include/ginkgo/core/base/exception_helpers.hpp b/include/ginkgo/core/base/exception_helpers.hpp
index 778b496bccf..41225f06c98 100644
--- a/include/ginkgo/core/base/exception_helpers.hpp
+++ b/include/ginkgo/core/base/exception_helpers.hpp
@@ -89,17 +89,21 @@ namespace gko {
 
 
 /**
- * Creates a NotSupported exception.
+ * Throws a NotSupported exception.
  * This macro sets the correct information about the location of the error
- * and fills the exception with data about _obj.
+ * and fills the exception with data about _obj, followed by throwing it.
  *
  * @param _obj  the object referenced by NotSupported exception
- *
- * @return NotSupported
  */
-#define GKO_NOT_SUPPORTED(_obj)                       \
-    ::gko::NotSupported(__FILE__, __LINE__, __func__, \
-                        ::gko::name_demangling::get_type_name(typeid(_obj)))
+#define GKO_NOT_SUPPORTED(_obj)                                              \
+    {                                                                        \
+        throw ::gko::NotSupported(                                           \
+            __FILE__, __LINE__, __func__,                                    \
+            ::gko::name_demangling::get_type_name(typeid(_obj)));            \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
 
 
 namespace detail {
@@ -117,6 +121,18 @@ inline dim<2> get_size(const dim<2> &size) { return size; }
 }  // namespace detail
 
 
+/**
+ *Asserts that _val1 and _val2 are equal.
+ *
+ *@throw ValueMismatch if _val1 is different from _val2.
+ */
+#define GKO_ASSERT_EQ(_val1, _val2)                                            \
+    if ((_val1) != (_val2)) {                                                  \
+        throw ::gko::ValueMismatch(__FILE__, __LINE__, __func__, _val1, _val2, \
+                                   "expected equal values");                   \
+    }
+
+
 /**
  *Asserts that _op1 is a square matrix.
  *
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 5561d4064b2..115978f9b18 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -897,7 +897,7 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
           major_(0),
           minor_(0)
     {
-        assert(device_id < max_devices);
+        assert(device_id < max_devices && device_id >= 0);
         this->set_gpu_property();
         this->init_handles();
         increase_num_execs(device_id);
@@ -909,19 +909,19 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 
     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
-    static void increase_num_execs(int device_id)
+    static void increase_num_execs(unsigned device_id)
     {
         std::lock_guard<std::mutex> guard(mutex[device_id]);
         num_execs[device_id]++;
     }
 
-    static void decrease_num_execs(int device_id)
+    static void decrease_num_execs(unsigned device_id)
     {
         std::lock_guard<std::mutex> guard(mutex[device_id]);
         num_execs[device_id]--;
     }
 
-    static int get_num_execs(int device_id)
+    static unsigned get_num_execs(unsigned device_id)
     {
         std::lock_guard<std::mutex> guard(mutex[device_id]);
         return num_execs[device_id];
@@ -941,7 +941,7 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     handle_manager<cusparseContext> cusparse_handle_;
 
     static constexpr int max_devices = 64;
-    static int num_execs[max_devices];
+    static unsigned num_execs[max_devices];
     static std::mutex mutex[max_devices];
 };
 
diff --git a/include/ginkgo/core/base/polymorphic_object.hpp b/include/ginkgo/core/base/polymorphic_object.hpp
index a75e1dff392..918764fc615 100644
--- a/include/ginkgo/core/base/polymorphic_object.hpp
+++ b/include/ginkgo/core/base/polymorphic_object.hpp
@@ -220,6 +220,12 @@ class PolymorphicObject : public log::EnableLogging<PolymorphicObject> {
         : exec_{std::move(exec)}
     {}
 
+    // preserve the executor of the object
+    explicit PolymorphicObject(const PolymorphicObject &other)
+    {
+        *this = other;
+    }
+
     /**
      * Implementers of PolymorphicObject should override this function instead
      * of create_default().
diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp
index f069cf7b0e6..e238f276fab 100644
--- a/include/ginkgo/core/base/range.hpp
+++ b/include/ginkgo/core/base/range.hpp
@@ -305,6 +305,11 @@ class range {
      */
     static constexpr size_type dimensionality = accessor::dimensionality;
 
+    /**
+     * Use the default destructor.
+     */
+    ~range() = default;
+
     /**
      * Creates a new range.
      *
@@ -377,6 +382,8 @@ class range {
         return *this;
     }
 
+    GKO_ATTRIBUTES range(const range &other) = default;
+
     /**
      * Returns the length of the specified dimension of the range.
      *
diff --git a/include/ginkgo/core/factorization/par_ilu.hpp b/include/ginkgo/core/factorization/par_ilu.hpp
index 7de220aec66..8e42b4dc5a9 100644
--- a/include/ginkgo/core/factorization/par_ilu.hpp
+++ b/include/ginkgo/core/factorization/par_ilu.hpp
@@ -89,7 +89,7 @@ namespace factorization {
  * @tparam IndexType  Type of the indices of all matrices used in this class
  *
  * @ingroup factor
- * @ingroup linop
+ * @ingroup LinOp
  */
 template <typename ValueType = default_precision, typename IndexType = int32>
 class ParIlu : public Composition<ValueType> {
diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp
index ac6f271a153..36468411a8a 100644
--- a/include/ginkgo/core/matrix/coo.hpp
+++ b/include/ginkgo/core/matrix/coo.hpp
@@ -278,10 +278,8 @@ class Coo : public EnableLinOp<Coo<ValueType, IndexType>>,
           col_idxs_{exec, std::forward<ColIdxsArray>(col_idxs)},
           row_idxs_{exec, std::forward<RowIdxsArray>(row_idxs)}
     {
-        GKO_ENSURE_IN_BOUNDS(values_.get_num_elems() - 1,
-                             col_idxs_.get_num_elems());
-        GKO_ENSURE_IN_BOUNDS(values_.get_num_elems() - 1,
-                             row_idxs_.get_num_elems());
+        GKO_ASSERT_EQ(values_.get_num_elems(), col_idxs_.get_num_elems());
+        GKO_ASSERT_EQ(values_.get_num_elems(), row_idxs_.get_num_elems());
     }
 
     void apply_impl(const LinOp *b, LinOp *x) const override;
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index 391308ffd1b..4dd2e5d862c 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -183,18 +183,33 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
             auto nwarps = mtx_srow->get_num_elems();
 
             if (nwarps > 0) {
-                auto exec = mtx_srow->get_executor()->get_master();
-                Array<index_type> srow_host(exec);
-                srow_host = *mtx_srow;
-                auto srow = srow_host.get_data();
-                Array<index_type> row_ptrs_host(exec);
-                row_ptrs_host = mtx_row_ptrs;
-                auto row_ptrs = row_ptrs_host.get_const_data();
+                auto host_srow_exec = mtx_srow->get_executor()->get_master();
+                auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master();
+                const bool is_srow_on_host{host_srow_exec ==
+                                           mtx_srow->get_executor()};
+                const bool is_mtx_on_host{host_mtx_exec ==
+                                          mtx_row_ptrs.get_executor()};
+                Array<index_type> row_ptrs_host(host_mtx_exec);
+                Array<index_type> srow_host(host_srow_exec);
+                const index_type *row_ptrs{};
+                index_type *srow{};
+                if (is_srow_on_host) {
+                    srow = mtx_srow->get_data();
+                } else {
+                    srow_host = *mtx_srow;
+                    srow = srow_host.get_data();
+                }
+                if (is_mtx_on_host) {
+                    row_ptrs = mtx_row_ptrs.get_const_data();
+                } else {
+                    row_ptrs_host = mtx_row_ptrs;
+                    row_ptrs = row_ptrs_host.get_const_data();
+                }
                 for (size_type i = 0; i < nwarps; i++) {
                     srow[i] = 0;
                 }
-                auto num_rows = mtx_row_ptrs.get_num_elems() - 1;
-                auto num_elems = row_ptrs[num_rows];
+                const auto num_rows = mtx_row_ptrs.get_num_elems() - 1;
+                const auto num_elems = row_ptrs[num_rows];
                 for (size_type i = 0; i < num_rows; i++) {
                     auto bucket =
                         ceildiv((ceildiv(row_ptrs[i + 1], warp_size) * nwarps),
@@ -207,7 +222,9 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
                 for (size_type i = 1; i < nwarps; i++) {
                     srow[i] += srow[i - 1];
                 }
-                *mtx_srow = srow_host;
+                if (!is_srow_on_host) {
+                    *mtx_srow = srow_host;
+                }
             }
         }
 
@@ -249,28 +266,47 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
             // if the number of stored elements is larger than 1e6 or
             // the maximum number of stored elements per row is larger than
             // 64, use load_balance otherwise use classical
+            auto host_mtx_exec = mtx_row_ptrs.get_executor()->get_master();
+            const bool is_mtx_on_host{host_mtx_exec ==
+                                      mtx_row_ptrs.get_executor()};
+            Array<index_type> row_ptrs_host(host_mtx_exec);
+            const index_type *row_ptrs{};
+            if (is_mtx_on_host) {
+                row_ptrs = mtx_row_ptrs.get_const_data();
+            } else {
+                row_ptrs_host = mtx_row_ptrs;
+                row_ptrs = row_ptrs_host.get_const_data();
+            }
             const auto num_rows = mtx_row_ptrs.get_num_elems() - 1;
-            Array<index_type> host_row_ptrs(
-                mtx_row_ptrs.get_executor()->get_master());
-            host_row_ptrs = mtx_row_ptrs;
-            const auto row_val = host_row_ptrs.get_const_data();
-            if (row_val[num_rows] > static_cast<index_type>(1e6)) {
-                std::make_shared<load_balance>(nwarps_)->process(host_row_ptrs,
-                                                                 mtx_srow);
-                this->set_name("load_balance");
+            if (row_ptrs[num_rows] > index_type(1e6)) {
+                load_balance actual_strategy(nwarps_);
+                if (is_mtx_on_host) {
+                    actual_strategy.process(mtx_row_ptrs, mtx_srow);
+                } else {
+                    actual_strategy.process(row_ptrs_host, mtx_srow);
+                }
+                this->set_name(actual_strategy.get_name());
             } else {
                 index_type maxnum = 0;
                 for (index_type i = 1; i < num_rows + 1; i++) {
-                    maxnum = max(maxnum, row_val[i] - row_val[i - 1]);
+                    maxnum = max(maxnum, row_ptrs[i] - row_ptrs[i - 1]);
                 }
                 if (maxnum > 64) {
-                    std::make_shared<load_balance>(nwarps_)->process(
-                        host_row_ptrs, mtx_srow);
-                    this->set_name("load_balance");
+                    load_balance actual_strategy(nwarps_);
+                    if (is_mtx_on_host) {
+                        actual_strategy.process(mtx_row_ptrs, mtx_srow);
+                    } else {
+                        actual_strategy.process(row_ptrs_host, mtx_srow);
+                    }
+                    this->set_name(actual_strategy.get_name());
                 } else {
-                    std::make_shared<classical>()->process(host_row_ptrs,
-                                                           mtx_srow);
-                    this->set_name("classical");
+                    classical actual_strategy;
+                    if (is_mtx_on_host) {
+                        actual_strategy.process(mtx_row_ptrs, mtx_srow);
+                    } else {
+                        actual_strategy.process(row_ptrs_host, mtx_srow);
+                    }
+                    this->set_name(actual_strategy.get_name());
                 }
             }
         }
@@ -498,10 +534,8 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
           srow_(exec),
           strategy_(std::move(strategy))
     {
-        GKO_ENSURE_IN_BOUNDS(values_.get_num_elems() - 1,
-                             col_idxs_.get_num_elems());
-        GKO_ENSURE_IN_BOUNDS(this->get_size()[0], row_ptrs_.get_num_elems());
-        srow_.resize_and_reset(strategy_->clac_size(values_.get_num_elems()));
+        GKO_ASSERT_EQ(values_.get_num_elems(), col_idxs_.get_num_elems());
+        GKO_ASSERT_EQ(this->get_size()[0] + 1, row_ptrs_.get_num_elems());
         this->make_srow();
     }
 
@@ -511,9 +545,13 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
                     LinOp *x) const override;
 
     /**
-     * Compute srow, it should be run after setting value.
+     * Computes srow. It should be run after changing any row_ptrs_ value.
      */
-    void make_srow() { strategy_->process(row_ptrs_, &srow_); }
+    void make_srow()
+    {
+        srow_.resize_and_reset(strategy_->clac_size(values_.get_num_elems()));
+        strategy_->process(row_ptrs_, &srow_);
+    }
 
 private:
     Array<value_type> values_;
diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp
index cb7fad12d49..d9572ec0759 100644
--- a/include/ginkgo/core/matrix/ell.hpp
+++ b/include/ginkgo/core/matrix/ell.hpp
@@ -287,10 +287,10 @@ class Ell : public EnableLinOp<Ell<ValueType, IndexType>>,
           num_stored_elements_per_row_{num_stored_elements_per_row},
           stride_{stride}
     {
-        GKO_ENSURE_IN_BOUNDS(num_stored_elements_per_row_ * stride_ - 1,
-                             values_.get_num_elems());
-        GKO_ENSURE_IN_BOUNDS(num_stored_elements_per_row_ * stride_ - 1,
-                             col_idxs_.get_num_elems());
+        GKO_ASSERT_EQ(num_stored_elements_per_row_ * stride_,
+                      values_.get_num_elems());
+        GKO_ASSERT_EQ(num_stored_elements_per_row_ * stride_,
+                      col_idxs_.get_num_elems());
     }
 
     void apply_impl(const LinOp *b, LinOp *x) const override;
diff --git a/include/ginkgo/core/matrix/sparsity_csr.hpp b/include/ginkgo/core/matrix/sparsity_csr.hpp
index 1e43ec0930f..5d1b0e580a8 100644
--- a/include/ginkgo/core/matrix/sparsity_csr.hpp
+++ b/include/ginkgo/core/matrix/sparsity_csr.hpp
@@ -240,9 +240,7 @@ class SparsityCsr
         auto tmp = Array<value_type>{exec->get_master(), 1};
         tmp.get_data()[0] = value;
         value_ = Array<value_type>{exec, std::move(tmp)};
-        GKO_ENSURE_IN_BOUNDS(col_idxs_.get_num_elems() - 1,
-                             col_idxs_.get_num_elems());
-        GKO_ENSURE_IN_BOUNDS(this->get_size()[0], row_ptrs_.get_num_elems());
+        GKO_ASSERT_EQ(this->get_size()[0] + 1, row_ptrs_.get_num_elems());
     }
 
     /**
diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp
index e1ee08879e9..6b41cc176da 100644
--- a/include/ginkgo/core/preconditioner/ilu.hpp
+++ b/include/ginkgo/core/preconditioner/ilu.hpp
@@ -214,7 +214,7 @@ class Ilu : public EnableLinOp<Ilu<LSolverType, USolverType, ReverseApply>> {
             l_factor = comp_cast->get_operators()[0];
             u_factor = comp_cast->get_operators()[1];
         } else {
-            throw GKO_NOT_SUPPORTED(comp_cast);
+            GKO_NOT_SUPPORTED(comp_cast);
         }
         GKO_ASSERT_EQUAL_DIMENSIONS(l_factor, u_factor);
 
@@ -276,7 +276,8 @@ class Ilu : public EnableLinOp<Ilu<LSolverType, USolverType, ReverseApply>> {
      *
      */
     template <typename SolverType, typename = void>
-    struct has_with_criteria : std::false_type {};
+    struct has_with_criteria : std::false_type {
+    };
 
     /**
      * @copydoc has_with_criteria
@@ -290,7 +291,8 @@ class Ilu : public EnableLinOp<Ilu<LSolverType, USolverType, ReverseApply>> {
         SolverType,
         xstd::void_t<decltype(std::declval<factory_type_t<SolverType>>()
                                   .with_criteria(with_criteria_param_type()))>>
-        : std::true_type {};
+        : std::true_type {
+    };
 
 
     /**
diff --git a/include/ginkgo/core/preconditioner/jacobi.hpp b/include/ginkgo/core/preconditioner/jacobi.hpp
index 92b77a6c085..1c70ee0c498 100644
--- a/include/ginkgo/core/preconditioner/jacobi.hpp
+++ b/include/ginkgo/core/preconditioner/jacobi.hpp
@@ -56,8 +56,6 @@ namespace preconditioner {
  * @tparam IndexType  type used for storing indices of the matrix
  *
  * @ingroup jacobi
- * @ingroup precond
- * @ingroup LinOp
  */
 template <typename IndexType>
 struct block_interleaved_storage_scheme {
@@ -489,8 +487,7 @@ class Jacobi : public EnableLinOp<Jacobi<ValueType, IndexType>>,
                       parameters_.block_pointers.get_num_elems() - 1)),
           conditioning_(factory->get_executor())
     {
-        if (parameters_.max_block_size >= 32 ||
-            parameters_.max_block_size < 1) {
+        if (parameters_.max_block_size > 32 || parameters_.max_block_size < 1) {
             GKO_NOT_SUPPORTED(this);
         }
         parameters_.block_pointers.set_executor(this->get_executor());
diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp
index 123a4182347..72173c6e49d 100644
--- a/include/ginkgo/core/solver/ir.hpp
+++ b/include/ginkgo/core/solver/ir.hpp
@@ -89,6 +89,9 @@ namespace solver {
  * its eigenvalues `lambda` have to satisfy the equation `|lambda - 1| < 1).
  *
  * @tparam ValueType  precision of matrix elements
+ *
+ * @ingroup solvers
+ * @ingroup LinOp
  */
 template <typename ValueType = default_precision>
 class Ir : public EnableLinOp<Ir<ValueType>> {
diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp
index dce62ae905a..1d04410a47b 100644
--- a/omp/preconditioner/jacobi_kernels.cpp
+++ b/omp/preconditioner/jacobi_kernels.cpp
@@ -387,7 +387,7 @@ void generate(std::shared_ptr<const OmpExecutor> exec,
                                      block[b].get_const_data(), block_size);
             }
             const auto local_prec = prec ? prec[g + b] : precision_reduction();
-            if (local_prec == precision_reduction::autodetect()) {
+            if (local_prec == precision_reduction::autodetect() && cond) {
                 using preconditioner::detail::get_supported_storage_reductions;
                 pr_descriptors[b] = get_supported_storage_reductions<ValueType>(
                     accuracy, cond[g + b],
diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp
index 141c981c30a..f2972965273 100644
--- a/reference/preconditioner/jacobi_kernels.cpp
+++ b/reference/preconditioner/jacobi_kernels.cpp
@@ -369,7 +369,7 @@ void generate(std::shared_ptr<const ReferenceExecutor> exec,
                                      block[b].get_const_data(), block_size);
             }
             const auto local_prec = prec ? prec[g + b] : precision_reduction();
-            if (local_prec == precision_reduction::autodetect()) {
+            if (local_prec == precision_reduction::autodetect() && cond) {
                 using preconditioner::detail::get_supported_storage_reductions;
                 pr_descriptors[b] = get_supported_storage_reductions<ValueType>(
                     accuracy, cond[g + b],
diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp
index a98ce5adf78..0c0889df6ed 100644
--- a/reference/test/matrix/coo_kernels.cpp
+++ b/reference/test/matrix/coo_kernels.cpp
@@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/coo_kernels.hpp"
 
 
+#include <memory>
+
+
 #include <gtest/gtest.h>
 
 
@@ -90,21 +93,40 @@ class Coo : public ::testing::Test {
 
 TEST_F(Coo, ConvertsToCsr)
 {
-    auto csr_mtx = gko::matrix::Csr<>::create(mtx->get_executor());
-
-    mtx->convert_to(csr_mtx.get());
-
-    assert_equal_to_mtx_in_csr_format(csr_mtx.get());
+    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
+    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
+    auto csr_mtx_c =
+        gko::matrix::Csr<>::create(mtx->get_executor(), csr_s_classical);
+    auto csr_mtx_m =
+        gko::matrix::Csr<>::create(mtx->get_executor(), csr_s_merge);
+
+    mtx->convert_to(csr_mtx_c.get());
+    mtx->convert_to(csr_mtx_m.get());
+
+    assert_equal_to_mtx_in_csr_format(csr_mtx_c.get());
+    assert_equal_to_mtx_in_csr_format(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
+    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
 }
 
 
 TEST_F(Coo, MovesToCsr)
 {
-    auto csr_mtx = gko::matrix::Csr<>::create(mtx->get_executor());
-
-    mtx->move_to(csr_mtx.get());
-
-    assert_equal_to_mtx_in_csr_format(csr_mtx.get());
+    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
+    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
+    auto csr_mtx_c =
+        gko::matrix::Csr<>::create(mtx->get_executor(), csr_s_classical);
+    auto csr_mtx_m =
+        gko::matrix::Csr<>::create(mtx->get_executor(), csr_s_merge);
+    auto mtx_clone = mtx->clone();
+
+    mtx->move_to(csr_mtx_c.get());
+    mtx_clone->move_to(csr_mtx_m.get());
+
+    assert_equal_to_mtx_in_csr_format(csr_mtx_c.get());
+    assert_equal_to_mtx_in_csr_format(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
+    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
 }
 
 
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp
index fc30c4a89df..ce24a8ef42a 100644
--- a/reference/test/matrix/dense_kernels.cpp
+++ b/reference/test/matrix/dense_kernels.cpp
@@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <complex>
+#include <memory>
 #include <random>
 
 
@@ -331,17 +332,49 @@ TEST_F(Dense, MovesToCoo)
 }
 
 
-TEST_F(Dense, ConvertsToCsr)
+TEST_F(Dense, ConvertsEmptyMatrixToCsr)
+{
+    auto strategy = std::make_shared<gko::matrix::Csr<>::load_balance>(0);
+    auto from_mtx = gko::matrix::Dense<>::create(exec, gko::dim<2>{0, 0});
+    auto to_mtx =
+        gko::matrix::Csr<>::create(exec, gko::dim<2>{0, 0}, 0, strategy);
+
+    from_mtx->convert_to(to_mtx.get());
+
+    ASSERT_FALSE(to_mtx->get_size());
+}
+
+
+TEST_F(Dense, MovesEmptyMatrixToCsr)
 {
-    auto csr_mtx = gko::matrix::Csr<>::create(mtx4->get_executor());
+    auto strategy = std::make_shared<gko::matrix::Csr<>::load_balance>(0);
+    auto from_mtx = gko::matrix::Dense<>::create(exec, gko::dim<2>{0, 0});
+    auto to_mtx =
+        gko::matrix::Csr<>::create(exec, gko::dim<2>{0, 0}, 0, strategy);
 
-    mtx4->convert_to(csr_mtx.get());
-    auto v = csr_mtx->get_const_values();
-    auto c = csr_mtx->get_const_col_idxs();
-    auto r = csr_mtx->get_const_row_ptrs();
+    from_mtx->move_to(to_mtx.get());
 
-    ASSERT_EQ(csr_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(csr_mtx->get_num_stored_elements(), 4);
+    ASSERT_FALSE(to_mtx->get_size());
+}
+
+
+TEST_F(Dense, ConvertsToCsr)
+{
+    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
+    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
+    auto csr_mtx_c =
+        gko::matrix::Csr<>::create(mtx4->get_executor(), csr_s_classical);
+    auto csr_mtx_m =
+        gko::matrix::Csr<>::create(mtx4->get_executor(), csr_s_merge);
+
+    mtx4->convert_to(csr_mtx_c.get());
+    mtx4->convert_to(csr_mtx_m.get());
+
+    auto v = csr_mtx_c->get_const_values();
+    auto c = csr_mtx_c->get_const_col_idxs();
+    auto r = csr_mtx_c->get_const_row_ptrs();
+    ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4);
     EXPECT_EQ(r[0], 0);
     EXPECT_EQ(r[1], 3);
     EXPECT_EQ(r[2], 4);
@@ -353,20 +386,30 @@ TEST_F(Dense, ConvertsToCsr)
     EXPECT_EQ(v[1], 3.0);
     EXPECT_EQ(v[2], 2.0);
     EXPECT_EQ(v[3], 5.0);
+    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
+    GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0);
+    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
 }
 
 
 TEST_F(Dense, MovesToCsr)
 {
-    auto csr_mtx = gko::matrix::Csr<>::create(mtx4->get_executor());
-
-    mtx4->move_to(csr_mtx.get());
-    auto v = csr_mtx->get_const_values();
-    auto c = csr_mtx->get_const_col_idxs();
-    auto r = csr_mtx->get_const_row_ptrs();
-
-    ASSERT_EQ(csr_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(csr_mtx->get_num_stored_elements(), 4);
+    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
+    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
+    auto csr_mtx_c =
+        gko::matrix::Csr<>::create(mtx4->get_executor(), csr_s_classical);
+    auto csr_mtx_m =
+        gko::matrix::Csr<>::create(mtx4->get_executor(), csr_s_merge);
+    auto mtx_clone = mtx4->clone();
+
+    mtx4->move_to(csr_mtx_c.get());
+    mtx_clone->move_to(csr_mtx_m.get());
+
+    auto v = csr_mtx_c->get_const_values();
+    auto c = csr_mtx_c->get_const_col_idxs();
+    auto r = csr_mtx_c->get_const_row_ptrs();
+    ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4);
     EXPECT_EQ(r[0], 0);
     EXPECT_EQ(r[1], 3);
     EXPECT_EQ(r[2], 4);
@@ -378,6 +421,9 @@ TEST_F(Dense, MovesToCsr)
     EXPECT_EQ(v[1], 3.0);
     EXPECT_EQ(v[2], 2.0);
     EXPECT_EQ(v[3], 5.0);
+    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
+    GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0);
+    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
 }
 
 
diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp
index 4dc96332eb8..806949caf9b 100644
--- a/reference/test/matrix/ell_kernels.cpp
+++ b/reference/test/matrix/ell_kernels.cpp
@@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/ell.hpp>
 
 
+#include <memory>
+
+
 #include <gtest/gtest.h>
 
 
@@ -347,21 +350,40 @@ TEST_F(Ell, MovesWithStrideToDense)
 
 TEST_F(Ell, ConvertsToCsr)
 {
-    auto csr_mtx = gko::matrix::Csr<>::create(mtx1->get_executor());
-
-    mtx1->convert_to(csr_mtx.get());
-
-    assert_equal_to_mtx(csr_mtx.get());
+    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
+    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
+    auto csr_mtx_c =
+        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical);
+    auto csr_mtx_m =
+        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge);
+
+    mtx1->convert_to(csr_mtx_c.get());
+    mtx1->convert_to(csr_mtx_m.get());
+
+    assert_equal_to_mtx(csr_mtx_c.get());
+    assert_equal_to_mtx(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
+    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
 }
 
 
 TEST_F(Ell, ConvertsWithStrideToCsr)
 {
-    auto csr_mtx = gko::matrix::Csr<>::create(mtx2->get_executor());
-
-    mtx2->convert_to(csr_mtx.get());
-
-    assert_equal_to_mtx(csr_mtx.get());
+    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
+    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
+    auto csr_mtx_c =
+        gko::matrix::Csr<>::create(mtx2->get_executor(), csr_s_classical);
+    auto csr_mtx_m =
+        gko::matrix::Csr<>::create(mtx2->get_executor(), csr_s_merge);
+
+    // Convert, don't move: the source matrix is reused for both targets.
+    mtx2->convert_to(csr_mtx_c.get());
+    mtx2->convert_to(csr_mtx_m.get());
+
+    assert_equal_to_mtx(csr_mtx_c.get());
+    assert_equal_to_mtx(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
+    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
 }
 
 
diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp
index c2512c5afd9..1ca6b61cd20 100644
--- a/reference/test/matrix/hybrid_kernels.cpp
+++ b/reference/test/matrix/hybrid_kernels.cpp
@@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/hybrid_kernels.hpp"
 
 
+#include <memory>
+
+
 #include <gtest/gtest.h>
 
 
@@ -244,21 +247,40 @@ TEST_F(Hybrid, MovesToDense)
 
 TEST_F(Hybrid, ConvertsToCsr)
 {
-    auto csr_mtx = Csr::create(mtx1->get_executor());
-
-    mtx1->convert_to(csr_mtx.get());
-
-    assert_equal_to_mtx(csr_mtx.get());
+    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
+    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
+    auto csr_mtx_c =
+        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical);
+    auto csr_mtx_m =
+        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge);
+
+    mtx1->convert_to(csr_mtx_c.get());
+    mtx1->convert_to(csr_mtx_m.get());
+
+    assert_equal_to_mtx(csr_mtx_c.get());
+    assert_equal_to_mtx(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
+    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
 }
 
 
 TEST_F(Hybrid, MovesToCsr)
 {
-    auto csr_mtx = Csr::create(mtx1->get_executor());
-
-    mtx1->move_to(csr_mtx.get());
-
-    assert_equal_to_mtx(csr_mtx.get());
+    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
+    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
+    auto csr_mtx_c =
+        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical);
+    auto csr_mtx_m =
+        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge);
+    auto mtx_clone = mtx1->clone();
+
+    mtx1->move_to(csr_mtx_c.get());
+    mtx_clone->move_to(csr_mtx_m.get());
+
+    assert_equal_to_mtx(csr_mtx_c.get());
+    assert_equal_to_mtx(csr_mtx_m.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
+    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
 }
 
 
diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp
index 2ae24e260b9..102e218dbb4 100644
--- a/reference/test/matrix/sellp_kernels.cpp
+++ b/reference/test/matrix/sellp_kernels.cpp
@@ -200,29 +200,48 @@ TEST_F(Sellp, MovesToDense)
 
 TEST_F(Sellp, ConvertsToCsr)
 {
-    auto csr_mtx = gko::matrix::Csr<>::create(mtx1->get_executor());
+    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
+    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
+    auto csr_mtx_c =
+        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical);
+    auto csr_mtx_m =
+        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge);
 
-    mtx1->convert_to(csr_mtx.get());
+    mtx1->convert_to(csr_mtx_c.get());
+    mtx1->convert_to(csr_mtx_m.get());
 
     // clang-format off
-	GKO_ASSERT_MTX_NEAR(csr_mtx,
-	                    l({{1.0, 3.0, 2.0},
-	                       {0.0, 5.0, 0.0}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(csr_mtx_c,
+                        l({{1.0, 3.0, 2.0},
+                           {0.0, 5.0, 0.0}}), 0.0);
     // clang-format on
+    GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0);
+    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
+    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
 }
 
 
 TEST_F(Sellp, MovesToCsr)
 {
-    auto csr_mtx = gko::matrix::Csr<>::create(mtx1->get_executor());
+    auto csr_s_classical = std::make_shared<gko::matrix::Csr<>::classical>();
+    auto csr_s_merge = std::make_shared<gko::matrix::Csr<>::merge_path>();
+    auto csr_mtx_c =
+        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_classical);
+    auto csr_mtx_m =
+        gko::matrix::Csr<>::create(mtx1->get_executor(), csr_s_merge);
+    auto mtx_clone = mtx1->clone();
 
-    mtx1->move_to(csr_mtx.get());
+    mtx1->move_to(csr_mtx_c.get());
+    mtx_clone->move_to(csr_mtx_m.get());
 
     // clang-format off
-	GKO_ASSERT_MTX_NEAR(csr_mtx,
-	                    l({{1.0, 3.0, 2.0},
-	                       {0.0, 5.0, 0.0}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(csr_mtx_c,
+                        l({{1.0, 3.0, 2.0},
+                           {0.0, 5.0, 0.0}}), 0.0);
     // clang-format on
+    GKO_ASSERT_MTX_NEAR(csr_mtx_c.get(), csr_mtx_m.get(), 0.0);
+    ASSERT_EQ(csr_mtx_c->get_strategy(), csr_s_classical);
+    ASSERT_EQ(csr_mtx_m->get_strategy(), csr_s_merge);
 }
 
 
diff --git a/test_install/CMakeLists.txt b/test_install/CMakeLists.txt
index e3f9eba73d0..7eef8b8b7dd 100644
--- a/test_install/CMakeLists.txt
+++ b/test_install/CMakeLists.txt
@@ -7,6 +7,9 @@ find_package(Ginkgo REQUIRED
             # Alternatively, use `cmake -DCMAKE_PREFIX_PATH=<ginkgo_install_dir>` to specify the install directory
             )
 
+if(GINKGO_HAVE_PAPI_SDE)
+    find_package(PAPI REQUIRED OPTIONAL_COMPONENTS sde)
+endif()
 
 # Needed because of a known issue with CUDA while linking statically.
 # For details, see https://gitlab.kitware.com/cmake/cmake/issues/18614
diff --git a/test_install/test_install.cpp b/test_install/test_install.cpp
index 5205743cd7c..1479f74fcb8 100644
--- a/test_install/test_install.cpp
+++ b/test_install/test_install.cpp
@@ -248,6 +248,7 @@ int main(int, char **)
     {
         using Mtx = gko::matrix::SparsityCsr<>;
         auto test = Mtx::create(refExec, gko::dim<2>{2, 2});
+    }
 
     // core/preconditioner/ilu.hpp
     {
diff --git a/third_party/git-cmake-format/CMakeLists.txt b/third_party/git-cmake-format/CMakeLists.txt
index a733de04a8f..c05253a738c 100644
--- a/third_party/git-cmake-format/CMakeLists.txt
+++ b/third_party/git-cmake-format/CMakeLists.txt
@@ -1,5 +1,5 @@
 ginkgo_load_git_package(git-cmake-format
-    "https://github.com/gflegar/git-cmake-format.git"
-    "9fdc1553c525b3d7ce758892fe666078903a1b21")
+    "https://github.com/ginkgo-project/git-cmake-format.git"
+    "e19ab13e640d58abd3bfdbff5f77b499b2ec4169")
 add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/src
     ${CMAKE_CURRENT_BINARY_DIR}/build EXCLUDE_FROM_ALL)