diff --git a/CMakeLists.txt b/CMakeLists.txt index fc8cf73d..f682d16b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ if( NOT is_submodule ) option( ENABLE_ADDR2LINE "Enable addr2line usage in stacktraces" ON ) option( ENABLE_CUDA "Build with CUDA" OFF ) + option( ENABLE_HIP "Build with HIP" OFF ) option( ENABLE_UMPIRE "Build with UMPIRE" OFF ) option( ENABLE_CHAI "Build with CHAI" OFF ) option( ENABLE_CALIPER "Build with Caliper" OFF ) @@ -80,6 +81,8 @@ blt_list_append( TO lvarray_dependencies ELEMENTS chai IF ENABLE_CHAI ) blt_list_append( TO lvarray_dependencies ELEMENTS cuda IF ENABLE_CUDA ) +blt_list_append( TO lvarray_dependencies ELEMENTS blt::hip IF ENABLE_HIP ) + blt_list_append( TO lvarray_dependencies ELEMENTS caliper IF ENABLE_CALIPER ) diff --git a/Notes.txt b/Notes.txt new file mode 100644 index 00000000..d07775c5 --- /dev/null +++ b/Notes.txt @@ -0,0 +1 @@ +./scripts/uberenv/uberenv.py --prefix=../uberenv-libs/ --spack-config-dir=./scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/ --spec="%clang@10.0.1 +umpire +chai +caliper +pylvarray +benchmarks +examples ^caliper ~adiak ~mpi ~libunwind ~libdw ~papi" \ No newline at end of file diff --git a/cmake/CMakeBasics.cmake b/cmake/CMakeBasics.cmake index 4c3ec217..25c4bef7 100644 --- a/cmake/CMakeBasics.cmake +++ b/cmake/CMakeBasics.cmake @@ -12,9 +12,9 @@ option( ENABLE_TOTALVIEW_OUTPUT "" OFF ) set( LVARRAY_BUILD_OBJ_LIBS OFF CACHE BOOL "" ) -if( NOT BLT_CXX_STD STREQUAL c++14 ) - MESSAGE(FATAL_ERROR "c++14 is NOT enabled. LvArray requires c++14") -endif() +# if( NOT BLT_CXX_STD STREQUAL c++14 ) +# MESSAGE(FATAL_ERROR "c++14 is NOT enabled. LvArray requires c++14") +# endif() blt_append_custom_compiler_flag( FLAGS_VAR CMAKE_CXX_FLAGS DEFAULT "${OpenMP_CXX_FLAGS}") diff --git a/cmake/Config.cmake b/cmake/Config.cmake index 0a44fd1b..c513fbab 100644 --- a/cmake/Config.cmake +++ b/cmake/Config.cmake @@ -2,8 +2,10 @@ set( PREPROCESSOR_DEFINES UMPIRE CHAI CUDA + HIP TOTALVIEW_OUTPUT - CALIPER ) + CALIPER + MAGMA ) set( USE_CONFIGFILE ON CACHE BOOL "" ) foreach( DEP in ${PREPROCESSOR_DEFINES}) diff --git a/cmake/SetupTPL.cmake b/cmake/SetupTPL.cmake index bff94834..c40d0582 100644 --- a/cmake/SetupTPL.cmake +++ b/cmake/SetupTPL.cmake @@ -1,19 +1,79 @@ +macro(find_and_register) + set(singleValueArgs NAME HEADER) + set(multiValueArgs INCLUDE_DIRECTORIES + LIBRARY_DIRECTORIES + LIBRARIES + EXTRA_LIBRARIES + DEPENDS ) + + ## parse the arguments + cmake_parse_arguments(arg + "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT DEFINED arg_NAME) + message(FATAL_ERROR "The find_and_register required parameter NAME specifies the name of the library to register.") + endif() + + if(NOT DEFINED arg_INCLUDE_DIRECTORIES) + message(FATAL_ERROR "The find_and_register required parameter INCLUDE_DIRECTORIES specifies the directories to search for the given header.") + endif() + + if(NOT DEFINED arg_LIBRARY_DIRECTORIES) + message(FATAL_ERROR "The find_and_register required parameter LIBRARY_DIRECTORIES specifies the directories to search for the given libraries.") + endif() + + if(NOT DEFINED arg_HEADER) + message(FATAL_ERROR "The find_and_register required parameter HEADER specifies the header to search for.") + endif() + + if(NOT DEFINED arg_LIBRARIES) + message(FATAL_ERROR "The find_and_register required parameter LIBRARIES specifies the libraries to search for.") + endif() + + find_path(${arg_NAME}_INCLUDE_DIR ${arg_HEADER} + PATHS ${arg_INCLUDE_DIRECTORIES} + NO_DEFAULT_PATH + NO_CMAKE_ENVIRONMENT_PATH + 
NO_CMAKE_PATH + NO_SYSTEM_ENVIRONMENT_PATH + NO_CMAKE_SYSTEM_PATH) + + if(${arg_NAME}_INCLUDE_DIR STREQUAL ${arg_NAME}_INCLUDE_DIR-NOTFOUND) + message(FATAL_ERROR "Could not find '${arg_HEADER}' in '${arg_INCLUDE_DIRECTORIES}'") + endif() + + blt_find_libraries(FOUND_LIBS ${arg_NAME}_LIBRARIES + NAMES ${arg_LIBRARIES} + PATHS ${arg_LIBRARY_DIRECTORIES} + REQUIRED ON) + + blt_import_library(NAME ${arg_NAME} + INCLUDES ${${arg_NAME}_INCLUDE_DIR} + LIBRARIES ${${arg_NAME}_LIBRARIES} ${arg_EXTRA_LIBRARIES} + TREAT_INCLUDES_AS_SYSTEM ON + DEPENDS_ON ${arg_DEPENDS}) + +endmacro(find_and_register) + set(thirdPartyLibs "") -################################ +############################### # CAMP -################################ -if(NOT EXISTS ${CAMP_DIR}) - message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.") -endif() +############################### +if(CAMP_DIR STREQUAL RAJA_DIR) + message(STATUS "LvArray using CAMP from RAJA.") +else() + if(NOT EXISTS ${CAMP_DIR}) + message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.") + endif() -message(STATUS "Using CAMP from ${CAMP_DIR}") + message(STATUS "LvArray using CAMP from ${CAMP_DIR}") -find_package(camp REQUIRED PATHS ${CAMP_DIR}) + find_package(camp REQUIRED PATHS ${CAMP_DIR}) -set(ENABLE_CAMP ON CACHE BOOL "") + set(thirdPartyLibs ${thirdPartyLibs} camp) +endif() -set(thirdPartyLibs ${thirdPartyLibs} camp) ################################ # RAJA @@ -22,7 +82,7 @@ if(NOT EXISTS ${RAJA_DIR}) message(FATAL_ERROR "RAJA_DIR must be defined and point to a valid directory when using RAJA.") endif() -message(STATUS "Using RAJA from ${RAJA_DIR}") +message(STATUS "LvArray using RAJA from ${RAJA_DIR}") find_package(RAJA REQUIRED PATHS ${RAJA_DIR}) @@ -39,20 +99,26 @@ if(ENABLE_UMPIRE) message(FATAL_ERROR "UMPIRE_DIR must be defined and point to a valid directory when using Umpire.") endif() - message(STATUS "Using Umpire from ${UMPIRE_DIR}") + message(STATUS "LvArray using Umpire from ${UMPIRE_DIR}") find_package(umpire REQUIRED PATHS ${UMPIRE_DIR}) set(thirdPartyLibs ${thirdPartyLibs} umpire) else() - message(STATUS "Not using Umpire.") + message(STATUS "LvArray not using Umpire.") endif() ################################ # CHAI ################################ if(ENABLE_CHAI) + if(NOT EXISTS ${CHAI_DIR}) + message(FATAL_ERROR "CHAI_DIR must be defined and point to a valid directory when using CHAI.") + endif() + + message(STATUS "Using CHAI from ${CHAI_DIR}") + if(NOT ENABLE_UMPIRE) message(FATAL_ERROR "Umpire must be enabled to use CHAI.") endif() @@ -65,32 +131,32 @@ if(ENABLE_CHAI) message(FATAL_ERROR "CHAI_DIR must be defined and point to a valid directory when using CHAI.") endif() - message(STATUS "Using CHAI from ${CHAI_DIR}") + message(STATUS "LvArray using CHAI from ${CHAI_DIR}") find_package(chai REQUIRED PATHS ${CHAI_DIR}) - - # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that. - get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES) - list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA) - set_target_properties(chai - PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}") + + # # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that. 
+ # get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES) + # list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA) + # set_target_properties(chai + # PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}") set(thirdPartyLibs ${thirdPartyLibs} chai) else() - message(STATUS "Not using CHAI.") + message(STATUS "LvArray not using CHAI.") endif() -################################ +############################### # CALIPER -################################ +############################### if(ENABLE_CALIPER) if(NOT EXISTS ${CALIPER_DIR}) message(FATAL_ERROR "CALIPER_DIR must be defined and point to a valid directory when using caliper.") endif() - message(STATUS "Using caliper from ${CALIPER_DIR}") + message(STATUS "LvArray using caliper from ${CALIPER_DIR}") find_package(caliper REQUIRED PATHS ${CALIPER_DIR}) @@ -102,22 +168,66 @@ if(ENABLE_CALIPER) set(thirdPartyLibs ${thirdPartyLibs} caliper) else() - message(STATUS "Not using caliper.") + message(STATUS "LvArray not using caliper.") endif() ################################ # Python ################################ -if ( ENABLE_PYLVARRAY ) - message( STATUS "Python3_EXECUTABLE=${Python3_EXECUTABLE}" ) - find_package( Python3 REQUIRED - COMPONENTS Development NumPy ) +if(ENABLE_PYLVARRAY) + message(STATUS "Python3_EXECUTABLE=${Python3_EXECUTABLE}") + find_package(Python3 REQUIRED + COMPONENTS Development NumPy) - message( STATUS "Python3_INCLUDE_DIRS = ${Python3_INCLUDE_DIRS}" ) - message( STATUS "Python3_LIBRARY_DIRS = ${Python3_LIBRARY_DIRS}" ) - message( STATUS "Python3_NumPy_INCLUDE_DIRS = ${Python3_NumPy_INCLUDE_DIRS}" ) + message(STATUS "Python3_INCLUDE_DIRS = ${Python3_INCLUDE_DIRS}") + message(STATUS "Python3_LIBRARY_DIRS = ${Python3_LIBRARY_DIRS}") + message(STATUS "Python3_NumPy_INCLUDE_DIRS = ${Python3_NumPy_INCLUDE_DIRS}") - set( thirdPartyLibs ${thirdPartyLibs} Python3::Python Python3::NumPy ) + set(thirdPartyLibs ${thirdPartyLibs} Python3::Python Python3::NumPy) +else() + message(STATUS "Not building pylvarray") +endif() + +################################ +# LAPACK/BLAS +################################ +if(ENABLE_LAPACK) + message(STATUS "BLAS_LIBRARIES = ${BLAS_LIBRARIES}") + message(STATUS "LAPACK_LIBRARIES = ${LAPACK_LIBRARIES}") + + blt_import_library(NAME blas + TREAT_INCLUDES_AS_SYSTEM ON + LIBRARIES ${BLAS_LIBRARIES}) + + blt_import_library(NAME lapack + DEPENDS_ON blas + TREAT_INCLUDES_AS_SYSTEM ON + LIBRARIES ${LAPACK_LIBRARIES}) + + set(thirdPartyLibs ${thirdPartyLibs} blas lapack) +else() + message(STATUS "Not using LAPACK or BLAS.") +endif() + +################################ +# MAGMA +################################ +if(ENABLE_MAGMA) + message(STATUS "Using MAGMA from ${MAGMA_DIR}") + + if(NOT ENABLE_LAPACK) + message(FATAL_ERROR "LAPACK must be enabled to use MAGMA.") + endif() + + find_and_register(NAME magma + INCLUDE_DIRECTORIES ${MAGMA_DIR}/include + LIBRARY_DIRECTORIES ${MAGMA_DIR}/lib + HEADER magma.h + LIBRARIES magma) + + set(thirdPartyLibs ${thirdPartyLibs} magma) +else() + message(STATUS "Not using MAGMA.") endif() set( thirdPartyLibs ${thirdPartyLibs} CACHE STRING "" ) diff --git a/cmake/blt b/cmake/blt index c253509a..ddd5a0ca 160000 --- a/cmake/blt +++ b/cmake/blt @@ -1 +1 @@ -Subproject commit c253509ab2daf759eb857958597f6f34ab8c1713 +Subproject commit ddd5a0ca7c566d0ae14270b66625c8a363630ddb diff --git a/host-configs/LLNL/lassen-base.cmake b/host-configs/LLNL/lassen-base.cmake index 5a443bb9..3a60a7f3 100644 --- a/host-configs/LLNL/lassen-base.cmake +++ 
b/host-configs/LLNL/lassen-base.cmake @@ -21,14 +21,14 @@ set(ENABLE_CUDA ON CACHE BOOL "") set(CUDA_TOOLKIT_ROOT_DIR /usr/tce/packages/cuda/cuda-10.1.243 CACHE STRING "") set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING "") set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc CACHE STRING "") -set(CUDA_ARCH sm_70 CACHE STRING "") +set(CUDA_ARCHITECTURES sm_70 CACHE STRING "") set(CMAKE_CUDA_STANDARD 14 CACHE STRING "") -set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCH} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") +set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCHITECTURES} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "") -set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCH}" CACHE STRING "" FORCE) +set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCHITECTURES}" CACHE STRING "" FORCE) # Uncomment this line to make nvcc output register usage for each kernel. # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --resource-usage" CACHE STRING "" FORCE) diff --git a/host-configs/ascent-gcc@8.1.1.cmake b/host-configs/ORNL/ascent-gcc@8.1.1.cmake similarity index 100% rename from host-configs/ascent-gcc@8.1.1.cmake rename to host-configs/ORNL/ascent-gcc@8.1.1.cmake diff --git a/host-configs/ORNL/crusher-base.cmake b/host-configs/ORNL/crusher-base.cmake new file mode 100644 index 00000000..53f647fa --- /dev/null +++ b/host-configs/ORNL/crusher-base.cmake @@ -0,0 +1,25 @@ + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "") + +set( ENABLE_MPI ON CACHE BOOL "" FORCE ) +set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) + +# HIP Options +set( ENABLE_HIP ON CACHE BOOL "" FORCE ) + +# suppress -Werror for now +set( ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) + +# GTEST +set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") +set(gtest_disable_pthreads ON CACHE BOOL "") + +# disable most binaries and doc generation +set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) +set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) +set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) +set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) + +# BLT trying to find MPI fails on cray with cce +set(ENABLE_FIND_MPI FALSE CACHE BOOL "") diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake new file mode 100644 index 00000000..a10fda43 --- /dev/null +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -0,0 +1,39 @@ + +set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) + +# Set up the tpls +set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") +set(GEOSX_TPL_DIR2 "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-13.0.1" CACHE PATH "") + +set(CAMP_DIR "${GEOSX_TPL_DIR2}/camp-0.2.2-oogry5gz2fts7jufeykxzmowajtmgzi3" CACHE PATH "" ) + +set(RAJA_DIR "${GEOSX_TPL_DIR2}/raja-2022.03.0-ex5v5y6jtotfxxvwcs7bblwvy4ktjykq" CACHE PATH "" ) + +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR2}/umpire-develop-jqqth57w2ets75sljw7lc5uxoi5wwi3c" CACHE PATH "" ) + +set(ENABLE_CHAI TRUE 
CACHE BOOL "" )
+set(CHAI_DIR "${GEOSX_TPL_DIR2}/chai-2022.03.0-w7lka3bkp36mbk5kzucgtp3eowomllgl" CACHE PATH "" )
+
+set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" )
+set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" )
+
+# C++ options
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.13/bin/cc" CACHE PATH "")
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.13/bin/CC" CACHE PATH "")
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.13/bin/ftn" CACHE PATH "")
+
+
+if( ENABLE_HIP )
+  set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation
+
+  set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" )
+  set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" )
+
+  set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE )
+  set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE )
+endif()
diff --git a/host-configs/ORNL/crusher-cce@14.0.0.cmake b/host-configs/ORNL/crusher-cce@14.0.0.cmake
new file mode 100644
index 00000000..967be640
--- /dev/null
+++ b/host-configs/ORNL/crusher-cce@14.0.0.cmake
@@ -0,0 +1,43 @@
+
+set(CONFIG_NAME "crusher-cce@14.0.0" CACHE PATH "")
+include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake )
+
+# Set up the tpls
+set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "")
+set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.0" CACHE PATH "")
+
+set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-r3wj7xcb7xycpqhda7rf5b2lekrsqx4w" CACHE PATH "" )
+
+set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.0-33qhciagztrs7zbnyk3uufxwjllerqer" CACHE PATH "" )
+
+set(ENABLE_UMPIRE TRUE CACHE BOOL "" )
+set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.0-72z2rwrcmmojgau6wgfmcckrnzbvi4aa" CACHE PATH "" )
+
+set(ENABLE_CHAI TRUE CACHE BOOL "" )
+set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-da6y22vj4iy3uru2bne7df7tpngckt3n" CACHE PATH "" )
+
+set(METIS_DIR "${SYSTEM_TPL_DIR}/metis-5.1.0-q4xwj6aya5ooq4owhrhh2qllanjmyuew/" CACHE PATH "" )
+set(PARMETIS_DIR "${SYSTEM_TPL_DIR}/parmetis-4.0.3-fyed2lqdzmulw6d4cl3oxt6r6jaqdtwg/" CACHE PATH "" )
+
+# C++ options
+set(CRAYPE_VERSION "2.7.15")
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "")
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "")
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "")
+
+if( ENABLE_HIP )
+  set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation
+
+  set( HIP_VERSION_STRING "5.1.0" CACHE STRING "" )
+  set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" )
+
+  set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE )
+  set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE )
+endif()
+
+
+
+
+
diff --git a/host-configs/ORNL/crusher-cce@14.0.1.cmake b/host-configs/ORNL/crusher-cce@14.0.1.cmake
new file mode 100644
index 00000000..15c54516
--- /dev/null
+++ b/host-configs/ORNL/crusher-cce@14.0.1.cmake
@@ -0,0 +1,38 @@
+
+set(CONFIG_NAME "crusher-cce@14.0.1" CACHE PATH "")
+include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake )
+
+# Set up the tpls
+set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "")
+set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.0" CACHE PATH "")
+
+set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-r3wj7xcb7xycpqhda7rf5b2lekrsqx4w" CACHE PATH "" )
+
+set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.0-33qhciagztrs7zbnyk3uufxwjllerqer" CACHE PATH "" )
+
+set(ENABLE_UMPIRE TRUE CACHE BOOL "" )
+set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.0-72z2rwrcmmojgau6wgfmcckrnzbvi4aa" CACHE PATH "" )
+
+set(ENABLE_CHAI TRUE CACHE BOOL "" )
+set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-da6y22vj4iy3uru2bne7df7tpngckt3n" CACHE PATH "" )
+
+set(METIS_DIR "${SYSTEM_TPL_DIR}/metis-5.1.0-q4xwj6aya5ooq4owhrhh2qllanjmyuew/" CACHE PATH "" )
+set(PARMETIS_DIR "${SYSTEM_TPL_DIR}/parmetis-4.0.3-fyed2lqdzmulw6d4cl3oxt6r6jaqdtwg/" CACHE PATH "" )
+
+# C++ options
+set(CRAYPE_VERSION "2.7.15")
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "")
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "")
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "")
+
+if( ENABLE_HIP )
+  set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation
+
+  set( HIP_VERSION_STRING "5.1.0" CACHE STRING "" )
+  set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" )
+
+  set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE )
+  set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE )
+endif()
diff --git a/host-configs/ORNL/crusher-cce@14.0.2.cmake b/host-configs/ORNL/crusher-cce@14.0.2.cmake
new file mode 100644
index 00000000..d0e29023
--- /dev/null
+++ b/host-configs/ORNL/crusher-cce@14.0.2.cmake
@@ -0,0 +1,38 @@
+
+set(CONFIG_NAME "crusher-cce@14.0.2" CACHE PATH "")
+include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake )
+
+# Set up the tpls
+set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.2" CACHE PATH "")
+set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.2" CACHE PATH "")
+
+set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-ksdglvlmamju7gphtyzdavitriemedla" CACHE PATH "" )
+
+set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.1-jxxcauxbzee6nqjmyjz45t5h4f7tv34r" CACHE PATH "" )
+
+set(ENABLE_UMPIRE TRUE CACHE BOOL "" )
+set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.1-vgvqpvi3cwdmvy6cu76sqoghnvprzlwu" CACHE PATH "" )
+
+set(ENABLE_CHAI TRUE CACHE BOOL "" )
+set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-7axkiea7q3hzgojswiz7qdbd2yq6bvsf" CACHE PATH "" )
+
+set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-jptrwzs7vdbckndjg5qg4jwckfmgexmw/" CACHE PATH "" )
+set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-p2msdgsmomufcnwhnow5bbazg7463caf/" CACHE PATH "" )
+
+# C++ options
+set(CRAYPE_VERSION "2.7.17")
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "")
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "")
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "")
+
+if( ENABLE_HIP )
+  set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation
+
+  set( HIP_VERSION_STRING "5.2.0" CACHE STRING "" )
+  set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" )
+
+  set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE )
+  set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE )
+endif()
diff --git a/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake
new file mode 100644
index 00000000..b9d64b28
--- /dev/null
+++ b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake
@@ -0,0 +1,29 @@
+
+set(CONFIG_NAME "crusher-cpu-cce@13.0.1" CACHE PATH "")
+include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake )
+
+# Set up the tpls
+set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "")
+set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "")
+set(GEOSX_TPL_DIR2 "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-13.0.1" CACHE PATH "")
+
+set(CAMP_DIR "${GEOSX_TPL_DIR2}/camp-0.2.2-mej6trivmy7o5vlr6a52cml6tzxb5fvk" CACHE PATH "" )
+
+set(RAJA_DIR "${GEOSX_TPL_DIR2}/raja-2022.03.0-tmukf35ms7f2pkfswpejbnt3jtnpkakc" CACHE PATH "" )
+
+set(ENABLE_UMPIRE TRUE CACHE BOOL "" )
+set(UMPIRE_DIR "${GEOSX_TPL_DIR2}/umpire-2022.03.0-unirfq5er4vtyr2koymgi3xxq6h2f5l5" CACHE PATH "" )
+
+set(ENABLE_CHAI TRUE CACHE BOOL "" )
+set(CHAI_DIR "${GEOSX_TPL_DIR2}/chai-2022.03.0-aggyh463v2rz6s44laqshylc4xeeg4h7" CACHE PATH "" )
+
+set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" )
+set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" )
+
+# C++ options
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.13/bin/cc" CACHE PATH "")
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.13/bin/CC" CACHE PATH "")
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.13/bin/ftn" CACHE PATH "")
+
+# HIP Options
+set( ENABLE_HIP OFF CACHE BOOL "" FORCE )
diff --git a/host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake b/host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake
new file mode 100644
index 00000000..d25d6b2e
--- /dev/null
+++ b/host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake
@@ -0,0 +1,29 @@
+
+set(CONFIG_NAME "crusher-cpu-cce@14.0.1" CACHE PATH "")
+include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake )
+
+# Set up the tpls
+set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "")
+set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.0" CACHE PATH "")
+
+set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-r3wj7xcb7xycpqhda7rf5b2lekrsqx4w" CACHE PATH "" )
+
+set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.0-33qhciagztrs7zbnyk3uufxwjllerqer" CACHE PATH "" )
+
+set(ENABLE_UMPIRE TRUE CACHE BOOL "" )
+set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.0-72z2rwrcmmojgau6wgfmcckrnzbvi4aa" CACHE PATH "" )
+
+set(ENABLE_CHAI TRUE CACHE BOOL "" )
+set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-da6y22vj4iy3uru2bne7df7tpngckt3n" CACHE PATH "" )
+
+set(METIS_DIR "${SYSTEM_TPL_DIR}/metis-5.1.0-q4xwj6aya5ooq4owhrhh2qllanjmyuew/" CACHE PATH "" )
+set(PARMETIS_DIR "${SYSTEM_TPL_DIR}/parmetis-4.0.3-fyed2lqdzmulw6d4cl3oxt6r6jaqdtwg/" CACHE PATH "" )
+
+# C++ options
+set(CRAYPE_VERSION "2.7.15")
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "")
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "")
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "")
+
+# HIP Options
+set( ENABLE_HIP OFF CACHE BOOL "" FORCE )
diff --git a/new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake b/new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake
new file mode 100644
index 00000000..90ac014b
--- /dev/null
+++ b/new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake
@@ -0,0 +1,89 @@
+#################################################################################
+# Generated host-config - Edit at own risk!
+#################################################################################
+#--------------------------------------------------------------------------------
+# SYS_TYPE: toss_3_x86_64_ib
+# Compiler Spec: clang@10.0.1
+# CMake executable path: /usr/tce/packages/cmake/cmake-3.14.5/bin/cmake
+#--------------------------------------------------------------------------------
+
+set(BLT_SOURCE_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/blt-0.5.2-6nztad6saell6ikor6wtxp6qycxtfwh4" CACHE PATH "")
+
+#--------------------------------------------------------------------------------
+# Compilers
+#--------------------------------------------------------------------------------
+
+set(CMAKE_C_COMPILER "/usr/tce/bin/clang-10.0.1" CACHE PATH "")
+
+set(CMAKE_CXX_COMPILER "/usr/tce/bin/clang++-10.0.1" CACHE PATH "")
+
+set(CMAKE_C_FLAGS "-march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0" CACHE PATH "")
+
+set(CMAKE_CXX_FLAGS "-march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0" CACHE PATH "")
+
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "")
+
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG" CACHE STRING "")
+
+set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "")
+
+set(ENABLE_CUDA OFF CACHE BOOL "")
+
+#--------------------------------------------------------------------------------
+# CAMP
+#--------------------------------------------------------------------------------
+
+set(CAMP_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/camp-2022.03.2-2q75xbq2h4ykcyvasoqg55torawlabkw" CACHE PATH "")
+
+#--------------------------------------------------------------------------------
+# RAJA
+#--------------------------------------------------------------------------------
+
+set(RAJA_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/raja-2022.03.0-jkp4hp7ifyxkxzkbho5ngdnk4x3opaoy" CACHE PATH "")
+
+#--------------------------------------------------------------------------------
+# Umpire
+#--------------------------------------------------------------------------------
+
+set(ENABLE_UMPIRE ON CACHE BOOL "")
+
+set(UMPIRE_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/umpire-2022.03.1-aerit7injc3hmn2ripnsxtnlwxicjmuu" CACHE PATH "")
+
+#--------------------------------------------------------------------------------
+# CHAI
+#--------------------------------------------------------------------------------
+
+set(ENABLE_CHAI ON CACHE BOOL "")
+
+set(CHAI_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/chai-2022.03.0-s6w2gsrreu7krgzboekmlukmfestpg7k" CACHE PATH "")
+
+#--------------------------------------------------------------------------------
+# Caliper
+#--------------------------------------------------------------------------------
+
+set(ENABLE_CALIPER ON CACHE BOOL "") + +set(CALIPER_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/caliper-2.8.0-3fwkrbu4bhnc4bqvhrqcydrzxslq6ryz" CACHE PATH "") + +#-------------------------------------------------------------------------------- +# Python +#-------------------------------------------------------------------------------- + +set(ENABLE_PYLVARRAY OFF CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# Documentation +#-------------------------------------------------------------------------------- + +set(ENABLE_DOCS OFF CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# addr2line +#-------------------------------------------------------------------------------- + +set(ENABLE_ADDR2LINE ON CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# Other +#-------------------------------------------------------------------------------- + diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index 9c4b47d9..df299f3e 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -18,6 +18,12 @@ def cmake_cache_entry(name, value, comment=""): return 'set(%s "%s" CACHE PATH "%s")\n\n' % (name, value, comment) +def cmake_cache_list(name, value, comment=""): + """Generate a list for a cmake cache variable""" + + indent = 5 + len(name) + join_str = '\n' + ' ' * indent + return 'set(%s %s CACHE STRING "%s")\n\n' % (name, join_str.join(value), comment) def cmake_cache_string(name, string, comment=""): """Generate a string for a cmake cache variable""" @@ -50,6 +56,8 @@ class Lvarray(CMakePackage, CudaPackage): variant('chai', default=False, description='Build Chai support') variant('caliper', default=False, description='Build Caliper support') variant('pylvarray', default=False, description='Build Python support') + variant('lapack', default=False, description='Build LAPACK and BLAS support') + variant('magma', default=False, description='Build MAGMA support') variant('tests', default=True, description='Build tests') variant('benchmarks', default=False, description='Build benchmarks') variant('examples', default=False, description='Build examples') @@ -57,31 +65,52 @@ class Lvarray(CMakePackage, CudaPackage): variant('addr2line', default=True, description='Build support for addr2line.') - depends_on('blt', when='@0.2.0:', type='build') + variant('tpl_build_type', default='none', description='TPL build type', + values=('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel', 'none')) + + conflicts('~lapack', when='+magma') + + depends_on('blt@0.4.1:', when='@0.2.0:', type='build') depends_on('camp') - depends_on('camp+cuda', when='+cuda') depends_on('raja') - depends_on('raja+cuda', when='+cuda') - # At the moment Umpire doesn't support shared when building with CUDA. 
depends_on('umpire', when='+umpire') - depends_on('umpire+cuda~shared', when='+umpire+cuda') depends_on('chai+raja', when='+chai') - depends_on('chai+raja+cuda', when='+chai+cuda') depends_on('caliper', when='+caliper') depends_on('python +shared +pic', when='+pylvarray') - depends_on('py-numpy@1.19: +blas +lapack +force-parallel-build', when='+pylvarray') - depends_on('py-scipy@1.5.2: +force-parallel-build', when='+pylvarray') + depends_on('py-numpy@1.19: +blas +lapack', when='+pylvarray') + depends_on('py-scipy@1.5.2:', when='+pylvarray') depends_on('py-pip', when='+pylvarray') + depends_on('blas', when='+lapack') + depends_on('lapack', when='+lapack') + depends_on('magma', when='+magma') + depends_on('doxygen@1.8.13:', when='+docs', type='build') depends_on('py-sphinx@1.6.3:', when='+docs', type='build') + with when('+cuda'): + for sm_ in CudaPackage.cuda_arch_values: + depends_on('camp +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('raja +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('umpire +cuda ~shared cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('chai +cuda cuda_arch={0}'.format(sm_), when='+chai cuda_arch={0}'.format(sm_)) + depends_on('caliper +cuda cuda_arch={0}'.format(sm_), when='+caliper cuda_arch={0}'.format(sm_)) + + for bt in ('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel'): + with when('tpl_build_type={}'.format(bt)): + depends_on('camp build_type={}'.format(bt)) + depends_on('raja build_type={}'.format(bt)) + depends_on('umpire build_type={}'.format(bt)) + depends_on('chai build_type={}'.format(bt), when='+chai') + depends_on('caliper build_type={}'.format(bt), when='+caliper') + depends_on('magma build_type={}'.format(bt), when='+magma') + phases = ['hostconfig', 'cmake', 'build', 'install'] @run_after('build') @@ -158,178 +187,195 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cmake_exe = os.path.realpath(cmake_exe) host_config_path = self._get_host_config_path(spec) - cfg = open(host_config_path, "w") - cfg.write("#{0}\n".format("#" * 80)) - cfg.write("# Generated host-config - Edit at own risk!\n") - cfg.write("#{0}\n".format("#" * 80)) - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# SYS_TYPE: {0}\n".format(sys_type)) - cfg.write("# Compiler Spec: {0}\n".format(spec.compiler)) - cfg.write("# CMake executable path: %s\n" % cmake_exe) - cfg.write("#{0}\n\n".format("-" * 80)) - - if 'blt' in spec: - cfg.write(cmake_cache_entry('BLT_SOURCE_DIR', spec['blt'].prefix)) - - ####################### - # Compiler Settings - ####################### - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Compilers\n") - cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler)) - cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler)) - - # use global spack compiler flags - cflags = ' '.join(spec.compiler_flags['cflags']) - cxxflags = ' '.join(spec.compiler_flags['cxxflags']) - - if "%intel" in spec: - cflags += ' -qoverride-limits' - cxxflags += ' -qoverride-limits' + with open(host_config_path, "w") as cfg: + cfg.write("#{0}\n".format("#" * 80)) + cfg.write("# Generated host-config - Edit at own risk!\n") + cfg.write("#{0}\n".format("#" * 80)) - if cflags: - cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags)) - - if cxxflags: - cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) - - release_flags = "-O3 -DNDEBUG" - cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELEASE", - release_flags)) - 
reldebinf_flags = "-O3 -g -DNDEBUG" - cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELWITHDEBINFO", - reldebinf_flags)) - debug_flags = "-O0 -g" - cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_DEBUG", debug_flags)) - - if "%clang arch=linux-rhel7-ppc64le" in spec: - cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", "-Wl,--no-toc-optimize")) - - if "+cuda" in spec: cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Cuda\n") + cfg.write("# SYS_TYPE: {0}\n".format(sys_type)) + cfg.write("# Compiler Spec: {0}\n".format(spec.compiler)) + cfg.write("# Spec: {0}\n".format(spec)) + cfg.write("# CMake executable path: %s\n" % cmake_exe) cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option("ENABLE_CUDA", True)) - cfg.write(cmake_cache_entry("CMAKE_CUDA_STANDARD", 14)) - - cudatoolkitdir = spec['cuda'].prefix - cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR", - cudatoolkitdir)) - cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" - cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", cudacompiler)) - - cmake_cuda_flags = ('-restrict --expt-extended-lambda -Werror ' - 'cross-execution-space-call,reorder,' - 'deprecated-declarations') + if 'blt' in spec: + cfg.write(cmake_cache_entry('BLT_SOURCE_DIR', spec['blt'].prefix)) - archSpecifiers = ("-mtune", "-mcpu", "-march", "-qtune", "-qarch") - for archSpecifier in archSpecifiers: - for compilerArg in spec.compiler_flags['cxxflags']: - if compilerArg.startswith(archSpecifier): - cmake_cuda_flags += ' -Xcompiler ' + compilerArg - - if not spec.satisfies('cuda_arch=none'): - cuda_arch = spec.variants['cuda_arch'].value - cmake_cuda_flags += ' -arch sm_{0}'.format(cuda_arch[0]) - - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS", cmake_cuda_flags)) - - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", - "-O3 -Xcompiler -O3 -DNDEBUG")) - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", - "-O3 -g -lineinfo -Xcompiler -O3")) - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", - "-O0 -Xcompiler -O0 -g -G")) - - else: - cfg.write(cmake_cache_option("ENABLE_CUDA", False)) + ####################### + # Compiler Settings + ####################### - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# CAMP\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Compilers\n") + cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler)) + cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler)) + + # use global spack compiler flags + cflags = ' '.join(spec.compiler_flags['cflags']) + cxxflags = ' '.join(spec.compiler_flags['cxxflags']) + + if "%intel" in spec: + cflags += ' -qoverride-limits' + cxxflags += ' -qoverride-limits' + + if cflags: + cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags)) + + if cxxflags: + cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) + + release_flags = "-O3 -DNDEBUG" + cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELEASE", + release_flags)) + reldebinf_flags = "-O3 -g -DNDEBUG" + cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELWITHDEBINFO", + reldebinf_flags)) + debug_flags = "-O0 -g" + cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_DEBUG", debug_flags)) + + if "%clang arch=linux-rhel7-ppc64le" in spec: + cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", "-Wl,--no-toc-optimize")) + + if "+cuda" in spec: + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Cuda\n") + cfg.write("#{0}\n\n".format("-" * 80)) + + cfg.write(cmake_cache_option("ENABLE_CUDA", True)) + 
cfg.write(cmake_cache_entry("CMAKE_CUDA_STANDARD", 14)) + + cudatoolkitdir = spec['cuda'].prefix + cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR", + cudatoolkitdir)) + cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" + cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", cudacompiler)) + + cmake_cuda_flags = ('-restrict --expt-extended-lambda -Werror ' + 'cross-execution-space-call,reorder,' + 'deprecated-declarations') + + archSpecifiers = ("-mtune", "-mcpu", "-march", "-qtune", "-qarch") + for archSpecifier in archSpecifiers: + for compilerArg in spec.compiler_flags['cxxflags']: + if compilerArg.startswith(archSpecifier): + cmake_cuda_flags += ' -Xcompiler ' + compilerArg + + if not spec.satisfies('cuda_arch=none'): + cuda_arch = spec.variants['cuda_arch'].value + cmake_cuda_flags += ' -arch sm_{0}'.format(cuda_arch[0]) + + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS", cmake_cuda_flags)) + + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", + "-O3 -Xcompiler -O3 -DNDEBUG")) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", + "-O3 -g -lineinfo -Xcompiler -O3")) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", + "-O0 -Xcompiler -O0 -g -G")) + + else: + cfg.write(cmake_cache_option("ENABLE_CUDA", False)) - cfg.write(cmake_cache_entry("CAMP_DIR", spec['camp'].prefix)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# CAMP\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# RAJA\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_entry("CAMP_DIR", spec['camp'].prefix)) - cfg.write(cmake_cache_entry("RAJA_DIR", spec['raja'].prefix)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# RAJA\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Umpire\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_entry("RAJA_DIR", spec['raja'].prefix)) - if "+umpire" in spec: - cfg.write(cmake_cache_option("ENABLE_UMPIRE", True)) - cfg.write(cmake_cache_entry("UMPIRE_DIR", spec['umpire'].prefix)) - else: - cfg.write(cmake_cache_option("ENABLE_UMPIRE", False)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Umpire\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# CHAI\n") - cfg.write("#{0}\n\n".format("-" * 80)) + if "+umpire" in spec: + cfg.write(cmake_cache_option("ENABLE_UMPIRE", True)) + cfg.write(cmake_cache_entry("UMPIRE_DIR", spec['umpire'].prefix)) + else: + cfg.write(cmake_cache_option("ENABLE_UMPIRE", False)) - if "+chai" in spec: - cfg.write(cmake_cache_option("ENABLE_CHAI", True)) - cfg.write(cmake_cache_entry("CHAI_DIR", spec['chai'].prefix)) - else: - cfg.write(cmake_cache_option("ENABLE_CHAI", False)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# CHAI\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Caliper\n") - cfg.write("#{0}\n\n".format("-" * 80)) + if "+chai" in spec: + cfg.write(cmake_cache_option("ENABLE_CHAI", True)) + cfg.write(cmake_cache_entry("CHAI_DIR", spec['chai'].prefix)) + else: + cfg.write(cmake_cache_option("ENABLE_CHAI", False)) - if "+caliper" in spec: cfg.write("#{0}\n".format("-" * 80)) cfg.write("# Caliper\n") cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) - cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) - else: - cfg.write(cmake_cache_option("ENABLE_CALIPER", False)) - - cfg.write('#{0}\n'.format('-' * 80)) - cfg.write('# 
Python\n') - cfg.write('#{0}\n\n'.format('-' * 80)) - if '+pylvarray' in spec: - cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True)) - cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3'))) - else: - cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', False)) - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Documentation\n") - cfg.write("#{0}\n\n".format("-" * 80)) - if "+docs" in spec: - cfg.write(cmake_cache_option("ENABLE_DOCS", True)) - sphinx_dir = spec['py-sphinx'].prefix - cfg.write(cmake_cache_string('SPHINX_EXECUTABLE', - os.path.join(sphinx_dir, - 'bin', - 'sphinx-build'))) - - doxygen_dir = spec['doxygen'].prefix - cfg.write(cmake_cache_string('DOXYGEN_EXECUTABLE', - os.path.join(doxygen_dir, - 'bin', - 'doxygen'))) - else: - cfg.write(cmake_cache_option("ENABLE_DOCS", False)) + if "+caliper" in spec: + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Caliper\n") + cfg.write("#{0}\n\n".format("-" * 80)) + + cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) + cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) + else: + cfg.write(cmake_cache_option("ENABLE_CALIPER", False)) + + cfg.write('#{0}\n'.format('-' * 80)) + cfg.write('# Python\n') + cfg.write('#{0}\n\n'.format('-' * 80)) + if '+pylvarray' in spec: + cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True)) + cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3'))) + else: + cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', False)) + + cfg.write('#{0}\n'.format('-' * 80)) + cfg.write('# Math libraries\n') + cfg.write('#{0}\n\n'.format('-' * 80)) + if '+lapack' in spec: + cfg.write(cmake_cache_option('ENABLE_LAPACK', True)) + cfg.write(cmake_cache_list('BLAS_LIBRARIES', spec['blas'].libs)) + cfg.write(cmake_cache_list('LAPACK_LIBRARIES', spec['lapack'].libs)) + else: + cfg.write(cmake_cache_option('ENABLE_LAPACK', False)) + + if '+magma' in spec: + cfg.write(cmake_cache_option('ENABLE_MAGMA', True)) + cfg.write(cmake_cache_entry('MAGMA_DIR', spec['magma'].prefix)) + else: + cfg.write(cmake_cache_option('ENABLE_MAGMA', False)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# addr2line\n") - cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option('ENABLE_ADDR2LINE', '+addr2line' in spec)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Documentation\n") + cfg.write("#{0}\n\n".format("-" * 80)) + if "+docs" in spec: + cfg.write(cmake_cache_option("ENABLE_DOCS", True)) + sphinx_dir = spec['py-sphinx'].prefix + cfg.write(cmake_cache_string('SPHINX_EXECUTABLE', + os.path.join(sphinx_dir, + 'bin', + 'sphinx-build'))) + + doxygen_dir = spec['doxygen'].prefix + cfg.write(cmake_cache_string('DOXYGEN_EXECUTABLE', + os.path.join(doxygen_dir, + 'bin', + 'doxygen'))) + else: + cfg.write(cmake_cache_option("ENABLE_DOCS", False)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Other\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# addr2line\n") + cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_option('ENABLE_ADDR2LINE', '+addr2line' in spec)) + + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Other\n") + cfg.write("#{0}\n\n".format("-" * 80)) def cmake_args(self): spec = self.spec diff --git a/scripts/uberenv/packages/magma/cmake-W.patch b/scripts/uberenv/packages/magma/cmake-W.patch new file mode 100644 index 00000000..59179676 --- /dev/null +++ b/scripts/uberenv/packages/magma/cmake-W.patch @@ -0,0 +1,12 @@ +diff -ru 
magma-2.5.0-orig/CMakeLists.txt magma-2.5.0/CMakeLists.txt +--- magma-2.5.0-orig/CMakeLists.txt 2019-01-02 11:18:39.000000000 -0800 ++++ magma-2.5.0/CMakeLists.txt 2019-04-03 15:58:01.871234891 -0700 +@@ -363,8 +363,6 @@ + else() + # Primarily for gcc / nvcc: + # Ignore unused static functions in headers. +- set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unused-function" ) +- set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unused-function" ) + endif() + + if (CMAKE_HOST_APPLE) diff --git a/scripts/uberenv/packages/magma/ibm-xl.patch b/scripts/uberenv/packages/magma/ibm-xl.patch new file mode 100644 index 00000000..0deab656 --- /dev/null +++ b/scripts/uberenv/packages/magma/ibm-xl.patch @@ -0,0 +1,248 @@ +diff -Naur magma-2.2.0/src/dlaex3_m.cpp magma-2.2.0-patched/src/dlaex3_m.cpp +--- magma-2.2.0/src/dlaex3_m.cpp 2016-11-20 20:20:06.000000000 -0500 ++++ magma-2.2.0/src/dlaex3_m.cpp 2017-01-06 15:54:29.423668874 -0500 +@@ -197,7 +197,7 @@ + magmaDouble_ptr dwork[], + magma_queue_t queues[MagmaMaxGPUs][2], + magma_range_t range, double vl, double vu, magma_int_t il, magma_int_t iu, +- magma_int_t *info ) ++ magma_int_t *infom ) + { + #define Q(i_,j_) (Q + (i_) + (j_)*ldq) + +@@ -209,8 +209,8 @@ + magma_setdevice(0); + magma_dlaex3( k, n, n1, d, Q, ldq, rho, + dlamda, Q2, indx, ctot, w, s, indxq, +- *dwork, range, vl, vu, il, iu, info ); +- return *info; ++ *dwork, range, vl, vu, il, iu, infom ); ++ return *infom; + } + double d_one = 1.; + double d_zero = 0.; +@@ -229,37 +229,37 @@ + valeig = (range == MagmaRangeV); + indeig = (range == MagmaRangeI); + +- *info = 0; ++ *infom = 0; + + if (k < 0) +- *info=-1; ++ *infom=-1; + else if (n < k) +- *info=-2; ++ *infom=-2; + else if (ldq < max(1,n)) +- *info=-6; ++ *infom=-6; + else if (! (alleig || valeig || indeig)) +- *info = -15; ++ *infom = -15; + else { + if (valeig) { + if (n > 0 && vu <= vl) +- *info = -17; ++ *infom = -17; + } + else if (indeig) { + if (il < 1 || il > max(1,n)) +- *info = -18; ++ *infom = -18; + else if (iu < min(n,il) || iu > n) +- *info = -19; ++ *infom = -19; + } + } + +- if (*info != 0) { +- magma_xerbla( __func__, -(*info) ); +- return *info; ++ if (*infom != 0) { ++ magma_xerbla( __func__, -(*infom) ); ++ return *infom; + } + + // Quick return if possible + if (k == 0) +- return *info; ++ return *infom; + + magma_device_t orig_dev; + magma_getdevice( &orig_dev ); +@@ -360,15 +360,15 @@ + lapackf77_dlaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) { +- #pragma omp critical (info) +- *info = iinfo; ++ #pragma omp critical (infom) ++ *infom = iinfo; + break; + } + } + + #pragma omp barrier + +- if (*info == 0) { ++ if (*infom == 0) { + #pragma omp single + { + // Prepare the INDXQ sorting permutation. +@@ -452,8 +452,8 @@ + } + } + } // end omp parallel +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + timer_stop( time ); + timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); +@@ -474,10 +474,10 @@ + lapackf77_dlaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) +- *info=iinfo; ++ *infom=iinfo; + } +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + // Prepare the INDXQ sorting permutation. 
+ magma_int_t nk = n - k; +@@ -688,5 +688,5 @@ + + magma_setdevice( orig_dev ); + +- return *info; ++ return *infom; + } /* magma_dlaed3_m */ +diff -Naur magma-2.2.0/src/slaex3_m.cpp magma-2.2.0-patched/src/slaex3_m.cpp +--- magma-2.2.0/src/slaex3_m.cpp 2016-11-20 20:20:24.000000000 -0500 ++++ magma-2.2.0/src/slaex3_m.cpp 2017-01-06 10:20:13.200783151 -0500 +@@ -197,7 +197,7 @@ + magmaFloat_ptr dwork[], + magma_queue_t queues[MagmaMaxGPUs][2], + magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu, +- magma_int_t *info ) ++ magma_int_t *infom ) + { + #define Q(i_,j_) (Q + (i_) + (j_)*ldq) + +@@ -209,8 +209,8 @@ + magma_setdevice(0); + magma_slaex3( k, n, n1, d, Q, ldq, rho, + dlamda, Q2, indx, ctot, w, s, indxq, +- *dwork, range, vl, vu, il, iu, info ); +- return *info; ++ *dwork, range, vl, vu, il, iu, infom ); ++ return *infom; + } + float d_one = 1.; + float d_zero = 0.; +@@ -229,37 +229,37 @@ + valeig = (range == MagmaRangeV); + indeig = (range == MagmaRangeI); + +- *info = 0; ++ *infom = 0; + + if (k < 0) +- *info=-1; ++ *infom=-1; + else if (n < k) +- *info=-2; ++ *infom=-2; + else if (ldq < max(1,n)) +- *info=-6; ++ *infom=-6; + else if (! (alleig || valeig || indeig)) +- *info = -15; ++ *infom = -15; + else { + if (valeig) { + if (n > 0 && vu <= vl) +- *info = -17; ++ *infom = -17; + } + else if (indeig) { + if (il < 1 || il > max(1,n)) +- *info = -18; ++ *infom = -18; + else if (iu < min(n,il) || iu > n) +- *info = -19; ++ *infom = -19; + } + } + +- if (*info != 0) { +- magma_xerbla( __func__, -(*info) ); +- return *info; ++ if (*infom != 0) { ++ magma_xerbla( __func__, -(*infom) ); ++ return *infom; + } + + // Quick return if possible + if (k == 0) +- return *info; ++ return *infom; + + magma_device_t orig_dev; + magma_getdevice( &orig_dev ); +@@ -360,15 +360,15 @@ + lapackf77_slaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) { +- #pragma omp critical (info) +- *info = iinfo; ++ #pragma omp critical (infom) ++ *infom = iinfo; + break; + } + } + + #pragma omp barrier + +- if (*info == 0) { ++ if (*infom == 0) { + #pragma omp single + { + // Prepare the INDXQ sorting permutation. +@@ -452,8 +452,8 @@ + } + } + } // end omp parallel +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + timer_stop( time ); + timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); +@@ -474,10 +474,10 @@ + lapackf77_slaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) +- *info=iinfo; ++ *infom=iinfo; + } +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + // Prepare the INDXQ sorting permutation. 
+         magma_int_t nk = n - k;
+@@ -688,5 +688,5 @@
+ 
+     magma_setdevice( orig_dev );
+ 
+-    return *info;
++    return *infom;
+ } /* magma_slaed3_m */
diff --git a/scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch b/scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch
new file mode 100644
index 00000000..f734a5f1
--- /dev/null
+++ b/scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch
@@ -0,0 +1,24 @@
+diff -ru magma-2.3.0/testing/testings.h magma-2.3.0-patched/testing/testings.h
+--- magma-2.3.0/testing/testings.h 2017-11-14 21:34:00.000000000 -0800
++++ magma-2.3.0-patched/testing/testings.h 2018-03-23 20:41:16.459934643 -0700
+@@ -269,4 +269,20 @@
+     typename blas::traits<FloatT>::real_t* sigma,
+     FloatT* A, magma_int_t lda );
+ 
++// This overload for the case sigma = nullptr is a workaround for an issue
++// when building with gcc 4.8.5. This is not an issue with gcc 4.9.2.
++template< typename FloatT >
++void magma_generate_matrix(
++    magma_opts& opts,
++    magma_int_t m, magma_int_t n,
++    std::nullptr_t sigma,
++    FloatT* A, magma_int_t lda )
++{
++    magma_generate_matrix(
++        opts,
++        m, n,
++        (typename blas::traits<FloatT>::real_t*) sigma,
++        A, lda );
++}
++
+ #endif /* TESTINGS_H */
diff --git a/scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch b/scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch
new file mode 100644
index 00000000..56b58d85
--- /dev/null
+++ b/scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch
@@ -0,0 +1,77 @@
+diff -ru magma-2.5.0-orig/CMakeLists.txt magma-2.5.0/CMakeLists.txt
+--- magma-2.5.0-orig/CMakeLists.txt 2019-01-02 11:18:39.000000000 -0800
++++ magma-2.5.0/CMakeLists.txt 2019-04-03 15:58:01.871234891 -0700
+@@ -440,18 +440,20 @@
+ # compile MAGMA sparse library
+ 
+ # sparse doesn't have Fortran at the moment, so no need for above shenanigans
+-include_directories( sparse/include )
+-include_directories( sparse/control )
+-include_directories( testing )
+-cuda_add_library( magma_sparse ${libsparse_all} )
+-target_link_libraries( magma_sparse
+-    magma
++if (MAGMA_SPARSE)
++  include_directories( sparse/include )
++  include_directories( sparse/control )
++  include_directories( testing )
++  cuda_add_library( magma_sparse ${libsparse_all} )
++  target_link_libraries( magma_sparse
++    magma
     ${LAPACK_LIBRARIES}
     ${CUDA_CUDART_LIBRARY}
     ${CUDA_CUBLAS_LIBRARIES}
     ${CUDA_cusparse_LIBRARY}
+-)
+-set( LIBS_SPARSE ${LIBS} magma_sparse )
++  )
++  set( LIBS_SPARSE ${LIBS} magma_sparse )
++endif()
+ 
+ 
+ # ----------------------------------------
+@@ -480,23 +482,31 @@
+ 
+ # ----------------------------------------
+ # compile each sparse tester
+-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY sparse/testing )
+-foreach( TEST ${sparse_testing_all} )
++if (MAGMA_SPARSE)
++  set( CMAKE_RUNTIME_OUTPUT_DIRECTORY sparse/testing )
++  foreach( TEST ${sparse_testing_all} )
+     string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
+     string( REGEX REPLACE "sparse/testing/" "" EXE ${EXE} )
+     #message( "${TEST} --> ${EXE}" )
+     add_executable( ${EXE} ${TEST} )
+     target_link_libraries( ${EXE} ${LIBS_SPARSE} )
+-endforeach()
++  endforeach()
++endif()
+ 
+ 
+ # ----------------------------------------
+ # what to install
+-install( TARGETS magma magma_sparse ${blas_fix}
++set(MAGMA_TARGETS magma)
++set(MAGMA_HEADERS_PATTERNS include/*.h)
++if (MAGMA_SPARSE)
++  set(MAGMA_TARGETS ${MAGMA_TARGETS} magma_sparse)
++  set(MAGMA_HEADERS_PATTERNS ${MAGMA_HEADERS_PATTERNS} sparse/include/*.h)
++endif()
++install( TARGETS ${MAGMA_TARGETS} ${blas_fix}
+     RUNTIME DESTINATION bin
+     LIBRARY DESTINATION lib
     ARCHIVE DESTINATION lib )
+-file( GLOB headers include/*.h sparse/include/*.h )
++file( GLOB headers ${MAGMA_HEADERS_PATTERNS} )
+ install( FILES ${headers}
+     DESTINATION include )
+ 
+@@ -509,4 +519,6 @@
+ message( STATUS "    NFLAGS      ${CUDA_NVCC_FLAGS}" )
+ message( STATUS "    FFLAGS      ${CMAKE_Fortran_FLAGS}" )
+ message( STATUS "    LIBS        ${LIBS}" )
+-message( STATUS "    LIBS_SPARSE ${LIBS_SPARSE}" )
++if (MAGMA_SPARSE)
++  message( STATUS "    LIBS_SPARSE ${LIBS_SPARSE}" )
++endif()
diff --git a/scripts/uberenv/packages/magma/magma-2.5.0.patch b/scripts/uberenv/packages/magma/magma-2.5.0.patch
new file mode 100644
index 00000000..1ac800c5
--- /dev/null
+++ b/scripts/uberenv/packages/magma/magma-2.5.0.patch
@@ -0,0 +1,428 @@
+diff -r 89706c0efbdb .hgtags
+--- a/.hgtags Wed Jan 02 14:17:26 2019 -0500
++++ b/.hgtags Wed Apr 03 15:50:54 2019 -0700
+@@ -1,3 +1,4 @@
+ 9c7e7cffa7d0e2decd23cde36a4830dfb55bea13 v2.2.0
+ b2b2e21c22a59a79eefbf1e5cff8e7d539a52c0c v2.3.0
+ 04d08aaa27dc8a551513d268c68fc299e81b6780 v2.4.0
++89706c0efbdbfd48bf8a2c20cc0d73e53c3f387e v2.5.0
+diff -r 89706c0efbdb include/magma_types.h
+--- a/include/magma_types.h Wed Jan 02 14:17:26 2019 -0500
++++ b/include/magma_types.h Wed Apr 03 15:50:54 2019 -0700
+@@ -77,7 +77,7 @@
+     typedef magma_int_t magma_device_t;
+ 
+     // Half precision in CUDA
+-    #if defined(__cplusplus) && CUDA_VERSION > 7500
++    #if defined(__cplusplus) && CUDA_VERSION >= 7500
+     #include <cuda_fp16.h>
+     typedef __half magmaHalf;
+     #else
+diff -r 89706c0efbdb sparse/blas/magma_zsampleselect.cu
+--- a/sparse/blas/magma_zsampleselect.cu Wed Jan 02 14:17:26 2019 -0500
++++ b/sparse/blas/magma_zsampleselect.cu Wed Apr 03 15:50:54 2019 -0700
+@@ -15,9 +15,12 @@
+ 
+ #define PRECISION_z
+ 
++
+ namespace magma_sampleselect {
+ 
+-__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size) {
++__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size)
++{
++#if (__CUDA_ARCH__ >= 350)
+     auto idx = threadIdx.x + blockDim.x * blockIdx.x;
+     if (idx >= size) {
+         return;
+@@ -25,6 +28,7 @@
+ 
+     auto v = in[idx];
+     out[idx] = real(v) * real(v) + imag(v) * imag(v);
++#endif
+ }
+ 
+ } // namespace magma_sampleselect
+@@ -164,36 +168,43 @@
+     magma_queue_t queue )
+ {
+     magma_int_t info = 0;
++    magma_int_t arch = magma_getdevice_arch();
+ 
+-    auto num_blocks = magma_ceildiv(total_size, block_size);
+-    auto local_work = (total_size + num_threads - 1) / num_threads;
+-    auto required_size = sizeof(double) * (total_size + searchtree_size)
++    if( arch >= 350 ) {
++        auto num_blocks = magma_ceildiv(total_size, block_size);
++        auto local_work = (total_size + num_threads - 1) / num_threads;
++        auto required_size = sizeof(double) * (total_size + searchtree_size)
+         + sizeof(int32_t) * (searchtree_width * (num_grouped_blocks + 1) + 1);
+-    auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
++        auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
+ 
+-    double* gputmp = (double*)*tmp_ptr;
+-    double* gputree = gputmp + total_size;
+-    uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size);
+-    int32_t* gpurankout = (int32_t*)(gpubucketidx + 1);
int32_t* gpucounts = gpurankout + 1;
++        int32_t* gpulocalcounts = gpucounts + searchtree_width;
++        uint32_t bucketidx{};
+ 
+-    CHECK(realloc_result);
++        CHECK(realloc_result);
+ 
+-    compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
+-        (val, gputmp, total_size);
+-    build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
+-        (gputmp, gputree, total_size);
+-    count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
+-        (gputmp, gputree, gpulocalcounts, total_size, local_work);
+-    reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
+-        (gpulocalcounts, gpucounts, num_grouped_blocks);
+-    sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
+-        (gpucounts, subset_size, gpubucketidx, gpurankout);
+-    magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
+-    magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
+-    *thrs = std::sqrt(*thrs);
++        compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
++            (val, gputmp, total_size);
++        build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
++            (gputmp, gputree, total_size);
++        count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
++            (gputmp, gputree, gpulocalcounts, total_size, local_work);
++        reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
++            (gpulocalcounts, gpucounts, num_grouped_blocks);
++        sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
++            (gpucounts, subset_size, gpubucketidx, gpurankout);
++        magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
++        magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
++        *thrs = std::sqrt(*thrs);
++    }
++    else {
++        printf("error: this functionality needs CUDA architecture >= 3.5\n");
++        info = MAGMA_ERR_NOT_SUPPORTED;
++    }
+ 
+ cleanup:
+     return info;
+diff -r 89706c0efbdb src/xhsgetrf_gpu.cpp
+--- a/src/xhsgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500
++++ b/src/xhsgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700
+@@ -16,6 +16,131 @@
+ #include 
+ #endif
+ 
++#if CUDA_VERSION < 9020
++// conversion float to half are not defined for host in CUDA version <9.2
++// thus uses the conversion below when CUDA VERSION is < 9.2.
++#include 
++//
++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
++//
++// Redistribution and use in source and binary forms, with or without
++// modification, are permitted provided that the following conditions
++// are met:
++//  * Redistributions of source code must retain the above copyright
++//    notice, this list of conditions and the following disclaimer.
++//  * Redistributions in binary form must reproduce the above copyright
++//    notice, this list of conditions and the following disclaimer in the
++//    documentation and/or other materials provided with the distribution.
++//  * Neither the name of NVIDIA CORPORATION nor the names of its
++//    contributors may be used to endorse or promote products derived
++//    from this software without specific prior written permission.
++//
++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++// This code modified from the public domain code here: ++// https://gist.github.com/rygorous/2156668 ++// The URL above includes more robust conversion routines ++// that handle Inf and NaN correctly. ++// ++// It is recommended to use the more robust versions in production code. ++ ++typedef unsigned uint; ++ ++union FP32 ++{ ++ uint u; ++ float f; ++ struct ++ { ++ uint Mantissa : 23; ++ uint Exponent : 8; ++ uint Sign : 1; ++ }; ++}; ++ ++union FP16 ++{ ++ unsigned short u; ++ struct ++ { ++ uint Mantissa : 10; ++ uint Exponent : 5; ++ uint Sign : 1; ++ }; ++}; ++ ++// Approximate solution. This is faster but converts some sNaNs to ++// infinity and doesn't round correctly. Handle with care. ++// Approximate solution. This is faster but converts some sNaNs to ++// infinity and doesn't round correctly. Handle with care. ++static half approx_float_to_half(float fl) ++{ ++ FP32 f32infty = { 255 << 23 }; ++ FP32 f16max = { (127 + 16) << 23 }; ++ FP32 magic = { 15 << 23 }; ++ FP32 expinf = { (255 ^ 31) << 23 }; ++ uint sign_mask = 0x80000000u; ++ FP16 o = { 0 }; ++ ++ FP32 f = *((FP32*)&fl); ++ ++ uint sign = f.u & sign_mask; ++ f.u ^= sign; ++ ++ if (!(f.f < f32infty.u)) // Inf or NaN ++ o.u = f.u ^ expinf.u; ++ else ++ { ++ if (f.f > f16max.f) f.f = f16max.f; ++ f.f *= magic.f; ++ } ++ ++ o.u = f.u >> 13; // Take the mantissa bits ++ o.u |= sign >> 16; ++ half tmp; ++ memcpy(&tmp, &o, sizeof(half)); ++ //return *((half*)&o); ++ return tmp; ++} ++ ++// from half->float code - just for verification. ++static float half_to_float(half hf) ++{ ++ FP16 h; ++ memcpy(&h, &hf, sizeof(half)); ++ ++ static const FP32 magic = { 113 << 23 }; ++ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift ++ FP32 o; ++ ++ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits ++ uint exp = shifted_exp & o.u; // just the exponent ++ o.u += (127 - 15) << 23; // exponent adjust ++ ++ // handle exponent special cases ++ if (exp == shifted_exp) // Inf/NaN? ++ o.u += (128 - 16) << 23; // extra exp adjust ++ else if (exp == 0) // Zero/Denormal? 
++ { ++ o.u += 1 << 23; // extra exp adjust ++ o.f -= magic.f; // renormalize ++ } ++ ++ o.u |= (h.u & 0x8000) << 16; // sign bit ++ return o.f; ++} ++#endif ++ + #include "magma_internal.h" + //#include "nvToolsExt.h" + +@@ -106,10 +231,13 @@ + float c_one = MAGMA_S_ONE; + float c_neg_one = MAGMA_S_NEG_ONE; + #if 1 ++ #if CUDA_VERSION >= 9020 + const magmaHalf h_one = (magmaHalf) 1.0; + const magmaHalf h_neg_one = (magmaHalf)-1.0; +- //const magmaHalf h_one = approx_float_to_half(1.0); +- //const magmaHalf h_neg_one = approx_float_to_half(-1.0); ++ #else ++ const magmaHalf h_one = approx_float_to_half(1.0); ++ const magmaHalf h_neg_one = approx_float_to_half(-1.0); ++ #endif + #else + FP32 float_one = *((FP32*)&c_one); + FP16 half_one = float_to_half_full(float_one); +diff -r 89706c0efbdb src/xshgetrf_gpu.cpp +--- a/src/xshgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500 ++++ b/src/xshgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700 +@@ -92,7 +92,7 @@ + magma_mp_type_t enable_tc, + magma_mp_type_t mp_algo_type ) + { +-#if CUDA_VERSION >= 7500 ++#if CUDA_VERSION >= 9000 + #ifdef HAVE_clBLAS + #define dA(i_, j_) dA, (dA_offset + (i_) + (j_)*ldda) + #define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_)) +diff -r 89706c0efbdb testing/testing_hgemm.cpp +--- a/testing/testing_hgemm.cpp Wed Jan 02 14:17:26 2019 -0500 ++++ b/testing/testing_hgemm.cpp Wed Apr 03 15:50:54 2019 -0700 +@@ -22,6 +22,131 @@ + #include "magma_operators.h" + #include "testings.h" + ++#if CUDA_VERSION < 9020 ++// conversion float to half are not defined for host in CUDA version <9.2 ++// thus uses the conversion below when CUDA VERSION is < 9.2. ++#include ++// ++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved. ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions ++// are met: ++// * Redistributions of source code must retain the above copyright ++// notice, this list of conditions and the following disclaimer. ++// * Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// * Neither the name of NVIDIA CORPORATION nor the names of its ++// contributors may be used to endorse or promote products derived ++// from this software without specific prior written permission. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY ++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ++// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++// This code modified from the public domain code here: ++// https://gist.github.com/rygorous/2156668 ++// The URL above includes more robust conversion routines ++// that handle Inf and NaN correctly. ++// ++// It is recommended to use the more robust versions in production code. 
++ ++typedef unsigned uint; ++ ++union FP32 ++{ ++ uint u; ++ float f; ++ struct ++ { ++ uint Mantissa : 23; ++ uint Exponent : 8; ++ uint Sign : 1; ++ }; ++}; ++ ++union FP16 ++{ ++ unsigned short u; ++ struct ++ { ++ uint Mantissa : 10; ++ uint Exponent : 5; ++ uint Sign : 1; ++ }; ++}; ++ ++// Approximate solution. This is faster but converts some sNaNs to ++// infinity and doesn't round correctly. Handle with care. ++// Approximate solution. This is faster but converts some sNaNs to ++// infinity and doesn't round correctly. Handle with care. ++static half approx_float_to_half(float fl) ++{ ++ FP32 f32infty = { 255 << 23 }; ++ FP32 f16max = { (127 + 16) << 23 }; ++ FP32 magic = { 15 << 23 }; ++ FP32 expinf = { (255 ^ 31) << 23 }; ++ uint sign_mask = 0x80000000u; ++ FP16 o = { 0 }; ++ ++ FP32 f = *((FP32*)&fl); ++ ++ uint sign = f.u & sign_mask; ++ f.u ^= sign; ++ ++ if (!(f.f < f32infty.u)) // Inf or NaN ++ o.u = f.u ^ expinf.u; ++ else ++ { ++ if (f.f > f16max.f) f.f = f16max.f; ++ f.f *= magic.f; ++ } ++ ++ o.u = f.u >> 13; // Take the mantissa bits ++ o.u |= sign >> 16; ++ half tmp; ++ memcpy(&tmp, &o, sizeof(half)); ++ //return *((half*)&o); ++ return tmp; ++} ++ ++// from half->float code - just for verification. ++static float half_to_float(half hf) ++{ ++ FP16 h; ++ memcpy(&h, &hf, sizeof(half)); ++ ++ static const FP32 magic = { 113 << 23 }; ++ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift ++ FP32 o; ++ ++ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits ++ uint exp = shifted_exp & o.u; // just the exponent ++ o.u += (127 - 15) << 23; // exponent adjust ++ ++ // handle exponent special cases ++ if (exp == shifted_exp) // Inf/NaN? ++ o.u += (128 - 16) << 23; // extra exp adjust ++ else if (exp == 0) // Zero/Denormal? ++ { ++ o.u += 1 << 23; // extra exp adjust ++ o.f -= magic.f; // renormalize ++ } ++ ++ o.u |= (h.u & 0x8000) << 16; // sign bit ++ return o.f; ++} ++#endif ++ + /* //////////////////////////////////////////////////////////////////////////// + -- Testing sgemm + */ +@@ -47,8 +172,13 @@ + float c_neg_one = MAGMA_S_NEG_ONE; + float alpha = MAGMA_S_MAKE( 0.29, -0.86 ); + float beta = MAGMA_S_MAKE( -0.48, 0.38 ); +- magmaHalf h_alpha = (magmaHalf)alpha; +- magmaHalf h_beta = (magmaHalf)beta; ++ #if CUDA_VERSION >= 9020 ++ const magmaHalf h_alpha = (magmaHalf) alpha; ++ const magmaHalf h_beta = (magmaHalf) beta; ++ #else ++ const magmaHalf h_alpha = approx_float_to_half(alpha); ++ const magmaHalf h_beta = approx_float_to_half(beta); ++ #endif + magma_opts opts; + opts.parse_opts( argc, argv ); + diff --git a/scripts/uberenv/packages/magma/package.py b/scripts/uberenv/packages/magma/package.py new file mode 100644 index 00000000..8d37bec6 --- /dev/null +++ b/scripts/uberenv/packages/magma/package.py @@ -0,0 +1,125 @@ +# Copyright 2013-2021 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +from spack import * + + +class Magma(CMakePackage, CudaPackage): + """The MAGMA project aims to develop a dense linear algebra library similar + to LAPACK but for heterogeneous/hybrid architectures, starting with + current "Multicore+GPU" systems. 
+ """ + + homepage = "http://icl.cs.utk.edu/magma/" + url = "http://icl.cs.utk.edu/projectsfiles/magma/downloads/magma-2.2.0.tar.gz" + maintainers = ['stomov', 'luszczek'] + + version('2.5.4', sha256='7734fb417ae0c367b418dea15096aef2e278a423e527c615aab47f0683683b67') + version('2.5.3', sha256='c602d269a9f9a3df28f6a4f593be819abb12ed3fa413bba1ff8183de721c5ef6') + version('2.5.2', sha256='065feb85558f9dd6f4cc4db36ac633a3f787827fc832d0b578a049a43a195620') + version('2.5.1', sha256='ce32c199131515336b30c92a907effe0c441ebc5c5bdb255e4b06b2508de109f') + version('2.5.0', sha256='4fd45c7e46bd9d9124253e7838bbfb9e6003c64c2c67ffcff02e6c36d2bcfa33') + version('2.4.0', sha256='4eb839b1295405fd29c8a6f5b4ed578476010bf976af46573f80d1169f1f9a4f') + version('2.3.0', sha256='010a4a057d7aa1e57b9426bffc0958f3d06913c9151463737e289e67dd9ea608') + version('2.2.0', sha256='df5d4ace417e5bf52694eae0d91490c6bde4cde1b0da98e8d400c5c3a70d83a2') + + variant('fortran', default=True, + description='Enable Fortran bindings support') + variant('shared', default=True, + description='Enable shared library') + variant('cuda', default=True, description='Build with CUDA') + variant('cuda_arch', default='none', multi=True, + description='Specify CUDA architecture(s)') + + # corbett5 added this variant + variant('fortran_convention', default='default', description='LAPACK/BLAS mangling scheme', + values=('default', 'add_', 'nochange', 'upcase'), multi=False) + + depends_on('blas') + depends_on('lapack') + depends_on('cuda@8:', when='@2.5.1:') # See PR #14471 + + conflicts('~cuda', msg='Magma requires cuda') + conflicts('cuda_arch=none', + msg='Please indicate a CUDA arch value or values') + + # currently not compatible with CUDA-11 + # https://bitbucket.org/icl/magma/issues/22/cuda-11-changes-issue + # https://bitbucket.org/icl/magma/issues/25/error-cusparsesolveanalysisinfo_t-does-not + conflicts('^cuda@11:', when='@:2.5.3') + + patch('ibm-xl.patch', when='@2.2:2.5.0%xl') + patch('ibm-xl.patch', when='@2.2:2.5.0%xl_r') + patch('magma-2.3.0-gcc-4.8.patch', when='@2.3.0%gcc@:4.8') + patch('magma-2.5.0.patch', when='@2.5.0') + patch('magma-2.5.0-cmake.patch', when='@2.5.0') + patch('cmake-W.patch', when='@2.5.0:%nvhpc') + + def cmake_args(self): + spec = self.spec + options = [] + + options.extend([ + '-DCMAKE_INSTALL_PREFIX=%s' % self.prefix, + '-DCMAKE_INSTALL_NAME_DIR:PATH=%s/lib' % self.prefix, + '-DBLAS_LIBRARIES=%s' % spec['blas'].libs.joined(';'), + # As of MAGMA v2.3.0, CMakeLists.txt does not use the variable + # BLAS_LIBRARIES, but only LAPACK_LIBRARIES, so we need to + # explicitly add blas to LAPACK_LIBRARIES. 
+ '-DLAPACK_LIBRARIES=%s' % + (spec['lapack'].libs + spec['blas'].libs).joined(';') + ]) + + options += ['-DBUILD_SHARED_LIBS=%s' % + ('ON' if ('+shared' in spec) else 'OFF')] + + if '+fortran' in spec: + options.extend([ + '-DUSE_FORTRAN=yes' + ]) + if spec.satisfies('%xl') or spec.satisfies('%xl_r'): + options.extend([ + '-DCMAKE_Fortran_COMPILER=%s' % self.compiler.f77 + ]) + + # corbett5 added this else block + else: + options.extend([ + '-DUSE_FORTRAN=no' + ]) + + if spec.satisfies('^cuda'): + cuda_arch = self.spec.variants['cuda_arch'].value + if '@:2.2.0' in spec: + capabilities = ' '.join('sm{0}'.format(i) for i in cuda_arch) + options.extend(['-DGPU_TARGET=' + capabilities]) + else: + capabilities = ' '.join('sm_{0}'.format(i) for i in cuda_arch) + options.extend(['-DGPU_TARGET=' + capabilities]) + + if '@2.5.0' in spec: + options.extend(['-DMAGMA_SPARSE=OFF']) + if spec.compiler.name in ['xl', 'xl_r']: + options.extend(['-DCMAKE_DISABLE_FIND_PACKAGE_OpenMP=TRUE']) + + # corbett5 added these definitions + if spec.variants['fortran_convention'].value == 'add_': + options.extend(['-DFORTRAN_CONVENTION=-DADD_']) + + if spec.variants['fortran_convention'].value == 'nochange': + options.extend(['-DFORTRAN_CONVENTION=-DNOCHANGE']) + + if spec.variants['fortran_convention'].value == 'upcase': + options.extend(['-DFORTRAN_CONVENTION=-DUPCASE']) + + return options + + @run_after('install') + def post_install(self): + install('magmablas/atomics.cuh', self.prefix.include) + install('control/magma_threadsetting.h', self.prefix.include) + install('control/pthread_barrier.h', self.prefix.include) + install('control/magma_internal.h', self.prefix.include) diff --git a/scripts/uberenv/project.json b/scripts/uberenv/project.json index 9822f975..703db7a4 100644 --- a/scripts/uberenv/project.json +++ b/scripts/uberenv/project.json @@ -3,8 +3,8 @@ "package_version" : "develop", "package_final_phase" : "hostconfig", "package_source_dir" : "../..", - "spack_url": "https://github.com/corbett5/spack", - "spack_branch": "package/corbett/lvarray-update", + "spack_url": "https://github.com/spack/spack", + "spack_branch": "v0.19.0", "spack_activate" : {}, "spack_clean_packages": ["lvarray"], "build_jobs": 100 diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml index b1bf26cb..652d26c4 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml @@ -1,31 +1,91 @@ compilers: - compiler: - spec: clang@10.0.1 + spec: clang@upstream-2019.03.19 paths: - cc: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang++ + cc: /usr/tce/packages/clang/clang-upstream-2019.03.19/bin/clang + cxx: /usr/tce/packages/clang/clang-upstream-2019.03.19/bin/clang++ + f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf_r + fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf_r + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@11.0.1 + paths: + cc: /usr/tce/packages/clang/clang-11.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-11.0.1/bin/clang++ f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: 
/usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] - compiler: - spec: clang@11.0.1 + spec: clang@12.0.1 + paths: + cc: /usr/tce/packages/clang/clang-12.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-12.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@13.0.1 + paths: + cc: /usr/tce/packages/clang/clang-13.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-13.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@14.0.4 paths: - cc: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang++ + cc: /usr/tce/packages/clang/clang-14.0.4/bin/clang + cxx: /usr/tce/packages/clang/clang-14.0.4/bin/clang++ f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@7.3.0 + paths: + cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++ + f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] @@ -37,25 +97,55 @@ compilers: f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@9.3.1 + paths: + cc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-9.3.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@10.2.1 + paths: + cc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gcc + cxx: 
/usr/tce/packages/gcc/gcc-10.2.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] - compiler: - spec: xl@16.1.1 + spec: intel@19.1.2 paths: - cc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlc - cxx: /usr/tce/packages/xl/xl-2021.03.11/bin/xlC - f77: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf - fc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf + cc: /usr/tce/packages/intel/intel-19.1.2/bin/icc + cxx: /usr/tce/packages/intel/intel-19.1.2/bin/icpc + f77: /usr/tce/packages/intel/intel-19.1.2/bin/ifort + fc: /usr/tce/packages/intel/intel-19.1.2/bin/ifort flags: - cflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036 - cxxflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036 + cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/gcc -march=native -mtune=native + cxxflags: -gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++ -march=native -mtune=native operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml index d054887c..575d66db 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml @@ -2,6 +2,21 @@ packages: all: target: [default] compiler: [gcc, clang, xl] + providers: + blas: [netlib-lapack, essl] + lapack: [netlib-lapack, essl] + + netlib-lapack: + buildable: False + externals: + - spec: netlib-lapack@3.10.0 ~external-blas + prefix: /usr/tcetmp/packages/lapack/lapack-3.10.0-P9-xl-2022.03.10/ + + essl: + buildable: False + externals: + - spec: essl@6.2.1 ~ilp64 threads=openmp +cuda +lapack + prefix: /usr/tcetmp/packages/essl/essl-6.2.1/ cuda: buildable: False diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml deleted file mode 100644 index 3d9648a7..00000000 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml +++ /dev/null @@ -1,76 +0,0 @@ -compilers: -- compiler: - spec: clang@10.0.1 - paths: - cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@11.0.1 - paths: - cc: /usr/tce/packages/clang/clang-11.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-11.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@7.3.0 - paths: - cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++ - f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran - fc: 
/usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@8.3.1 - paths: - cc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-8.3.1/bin/g++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: intel@19.1.2 - paths: - cc: /usr/tce/packages/intel/intel-19.1.2/bin/icc - cxx: /usr/tce/packages/intel/intel-19.1.2/bin/icpc - f77: /usr/tce/packages/intel/intel-19.1.2/bin/ifort - fc: /usr/tce/packages/intel/intel-19.1.2/bin/ifort - flags: - cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/gcc -march=native -mtune=native - cxxflags: -gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++ -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml index 0c6b833b..43971e78 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml @@ -107,3 +107,10 @@ packages: externals: - spec: pkg-config@0.27.1 prefix: /usr/bin/ + + ninja: + buildable: False + externals: + - spec: ninja@1.11.0 + prefix: /usr/tce/packages/ninja/ninja-1.11.0 + diff --git a/scripts/uberenv/spack_configs/toss_4_x86_64_ib/compilers.yaml b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/compilers.yaml new file mode 100644 index 00000000..15bdbccd --- /dev/null +++ b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/compilers.yaml @@ -0,0 +1,31 @@ +compilers: +- compiler: + spec: clang@14.0.6 + paths: + cc: /usr/tce/packages/clang/clang-14.0.6-magic/bin/clang + cxx: /usr/tce/packages/clang/clang-14.0.6-magic/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel8 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@12.1.1 + paths: + cc: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/g++ + f77: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel8 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/packages.yaml b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml similarity index 72% rename from scripts/uberenv/spack_configs/toss_3_x86_64_ib/packages.yaml rename to scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml index e7ed36f4..d3d2714a 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml @@ -2,6 +2,15 @@ packages: all: target: [default] compiler: [gcc, clang, intel] + providers: + blas: [intel-oneapi-mkl] + lapack: [intel-oneapi-mkl] + + intel-oneapi-mkl: + 
buildable: False + externals: + - spec: intel-oneapi-mkl@2022.1.0 + prefix: /usr/tce/backend/installations/linux-rhel8-x86_64/intel-19.0.4/intel-oneapi-mkl-2022.1.0-sksz67twjxftvwchnagedk36gf7plkrp/ cmake: buildable: False diff --git a/src/Array.hpp b/src/Array.hpp index d05769cd..503d4750 100644 --- a/src/Array.hpp +++ b/src/Array.hpp @@ -91,10 +91,10 @@ class Array : public ArrayView< T, { this->m_strides = indexing::calculateStrides< PERMUTATION >( this->m_dims ); -#if !defined(__CUDA_ARCH__) +#if !defined(LVARRAY_DEVICE_COMPILE) setName( "" ); #endif -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) Array::TV_ttf_display_type( nullptr ); #endif } @@ -121,10 +121,10 @@ class Array : public ArrayView< T, { this->m_strides = indexing::calculateStrides< PERMUTATION >( this->m_dims ); -#if !defined(__CUDA_ARCH__) +#if !defined(LVARRAY_DEVICE_COMPILE) setName( "" ); #endif -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) Array::TV_ttf_display_type( nullptr ); #endif } @@ -588,7 +588,7 @@ class Array : public ArrayView< T, void setName( std::string const & name ) { this->m_dataBuffer.template setName< decltype(*this) >( name ); } -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) /** * @brief Static function that will be used by Totalview to display the array contents. * @param av A pointer to the array that is being displayed. diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 706f2014..5efb4bc0 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -252,6 +252,7 @@ class ArrayOfArraysView * @param src the SparsityPatternView to be moved from. * @return *this. */ + LVARRAY_HOST_DEVICE inline ArrayOfArraysView & operator=( ArrayOfArraysView && src ) { @@ -587,6 +588,9 @@ class ArrayOfArraysView #if defined(LVARRAY_USE_CUDA) if( space == MemorySpace::cuda ) touch = false; + #endif + #if defined(LVARRAY_USE_HIP) + if( space == MemorySpace::hip ) touch = false; #endif m_offsets.move( space, touch ); } @@ -725,12 +729,11 @@ class ArrayOfArraysView auto const fillOffsets = [&]() { m_offsets[ 0 ] = 0; -// RAJA::inclusive_scan< POLICY >( capacities, -// capacities + numSubArrays, -// m_offsets.data() + 1 ); - - RAJA::inclusive_scan< POLICY >( RAJA::make_span< INDEX_TYPE const * >( capacities, numSubArrays ), - RAJA::make_span< INDEX_TYPE * >( m_offsets.data()+1, numSubArrays ) ); +#if ( RAJA_VERSION_MAJOR == 1 && RAJA_VERSION_MINOR >= 13 ) || ( RAJA_VERSION_MAJOR > 1 ) + RAJA::inclusive_scan< POLICY >( RAJA::make_span( capacities, numSubArrays ), RAJA::make_span( m_offsets.data() + 1, numSubArrays ) ); +#else + RAJA::inclusive_scan< POLICY >( capacities, capacities + numSubArrays, m_offsets.data() + 1 ); +#endif }; resizeFromOffsetsImpl( numSubArrays, fillOffsets, buffers ... 
);
 }
diff --git a/src/ArraySlice.hpp b/src/ArraySlice.hpp
index b4e22345..84357d8b 100644
--- a/src/ArraySlice.hpp
+++ b/src/ArraySlice.hpp
@@ -126,7 +126,7 @@ class ArraySlice
     m_dims( inputDimensions ),
     m_strides( inputStrides )
   {
-#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) && defined(LVARRAY_BOUNDS_CHECK)
+#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) && defined(LVARRAY_BOUNDS_CHECK)
     ArraySlice::TV_ttf_display_type( nullptr );
 #endif
   }
@@ -308,6 +308,15 @@ class ArraySlice
     return m_data[ linearIndex( indices ... ) ];
   }
 
+  /**
+   * @return A pointer to the values.
+   */
+  LVARRAY_HOST_DEVICE inline constexpr
+  T * data() const
+  {
+    return m_data;
+  }
+
   /**
    * @return Return a pointer to the values.
    * @tparam USD_ Dummy template parameter, do not specify.
@@ -341,7 +350,7 @@ class ArraySlice
 
   ///@}
 
-#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) && defined(LVARRAY_BOUNDS_CHECK)
+#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) && defined(LVARRAY_BOUNDS_CHECK)
 /**
  * @brief Static function that will be used by Totalview to display the array contents.
  * @param av A pointer to the array that is being displayed.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 03f627c2..8d4ad2ca 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -38,7 +38,7 @@ set( lvarray_headers
      sortedArrayManipulationHelpers.hpp
      system.hpp
      tensorOps.hpp
-     totalview/tv_data_display.h
+#     totalview/tv_data_display.h
      typeManipulation.hpp
      umpireInterface.hpp )
 
@@ -46,7 +46,7 @@ blt_list_append( TO lvarray_headers ELEMENTS ChaiBuffer.hpp IF ENABLE_CHAI )
 set( lvarray_sources
      system.cpp
-     totalview/tv_data_display.c
+#     totalview/tv_data_display.c
      umpireInterface.cpp )
 
 blt_add_library( NAME lvarray
@@ -79,3 +79,7 @@ lvarray_add_code_checks( PREFIX lvarray
 if( ENABLE_PYLVARRAY )
   add_subdirectory( python )
 endif()
+
+if( ENABLE_LAPACK )
+  add_subdirectory( dense )
+endif()
diff --git a/src/CRSMatrixView.hpp b/src/CRSMatrixView.hpp
index bc954672..9a8bbca5 100644
--- a/src/CRSMatrixView.hpp
+++ b/src/CRSMatrixView.hpp
@@ -111,7 +111,6 @@ class CRSMatrixView : protected SparsityPatternView< COL_TYPE, INDEX_TYPE, BUFFE
   /**
    * @brief Default move constructor.
   
*/ - inline CRSMatrixView( CRSMatrixView && ) = default; /** diff --git a/src/ChaiBuffer.hpp b/src/ChaiBuffer.hpp index b5d26fa1..83e8c254 100644 --- a/src/ChaiBuffer.hpp +++ b/src/ChaiBuffer.hpp @@ -56,7 +56,11 @@ inline chai::ExecutionSpace toChaiExecutionSpace( MemorySpace const space ) if( space == MemorySpace::host ) return chai::CPU; #if defined(LVARRAY_USE_CUDA) - if( space == MemorySpace::cuda || space == MemorySpace::hip ) + if( space == MemorySpace::cuda ) + return chai::GPU; +#endif +#if defined(LVARRAY_USE_HIP) + if( space == MemorySpace::hip ) return chai::GPU; #endif @@ -79,6 +83,10 @@ inline MemorySpace toMemorySpace( chai::ExecutionSpace const space ) if( space == chai::GPU ) return MemorySpace::cuda; #endif +#if defined(LVARRAY_USE_HIP) + if( space == chai::GPU ) + return MemorySpace::hip; +#endif LVARRAY_ERROR( "Unrecognized execution space " << static_cast< int >( space ) ); @@ -185,7 +193,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) + #if defined(LVARRAY_USE_DEVICE) && !defined(LVARRAY_DEVICE_COMPILE) move( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), true ); #endif } @@ -203,7 +211,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) + #if defined(LVARRAY_USE_DEVICE) && !defined(LVARRAY_DEVICE_COMPILE) moveNested( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), size, true ); #else LVARRAY_UNUSED_VARIABLE( size ); @@ -370,7 +378,7 @@ class ChaiBuffer inline void moveNested( MemorySpace const space, std::ptrdiff_t const size, bool const touch ) const { - #if defined(LVARRAY_USE_CUDA) + #if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP ) chai::ExecutionSpace const chaiSpace = internal::toChaiExecutionSpace( space ); if( m_pointerRecord == nullptr || m_capacity == 0 || @@ -398,16 +406,15 @@ class ChaiBuffer */ void move( MemorySpace const space, bool const touch ) const { - #if defined(LVARRAY_USE_CUDA) + #if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) chai::ExecutionSpace const chaiSpace = internal::toChaiExecutionSpace( space ); if( m_pointerRecord == nullptr || m_capacity == 0 || chaiSpace == chai::NONE ) return; + auto & am = internal::getArrayManager(); const_cast< T * & >( m_pointer ) = - static_cast< T * >( internal::getArrayManager().move( const_cast< T_non_const * >( m_pointer ), - m_pointerRecord, - chaiSpace ) ); + static_cast< T * >( am.move( const_cast< T_non_const * >( m_pointer ), m_pointerRecord, chaiSpace ) ); if( !std::is_const< T >::value && touch ) m_pointerRecord->m_touched[ chaiSpace ] = true; m_pointerRecord->m_last_space = chaiSpace; diff --git a/src/LvArrayConfig.hpp.in b/src/LvArrayConfig.hpp.in index 2c997ab5..bf48242a 100644 --- a/src/LvArrayConfig.hpp.in +++ b/src/LvArrayConfig.hpp.in @@ -26,8 +26,12 @@ #cmakedefine LVARRAY_USE_CUDA +#cmakedefine LVARRAY_USE_HIP + #cmakedefine LVARRAY_USE_TOTALVIEW_OUTPUT #cmakedefine LVARRAY_USE_CALIPER +#cmakedefine LVARRAY_USE_MAGMA + #cmakedefine LVARRAY_ADDR2LINE_EXEC @LVARRAY_ADDR2LINE_EXEC@ diff --git a/src/Macros.hpp b/src/Macros.hpp index 544f5e19..82cf24d1 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -22,10 +22,33 @@ #include #include -#if defined(LVARRAY_USE_CUDA) - #include + +#if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) +/// Macro defined when using a device. 
+#define LVARRAY_USE_DEVICE #endif +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +/// Macro defined when currently compiling on device (only defined in the device context). +#define LVARRAY_DEVICE_COMPILE +/// Marks a function/lambda for inlining +#define LVARRAY_FORCE_INLINE __forceinline__ +#else +/// Marks a function/lambda for inlining +#define LVARRAY_FORCE_INLINE inline +#endif + +#if defined(__CUDACC__) || defined(__HIPCC__) +// Denotes whether to define decorator macros later in this file. +#define LVARRAY_DECORATE +#endif + + + +//#if !defined(NDEBUG) && defined(LVARRAY_DEVICE_COMPILE) + #include +//#endif + /** * @brief Convert @p A into a string. * @param A the token to convert to a string. @@ -38,6 +61,8 @@ */ #define STRINGIZE( A ) STRINGIZE_NX( A ) +//#pragma message "LVARRAY_DEVICE_COMPILE: " STRINGIZE(LVARRAY_DEVICE_COMPILE) + /** * @brief Mark @p X as an unused argument, used to silence compiler warnings. * @param X the unused argument. @@ -91,8 +116,11 @@ * and a stack trace along with the provided message. On device none of this is * guaranteed. In fact it is only guaranteed to abort the current kernel. */ -#if defined(__CUDA_ARCH__) - #if !defined(NDEBUG) +// cce processes __host__ functions with __hip_device_compile__=1 when -x hip? +// the entire compilation unit has __hip_device_compile__=1, whereas __cuda_arch__ +// seems to be scope-defined as it isn't defined in __host__ functions +#if defined(LVARRAY_DEVICE_COMPILE) + #if !defined(NDEBUG) || __HIP_DEVICE_COMPILE__ #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ { \ @@ -535,7 +563,7 @@ */ #define LVARRAY_ASSERT_GE( lhs, rhs ) LVARRAY_ASSERT_GE_MSG( lhs, rhs, "" ) -#if defined(LVARRAY_USE_CUDA) && defined(__CUDACC__) +#if defined(LVARRAY_DECORATE) /// Mark a function for both host and device usage. #define LVARRAY_HOST_DEVICE __host__ __device__ @@ -549,8 +577,12 @@ * call host only code. This is safe as long as the host only instantiations are only called on * the host. To use place directly above a the template. */ +#if defined(LVARRAY_USE_CUDA) #define DISABLE_HD_WARNING _Pragma("hd_warning_disable") #else +#define DISABLE_HD_WARNING +#endif +#else /// Mark a function for both host and device usage. #define LVARRAY_HOST_DEVICE diff --git a/src/SortedArrayView.hpp b/src/SortedArrayView.hpp index ab7ca790..8559a3fc 100644 --- a/src/SortedArrayView.hpp +++ b/src/SortedArrayView.hpp @@ -274,6 +274,9 @@ class SortedArrayView { #if defined(LVARRAY_USE_CUDA) if( space == MemorySpace::cuda ) touch = false; + #endif + #if defined(LVARRAY_USE_HIP) + if( space == MemorySpace::hip ) touch = false; #endif m_values.move( space, touch ); } diff --git a/src/arrayManipulation.hpp b/src/arrayManipulation.hpp index 5409e60f..21f708e1 100644 --- a/src/arrayManipulation.hpp +++ b/src/arrayManipulation.hpp @@ -298,7 +298,7 @@ void resize( T * const LVARRAY_RESTRICT ptr, if( newSize - size > 0 ) { std::size_t const sizeDiff = integerConversion< std::size_t >( newSize - size ); - std::memset( reinterpret_cast< void * >( ptr + size ), 0, ( sizeDiff ) * sizeof( T ) ); + memset( reinterpret_cast< void * >( ptr + size ), 0, ( sizeDiff ) * sizeof( T ) ); } } else diff --git a/src/bufferManipulation.hpp b/src/bufferManipulation.hpp index 62b94539..83e5a00e 100644 --- a/src/bufferManipulation.hpp +++ b/src/bufferManipulation.hpp @@ -277,7 +277,7 @@ void resize( BUFFER & buf, std::ptrdiff_t const size, std::ptrdiff_t const newSi arrayManipulation::resize( buf.data(), size, newSize, std::forward< ARGS >( args )... 
); -#if !defined(__CUDA_ARCH__) +#if !defined(LVARRAY_DEVICE_COMPILE) if( newSize > 0 ) { buf.registerTouch( MemorySpace::host ); diff --git a/src/dense/BlasLapackInterface.cpp b/src/dense/BlasLapackInterface.cpp new file mode 100644 index 00000000..ca4309c5 --- /dev/null +++ b/src/dense/BlasLapackInterface.cpp @@ -0,0 +1,210 @@ +#include "BlasLapackInterface.hpp" +#include "backendHelpers.hpp" + +extern "C" +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_SGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( sgemm ) +void LVARRAY_SGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + float const * ALPHA, + float const * A, + int const * LDA, + float const * B, + int const * LDB, + float const * BETA, + float * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_DGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( dgemm ) +void LVARRAY_DGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + double const * ALPHA, + double const * A, + int const * LDA, + double const * B, + int const * LDB, + double const * BETA, + double * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( cgemm ) +void LVARRAY_CGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + std::complex< float > const * ALPHA, + std::complex< float > const * A, + int const * LDA, + std::complex< float > const * B, + int const * LDB, + std::complex< float > const * BETA, + std::complex< float > * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( zgemm ) +void LVARRAY_ZGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + std::complex< double > const * ALPHA, + std::complex< double > const * A, + int const * LDA, + std::complex< double > const * B, + int const * LDB, + std::complex< double > const * BETA, + std::complex< double > * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_SGESV LVARRAY_LAPACK_FORTRAN_MANGLE( sgesv ) +void LVARRAY_SGESV( + int const * N, + int const * NRHS, + float * A, + int const * LDA, + int * IPIV, + float * B, + int const * LDB, + int * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_DGESV LVARRAY_LAPACK_FORTRAN_MANGLE( dgesv ) +void LVARRAY_DGESV( + int const * N, + int const * NRHS, + double * A, + int const * LDA, + int * IPIV, + double * B, + int const * LDB, + int * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CGESV LVARRAY_LAPACK_FORTRAN_MANGLE( cgesv ) +void LVARRAY_CGESV( + int const * N, + int const * NRHS, + std::complex< float > * A, + int const * LDA, + int * IPIV, + std::complex< float > * B, + int const * LDB, + int * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZGESV LVARRAY_LAPACK_FORTRAN_MANGLE( zgesv ) +void LVARRAY_ZGESV( + int const * N, + int const * NRHS, + 
std::complex< double > * A, + int const * LDA, + int * IPIV, + std::complex< double > * B, + int const * LDB, + int * INFO ); + +} // extern "C" + +namespace LvArray +{ +namespace dense +{ + +char toLapackChar( Operation const op ) +{ + if( op == Operation::NO_OP ) return 'N'; + if( op == Operation::TRANSPOSE ) return 'T'; + if( op == Operation::ADJOINT ) return 'C'; + + LVARRAY_ERROR( "Unknown operation: " << int( op ) ); + return '\0'; +} + + +template< typename T > +void BlasLapackInterface< T >::gemm( + Operation opA, + Operation opB, + T const alpha, + Matrix< T const > const & A, + Matrix< T const > const & B, + T const beta, + Matrix< T > const & C ) +{ + char const TRANSA = toLapackChar( opA ); + char const TRANSB = toLapackChar( opB ); + int const M = C.sizes[ 0 ]; + int const N = C.sizes[ 1 ]; + int const K = opA == Operation::NO_OP ? A.sizes[ 1 ] : A.sizes[ 0 ]; + int const LDA = std::max( std::ptrdiff_t{ 1 }, A.strides[ 1 ] ); + int const LDB = std::max( std::ptrdiff_t{ 1 }, B.strides[ 1 ] ); + int const LDC = std::max( std::ptrdiff_t{ 1 }, C.strides[ 1 ] ); + + TypeDispatch< T >::dispatch( LVARRAY_SGEMM, LVARRAY_DGEMM, LVARRAY_CGEMM, LVARRAY_ZGEMM, + &TRANSA, + &TRANSB, + &M, + &N, + &K, + &alpha, + A.data, + &LDA, + B.data, + &LDB, + &beta, + C.data, + &LDC ); +} + + +template< typename T > +void BlasLapackInterface< T >::gesv( + Matrix< T > const & A, + Matrix< T > const & B, + Vector< int > const & pivots ) +{ + int const N = A.sizes[ 0 ]; + int const NRHS = B.sizes[ 1 ]; + int const LDA = A.strides[ 1 ]; + int const LDB = B.strides[ 1 ]; + int INFO = 0; + + TypeDispatch< T >::dispatch( LVARRAY_SGESV, LVARRAY_DGESV, LVARRAY_CGESV, LVARRAY_ZGESV, + &N, + &NRHS, + A.data, + &LDA, + pivots.data, + B.data, + &LDB, + &INFO ); + + LVARRAY_ERROR_IF( INFO < 0, "The " << -INFO << "-th argument had an illegal value." ); + LVARRAY_ERROR_IF( INFO > 0, "The factorization has been completed but U( " << INFO - 1 << ", " << INFO - 1 << + " ) is exactly zero so the solution could not be computed." 
);
+}
+
+template class BlasLapackInterface< float >;
+template class BlasLapackInterface< double >;
+template class BlasLapackInterface< std::complex< float > >;
+template class BlasLapackInterface< std::complex< double > >;
+
+} // namespace dense
+} // namespace LvArray
\ No newline at end of file
diff --git a/src/dense/BlasLapackInterface.hpp b/src/dense/BlasLapackInterface.hpp
new file mode 100644
index 00000000..ed747828
--- /dev/null
+++ b/src/dense/BlasLapackInterface.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+template< typename T >
+struct BlasLapackInterface
+{
+  static constexpr MemorySpace MEMORY_SPACE = MemorySpace::host;
+
+  static void gemm(
+    Operation opA,
+    Operation opB,
+    T const alpha,
+    Matrix< T const > const & A,
+    Matrix< T const > const & B,
+    T const beta,
+    Matrix< T > const & C );
+
+  static void gesv(
+    Matrix< T > const & A,
+    Matrix< T > const & B,
+    Vector< int > const & pivots );
+};
+
+} // namespace dense
+} // namespace LvArray
\ No newline at end of file
diff --git a/src/dense/CMakeLists.txt b/src/dense/CMakeLists.txt
new file mode 100644
index 00000000..8c7b4b0c
--- /dev/null
+++ b/src/dense/CMakeLists.txt
@@ -0,0 +1,41 @@
+set( lvarraydense_headers
+     common.hpp
+     backendHelpers.hpp
+     BlasLapackInterface.hpp
+     )
+
+set( lvarraydense_sources
+     common.cpp
+     BlasLapackInterface.cpp
+     )
+
+set( dependencies lvarray ${lvarray_dependencies} blas lapack )
+
+if( ENABLE_MAGMA )
+  set( dependencies ${dependencies} magma )
+endif()
+
+
+blt_add_library( NAME lvarraydense
+                 SOURCES ${lvarraydense_sources}
+                 HEADERS ${lvarraydense_headers}
+                 DEPENDS_ON ${dependencies}
+                 SHARED TRUE
+                 CLEAR_PREFIX TRUE
+                 )
+
+target_include_directories( lvarraydense
+                            PUBLIC
+                            $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+                            $<INSTALL_INTERFACE:include> )
+
+install( TARGETS lvarraydense
+         EXPORT lvarraydense
+         ARCHIVE DESTINATION lib
+         LIBRARY DESTINATION lib
+         RUNTIME DESTINATION lib )
+
+install( EXPORT lvarraydense
+         DESTINATION share/lvarray/cmake/ )
+
+lvarray_add_code_checks( PREFIX lvarraydense )
diff --git a/src/dense/backendHelpers.hpp b/src/dense/backendHelpers.hpp
new file mode 100644
index 00000000..5de71cf8
--- /dev/null
+++ b/src/dense/backendHelpers.hpp
@@ -0,0 +1,82 @@
+#pragma once
+
+#include <complex>
+
+/// This macro provides a flexible interface for the Fortran naming convention of compiled objects.
+// #ifdef FORTRAN_MANGLE_NO_UNDERSCORE
+#define LVARRAY_LAPACK_FORTRAN_MANGLE( name ) name
+// #else
+// #define LVARRAY_LAPACK_FORTRAN_MANGLE( name ) name ## _
+// #endif
+
+namespace LvArray
+{
+namespace dense
+{
+
+template< typename T >
+struct TypeDispatch
+{};
+
+template<>
+struct TypeDispatch< float >
+{
+  template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... ARGS >
+  static constexpr auto dispatch(
+    F_FLOAT && fFloat,
+    F_DOUBLE &&,
+    F_CFLOAT &&,
+    F_CDOUBLE &&,
+    ARGS && ... args )
+  {
+    return fFloat( std::forward< ARGS >( args ) ... );
+  }
+};
+
+template<>
+struct TypeDispatch< double >
+{
+  template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... ARGS >
+  static constexpr auto dispatch(
+    F_FLOAT &&,
+    F_DOUBLE && fDouble,
+    F_CFLOAT &&,
+    F_CDOUBLE &&,
+    ARGS && ... args )
+  {
+    return fDouble( std::forward< ARGS >( args ) ... );
+  }
+};
+
+template<>
+struct TypeDispatch< std::complex< float > >
+{
+  template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... 
ARGS >
+  static constexpr auto dispatch(
+    F_FLOAT &&,
+    F_DOUBLE &&,
+    F_CFLOAT && fCFloat,
+    F_CDOUBLE &&,
+    ARGS && ... args )
+  {
+    return fCFloat( std::forward< ARGS >( args ) ... );
+  }
+};
+
+template<>
+struct TypeDispatch< std::complex< double > >
+{
+  template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... ARGS >
+  static constexpr auto dispatch(
+    F_FLOAT &&,
+    F_DOUBLE &&,
+    F_CFLOAT &&,
+    F_CDOUBLE && fCDouble,
+    ARGS && ... args )
+  {
+    return fCDouble( std::forward< ARGS >( args ) ... );
+  }
+};
+
+} // namespace dense
+} // namespace LvArray
diff --git a/src/dense/common.cpp b/src/dense/common.cpp
new file mode 100644
index 00000000..b1cab9fe
--- /dev/null
+++ b/src/dense/common.cpp
@@ -0,0 +1,21 @@
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+Operation transposeOp( Operation const op )
+{
+  switch( op )
+  {
+    case Operation::NO_OP: return Operation::TRANSPOSE;
+    case Operation::TRANSPOSE: return Operation::NO_OP;
+    case Operation::ADJOINT: LVARRAY_ERROR( "Not supported" );
+  }
+
+  return Operation::NO_OP;
+}
+
+} // namespace dense
+} // namespace LvArray
\ No newline at end of file
diff --git a/src/dense/common.hpp b/src/dense/common.hpp
new file mode 100644
index 00000000..376b589c
--- /dev/null
+++ b/src/dense/common.hpp
@@ -0,0 +1,323 @@
+#pragma once
+
+#include "../Array.hpp"
+#include "../ChaiBuffer.hpp"
+
+#include <complex>
+
+namespace LvArray
+{
+namespace dense
+{
+namespace internal
+{
+
+/**
+ * TODO make a complex type and add it to the main LvArray. Make a uniform way of interacting with various complex number implementations.
+ */
+template< typename T >
+struct RealVersion
+{
+  using Type = T;
+};
+
+/**
+ *
+ */
+template< typename T >
+struct RealVersion< std::complex< T > >
+{
+  using Type = T;
+};
+
+} // namespace internal
+
+/**
+ *
+ */
+enum class SymmetricMatrixStorageType
+{
+  UPPER_TRIANGULAR,
+  LOWER_TRIANGULAR,
+};
+
+enum class Operation
+{
+  NO_OP,
+  TRANSPOSE,
+  ADJOINT,
+};
+
+Operation transposeOp( Operation const op );
+
+/**
+ *
+ */
+template< typename T >
+using RealVersion = typename internal::RealVersion< T >::Type;
+
+/**
+ *
+ */
+template< typename T >
+static constexpr bool IsComplex = !std::is_same< RealVersion< T >, T >::value;
+
+/**
+ *
+ */
+template< typename T, typename U >
+static constexpr bool IsComplexT = IsComplex< T > && std::is_same< RealVersion< T >, U >::value;
+
+/**
+ *
+ */
+template< typename T >
+struct Matrix
+{
+  Matrix(
+    typeManipulation::CArray< std::ptrdiff_t, 2 > const & sizesIn,
+    typeManipulation::CArray< std::ptrdiff_t, 2 > const & stridesIn,
+    T * const dataIn ):
+    sizes{ sizesIn },
+    strides{ stridesIn },
+    data{ dataIn }
+  {
+    LVARRAY_ERROR_IF_LT( sizes[ 0 ], 0 );
+    LVARRAY_ERROR_IF_LT( sizes[ 1 ], 0 );
+    LVARRAY_ERROR_IF_LT( strides[ 0 ], 0 );
+    LVARRAY_ERROR_IF_LT( strides[ 1 ], 0 );
+  }
+
+  Matrix( T & value ):
+    sizes{ 1, 1 },
+    strides{ 1, 1 },
+    data{ &value }
+  {}
+
+  Matrix( Matrix< std::remove_const_t< T > > const & src ):
+    sizes{ src.sizes },
+    strides{ src.strides },
+    data{ src.data }
+  {}
+
+  bool isSquare() const
+  { return sizes[0] == sizes[1]; }
+
+  bool isColumnMajor() const
+  { return strides[ 0 ] == 1; }
+
+  bool isRowMajor() const
+  { return strides[ 1 ] == 1; }
+
+  bool isContiguous() const
+  { return isColumnMajor() || isRowMajor(); }
+
+  std::ptrdiff_t nRows() const
+  { return sizes[ 0 ]; }
+
+  std::ptrdiff_t nCols() const
+  { return sizes[ 1 ]; }
+
+  Matrix transpose() const
+  {
+    return Matrix( { sizes[ 1 ], 
sizes[ 0 ] }, { strides[ 1 ], strides[ 0 ] }, data ); + } + + typeManipulation::CArray< std::ptrdiff_t, 2 > sizes; + typeManipulation::CArray< std::ptrdiff_t, 2 > strides; + T * data; +}; + +template< typename T, typename PERM, typename INDEX_TYPE, template< typename > class BUFFER_TYPE > +Matrix< T > toMatrix( + Array< T, 2, PERM, INDEX_TYPE, BUFFER_TYPE > const & array, + MemorySpace const space, + bool const touch ) +{ + array.move( space, touch ); + return Matrix< T >( array.dimsArray(), array.stridesArray(), array.data() ); +} + +/** + * + */ +template< typename T > +struct Vector +{ + template< int USD, typename INDEX_TYPE > + Vector( ArraySlice< T, 1, USD, INDEX_TYPE > const & slice ): + size{ integerConversion< std::ptrdiff_t >( slice.size() ) }, + stride{ integerConversion< std::ptrdiff_t >( slice.stride( 0 ) ) }, + data{ slice.data() } + {} + + Vector( T & value ): + size{ 1 }, + stride{ 1 }, + data{ &value } + {} + + std::ptrdiff_t const size; + std::ptrdiff_t const stride; + T * const data; +}; + +/** + * + */ +template< typename T > +struct Workspace +{ + virtual ~Workspace() + {}; + + virtual Vector< T > work() = 0; + + virtual Vector< T > work2() = 0; + + virtual Vector< T > work3() = 0; + + virtual Vector< RealVersion< T > > rwork() = 0; + + virtual Vector< int > iwork() = 0; + + virtual void resizeWork( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; + + virtual void resizeWork2( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; + + virtual void resizeWork3( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; + + virtual void resizeRWork( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; + + virtual void resizeIWork( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; +}; + +/** + * + */ +template< typename T, template< typename > class BUFFER_TYPE > +struct ArrayWorkspace : public Workspace< T > +{ + ArrayWorkspace() + { + m_work.setName( "ArrayWorkspace::m_work" ); + m_work2.setName( "ArrayWorkspace::m_work2" ); + m_work3.setName( "ArrayWorkspace::m_work3" ); + m_rwork.setName( "ArrayWorkspace::m_rwork" ); + m_iwork.setName( "ArrayWorkspace::m_iwork" ); + } + + virtual Vector< T > work() override + { return m_work.toSlice(); } + + virtual Vector< T > work2() override + { return m_work2.toSlice(); } + + virtual Vector< T > work3() override + { return m_work3.toSlice(); } + + virtual Vector< RealVersion< T > > rwork() override + { return m_rwork.toSlice(); } + + virtual Vector< int > iwork() override + { return m_iwork.toSlice(); } + + virtual void resizeWork( MemorySpace const space, std::ptrdiff_t const newSize ) override + { + m_work.resizeWithoutInitializationOrDestruction( space, newSize ); + } + + virtual void resizeWork2( MemorySpace const space, std::ptrdiff_t const newSize ) override + { + m_work2.resizeWithoutInitializationOrDestruction( space, newSize ); + } + + virtual void resizeWork3( MemorySpace const space, std::ptrdiff_t const newSize ) override + { + m_work3.resizeWithoutInitializationOrDestruction( space, newSize ); + } + + virtual void resizeRWork( MemorySpace const space, std::ptrdiff_t const newSize ) override + { + m_rwork.resizeWithoutInitializationOrDestruction( space, newSize ); + } + + virtual void resizeIWork( MemorySpace const space, std::ptrdiff_t const newSize ) override + { + m_iwork.resizeWithoutInitializationOrDestruction( space, newSize ); + } + +private: + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_work; + + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > 
m_work2;
+
+  Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_work3;
+
+  Array< RealVersion< T >, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_rwork;
+
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_iwork;
+};
+
+/**
+ * A Workspace that provides no storage: it is passed to a backend query call to record the optimal workspace sizes.
+ */
+template< typename T >
+struct OptimalSizeCalculation : public Workspace< T >
+{
+  OptimalSizeCalculation()
+  {}
+
+  virtual Vector< T > work() override
+  { return m_work; }
+
+  virtual Vector< T > work2() override
+  { return m_work2; }
+
+  virtual Vector< T > work3() override
+  { return m_work3; }
+
+  virtual Vector< RealVersion< T > > rwork() override
+  { return m_rwork; }
+
+  virtual Vector< int > iwork() override
+  { return m_iwork; }
+
+  virtual void resizeWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override
+  { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); }
+
+  virtual void resizeWork2( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override
+  { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); }
+
+  virtual void resizeWork3( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override
+  { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); }
+
+  virtual void resizeRWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override
+  { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); }
+
+  virtual void resizeIWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override
+  { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); }
+
+  std::ptrdiff_t optimalWorkSize() const
+  { return static_cast< std::ptrdiff_t >( m_work.real() ); }
+
+  std::ptrdiff_t optimalRWorkSize() const
+  { return static_cast< std::ptrdiff_t >( m_rwork ); }
+
+  std::ptrdiff_t optimalIWorkSize() const
+  { return m_iwork; }
+
+private:
+  T m_work { -1 };
+
+  T m_work2 { -1 };
+
+  T m_work3 { -1 };
+
+  RealVersion< T > m_rwork { -1 };
+
+  int m_iwork { -1 };
+};
+
+} // namespace dense
+} // namespace LvArray
\ No newline at end of file
diff --git a/src/dense/dense.hpp b/src/dense/dense.hpp
new file mode 100644
index 00000000..2fcf202d
--- /dev/null
+++ b/src/dense/dense.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+template< typename INTERFACE, typename MATRIX_A, typename MATRIX_B, typename MATRIX_C, typename SCALAR >
+void gemm(
+  Operation opA,
+  Operation opB,
+  SCALAR const alpha,
+  MATRIX_A const & Ain,
+  MATRIX_B const & Bin,
+  SCALAR const beta,
+  MATRIX_C const & Cin )
+{
+  Matrix< SCALAR const > A = toMatrix( Ain, INTERFACE::MEMORY_SPACE, false );
+  Matrix< SCALAR const > B = toMatrix( Bin, INTERFACE::MEMORY_SPACE, false );
+  Matrix< SCALAR > const C = toMatrix( Cin, INTERFACE::MEMORY_SPACE, true );
+
+  // Check the sizes
+  LVARRAY_ERROR_IF_NE( C.sizes[ 0 ], A.sizes[ 0 + (opA != Operation::NO_OP) ] );
+  LVARRAY_ERROR_IF_NE( C.sizes[ 1 ], B.sizes[ 1 - (opB != Operation::NO_OP) ] );
+  LVARRAY_ERROR_IF_NE( A.sizes[ 1 - (opA != Operation::NO_OP) ],
+                       B.sizes[ 0 + (opB != Operation::NO_OP) ] );
+
+  // Check that everything is contiguous
+  LVARRAY_ERROR_IF( !A.isContiguous(), "Matrix A must have a unit stride in one dimension." );
+  LVARRAY_ERROR_IF( !B.isContiguous(), "Matrix B must have a unit stride in one dimension." );
+  LVARRAY_ERROR_IF( !C.isColumnMajor(), "Matrix C must be column major."
); + + // TODO(corbett5): Don't think this will work for Hermitian matrices. + if( !A.isColumnMajor() ) + { + A = A.transpose(); + opA = transposeOp( opA ); + } + if( !B.isColumnMajor() ) + { + B = B.transpose(); + opB = transposeOp( opB ); + } + + INTERFACE::gemm( opA, opB, alpha, A, B, beta, C ); +} + + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/dense/eigenDecomposition.cpp b/src/dense/eigenDecomposition.cpp new file mode 100644 index 00000000..68236057 --- /dev/null +++ b/src/dense/eigenDecomposition.cpp @@ -0,0 +1,409 @@ +#include "eigenDecomposition.hpp" +#include "backendHelpers.hpp" + +extern "C" +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CHEEVR LVARRAY_LAPACK_FORTRAN_MANGLE( cheevr ) +void LVARRAY_CHEEVR( + char const * JOBZ, + char const * RANGE, + char const * UPLO, + LvArray::dense::DenseInt const * N, + std::complex< float > * A, + LvArray::dense::DenseInt const * LDA, + float const * VL, + float const * VU, + LvArray::dense::DenseInt const * IL, + LvArray::dense::DenseInt const * IU, + float const * ABSTOL, + LvArray::dense::DenseInt * M, + float * W, + std::complex< float > * Z, + LvArray::dense::DenseInt const * LDZ, + LvArray::dense::DenseInt * ISUPPZ, + std::complex< float > * WORK, + LvArray::dense::DenseInt const * LWORK, + float * RWORK, + LvArray::dense::DenseInt const * LRWORK, + LvArray::dense::DenseInt * IWORK, + LvArray::dense::DenseInt const * LIWORK, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZHEEVR LVARRAY_LAPACK_FORTRAN_MANGLE( zheevr ) +void LVARRAY_ZHEEVR( + char const * JOBZ, + char const * RANGE, + char const * UPLO, + LvArray::dense::DenseInt const * N, + std::complex< double > * A, + LvArray::dense::DenseInt const * LDA, + double const * VL, + double const * VU, + LvArray::dense::DenseInt const * IL, + LvArray::dense::DenseInt const * IU, + double const * ABSTOL, + LvArray::dense::DenseInt * M, + double * W, + std::complex< double > * Z, + LvArray::dense::DenseInt const * LDZ, + LvArray::dense::DenseInt * ISUPPZ, + std::complex< double > * WORK, + LvArray::dense::DenseInt const * LWORK, + double * RWORK, + LvArray::dense::DenseInt const * LRWORK, + LvArray::dense::DenseInt * IWORK, + LvArray::dense::DenseInt const * LIWORK, + LvArray::dense::DenseInt * INFO ); + + +} // extern "C" + +namespace LvArray +{ +namespace dense +{ +namespace internal +{ + +/** + * + */ +template< typename T > +DenseInt heevr( + BuiltInBackends const backend, + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< T > > const & A, + Vector< T > const & eigenvalues, + Matrix< std::complex< T > > const & eigenvectors, + Vector< DenseInt > const & support, + Workspace< std::complex< T > > & workspace, + SymmetricMatrixStorageType const storageType, + bool const compute ) +{ + LVARRAY_UNUSED_VARIABLE( backend ); + + LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." 
);
+
+  char const * const JOBZ = decompositionOptions.typeArg();
+  char const * const RANGE = decompositionOptions.rangeArg();
+  char const * const UPLO = getOption( storageType );
+  DenseInt const N = A.sizes[ 1 ];
+  DenseInt const LDA = A.strides[ 1 ];
+
+  T const VL = decompositionOptions.rangeMin;
+  T const VU = decompositionOptions.rangeMax;
+
+  DenseInt maxEigenvaluesToFind = N;
+  DenseInt const IL = decompositionOptions.indexMin;
+  DenseInt const IU = decompositionOptions.indexMax;
+  if( decompositionOptions.range == EigenDecompositionOptions::BY_INDEX )
+  {
+    LVARRAY_ERROR_IF_GT( IU, N );
+    maxEigenvaluesToFind = IU - IL + 1;
+  }
+
+  LVARRAY_ERROR_IF_LT( eigenvalues.size, maxEigenvaluesToFind );
+
+  // ABSTOL must have the matrix's real value type since it is passed to the backends by address.
+  T const ABSTOL = decompositionOptions.abstol;
+  DenseInt M = 0;
+
+  if( decompositionOptions.type == EigenDecompositionOptions::EIGENVALUES_AND_VECTORS )
+  {
+    LVARRAY_ERROR_IF_NE( eigenvectors.sizes[ 0 ], N );
+    LVARRAY_ERROR_IF_LT( eigenvectors.sizes[ 1 ], maxEigenvaluesToFind );
+  }
+
+  DenseInt const LDZ = std::max< DenseInt >( 1, eigenvectors.strides[ 1 ] );
+
+  if( decompositionOptions.range == EigenDecompositionOptions::ALL ||
+      ( decompositionOptions.range == EigenDecompositionOptions::BY_INDEX &&
+        maxEigenvaluesToFind == N ) )
+  {
+    LVARRAY_ERROR_IF_LT( support.size, 2 * maxEigenvaluesToFind );
+  }
+
+  DenseInt const LWORK = compute ? workspace.work().size : -1;
+  DenseInt const LRWORK = compute ? workspace.rwork().size : -1;
+  DenseInt const LIWORK = compute ? workspace.iwork().size : -1;
+
+  DenseInt INFO = 0;
+
+  // With C++17 a constexpr if would remove the need for these reinterpret_casts.
+  if( backend == BuiltInBackends::LAPACK )
+  {
+    if( std::is_same< T, float >::value )
+    {
+      LVARRAY_CHEEVR(
+        JOBZ,
+        RANGE,
+        UPLO,
+        &N,
+        reinterpret_cast< std::complex< float > * >( A.data ),
+        &LDA,
+        reinterpret_cast< float const * >( &VL ),
+        reinterpret_cast< float const * >( &VU ),
+        &IL,
+        &IU,
+        reinterpret_cast< float const * >( &ABSTOL ),
+        &M,
+        reinterpret_cast< float * >( eigenvalues.data ),
+        reinterpret_cast< std::complex< float > * >( eigenvectors.data ),
+        &LDZ,
+        support.data,
+        reinterpret_cast< std::complex< float > * >( workspace.work().data ),
+        &LWORK,
+        reinterpret_cast< float * >( workspace.rwork().data ),
+        &LRWORK,
+        workspace.iwork().data,
+        &LIWORK,
+        &INFO );
+    }
+    else
+    {
+      LVARRAY_ZHEEVR(
+        JOBZ,
+        RANGE,
+        UPLO,
+        &N,
+        reinterpret_cast< std::complex< double > * >( A.data ),
+        &LDA,
+        reinterpret_cast< double const * >( &VL ),
+        reinterpret_cast< double const * >( &VU ),
+        &IL,
+        &IU,
+        reinterpret_cast< double const * >( &ABSTOL ),
+        &M,
+        reinterpret_cast< double * >( eigenvalues.data ),
+        reinterpret_cast< std::complex< double > * >( eigenvectors.data ),
+        &LDZ,
+        support.data,
+        reinterpret_cast< std::complex< double > * >( workspace.work().data ),
+        &LWORK,
+        reinterpret_cast< double * >( workspace.rwork().data ),
+        &LRWORK,
+        workspace.iwork().data,
+        &LIWORK,
+        &INFO );
+    }
+  }
+#if defined( LVARRAY_USE_MAGMA )
+  else if( backend == BuiltInBackends::MAGMA )
+  {
+    if( std::is_same< T, float >::value )
+    {
+      magma_cheevr(
+        magma_vec_const( *JOBZ ),
+        magma_range_const( *RANGE ),
+        magma_uplo_const( *UPLO ),
+        N,
+        reinterpret_cast< magmaFloatComplex * >( A.data ),
+        LDA,
+        VL,
+        VU,
+        IL,
+        IU,
+        ABSTOL,
+        &M,
+        reinterpret_cast< float * >( eigenvalues.data ),
+        reinterpret_cast< magmaFloatComplex * >( eigenvectors.data ),
+        LDZ,
+        support.data,
+        reinterpret_cast< magmaFloatComplex * >( workspace.work().data ),
+        LWORK,
+        reinterpret_cast< float * >(
workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + else + { + magma_zheevr( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< double * >( eigenvalues.data ), + reinterpret_cast< magmaDoubleComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaDoubleComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< double * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + } + else if( backend == BuiltInBackends::MAGMA_GPU ) + { + int LDWA = N; + int LDWZ = N; + + if( compute ) + { + workspace.resizeWork2( MemorySpace::host, LDWA * N ); + workspace.resizeWork3( MemorySpace::host, LDWZ * maxEigenvaluesToFind ); + } + + if( std::is_same< T, float >::value ) + { + magma_cheevr_gpu( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< float * >( eigenvalues.data ), + reinterpret_cast< magmaFloatComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaFloatComplex * >( workspace.work2().data ), + LDWA, + reinterpret_cast< magmaFloatComplex * >( workspace.work3().data ), + LDWZ, + reinterpret_cast< magmaFloatComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< float * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + else + { + magma_zheevr_gpu( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< double * >( eigenvalues.data ), + reinterpret_cast< magmaDoubleComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaDoubleComplex * >( workspace.work2().data ), + LDWA, + reinterpret_cast< magmaDoubleComplex * >( workspace.work3().data ), + LDWZ, + reinterpret_cast< magmaDoubleComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< double * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + } +#endif + else + { + LVARRAY_ERROR( "Unknown built in backend: " << static_cast< int >( backend ) ); + } + + LVARRAY_ERROR_IF_NE( INFO, 0 ); + + return M; +} + +} // namespace internal + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template< typename T > +DenseInt heevr( + BuiltInBackends const backend, + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< T > > const & A, + Vector< T > const & eigenvalues, + Matrix< std::complex< T > > const & eigenvectors, + Vector< DenseInt > const & support, + Workspace< std::complex< T > > & workspace, + SymmetricMatrixStorageType const storageType ) +{ + // TODO(corbett5): I think we can support row major by simply complex-conjugating all entries. + // I'm not sure exactly how this would work for the eigenvectors though. + LVARRAY_ERROR_IF( !A.isColumnMajor(), "Row major is not yet supported." ); + LVARRAY_ERROR_IF( !eigenvectors.isColumnMajor(), "Row major is not yet supported." 
); + + bool const reallocateWork = workspace.work().size < 2 * A.sizes[ 0 ]; + bool const reallocateRWork = workspace.rwork().size < 24 * A.sizes[ 0 ]; + bool const reallocateIWork = workspace.iwork().size < 10 * A.sizes[ 0 ]; + + if( reallocateWork || reallocateRWork || reallocateIWork ) + { + OptimalSizeCalculation< std::complex< T > > optimalSizes; + internal::heevr( backend, decompositionOptions, A, eigenvalues, eigenvectors, support, optimalSizes, storageType, false ); + + if( reallocateWork ) + { + workspace.resizeWork( MemorySpace::host, optimalSizes.optimalWorkSize() ); + } + + if( reallocateRWork ) + { + workspace.resizeRWork( MemorySpace::host, optimalSizes.optimalRWorkSize() ); + } + + if( reallocateIWork ) + { + workspace.resizeIWork( MemorySpace::host, optimalSizes.optimalIWorkSize() ); + } + } + + return internal::heevr( backend, decompositionOptions, A, eigenvalues, eigenvectors, support, workspace, storageType, true ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// explicit instantiations. +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template DenseInt heevr< float >( + BuiltInBackends const backend, + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< float > > const & A, + Vector< float > const & eigenvalues, + Matrix< std::complex< float > > const & eigenvectors, + Vector< DenseInt > const & support, + Workspace< std::complex< float > > & workspace, + SymmetricMatrixStorageType const storageType ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template DenseInt heevr< double >( + BuiltInBackends const backend, + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< double > > const & A, + Vector< double > const & eigenvalues, + Matrix< std::complex< double > > const & eigenvectors, + Vector< DenseInt > const & support, + Workspace< std::complex< double > > & workspace, + SymmetricMatrixStorageType const storageType ); + +} // namespace dense +} // namespace LvArray diff --git a/src/dense/eigenDecomposition.hpp b/src/dense/eigenDecomposition.hpp new file mode 100644 index 00000000..5e7f3819 --- /dev/null +++ b/src/dense/eigenDecomposition.hpp @@ -0,0 +1,220 @@ +#pragma once + +#include "common.hpp" + +namespace LvArray +{ +namespace dense +{ + +/** + * + */ +struct EigenDecompositionOptions +{ + /** + * + */ + enum Type + { + EIGENVALUES, + EIGENVALUES_AND_VECTORS, + }; + + /** + * + */ + enum Range + { + ALL, + IN_INTERVAL, + BY_INDEX, + }; + + /** + * + */ + EigenDecompositionOptions( Type const typeP, double const abstolP=0 ): + type{ typeP }, + abstol{ abstolP } + { + LVARRAY_ERROR_IF( type != EIGENVALUES && type != EIGENVALUES_AND_VECTORS, "Wrong type provided: type = " << type ); + } + + /** + * + */ + EigenDecompositionOptions( + Type const typeP, + double const rangeMinP, + double const rangeMaxP, + double const abstolP ): + type{ typeP }, + range{ IN_INTERVAL }, + rangeMin{ rangeMinP }, + rangeMax{ rangeMaxP }, + abstol{ abstolP } + { + LVARRAY_ERROR_IF( type != EIGENVALUES && type != EIGENVALUES_AND_VECTORS, "Wrong type provided: type = " << type ); + LVARRAY_ERROR_IF_GE( rangeMin, rangeMax ); + } + + /** + * TODO: Not sure how I feel about the one based indexing for eigenvalues by index. 
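+ * The bounds follow the LAPACK IL/IU convention: one-based and inclusive on
+ * both ends. For example, a usage sketch requesting the three smallest
+ * eigenvalues (the explicit DenseInt casts avoid any ambiguity with the
+ * interval-based constructor above):
+ * @code
+ * EigenDecompositionOptions const opts( EigenDecompositionOptions::EIGENVALUES,
+ *                                       DenseInt( 1 ),
+ *                                       DenseInt( 3 ),
+ *                                       0.0 );
+ * @endcode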
+ */
+  EigenDecompositionOptions(
+    Type const typeP,
+    DenseInt const indexMinP,
+    DenseInt const indexMaxP,
+    double const abstolP ):
+    type{ typeP },
+    range{ BY_INDEX },
+    indexMin{ indexMinP },
+    indexMax{ indexMaxP },
+    abstol{ abstolP }
+  {
+    LVARRAY_ERROR_IF( type != EIGENVALUES && type != EIGENVALUES_AND_VECTORS, "Wrong type provided: type = " << type );
+    LVARRAY_ERROR_IF_LT( indexMin, 1 );
+    LVARRAY_ERROR_IF_GT( indexMin, indexMax );
+  }
+
+  /**
+   * @return The LAPACK JOBZ argument: "N" for eigenvalues only, "V" for eigenvalues and eigenvectors.
+   */
+  char const * typeArg() const
+  {
+    static constexpr char const * const eigenvalueString = "N";
+    static constexpr char const * const eigenvectorString = "V";
+
+    return type == EIGENVALUES ? eigenvalueString : eigenvectorString;
+  }
+
+  /**
+   * @return The LAPACK RANGE argument: "A" for ALL, "V" for IN_INTERVAL, "I" for BY_INDEX.
+   */
+  char const * rangeArg() const
+  {
+    static constexpr char const * const allString = "A";
+    static constexpr char const * const intervalString = "V";
+    static constexpr char const * const indexString = "I";
+
+    if( range == ALL )
+    { return allString; }
+
+    return range == IN_INTERVAL ? intervalString : indexString;
+  }
+
+  /// The type of the decomposition.
+  Type const type;
+
+  /// The range of eigenvalues to compute.
+  Range const range = ALL;
+
+  /// The lower bound of the search interval, used when range == IN_INTERVAL.
+  double const rangeMin = std::numeric_limits< double >::max();
+
+  /// The upper bound of the search interval, used when range == IN_INTERVAL.
+  double const rangeMax = std::numeric_limits< double >::lowest();
+
+  /// The one-based index of the smallest eigenvalue to compute, used when range == BY_INDEX.
+  DenseInt const indexMin = std::numeric_limits< DenseInt >::max();
+
+  /// The one-based index of the largest eigenvalue to compute, used when range == BY_INDEX.
+  DenseInt const indexMax = std::numeric_limits< DenseInt >::lowest();
+
+  /// The absolute error tolerance to which each eigenvalue is computed.
+  double const abstol = 0;
+};
+
+
+/**
+ * @brief Compute selected eigenvalues, and optionally eigenvectors, of the Hermitian matrix @p A using (c, z)heevr.
+ */
+template< typename T >
+DenseInt heevr(
+  BuiltInBackends const backend,
+  EigenDecompositionOptions const decompositionOptions,
+  Matrix< std::complex< T > > const & A,
+  Vector< T > const & eigenValues,
+  Matrix< std::complex< T > > const & eigenVectors,
+  Vector< DenseInt > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType );
+
+/**
+ * @brief Overload of heevr that wraps the ArraySlice arguments in Matrix and Vector views.
+ */
+template< typename BACK_END, typename T, int USD_A, int USD_V, typename INDEX_TYPE >
+DenseInt heevr(
+  BACK_END && backend,
+  EigenDecompositionOptions const decompositionOptions,
+  ArraySlice< std::complex< T >, 2, USD_A, INDEX_TYPE > const & A,
+  ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues,
+  ArraySlice< std::complex< T >, 2, USD_V, INDEX_TYPE > const & eigenVectors,
+  ArraySlice< DenseInt, 1, 0, INDEX_TYPE > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType )
+{
+  Matrix< std::complex< T > > AMatrix( A );
+  Vector< T > eigenValuesVector( eigenValues );
+  Matrix< std::complex< T > > eigenVectorsMatrix( eigenVectors );
+  Vector< DenseInt > supportVector( support );
+
+  return heevr(
+    std::forward< BACK_END >( backend ),
+    decompositionOptions,
+    AMatrix,
+    eigenValuesVector,
+    eigenVectorsMatrix,
+    supportVector,
+    workspace,
+    storageType );
+}
+
+/**
+ * @brief Overload of heevr that moves the ArrayView arguments to the backend's memory space before dispatching.
+ */
+template< typename BACK_END, typename T, int USD_A, int USD_V, typename INDEX_TYPE, template< typename > class BUFFER_TYPE >
+DenseInt heevr(
+  BACK_END && backend,
+  EigenDecompositionOptions const decompositionOptions,
+  ArrayView< std::complex< T >, 2, USD_A, INDEX_TYPE, BUFFER_TYPE > const & A,
+  ArrayView< T, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & eigenValues,
+  ArrayView< std::complex< T >, 2, USD_V, INDEX_TYPE, BUFFER_TYPE > const & eigenVectors,
+  ArrayView< DenseInt, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType )
+{
+  MemorySpace const space = getSpaceForBackend( backend );
+
+  // The A matrix isn't touched
because it is destroyed. + A.move( space, false ); + eigenVectors.move( space, true ); + +#if defined( LVARRAY_USE_MAGMA ) + // MAGMA wants the eigenvalues and support on the CPU. + if( backend == BuiltInBackends::MAGMA_GPU ) + { + eigenValues.move( MemorySpace::host, true ); + support.move( MemorySpace::host, true ); + } + else +#endif + { + eigenValues.move( space, true ); + support.move( space, true ); + } + + return heevr( + std::forward< BACK_END >( backend ), + decompositionOptions, + A.toSlice(), + eigenValues.toSlice(), + eigenVectors.toSlice(), + support.toSlice(), + workspace, + storageType ); +} + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/dense/linearSolve.cpp b/src/dense/linearSolve.cpp new file mode 100644 index 00000000..33d5f503 --- /dev/null +++ b/src/dense/linearSolve.cpp @@ -0,0 +1,278 @@ +#include "linearSolve.hpp" +#include "backendHelpers.hpp" + +extern "C" +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_SGESV LVARRAY_LAPACK_FORTRAN_MANGLE( sgesv ) +void LVARRAY_SGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + float * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + float * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_DGESV LVARRAY_LAPACK_FORTRAN_MANGLE( dgesv ) +void LVARRAY_DGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + double * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + double * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CGESV LVARRAY_LAPACK_FORTRAN_MANGLE( cgesv ) +void LVARRAY_CGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + std::complex< float > * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + std::complex< float > * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZGESV LVARRAY_LAPACK_FORTRAN_MANGLE( zgesv ) +void LVARRAY_ZGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + std::complex< double > * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + std::complex< double > * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +} // extern "C" + +namespace LvArray +{ +namespace dense +{ + +template< typename T > +void gesv( + BuiltInBackends const backend, + Matrix< T > const & A, + Matrix< T > const & B, + Vector< DenseInt > const & pivots ) +{ + LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." ); + LVARRAY_ERROR_IF( !A.isColumnMajor(), "The matrix A must be column major." ); + + LVARRAY_ERROR_IF_NE( A.sizes[ 0 ], B.sizes[ 0 ] ); + LVARRAY_ERROR_IF( !B.isColumnMajor(), "The matrix B must be column major." 
); + + LVARRAY_ERROR_IF_NE( pivots.size, A.sizes[ 0 ] ); + + DenseInt const N = A.sizes[ 1 ]; + DenseInt const NRHS = B.sizes[ 1 ]; + DenseInt const LDA = A.strides[ 1 ]; + DenseInt const LDB = B.strides[ 1 ]; + DenseInt INFO = 0; + + if( backend == BuiltInBackends::LAPACK ) + { + if( std::is_same< T, float >::value ) + { + LVARRAY_SGESV( + &N, + &NRHS, + reinterpret_cast< float * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< float * >( B.data ), + &LDB, + &INFO ); + } + if( std::is_same< T, double >::value ) + { + LVARRAY_DGESV( + &N, + &NRHS, + reinterpret_cast< double * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< double * >( B.data ), + &LDB, + &INFO ); + } + if( IsComplexT< T, float > ) + { + LVARRAY_CGESV( + &N, + &NRHS, + reinterpret_cast< std::complex< float > * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< std::complex< float > * >( B.data ), + &LDB, + &INFO ); + } + if( IsComplexT< T, double > ) + { + LVARRAY_ZGESV( + &N, + &NRHS, + reinterpret_cast< std::complex< double > * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< std::complex< double > * >( B.data ), + &LDB, + &INFO ); + } + } +#if defined( LVARRAY_USE_MAGMA ) + else if( backend == BuiltInBackends::MAGMA ) + { + if( std::is_same< T, float >::value ) + { + magma_sgesv( + N, + NRHS, + reinterpret_cast< float * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< float * >( B.data ), + LDB, + &INFO ); + } + if( std::is_same< T, double >::value ) + { + magma_dgesv( + N, + NRHS, + reinterpret_cast< double * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< double * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, float > ) + { + magma_cgesv( + N, + NRHS, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaFloatComplex * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, double > ) + { + magma_zgesv( + N, + NRHS, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaDoubleComplex * >( B.data ), + LDB, + &INFO ); + } + } + else if( backend == BuiltInBackends::MAGMA_GPU ) + { + if( std::is_same< T, float >::value ) + { + magma_sgesv_gpu( + N, + NRHS, + reinterpret_cast< float * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< float * >( B.data ), + LDB, + &INFO ); + } + if( std::is_same< T, double >::value ) + { + magma_dgesv_gpu( + N, + NRHS, + reinterpret_cast< double * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< double * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, float > ) + { + magma_cgesv_gpu( + N, + NRHS, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaFloatComplex * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, double > ) + { + magma_zgesv_gpu( + N, + NRHS, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaDoubleComplex * >( B.data ), + LDB, + &INFO ); + } + } +#endif + else + { + LVARRAY_ERROR( "Unknown built in backend: " << static_cast< int >( backend ) ); + } + + LVARRAY_ERROR_IF( INFO < 0, "The " << -INFO << "-th argument had an illegal value." ); + LVARRAY_ERROR_IF( INFO > 0, "The factorization has been completed but U( " << INFO - 1 << ", " << INFO - 1 << + " ) is exactly zero so the solution could not be computed." 
);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template void gesv< float >(
+  BuiltInBackends const backend,
+  Matrix< float > const & A,
+  Matrix< float > const & B,
+  Vector< DenseInt > const & pivots );
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template void gesv< double >(
+  BuiltInBackends const backend,
+  Matrix< double > const & A,
+  Matrix< double > const & B,
+  Vector< DenseInt > const & pivots );
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template void gesv< std::complex< float > >(
+  BuiltInBackends const backend,
+  Matrix< std::complex< float > > const & A,
+  Matrix< std::complex< float > > const & B,
+  Vector< DenseInt > const & pivots );
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template void gesv< std::complex< double > >(
+  BuiltInBackends const backend,
+  Matrix< std::complex< double > > const & A,
+  Matrix< std::complex< double > > const & B,
+  Vector< DenseInt > const & pivots );
+
+
+} // namespace dense
+} // namespace LvArray
diff --git a/src/dense/linearSolve.hpp b/src/dense/linearSolve.hpp
new file mode 100644
index 00000000..3efe7719
--- /dev/null
+++ b/src/dense/linearSolve.hpp
@@ -0,0 +1,85 @@
+#pragma once
+
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+/**
+ * @brief Solves the matrix equation A X = B for X using (s, d, c, z)gesv.
+ *
+ * @tparam T The type of values in the matrices. Must be one of float, double, std::complex< float >, or std::complex< double >.
+ * @param backend The built in backend that implements (s, d, c, z)gesv.
+ * @param A The input matrix, which is overwritten with L and U from the LU decomposition.
+ * @param B The input right hand side, which is overwritten with the solution X.
+ * @param pivots The pivot indices from the row interchanges used when factoring A.
+ *
+ * @note When using @c MAGMA_GPU as the backend both @p A and @p B should be on the GPU while @p pivots
+ *   remains on the host.
+ */
+template< typename T >
+void gesv(
+  BuiltInBackends const backend,
+  Matrix< T > const & A,
+  Matrix< T > const & B,
+  Vector< DenseInt > const & pivots );
+
+/**
+ * @brief Overload of gesv that wraps the ArraySlice arguments in Matrix and Vector views.
+ */
+template< typename BACK_END, typename T, int USD_A, int NDIM_B, int USD_B, typename INDEX_TYPE >
+void gesv(
+  BACK_END && backend,
+  ArraySlice< T, 2, USD_A, INDEX_TYPE > const & A,
+  ArraySlice< T, NDIM_B, USD_B, INDEX_TYPE > const & B,
+  ArraySlice< DenseInt, 1, 0, INDEX_TYPE > const & pivots )
+{
+  Matrix< T > AMatrix( A );
+  Matrix< T > BMatrix( B );
+  Vector< DenseInt > pivotsVector( pivots );
+
+  gesv(
+    std::forward< BACK_END >( backend ),
+    AMatrix,
+    BMatrix,
+    pivotsVector );
+}
+
+/**
+ * @brief Overload of gesv that moves the ArrayView arguments to the backend's memory space before dispatching.
+ */
+template< typename BACK_END, typename T, int USD_A, int NDIM_B, int USD_B, typename INDEX_TYPE, template< typename > class BUFFER_TYPE >
+void gesv(
+  BACK_END && backend,
+  ArrayView< T, 2, USD_A, INDEX_TYPE, BUFFER_TYPE > const & A,
+  ArrayView< T, NDIM_B, USD_B, INDEX_TYPE, BUFFER_TYPE > const & B,
+  ArrayView< DenseInt, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & pivots )
+{
+  // TODO(corbett5): Unclear about the touch here since A is destroyed but the LU decomposition may still be useful.
+  MemorySpace const space = getSpaceForBackend( backend );
+  A.move( space, true );
+  B.move( space, true );
+
+#if defined( LVARRAY_USE_MAGMA )
+  // MAGMA wants the pivots on the CPU.
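+  // The GPU-interface routines magma_(s, d, c, z)gesv_gpu take device pointers
+  // for A and B but a host pointer for the pivot indices, so only the pivots
+  // stay in host memory for the MAGMA_GPU backend.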
+ if( backend == BuiltInBackends::MAGMA_GPU ) + { + pivots.move( MemorySpace::host, true ); + } + else +#endif + { + pivots.move( space, true ); + } + + return gesv( + std::forward< BACK_END >( backend ), + A.toSlice(), + B.toSlice(), + pivots.toSlice() ); +} + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/math.hpp b/src/math.hpp index f832e0fa..d45f68f3 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -45,7 +45,7 @@ namespace internal * @return @p u converted to @tparam T. */ template< typename T, typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T convert( T const, U const u ) { return u; } @@ -55,7 +55,7 @@ T convert( T const, U const u ) * @return The number of values stored in @tparam T, by default this is 1. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr int numValues( T const ) { return 1; } @@ -76,7 +76,7 @@ struct SingleType * @param x The value to return. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr SingleType< T > getFirst( T const x ) { return x; } @@ -86,7 +86,7 @@ SingleType< T > getFirst( T const x ) * @param x The value to return. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr SingleType< T > getSecond( T const x ) { return x; } @@ -110,7 +110,7 @@ T lessThan( T const x, T const y ) * @return @p u converted to @c __half. */ template< typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr __half convert( __half const, U const u ) { return __float2half_rn( u ); } @@ -122,7 +122,7 @@ __half convert( __half const, U const u ) * @return A @c __half2 with both halves having value @p u converted to @c __half. */ template< typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr __half2 convert( __half2 const, U const u ) { return __float2half2_rn( u ); } @@ -131,10 +131,10 @@ __half2 convert( __half2 const, U const u ) * @param u The value to convert. * @return A @c __half2 with both halves having value @p u. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE __half2 convert( __half2 const, __half const u ) { -#if defined( __CUDA_ARCH__ ) +#if defined( LVARRAY_DEVICE_COMPILE ) return __half2half2( u ); #else return __float2half2_rn( u ); @@ -151,7 +151,7 @@ __half2 convert( __half2 const, __half const u ) * @return A @c __half2 containing @p u as the first value and @p v as the second. */ template< typename U, typename V > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr __half2 convert( __half2 const, U const u, V const v ) { return __floats2half2_rn( u, v ); } @@ -161,10 +161,10 @@ __half2 convert( __half2 const, U const u, V const v ) * @param v The second value to convert. * @return A @c __half2 containing @p u as the first value and @p v as the second. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE __half2 convert( __half2 const, __half const u, __half const v ) { -#if defined( __CUDA_ARCH__ ) +#if defined( LVARRAY_DEVICE_COMPILE ) return __halves2half2( u, v ); #else return __floats2half2_rn( u, v ); @@ -175,7 +175,7 @@ __half2 convert( __half2 const, __half const u, __half const v ) * @brief Return the number of values stored in a @c __half2, which is 2. 
* @return The number of values stored in a @c __half2, which is 2. */ -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr int numValues( __half2 const & ) { return 2; } @@ -193,7 +193,7 @@ struct SingleType< __half2 > * @return The fist @c __half in @p x. * @param x The value to query. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half getFirst( __half2 const x ) { return __low2half( x ); } @@ -201,16 +201,19 @@ __half getFirst( __half2 const x ) * @return The second @c __half in @p x. * @param x The value to query. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half getSecond( __half2 const x ) { return __high2half( x ); } +#endif + +#if defined( LVARRAY_USE_DEVICE ) /** * @return 1 if @p x is less than @p y, else 0. * @param x The first value. * @param y The second value. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half lessThan( __half const x, __half const y ) { return __hlt( x, y ); } @@ -219,10 +222,9 @@ __half lessThan( __half const x, __half const y ) * @param x The first value. * @param y The second value. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 lessThan( __half2 const x, __half2 const y ) { return __hlt2( x, y ); } - #endif } // namespace internal @@ -238,7 +240,7 @@ __half2 lessThan( __half2 const x, __half2 const y ) * @return The number of values stored in type @tparam T. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr int numValues() { return internal::numValues( T() ); } @@ -258,7 +260,7 @@ using SingleType = typename internal::SingleType< T >::type; * @return @p u converted to @tparam T. */ template< typename T, typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T convert( U const u ) { return internal::convert( T(), u ); } @@ -273,7 +275,7 @@ T convert( U const u ) * @return @p u, @p v converted to @tparam T. */ template< typename T, typename U, typename V > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T convert( U const u, V const v ) { return internal::convert( T(), u, v ); } @@ -284,7 +286,7 @@ T convert( U const u, V const v ) * @note If @code numValues< T >() == 1 @endcode then @p x is returned. */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE SingleType< T > getFirst( T const x ) { return internal::getFirst( x ); } @@ -295,7 +297,7 @@ SingleType< T > getFirst( T const x ) * @note If @code numValues< T >() == 1 @endcode then @p x is returned. */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE SingleType< T > getSecond( T const x ) { return internal::getSecond( x ); } @@ -306,35 +308,37 @@ SingleType< T > getSecond( T const x ) * @param b The second number. 
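 * @note On device this forwards to @c ::max, on the host to @c std::max.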
*/ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr std::enable_if_t< std::is_arithmetic< T >::value, T > max( T const a, T const b ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::max( a, b ); #else return std::max( a, b ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc max( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half max( __half const a, __half const b ) { -#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmax( a, b ); +#elif defined(LVARRAY_USE_HIP) + return __hgt( a, b ) ? a : b; #else return a > b ? a : b; #endif } /// @copydoc max( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 max( __half2 const a, __half2 const b ) { -#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmax2( a, b ); #else __half2 const aFactor = __hge2( a, b ); @@ -353,11 +357,11 @@ __half2 max( __half2 const a, __half2 const b ) * @param b The second number. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr std::enable_if_t< std::is_arithmetic< T >::value, T > min( T const a, T const b ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::min( a, b ); #else return std::min( a, b ); @@ -367,7 +371,8 @@ min( T const a, T const b ) #if defined( LVARRAY_USE_CUDA ) /// @copydoc min( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE +LVARRAY_FORCE_INLINE __half min( __half const a, __half const b ) { #if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) @@ -378,7 +383,8 @@ __half min( __half const a, __half const b ) } /// @copydoc min( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE +LVARRAY_FORCE_INLINE __half2 min( __half2 const a, __half2 const b ) { #if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) @@ -398,20 +404,20 @@ __half2 min( __half2 const a, __half2 const b ) * @note This set of overloads is valid for any numeric type. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T abs( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::abs( x ); #else return std::abs( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc abs( T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half abs( __half const x ) { #if CUDART_VERSION > 11000 @@ -422,7 +428,7 @@ __half abs( __half const x ) } /// @copydoc abs( T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 abs( __half2 const x ) { #if CUDART_VERSION > 11000 @@ -440,7 +446,7 @@ __half2 abs( __half2 const x ) * @param x The value to square. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T square( T const x ) { return x * x; } @@ -457,10 +463,10 @@ T square( T const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is @c double. 
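 * For example, @c sqrt( 2 ) calls the templated overload and returns
 * @c std::sqrt( 2 ) as a @c double on the host.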
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float sqrt( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sqrtf( x ); #else return std::sqrt( x ); @@ -469,25 +475,25 @@ float sqrt( float const x ) /// @copydoc sqrt( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double sqrt( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sqrt( double( x ) ); #else return std::sqrt( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc sqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half sqrt( __half const x ) { return ::hsqrt( x ); } /// @copydoc sqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 sqrt( __half2 const x ) { return ::h2sqrt( x ); } @@ -499,10 +505,10 @@ __half2 sqrt( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float invSqrt( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::rsqrtf( x ); #else return 1 / std::sqrt( x ); @@ -511,25 +517,25 @@ float invSqrt( float const x ) /// @copydoc invSqrt( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double invSqrt( T const x ) { -#if defined( __CUDA_ARCH__ ) +#if defined( LVARRAY_DEVICE_COMPILE ) return ::rsqrt( double( x ) ); #else return 1 / std::sqrt( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc invSqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half invSqrt( __half const x ) { return ::hrsqrt( x ); } /// @copydoc invSqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 invSqrt( __half2 const x ) { return ::h2rsqrt( x ); } @@ -548,10 +554,10 @@ __half2 invSqrt( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float sin( float const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sinf( theta ); #else return std::sin( theta ); @@ -560,25 +566,25 @@ float sin( float const theta ) /// @copydoc sin( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double sin( T const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sin( double( theta ) ); #else return std::sin( theta ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc sin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half sin( __half const theta ) { return ::hsin( theta ); } /// @copydoc sin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 sin( __half2 const theta ) { return ::h2sin( theta ); } @@ -590,10 +596,10 @@ __half2 sin( __half2 const theta ) * @note This set of overloads is valid for any numeric type. If @p theta is not a float * it is converted to a double and the return type is double. 
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float cos( float const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::cosf( theta ); #else return std::cos( theta ); @@ -602,25 +608,25 @@ float cos( float const theta ) /// @copydoc cos( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double cos( T const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::cos( double( theta ) ); #else return std::cos( theta ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc cos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half cos( __half const theta ) { return ::hcos( theta ); } /// @copydoc cos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 cos( __half2 const theta ) { return ::h2cos( theta ); } @@ -632,11 +638,15 @@ __half2 cos( __half2 const theta ) * @param sinTheta The sine of @p theta. * @param cosTheta The cosine of @p theta. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE void sincos( float const theta, float & sinTheta, float & cosTheta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) + #if defined(LVARRAY_USE_CUDA) ::sincos( theta, &sinTheta, &cosTheta ); + #elif defined(LVARRAY_USE_HIP) + ::sincosf( theta, &sinTheta, &cosTheta ); + #endif #else sinTheta = std::sin( theta ); cosTheta = std::cos( theta ); @@ -645,11 +655,11 @@ void sincos( float const theta, float & sinTheta, float & cosTheta ) /// @copydoc sincos( float, float &, float & ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE void sincos( double const theta, double & sinTheta, double & cosTheta ) { -#if defined(__CUDA_ARCH__) - ::sincos( theta, &sinTheta, &cosTheta ); +#if defined(LVARRAY_DEVICE_COMPILE) + ::sincos( theta, &sinTheta, &cosTheta ); // hip and cuda versions both use double #else sinTheta = std::sin( theta ); cosTheta = std::cos( theta ); @@ -658,10 +668,10 @@ void sincos( double const theta, double & sinTheta, double & cosTheta ) /// @copydoc sincos( float, float &, float & ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE void sincos( T const theta, double & sinTheta, double & cosTheta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) double s, c; ::sincos( theta, &s, &c ); sinTheta = s; @@ -672,10 +682,10 @@ void sincos( T const theta, double & sinTheta, double & cosTheta ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc sincos( float, float &, float & ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE void sincos( __half const theta, __half & sinTheta, __half & cosTheta ) { sinTheta = ::hsin( theta ); @@ -683,7 +693,7 @@ void sincos( __half const theta, __half & sinTheta, __half & cosTheta ) } /// @copydoc sincos( float, float &, float & ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE void sincos( __half2 const theta, __half2 & sinTheta, __half2 & cosTheta ) { sinTheta = ::h2sin( theta ); @@ -698,10 +708,10 @@ void sincos( __half2 const theta, __half2 & sinTheta, __half2 & cosTheta ) * @note This set of overloads is valid for any numeric type. If @p theta is not a float * it is converted to a double and the return type is double. 
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float tan( float const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::tanf( theta ); #else return std::tan( theta ); @@ -710,20 +720,20 @@ float tan( float const theta ) /// @copydoc tan( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double tan( T const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::tan( double( theta ) ); #else return std::tan( theta ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc tan( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half tan( __half const theta ) { __half s, c; @@ -732,7 +742,7 @@ __half tan( __half const theta ) } /// @copydoc tan( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 tan( __half2 const theta ) { __half2 s, c; @@ -764,7 +774,7 @@ namespace internal * @note Modified from https://developer.download.nvidia.com/cg/asin.html */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE T asinImpl( T const x ) { T const negate = lessThan( x, math::convert< T >( 0 ) ); @@ -786,7 +796,7 @@ T asinImpl( T const x ) * @note Modified from https://developer.download.nvidia.com/cg/acos.html */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE T acosImpl( T const x ) { T const negate = lessThan( x, math::convert< T >( 0 ) ); @@ -808,7 +818,8 @@ T acosImpl( T const x ) * @note Modified from https://developer.download.nvidia.com/cg/atan2.html */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE +LVARRAY_FORCE_INLINE T atan2Impl( T const y, T const x ) { T const absX = abs( x ); @@ -842,10 +853,10 @@ T atan2Impl( T const y, T const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float asin( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::asinf( x ); #else return std::asin( x ); @@ -854,25 +865,25 @@ float asin( float const x ) /// @copydoc asin( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double asin( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::asin( double( x ) ); #else return std::asin( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc asin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half asin( __half const x ) { return internal::asinImpl( x ); } /// @copydoc asin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 asin( __half2 const x ) { return internal::asinImpl( x ); } @@ -884,10 +895,10 @@ __half2 asin( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. 
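 * On device the @c __half and @c __half2 overloads use the polynomial
 * approximation in @c internal::acosImpl.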
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float acos( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::acosf( x ); #else return std::acos( x ); @@ -896,25 +907,25 @@ float acos( float const x ) /// @copydoc acos( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double acos( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::acos( double( x ) ); #else return std::acos( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc acos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half acos( __half const x ) { return internal::acosImpl( x ); } /// @copydoc acos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 acos( __half2 const x ) { return internal::acosImpl( x ); } @@ -927,10 +938,10 @@ __half2 acos( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float atan2( float const y, float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::atan2f( y, x ); #else return std::atan2( y, x ); @@ -939,10 +950,10 @@ float atan2( float const y, float const x ) /// @copydoc atan2( float, float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double atan2( T const y, T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::atan2( double( y ), double( x ) ); #else return std::atan2( y, x ); @@ -952,12 +963,12 @@ double atan2( T const y, T const x ) #if defined( LVARRAY_USE_CUDA ) /// @copydoc atan2( float, float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half atan2( __half const y, __half const x ) { return internal::atan2Impl( y, x ); } /// @copydoc atan2( float, float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 atan2( __half2 const y, __half2 const x ) { return internal::atan2Impl( y, x ); } @@ -976,10 +987,10 @@ __half2 atan2( __half2 const y, __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float exp( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::expf( x ); #else return std::exp( x ); @@ -988,25 +999,25 @@ float exp( float const x ) /// @copydoc exp( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double exp( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::exp( double( x ) ); #else return std::exp( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc exp( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half exp( __half const x ) { return ::hexp( x ); } /// @copydoc exp( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 exp( __half2 const x ) { return ::h2exp( x ); } @@ -1018,10 +1029,10 @@ __half2 exp( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. 
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float log( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::logf( x ); #else return std::log( x ); @@ -1030,25 +1041,25 @@ float log( float const x ) /// @copydoc log( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double log( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::log( double( x ) ); #else return std::log( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc log( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half log( __half const x ) { return ::hlog( x ); } /// @copydoc log( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 log( __half2 const x ) { return ::h2log( x ); } diff --git a/src/sortedArrayManipulation.hpp b/src/sortedArrayManipulation.hpp index 7e9cae5d..d4bdbeed 100644 --- a/src/sortedArrayManipulation.hpp +++ b/src/sortedArrayManipulation.hpp @@ -211,7 +211,7 @@ LVARRAY_HOST_DEVICE inline void makeSorted( RandomAccessIterator const first, RandomAccessIterator const last, Compare && comp=Compare() ) { -#ifdef __CUDA_ARCH__ +#if defined(LVARRAY_DEVICE_COMPILE) if( last - first > internal::INTROSORT_THRESHOLD ) { internal::introsortLoop( first, last, comp ); diff --git a/src/system.cpp b/src/system.cpp index 25a2ec13..a6532ac5 100644 --- a/src/system.cpp +++ b/src/system.cpp @@ -417,11 +417,16 @@ std::string calculateSize( size_t const bytes ) suffix = "MB"; shift = 20; } - else + else if( bytes >> 10 != 0 ) { suffix = "KB"; shift = 10; } + else + { + suffix = "B"; + shift = 0; + } double const units = double( bytes ) / ( 1 << shift ); diff --git a/unitTests/CMakeLists.txt b/unitTests/CMakeLists.txt index 4d91681e..3ac33255 100644 --- a/unitTests/CMakeLists.txt +++ b/unitTests/CMakeLists.txt @@ -149,3 +149,8 @@ install(TARGETS testTensorOps if( ENABLE_PYLVARRAY ) add_subdirectory( python ) endif() + +if( ENABLE_LAPACK ) + add_subdirectory( dense ) +endif() + diff --git a/unitTests/dense/CMakeLists.txt b/unitTests/dense/CMakeLists.txt new file mode 100644 index 00000000..f87b2fda --- /dev/null +++ b/unitTests/dense/CMakeLists.txt @@ -0,0 +1,34 @@ +################################################################################################### +# Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors. +# All rights reserved. +# See the LICENSE file for details. 
+# SPDX-License-Identifier: (BSD-3-Clause)
+###################################################################################################
+
+#
+# Specify list of tests
+#
+set( testSources
+     testEigenDecomposition.cpp
+     testLinearSolve.cpp
+     testgemm.cpp
+)
+
+#
+# Add gtest C++ based tests
+#
+foreach(test ${testSources})
+    get_filename_component( test_name ${test} NAME_WE )
+    blt_add_executable( NAME ${test_name}
+                        SOURCES ${test}
+                        OUTPUT_DIR ${TEST_OUTPUT_DIRECTORY}
+                        DEPENDS_ON gtest lvarray lvarraydense ${lvarray_dependencies} )
+
+    target_include_directories( ${test_name} PUBLIC ${CMAKE_CURRENT_LIST_DIR}/../../src )
+
+    blt_add_test( NAME ${test_name}
+                  COMMAND ${test_name} )
+
+    install(TARGETS ${test_name}
+            DESTINATION bin)
+endforeach()
+
+
diff --git a/unitTests/dense/testEigenDecomposition.cpp b/unitTests/dense/testEigenDecomposition.cpp
new file mode 100644
index 00000000..bc29aa05
--- /dev/null
+++ b/unitTests/dense/testEigenDecomposition.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors.
+ * All rights reserved.
+ * See the LICENSE file for details.
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ */
+
+// Source includes
+#include "dense/eigenDecomposition.hpp"
+
+#include "../testUtils.hpp"
+
+#if defined( LVARRAY_USE_MAGMA )
+  #include <magma.h>
+#endif
+
+namespace LvArray
+{
+namespace testing
+{
+
+using namespace dense;
+
+template< typename T >
+using Array1d = Array< T, 1, RAJA::PERM_I, DenseInt, DEFAULT_BUFFER >;
+
+template< typename T, typename PERM >
+using Array2d = Array< T, 2, PERM, DenseInt, DEFAULT_BUFFER >;
+
+// TODO(corbett5): significantly improve this test.
+
+template< typename T >
+struct HEEVR_TEST
+{
+  HEEVR_TEST( BuiltInBackends const backend ):
+    m_backend( backend )
+  {
+    m_matrix.setName( "matrix" );
+    m_eigenvalues.setName( "m_eigenvalues" );
+    m_eigenvectors.setName( "eigenvectors" );
+    m_support.setName( "support" );
+  }
+
+  void threeByThreeEigenvalues()
+  {
+    resize( 3, 3, 0 );
+
+    m_matrix( 1, 1 ) = 2;
+    m_matrix( 0, 0 ) = 3;
+    m_matrix( 2, 2 ) = -4;
+
+    SymmetricMatrixStorageType storageType = SymmetricMatrixStorageType::UPPER_TRIANGULAR;
+
+    heevr(
+      m_backend,
+      EigenDecompositionOptions( EigenDecompositionOptions::EIGENVALUES ),
+      m_matrix.toView(),
+      m_eigenvalues.toView(),
+      m_eigenvectors.toView(),
+      m_support,
+      m_workspace,
+      storageType );
+
+    // The eigenvalues are returned in ascending order.
+    EXPECT_DOUBLE_EQ( m_eigenvalues[ 0 ], -4 );
+    EXPECT_DOUBLE_EQ( m_eigenvalues[ 1 ], 2 );
+    EXPECT_DOUBLE_EQ( m_eigenvalues[ 2 ], 3 );
+  }
+
+private:
+  void resize( DenseInt const n, DenseInt const nvals, DenseInt const nvec )
+  {
+    m_matrix.resize( n, n );
+    m_eigenvalues.resize( nvals );
+    m_eigenvectors.resize( n, nvec );
+    m_support.resize( 2 * n );
+  }
+
+  BuiltInBackends const m_backend;
+  Array2d< std::complex< T >, RAJA::PERM_JI > m_matrix;
+  Array1d< T > m_eigenvalues;
+  Array2d< std::complex< T >, RAJA::PERM_JI > m_eigenvectors;
+  Array1d< int > m_support;
+  ArrayWorkspace< std::complex< T >, ChaiBuffer > m_workspace;
+};
+
+TEST( eigenvalues_float, lapack )
+{
+  HEEVR_TEST< float > test( BuiltInBackends::LAPACK );
+
+  test.threeByThreeEigenvalues();
+}
+
+TEST( eigenvalues_double, lapack )
+{
+  HEEVR_TEST< double > test( BuiltInBackends::LAPACK );
+
+  test.threeByThreeEigenvalues();
+}
+
+#if defined( LVARRAY_USE_MAGMA )
+
+TEST( eigenvalues_float, magma )
+{
+  HEEVR_TEST< float > test( BuiltInBackends::MAGMA );
+
+  test.threeByThreeEigenvalues();
+}
+
+TEST( eigenvalues_double, magma )
+{
+  HEEVR_TEST< double > test( BuiltInBackends::MAGMA );
+
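+  // Same three-by-three checks as the LAPACK runs, dispatched through MAGMA.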
+
+TEST( eigenvalues_float, lapack )
+{
+  HEEVR_TEST< float > test( BuiltInBackends::LAPACK );
+
+  test.threeByThreeEigenvalues();
+}
+
+TEST( eigenvalues_double, lapack )
+{
+  HEEVR_TEST< double > test( BuiltInBackends::LAPACK );
+
+  test.threeByThreeEigenvalues();
+}
+
+#if defined( LVARRAY_USE_MAGMA )
+
+TEST( eigenvalues_float, magma )
+{
+  HEEVR_TEST< float > test( BuiltInBackends::MAGMA );
+
+  test.threeByThreeEigenvalues();
+}
+
+TEST( eigenvalues_double, magma )
+{
+  HEEVR_TEST< double > test( BuiltInBackends::MAGMA );
+
+  test.threeByThreeEigenvalues();
+}
+
+TEST( eigenvalues_float, magma_gpu )
+{
+  HEEVR_TEST< float > test( BuiltInBackends::MAGMA_GPU );
+
+  test.threeByThreeEigenvalues();
+}
+
+TEST( eigenvalues_double, magma_gpu )
+{
+  HEEVR_TEST< double > test( BuiltInBackends::MAGMA_GPU );
+
+  test.threeByThreeEigenvalues();
+}
+
+#endif
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv )
+{
+#if defined( LVARRAY_USE_MAGMA )
+  magma_init();
+#endif
+
+  ::testing::InitGoogleTest( &argc, argv );
+  int const result = RUN_ALL_TESTS();
+
+#if defined( LVARRAY_USE_MAGMA )
+  magma_finalize();
+#endif
+
+  return result;
+}
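Both dense test binaries bracket RUN_ALL_TESTS() with magma_init() / magma_finalize(). A small RAII guard would make the pairing impossible to forget as more dense test files are added; a sketch (hypothetical helper, built only from the MAGMA calls already used above):

    #if defined( LVARRAY_USE_MAGMA )
    // Initializes MAGMA on construction and tears it down on scope exit.
    struct MagmaScope
    {
      MagmaScope() { magma_init(); }
      ~MagmaScope() { magma_finalize(); }
    };
    #endif

Declaring a MagmaScope as the first line of main() would then replace both #if blocks.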
diff --git a/unitTests/dense/testLinearSolve.cpp b/unitTests/dense/testLinearSolve.cpp
new file mode 100644
index 00000000..7a1ab3c8
--- /dev/null
+++ b/unitTests/dense/testLinearSolve.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors.
+ * All rights reserved.
+ * See the LICENSE file for details.
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ */
+
+// Source includes
+#include "dense/linearSolve.hpp"
+
+#include "../testUtils.hpp"
+
+#include "output.hpp"
+
+#if defined( LVARRAY_USE_MAGMA )
+  #include <magma_v2.h>
+#endif
+
+#define EXPECT_COMPLEX_NEAR( z1, z2, absError ) \
+  EXPECT_NEAR( std::real( z1 ), std::real( z2 ), absError ); \
+  EXPECT_NEAR( std::imag( z1 ), std::imag( z2 ), absError )
+
+namespace LvArray
+{
+namespace testing
+{
+
+using namespace dense;
+
+template< typename T >
+using Array1d = Array< T, 1, RAJA::PERM_I, DenseInt, DEFAULT_BUFFER >;
+
+template< typename T, typename PERM >
+using Array2d = Array< T, 2, PERM, DenseInt, DEFAULT_BUFFER >;
+
+
+template< typename T >
+struct GESV_Test : public ::testing::Test
+{
+  void test( BuiltInBackends const backend, DenseInt const N, DenseInt const nrhs )
+  {
+    Array2d< T, RAJA::PERM_JI > A( N, N );
+    Array2d< T, RAJA::PERM_JI > B( N, nrhs );
+    Array1d< DenseInt > pivots( N );
+
+    for( DenseInt row = 0; row < N; ++row )
+    {
+      for( DenseInt col = 0; col < N; ++col )
+      {
+        A( row, col ) = randomNumber();
+      }
+
+      for( DenseInt col = 0; col < nrhs; ++col )
+      {
+        B( row, col ) = randomNumber();
+      }
+    }
+
+    Array2d< T, RAJA::PERM_JI > ACopy( A );
+    Array2d< T, RAJA::PERM_JI > X( B );
+    gesv( backend, ACopy.toView(), X.toView(), pivots );
+
+    // TODO(corbett5): replace this with matrix matrix multiplication
+    X.move( MemorySpace::host, true );
+    for( DenseInt i = 0; i < N; ++i )
+    {
+      for( DenseInt j = 0; j < nrhs; ++j )
+      {
+        T dot = 0;
+        for( DenseInt k = 0; k < N; ++k )
+        {
+          dot += A( i, k ) * X( k, j );
+        }
+
+        EXPECT_COMPLEX_NEAR( dot, B( i, j ), 10 * N * std::numeric_limits< RealVersion< T > >::epsilon() );
+      }
+    }
+  }
+
+private:
+
+  template< typename _T=T >
+  std::enable_if_t< !IsComplex< _T >, T >
+  randomNumber()
+  { return m_dist( m_gen ); }
+
+  template< typename _T=T >
+  std::enable_if_t< IsComplex< _T >, T >
+  randomNumber()
+  { return { m_dist( m_gen ), m_dist( m_gen ) }; }
+
+  std::mt19937_64 m_gen;
+  std::uniform_real_distribution< RealVersion< T > > m_dist;
+};
+
+using GESV_Test_types = ::testing::Types<
+  float,
+  double,
+  std::complex< float >,
+  std::complex< double >
+  >;
+TYPED_TEST_SUITE( GESV_Test, GESV_Test_types, );
+
+TYPED_TEST( GESV_Test, LAPACK_2x2 )
+{
+  this->test( BuiltInBackends::LAPACK, 2, 1 );
+  this->test( BuiltInBackends::LAPACK, 2, 2 );
+}
+
+TYPED_TEST( GESV_Test, LAPACK_10x10 )
+{
+  this->test( BuiltInBackends::LAPACK, 10, 1 );
+  this->test( BuiltInBackends::LAPACK, 10, 3 );
+}
+
+TYPED_TEST( GESV_Test, LAPACK_100x100 )
+{
+  this->test( BuiltInBackends::LAPACK, 100, 1 );
+  this->test( BuiltInBackends::LAPACK, 100, 10 );
+}
+
+TYPED_TEST( GESV_Test, LAPACK_1000x1000 )
+{
+  this->test( BuiltInBackends::LAPACK, 1000, 1 );
+  this->test( BuiltInBackends::LAPACK, 1000, 10 );
+}
+
+#if defined( LVARRAY_USE_MAGMA )
+
+TYPED_TEST( GESV_Test, MAGMA_2x2 )
+{
+  this->test( BuiltInBackends::MAGMA, 2, 1 );
+  this->test( BuiltInBackends::MAGMA, 2, 2 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_10x10 )
+{
+  this->test( BuiltInBackends::MAGMA, 10, 1 );
+  this->test( BuiltInBackends::MAGMA, 10, 3 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_100x100 )
+{
+  this->test( BuiltInBackends::MAGMA, 100, 1 );
+  this->test( BuiltInBackends::MAGMA, 100, 10 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_1000x1000 )
+{
+  this->test( BuiltInBackends::MAGMA, 1000, 1 );
+  this->test( BuiltInBackends::MAGMA, 1000, 10 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_2x2 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 2, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 2, 2 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_10x10 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 10, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 10, 3 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_100x100 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 100, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 100, 10 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_1000x1000 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 1000, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 1000, 10 );
+}
+
+#endif
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv )
+{
+#if defined( LVARRAY_USE_MAGMA )
+  magma_init();
+#endif
+
+  ::testing::InitGoogleTest( &argc, argv );
+  int const result = RUN_ALL_TESTS();
+
+#if defined( LVARRAY_USE_MAGMA )
+  magma_finalize();
+#endif
+
+  return result;
+}
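A note on the GESV tolerance: each entry of A * X is an N-term dot product of unit-scale random values, so its rounding error grows roughly linearly in N; the `10 * N * epsilon` bound is that estimate with a safety factor of 10. For example, with T = double and N = 100:

    // Roughly the worst-case rounding error of a 100-term dot product, times 10.
    double const tol = 10 * 100 * std::numeric_limits< double >::epsilon();  // about 2.2e-13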
diff --git a/unitTests/dense/testgemm.cpp b/unitTests/dense/testgemm.cpp
new file mode 100644
index 00000000..51f50773
--- /dev/null
+++ b/unitTests/dense/testgemm.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors.
+ * All rights reserved.
+ * See the LICENSE file for details.
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ */
+
+// Source includes
+#include "dense/dense.hpp"
+#include "dense/BlasLapackInterface.hpp"
+
+#include "../testUtils.hpp"
+
+#include <random>
+
+#if defined( LVARRAY_USE_MAGMA )
+  #include <magma_v2.h>
+#endif
+
+#define EXPECT_COMPLEX_NEAR( z1, z2, absError ) \
+  EXPECT_NEAR( std::real( z1 ), std::real( z2 ), absError ); \
+  EXPECT_NEAR( std::imag( z1 ), std::imag( z2 ), absError )
+
+namespace LvArray
+{
+namespace testing
+{
+
+// This should probably go in a common place
+template< typename T, typename PERM >
+using Array2d = Array< T, 2, PERM, std::ptrdiff_t, DEFAULT_BUFFER >;
+
+template< typename T >
+std::enable_if_t< std::is_floating_point< T >::value, T >
+randomValue( std::mt19937 & gen )
+{ return std::uniform_real_distribution< T >{ -1, 1 }( gen ); }
+
+template< typename T >
+std::enable_if_t< dense::IsComplex< T >, T >
+randomValue( std::mt19937 & gen )
+{
+  return { std::uniform_real_distribution< dense::RealVersion< T > >{ -1, 1 }( gen ),
+           std::uniform_real_distribution< dense::RealVersion< T > >{ -1, 1 }( gen ) };
+}
+
+template< typename T, typename PERM >
+Array2d< T, PERM > randomMatrix( std::ptrdiff_t const N, std::ptrdiff_t const M )
+{
+  std::mt19937 gen( std::random_device{}() );
+
+  Array2d< T, PERM > const ret( N, M );
+
+  for( std::ptrdiff_t r = 0; r < N; ++r )
+  {
+    for( std::ptrdiff_t c = 0; c < M; ++c )
+    {
+      ret( r, c ) = T{10} * randomValue< T >( gen );
+    }
+  }
+
+  return ret;
+}
+
+template< typename T, typename PERM >
+std::enable_if_t< std::is_floating_point< T >::value >
+checkEqual( Array2d< T, PERM > const & lhs, Array2d< T, PERM > const & rhs, double rTol )
+{
+  ASSERT_EQ( lhs.size( 0 ), rhs.size( 0 ) );
+  ASSERT_EQ( lhs.size( 1 ), rhs.size( 1 ) );
+
+  for( std::ptrdiff_t i = 0; i < lhs.size(); ++i )
+  {
+    EXPECT_NEAR( lhs.data()[ i ], rhs.data()[ i ], std::abs( lhs.data()[ i ] ) * rTol );
+  }
+}
+
+template< typename T, typename PERM >
+std::enable_if_t< dense::IsComplex< T > >
+checkEqual( Array2d< T, PERM > const & lhs, Array2d< T, PERM > const & rhs, double rTol )
+{
+  ASSERT_EQ( lhs.size( 0 ), rhs.size( 0 ) );
+  ASSERT_EQ( lhs.size( 1 ), rhs.size( 1 ) );
+
+  for( std::ptrdiff_t i = 0; i < lhs.size(); ++i )
+  {
+    EXPECT_COMPLEX_NEAR( lhs.data()[ i ], rhs.data()[ i ], std::abs( lhs.data()[ i ] ) * rTol );
+  }
+}
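+
+// GemmTest below checks gemm against the textbook definition,
+//   C(i,j) <- alpha * sum_k A(i,k) * B(k,j) + beta * C(i,j),
+// recomputed with a naive triple loop, and also verifies with a relative tolerance
+// of zero that the backend left A and B bit-identical, i.e. did not modify its inputs.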
+
+template< typename INTERFACE, typename T, typename PERM_A, typename PERM_B >
+struct GemmTest
+{
+  void Rij_eq_AikBkj()
+  {
+    std::mt19937 gen( std::random_device{}() );
+
+    std::ptrdiff_t const N = std::uniform_int_distribution< std::ptrdiff_t >{ 0, 20 }( gen );
+    std::ptrdiff_t const M = std::uniform_int_distribution< std::ptrdiff_t >{ 0, 20 }( gen );
+    std::ptrdiff_t const K = std::uniform_int_distribution< std::ptrdiff_t >{ 0, 20 }( gen );
+
+    T const alpha = T{10} * randomValue< T >( gen );
+    T const beta = T{10} * randomValue< T >( gen );
+
+    Array2d< T, PERM_A > const A = randomMatrix< T, PERM_A >( N, K );
+    Array2d< T, PERM_B > const B = randomMatrix< T, PERM_B >( K, M );
+    Array2d< T, RAJA::PERM_JI > const C = randomMatrix< T, RAJA::PERM_JI >( N, M );
+
+    Array2d< T, PERM_A > const Acopy = A;
+    Array2d< T, PERM_B > const Bcopy = B;
+    Array2d< T, RAJA::PERM_JI > const Ccopy = C;
+
+    dense::gemm< INTERFACE >( dense::Operation::NO_OP, dense::Operation::NO_OP, alpha, A, B, beta, C );
+
+    A.move( MemorySpace::host, false );
+    B.move( MemorySpace::host, false );
+    C.move( MemorySpace::host, false );
+
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < M; ++j )
+      {
+        T dot = 0;
+        for( std::ptrdiff_t k = 0; k < K; ++k )
+        {
+          dot += Acopy( i, k ) * Bcopy( k, j );
+        }
+
+        Ccopy( i, j ) = alpha * dot + beta * Ccopy( i, j );
+      }
+    }
+
+    checkEqual( A, Acopy, 0 );
+    checkEqual( B, Bcopy, 0 );
+    checkEqual( C, Ccopy, 1e3 * std::numeric_limits< dense::RealVersion< T > >::epsilon() );
+  }
+};
+
+TEST( LapackInterface_float, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< float >, float, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_double, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< double >, double, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_float, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< float > >, std::complex< float >, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_double, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< double > >, std::complex< double >, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_float, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< float >, float, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_double, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< double >, double, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_float, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< float > >, std::complex< float >, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_double, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< double > >, std::complex< double >, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv )
+{
+#if defined( LVARRAY_USE_MAGMA )
+  magma_init();
+#endif
+
+  ::testing::InitGoogleTest( &argc, argv );
+  int const result = RUN_ALL_TESTS();
+
+#if defined( LVARRAY_USE_MAGMA )
+  magma_finalize();
+#endif
+
+  return result;
+}
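The eight TEST blocks above differ only in the scalar type and in the permutation of A; they could be collapsed into a typed test suite, as testLinearSolve.cpp already does for GESV_Test. A sketch (hypothetical type list, same gtest machinery):

    using GemmTypes = ::testing::Types< float, double, std::complex< float >, std::complex< double > >;
    // The permutation axis (RAJA::PERM_JI vs RAJA::PERM_IJ for A) would become a tuple
    // element, mirroring the tuple-based type lists used by the tensorOps tests below.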
diff --git a/unitTests/testArray1DOfArray1D.cpp b/unitTests/testArray1DOfArray1D.cpp
index faa53b52..7ff271cb 100644
--- a/unitTests/testArray1DOfArray1D.cpp
+++ b/unitTests/testArray1DOfArray1D.cpp
@@ -233,7 +233,7 @@ using Array1DOfArray1DTestTypes = ::testing::Types<
 , std::pair< Array1D< Tensor, ChaiBuffer >, serialPolicy >
 , std::pair< Array1D< TestString, ChaiBuffer >, serialPolicy >
 #endif
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::pair< Array1D< int, ChaiBuffer >, parallelDevicePolicy< 32 > >
 , std::pair< Array1D< Tensor, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
diff --git a/unitTests/testArray1DOfArray1DOfArray1D.cpp b/unitTests/testArray1DOfArray1DOfArray1D.cpp
index 5dc93fe8..5038d778 100644
--- a/unitTests/testArray1DOfArray1DOfArray1D.cpp
+++ b/unitTests/testArray1DOfArray1DOfArray1D.cpp
@@ -272,7 +272,7 @@ using Array1DOfArray1DOfArray1DTestTypes = ::testing::Types<
 , std::pair< Array1D< Tensor, ChaiBuffer >, serialPolicy >
 , std::pair< Array1D< TestString, ChaiBuffer >, serialPolicy >
 #endif
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::pair< Array1D< int, ChaiBuffer >, parallelDevicePolicy< 32 > >
 , std::pair< Array1D< Tensor, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
diff --git a/unitTests/testArrayOfArrays.cpp b/unitTests/testArrayOfArrays.cpp
index 784fd448..aa20086b 100644
--- a/unitTests/testArrayOfArrays.cpp
+++ b/unitTests/testArrayOfArrays.cpp
@@ -1284,7 +1284,7 @@ using ArrayOfArraysViewTestTypes = ::testing::Types<
 , std::pair< ArrayOfArrays< TestString, std::ptrdiff_t, ChaiBuffer >, serialPolicy >
 #endif
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::pair< ArrayOfArrays< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
 , std::pair< ArrayOfArrays< Tensor, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
@@ -1467,7 +1467,7 @@ using ArrayOfArraysViewAtomicTestTypes = ::testing::Types<
 , std::pair< ArrayOfArrays< TestString, std::ptrdiff_t, ChaiBuffer >, parallelHostPolicy >
 #endif
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::pair< ArrayOfArrays< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
 , std::pair< ArrayOfArrays< double, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
diff --git a/unitTests/testArrayOfSets.cpp b/unitTests/testArrayOfSets.cpp
index d3b9f540..ac71a76b 100644
--- a/unitTests/testArrayOfSets.cpp
+++ b/unitTests/testArrayOfSets.cpp
@@ -925,7 +925,7 @@ using ArrayOfSetsViewTestTypes = ::testing::Types<
 , std::pair< ArrayOfSets< TestString, std::ptrdiff_t, ChaiBuffer >, serialPolicy >
 #endif
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::pair< ArrayOfSets< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
 , std::pair< ArrayOfSets< Tensor, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
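The `( CUDA || HIP ) && CHAI` condition is now repeated verbatim across a dozen-plus test type lists. Since this change set already introduces LVARRAY_USE_DEVICE (see the math.hpp hunks above), the guards could be collapsed, assuming that macro is defined whenever either device backend is enabled:

    #if defined(LVARRAY_USE_DEVICE) && defined(LVARRAY_USE_CHAI)
    , std::pair< ArrayOfSets< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
    #endif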
diff --git a/unitTests/testArray_ChaiBuffer.cpp b/unitTests/testArray_ChaiBuffer.cpp
index 34825981..8bd5aaed 100644
--- a/unitTests/testArray_ChaiBuffer.cpp
+++ b/unitTests/testArray_ChaiBuffer.cpp
@@ -42,6 +42,10 @@ class ArrayTest : public ::testing::Test
 auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) );
 std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::cuda };
 std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool };
+  #elif defined(LVARRAY_USE_HIP)
+  auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) );
+  std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::hip };
+  std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool };
 #else
 std::initializer_list< MemorySpace > const spaces = { MemorySpace::host };
 std::initializer_list< umpire::Allocator > const allocators = { hostPool };
@@ -61,13 +65,19 @@ class ArrayTest : public ::testing::Test
 array.move( MemorySpace::cuda, true );
 EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "DEVICE_pool" );
+  array.move( MemorySpace::host, true );
+  EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "HOST_pool" );
+  #elif defined(LVARRAY_USE_HIP)
+  array.move( MemorySpace::hip, true );
+  EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "DEVICE_pool" );
+
 array.move( MemorySpace::host, true );
 EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "HOST_pool" );
 #endif
 }
 
 #if defined( LVARRAY_USE_CUDA )
-  void testDeviceAlloc()
+  void testCudaDeviceAlloc()
 {
 Array< int, 1, RAJA::PERM_I, int, ChaiBuffer > array;
@@ -86,6 +96,26 @@ class ArrayTest : public ::testing::Test
 }
 }
 #endif
+#if defined(LVARRAY_USE_HIP)
+  void testHIPDeviceAlloc()
+  {
+    Array< T, 1, RAJA::PERM_I, int, ChaiBuffer > array;
+
+    array.resizeWithoutInitializationOrDestruction( MemorySpace::hip, 100 );
+
+    T * const devPtr = array.data();
+    forall< parallelDevicePolicy< 32 > >( array.size(), [devPtr] LVARRAY_DEVICE ( int const i )
+    {
+      new ( &devPtr[ i ] ) T( i );
+    } );
+
+    array.move( MemorySpace::host, true );
+    for( int i = 0; i < array.size(); ++i )
+    {
+      EXPECT_EQ( array[ i ], T( i ) );
+    }
+  }
+#endif
 };
 
/// The list of types to instantiate ArrayTest with.
@@ -104,7 +134,15 @@ TYPED_TEST( ArrayTest, AllocatorConstruction ) TYPED_TEST( ArrayTest, DeviceAlloc ) { - this->testDeviceAlloc(); + this->testCudaDeviceAlloc(); +} + +#endif +#if defined(LVARRAY_USE_HIP) + +TYPED_TEST( ArrayTest, DeviceAlloc ) +{ + this->testHIPDeviceAlloc(); } #endif diff --git a/unitTests/testCRSMatrix.cpp b/unitTests/testCRSMatrix.cpp index 987aa4e9..3c6c0556 100644 --- a/unitTests/testCRSMatrix.cpp +++ b/unitTests/testCRSMatrix.cpp @@ -1036,7 +1036,7 @@ using CRSMatrixViewTestTypes = ::testing::Types< , std::pair< CRSMatrix< TestString, int, std::ptrdiff_t, ChaiBuffer >, serialPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< CRSMatrix< int, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< CRSMatrix< Tensor, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif @@ -1276,7 +1276,7 @@ using CRSMatrixViewAtomicTestTypes = ::testing::Types< , std::pair< CRSMatrix< double, int, std::ptrdiff_t, ChaiBuffer >, parallelHostPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< CRSMatrix< int, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< CRSMatrix< double, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testChaiBuffer.cpp b/unitTests/testChaiBuffer.cpp index 8c6d9937..27b3401a 100644 --- a/unitTests/testChaiBuffer.cpp +++ b/unitTests/testChaiBuffer.cpp @@ -41,6 +41,10 @@ class ChaiBufferTest : public ::testing::Test auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::cuda }; std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; + #elif defined( LVARRAY_USE_HIP ) + auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); + std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::hip }; + std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; #else std::initializer_list< MemorySpace > const spaces = { MemorySpace::host }; std::initializer_list< umpire::Allocator > const allocators = { hostPool }; @@ -62,6 +66,12 @@ class ChaiBufferTest : public ::testing::Test buffer.move( MemorySpace::cuda, true ); EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "DEVICE_pool" ); + buffer.move( MemorySpace::host, true ); + EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "HOST_pool" ); + #elif defined(LVARRAY_USE_HIP) + buffer.move( MemorySpace::hip, true ); + EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "DEVICE_pool" ); + buffer.move( MemorySpace::host, true ); EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "HOST_pool" ); #endif @@ -188,6 +198,126 @@ class ChaiBufferTest : public ::testing::Test EXPECT_EQ( buffer[ i ], T( i ) ); } + bufferManipulation::free( buffer, size ); + } +#elif defined( LVARRAY_USE_HIP ) + void testMove() + { + ChaiBuffer< T > buffer( true ); + + int const size = 100; + buffer.reallocate( 0, MemorySpace::host, size ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + + for( int i = 0; i < size; ++i ) + { + new ( &buffer[ i ] ) T( i ); + } + + buffer.move( 
MemorySpace::hip, true ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + T * const devPtr = buffer.data(); + + forall< parallelDevicePolicy< 32 > >( size, [devPtr] LVARRAY_DEVICE ( int const i ) + { + devPtr[ i ] += devPtr[ i ]; + } ); + + // Check that the device changes are seen on the host. Then modify the values without touching. + buffer.move( MemorySpace::host, false ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + for( int i = 0; i < size; ++i ) + { + EXPECT_EQ( buffer[ i ], T( i ) + T( i ) ); + buffer[ i ] = T( 0 ); + } + + buffer.move( MemorySpace::hip, true ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + forall< parallelDevicePolicy< 32 > >( size, [devPtr] LVARRAY_DEVICE ( int const i ) + { + devPtr[ i ] += devPtr[ i ]; + } ); + + buffer.move( MemorySpace::host, false ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + for( int i = 0; i < size; ++i ) + { + EXPECT_EQ( buffer[ i ], T( i ) + T( i ) + T( i ) + T( i ) ); + } + + bufferManipulation::free( buffer, size ); + } + + void testCapture() + { + ChaiBuffer< T > buffer( true ); + + int const size = 100; + buffer.reallocate( 0, MemorySpace::host, size ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + + for( int i = 0; i < size; ++i ) + { + new ( &buffer[ i ] ) T( i ); + } + + forall< parallelDevicePolicy< 32 > >( size, [buffer] LVARRAY_DEVICE ( int const i ) + { + buffer[ i ] += buffer[ i ]; + } ); + + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + + + // Check that the device changes are seen on the host. Then modify the values without touching. + ChaiBuffer< T const > constBuffer( buffer ); + forall< serialPolicy >( size, [constBuffer] ( int const i ) + { + EXPECT_EQ( constBuffer[ i ], T( i ) + T( i ) ); + const_cast< T & >( constBuffer[ i ] ) = T( 0 ); + } ); + + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + EXPECT_EQ( constBuffer.getPreviousSpace(), MemorySpace::host ); + + forall< parallelDevicePolicy< 32 > >( size, [buffer] LVARRAY_DEVICE ( int const i ) + { + buffer[ i ] += buffer[ i ]; + } ); + + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + + forall< serialPolicy >( size, [constBuffer] ( int const i ) + { + EXPECT_EQ( constBuffer[ i ], T( i ) + T( i ) + T( i ) + T( i ) ); + } ); + + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + EXPECT_EQ( constBuffer.getPreviousSpace(), MemorySpace::host ); + + bufferManipulation::free( buffer, size ); + } + + void testDeviceRealloc() + { + ChaiBuffer< T > buffer( true ); + + int const size = 100; + buffer.reallocate( 0, MemorySpace::hip, size ); + + T * const devPtr = buffer.data(); + forall< parallelDevicePolicy< 32 > >( size, [devPtr] LVARRAY_DEVICE ( int const i ) + { + new ( &devPtr[ i ] ) T( i ); + } ); + + buffer.move( MemorySpace::host, true ); + for( int i = 0; i < size; ++i ) + { + EXPECT_EQ( buffer[ i ], T( i ) ); + } + bufferManipulation::free( buffer, size ); } #endif @@ -205,7 +335,7 @@ TYPED_TEST( ChaiBufferTest, AllocatorConstruction ) this->testAllocatorConstruction(); } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP ) TYPED_TEST( ChaiBufferTest, Move ) { diff --git a/unitTests/testMath.cpp b/unitTests/testMath.cpp index 08502c4f..f6d193cb 100644 --- a/unitTests/testMath.cpp +++ b/unitTests/testMath.cpp @@ -145,12 +145,14 @@ using TestMathTypes = ::testing::Types< , std::pair< long long int, serialPolicy > , std::pair< float, serialPolicy > , std::pair< double, serialPolicy > -#if defined( 
LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
 , std::pair< int, parallelDevicePolicy< 32 > >
 , std::pair< long int, parallelDevicePolicy< 32 > >
 , std::pair< long long int, parallelDevicePolicy< 32 > >
 , std::pair< float, parallelDevicePolicy< 32 > >
 , std::pair< double, parallelDevicePolicy< 32 > >
+#endif
+#if defined( LVARRAY_USE_CUDA )
 , std::pair< __half, parallelDevicePolicy< 32 > >
 #endif
 >;
@@ -331,7 +333,7 @@ struct TestMath2 : public ::testing::Test
 }
 };
-#if defined( LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
 using TestMath2Types = ::testing::Types<
 std::pair< __half2, parallelDevicePolicy< 32 > >
@@ -403,7 +405,8 @@ void forAllHalvesinMinus1to1( bool const include1, LAMBDA && lambda )
 }
 } );
 }
-
+#endif
+#if defined(LVARRAY_USE_CUDA)
 void asinHalfAccuracy()
 {
 RAJA::ReduceMax< RAJA::cuda_reduce, double > maxDiff( 0 );
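Note that the half-precision coverage is split deliberately: the __half2 TestMath2 list now compiles for both backends, while asinHalfAccuracy() stays CUDA-only because it uses RAJA::cuda_reduce. The HIP memcpy tests that follow spell out RAJA::hip_exec< 32 > where the CUDA originals use the parallelDevicePolicy alias; if parallelDevicePolicy< N > maps to RAJA::hip_exec< N > under LVARRAY_USE_HIP (an assumption, consistent with its use in the other HIP tests above), the loops could be written identically in both branches:

    // Hypothetical, assuming parallelDevicePolicy< 32 > == RAJA::hip_exec< 32 > under HIP:
    forall< parallelDevicePolicy< 32 > >( y.size(), [yPtr] LVARRAY_DEVICE ( std::ptrdiff_t const i )
    {
      PORTABLE_EXPECT_EQ( yPtr[ i ], i );
      yPtr[ i ] *= 2;
    } );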
diff --git a/unitTests/testMemcpy.cpp b/unitTests/testMemcpy.cpp
index f3adcece..0e44243d 100644
--- a/unitTests/testMemcpy.cpp
+++ b/unitTests/testMemcpy.cpp
@@ -242,7 +242,106 @@ void testAsyncMemcpyDevice()
 {
 EXPECT_EQ( x[ i ], -i );
 }
 }
+#elif defined(LVARRAY_USE_HIP)
+template< template< typename > class BUFFER_TYPE >
+void testMemcpyDevice()
+{
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > x( 100 );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    x[ i ] = i;
+  }
+
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > y( x.size() );
+  y.move( MemorySpace::hip );
+  int * yPtr = y.data();
+
+  memcpy< 0, 0 >( y, {}, x.toViewConst(), {} );
+
+  forall< RAJA::hip_exec< 32 > >( y.size(), [yPtr] LVARRAY_DEVICE ( std::ptrdiff_t const i )
+  {
+    PORTABLE_EXPECT_EQ( yPtr[ i ], i );
+    yPtr[ i ] *= 2;
+  } );
+
+  memcpy< 0, 0 >( x, {}, y.toViewConst(), {} );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    EXPECT_EQ( x[ i ], 2 * i );
+  }
+
+  // Move y to the CPU but then capture and modify a view on device. This way y's data pointer is still pointing
+  // to host memory but the subsequent memcpy should pick up that its previous space is on device.
+  y.move( MemorySpace::host );
+
+  ArrayView< int, 1, 0, std::ptrdiff_t, BUFFER_TYPE > const yView = y.toView();
+  forall< RAJA::hip_exec< 32 > >( y.size(), [yView] LVARRAY_DEVICE ( std::ptrdiff_t const i )
+  {
+    yView[ i ] = -i;
+  } );
+
+  memcpy< 0, 0 >( x, {}, y.toViewConst(), {} );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    EXPECT_EQ( x[ i ], -i );
+  }
+}
+
+template< template< typename > class BUFFER_TYPE >
+void testAsyncMemcpyDevice()
+{
+  camp::resources::Resource stream{ camp::resources::Hip{} };
+
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > x( 100 );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    x[ i ] = i;
+  }
+
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > y( x.size() );
+  y.move( MemorySpace::hip );
+  int * yPtr = y.data();
+
+  camp::resources::Event e = memcpy< 0, 0 >( stream, y.toView(), {}, x.toViewConst(), {} );
+  stream.wait_for( &e );
+
+  forall< RAJA::hip_exec< 32 > >( y.size(), [yPtr] LVARRAY_DEVICE ( std::ptrdiff_t const i )
+  {
+    PORTABLE_EXPECT_EQ( yPtr[ i ], i );
+    yPtr[ i ] *= 2;
+  } );
+
+  e = memcpy< 0, 0 >( stream, x, {}, y.toViewConst(), {} );
+  stream.wait_for( &e );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    EXPECT_EQ( x[ i ], 2 * i );
+  }
+
+  // Move y to the CPU but then capture and modify a view on device. This way y's data pointer is still pointing
+  // to host memory but the subsequent memcpy should pick up that its previous space is on device.
+  y.move( MemorySpace::host );
+
+  ArrayView< int, 1, 0, std::ptrdiff_t, BUFFER_TYPE > const yView = y.toView();
+  forall< RAJA::hip_exec< 32 > >( y.size(), [yView] LVARRAY_DEVICE ( std::ptrdiff_t const i )
+  {
+    yView[ i ] = -i;
+  } );
+
+  e = memcpy< 0, 0 >( stream, x, {}, y.toViewConst(), {} );
+  stream.wait_for( &e );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    EXPECT_EQ( x[ i ], -i );
+  }
+}
 #endif
 
 TEST( TestMemcpy, MallocBuffer1D )
@@ -282,7 +381,7 @@ TEST( TestMemcpy, ChaiBuffer2D )
 {
 testMemcpy2D< ChaiBuffer >();
 }
-#if defined( LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
 TEST( TestMemcpy, ChaiBufferDevice )
 {
diff --git a/unitTests/testSortedArray.cpp b/unitTests/testSortedArray.cpp
index 5198bd24..fe52ddfc 100644
--- a/unitTests/testSortedArray.cpp
+++ b/unitTests/testSortedArray.cpp
@@ -451,7 +451,7 @@ using SortedArrayViewTestTypes = ::testing::Types<
 std::pair< SortedArray< int, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy >
 , std::pair< SortedArray< Tensor, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy >
 , std::pair< SortedArray< TestString, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy >
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::pair< SortedArray< int, INDEX_TYPE, ChaiBuffer >, parallelDevicePolicy< 32 > >
 , std::pair< SortedArray< Tensor, INDEX_TYPE, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
diff --git a/unitTests/testSortedArrayManipulation.cpp b/unitTests/testSortedArrayManipulation.cpp
index 2d784cb2..ae376cb4 100644
--- a/unitTests/testSortedArrayManipulation.cpp
+++ b/unitTests/testSortedArrayManipulation.cpp
@@ -190,7 +190,7 @@ using SingleArrayTestTypes = ::testing::Types<
 , std::tuple< TestString, sortedArrayManipulation::less< TestString >, serialPolicy >
 , std::tuple< TestString, sortedArrayManipulation::greater< TestString >, serialPolicy >
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::tuple< int, sortedArrayManipulation::less< int >, parallelDevicePolicy< 256 > >
 , std::tuple< int, sortedArrayManipulation::greater< int >, parallelDevicePolicy< 256 > >
 , std::tuple< Tensor, sortedArrayManipulation::less< Tensor >, parallelDevicePolicy< 256 > >
@@ -290,7 +290,7 @@ using DualArrayTestTypes = ::testing::Types<
 , std::tuple< TestString, TestString, sortedArrayManipulation::less< TestString >, serialPolicy >
 , std::tuple< TestString, TestString, sortedArrayManipulation::greater< TestString >, serialPolicy >
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::tuple< int, int, sortedArrayManipulation::less< int >, parallelDevicePolicy< 256 > >
 , std::tuple< int, int, sortedArrayManipulation::greater< int >, parallelDevicePolicy< 256 > >
 , std::tuple< Tensor, Tensor, sortedArrayManipulation::less< Tensor >, parallelDevicePolicy< 256 > >
diff --git a/unitTests/testSparsityPattern.cpp b/unitTests/testSparsityPattern.cpp
index 50ec30f9..fee7a995 100644
--- a/unitTests/testSparsityPattern.cpp
+++ b/unitTests/testSparsityPattern.cpp
@@ -1016,7 +1016,7 @@ using SparsityPatternViewTestTypes = ::testing::Types<
 #endif
 #endif
-#if defined(LVARRAY_USE_CUDA) && 
defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< SparsityPattern< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #if !defined( __ibmxl__ ) , std::pair< SparsityPattern< uint, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > @@ -1171,7 +1171,7 @@ using CRSMatrixTestTypes = ::testing::Types< std::pair< CRSMatrix< int, int, std::ptrdiff_t, MallocBuffer >, serialPolicy > , std::pair< CRSMatrix< Tensor, int, std::ptrdiff_t, MallocBuffer >, serialPolicy > , std::pair< CRSMatrix< TestString, int, std::ptrdiff_t, MallocBuffer >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< CRSMatrix< int, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< CRSMatrix< Tensor, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testStackArray.cpp b/unitTests/testStackArray.cpp index 249ccebb..e29206ab 100644 --- a/unitTests/testStackArray.cpp +++ b/unitTests/testStackArray.cpp @@ -281,7 +281,7 @@ using StackArrayCaptureTestTypes = ::testing::Types< , std::pair< RAJA::PERM_KIJ, serialPolicy > , std::pair< RAJA::PERM_KJI, serialPolicy > -#if defined(LVARRAY_USE_CUDA) +#if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) , std::pair< RAJA::PERM_I, parallelDevicePolicy< 32 > > , std::pair< RAJA::PERM_IJ, parallelDevicePolicy< 32 > > , std::pair< RAJA::PERM_JI, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsEigen.cpp b/unitTests/testTensorOpsEigen.cpp index 46ff354d..2c556ec7 100644 --- a/unitTests/testTensorOpsEigen.cpp +++ b/unitTests/testTensorOpsEigen.cpp @@ -243,7 +243,7 @@ using TestEigendecompositionTypes = ::testing::Types< , std::tuple< double, double, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< std::int64_t, double, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< float, float, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< std::int64_t, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< float, float, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsFixedSize.cpp b/unitTests/testTensorOpsFixedSize.cpp index e66fd5a3..21392a17 100644 --- a/unitTests/testTensorOpsFixedSize.cpp +++ b/unitTests/testTensorOpsFixedSize.cpp @@ -569,7 +569,7 @@ using FixedSizeSquareMatrixTestTypes = ::testing::Types< std::tuple< double, std::integral_constant< int, 2 >, serialPolicy > , std::tuple< double, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< double, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > > #endif @@ -616,6 +616,5 @@ TYPED_TEST( FixedSizeSquareMatrixTest, denseToSymmetric ) { this->denseToSymmetric(); } - } // namespace testing } // namespace LvArray diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp index 4909a686..9edfa950 
100644 --- a/unitTests/testTensorOpsInverse.hpp +++ b/unitTests/testTensorOpsInverse.hpp @@ -375,7 +375,7 @@ using InverseTestTypes = ::testing::Types< , std::tuple< float, float, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< int, double, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< float, float, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< int, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > @@ -400,7 +400,7 @@ using InverseFloatOnlyTestTypes = ::testing::Types< , std::tuple< double, double, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< float, float, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< float, float, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< double, double, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsNoSize.cpp b/unitTests/testTensorOpsNoSize.cpp index b08e5ae1..8c1112d4 100644 --- a/unitTests/testTensorOpsNoSize.cpp +++ b/unitTests/testTensorOpsNoSize.cpp @@ -349,7 +349,7 @@ using NoSizeTestTypes = ::testing::Types< std::tuple< double, serialPolicy > , std::tuple< int, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, parallelDevicePolicy< 32 > > , std::tuple< int, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testTensorOpsOneSize.cpp b/unitTests/testTensorOpsOneSize.cpp index fc351c75..78946638 100644 --- a/unitTests/testTensorOpsOneSize.cpp +++ b/unitTests/testTensorOpsOneSize.cpp @@ -693,7 +693,7 @@ using OneSizeTestTypes = ::testing::Types< , std::tuple< int, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< double, std::integral_constant< int, 6 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< int, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > > , std::tuple< double, std::integral_constant< int, 6 >, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsThreeSizes.hpp b/unitTests/testTensorOpsThreeSizes.hpp index 5a27092a..b4546a9b 100644 --- a/unitTests/testTensorOpsThreeSizes.hpp +++ b/unitTests/testTensorOpsThreeSizes.hpp @@ -530,7 +530,7 @@ using ThreeSizesTestTypes = ::testing::Types< std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, diff --git a/unitTests/testTensorOpsTwoSizes.hpp b/unitTests/testTensorOpsTwoSizes.hpp index 07978011..5492b2b5 100644 --- a/unitTests/testTensorOpsTwoSizes.hpp +++ b/unitTests/testTensorOpsTwoSizes.hpp @@ 
-930,7 +930,7 @@ using TwoSizesTestTypes = ::testing::Types<
 , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, serialPolicy >
 , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, serialPolicy >
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
 , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, parallelDevicePolicy< 32 > >
 , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
diff --git a/unitTests/testTensorOpsTwoSizes1.cpp b/unitTests/testTensorOpsTwoSizes1.cpp
index 7f5a97d5..96ac793c 100644
--- a/unitTests/testTensorOpsTwoSizes1.cpp
+++ b/unitTests/testTensorOpsTwoSizes1.cpp
@@ -6,13 +6,941 @@
 */
 
// Source includes
-#include "testTensorOpsTwoSizes.hpp"
+//#include "testTensorOpsTwoSizes.hpp"
+
+// Source includes
+#include "tensorOps.hpp"
+#include "Array.hpp"
+#include "testUtils.hpp"
+#include "output.hpp"
+#include "testTensorOpsCommon.hpp"
+
+// TPL includes
+#include <gtest/gtest.h>
 
namespace LvArray
{
namespace testing
{
 
+template< typename T_N_M_POLICY_TUPLE >
+class TwoSizesTest : public ::testing::Test
+{
+public:
+  using T = std::tuple_element_t< 0, T_N_M_POLICY_TUPLE >;
+  static constexpr std::ptrdiff_t N = std::tuple_element_t< 1, T_N_M_POLICY_TUPLE > {};
+  static constexpr std::ptrdiff_t M = std::tuple_element_t< 2, T_N_M_POLICY_TUPLE > {};
+  using POLICY = std::tuple_element_t< 3, T_N_M_POLICY_TUPLE >;
+
+  void SetUp() override
+  {
+    fill( m_matrixA_IJK.toSlice(), m_matrixASeed );
+    fill( m_matrixA_IKJ.toSlice(), m_matrixASeed );
+    fill( m_matrixA_KJI.toSlice(), m_matrixASeed );
+    fill( m_matrixA_local, m_matrixASeed );
+
+    fill( m_matrixB_IJK.toSlice(), m_matrixBSeed );
+    fill( m_matrixB_IKJ.toSlice(), m_matrixBSeed );
+    fill( m_matrixB_KJI.toSlice(), m_matrixBSeed );
+    fill( m_matrixB_local, m_matrixBSeed );
+
+    fill( m_matrixNN_IJK.toSlice(), m_matrixNNSeed );
+    fill( m_matrixNN_IKJ.toSlice(), m_matrixNNSeed );
+    fill( m_matrixNN_KJI.toSlice(), m_matrixNNSeed );
+    fill( m_matrixNN_local, m_matrixNNSeed );
+
+    fill( m_matrixMN_IJK.toSlice(), m_matrixMNSeed );
+    fill( m_matrixMN_IKJ.toSlice(), m_matrixMNSeed );
+    fill( m_matrixMN_KJI.toSlice(), m_matrixMNSeed );
+    fill( m_matrixMN_local, m_matrixMNSeed );
+
+    fill( m_vectorN_IJ.toSlice(), m_vectorNSeed );
+    fill( m_vectorN_JI.toSlice(), m_vectorNSeed );
+    fill( m_vectorN_local, m_vectorNSeed );
+
+    fill( m_vectorM_IJ.toSlice(), m_vectorMSeed );
+    fill( m_vectorM_JI.toSlice(), m_vectorMSeed );
+    fill( m_vectorM_local, m_vectorMSeed );
+  }
+
+  void testScale()
+  {
+    T scale = T( 3.14 );
+    T result[ N ][ M ];
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < M; ++j )
+      {
+        result[ i ][ j ] = m_matrixA_local[ i ][ j ] * scale;
+      }
+    }
+
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    std::ptrdiff_t const aSeed = m_matrixASeed;
+    forall< POLICY >( 1, [scale, result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, aSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      tensorOps::scale< N, M >( matrixA_IJK[ 0 ], scale );
+      CHECK_EQUALITY_2D( N, M, matrixA_IJK[ 0 ], result );
+
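+      // The same check is repeated below for each storage permutation and for a
+      // stack-local matrix: every layout must produce the identical scaled result.
+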
tensorOps::scale< N, M >( matrixA_IKJ[ 0 ], scale ); + CHECK_EQUALITY_2D( N, M, matrixA_IKJ[ 0 ], result ); + + tensorOps::scale< N, M >( matrixA_KJI[ 0 ], scale ); + CHECK_EQUALITY_2D( N, M, matrixA_KJI[ 0 ], result ); + + T matrix_local[ N ][ M ]; + fill( matrix_local, aSeed ); + tensorOps::scale< N, M >( matrix_local, scale ); + CHECK_EQUALITY_2D( N, M, matrix_local, result ); + } ); + } + + void testFill() + { + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + forall< POLICY >( 1, [matrixA_IJK, matrixA_IKJ, matrixA_KJI] LVARRAY_HOST_DEVICE ( int ) + { + for( int i = 0; i < 3; ++i ) + { + T const value = 3.14 * i; + tensorOps::fill< N, M >( matrixA_IJK[ 0 ], value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrixA_IJK( 0, j, k ), value ); + } + } + + tensorOps::fill< N, M >( matrixA_IKJ[ 0 ], value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrixA_IKJ( 0, j, k ), value ); + } + } + + tensorOps::fill< N, M >( matrixA_KJI[ 0 ], value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrixA_KJI( 0, j, k ), value ); + } + } + + T matrix_local[ N ][ M ]; + tensorOps::fill< N, M >( matrix_local, value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrix_local[ j ][ k ], value ); + } + } + } + } ); + } + + void testAiBj() + { + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = m_vectorN_local[ i ] * m_vectorM_local[ j ]; + } + } + + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, vectorN_IJ, vectorN_JI, vectorN_local, + vectorM_IJ, vectorM_JI, vectorM_local, matrixSeed] LVARRAY_HOST_DEVICE ( int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( matrix, matrixSeed ); \ + tensorOps::Rij_eq_AiBj< N, M >( matrix, vectorN, vectorM ); \ + CHECK_EQUALITY_2D( N, M, matrix, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], 
vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testPlusAiBj() + { + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = m_matrixA_local[ i ][ j ] + m_vectorN_local[ i ] * m_vectorM_local[ j ]; + } + } + + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, vectorN_IJ, vectorN_JI, vectorN_local, vectorM_IJ, vectorM_JI, vectorM_local, matrixSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( matrix, matrixSeed ); \ + tensorOps::Rij_add_AiBj< N, M >( matrix, vectorN, vectorM ); \ + CHECK_EQUALITY_2D( N, M, matrix, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], 
vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testAijBj() + { + T result[ N ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + dot += m_matrixA_local[ i ][ j ] * m_vectorM_local[ j ]; + } + result[ i ] = dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorN_JI = m_vectorN_JI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const vectorNSeed = m_vectorNSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorM_IJ, vectorM_JI, vectorM_local, vectorNSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorN, vectorNSeed ); \ + tensorOps::Ri_eq_AijBj< N, M >( vectorN, matrix, vectorM ); \ + CHECK_EQUALITY_1D( N, vectorN, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T vectorN_local[ N ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testPlusAijBj() + { + T result[ N ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + dot += m_matrixA_local[ i ][ j ] * m_vectorM_local[ j ]; + } + result[ i ] = m_vectorN_local[ i ] + dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorN_JI = m_vectorN_JI.toView(); + + 
ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const vectorNSeed = m_vectorNSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorM_IJ, vectorM_JI, vectorM_local, vectorNSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorN, vectorNSeed ); \ + tensorOps::Ri_add_AijBj< N, M >( vectorN, matrix, vectorM ); \ + CHECK_EQUALITY_1D( N, vectorN, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T vectorN_local[ N ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testAjiBj() + { + T result[ M ]; + for( std::ptrdiff_t i = 0; i < M; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + dot += m_matrixA_local[ j ][ i ] * m_vectorN_local[ j ]; + } + result[ i ] = dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorM_JI = m_vectorM_JI.toView(); + + std::ptrdiff_t const vectorMSeed = m_vectorMSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorN_local, vectorM_IJ, vectorM_JI, vectorMSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorM, vectorMSeed ); \ + tensorOps::Ri_eq_AjiBj< M, N >( vectorM, matrix, vectorN ); \ + CHECK_EQUALITY_1D( M, vectorM, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, 
vectorN, vectorM1 ); \
+        _TEST( matrix, vectorN, vectorM2 )
+
+      T vectorM_local[ M ];
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testPlusAjiBj()
+  {
+    T result[ M ];
+    for( std::ptrdiff_t i = 0; i < M; ++i )
+    {
+      T dot = 0;
+      for( std::ptrdiff_t j = 0; j < N; ++j )
+      {
+        dot += m_matrixA_local[ j ][ i ] * m_vectorN_local[ j ];
+      }
+      result[ i ] = m_vectorM_local[ i ] + dot;
+    }
+
+    ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst();
+    T const ( &matrix_local )[ N ][ M ] = m_matrixA_local;
+
+    ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst();
+    ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst();
+    T const ( &vectorN_local )[ N ] = m_vectorN_local;
+
+    ArrayViewT< T, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toView();
+    ArrayViewT< T, 2, 0 > const vectorM_JI = m_vectorM_JI.toView();
+
+    std::ptrdiff_t const vectorMSeed = m_vectorMSeed;
+
+    forall< POLICY >( 1,
+                      [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI,
+                       vectorN_local, vectorM_IJ, vectorM_JI, vectorMSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( matrix, vectorN, vectorM ) \
+        fill( vectorM, vectorMSeed ); \
+        tensorOps::Ri_add_AjiBj< M, N >( vectorM, matrix, vectorN ); \
+        CHECK_EQUALITY_1D( M, vectorM, result )
+
+      #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \
+        _TEST( matrix, vectorN, vectorM0 ); \
+        _TEST( matrix, vectorN, vectorM1 ); \
+        _TEST( matrix, vectorN, vectorM2 )
+
+      T vectorM_local[ M ];
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testCopy()
+  {
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst();
+    T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local;
+
+    std::ptrdiff_t const matrixSeed = m_matrixASeed;
+
+    forall< POLICY >( 1, [matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( dstMatrix, srcMatrix ) \
+        fill( dstMatrix, matrixSeed ); \
+        tensorOps::copy< N, M >( dstMatrix, srcMatrix ); \
+        CHECK_EQUALITY_2D( N, M, dstMatrix, srcMatrix )
+
+      #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \
+        _TEST( dstMatrix, srcMatrix0 ); \
+        _TEST( dstMatrix, srcMatrix1 ); \
+        _TEST( dstMatrix, srcMatrix2 ); \
+        _TEST( dstMatrix, srcMatrix3 )
+
+      T matrix_local[ N ][ M ];
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testScaledCopy()
+  {
+    T const scale = T( 3.14 );
+    T result[ N ][ M ];
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < M; ++j )
+      {
+        result[ i ][ j ] = scale * m_matrixB_local[ i ][ j ];
+      }
+    }
+
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst();
+    T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local;
+
+    std::ptrdiff_t const matrixSeed = m_matrixASeed;
+
+    forall< POLICY >( 1, [scale, result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( dstMatrix, srcMatrix ) \
+        fill( dstMatrix, matrixSeed ); \
+        tensorOps::scaledCopy< N, M >( dstMatrix, srcMatrix, scale ); \
+        CHECK_EQUALITY_2D( N, M, dstMatrix, result )
+
+      #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \
+        _TEST( dstMatrix, srcMatrix0 ); \
+        _TEST( dstMatrix, srcMatrix1 ); \
+        _TEST( dstMatrix, srcMatrix2 ); \
+        _TEST( dstMatrix, srcMatrix3 )
+
+      T matrix_local[ N ][ M ];
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testAdd()
+  {
+    T result[ N ][ M ];
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < M; ++j )
+      {
+        result[ i ][ j ] = m_matrixA_local[ i ][ j ] + m_matrixB_local[ i ][ j ];
+      }
+    }
+
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst();
+    T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local;
+
+    std::ptrdiff_t const matrixSeed = m_matrixASeed;
+
+    forall< POLICY >( 1, [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( dstMatrix, srcMatrix ) \
+        fill( dstMatrix, matrixSeed ); \
+        tensorOps::add< N, M >( dstMatrix, srcMatrix ); \
+        CHECK_EQUALITY_2D( N, M, dstMatrix, result )
+
+      #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \
+        _TEST( dstMatrix, srcMatrix0 ); \
+        _TEST( dstMatrix, srcMatrix1 ); \
+        _TEST( dstMatrix, srcMatrix2 ); \
+        _TEST( dstMatrix, srcMatrix3 )
+
+      T matrix_local[ N ][ M ];
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testScaledAdd()
+  {
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst();
+
+    T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local;
+
+    std::ptrdiff_t const matrixSeed = m_matrixASeed;
+
+    forall< POLICY >( 1, [matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( dstMatrix, srcMatrix ) \
+        fill( dstMatrix, matrixSeed ); \
+        tensorOps::scaledAdd< N, M >( dstMatrix, srcMatrix, scale ); \
+        CHECK_EQUALITY_2D( N, M, dstMatrix, result )
+
+      #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \
+        _TEST( dstMatrix, srcMatrix0 ); \
+        _TEST( dstMatrix, srcMatrix1 ); \
+        _TEST( dstMatrix, srcMatrix2 ); \
+        _TEST( dstMatrix, srcMatrix3 )
+
+      T matrixA_local[ N ][ M ];
+      fill( matrixA_local, matrixSeed );
+
+      T const scale = T( 3.14 );
+      T result[ N ][ M ];
+      for( std::ptrdiff_t i = 0; i < N; ++i )
+      {
+        for( std::ptrdiff_t j = 0; j < M; ++j )
+        {
+          result[ i ][ j ] = matrixA_local[ i ][ j ] + scale * matrixB_local[ i ][ j ];
+        }
+      }
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testAkiAkj()
+  {
+    T result[ N ][ N ];
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < N; ++j )
+      {
+        T dot = 0;
+        for( std::ptrdiff_t k = 0; k < M; ++k )
+        {
+          dot += m_matrixMN_local[ k ][ i ] * m_matrixMN_local[ k ][ j ];
+        }
+        result[ i ][ j ] = dot;
+      }
+    }
+
+    ArrayViewT< T const, 3, 2 > const matrixMN_IJK = m_matrixMN_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixMN_IKJ = m_matrixMN_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixMN_KJI = m_matrixMN_KJI.toViewConst();
+    T const ( &matrixMN_local )[ M ][ N ] = m_matrixMN_local;
+
+    ArrayViewT< T, 3, 2 > const matrixNN_IJK = m_matrixNN_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixNN_IKJ = m_matrixNN_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixNN_KJI = m_matrixNN_KJI.toView();
+
+    std::ptrdiff_t const matrixNNSeed = m_matrixNNSeed;
+
+    forall< POLICY >( 1,
+                      [result, matrixMN_IJK, matrixMN_IKJ, matrixMN_KJI, matrixMN_local, matrixNN_IJK,
+                       matrixNN_IKJ, matrixNN_KJI, matrixNNSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( matrixNN, matrixMN ) \
+        fill( matrixNN, matrixNNSeed ); \
+        tensorOps::Rij_eq_AkiAkj< N, M >( matrixNN, matrixMN ); \
+        CHECK_EQUALITY_2D( N, N, matrixNN, result )
+
+      #define _TEST_PERMS( matrixNN, matrixMN0, matrixMN1, matrixMN2, matrixMN3 ) \
+        _TEST( matrixNN, matrixMN0 ); \
+        _TEST( matrixNN, matrixMN1 ); \
+        _TEST( matrixNN, matrixMN2 ); \
+        _TEST( matrixNN, matrixMN3 )
+
+      T matrixNN_local[ N ][ N ];
+
+      _TEST_PERMS( matrixNN_IJK[ 0 ], matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixNN_IKJ[ 0 ], matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixNN_KJI[ 0 ], matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixNN_local, matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testPlusAikAjk()
+  {
+    T result[ N ][ N ];
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < N; ++j )
+      {
+        T dot = 0;
+        for( std::ptrdiff_t k = 0; k < M; ++k )
+        {
+          dot += m_matrixA_local[ i ][ k ] * m_matrixA_local[ j ][ k ];
+        }
+        result[ i ][ j ] = m_matrixNN_local[ i ][ j ] + dot;
+      }
+    }
+
+    ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst();
+    T const ( &matrixA_local )[ N ][ M ] = m_matrixA_local;
+
+    ArrayViewT< T, 3, 2 > const matrixNN_IJK = m_matrixNN_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixNN_IKJ = m_matrixNN_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixNN_KJI = m_matrixNN_KJI.toView();
+
+    std::ptrdiff_t const matrixNNSeed = m_matrixNNSeed;
+
+    forall< POLICY >( 1,
+                      [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixA_local, matrixNN_IJK,
+                       matrixNN_IKJ, matrixNN_KJI, matrixNNSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( matrixNN, matrixA ) \
+        fill( matrixNN, matrixNNSeed ); \
+        tensorOps::Rij_add_AikAjk< N, M >( matrixNN, matrixA ); \
+        CHECK_EQUALITY_2D( N, N, matrixNN, result )
+
+      #define _TEST_PERMS( matrixNN, matrixA0, matrixA1, matrixA2, matrixA3 ) \
+        _TEST( matrixNN, matrixA0 ); \
+        _TEST( matrixNN, matrixA1 ); \
+        _TEST( matrixNN, matrixA2 ); \
+        _TEST( matrixNN, matrixA3 )
+
+      T matrixNN_local[ N ][ N ];
+
+      _TEST_PERMS( matrixNN_IJK[ 0 ], matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local );
+      _TEST_PERMS( matrixNN_IKJ[ 0 ], matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local );
+      _TEST_PERMS( matrixNN_KJI[ 0 ], matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local );
+      _TEST_PERMS( matrixNN_local, matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testTranspose()
+  {
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    ArrayViewT< T const, 3, 2 > const matrixMN_IJK_view = m_matrixMN_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixMN_IKJ_view = m_matrixMN_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixMN_KJI_view = m_matrixMN_KJI.toViewConst();
+    T const ( &matrixMN_local )[ M ][ N ] = m_matrixMN_local;
+
+    std::ptrdiff_t const matrixSeed = m_matrixASeed;
+
+    forall< POLICY >( 1, [=] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( dstMatrix, srcMatrix ) \
+        fill( dstMatrix, matrixSeed ); \
+        tensorOps::transpose< N, M >( dstMatrix, srcMatrix ); \
+        for( int i = 0; i < N; ++i ) \
+        { \
+          for( int j = 0; j < M; ++j ) \
+          { \
+            PORTABLE_EXPECT_EQ( dstMatrix[ i ][ j ], srcMatrix[ j ][ i ] ); \
+          } \
+        }
+
+      #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \
+        _TEST( dstMatrix, srcMatrix0 ); \
+        _TEST( dstMatrix, srcMatrix1 ); \
+        _TEST( dstMatrix, srcMatrix2 ); \
+        _TEST( dstMatrix, srcMatrix3 )
+
+      T matrix_local[ N ][ M ];
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrix_local, matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrix_local, matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrix_local, matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+private:
+  std::ptrdiff_t const m_matrixASeed = 0;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixA_IJK { 1, N, M };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixA_IKJ { 1, N, M };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixA_KJI { 1, N, M };
+  T m_matrixA_local[ N ][ M ];
+
+  std::ptrdiff_t const m_matrixBSeed = m_matrixASeed + N * M;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixB_IJK { 1, N, M };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixB_IKJ { 1, N, M };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixB_KJI { 1, N, M };
+  T m_matrixB_local[ N ][ M ];
+
+  std::ptrdiff_t const m_matrixNNSeed = m_matrixBSeed + N * M;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixNN_IJK { 1, N, N };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixNN_IKJ { 1, N, N };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixNN_KJI { 1, N, N };
+  T m_matrixNN_local[ N ][ N ];
+
+  std::ptrdiff_t const m_matrixMNSeed = m_matrixNNSeed + N * N;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixMN_IJK { 1, M, N };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixMN_IKJ { 1, M, N };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixMN_KJI { 1, M, N };
+  T m_matrixMN_local[ M ][ N ];
+
+  std::ptrdiff_t const m_vectorNSeed = m_matrixMNSeed + N * M;
+  ArrayT< T, RAJA::PERM_IJ > m_vectorN_IJ { 1, N };
+  ArrayT< T, RAJA::PERM_JI > m_vectorN_JI { 1, N };
+  T m_vectorN_local[ N ];
+
+  std::ptrdiff_t const m_vectorMSeed = m_vectorNSeed + N;
+  ArrayT< T, RAJA::PERM_IJ > m_vectorM_IJ { 1, M };
+  ArrayT< T, RAJA::PERM_JI > m_vectorM_JI { 1, M };
+  T m_vectorM_local[ M ];
+};
+
+
+using TwoSizesTestTypes = ::testing::Types<
+  std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, serialPolicy >
+  , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, serialPolicy >
+  , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, serialPolicy >
+
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
+  , std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
+  , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, parallelDevicePolicy< 32 > >
+  , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
+#endif
+  >;
+
+TYPED_TEST_SUITE( TwoSizesTest, TwoSizesTestTypes, );
+
 TYPED_TEST( TwoSizesTest, scale )
 {
   this->testScale();
diff --git a/unitTests/testTypeManipulation.cpp b/unitTests/testTypeManipulation.cpp
index 5bef9a4c..494fb038 100644
--- a/unitTests/testTypeManipulation.cpp
+++ b/unitTests/testTypeManipulation.cpp
@@ -78,6 +78,23 @@ CUDA_TEST( typeManipulation, forEachArg )
     }, intReducer, floatReducer, doubleReducer );
   } );
+  EXPECT_EQ( intReducer.get(), 2 );
+  EXPECT_EQ( floatReducer.get(), 4 );
+  EXPECT_EQ( doubleReducer.get(), 7 );
+#elif defined(LVARRAY_USE_HIP)
+  // Test on device.
+  RAJA::ReduceSum< RAJA::hip_reduce, int > intReducer( 1 );
+  RAJA::ReduceSum< RAJA::hip_reduce, float > floatReducer( 3 );
+  RAJA::ReduceSum< RAJA::hip_reduce, double > doubleReducer( 6 );
+  forall< parallelDevicePolicy< 32 > >( 1, [intReducer, floatReducer, doubleReducer] LVARRAY_DEVICE ( int )
+  {
+    // This has to be a host-device lambda to avoid errors.
+    typeManipulation::forEachArg( [] LVARRAY_HOST_DEVICE ( auto & reducer )
+    {
+      reducer += 1;
+    }, intReducer, floatReducer, doubleReducer );
+  } );
+
   EXPECT_EQ( intReducer.get(), 2 );
   EXPECT_EQ( floatReducer.get(), 4 );
   EXPECT_EQ( doubleReducer.get(), 7 );
 
diff --git a/unitTests/testUtils.hpp b/unitTests/testUtils.hpp
index 5a2db2bf..a4a3efa1 100644
--- a/unitTests/testUtils.hpp
+++ b/unitTests/testUtils.hpp
@@ -20,6 +20,7 @@
 // TPL includes
 #include
+#include
 #include
 
 // System includes
 
@@ -72,6 +73,19 @@ struct RAJAHelper< RAJA::cuda_exec< N > >
   static constexpr MemorySpace space = MemorySpace::cuda;
 };
 
+#elif defined(LVARRAY_USE_HIP)
+
+template< unsigned long THREADS_PER_BLOCK >
+using parallelDevicePolicy = RAJA::hip_exec< THREADS_PER_BLOCK >;
+
+template< unsigned long N >
+struct RAJAHelper< RAJA::hip_exec< N > >
+{
+  using ReducePolicy = RAJA::hip_reduce;
+  using AtomicPolicy = RAJA::hip_atomic;
+  static constexpr MemorySpace space = MemorySpace::hip;
+};
+
 #endif
 
 template< typename POLICY, typename INDEX_TYPE, typename LAMBDA >
@@ -103,14 +117,14 @@ LAYOUT const & getRAJAViewLayout( RAJA::View< T, LAYOUT > const & view )
 }
 
 
-#ifndef __CUDA_ARCH__
-#define PORTABLE_EXPECT_EQ( L, R ) EXPECT_EQ( L, R )
-#define PORTABLE_EXPECT_NEAR( L, R, EPSILON ) EXPECT_LE( math::abs( ( L ) -( R ) ), EPSILON ) << \
-  STRINGIZE( L ) " = " << ( L ) << "\n" << STRINGIZE( R ) " = " << ( R );
-#else
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 #define PORTABLE_EXPECT_EQ( L, R ) LVARRAY_ERROR_IF_NE( L, R )
 #define PORTABLE_EXPECT_NEAR( L, R, EPSILON ) LVARRAY_ERROR_IF_GE_MSG( math::abs( ( L ) -( R ) ), EPSILON, \
                                                                        STRINGIZE( L ) " = " << ( L ) << "\n" << STRINGIZE( R ) " = " << ( R ) );
+#else
+#define PORTABLE_EXPECT_EQ( L, R ) EXPECT_EQ( L, R )
+#define PORTABLE_EXPECT_NEAR( L, R, EPSILON ) EXPECT_LE( math::abs( ( L ) -( R ) ), EPSILON ) << \
+  STRINGIZE( L ) " = " << ( L ) << "\n" << STRINGIZE( R ) " = " << ( R );
 #endif
 
 // Comparator that compares a std::pair by it's first object.
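
A note on the last testUtils.hpp hunk: the PORTABLE_EXPECT_* macros are selected per compilation pass, not per build. nvcc defines __CUDA_ARCH__ and hipcc defines __HIP_DEVICE_COMPILE__ only while compiling device code, so the same macro name resolves to a device-safe check in device code and to a gtest assertion on the host. A minimal standalone sketch of the same dispatch idea (the CHECK_EQ name is hypothetical and not part of the diff):

#include <cassert>
#include <gtest/gtest.h>

// Device compilation pass: gtest is unusable in device code, so fall back to a
// device-safe assert. Host compilation pass: use the normal gtest assertion.
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
  #define CHECK_EQ( a, b ) assert( ( a ) == ( b ) )
#else
  #define CHECK_EQ( a, b ) EXPECT_EQ( a, b )
#endif

Because a LVARRAY_HOST_DEVICE lambda is compiled once for the host and once for the device, a single test body written against CHECK_EQ gets the right behavior in both passes without any runtime branching.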
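
Likewise, the RAJAHelper specialization added for RAJA::hip_exec mirrors the existing CUDA one: it maps an execution policy to the matching reduction policy, atomic policy, and memory space so the tests can stay policy-generic. A rough sketch of how such a trait is consumed (sumOnPolicy is hypothetical; RAJAHelper, forall, and LVARRAY_HOST_DEVICE are the testUtils.hpp utilities, and for the device policies `values` must point to device-reachable memory, e.g. a CHAI-managed allocation):

// Sum the first n entries of an array under any registered policy. The
// reduction policy is looked up from the execution policy via RAJAHelper, so
// the same code runs with serialPolicy, RAJA::cuda_exec, or RAJA::hip_exec.
template< typename POLICY, typename T >
T sumOnPolicy( T const * const values, std::ptrdiff_t const n )
{
  RAJA::ReduceSum< typename RAJAHelper< POLICY >::ReducePolicy, T > total( 0 );
  forall< POLICY >( n, [total, values] LVARRAY_HOST_DEVICE ( std::ptrdiff_t const i )
  {
    total += values[ i ];  // reducers are captured by value and combined by RAJA
  } );
  return total.get();
}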