diff --git a/CMakeLists.txt b/CMakeLists.txt index fc8cf73d..f682d16b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ if( NOT is_submodule ) option( ENABLE_ADDR2LINE "Enable addr2line usage in stacktraces" ON ) option( ENABLE_CUDA "Build with CUDA" OFF ) + option( ENABLE_HIP "Build with HIP" OFF ) option( ENABLE_UMPIRE "Build with UMPIRE" OFF ) option( ENABLE_CHAI "Build with CHAI" OFF ) option( ENABLE_CALIPER "Build with Caliper" OFF ) @@ -80,6 +81,8 @@ blt_list_append( TO lvarray_dependencies ELEMENTS chai IF ENABLE_CHAI ) blt_list_append( TO lvarray_dependencies ELEMENTS cuda IF ENABLE_CUDA ) +blt_list_append( TO lvarray_dependencies ELEMENTS blt::hip IF ENABLE_HIP ) + blt_list_append( TO lvarray_dependencies ELEMENTS caliper IF ENABLE_CALIPER ) diff --git a/Notes.txt b/Notes.txt new file mode 100644 index 00000000..d07775c5 --- /dev/null +++ b/Notes.txt @@ -0,0 +1 @@ +./scripts/uberenv/uberenv.py --prefix=../uberenv-libs/ --spack-config-dir=./scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/ --spec="%clang@10.0.1 +umpire +chai +caliper +pylvarray +benchmarks +examples ^caliper ~adiak ~mpi ~libunwind ~libdw ~papi" \ No newline at end of file diff --git a/cmake/CMakeBasics.cmake b/cmake/CMakeBasics.cmake index 4c3ec217..25c4bef7 100644 --- a/cmake/CMakeBasics.cmake +++ b/cmake/CMakeBasics.cmake @@ -12,9 +12,9 @@ option( ENABLE_TOTALVIEW_OUTPUT "" OFF ) set( LVARRAY_BUILD_OBJ_LIBS OFF CACHE BOOL "" ) -if( NOT BLT_CXX_STD STREQUAL c++14 ) - MESSAGE(FATAL_ERROR "c++14 is NOT enabled. LvArray requires c++14") -endif() +# if( NOT BLT_CXX_STD STREQUAL c++14 ) +# MESSAGE(FATAL_ERROR "c++14 is NOT enabled. LvArray requires c++14") +# endif() blt_append_custom_compiler_flag( FLAGS_VAR CMAKE_CXX_FLAGS DEFAULT "${OpenMP_CXX_FLAGS}") diff --git a/cmake/Config.cmake b/cmake/Config.cmake index 0a44fd1b..c513fbab 100644 --- a/cmake/Config.cmake +++ b/cmake/Config.cmake @@ -2,8 +2,10 @@ set( PREPROCESSOR_DEFINES UMPIRE CHAI CUDA + HIP TOTALVIEW_OUTPUT - CALIPER ) + CALIPER + MAGMA ) set( USE_CONFIGFILE ON CACHE BOOL "" ) foreach( DEP in ${PREPROCESSOR_DEFINES}) diff --git a/cmake/SetupTPL.cmake b/cmake/SetupTPL.cmake index bff94834..c40d0582 100644 --- a/cmake/SetupTPL.cmake +++ b/cmake/SetupTPL.cmake @@ -1,19 +1,79 @@ +macro(find_and_register) + set(singleValueArgs NAME HEADER) + set(multiValueArgs INCLUDE_DIRECTORIES + LIBRARY_DIRECTORIES + LIBRARIES + EXTRA_LIBRARIES + DEPENDS ) + + ## parse the arguments + cmake_parse_arguments(arg + "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT DEFINED arg_NAME) + message(FATAL_ERROR "The find_and_register required parameter NAME specifies the name of the library to register.") + endif() + + if(NOT DEFINED arg_INCLUDE_DIRECTORIES) + message(FATAL_ERROR "The find_and_register required parameter INCLUDE_DIRECTORIES specifies the directories to search for the given header.") + endif() + + if(NOT DEFINED arg_LIBRARY_DIRECTORIES) + message(FATAL_ERROR "The find_and_register required parameter LIBRARY_DIRECTORIES specifies the directories to search for the given libraries.") + endif() + + if(NOT DEFINED arg_HEADER) + message(FATAL_ERROR "The find_and_register required parameter HEADER specifies the header to search for.") + endif() + + if(NOT DEFINED arg_LIBRARIES) + message(FATAL_ERROR "The find_and_register required parameter LIBRARIES specifies the libraries to search for.") + endif() + + find_path(${arg_NAME}_INCLUDE_DIR ${arg_HEADER} + PATHS ${arg_INCLUDE_DIRECTORIES} + NO_DEFAULT_PATH + NO_CMAKE_ENVIRONMENT_PATH + 
NO_CMAKE_PATH + NO_SYSTEM_ENVIRONMENT_PATH + NO_CMAKE_SYSTEM_PATH) + + if(${arg_NAME}_INCLUDE_DIR STREQUAL ${arg_NAME}_INCLUDE_DIR-NOTFOUND) + message(FATAL_ERROR "Could not find '${arg_HEADER}' in '${arg_INCLUDE_DIRECTORIES}'") + endif() + + blt_find_libraries(FOUND_LIBS ${arg_NAME}_LIBRARIES + NAMES ${arg_LIBRARIES} + PATHS ${arg_LIBRARY_DIRECTORIES} + REQUIRED ON) + + blt_import_library(NAME ${arg_NAME} + INCLUDES ${${arg_NAME}_INCLUDE_DIR} + LIBRARIES ${${arg_NAME}_LIBRARIES} ${arg_EXTRA_LIBRARIES} + TREAT_INCLUDES_AS_SYSTEM ON + DEPENDS_ON ${arg_DEPENDS}) + +endmacro(find_and_register) + set(thirdPartyLibs "") -################################ +############################### # CAMP -################################ -if(NOT EXISTS ${CAMP_DIR}) - message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.") -endif() +############################### +if(CAMP_DIR STREQUAL RAJA_DIR) + message(STATUS "LvArray using CAMP from RAJA.") +else() + if(NOT EXISTS ${CAMP_DIR}) + message(FATAL_ERROR "CAMP_DIR must be defined and point to a valid directory when using CAMP.") + endif() -message(STATUS "Using CAMP from ${CAMP_DIR}") + message(STATUS "LvArray using CAMP from ${CAMP_DIR}") -find_package(camp REQUIRED PATHS ${CAMP_DIR}) + find_package(camp REQUIRED PATHS ${CAMP_DIR}) -set(ENABLE_CAMP ON CACHE BOOL "") + set(thirdPartyLibs ${thirdPartyLibs} camp) +endif() -set(thirdPartyLibs ${thirdPartyLibs} camp) ################################ # RAJA @@ -22,7 +82,7 @@ if(NOT EXISTS ${RAJA_DIR}) message(FATAL_ERROR "RAJA_DIR must be defined and point to a valid directory when using RAJA.") endif() -message(STATUS "Using RAJA from ${RAJA_DIR}") +message(STATUS "LvArray using RAJA from ${RAJA_DIR}") find_package(RAJA REQUIRED PATHS ${RAJA_DIR}) @@ -39,20 +99,26 @@ if(ENABLE_UMPIRE) message(FATAL_ERROR "UMPIRE_DIR must be defined and point to a valid directory when using Umpire.") endif() - message(STATUS "Using Umpire from ${UMPIRE_DIR}") + message(STATUS "LvArray using Umpire from ${UMPIRE_DIR}") find_package(umpire REQUIRED PATHS ${UMPIRE_DIR}) set(thirdPartyLibs ${thirdPartyLibs} umpire) else() - message(STATUS "Not using Umpire.") + message(STATUS "LvArray not using Umpire.") endif() ################################ # CHAI ################################ if(ENABLE_CHAI) + if(NOT EXISTS ${CHAI_DIR}) + message(FATAL_ERROR "CHAI_DIR must be defined and point to a valid directory when using CHAI.") + endif() + + message(STATUS "Using CHAI from ${CHAI_DIR}") + if(NOT ENABLE_UMPIRE) message(FATAL_ERROR "Umpire must be enabled to use CHAI.") endif() @@ -65,32 +131,32 @@ if(ENABLE_CHAI) message(FATAL_ERROR "CHAI_DIR must be defined and point to a valid directory when using CHAI.") endif() - message(STATUS "Using CHAI from ${CHAI_DIR}") + message(STATUS "LvArray using CHAI from ${CHAI_DIR}") find_package(chai REQUIRED PATHS ${CHAI_DIR}) - - # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that. - get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES) - list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA) - set_target_properties(chai - PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}") + + # # If this isn't done chai will add -lRAJA to the link line, but we don't link to RAJA like that. 
+ # get_target_property(CHAI_LINK_LIBRARIES chai INTERFACE_LINK_LIBRARIES) + # list(REMOVE_ITEM CHAI_LINK_LIBRARIES RAJA) + # set_target_properties(chai + # PROPERTIES INTERFACE_LINK_LIBRARIES "${CHAI_LINK_LIBRARIES}") set(thirdPartyLibs ${thirdPartyLibs} chai) else() - message(STATUS "Not using CHAI.") + message(STATUS "LvArray not using CHAI.") endif() -################################ +############################### # CALIPER -################################ +############################### if(ENABLE_CALIPER) if(NOT EXISTS ${CALIPER_DIR}) message(FATAL_ERROR "CALIPER_DIR must be defined and point to a valid directory when using caliper.") endif() - message(STATUS "Using caliper from ${CALIPER_DIR}") + message(STATUS "LvArray using caliper from ${CALIPER_DIR}") find_package(caliper REQUIRED PATHS ${CALIPER_DIR}) @@ -102,22 +168,66 @@ if(ENABLE_CALIPER) set(thirdPartyLibs ${thirdPartyLibs} caliper) else() - message(STATUS "Not using caliper.") + message(STATUS "LvArray not using caliper.") endif() ################################ # Python ################################ -if ( ENABLE_PYLVARRAY ) - message( STATUS "Python3_EXECUTABLE=${Python3_EXECUTABLE}" ) - find_package( Python3 REQUIRED - COMPONENTS Development NumPy ) +if(ENABLE_PYLVARRAY) + message(STATUS "Python3_EXECUTABLE=${Python3_EXECUTABLE}") + find_package(Python3 REQUIRED + COMPONENTS Development NumPy) - message( STATUS "Python3_INCLUDE_DIRS = ${Python3_INCLUDE_DIRS}" ) - message( STATUS "Python3_LIBRARY_DIRS = ${Python3_LIBRARY_DIRS}" ) - message( STATUS "Python3_NumPy_INCLUDE_DIRS = ${Python3_NumPy_INCLUDE_DIRS}" ) + message(STATUS "Python3_INCLUDE_DIRS = ${Python3_INCLUDE_DIRS}") + message(STATUS "Python3_LIBRARY_DIRS = ${Python3_LIBRARY_DIRS}") + message(STATUS "Python3_NumPy_INCLUDE_DIRS = ${Python3_NumPy_INCLUDE_DIRS}") - set( thirdPartyLibs ${thirdPartyLibs} Python3::Python Python3::NumPy ) + set(thirdPartyLibs ${thirdPartyLibs} Python3::Python Python3::NumPy) +else() + message(STATUS "Not building pylvarray") +endif() + +################################ +# LAPACK/BLAS +################################ +if(ENABLE_LAPACK) + message(STATUS "BLAS_LIBRARIES = ${BLAS_LIBRARIES}") + message(STATUS "LAPACK_LIBRARIES = ${LAPACK_LIBRARIES}") + + blt_import_library(NAME blas + TREAT_INCLUDES_AS_SYSTEM ON + LIBRARIES ${BLAS_LIBRARIES}) + + blt_import_library(NAME lapack + DEPENDS_ON blas + TREAT_INCLUDES_AS_SYSTEM ON + LIBRARIES ${LAPACK_LIBRARIES}) + + set(thirdPartyLibs ${thirdPartyLibs} blas lapack) +else() + message(STATUS "Not using LAPACK or BLAS.") +endif() + +################################ +# MAGMA +################################ +if(ENABLE_MAGMA) + message(STATUS "Using MAGMA from ${MAGMA_DIR}") + + if(NOT ENABLE_LAPACK) + message(FATAL_ERROR "LAPACK must be enabled to use MAGMA.") + endif() + + find_and_register(NAME magma + INCLUDE_DIRECTORIES ${MAGMA_DIR}/include + LIBRARY_DIRECTORIES ${MAGMA_DIR}/lib + HEADER magma.h + LIBRARIES magma) + + set(thirdPartyLibs ${thirdPartyLibs} magma) +else() + message(STATUS "Not using MAGMA.") endif() set( thirdPartyLibs ${thirdPartyLibs} CACHE STRING "" ) diff --git a/cmake/blt b/cmake/blt index c253509a..ddd5a0ca 160000 --- a/cmake/blt +++ b/cmake/blt @@ -1 +1 @@ -Subproject commit c253509ab2daf759eb857958597f6f34ab8c1713 +Subproject commit ddd5a0ca7c566d0ae14270b66625c8a363630ddb diff --git a/host-configs/LLNL/lassen-base.cmake b/host-configs/LLNL/lassen-base.cmake index 5a443bb9..3a60a7f3 100644 --- a/host-configs/LLNL/lassen-base.cmake +++ 
b/host-configs/LLNL/lassen-base.cmake @@ -21,14 +21,14 @@ set(ENABLE_CUDA ON CACHE BOOL "") set(CUDA_TOOLKIT_ROOT_DIR /usr/tce/packages/cuda/cuda-10.1.243 CACHE STRING "") set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING "") set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc CACHE STRING "") -set(CUDA_ARCH sm_70 CACHE STRING "") +set(CUDA_ARCHITECTURES sm_70 CACHE STRING "") set(CMAKE_CUDA_STANDARD 14 CACHE STRING "") -set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCH} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") +set(CMAKE_CUDA_FLAGS "-restrict -arch ${CUDA_ARCHITECTURES} --expt-extended-lambda -Werror cross-execution-space-call,reorder,deprecated-declarations" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -Xcompiler -DNDEBUG -Xcompiler -O3 -Xcompiler -mcpu=powerpc64le -Xcompiler -mtune=powerpc64le" CACHE STRING "") set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo ${CMAKE_CUDA_FLAGS_RELEASE}" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0 -Xcompiler -O0" CACHE STRING "") -set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCH}" CACHE STRING "" FORCE) +set(CHAI_CUDA_FLAGS "-arch ${CUDA_ARCHITECTURES}" CACHE STRING "" FORCE) # Uncomment this line to make nvcc output register usage for each kernel. # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --resource-usage" CACHE STRING "" FORCE) diff --git a/host-configs/ascent-gcc@8.1.1.cmake b/host-configs/ORNL/ascent-gcc@8.1.1.cmake similarity index 100% rename from host-configs/ascent-gcc@8.1.1.cmake rename to host-configs/ORNL/ascent-gcc@8.1.1.cmake diff --git a/host-configs/ORNL/crusher-base.cmake b/host-configs/ORNL/crusher-base.cmake new file mode 100644 index 00000000..53f647fa --- /dev/null +++ b/host-configs/ORNL/crusher-base.cmake @@ -0,0 +1,25 @@ + +set(CMAKE_CXX_STANDARD 14 CACHE STRING "") + +set( ENABLE_MPI ON CACHE BOOL "" FORCE ) +set( ENABLE_FIND_MPI ON CACHE BOOL "" FORCE ) + +# HIP Options +set( ENABLE_HIP ON CACHE BOOL "" FORCE ) + +# suppress -Werror for now +set( ENABLE_WARNINGS_AS_ERRORS FALSE CACHE BOOL "" FORCE ) + +# GTEST +set(ENABLE_GTEST_DEATH_TESTS OFF CACHE BOOL "") +set(gtest_disable_pthreads ON CACHE BOOL "") + +# disable most binaries and doc generation +set(ENABLE_TESTS OFF CACHE BOOL "" FORCE) +set(DISABLE_UNIT_TESTS ON CACHE BOOL "" FORCE) +set(ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ENABLE_BENCHMARKS OFF CACHE BOOL "" FORCE) +set(ENABLE_DOCS OFF CACHE BOOL "" FORCE) + +# BLT trying to find MPI fails on cray with cce +set(ENABLE_FIND_MPI FALSE CACHE BOOL "") diff --git a/host-configs/ORNL/crusher-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cce@13.0.1.cmake new file mode 100644 index 00000000..a10fda43 --- /dev/null +++ b/host-configs/ORNL/crusher-cce@13.0.1.cmake @@ -0,0 +1,39 @@ + +set(CONFIG_NAME "crusher-cce@13.0.1" CACHE PATH "") +include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake ) + +# Set up the tpls +set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "") +set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "") +set(GEOSX_TPL_DIR2 "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-13.0.1" CACHE PATH "") + +set(CAMP_DIR "${GEOSX_TPL_DIR2}/camp-0.2.2-oogry5gz2fts7jufeykxzmowajtmgzi3" CACHE PATH "" ) + +set(RAJA_DIR "${GEOSX_TPL_DIR2}/raja-2022.03.0-ex5v5y6jtotfxxvwcs7bblwvy4ktjykq" CACHE PATH "" ) + +set(ENABLE_UMPIRE TRUE CACHE BOOL "" ) +set(UMPIRE_DIR "${GEOSX_TPL_DIR2}/umpire-develop-jqqth57w2ets75sljw7lc5uxoi5wwi3c" CACHE PATH "" ) + +set(ENABLE_CHAI TRUE 
CACHE BOOL "" )
+set(CHAI_DIR "${GEOSX_TPL_DIR2}/chai-2022.03.0-w7lka3bkp36mbk5kzucgtp3eowomllgl" CACHE PATH "" )
+
+set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" )
+set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" )
+
+# C++ options
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.13/bin/cc" CACHE PATH "")
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.13/bin/CC" CACHE PATH "")
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.13/bin/ftn" CACHE PATH "")
+
+
+if( ENABLE_HIP )
+  set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation
+
+  set( HIP_VERSION_STRING "4.5.2" CACHE STRING "" )
+  set( HIP_ROOT "/opt/rocm-4.5.2" CACHE PATH "" )
+
+  set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE )
+  set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE )
+endif()
diff --git a/host-configs/ORNL/crusher-cce@14.0.0.cmake b/host-configs/ORNL/crusher-cce@14.0.0.cmake
new file mode 100644
index 00000000..967be640
--- /dev/null
+++ b/host-configs/ORNL/crusher-cce@14.0.0.cmake
@@ -0,0 +1,43 @@
+
+set(CONFIG_NAME "crusher-cce@14.0.0" CACHE PATH "")
+include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake )
+
+# Set up the tpls
+set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "")
+set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.0" CACHE PATH "")
+
+set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-r3wj7xcb7xycpqhda7rf5b2lekrsqx4w" CACHE PATH "" )
+
+set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.0-33qhciagztrs7zbnyk3uufxwjllerqer" CACHE PATH "" )
+
+set(ENABLE_UMPIRE TRUE CACHE BOOL "" )
+set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.0-72z2rwrcmmojgau6wgfmcckrnzbvi4aa" CACHE PATH "" )
+
+set(ENABLE_CHAI TRUE CACHE BOOL "" )
+set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-da6y22vj4iy3uru2bne7df7tpngckt3n" CACHE PATH "" )
+
+set(METIS_DIR "${SYSTEM_TPL_DIR}/metis-5.1.0-q4xwj6aya5ooq4owhrhh2qllanjmyuew/" CACHE PATH "" )
+set(PARMETIS_DIR "${SYSTEM_TPL_DIR}/parmetis-4.0.3-fyed2lqdzmulw6d4cl3oxt6r6jaqdtwg/" CACHE PATH "" )
+
+# C++ options
+set(CRAYPE_VERSION "2.7.15")
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "")
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "")
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "")
+
+if( ENABLE_HIP )
+  set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation
+
+  set( HIP_VERSION_STRING "5.1.0" CACHE STRING "" )
+  set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" )
+
+  set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE )
+  set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE )
+endif()
+
+
+
+
+
diff --git a/host-configs/ORNL/crusher-cce@14.0.1.cmake b/host-configs/ORNL/crusher-cce@14.0.1.cmake
new file mode 100644
index 00000000..15c54516
--- /dev/null
+++ b/host-configs/ORNL/crusher-cce@14.0.1.cmake
@@ -0,0 +1,38 @@
+
+set(CONFIG_NAME "crusher-cce@14.0.1" CACHE PATH "")
+include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake )
+
+# Set up the tpls
+set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "")
+set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.0" CACHE PATH "")
+
+set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-r3wj7xcb7xycpqhda7rf5b2lekrsqx4w" CACHE PATH "" )
+
+set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.0-33qhciagztrs7zbnyk3uufxwjllerqer" CACHE PATH "" )
+
+set(ENABLE_UMPIRE TRUE CACHE BOOL "" )
+set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.0-72z2rwrcmmojgau6wgfmcckrnzbvi4aa" CACHE PATH "" )
+
+set(ENABLE_CHAI TRUE CACHE BOOL "" )
+set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-da6y22vj4iy3uru2bne7df7tpngckt3n" CACHE PATH "" )
+
+set(METIS_DIR "${SYSTEM_TPL_DIR}/metis-5.1.0-q4xwj6aya5ooq4owhrhh2qllanjmyuew/" CACHE PATH "" )
+set(PARMETIS_DIR "${SYSTEM_TPL_DIR}/parmetis-4.0.3-fyed2lqdzmulw6d4cl3oxt6r6jaqdtwg/" CACHE PATH "" )
+
+# C++ options
+set(CRAYPE_VERSION "2.7.15")
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "")
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "")
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "")
+
+if( ENABLE_HIP )
+  set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation
+
+  set( HIP_VERSION_STRING "5.1.0" CACHE STRING "" )
+  set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" )
+
+  set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE )
+  set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE )
+endif()
diff --git a/host-configs/ORNL/crusher-cce@14.0.2.cmake b/host-configs/ORNL/crusher-cce@14.0.2.cmake
new file mode 100644
index 00000000..d0e29023
--- /dev/null
+++ b/host-configs/ORNL/crusher-cce@14.0.2.cmake
@@ -0,0 +1,38 @@
+
+set(CONFIG_NAME "crusher-cce@14.0.2" CACHE PATH "")
+include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake )
+
+# Set up the tpls
+set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.2" CACHE PATH "")
+set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.2" CACHE PATH "")
+
+set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-ksdglvlmamju7gphtyzdavitriemedla" CACHE PATH "" )
+
+set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.1-jxxcauxbzee6nqjmyjz45t5h4f7tv34r" CACHE PATH "" )
+
+set(ENABLE_UMPIRE TRUE CACHE BOOL "" )
+set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.1-vgvqpvi3cwdmvy6cu76sqoghnvprzlwu" CACHE PATH "" )
+
+set(ENABLE_CHAI TRUE CACHE BOOL "" )
+set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-7axkiea7q3hzgojswiz7qdbd2yq6bvsf" CACHE PATH "" )
+
+set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-jptrwzs7vdbckndjg5qg4jwckfmgexmw/" CACHE PATH "" )
+set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-p2msdgsmomufcnwhnow5bbazg7463caf/" CACHE PATH "" )
+
+# C++ options
+set(CRAYPE_VERSION "2.7.17")
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "")
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "")
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "")
+
+if( ENABLE_HIP )
+  set( ENABLE_CLANG_HIP ON CACHE BOOL "" FORCE ) # don't invoke hipcc, rely on cce link-time compilation
+
+  set( HIP_VERSION_STRING "5.2.0" CACHE STRING "" )
+  set( HIP_ROOT "/opt/rocm-${HIP_VERSION_STRING}" CACHE PATH "" )
+
+  set( CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "" FORCE )
+  set( AMDGPU_TARGETS "${CMAKE_HIP_ARCHITECTURES}" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_FLAGS "-fgpu-rdc" CACHE STRING "" FORCE )
+  set( CMAKE_CXX_LINK_FLAGS "-fgpu-rdc --hip-link" CACHE STRING "" FORCE )
+endif()
diff --git a/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake
new file mode 100644
index 00000000..b9d64b28
--- /dev/null
+++ b/host-configs/ORNL/crusher-cpu-cce@13.0.1.cmake
@@ -0,0 +1,29 @@
+
+set(CONFIG_NAME "crusher-cpu-cce@13.0.1" CACHE PATH "")
+include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake )
+
+# Set up the tpls
+set(GEOSX_TPL_ROOT_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen2/cce-13.0.1" CACHE PATH "")
+set(GEOSX_TPL_DIR ${GEOSX_TPL_ROOT_DIR} CACHE PATH "")
+set(GEOSX_TPL_DIR2 "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-13.0.1" CACHE PATH "")
+
+set(CAMP_DIR "${GEOSX_TPL_DIR2}/camp-0.2.2-mej6trivmy7o5vlr6a52cml6tzxb5fvk" CACHE PATH "" )
+
+set(RAJA_DIR "${GEOSX_TPL_DIR2}/raja-2022.03.0-tmukf35ms7f2pkfswpejbnt3jtnpkakc" CACHE PATH "" )
+
+set(ENABLE_UMPIRE TRUE CACHE BOOL "" )
+set(UMPIRE_DIR "${GEOSX_TPL_DIR2}/umpire-2022.03.0-unirfq5er4vtyr2koymgi3xxq6h2f5l5" CACHE PATH "" )
+
+set(ENABLE_CHAI TRUE CACHE BOOL "" )
+set(CHAI_DIR "${GEOSX_TPL_DIR2}/chai-2022.03.0-aggyh463v2rz6s44laqshylc4xeeg4h7" CACHE PATH "" )
+
+set(METIS_DIR "${GEOSX_TPL_DIR}/metis-5.1.0-zcfkawg5ifqpzcihrc3i6cdrrijusc2p/" CACHE PATH "" )
+set(PARMETIS_DIR "${GEOSX_TPL_DIR}/parmetis-4.0.3-t2amifl5hh7yewre24gn2x3mlrz7qkl5/" CACHE PATH "" )
+
+# C++ options
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.13/bin/cc" CACHE PATH "")
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.13/bin/CC" CACHE PATH "")
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.13/bin/ftn" CACHE PATH "")
+
+# HIP Options
+set( ENABLE_HIP OFF CACHE BOOL "" FORCE )
diff --git a/host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake b/host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake
new file mode 100644
index 00000000..d25d6b2e
--- /dev/null
+++ b/host-configs/ORNL/crusher-cpu-cce@14.0.1.cmake
@@ -0,0 +1,29 @@
+
+set(CONFIG_NAME "crusher-cpu-cce@14.0.1" CACHE PATH "")
+include( ${CMAKE_CURRENT_LIST_DIR}/crusher-base.cmake )
+
+# Set up the tpls
+set(SYSTEM_TPL_DIR "/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/cce-14.0.0" CACHE PATH "")
+set(GEOSX_TPL_DIR "/gpfs/alpine/geo127/world-shared/cray-sles15-zen3/cce-14.0.0" CACHE PATH "")
+
+set(CAMP_DIR "${GEOSX_TPL_DIR}/camp-0.2.2-r3wj7xcb7xycpqhda7rf5b2lekrsqx4w" CACHE PATH "" )
+
+set(RAJA_DIR "${GEOSX_TPL_DIR}/raja-2022.03.0-33qhciagztrs7zbnyk3uufxwjllerqer" CACHE PATH "" )
+
+set(ENABLE_UMPIRE TRUE CACHE BOOL "" )
+set(UMPIRE_DIR "${GEOSX_TPL_DIR}/umpire-2022.03.0-72z2rwrcmmojgau6wgfmcckrnzbvi4aa" CACHE PATH "" )
+
+set(ENABLE_CHAI TRUE CACHE BOOL "" )
+set(CHAI_DIR "${GEOSX_TPL_DIR}/chai-2022.03.0-da6y22vj4iy3uru2bne7df7tpngckt3n" CACHE PATH "" )
+
+set(METIS_DIR "${SYSTEM_TPL_DIR}/metis-5.1.0-q4xwj6aya5ooq4owhrhh2qllanjmyuew/" CACHE PATH "" )
+set(PARMETIS_DIR "${SYSTEM_TPL_DIR}/parmetis-4.0.3-fyed2lqdzmulw6d4cl3oxt6r6jaqdtwg/" CACHE PATH "" )
+
+# C++ options
+set(CRAYPE_VERSION "2.7.15")
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/cc" CACHE PATH "")
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/CC" CACHE PATH "")
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/${CRAYPE_VERSION}/bin/ftn" CACHE PATH "")
+
+# HIP Options
+set( ENABLE_HIP OFF CACHE BOOL "" FORCE )
diff --git a/new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake b/new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake
new file mode 100644
index 00000000..90ac014b
--- /dev/null
+++ b/new-configs/quartz-toss_3_x86_64_ib-clang@10.0.1.cmake
@@ -0,0 +1,89 @@
+#################################################################################
+# Generated host-config - Edit at own risk!
+#################################################################################
+#--------------------------------------------------------------------------------
+# SYS_TYPE: toss_3_x86_64_ib
+# Compiler Spec: clang@10.0.1
+# CMake executable path: /usr/tce/packages/cmake/cmake-3.14.5/bin/cmake
+#--------------------------------------------------------------------------------
+
+set(BLT_SOURCE_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/blt-0.5.2-6nztad6saell6ikor6wtxp6qycxtfwh4" CACHE PATH "")
+
+#--------------------------------------------------------------------------------
+# Compilers
+#--------------------------------------------------------------------------------
+
+set(CMAKE_C_COMPILER "/usr/tce/bin/clang-10.0.1" CACHE PATH "")
+
+set(CMAKE_CXX_COMPILER "/usr/tce/bin/clang++-10.0.1" CACHE PATH "")
+
+set(CMAKE_C_FLAGS "-march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0" CACHE PATH "")
+
+set(CMAKE_CXX_FLAGS "-march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0" CACHE PATH "")
+
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "")
+
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG" CACHE STRING "")
+
+set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "")
+
+set(ENABLE_CUDA OFF CACHE BOOL "")
+
+#--------------------------------------------------------------------------------
+# CAMP
+#--------------------------------------------------------------------------------
+
+set(CAMP_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/camp-2022.03.2-2q75xbq2h4ykcyvasoqg55torawlabkw" CACHE PATH "")
+
+#--------------------------------------------------------------------------------
+# RAJA
+#--------------------------------------------------------------------------------
+
+set(RAJA_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/raja-2022.03.0-jkp4hp7ifyxkxzkbho5ngdnk4x3opaoy" CACHE PATH "")
+
+#--------------------------------------------------------------------------------
+# Umpire
+#--------------------------------------------------------------------------------
+
+set(ENABLE_UMPIRE ON CACHE BOOL "")
+
+set(UMPIRE_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/umpire-2022.03.1-aerit7injc3hmn2ripnsxtnlwxicjmuu" CACHE PATH "")
+
+#--------------------------------------------------------------------------------
+# CHAI
+#--------------------------------------------------------------------------------
+
+set(ENABLE_CHAI ON CACHE BOOL "")
+
+set(CHAI_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/chai-2022.03.0-s6w2gsrreu7krgzboekmlukmfestpg7k" CACHE PATH "")
+
+#--------------------------------------------------------------------------------
+# Caliper
+#--------------------------------------------------------------------------------
+
+set(ENABLE_CALIPER ON CACHE BOOL "") + +set(CALIPER_DIR "/usr/WS2/corbett5/LvArray/uberenv-libs/linux-rhel7-broadwell/clang-10.0.1/caliper-2.8.0-3fwkrbu4bhnc4bqvhrqcydrzxslq6ryz" CACHE PATH "") + +#-------------------------------------------------------------------------------- +# Python +#-------------------------------------------------------------------------------- + +set(ENABLE_PYLVARRAY OFF CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# Documentation +#-------------------------------------------------------------------------------- + +set(ENABLE_DOCS OFF CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# addr2line +#-------------------------------------------------------------------------------- + +set(ENABLE_ADDR2LINE ON CACHE BOOL "") + +#-------------------------------------------------------------------------------- +# Other +#-------------------------------------------------------------------------------- + diff --git a/scripts/uberenv/packages/lvarray/package.py b/scripts/uberenv/packages/lvarray/package.py index 9c4b47d9..df299f3e 100644 --- a/scripts/uberenv/packages/lvarray/package.py +++ b/scripts/uberenv/packages/lvarray/package.py @@ -18,6 +18,12 @@ def cmake_cache_entry(name, value, comment=""): return 'set(%s "%s" CACHE PATH "%s")\n\n' % (name, value, comment) +def cmake_cache_list(name, value, comment=""): + """Generate a list for a cmake cache variable""" + + indent = 5 + len(name) + join_str = '\n' + ' ' * indent + return 'set(%s %s CACHE STRING "%s")\n\n' % (name, join_str.join(value), comment) def cmake_cache_string(name, string, comment=""): """Generate a string for a cmake cache variable""" @@ -50,6 +56,8 @@ class Lvarray(CMakePackage, CudaPackage): variant('chai', default=False, description='Build Chai support') variant('caliper', default=False, description='Build Caliper support') variant('pylvarray', default=False, description='Build Python support') + variant('lapack', default=False, description='Build LAPACK and BLAS support') + variant('magma', default=False, description='Build MAGMA support') variant('tests', default=True, description='Build tests') variant('benchmarks', default=False, description='Build benchmarks') variant('examples', default=False, description='Build examples') @@ -57,31 +65,52 @@ class Lvarray(CMakePackage, CudaPackage): variant('addr2line', default=True, description='Build support for addr2line.') - depends_on('blt', when='@0.2.0:', type='build') + variant('tpl_build_type', default='none', description='TPL build type', + values=('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel', 'none')) + + conflicts('~lapack', when='+magma') + + depends_on('blt@0.4.1:', when='@0.2.0:', type='build') depends_on('camp') - depends_on('camp+cuda', when='+cuda') depends_on('raja') - depends_on('raja+cuda', when='+cuda') - # At the moment Umpire doesn't support shared when building with CUDA. 
depends_on('umpire', when='+umpire') - depends_on('umpire+cuda~shared', when='+umpire+cuda') depends_on('chai+raja', when='+chai') - depends_on('chai+raja+cuda', when='+chai+cuda') depends_on('caliper', when='+caliper') depends_on('python +shared +pic', when='+pylvarray') - depends_on('py-numpy@1.19: +blas +lapack +force-parallel-build', when='+pylvarray') - depends_on('py-scipy@1.5.2: +force-parallel-build', when='+pylvarray') + depends_on('py-numpy@1.19: +blas +lapack', when='+pylvarray') + depends_on('py-scipy@1.5.2:', when='+pylvarray') depends_on('py-pip', when='+pylvarray') + depends_on('blas', when='+lapack') + depends_on('lapack', when='+lapack') + depends_on('magma', when='+magma') + depends_on('doxygen@1.8.13:', when='+docs', type='build') depends_on('py-sphinx@1.6.3:', when='+docs', type='build') + with when('+cuda'): + for sm_ in CudaPackage.cuda_arch_values: + depends_on('camp +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('raja +cuda cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('umpire +cuda ~shared cuda_arch={0}'.format(sm_), when='cuda_arch={0}'.format(sm_)) + depends_on('chai +cuda cuda_arch={0}'.format(sm_), when='+chai cuda_arch={0}'.format(sm_)) + depends_on('caliper +cuda cuda_arch={0}'.format(sm_), when='+caliper cuda_arch={0}'.format(sm_)) + + for bt in ('Debug', 'Release', 'RelWithDebInfo', 'MinSizeRel'): + with when('tpl_build_type={}'.format(bt)): + depends_on('camp build_type={}'.format(bt)) + depends_on('raja build_type={}'.format(bt)) + depends_on('umpire build_type={}'.format(bt)) + depends_on('chai build_type={}'.format(bt), when='+chai') + depends_on('caliper build_type={}'.format(bt), when='+caliper') + depends_on('magma build_type={}'.format(bt), when='+magma') + phases = ['hostconfig', 'cmake', 'build', 'install'] @run_after('build') @@ -158,178 +187,195 @@ def hostconfig(self, spec, prefix, py_site_pkgs_dir=None): cmake_exe = os.path.realpath(cmake_exe) host_config_path = self._get_host_config_path(spec) - cfg = open(host_config_path, "w") - cfg.write("#{0}\n".format("#" * 80)) - cfg.write("# Generated host-config - Edit at own risk!\n") - cfg.write("#{0}\n".format("#" * 80)) - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# SYS_TYPE: {0}\n".format(sys_type)) - cfg.write("# Compiler Spec: {0}\n".format(spec.compiler)) - cfg.write("# CMake executable path: %s\n" % cmake_exe) - cfg.write("#{0}\n\n".format("-" * 80)) - - if 'blt' in spec: - cfg.write(cmake_cache_entry('BLT_SOURCE_DIR', spec['blt'].prefix)) - - ####################### - # Compiler Settings - ####################### - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Compilers\n") - cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler)) - cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler)) - - # use global spack compiler flags - cflags = ' '.join(spec.compiler_flags['cflags']) - cxxflags = ' '.join(spec.compiler_flags['cxxflags']) - - if "%intel" in spec: - cflags += ' -qoverride-limits' - cxxflags += ' -qoverride-limits' + with open(host_config_path, "w") as cfg: + cfg.write("#{0}\n".format("#" * 80)) + cfg.write("# Generated host-config - Edit at own risk!\n") + cfg.write("#{0}\n".format("#" * 80)) - if cflags: - cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags)) - - if cxxflags: - cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) - - release_flags = "-O3 -DNDEBUG" - cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELEASE", - release_flags)) - 
reldebinf_flags = "-O3 -g -DNDEBUG" - cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELWITHDEBINFO", - reldebinf_flags)) - debug_flags = "-O0 -g" - cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_DEBUG", debug_flags)) - - if "%clang arch=linux-rhel7-ppc64le" in spec: - cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", "-Wl,--no-toc-optimize")) - - if "+cuda" in spec: cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Cuda\n") + cfg.write("# SYS_TYPE: {0}\n".format(sys_type)) + cfg.write("# Compiler Spec: {0}\n".format(spec.compiler)) + cfg.write("# Spec: {0}\n".format(spec)) + cfg.write("# CMake executable path: %s\n" % cmake_exe) cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option("ENABLE_CUDA", True)) - cfg.write(cmake_cache_entry("CMAKE_CUDA_STANDARD", 14)) - - cudatoolkitdir = spec['cuda'].prefix - cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR", - cudatoolkitdir)) - cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" - cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", cudacompiler)) - - cmake_cuda_flags = ('-restrict --expt-extended-lambda -Werror ' - 'cross-execution-space-call,reorder,' - 'deprecated-declarations') + if 'blt' in spec: + cfg.write(cmake_cache_entry('BLT_SOURCE_DIR', spec['blt'].prefix)) - archSpecifiers = ("-mtune", "-mcpu", "-march", "-qtune", "-qarch") - for archSpecifier in archSpecifiers: - for compilerArg in spec.compiler_flags['cxxflags']: - if compilerArg.startswith(archSpecifier): - cmake_cuda_flags += ' -Xcompiler ' + compilerArg - - if not spec.satisfies('cuda_arch=none'): - cuda_arch = spec.variants['cuda_arch'].value - cmake_cuda_flags += ' -arch sm_{0}'.format(cuda_arch[0]) - - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS", cmake_cuda_flags)) - - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", - "-O3 -Xcompiler -O3 -DNDEBUG")) - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", - "-O3 -g -lineinfo -Xcompiler -O3")) - cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", - "-O0 -Xcompiler -O0 -g -G")) - - else: - cfg.write(cmake_cache_option("ENABLE_CUDA", False)) + ####################### + # Compiler Settings + ####################### - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# CAMP\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Compilers\n") + cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler)) + cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler)) + + # use global spack compiler flags + cflags = ' '.join(spec.compiler_flags['cflags']) + cxxflags = ' '.join(spec.compiler_flags['cxxflags']) + + if "%intel" in spec: + cflags += ' -qoverride-limits' + cxxflags += ' -qoverride-limits' + + if cflags: + cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags)) + + if cxxflags: + cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags)) + + release_flags = "-O3 -DNDEBUG" + cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELEASE", + release_flags)) + reldebinf_flags = "-O3 -g -DNDEBUG" + cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_RELWITHDEBINFO", + reldebinf_flags)) + debug_flags = "-O0 -g" + cfg.write(cmake_cache_string("CMAKE_CXX_FLAGS_DEBUG", debug_flags)) + + if "%clang arch=linux-rhel7-ppc64le" in spec: + cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS", "-Wl,--no-toc-optimize")) + + if "+cuda" in spec: + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Cuda\n") + cfg.write("#{0}\n\n".format("-" * 80)) + + cfg.write(cmake_cache_option("ENABLE_CUDA", True)) + 
cfg.write(cmake_cache_entry("CMAKE_CUDA_STANDARD", 14)) + + cudatoolkitdir = spec['cuda'].prefix + cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR", + cudatoolkitdir)) + cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc" + cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER", cudacompiler)) + + cmake_cuda_flags = ('-restrict --expt-extended-lambda -Werror ' + 'cross-execution-space-call,reorder,' + 'deprecated-declarations') + + archSpecifiers = ("-mtune", "-mcpu", "-march", "-qtune", "-qarch") + for archSpecifier in archSpecifiers: + for compilerArg in spec.compiler_flags['cxxflags']: + if compilerArg.startswith(archSpecifier): + cmake_cuda_flags += ' -Xcompiler ' + compilerArg + + if not spec.satisfies('cuda_arch=none'): + cuda_arch = spec.variants['cuda_arch'].value + cmake_cuda_flags += ' -arch sm_{0}'.format(cuda_arch[0]) + + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS", cmake_cuda_flags)) + + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", + "-O3 -Xcompiler -O3 -DNDEBUG")) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", + "-O3 -g -lineinfo -Xcompiler -O3")) + cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", + "-O0 -Xcompiler -O0 -g -G")) + + else: + cfg.write(cmake_cache_option("ENABLE_CUDA", False)) - cfg.write(cmake_cache_entry("CAMP_DIR", spec['camp'].prefix)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# CAMP\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# RAJA\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_entry("CAMP_DIR", spec['camp'].prefix)) - cfg.write(cmake_cache_entry("RAJA_DIR", spec['raja'].prefix)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# RAJA\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Umpire\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_entry("RAJA_DIR", spec['raja'].prefix)) - if "+umpire" in spec: - cfg.write(cmake_cache_option("ENABLE_UMPIRE", True)) - cfg.write(cmake_cache_entry("UMPIRE_DIR", spec['umpire'].prefix)) - else: - cfg.write(cmake_cache_option("ENABLE_UMPIRE", False)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Umpire\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# CHAI\n") - cfg.write("#{0}\n\n".format("-" * 80)) + if "+umpire" in spec: + cfg.write(cmake_cache_option("ENABLE_UMPIRE", True)) + cfg.write(cmake_cache_entry("UMPIRE_DIR", spec['umpire'].prefix)) + else: + cfg.write(cmake_cache_option("ENABLE_UMPIRE", False)) - if "+chai" in spec: - cfg.write(cmake_cache_option("ENABLE_CHAI", True)) - cfg.write(cmake_cache_entry("CHAI_DIR", spec['chai'].prefix)) - else: - cfg.write(cmake_cache_option("ENABLE_CHAI", False)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# CHAI\n") + cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Caliper\n") - cfg.write("#{0}\n\n".format("-" * 80)) + if "+chai" in spec: + cfg.write(cmake_cache_option("ENABLE_CHAI", True)) + cfg.write(cmake_cache_entry("CHAI_DIR", spec['chai'].prefix)) + else: + cfg.write(cmake_cache_option("ENABLE_CHAI", False)) - if "+caliper" in spec: cfg.write("#{0}\n".format("-" * 80)) cfg.write("# Caliper\n") cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) - cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) - else: - cfg.write(cmake_cache_option("ENABLE_CALIPER", False)) - - cfg.write('#{0}\n'.format('-' * 80)) - cfg.write('# 
Python\n') - cfg.write('#{0}\n\n'.format('-' * 80)) - if '+pylvarray' in spec: - cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True)) - cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3'))) - else: - cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', False)) - - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Documentation\n") - cfg.write("#{0}\n\n".format("-" * 80)) - if "+docs" in spec: - cfg.write(cmake_cache_option("ENABLE_DOCS", True)) - sphinx_dir = spec['py-sphinx'].prefix - cfg.write(cmake_cache_string('SPHINX_EXECUTABLE', - os.path.join(sphinx_dir, - 'bin', - 'sphinx-build'))) - - doxygen_dir = spec['doxygen'].prefix - cfg.write(cmake_cache_string('DOXYGEN_EXECUTABLE', - os.path.join(doxygen_dir, - 'bin', - 'doxygen'))) - else: - cfg.write(cmake_cache_option("ENABLE_DOCS", False)) + if "+caliper" in spec: + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Caliper\n") + cfg.write("#{0}\n\n".format("-" * 80)) + + cfg.write(cmake_cache_option("ENABLE_CALIPER", True)) + cfg.write(cmake_cache_entry("CALIPER_DIR", spec['caliper'].prefix)) + else: + cfg.write(cmake_cache_option("ENABLE_CALIPER", False)) + + cfg.write('#{0}\n'.format('-' * 80)) + cfg.write('# Python\n') + cfg.write('#{0}\n\n'.format('-' * 80)) + if '+pylvarray' in spec: + cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', True)) + cfg.write(cmake_cache_entry('Python3_EXECUTABLE', os.path.join(spec['python'].prefix.bin, 'python3'))) + else: + cfg.write(cmake_cache_option('ENABLE_PYLVARRAY', False)) + + cfg.write('#{0}\n'.format('-' * 80)) + cfg.write('# Math libraries\n') + cfg.write('#{0}\n\n'.format('-' * 80)) + if '+lapack' in spec: + cfg.write(cmake_cache_option('ENABLE_LAPACK', True)) + cfg.write(cmake_cache_list('BLAS_LIBRARIES', spec['blas'].libs)) + cfg.write(cmake_cache_list('LAPACK_LIBRARIES', spec['lapack'].libs)) + else: + cfg.write(cmake_cache_option('ENABLE_LAPACK', False)) + + if '+magma' in spec: + cfg.write(cmake_cache_option('ENABLE_MAGMA', True)) + cfg.write(cmake_cache_entry('MAGMA_DIR', spec['magma'].prefix)) + else: + cfg.write(cmake_cache_option('ENABLE_MAGMA', False)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# addr2line\n") - cfg.write("#{0}\n\n".format("-" * 80)) - cfg.write(cmake_cache_option('ENABLE_ADDR2LINE', '+addr2line' in spec)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Documentation\n") + cfg.write("#{0}\n\n".format("-" * 80)) + if "+docs" in spec: + cfg.write(cmake_cache_option("ENABLE_DOCS", True)) + sphinx_dir = spec['py-sphinx'].prefix + cfg.write(cmake_cache_string('SPHINX_EXECUTABLE', + os.path.join(sphinx_dir, + 'bin', + 'sphinx-build'))) + + doxygen_dir = spec['doxygen'].prefix + cfg.write(cmake_cache_string('DOXYGEN_EXECUTABLE', + os.path.join(doxygen_dir, + 'bin', + 'doxygen'))) + else: + cfg.write(cmake_cache_option("ENABLE_DOCS", False)) - cfg.write("#{0}\n".format("-" * 80)) - cfg.write("# Other\n") - cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# addr2line\n") + cfg.write("#{0}\n\n".format("-" * 80)) + cfg.write(cmake_cache_option('ENABLE_ADDR2LINE', '+addr2line' in spec)) + + cfg.write("#{0}\n".format("-" * 80)) + cfg.write("# Other\n") + cfg.write("#{0}\n\n".format("-" * 80)) def cmake_args(self): spec = self.spec diff --git a/scripts/uberenv/packages/magma/cmake-W.patch b/scripts/uberenv/packages/magma/cmake-W.patch new file mode 100644 index 00000000..59179676 --- /dev/null +++ b/scripts/uberenv/packages/magma/cmake-W.patch @@ -0,0 +1,12 @@ +diff -ru 
magma-2.5.0-orig/CMakeLists.txt magma-2.5.0/CMakeLists.txt +--- magma-2.5.0-orig/CMakeLists.txt 2019-01-02 11:18:39.000000000 -0800 ++++ magma-2.5.0/CMakeLists.txt 2019-04-03 15:58:01.871234891 -0700 +@@ -363,8 +363,6 @@ + else() + # Primarily for gcc / nvcc: + # Ignore unused static functions in headers. +- set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unused-function" ) +- set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unused-function" ) + endif() + + if (CMAKE_HOST_APPLE) diff --git a/scripts/uberenv/packages/magma/ibm-xl.patch b/scripts/uberenv/packages/magma/ibm-xl.patch new file mode 100644 index 00000000..0deab656 --- /dev/null +++ b/scripts/uberenv/packages/magma/ibm-xl.patch @@ -0,0 +1,248 @@ +diff -Naur magma-2.2.0/src/dlaex3_m.cpp magma-2.2.0-patched/src/dlaex3_m.cpp +--- magma-2.2.0/src/dlaex3_m.cpp 2016-11-20 20:20:06.000000000 -0500 ++++ magma-2.2.0/src/dlaex3_m.cpp 2017-01-06 15:54:29.423668874 -0500 +@@ -197,7 +197,7 @@ + magmaDouble_ptr dwork[], + magma_queue_t queues[MagmaMaxGPUs][2], + magma_range_t range, double vl, double vu, magma_int_t il, magma_int_t iu, +- magma_int_t *info ) ++ magma_int_t *infom ) + { + #define Q(i_,j_) (Q + (i_) + (j_)*ldq) + +@@ -209,8 +209,8 @@ + magma_setdevice(0); + magma_dlaex3( k, n, n1, d, Q, ldq, rho, + dlamda, Q2, indx, ctot, w, s, indxq, +- *dwork, range, vl, vu, il, iu, info ); +- return *info; ++ *dwork, range, vl, vu, il, iu, infom ); ++ return *infom; + } + double d_one = 1.; + double d_zero = 0.; +@@ -229,37 +229,37 @@ + valeig = (range == MagmaRangeV); + indeig = (range == MagmaRangeI); + +- *info = 0; ++ *infom = 0; + + if (k < 0) +- *info=-1; ++ *infom=-1; + else if (n < k) +- *info=-2; ++ *infom=-2; + else if (ldq < max(1,n)) +- *info=-6; ++ *infom=-6; + else if (! (alleig || valeig || indeig)) +- *info = -15; ++ *infom = -15; + else { + if (valeig) { + if (n > 0 && vu <= vl) +- *info = -17; ++ *infom = -17; + } + else if (indeig) { + if (il < 1 || il > max(1,n)) +- *info = -18; ++ *infom = -18; + else if (iu < min(n,il) || iu > n) +- *info = -19; ++ *infom = -19; + } + } + +- if (*info != 0) { +- magma_xerbla( __func__, -(*info) ); +- return *info; ++ if (*infom != 0) { ++ magma_xerbla( __func__, -(*infom) ); ++ return *infom; + } + + // Quick return if possible + if (k == 0) +- return *info; ++ return *infom; + + magma_device_t orig_dev; + magma_getdevice( &orig_dev ); +@@ -360,15 +360,15 @@ + lapackf77_dlaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) { +- #pragma omp critical (info) +- *info = iinfo; ++ #pragma omp critical (infom) ++ *infom = iinfo; + break; + } + } + + #pragma omp barrier + +- if (*info == 0) { ++ if (*infom == 0) { + #pragma omp single + { + // Prepare the INDXQ sorting permutation. +@@ -452,8 +452,8 @@ + } + } + } // end omp parallel +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + timer_stop( time ); + timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); +@@ -474,10 +474,10 @@ + lapackf77_dlaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) +- *info=iinfo; ++ *infom=iinfo; + } +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + // Prepare the INDXQ sorting permutation. 
+ magma_int_t nk = n - k; +@@ -688,5 +688,5 @@ + + magma_setdevice( orig_dev ); + +- return *info; ++ return *infom; + } /* magma_dlaed3_m */ +diff -Naur magma-2.2.0/src/slaex3_m.cpp magma-2.2.0-patched/src/slaex3_m.cpp +--- magma-2.2.0/src/slaex3_m.cpp 2016-11-20 20:20:24.000000000 -0500 ++++ magma-2.2.0/src/slaex3_m.cpp 2017-01-06 10:20:13.200783151 -0500 +@@ -197,7 +197,7 @@ + magmaFloat_ptr dwork[], + magma_queue_t queues[MagmaMaxGPUs][2], + magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu, +- magma_int_t *info ) ++ magma_int_t *infom ) + { + #define Q(i_,j_) (Q + (i_) + (j_)*ldq) + +@@ -209,8 +209,8 @@ + magma_setdevice(0); + magma_slaex3( k, n, n1, d, Q, ldq, rho, + dlamda, Q2, indx, ctot, w, s, indxq, +- *dwork, range, vl, vu, il, iu, info ); +- return *info; ++ *dwork, range, vl, vu, il, iu, infom ); ++ return *infom; + } + float d_one = 1.; + float d_zero = 0.; +@@ -229,37 +229,37 @@ + valeig = (range == MagmaRangeV); + indeig = (range == MagmaRangeI); + +- *info = 0; ++ *infom = 0; + + if (k < 0) +- *info=-1; ++ *infom=-1; + else if (n < k) +- *info=-2; ++ *infom=-2; + else if (ldq < max(1,n)) +- *info=-6; ++ *infom=-6; + else if (! (alleig || valeig || indeig)) +- *info = -15; ++ *infom = -15; + else { + if (valeig) { + if (n > 0 && vu <= vl) +- *info = -17; ++ *infom = -17; + } + else if (indeig) { + if (il < 1 || il > max(1,n)) +- *info = -18; ++ *infom = -18; + else if (iu < min(n,il) || iu > n) +- *info = -19; ++ *infom = -19; + } + } + +- if (*info != 0) { +- magma_xerbla( __func__, -(*info) ); +- return *info; ++ if (*infom != 0) { ++ magma_xerbla( __func__, -(*infom) ); ++ return *infom; + } + + // Quick return if possible + if (k == 0) +- return *info; ++ return *infom; + + magma_device_t orig_dev; + magma_getdevice( &orig_dev ); +@@ -360,15 +360,15 @@ + lapackf77_slaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) { +- #pragma omp critical (info) +- *info = iinfo; ++ #pragma omp critical (infom) ++ *infom = iinfo; + break; + } + } + + #pragma omp barrier + +- if (*info == 0) { ++ if (*infom == 0) { + #pragma omp single + { + // Prepare the INDXQ sorting permutation. +@@ -452,8 +452,8 @@ + } + } + } // end omp parallel +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + timer_stop( time ); + timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); +@@ -474,10 +474,10 @@ + lapackf77_slaed4( &k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo ); + // If the zero finder fails, the computation is terminated. + if (iinfo != 0) +- *info=iinfo; ++ *infom=iinfo; + } +- if (*info != 0) +- return *info; ++ if (*infom != 0) ++ return *infom; + + // Prepare the INDXQ sorting permutation. 
+         magma_int_t nk = n - k;
+@@ -688,5 +688,5 @@
+ 
+     magma_setdevice( orig_dev );
+ 
+-    return *info;
++    return *infom;
+ } /* magma_slaed3_m */
diff --git a/scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch b/scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch
new file mode 100644
index 00000000..f734a5f1
--- /dev/null
+++ b/scripts/uberenv/packages/magma/magma-2.3.0-gcc-4.8.patch
@@ -0,0 +1,24 @@
+diff -ru magma-2.3.0/testing/testings.h magma-2.3.0-patched/testing/testings.h
+--- magma-2.3.0/testing/testings.h 2017-11-14 21:34:00.000000000 -0800
++++ magma-2.3.0-patched/testing/testings.h 2018-03-23 20:41:16.459934643 -0700
+@@ -269,4 +269,20 @@
+     typename blas::traits<FloatT>::real_t* sigma,
+     FloatT* A, magma_int_t lda );
+ 
++// This overload for the case sigma = nullptr is a workaround for an issue
++// when building with gcc 4.8.5. This is not an issue with gcc 4.9.2.
++template< typename FloatT >
++void magma_generate_matrix(
++    magma_opts& opts,
++    magma_int_t m, magma_int_t n,
++    std::nullptr_t sigma,
++    FloatT* A, magma_int_t lda )
++{
++    magma_generate_matrix(
++        opts,
++        m, n,
++        (typename blas::traits<FloatT>::real_t*) sigma,
++        A, lda );
++}
++
+ #endif /* TESTINGS_H */
diff --git a/scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch b/scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch
new file mode 100644
index 00000000..56b58d85
--- /dev/null
+++ b/scripts/uberenv/packages/magma/magma-2.5.0-cmake.patch
@@ -0,0 +1,77 @@
+diff -ru magma-2.5.0-orig/CMakeLists.txt magma-2.5.0/CMakeLists.txt
+--- magma-2.5.0-orig/CMakeLists.txt 2019-01-02 11:18:39.000000000 -0800
++++ magma-2.5.0/CMakeLists.txt 2019-04-03 15:58:01.871234891 -0700
+@@ -440,18 +440,20 @@
+ # compile MAGMA sparse library
+ 
+ # sparse doesn't have Fortran at the moment, so no need for above shenanigans
+-include_directories( sparse/include )
+-include_directories( sparse/control )
+-include_directories( testing )
+-cuda_add_library( magma_sparse ${libsparse_all} )
+-target_link_libraries( magma_sparse
+-    magma
++if (MAGMA_SPARSE)
++  include_directories( sparse/include )
++  include_directories( sparse/control )
++  include_directories( testing )
++  cuda_add_library( magma_sparse ${libsparse_all} )
++  target_link_libraries( magma_sparse
++    magma
     ${LAPACK_LIBRARIES}
     ${CUDA_CUDART_LIBRARY}
     ${CUDA_CUBLAS_LIBRARIES}
     ${CUDA_cusparse_LIBRARY}
+-)
+-set( LIBS_SPARSE ${LIBS} magma_sparse )
++  )
++  set( LIBS_SPARSE ${LIBS} magma_sparse )
++endif()
+ 
+ 
+ # ----------------------------------------
+@@ -480,23 +482,31 @@
+ 
+ # ----------------------------------------
+ # compile each sparse tester
+-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY sparse/testing )
+-foreach( TEST ${sparse_testing_all} )
++if (MAGMA_SPARSE)
++  set( CMAKE_RUNTIME_OUTPUT_DIRECTORY sparse/testing )
++  foreach( TEST ${sparse_testing_all} )
+     string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
+     string( REGEX REPLACE "sparse/testing/" "" EXE ${EXE} )
+     #message( "${TEST} --> ${EXE}" )
+     add_executable( ${EXE} ${TEST} )
+     target_link_libraries( ${EXE} ${LIBS_SPARSE} )
+-endforeach()
++  endforeach()
++endif()
+ 
+ 
+ # ----------------------------------------
+ # what to install
+-install( TARGETS magma magma_sparse ${blas_fix}
++set(MAGMA_TARGETS magma)
++set(MAGMA_HEADERS_PATTERNS include/*.h)
++if (MAGMA_SPARSE)
++  set(MAGMA_TARGETS ${MAGMA_TARGETS} magma_sparse)
++  set(MAGMA_HEADERS_PATTERNS ${MAGMA_HEADERS_PATTERNS} sparse/include/*.h)
++endif()
++install( TARGETS ${MAGMA_TARGETS} ${blas_fix}
+     RUNTIME DESTINATION bin
+     LIBRARY DESTINATION lib
     ARCHIVE DESTINATION lib )
+-file( GLOB headers include/*.h sparse/include/*.h )
++file( GLOB headers ${MAGMA_HEADERS_PATTERNS} )
+ install( FILES ${headers}
+     DESTINATION include )
+ 
+@@ -509,4 +519,6 @@
+ message( STATUS "    NFLAGS      ${CUDA_NVCC_FLAGS}" )
+ message( STATUS "    FFLAGS      ${CMAKE_Fortran_FLAGS}" )
+ message( STATUS "    LIBS        ${LIBS}" )
+-message( STATUS "    LIBS_SPARSE ${LIBS_SPARSE}" )
++if (MAGMA_SPARSE)
++  message( STATUS "    LIBS_SPARSE ${LIBS_SPARSE}" )
++endif()
diff --git a/scripts/uberenv/packages/magma/magma-2.5.0.patch b/scripts/uberenv/packages/magma/magma-2.5.0.patch
new file mode 100644
index 00000000..1ac800c5
--- /dev/null
+++ b/scripts/uberenv/packages/magma/magma-2.5.0.patch
@@ -0,0 +1,428 @@
+diff -r 89706c0efbdb .hgtags
+--- a/.hgtags Wed Jan 02 14:17:26 2019 -0500
++++ b/.hgtags Wed Apr 03 15:50:54 2019 -0700
+@@ -1,3 +1,4 @@
+ 9c7e7cffa7d0e2decd23cde36a4830dfb55bea13 v2.2.0
+ b2b2e21c22a59a79eefbf1e5cff8e7d539a52c0c v2.3.0
+ 04d08aaa27dc8a551513d268c68fc299e81b6780 v2.4.0
++89706c0efbdbfd48bf8a2c20cc0d73e53c3f387e v2.5.0
+diff -r 89706c0efbdb include/magma_types.h
+--- a/include/magma_types.h Wed Jan 02 14:17:26 2019 -0500
++++ b/include/magma_types.h Wed Apr 03 15:50:54 2019 -0700
+@@ -77,7 +77,7 @@
+     typedef magma_int_t magma_device_t;
+ 
+     // Half precision in CUDA
+-    #if defined(__cplusplus) && CUDA_VERSION > 7500
++    #if defined(__cplusplus) && CUDA_VERSION >= 7500
+     #include <cuda_fp16.h>
+     typedef __half magmaHalf;
+     #else
+diff -r 89706c0efbdb sparse/blas/magma_zsampleselect.cu
+--- a/sparse/blas/magma_zsampleselect.cu Wed Jan 02 14:17:26 2019 -0500
++++ b/sparse/blas/magma_zsampleselect.cu Wed Apr 03 15:50:54 2019 -0700
+@@ -15,9 +15,12 @@
+ 
+ #define PRECISION_z
+ 
++
+ namespace magma_sampleselect {
+ 
+-__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size) {
++__global__ void compute_abs(const magmaDoubleComplex* __restrict__ in, double* __restrict__ out, int32_t size)
++{
++#if (__CUDA_ARCH__ >= 350)
+     auto idx = threadIdx.x + blockDim.x * blockIdx.x;
+     if (idx >= size) {
+         return;
+@@ -25,6 +28,7 @@
+ 
+     auto v = in[idx];
+     out[idx] = real(v) * real(v) + imag(v) * imag(v);
++#endif
+ }
+ 
+ } // namespace magma_sampleselect
+@@ -164,36 +168,43 @@
+     magma_queue_t queue )
+ {
+     magma_int_t info = 0;
++    magma_int_t arch = magma_getdevice_arch();
+ 
+-    auto num_blocks = magma_ceildiv(total_size, block_size);
+-    auto local_work = (total_size + num_threads - 1) / num_threads;
+-    auto required_size = sizeof(double) * (total_size + searchtree_size)
++    if( arch >= 350 ) {
++        auto num_blocks = magma_ceildiv(total_size, block_size);
++        auto local_work = (total_size + num_threads - 1) / num_threads;
++        auto required_size = sizeof(double) * (total_size + searchtree_size)
+         + sizeof(int32_t) * (searchtree_width * (num_grouped_blocks + 1) + 1);
+-    auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
++        auto realloc_result = realloc_if_necessary(tmp_ptr, tmp_size, required_size);
+ 
+-    double* gputmp = (double*)*tmp_ptr;
+-    double* gputree = gputmp + total_size;
+-    uint32_t* gpubucketidx = (uint32_t*)(gputree + searchtree_size);
+-    int32_t* gpurankout = (int32_t*)(gpubucketidx + 1);
int32_t* gpucounts = gpurankout + 1;
++        int32_t* gpulocalcounts = gpucounts + searchtree_width;
++        uint32_t bucketidx{};
+ 
+-    CHECK(realloc_result);
++        CHECK(realloc_result);
+ 
+-    compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
+-        (val, gputmp, total_size);
+-    build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
+-        (gputmp, gputree, total_size);
+-    count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
+-        (gputmp, gputree, gpulocalcounts, total_size, local_work);
+-    reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
+-        (gpulocalcounts, gpucounts, num_grouped_blocks);
+-    sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
+-        (gpucounts, subset_size, gpubucketidx, gpurankout);
+-    magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
+-    magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
+-    *thrs = std::sqrt(*thrs);
++        compute_abs<<<num_blocks, block_size, 0, queue->cuda_stream()>>>
++            (val, gputmp, total_size);
++        build_searchtree<<<1, sample_size, 0, queue->cuda_stream()>>>
++            (gputmp, gputree, total_size);
++        count_buckets<<<num_grouped_blocks, block_size, 0, queue->cuda_stream()>>>
++            (gputmp, gputree, gpulocalcounts, total_size, local_work);
++        reduce_counts<<<searchtree_width, num_grouped_blocks, 0, queue->cuda_stream()>>>
++            (gpulocalcounts, gpucounts, num_grouped_blocks);
++        sampleselect_findbucket<<<1, searchtree_width / 2, 0, queue->cuda_stream()>>>
++            (gpucounts, subset_size, gpubucketidx, gpurankout);
++        magma_getvector(1, sizeof(uint32_t), gpubucketidx, 1, &bucketidx, 1, queue);
++        magma_dgetvector(1, gputree + searchtree_width - 1 + bucketidx, 1, thrs, 1, queue);
++        *thrs = std::sqrt(*thrs);
++    }
++    else {
++        printf("error: this functionality needs CUDA architecture >= 3.5\n");
++        info = MAGMA_ERR_NOT_SUPPORTED;
++    }
+ 
+ cleanup:
+     return info;
+diff -r 89706c0efbdb src/xhsgetrf_gpu.cpp
+--- a/src/xhsgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500
++++ b/src/xhsgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700
+@@ -16,6 +16,131 @@
+ #include 
+ #endif
+ 
++#if CUDA_VERSION < 9020
++// conversion float to half are not defined for host in CUDA version <9.2
++// thus uses the conversion below when CUDA VERSION is < 9.2.
++#include 
++//
++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
++//
++// Redistribution and use in source and binary forms, with or without
++// modification, are permitted provided that the following conditions
++// are met:
++//  * Redistributions of source code must retain the above copyright
++//    notice, this list of conditions and the following disclaimer.
++//  * Redistributions in binary form must reproduce the above copyright
++//    notice, this list of conditions and the following disclaimer in the
++//    documentation and/or other materials provided with the distribution.
++//  * Neither the name of NVIDIA CORPORATION nor the names of its
++//    contributors may be used to endorse or promote products derived
++//    from this software without specific prior written permission.
++//
++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
++// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++// This code modified from the public domain code here: ++// https://gist.github.com/rygorous/2156668 ++// The URL above includes more robust conversion routines ++// that handle Inf and NaN correctly. ++// ++// It is recommended to use the more robust versions in production code. ++ ++typedef unsigned uint; ++ ++union FP32 ++{ ++ uint u; ++ float f; ++ struct ++ { ++ uint Mantissa : 23; ++ uint Exponent : 8; ++ uint Sign : 1; ++ }; ++}; ++ ++union FP16 ++{ ++ unsigned short u; ++ struct ++ { ++ uint Mantissa : 10; ++ uint Exponent : 5; ++ uint Sign : 1; ++ }; ++}; ++ ++// Approximate solution. This is faster but converts some sNaNs to ++// infinity and doesn't round correctly. Handle with care. ++// Approximate solution. This is faster but converts some sNaNs to ++// infinity and doesn't round correctly. Handle with care. ++static half approx_float_to_half(float fl) ++{ ++ FP32 f32infty = { 255 << 23 }; ++ FP32 f16max = { (127 + 16) << 23 }; ++ FP32 magic = { 15 << 23 }; ++ FP32 expinf = { (255 ^ 31) << 23 }; ++ uint sign_mask = 0x80000000u; ++ FP16 o = { 0 }; ++ ++ FP32 f = *((FP32*)&fl); ++ ++ uint sign = f.u & sign_mask; ++ f.u ^= sign; ++ ++ if (!(f.f < f32infty.u)) // Inf or NaN ++ o.u = f.u ^ expinf.u; ++ else ++ { ++ if (f.f > f16max.f) f.f = f16max.f; ++ f.f *= magic.f; ++ } ++ ++ o.u = f.u >> 13; // Take the mantissa bits ++ o.u |= sign >> 16; ++ half tmp; ++ memcpy(&tmp, &o, sizeof(half)); ++ //return *((half*)&o); ++ return tmp; ++} ++ ++// from half->float code - just for verification. ++static float half_to_float(half hf) ++{ ++ FP16 h; ++ memcpy(&h, &hf, sizeof(half)); ++ ++ static const FP32 magic = { 113 << 23 }; ++ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift ++ FP32 o; ++ ++ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits ++ uint exp = shifted_exp & o.u; // just the exponent ++ o.u += (127 - 15) << 23; // exponent adjust ++ ++ // handle exponent special cases ++ if (exp == shifted_exp) // Inf/NaN? ++ o.u += (128 - 16) << 23; // extra exp adjust ++ else if (exp == 0) // Zero/Denormal? 
++ { ++ o.u += 1 << 23; // extra exp adjust ++ o.f -= magic.f; // renormalize ++ } ++ ++ o.u |= (h.u & 0x8000) << 16; // sign bit ++ return o.f; ++} ++#endif ++ + #include "magma_internal.h" + //#include "nvToolsExt.h" + +@@ -106,10 +231,13 @@ + float c_one = MAGMA_S_ONE; + float c_neg_one = MAGMA_S_NEG_ONE; + #if 1 ++ #if CUDA_VERSION >= 9020 + const magmaHalf h_one = (magmaHalf) 1.0; + const magmaHalf h_neg_one = (magmaHalf)-1.0; +- //const magmaHalf h_one = approx_float_to_half(1.0); +- //const magmaHalf h_neg_one = approx_float_to_half(-1.0); ++ #else ++ const magmaHalf h_one = approx_float_to_half(1.0); ++ const magmaHalf h_neg_one = approx_float_to_half(-1.0); ++ #endif + #else + FP32 float_one = *((FP32*)&c_one); + FP16 half_one = float_to_half_full(float_one); +diff -r 89706c0efbdb src/xshgetrf_gpu.cpp +--- a/src/xshgetrf_gpu.cpp Wed Jan 02 14:17:26 2019 -0500 ++++ b/src/xshgetrf_gpu.cpp Wed Apr 03 15:50:54 2019 -0700 +@@ -92,7 +92,7 @@ + magma_mp_type_t enable_tc, + magma_mp_type_t mp_algo_type ) + { +-#if CUDA_VERSION >= 7500 ++#if CUDA_VERSION >= 9000 + #ifdef HAVE_clBLAS + #define dA(i_, j_) dA, (dA_offset + (i_) + (j_)*ldda) + #define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_)) +diff -r 89706c0efbdb testing/testing_hgemm.cpp +--- a/testing/testing_hgemm.cpp Wed Jan 02 14:17:26 2019 -0500 ++++ b/testing/testing_hgemm.cpp Wed Apr 03 15:50:54 2019 -0700 +@@ -22,6 +22,131 @@ + #include "magma_operators.h" + #include "testings.h" + ++#if CUDA_VERSION < 9020 ++// conversion float to half are not defined for host in CUDA version <9.2 ++// thus uses the conversion below when CUDA VERSION is < 9.2. ++#include ++// ++// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved. ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions ++// are met: ++// * Redistributions of source code must retain the above copyright ++// notice, this list of conditions and the following disclaimer. ++// * Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// * Neither the name of NVIDIA CORPORATION nor the names of its ++// contributors may be used to endorse or promote products derived ++// from this software without specific prior written permission. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY ++// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ++// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++// This code modified from the public domain code here: ++// https://gist.github.com/rygorous/2156668 ++// The URL above includes more robust conversion routines ++// that handle Inf and NaN correctly. ++// ++// It is recommended to use the more robust versions in production code. 
++ ++typedef unsigned uint; ++ ++union FP32 ++{ ++ uint u; ++ float f; ++ struct ++ { ++ uint Mantissa : 23; ++ uint Exponent : 8; ++ uint Sign : 1; ++ }; ++}; ++ ++union FP16 ++{ ++ unsigned short u; ++ struct ++ { ++ uint Mantissa : 10; ++ uint Exponent : 5; ++ uint Sign : 1; ++ }; ++}; ++ ++// Approximate solution. This is faster but converts some sNaNs to ++// infinity and doesn't round correctly. Handle with care. ++// Approximate solution. This is faster but converts some sNaNs to ++// infinity and doesn't round correctly. Handle with care. ++static half approx_float_to_half(float fl) ++{ ++ FP32 f32infty = { 255 << 23 }; ++ FP32 f16max = { (127 + 16) << 23 }; ++ FP32 magic = { 15 << 23 }; ++ FP32 expinf = { (255 ^ 31) << 23 }; ++ uint sign_mask = 0x80000000u; ++ FP16 o = { 0 }; ++ ++ FP32 f = *((FP32*)&fl); ++ ++ uint sign = f.u & sign_mask; ++ f.u ^= sign; ++ ++ if (!(f.f < f32infty.u)) // Inf or NaN ++ o.u = f.u ^ expinf.u; ++ else ++ { ++ if (f.f > f16max.f) f.f = f16max.f; ++ f.f *= magic.f; ++ } ++ ++ o.u = f.u >> 13; // Take the mantissa bits ++ o.u |= sign >> 16; ++ half tmp; ++ memcpy(&tmp, &o, sizeof(half)); ++ //return *((half*)&o); ++ return tmp; ++} ++ ++// from half->float code - just for verification. ++static float half_to_float(half hf) ++{ ++ FP16 h; ++ memcpy(&h, &hf, sizeof(half)); ++ ++ static const FP32 magic = { 113 << 23 }; ++ static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift ++ FP32 o; ++ ++ o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits ++ uint exp = shifted_exp & o.u; // just the exponent ++ o.u += (127 - 15) << 23; // exponent adjust ++ ++ // handle exponent special cases ++ if (exp == shifted_exp) // Inf/NaN? ++ o.u += (128 - 16) << 23; // extra exp adjust ++ else if (exp == 0) // Zero/Denormal? ++ { ++ o.u += 1 << 23; // extra exp adjust ++ o.f -= magic.f; // renormalize ++ } ++ ++ o.u |= (h.u & 0x8000) << 16; // sign bit ++ return o.f; ++} ++#endif ++ + /* //////////////////////////////////////////////////////////////////////////// + -- Testing sgemm + */ +@@ -47,8 +172,13 @@ + float c_neg_one = MAGMA_S_NEG_ONE; + float alpha = MAGMA_S_MAKE( 0.29, -0.86 ); + float beta = MAGMA_S_MAKE( -0.48, 0.38 ); +- magmaHalf h_alpha = (magmaHalf)alpha; +- magmaHalf h_beta = (magmaHalf)beta; ++ #if CUDA_VERSION >= 9020 ++ const magmaHalf h_alpha = (magmaHalf) alpha; ++ const magmaHalf h_beta = (magmaHalf) beta; ++ #else ++ const magmaHalf h_alpha = approx_float_to_half(alpha); ++ const magmaHalf h_beta = approx_float_to_half(beta); ++ #endif + magma_opts opts; + opts.parse_opts( argc, argv ); + diff --git a/scripts/uberenv/packages/magma/package.py b/scripts/uberenv/packages/magma/package.py new file mode 100644 index 00000000..8d37bec6 --- /dev/null +++ b/scripts/uberenv/packages/magma/package.py @@ -0,0 +1,125 @@ +# Copyright 2013-2021 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +from spack import * + + +class Magma(CMakePackage, CudaPackage): + """The MAGMA project aims to develop a dense linear algebra library similar + to LAPACK but for heterogeneous/hybrid architectures, starting with + current "Multicore+GPU" systems. 
+ """ + + homepage = "http://icl.cs.utk.edu/magma/" + url = "http://icl.cs.utk.edu/projectsfiles/magma/downloads/magma-2.2.0.tar.gz" + maintainers = ['stomov', 'luszczek'] + + version('2.5.4', sha256='7734fb417ae0c367b418dea15096aef2e278a423e527c615aab47f0683683b67') + version('2.5.3', sha256='c602d269a9f9a3df28f6a4f593be819abb12ed3fa413bba1ff8183de721c5ef6') + version('2.5.2', sha256='065feb85558f9dd6f4cc4db36ac633a3f787827fc832d0b578a049a43a195620') + version('2.5.1', sha256='ce32c199131515336b30c92a907effe0c441ebc5c5bdb255e4b06b2508de109f') + version('2.5.0', sha256='4fd45c7e46bd9d9124253e7838bbfb9e6003c64c2c67ffcff02e6c36d2bcfa33') + version('2.4.0', sha256='4eb839b1295405fd29c8a6f5b4ed578476010bf976af46573f80d1169f1f9a4f') + version('2.3.0', sha256='010a4a057d7aa1e57b9426bffc0958f3d06913c9151463737e289e67dd9ea608') + version('2.2.0', sha256='df5d4ace417e5bf52694eae0d91490c6bde4cde1b0da98e8d400c5c3a70d83a2') + + variant('fortran', default=True, + description='Enable Fortran bindings support') + variant('shared', default=True, + description='Enable shared library') + variant('cuda', default=True, description='Build with CUDA') + variant('cuda_arch', default='none', multi=True, + description='Specify CUDA architecture(s)') + + # corbett5 added this variant + variant('fortran_convention', default='default', description='LAPACK/BLAS mangling scheme', + values=('default', 'add_', 'nochange', 'upcase'), multi=False) + + depends_on('blas') + depends_on('lapack') + depends_on('cuda@8:', when='@2.5.1:') # See PR #14471 + + conflicts('~cuda', msg='Magma requires cuda') + conflicts('cuda_arch=none', + msg='Please indicate a CUDA arch value or values') + + # currently not compatible with CUDA-11 + # https://bitbucket.org/icl/magma/issues/22/cuda-11-changes-issue + # https://bitbucket.org/icl/magma/issues/25/error-cusparsesolveanalysisinfo_t-does-not + conflicts('^cuda@11:', when='@:2.5.3') + + patch('ibm-xl.patch', when='@2.2:2.5.0%xl') + patch('ibm-xl.patch', when='@2.2:2.5.0%xl_r') + patch('magma-2.3.0-gcc-4.8.patch', when='@2.3.0%gcc@:4.8') + patch('magma-2.5.0.patch', when='@2.5.0') + patch('magma-2.5.0-cmake.patch', when='@2.5.0') + patch('cmake-W.patch', when='@2.5.0:%nvhpc') + + def cmake_args(self): + spec = self.spec + options = [] + + options.extend([ + '-DCMAKE_INSTALL_PREFIX=%s' % self.prefix, + '-DCMAKE_INSTALL_NAME_DIR:PATH=%s/lib' % self.prefix, + '-DBLAS_LIBRARIES=%s' % spec['blas'].libs.joined(';'), + # As of MAGMA v2.3.0, CMakeLists.txt does not use the variable + # BLAS_LIBRARIES, but only LAPACK_LIBRARIES, so we need to + # explicitly add blas to LAPACK_LIBRARIES. 
+ '-DLAPACK_LIBRARIES=%s' % + (spec['lapack'].libs + spec['blas'].libs).joined(';') + ]) + + options += ['-DBUILD_SHARED_LIBS=%s' % + ('ON' if ('+shared' in spec) else 'OFF')] + + if '+fortran' in spec: + options.extend([ + '-DUSE_FORTRAN=yes' + ]) + if spec.satisfies('%xl') or spec.satisfies('%xl_r'): + options.extend([ + '-DCMAKE_Fortran_COMPILER=%s' % self.compiler.f77 + ]) + + # corbett5 added this else block + else: + options.extend([ + '-DUSE_FORTRAN=no' + ]) + + if spec.satisfies('^cuda'): + cuda_arch = self.spec.variants['cuda_arch'].value + if '@:2.2.0' in spec: + capabilities = ' '.join('sm{0}'.format(i) for i in cuda_arch) + options.extend(['-DGPU_TARGET=' + capabilities]) + else: + capabilities = ' '.join('sm_{0}'.format(i) for i in cuda_arch) + options.extend(['-DGPU_TARGET=' + capabilities]) + + if '@2.5.0' in spec: + options.extend(['-DMAGMA_SPARSE=OFF']) + if spec.compiler.name in ['xl', 'xl_r']: + options.extend(['-DCMAKE_DISABLE_FIND_PACKAGE_OpenMP=TRUE']) + + # corbett5 added these definitions + if spec.variants['fortran_convention'].value == 'add_': + options.extend(['-DFORTRAN_CONVENTION=-DADD_']) + + if spec.variants['fortran_convention'].value == 'nochange': + options.extend(['-DFORTRAN_CONVENTION=-DNOCHANGE']) + + if spec.variants['fortran_convention'].value == 'upcase': + options.extend(['-DFORTRAN_CONVENTION=-DUPCASE']) + + return options + + @run_after('install') + def post_install(self): + install('magmablas/atomics.cuh', self.prefix.include) + install('control/magma_threadsetting.h', self.prefix.include) + install('control/pthread_barrier.h', self.prefix.include) + install('control/magma_internal.h', self.prefix.include) diff --git a/scripts/uberenv/project.json b/scripts/uberenv/project.json index 9822f975..703db7a4 100644 --- a/scripts/uberenv/project.json +++ b/scripts/uberenv/project.json @@ -3,8 +3,8 @@ "package_version" : "develop", "package_final_phase" : "hostconfig", "package_source_dir" : "../..", - "spack_url": "https://github.com/corbett5/spack", - "spack_branch": "package/corbett/lvarray-update", + "spack_url": "https://github.com/spack/spack", + "spack_branch": "v0.19.0", "spack_activate" : {}, "spack_clean_packages": ["lvarray"], "build_jobs": 100 diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml index b1bf26cb..652d26c4 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/compilers.yaml @@ -1,31 +1,91 @@ compilers: - compiler: - spec: clang@10.0.1 + spec: clang@upstream-2019.03.19 paths: - cc: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-ibm-10.0.1/bin/clang++ + cc: /usr/tce/packages/clang/clang-upstream-2019.03.19/bin/clang + cxx: /usr/tce/packages/clang/clang-upstream-2019.03.19/bin/clang++ + f77: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf_r + fc: /usr/tce/packages/xl/xl-beta-2019.06.20/bin/xlf_r + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@11.0.1 + paths: + cc: /usr/tce/packages/clang/clang-11.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-11.0.1/bin/clang++ f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: 
/usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] - compiler: - spec: clang@11.0.1 + spec: clang@12.0.1 + paths: + cc: /usr/tce/packages/clang/clang-12.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-12.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@13.0.1 + paths: + cc: /usr/tce/packages/clang/clang-13.0.1/bin/clang + cxx: /usr/tce/packages/clang/clang-13.0.1/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: clang@14.0.4 paths: - cc: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-ibm-11.0.1/bin/clang++ + cc: /usr/tce/packages/clang/clang-14.0.4/bin/clang + cxx: /usr/tce/packages/clang/clang-14.0.4/bin/clang++ f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + cxxflags: -march=native -mtune=native --gcc-toolchain=/usr/tce/packages/gcc/gcc-8.1.0 + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@7.3.0 + paths: + cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++ + f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] @@ -37,25 +97,55 @@ compilers: f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran flags: - cflags: -mcpu=native -mtune=native - cxxflags: -mcpu=native -mtune=native + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@9.3.1 + paths: + cc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-9.3.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-9.3.1/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel7 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@10.2.1 + paths: + cc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gcc + cxx: 
/usr/tce/packages/gcc/gcc-10.2.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-10.2.1/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] - compiler: - spec: xl@16.1.1 + spec: intel@19.1.2 paths: - cc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlc - cxx: /usr/tce/packages/xl/xl-2021.03.11/bin/xlC - f77: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf - fc: /usr/tce/packages/xl/xl-2021.03.11/bin/xlf + cc: /usr/tce/packages/intel/intel-19.1.2/bin/icc + cxx: /usr/tce/packages/intel/intel-19.1.2/bin/icpc + f77: /usr/tce/packages/intel/intel-19.1.2/bin/ifort + fc: /usr/tce/packages/intel/intel-19.1.2/bin/ifort flags: - cflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036 - cxxflags: -qarch=pwr9 -qtune=pwr9 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qsmp=omp -qhot -qnoeh -qsuppress=1500-029 -qsuppress=1500-036 + cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/gcc -march=native -mtune=native + cxxflags: -gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++ -march=native -mtune=native operating_system: rhel7 - target: ppc64le + target: x86_64 modules: [] environment: {} extra_rpaths: [] diff --git a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml index d054887c..575d66db 100644 --- a/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml +++ b/scripts/uberenv/spack_configs/blueos_3_ppc64le_ib_p9/packages.yaml @@ -2,6 +2,21 @@ packages: all: target: [default] compiler: [gcc, clang, xl] + providers: + blas: [netlib-lapack, essl] + lapack: [netlib-lapack, essl] + + netlib-lapack: + buildable: False + externals: + - spec: netlib-lapack@3.10.0 ~external-blas + prefix: /usr/tcetmp/packages/lapack/lapack-3.10.0-P9-xl-2022.03.10/ + + essl: + buildable: False + externals: + - spec: essl@6.2.1 ~ilp64 threads=openmp +cuda +lapack + prefix: /usr/tcetmp/packages/essl/essl-6.2.1/ cuda: buildable: False diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml deleted file mode 100644 index 3d9648a7..00000000 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/compilers.yaml +++ /dev/null @@ -1,76 +0,0 @@ -compilers: -- compiler: - spec: clang@10.0.1 - paths: - cc: /usr/tce/packages/clang/clang-10.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-10.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: clang@11.0.1 - paths: - cc: /usr/tce/packages/clang/clang-11.0.1/bin/clang - cxx: /usr/tce/packages/clang/clang-11.0.1/bin/clang++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@7.3.0 - paths: - cc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-7.3.0/bin/g++ - f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran - fc: 
/usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: gcc@8.3.1 - paths: - cc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-8.3.1/bin/g++ - f77: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-8.3.1/bin/gfortran - flags: - cflags: -march=native -mtune=native - cxxflags: -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] -- compiler: - spec: intel@19.1.2 - paths: - cc: /usr/tce/packages/intel/intel-19.1.2/bin/icc - cxx: /usr/tce/packages/intel/intel-19.1.2/bin/icpc - f77: /usr/tce/packages/intel/intel-19.1.2/bin/ifort - fc: /usr/tce/packages/intel/intel-19.1.2/bin/ifort - flags: - cflags: -gcc-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/gcc -march=native -mtune=native - cxxflags: -gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++ -march=native -mtune=native - operating_system: rhel7 - target: x86_64 - modules: [] - environment: {} - extra_rpaths: [] diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml index 0c6b833b..43971e78 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_3_x86_64_ib_python/packages.yaml @@ -107,3 +107,10 @@ packages: externals: - spec: pkg-config@0.27.1 prefix: /usr/bin/ + + ninja: + buildable: False + externals: + - spec: ninja@1.11.0 + prefix: /usr/tce/packages/ninja/ninja-1.11.0 + diff --git a/scripts/uberenv/spack_configs/toss_4_x86_64_ib/compilers.yaml b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/compilers.yaml new file mode 100644 index 00000000..15bdbccd --- /dev/null +++ b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/compilers.yaml @@ -0,0 +1,31 @@ +compilers: +- compiler: + spec: clang@14.0.6 + paths: + cc: /usr/tce/packages/clang/clang-14.0.6-magic/bin/clang + cxx: /usr/tce/packages/clang/clang-14.0.6-magic/bin/clang++ + f77: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel8 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] +- compiler: + spec: gcc@12.1.1 + paths: + cc: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/g++ + f77: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-12.1.1-magic/bin/gfortran + flags: + cflags: -march=native -mtune=native + cxxflags: -march=native -mtune=native + operating_system: rhel8 + target: x86_64 + modules: [] + environment: {} + extra_rpaths: [] diff --git a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/packages.yaml b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml similarity index 72% rename from scripts/uberenv/spack_configs/toss_3_x86_64_ib/packages.yaml rename to scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml index e7ed36f4..d3d2714a 100644 --- a/scripts/uberenv/spack_configs/toss_3_x86_64_ib/packages.yaml +++ b/scripts/uberenv/spack_configs/toss_4_x86_64_ib/packages.yaml @@ -2,6 +2,15 @@ packages: all: target: [default] compiler: [gcc, clang, intel] + providers: + blas: [intel-oneapi-mkl] + lapack: [intel-oneapi-mkl] + + intel-oneapi-mkl: + 
buildable: False + externals: + - spec: intel-oneapi-mkl@2022.1.0 + prefix: /usr/tce/backend/installations/linux-rhel8-x86_64/intel-19.0.4/intel-oneapi-mkl-2022.1.0-sksz67twjxftvwchnagedk36gf7plkrp/ cmake: buildable: False diff --git a/src/Array.hpp b/src/Array.hpp index d05769cd..503d4750 100644 --- a/src/Array.hpp +++ b/src/Array.hpp @@ -91,10 +91,10 @@ class Array : public ArrayView< T, { this->m_strides = indexing::calculateStrides< PERMUTATION >( this->m_dims ); -#if !defined(__CUDA_ARCH__) +#if !defined(LVARRAY_DEVICE_COMPILE) setName( "" ); #endif -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) Array::TV_ttf_display_type( nullptr ); #endif } @@ -121,10 +121,10 @@ class Array : public ArrayView< T, { this->m_strides = indexing::calculateStrides< PERMUTATION >( this->m_dims ); -#if !defined(__CUDA_ARCH__) +#if !defined(LVARRAY_DEVICE_COMPILE) setName( "" ); #endif -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) Array::TV_ttf_display_type( nullptr ); #endif } @@ -588,7 +588,7 @@ class Array : public ArrayView< T, void setName( std::string const & name ) { this->m_dataBuffer.template setName< decltype(*this) >( name ); } -#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) +#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) /** * @brief Static function that will be used by Totalview to display the array contents. * @param av A pointer to the array that is being displayed. diff --git a/src/ArrayOfArraysView.hpp b/src/ArrayOfArraysView.hpp index 706f2014..5efb4bc0 100644 --- a/src/ArrayOfArraysView.hpp +++ b/src/ArrayOfArraysView.hpp @@ -252,6 +252,7 @@ class ArrayOfArraysView * @param src the SparsityPatternView to be moved from. * @return *this. */ + LVARRAY_HOST_DEVICE inline ArrayOfArraysView & operator=( ArrayOfArraysView && src ) { @@ -587,6 +588,9 @@ class ArrayOfArraysView #if defined(LVARRAY_USE_CUDA) if( space == MemorySpace::cuda ) touch = false; + #endif + #if defined(LVARRAY_USE_HIP) + if( space == MemorySpace::hip ) touch = false; #endif m_offsets.move( space, touch ); } @@ -725,12 +729,11 @@ class ArrayOfArraysView auto const fillOffsets = [&]() { m_offsets[ 0 ] = 0; -// RAJA::inclusive_scan< POLICY >( capacities, -// capacities + numSubArrays, -// m_offsets.data() + 1 ); - - RAJA::inclusive_scan< POLICY >( RAJA::make_span< INDEX_TYPE const * >( capacities, numSubArrays ), - RAJA::make_span< INDEX_TYPE * >( m_offsets.data()+1, numSubArrays ) ); +#if ( RAJA_VERSION_MAJOR == 1 && RAJA_VERSION_MINOR >= 13 ) || ( RAJA_VERSION_MAJOR > 1 ) + RAJA::inclusive_scan< POLICY >( RAJA::make_span( capacities, numSubArrays ), RAJA::make_span( m_offsets.data() + 1, numSubArrays ) ); +#else + RAJA::inclusive_scan< POLICY >( capacities, capacities + numSubArrays, m_offsets.data() + 1 ); +#endif }; resizeFromOffsetsImpl( numSubArrays, fillOffsets, buffers ... 
);
 }
diff --git a/src/ArraySlice.hpp b/src/ArraySlice.hpp
index b4e22345..84357d8b 100644
--- a/src/ArraySlice.hpp
+++ b/src/ArraySlice.hpp
@@ -126,7 +126,7 @@ class ArraySlice
     m_dims( inputDimensions ),
     m_strides( inputStrides )
   {
-#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) && defined(LVARRAY_BOUNDS_CHECK)
+#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) && defined(LVARRAY_BOUNDS_CHECK)
     ArraySlice::TV_ttf_display_type( nullptr );
 #endif
   }
@@ -308,6 +308,15 @@ class ArraySlice
     return m_data[ linearIndex( indices ... ) ];
   }
 
+  /**
+   * @return A pointer to the values.
+   */
+  LVARRAY_HOST_DEVICE inline constexpr
+  T * data() const
+  {
+    return m_data;
+  }
+
   /**
    * @return Return a pointer to the values.
    * @tparam USD_ Dummy template parameter, do not specify.
@@ -341,7 +350,7 @@ class ArraySlice
 
   ///@}
 
-#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(__CUDA_ARCH__) && defined(LVARRAY_BOUNDS_CHECK)
+#if defined(LVARRAY_USE_TOTALVIEW_OUTPUT) && !defined(LVARRAY_DEVICE_COMPILE) && defined(LVARRAY_BOUNDS_CHECK)
 /**
  * @brief Static function that will be used by Totalview to display the array contents.
  * @param av A pointer to the array that is being displayed.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 03f627c2..8d4ad2ca 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -38,7 +38,7 @@ set( lvarray_headers
      sortedArrayManipulationHelpers.hpp
      system.hpp
      tensorOps.hpp
-     totalview/tv_data_display.h
+#     totalview/tv_data_display.h
      typeManipulation.hpp
      umpireInterface.hpp )
 
@@ -46,7 +46,7 @@ blt_list_append( TO lvarray_headers ELEMENTS ChaiBuffer.hpp IF ENABLE_CHAI )
 set( lvarray_sources
      system.cpp
-     totalview/tv_data_display.c
+#     totalview/tv_data_display.c
      umpireInterface.cpp )
 
 blt_add_library( NAME lvarray
@@ -79,3 +79,7 @@ lvarray_add_code_checks( PREFIX lvarray
 if( ENABLE_PYLVARRAY )
   add_subdirectory( python )
 endif()
+
+if( ENABLE_LAPACK )
+  add_subdirectory( dense )
+endif()
diff --git a/src/CRSMatrixView.hpp b/src/CRSMatrixView.hpp
index bc954672..9a8bbca5 100644
--- a/src/CRSMatrixView.hpp
+++ b/src/CRSMatrixView.hpp
@@ -111,7 +111,6 @@ class CRSMatrixView : protected SparsityPatternView< COL_TYPE, INDEX_TYPE, BUFFE
   /**
    * @brief Default move constructor.
   
*/ - inline CRSMatrixView( CRSMatrixView && ) = default; /** diff --git a/src/ChaiBuffer.hpp b/src/ChaiBuffer.hpp index b5d26fa1..83e8c254 100644 --- a/src/ChaiBuffer.hpp +++ b/src/ChaiBuffer.hpp @@ -56,7 +56,11 @@ inline chai::ExecutionSpace toChaiExecutionSpace( MemorySpace const space ) if( space == MemorySpace::host ) return chai::CPU; #if defined(LVARRAY_USE_CUDA) - if( space == MemorySpace::cuda || space == MemorySpace::hip ) + if( space == MemorySpace::cuda ) + return chai::GPU; +#endif +#if defined(LVARRAY_USE_HIP) + if( space == MemorySpace::hip ) return chai::GPU; #endif @@ -79,6 +83,10 @@ inline MemorySpace toMemorySpace( chai::ExecutionSpace const space ) if( space == chai::GPU ) return MemorySpace::cuda; #endif +#if defined(LVARRAY_USE_HIP) + if( space == chai::GPU ) + return MemorySpace::hip; +#endif LVARRAY_ERROR( "Unrecognized execution space " << static_cast< int >( space ) ); @@ -185,7 +193,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) + #if defined(LVARRAY_USE_DEVICE) && !defined(LVARRAY_DEVICE_COMPILE) move( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), true ); #endif } @@ -203,7 +211,7 @@ class ChaiBuffer m_capacity( src.m_capacity ), m_pointerRecord( src.m_pointerRecord ) { - #if defined(LVARRAY_USE_CUDA) && !defined(__CUDA_ARCH__) + #if defined(LVARRAY_USE_DEVICE) && !defined(LVARRAY_DEVICE_COMPILE) moveNested( internal::toMemorySpace( internal::getArrayManager().getExecutionSpace() ), size, true ); #else LVARRAY_UNUSED_VARIABLE( size ); @@ -370,7 +378,7 @@ class ChaiBuffer inline void moveNested( MemorySpace const space, std::ptrdiff_t const size, bool const touch ) const { - #if defined(LVARRAY_USE_CUDA) + #if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP ) chai::ExecutionSpace const chaiSpace = internal::toChaiExecutionSpace( space ); if( m_pointerRecord == nullptr || m_capacity == 0 || @@ -398,16 +406,15 @@ class ChaiBuffer */ void move( MemorySpace const space, bool const touch ) const { - #if defined(LVARRAY_USE_CUDA) + #if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) chai::ExecutionSpace const chaiSpace = internal::toChaiExecutionSpace( space ); if( m_pointerRecord == nullptr || m_capacity == 0 || chaiSpace == chai::NONE ) return; + auto & am = internal::getArrayManager(); const_cast< T * & >( m_pointer ) = - static_cast< T * >( internal::getArrayManager().move( const_cast< T_non_const * >( m_pointer ), - m_pointerRecord, - chaiSpace ) ); + static_cast< T * >( am.move( const_cast< T_non_const * >( m_pointer ), m_pointerRecord, chaiSpace ) ); if( !std::is_const< T >::value && touch ) m_pointerRecord->m_touched[ chaiSpace ] = true; m_pointerRecord->m_last_space = chaiSpace; diff --git a/src/LvArrayConfig.hpp.in b/src/LvArrayConfig.hpp.in index 2c997ab5..bf48242a 100644 --- a/src/LvArrayConfig.hpp.in +++ b/src/LvArrayConfig.hpp.in @@ -26,8 +26,12 @@ #cmakedefine LVARRAY_USE_CUDA +#cmakedefine LVARRAY_USE_HIP + #cmakedefine LVARRAY_USE_TOTALVIEW_OUTPUT #cmakedefine LVARRAY_USE_CALIPER +#cmakedefine LVARRAY_USE_MAGMA + #cmakedefine LVARRAY_ADDR2LINE_EXEC @LVARRAY_ADDR2LINE_EXEC@ diff --git a/src/Macros.hpp b/src/Macros.hpp index 544f5e19..82cf24d1 100644 --- a/src/Macros.hpp +++ b/src/Macros.hpp @@ -22,10 +22,33 @@ #include #include -#if defined(LVARRAY_USE_CUDA) - #include + +#if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) +/// Macro defined when using a device. 
+#define LVARRAY_USE_DEVICE #endif +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +/// Macro defined when currently compiling on device (only defined in the device context). +#define LVARRAY_DEVICE_COMPILE +/// Marks a function/lambda for inlining +#define LVARRAY_FORCE_INLINE __forceinline__ +#else +/// Marks a function/lambda for inlining +#define LVARRAY_FORCE_INLINE inline +#endif + +#if defined(__CUDACC__) || defined(__HIPCC__) +// Denotes whether to define decorator macros later in this file. +#define LVARRAY_DECORATE +#endif + + + +//#if !defined(NDEBUG) && defined(LVARRAY_DEVICE_COMPILE) + #include +//#endif + /** * @brief Convert @p A into a string. * @param A the token to convert to a string. @@ -38,6 +61,8 @@ */ #define STRINGIZE( A ) STRINGIZE_NX( A ) +//#pragma message "LVARRAY_DEVICE_COMPILE: " STRINGIZE(LVARRAY_DEVICE_COMPILE) + /** * @brief Mark @p X as an unused argument, used to silence compiler warnings. * @param X the unused argument. @@ -91,8 +116,11 @@ * and a stack trace along with the provided message. On device none of this is * guaranteed. In fact it is only guaranteed to abort the current kernel. */ -#if defined(__CUDA_ARCH__) - #if !defined(NDEBUG) +// cce processes __host__ functions with __hip_device_compile__=1 when -x hip? +// the entire compilation unit has __hip_device_compile__=1, whereas __cuda_arch__ +// seems to be scope-defined as it isn't defined in __host__ functions +#if defined(LVARRAY_DEVICE_COMPILE) + #if !defined(NDEBUG) || __HIP_DEVICE_COMPILE__ #define LVARRAY_ERROR_IF( EXP, MSG ) \ do \ { \ @@ -535,7 +563,7 @@ */ #define LVARRAY_ASSERT_GE( lhs, rhs ) LVARRAY_ASSERT_GE_MSG( lhs, rhs, "" ) -#if defined(LVARRAY_USE_CUDA) && defined(__CUDACC__) +#if defined(LVARRAY_DECORATE) /// Mark a function for both host and device usage. #define LVARRAY_HOST_DEVICE __host__ __device__ @@ -549,8 +577,12 @@ * call host only code. This is safe as long as the host only instantiations are only called on * the host. To use place directly above a the template. */ +#if defined(LVARRAY_USE_CUDA) #define DISABLE_HD_WARNING _Pragma("hd_warning_disable") #else +#define DISABLE_HD_WARNING +#endif +#else /// Mark a function for both host and device usage. #define LVARRAY_HOST_DEVICE diff --git a/src/SortedArrayView.hpp b/src/SortedArrayView.hpp index ab7ca790..8559a3fc 100644 --- a/src/SortedArrayView.hpp +++ b/src/SortedArrayView.hpp @@ -274,6 +274,9 @@ class SortedArrayView { #if defined(LVARRAY_USE_CUDA) if( space == MemorySpace::cuda ) touch = false; + #endif + #if defined(LVARRAY_USE_HIP) + if( space == MemorySpace::hip ) touch = false; #endif m_values.move( space, touch ); } diff --git a/src/arrayManipulation.hpp b/src/arrayManipulation.hpp index 5409e60f..21f708e1 100644 --- a/src/arrayManipulation.hpp +++ b/src/arrayManipulation.hpp @@ -298,7 +298,7 @@ void resize( T * const LVARRAY_RESTRICT ptr, if( newSize - size > 0 ) { std::size_t const sizeDiff = integerConversion< std::size_t >( newSize - size ); - std::memset( reinterpret_cast< void * >( ptr + size ), 0, ( sizeDiff ) * sizeof( T ) ); + memset( reinterpret_cast< void * >( ptr + size ), 0, ( sizeDiff ) * sizeof( T ) ); } } else diff --git a/src/bufferManipulation.hpp b/src/bufferManipulation.hpp index 62b94539..83e5a00e 100644 --- a/src/bufferManipulation.hpp +++ b/src/bufferManipulation.hpp @@ -277,7 +277,7 @@ void resize( BUFFER & buf, std::ptrdiff_t const size, std::ptrdiff_t const newSi arrayManipulation::resize( buf.data(), size, newSize, std::forward< ARGS >( args )... 
); -#if !defined(__CUDA_ARCH__) +#if !defined(LVARRAY_DEVICE_COMPILE) if( newSize > 0 ) { buf.registerTouch( MemorySpace::host ); diff --git a/src/dense/BlasLapackInterface.cpp b/src/dense/BlasLapackInterface.cpp new file mode 100644 index 00000000..ca4309c5 --- /dev/null +++ b/src/dense/BlasLapackInterface.cpp @@ -0,0 +1,210 @@ +#include "BlasLapackInterface.hpp" +#include "backendHelpers.hpp" + +extern "C" +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_SGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( sgemm ) +void LVARRAY_SGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + float const * ALPHA, + float const * A, + int const * LDA, + float const * B, + int const * LDB, + float const * BETA, + float * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_DGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( dgemm ) +void LVARRAY_DGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + double const * ALPHA, + double const * A, + int const * LDA, + double const * B, + int const * LDB, + double const * BETA, + double * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( cgemm ) +void LVARRAY_CGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + std::complex< float > const * ALPHA, + std::complex< float > const * A, + int const * LDA, + std::complex< float > const * B, + int const * LDB, + std::complex< float > const * BETA, + std::complex< float > * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZGEMM LVARRAY_LAPACK_FORTRAN_MANGLE( zgemm ) +void LVARRAY_ZGEMM( + char const * TRANSA, + char const * TRANSB, + int const * M, + int const * N, + int const * K, + std::complex< double > const * ALPHA, + std::complex< double > const * A, + int const * LDA, + std::complex< double > const * B, + int const * LDB, + std::complex< double > const * BETA, + std::complex< double > * C, + int const * LDC ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_SGESV LVARRAY_LAPACK_FORTRAN_MANGLE( sgesv ) +void LVARRAY_SGESV( + int const * N, + int const * NRHS, + float * A, + int const * LDA, + int * IPIV, + float * B, + int const * LDB, + int * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_DGESV LVARRAY_LAPACK_FORTRAN_MANGLE( dgesv ) +void LVARRAY_DGESV( + int const * N, + int const * NRHS, + double * A, + int const * LDA, + int * IPIV, + double * B, + int const * LDB, + int * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CGESV LVARRAY_LAPACK_FORTRAN_MANGLE( cgesv ) +void LVARRAY_CGESV( + int const * N, + int const * NRHS, + std::complex< float > * A, + int const * LDA, + int * IPIV, + std::complex< float > * B, + int const * LDB, + int * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZGESV LVARRAY_LAPACK_FORTRAN_MANGLE( zgesv ) +void LVARRAY_ZGESV( + int const * N, + int const * NRHS, + 
std::complex< double > * A, + int const * LDA, + int * IPIV, + std::complex< double > * B, + int const * LDB, + int * INFO ); + +} // extern "C" + +namespace LvArray +{ +namespace dense +{ + +char toLapackChar( Operation const op ) +{ + if( op == Operation::NO_OP ) return 'N'; + if( op == Operation::TRANSPOSE ) return 'T'; + if( op == Operation::ADJOINT ) return 'C'; + + LVARRAY_ERROR( "Unknown operation: " << int( op ) ); + return '\0'; +} + + +template< typename T > +void BlasLapackInterface< T >::gemm( + Operation opA, + Operation opB, + T const alpha, + Matrix< T const > const & A, + Matrix< T const > const & B, + T const beta, + Matrix< T > const & C ) +{ + char const TRANSA = toLapackChar( opA ); + char const TRANSB = toLapackChar( opB ); + int const M = C.sizes[ 0 ]; + int const N = C.sizes[ 1 ]; + int const K = opA == Operation::NO_OP ? A.sizes[ 1 ] : A.sizes[ 0 ]; + int const LDA = std::max( std::ptrdiff_t{ 1 }, A.strides[ 1 ] ); + int const LDB = std::max( std::ptrdiff_t{ 1 }, B.strides[ 1 ] ); + int const LDC = std::max( std::ptrdiff_t{ 1 }, C.strides[ 1 ] ); + + TypeDispatch< T >::dispatch( LVARRAY_SGEMM, LVARRAY_DGEMM, LVARRAY_CGEMM, LVARRAY_ZGEMM, + &TRANSA, + &TRANSB, + &M, + &N, + &K, + &alpha, + A.data, + &LDA, + B.data, + &LDB, + &beta, + C.data, + &LDC ); +} + + +template< typename T > +void BlasLapackInterface< T >::gesv( + Matrix< T > const & A, + Matrix< T > const & B, + Vector< int > const & pivots ) +{ + int const N = A.sizes[ 0 ]; + int const NRHS = B.sizes[ 1 ]; + int const LDA = A.strides[ 1 ]; + int const LDB = B.strides[ 1 ]; + int INFO = 0; + + TypeDispatch< T >::dispatch( LVARRAY_SGESV, LVARRAY_DGESV, LVARRAY_CGESV, LVARRAY_ZGESV, + &N, + &NRHS, + A.data, + &LDA, + pivots.data, + B.data, + &LDB, + &INFO ); + + LVARRAY_ERROR_IF( INFO < 0, "The " << -INFO << "-th argument had an illegal value." ); + LVARRAY_ERROR_IF( INFO > 0, "The factorization has been completed but U( " << INFO - 1 << ", " << INFO - 1 << + " ) is exactly zero so the solution could not be computed." 
);
+}
+
+template class BlasLapackInterface< float >;
+template class BlasLapackInterface< double >;
+template class BlasLapackInterface< std::complex< float > >;
+template class BlasLapackInterface< std::complex< double > >;
+
+} // namespace dense
+} // namespace LvArray
\ No newline at end of file
diff --git a/src/dense/BlasLapackInterface.hpp b/src/dense/BlasLapackInterface.hpp
new file mode 100644
index 00000000..ed747828
--- /dev/null
+++ b/src/dense/BlasLapackInterface.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+template< typename T >
+struct BlasLapackInterface
+{
+  static constexpr MemorySpace MEMORY_SPACE = MemorySpace::host;
+
+  static void gemm(
+    Operation opA,
+    Operation opB,
+    T const alpha,
+    Matrix< T const > const & A,
+    Matrix< T const > const & B,
+    T const beta,
+    Matrix< T > const & C );
+
+  static void gesv(
+    Matrix< T > const & A,
+    Matrix< T > const & B,
+    Vector< int > const & pivots );
+};
+
+} // namespace dense
+} // namespace LvArray
\ No newline at end of file
diff --git a/src/dense/CMakeLists.txt b/src/dense/CMakeLists.txt
new file mode 100644
index 00000000..8c7b4b0c
--- /dev/null
+++ b/src/dense/CMakeLists.txt
@@ -0,0 +1,41 @@
+set( lvarraydense_headers
+     common.hpp
+     backendHelpers.hpp
+     BlasLapackInterface.hpp
+     )
+
+set( lvarraydense_sources
+     common.cpp
+     BlasLapackInterface.cpp
+     )
+
+set( dependencies lvarray ${lvarray_dependencies} blas lapack )
+
+if( ENABLE_MAGMA )
+  set( dependencies ${dependencies} magma )
+endif()
+
+
+blt_add_library( NAME lvarraydense
+                 SOURCES ${lvarraydense_sources}
+                 HEADERS ${lvarraydense_headers}
+                 DEPENDS_ON ${dependencies}
+                 SHARED TRUE
+                 CLEAR_PREFIX TRUE
+                 )
+
+target_include_directories( lvarraydense
+                            PUBLIC
+                            $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+                            $<INSTALL_INTERFACE:include> )
+
+install( TARGETS lvarraydense
+         EXPORT lvarraydense
+         ARCHIVE DESTINATION lib
+         LIBRARY DESTINATION lib
+         RUNTIME DESTINATION lib )
+
+install( EXPORT lvarraydense
+         DESTINATION share/lvarray/cmake/ )
+
+lvarray_add_code_checks( PREFIX lvarraydense )
diff --git a/src/dense/backendHelpers.hpp b/src/dense/backendHelpers.hpp
new file mode 100644
index 00000000..5de71cf8
--- /dev/null
+++ b/src/dense/backendHelpers.hpp
@@ -0,0 +1,82 @@
+#pragma once
+
+#include <complex>
+
+/// This macro provides a flexible interface for the Fortran naming convention of compiled objects.
+// #ifdef FORTRAN_MANGLE_NO_UNDERSCORE
+#define LVARRAY_LAPACK_FORTRAN_MANGLE( name ) name
+// #else
+// #define LVARRAY_LAPACK_FORTRAN_MANGLE( name ) name ## _
+// #endif
+
+namespace LvArray
+{
+namespace dense
+{
+
+template< typename T >
+struct TypeDispatch
+{};
+
+template<>
+struct TypeDispatch< float >
+{
+  template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... ARGS >
+  static constexpr auto dispatch(
+    F_FLOAT && fFloat,
+    F_DOUBLE &&,
+    F_CFLOAT &&,
+    F_CDOUBLE &&,
+    ARGS && ... args )
+  {
+    return fFloat( std::forward< ARGS >( args ) ... );
+  }
+};
+
+template<>
+struct TypeDispatch< double >
+{
+  template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... ARGS >
+  static constexpr auto dispatch(
+    F_FLOAT &&,
+    F_DOUBLE && fDouble,
+    F_CFLOAT &&,
+    F_CDOUBLE &&,
+    ARGS && ... args )
+  {
+    return fDouble( std::forward< ARGS >( args ) ... );
+  }
+};
+
+template<>
+struct TypeDispatch< std::complex< float > >
+{
+  template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... 
ARGS >
+  static constexpr auto dispatch(
+    F_FLOAT &&,
+    F_DOUBLE &&,
+    F_CFLOAT && fCFloat,
+    F_CDOUBLE &&,
+    ARGS && ... args )
+  {
+    return fCFloat( std::forward< ARGS >( args ) ... );
+  }
+};
+
+template<>
+struct TypeDispatch< std::complex< double > >
+{
+  template< typename F_FLOAT, typename F_DOUBLE, typename F_CFLOAT, typename F_CDOUBLE, typename ... ARGS >
+  static constexpr auto dispatch(
+    F_FLOAT &&,
+    F_DOUBLE &&,
+    F_CFLOAT &&,
+    F_CDOUBLE && fCDouble,
+    ARGS && ... args )
+  {
+    return fCDouble( std::forward< ARGS >( args ) ... );
+  }
+};
+
+} // namespace dense
+} // namespace LvArray
diff --git a/src/dense/common.cpp b/src/dense/common.cpp
new file mode 100644
index 00000000..b1cab9fe
--- /dev/null
+++ b/src/dense/common.cpp
@@ -0,0 +1,21 @@
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+Operation transposeOp( Operation const op )
+{
+  switch( op )
+  {
+    case Operation::NO_OP: return Operation::TRANSPOSE;
+    case Operation::TRANSPOSE: return Operation::NO_OP;
+    case Operation::ADJOINT: LVARRAY_ERROR( "Not supported" );
+  }
+
+  return Operation::NO_OP;
+}
+
+} // namespace dense
+} // namespace LvArray
\ No newline at end of file
diff --git a/src/dense/common.hpp b/src/dense/common.hpp
new file mode 100644
index 00000000..376b589c
--- /dev/null
+++ b/src/dense/common.hpp
@@ -0,0 +1,323 @@
+#pragma once
+
+#include "../Array.hpp"
+#include "../ChaiBuffer.hpp"
+
+#include <complex>
+
+namespace LvArray
+{
+namespace dense
+{
+namespace internal
+{
+
+/**
+ * TODO make a complex type and add it to the main LvArray. Make a uniform way of interacting with various complex number implementations.
+ */
+template< typename T >
+struct RealVersion
+{
+  using Type = T;
+};
+
+/**
+ *
+ */
+template< typename T >
+struct RealVersion< std::complex< T > >
+{
+  using Type = T;
+};
+
+} // namespace internal
+
+/**
+ *
+ */
+enum class SymmetricMatrixStorageType
+{
+  UPPER_TRIANGULAR,
+  LOWER_TRIANGULAR,
+};
+
+enum class Operation
+{
+  NO_OP,
+  TRANSPOSE,
+  ADJOINT,
+};
+
+Operation transposeOp( Operation const op );
+
+/**
+ *
+ */
+template< typename T >
+using RealVersion = typename internal::RealVersion< T >::Type;
+
+/**
+ *
+ */
+template< typename T >
+static constexpr bool IsComplex = !std::is_same< RealVersion< T >, T >::value;
+
+/**
+ *
+ */
+template< typename T, typename U >
+static constexpr bool IsComplexT = IsComplex< T > && std::is_same< RealVersion< T >, U >::value;
+
+/**
+ *
+ */
+template< typename T >
+struct Matrix
+{
+  Matrix(
+    typeManipulation::CArray< std::ptrdiff_t, 2 > const & sizesIn,
+    typeManipulation::CArray< std::ptrdiff_t, 2 > const & stridesIn,
+    T * const dataIn ):
+    sizes{ sizesIn },
+    strides{ stridesIn },
+    data{ dataIn }
+  {
+    LVARRAY_ERROR_IF_LT( sizes[ 0 ], 0 );
+    LVARRAY_ERROR_IF_LT( sizes[ 1 ], 0 );
+    LVARRAY_ERROR_IF_LT( strides[ 0 ], 0 );
+    LVARRAY_ERROR_IF_LT( strides[ 1 ], 0 );
+  }
+
+  Matrix( T & value ):
+    sizes{ 1, 1 },
+    strides{ 1, 1 },
+    data{ &value }
+  {}
+
+  Matrix( Matrix< std::remove_const_t< T > > const & src ):
+    sizes{ src.sizes },
+    strides{ src.strides },
+    data{ src.data }
+  {}
+
+  bool isSquare() const
+  { return sizes[0] == sizes[1]; }
+
+  bool isColumnMajor() const
+  { return strides[ 0 ] == 1; }
+
+  bool isRowMajor() const
+  { return strides[ 1 ] == 1; }
+
+  bool isContiguous() const
+  { return isColumnMajor() || isRowMajor(); }
+
+  std::ptrdiff_t nRows() const
+  { return sizes[ 0 ]; }
+
+  std::ptrdiff_t nCols() const
+  { return sizes[ 1 ]; }
+
+  Matrix transpose() const
+  {
+    return Matrix( { sizes[ 1 ], 
sizes[ 0 ] }, { strides[ 1 ], strides[ 0 ] }, data ); + } + + typeManipulation::CArray< std::ptrdiff_t, 2 > sizes; + typeManipulation::CArray< std::ptrdiff_t, 2 > strides; + T * data; +}; + +template< typename T, typename PERM, typename INDEX_TYPE, template< typename > class BUFFER_TYPE > +Matrix< T > toMatrix( + Array< T, 2, PERM, INDEX_TYPE, BUFFER_TYPE > const & array, + MemorySpace const space, + bool const touch ) +{ + array.move( space, touch ); + return Matrix< T >( array.dimsArray(), array.stridesArray(), array.data() ); +} + +/** + * + */ +template< typename T > +struct Vector +{ + template< int USD, typename INDEX_TYPE > + Vector( ArraySlice< T, 1, USD, INDEX_TYPE > const & slice ): + size{ integerConversion< std::ptrdiff_t >( slice.size() ) }, + stride{ integerConversion< std::ptrdiff_t >( slice.stride( 0 ) ) }, + data{ slice.data() } + {} + + Vector( T & value ): + size{ 1 }, + stride{ 1 }, + data{ &value } + {} + + std::ptrdiff_t const size; + std::ptrdiff_t const stride; + T * const data; +}; + +/** + * + */ +template< typename T > +struct Workspace +{ + virtual ~Workspace() + {}; + + virtual Vector< T > work() = 0; + + virtual Vector< T > work2() = 0; + + virtual Vector< T > work3() = 0; + + virtual Vector< RealVersion< T > > rwork() = 0; + + virtual Vector< int > iwork() = 0; + + virtual void resizeWork( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; + + virtual void resizeWork2( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; + + virtual void resizeWork3( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; + + virtual void resizeRWork( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; + + virtual void resizeIWork( MemorySpace const space, std::ptrdiff_t const newSize ) = 0; +}; + +/** + * + */ +template< typename T, template< typename > class BUFFER_TYPE > +struct ArrayWorkspace : public Workspace< T > +{ + ArrayWorkspace() + { + m_work.setName( "ArrayWorkspace::m_work" ); + m_work2.setName( "ArrayWorkspace::m_work2" ); + m_work3.setName( "ArrayWorkspace::m_work3" ); + m_rwork.setName( "ArrayWorkspace::m_rwork" ); + m_iwork.setName( "ArrayWorkspace::m_iwork" ); + } + + virtual Vector< T > work() override + { return m_work.toSlice(); } + + virtual Vector< T > work2() override + { return m_work2.toSlice(); } + + virtual Vector< T > work3() override + { return m_work3.toSlice(); } + + virtual Vector< RealVersion< T > > rwork() override + { return m_rwork.toSlice(); } + + virtual Vector< int > iwork() override + { return m_iwork.toSlice(); } + + virtual void resizeWork( MemorySpace const space, std::ptrdiff_t const newSize ) override + { + m_work.resizeWithoutInitializationOrDestruction( space, newSize ); + } + + virtual void resizeWork2( MemorySpace const space, std::ptrdiff_t const newSize ) override + { + m_work2.resizeWithoutInitializationOrDestruction( space, newSize ); + } + + virtual void resizeWork3( MemorySpace const space, std::ptrdiff_t const newSize ) override + { + m_work3.resizeWithoutInitializationOrDestruction( space, newSize ); + } + + virtual void resizeRWork( MemorySpace const space, std::ptrdiff_t const newSize ) override + { + m_rwork.resizeWithoutInitializationOrDestruction( space, newSize ); + } + + virtual void resizeIWork( MemorySpace const space, std::ptrdiff_t const newSize ) override + { + m_iwork.resizeWithoutInitializationOrDestruction( space, newSize ); + } + +private: + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_work; + + Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > 
m_work2;
+
+  Array< T, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_work3;
+
+  Array< RealVersion< T >, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_rwork;
+
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > m_iwork;
+};
+
+/**
+ * A Workspace that provides no storage: it is passed to a backend query call to record the optimal workspace sizes.
+ */
+template< typename T >
+struct OptimalSizeCalculation : public Workspace< T >
+{
+  OptimalSizeCalculation()
+  {}
+
+  virtual Vector< T > work() override
+  { return m_work; }
+
+  virtual Vector< T > work2() override
+  { return m_work2; }
+
+  virtual Vector< T > work3() override
+  { return m_work3; }
+
+  virtual Vector< RealVersion< T > > rwork() override
+  { return m_rwork; }
+
+  virtual Vector< int > iwork() override
+  { return m_iwork; }
+
+  virtual void resizeWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override
+  { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); }
+
+  virtual void resizeWork2( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override
+  { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); }
+
+  virtual void resizeWork3( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override
+  { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); }
+
+  virtual void resizeRWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override
+  { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); }
+
+  virtual void resizeIWork( MemorySpace const LVARRAY_UNUSED_ARG( space ), std::ptrdiff_t const LVARRAY_UNUSED_ARG( newSize ) ) override
+  { LVARRAY_ERROR( "Not supported by OptimalSizeCalculation." ); }
+
+  std::ptrdiff_t optimalWorkSize() const
+  { return static_cast< std::ptrdiff_t >( m_work.real() ); }
+
+  std::ptrdiff_t optimalRWorkSize() const
+  { return static_cast< std::ptrdiff_t >( m_rwork ); }
+
+  std::ptrdiff_t optimalIWorkSize() const
+  { return m_iwork; }
+
+private:
+  T m_work { -1 };
+
+  T m_work2 { -1 };
+
+  T m_work3 { -1 };
+
+  RealVersion< T > m_rwork { -1 };
+
+  int m_iwork { -1 };
+};
+
+} // namespace dense
+} // namespace LvArray
\ No newline at end of file
diff --git a/src/dense/dense.hpp b/src/dense/dense.hpp
new file mode 100644
index 00000000..2fcf202d
--- /dev/null
+++ b/src/dense/dense.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+template< typename INTERFACE, typename MATRIX_A, typename MATRIX_B, typename MATRIX_C, typename SCALAR >
+void gemm(
+  Operation opA,
+  Operation opB,
+  SCALAR const alpha,
+  MATRIX_A const & Ain,
+  MATRIX_B const & Bin,
+  SCALAR const beta,
+  MATRIX_C const & Cin )
+{
+  Matrix< SCALAR const > A = toMatrix( Ain, INTERFACE::MEMORY_SPACE, false );
+  Matrix< SCALAR const > B = toMatrix( Bin, INTERFACE::MEMORY_SPACE, false );
+  Matrix< SCALAR > const C = toMatrix( Cin, INTERFACE::MEMORY_SPACE, true );
+
+  // Check the sizes
+  LVARRAY_ERROR_IF_NE( C.sizes[ 0 ], A.sizes[ 0 + (opA != Operation::NO_OP) ] );
+  LVARRAY_ERROR_IF_NE( C.sizes[ 1 ], B.sizes[ 1 - (opB != Operation::NO_OP) ] );
+  LVARRAY_ERROR_IF_NE( A.sizes[ 1 - (opA != Operation::NO_OP) ],
+                       B.sizes[ 0 + (opB != Operation::NO_OP) ] );
+
+  // Check that everything is contiguous
+  LVARRAY_ERROR_IF( !A.isContiguous(), "Matrix A must have a unit stride in one dimension." );
+  LVARRAY_ERROR_IF( !B.isContiguous(), "Matrix B must have a unit stride in one dimension." );
+  LVARRAY_ERROR_IF( !C.isColumnMajor(), "Matrix C must be column major."
); + + // TODO(corbett5): Don't think this will work for Hermitian matrices. + if( !A.isColumnMajor() ) + { + A = A.transpose(); + opA = transposeOp( opA ); + } + if( !B.isColumnMajor() ) + { + B = B.transpose(); + opB = transposeOp( opB ); + } + + INTERFACE::gemm( opA, opB, alpha, A, B, beta, C ); +} + + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/dense/eigenDecomposition.cpp b/src/dense/eigenDecomposition.cpp new file mode 100644 index 00000000..68236057 --- /dev/null +++ b/src/dense/eigenDecomposition.cpp @@ -0,0 +1,409 @@ +#include "eigenDecomposition.hpp" +#include "backendHelpers.hpp" + +extern "C" +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CHEEVR LVARRAY_LAPACK_FORTRAN_MANGLE( cheevr ) +void LVARRAY_CHEEVR( + char const * JOBZ, + char const * RANGE, + char const * UPLO, + LvArray::dense::DenseInt const * N, + std::complex< float > * A, + LvArray::dense::DenseInt const * LDA, + float const * VL, + float const * VU, + LvArray::dense::DenseInt const * IL, + LvArray::dense::DenseInt const * IU, + float const * ABSTOL, + LvArray::dense::DenseInt * M, + float * W, + std::complex< float > * Z, + LvArray::dense::DenseInt const * LDZ, + LvArray::dense::DenseInt * ISUPPZ, + std::complex< float > * WORK, + LvArray::dense::DenseInt const * LWORK, + float * RWORK, + LvArray::dense::DenseInt const * LRWORK, + LvArray::dense::DenseInt * IWORK, + LvArray::dense::DenseInt const * LIWORK, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZHEEVR LVARRAY_LAPACK_FORTRAN_MANGLE( zheevr ) +void LVARRAY_ZHEEVR( + char const * JOBZ, + char const * RANGE, + char const * UPLO, + LvArray::dense::DenseInt const * N, + std::complex< double > * A, + LvArray::dense::DenseInt const * LDA, + double const * VL, + double const * VU, + LvArray::dense::DenseInt const * IL, + LvArray::dense::DenseInt const * IU, + double const * ABSTOL, + LvArray::dense::DenseInt * M, + double * W, + std::complex< double > * Z, + LvArray::dense::DenseInt const * LDZ, + LvArray::dense::DenseInt * ISUPPZ, + std::complex< double > * WORK, + LvArray::dense::DenseInt const * LWORK, + double * RWORK, + LvArray::dense::DenseInt const * LRWORK, + LvArray::dense::DenseInt * IWORK, + LvArray::dense::DenseInt const * LIWORK, + LvArray::dense::DenseInt * INFO ); + + +} // extern "C" + +namespace LvArray +{ +namespace dense +{ +namespace internal +{ + +/** + * + */ +template< typename T > +DenseInt heevr( + BuiltInBackends const backend, + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< T > > const & A, + Vector< T > const & eigenvalues, + Matrix< std::complex< T > > const & eigenvectors, + Vector< DenseInt > const & support, + Workspace< std::complex< T > > & workspace, + SymmetricMatrixStorageType const storageType, + bool const compute ) +{ + LVARRAY_UNUSED_VARIABLE( backend ); + + LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." 
);
+
+  char const * const JOBZ = decompositionOptions.typeArg();
+  char const * const RANGE = decompositionOptions.rangeArg();
+  char const * const UPLO = getOption( storageType );
+  DenseInt const N = A.sizes[ 1 ];
+  DenseInt const LDA = A.strides[ 1 ];
+
+  T const VL = decompositionOptions.rangeMin;
+  T const VU = decompositionOptions.rangeMax;
+
+  DenseInt maxEigenvaluesToFind = N;
+  DenseInt const IL = decompositionOptions.indexMin;
+  DenseInt const IU = decompositionOptions.indexMax;
+  if( decompositionOptions.range == EigenDecompositionOptions::BY_INDEX )
+  {
+    LVARRAY_ERROR_IF_GT( IU, N );
+    maxEigenvaluesToFind = IU - IL + 1;
+  }
+
+  LVARRAY_ERROR_IF_LT( eigenvalues.size, maxEigenvaluesToFind );
+
+  // ABSTOL must have the matrix's real value type since it is passed to the backends by address.
+  T const ABSTOL = decompositionOptions.abstol;
+  DenseInt M = 0;
+
+  if( decompositionOptions.type == EigenDecompositionOptions::EIGENVALUES_AND_VECTORS )
+  {
+    LVARRAY_ERROR_IF_NE( eigenvectors.sizes[ 0 ], N );
+    LVARRAY_ERROR_IF_LT( eigenvectors.sizes[ 1 ], maxEigenvaluesToFind );
+  }
+
+  DenseInt const LDZ = std::max< DenseInt >( 1, eigenvectors.strides[ 1 ] );
+
+  if( decompositionOptions.range == EigenDecompositionOptions::ALL ||
+      ( decompositionOptions.range == EigenDecompositionOptions::BY_INDEX &&
+        maxEigenvaluesToFind == N ) )
+  {
+    LVARRAY_ERROR_IF_LT( support.size, 2 * maxEigenvaluesToFind );
+  }
+
+  DenseInt const LWORK = compute ? workspace.work().size : -1;
+  DenseInt const LRWORK = compute ? workspace.rwork().size : -1;
+  DenseInt const LIWORK = compute ? workspace.iwork().size : -1;
+
+  DenseInt INFO = 0;
+
+  // With C++17 a constexpr if would remove the need for these reinterpret_casts.
+  if( backend == BuiltInBackends::LAPACK )
+  {
+    if( std::is_same< T, float >::value )
+    {
+      LVARRAY_CHEEVR(
+        JOBZ,
+        RANGE,
+        UPLO,
+        &N,
+        reinterpret_cast< std::complex< float > * >( A.data ),
+        &LDA,
+        reinterpret_cast< float const * >( &VL ),
+        reinterpret_cast< float const * >( &VU ),
+        &IL,
+        &IU,
+        reinterpret_cast< float const * >( &ABSTOL ),
+        &M,
+        reinterpret_cast< float * >( eigenvalues.data ),
+        reinterpret_cast< std::complex< float > * >( eigenvectors.data ),
+        &LDZ,
+        support.data,
+        reinterpret_cast< std::complex< float > * >( workspace.work().data ),
+        &LWORK,
+        reinterpret_cast< float * >( workspace.rwork().data ),
+        &LRWORK,
+        workspace.iwork().data,
+        &LIWORK,
+        &INFO );
+    }
+    else
+    {
+      LVARRAY_ZHEEVR(
+        JOBZ,
+        RANGE,
+        UPLO,
+        &N,
+        reinterpret_cast< std::complex< double > * >( A.data ),
+        &LDA,
+        reinterpret_cast< double const * >( &VL ),
+        reinterpret_cast< double const * >( &VU ),
+        &IL,
+        &IU,
+        reinterpret_cast< double const * >( &ABSTOL ),
+        &M,
+        reinterpret_cast< double * >( eigenvalues.data ),
+        reinterpret_cast< std::complex< double > * >( eigenvectors.data ),
+        &LDZ,
+        support.data,
+        reinterpret_cast< std::complex< double > * >( workspace.work().data ),
+        &LWORK,
+        reinterpret_cast< double * >( workspace.rwork().data ),
+        &LRWORK,
+        workspace.iwork().data,
+        &LIWORK,
+        &INFO );
+    }
+  }
+#if defined( LVARRAY_USE_MAGMA )
+  else if( backend == BuiltInBackends::MAGMA )
+  {
+    if( std::is_same< T, float >::value )
+    {
+      magma_cheevr(
+        magma_vec_const( *JOBZ ),
+        magma_range_const( *RANGE ),
+        magma_uplo_const( *UPLO ),
+        N,
+        reinterpret_cast< magmaFloatComplex * >( A.data ),
+        LDA,
+        VL,
+        VU,
+        IL,
+        IU,
+        ABSTOL,
+        &M,
+        reinterpret_cast< float * >( eigenvalues.data ),
+        reinterpret_cast< magmaFloatComplex * >( eigenvectors.data ),
+        LDZ,
+        support.data,
+        reinterpret_cast< magmaFloatComplex * >( workspace.work().data ),
+        LWORK,
+        reinterpret_cast< float * >(
workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + else + { + magma_zheevr( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< double * >( eigenvalues.data ), + reinterpret_cast< magmaDoubleComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaDoubleComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< double * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + } + else if( backend == BuiltInBackends::MAGMA_GPU ) + { + int LDWA = N; + int LDWZ = N; + + if( compute ) + { + workspace.resizeWork2( MemorySpace::host, LDWA * N ); + workspace.resizeWork3( MemorySpace::host, LDWZ * maxEigenvaluesToFind ); + } + + if( std::is_same< T, float >::value ) + { + magma_cheevr_gpu( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< float * >( eigenvalues.data ), + reinterpret_cast< magmaFloatComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaFloatComplex * >( workspace.work2().data ), + LDWA, + reinterpret_cast< magmaFloatComplex * >( workspace.work3().data ), + LDWZ, + reinterpret_cast< magmaFloatComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< float * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + else + { + magma_zheevr_gpu( + magma_vec_const( *JOBZ ), + magma_range_const( *RANGE ), + magma_uplo_const( *UPLO ), + N, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + VL, + VU, + IL, + IU, + ABSTOL, + &M, + reinterpret_cast< double * >( eigenvalues.data ), + reinterpret_cast< magmaDoubleComplex * >( eigenvectors.data ), + LDZ, + support.data, + reinterpret_cast< magmaDoubleComplex * >( workspace.work2().data ), + LDWA, + reinterpret_cast< magmaDoubleComplex * >( workspace.work3().data ), + LDWZ, + reinterpret_cast< magmaDoubleComplex * >( workspace.work().data ), + LWORK, + reinterpret_cast< double * >( workspace.rwork().data ), + LRWORK, + workspace.iwork().data, + LIWORK, + &INFO ); + } + } +#endif + else + { + LVARRAY_ERROR( "Unknown built in backend: " << static_cast< int >( backend ) ); + } + + LVARRAY_ERROR_IF_NE( INFO, 0 ); + + return M; +} + +} // namespace internal + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template< typename T > +DenseInt heevr( + BuiltInBackends const backend, + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< T > > const & A, + Vector< T > const & eigenvalues, + Matrix< std::complex< T > > const & eigenvectors, + Vector< DenseInt > const & support, + Workspace< std::complex< T > > & workspace, + SymmetricMatrixStorageType const storageType ) +{ + // TODO(corbett5): I think we can support row major by simply complex-conjugating all entries. + // I'm not sure exactly how this would work for the eigenvectors though. + LVARRAY_ERROR_IF( !A.isColumnMajor(), "Row major is not yet supported." ); + LVARRAY_ERROR_IF( !eigenvectors.isColumnMajor(), "Row major is not yet supported." 
); + + bool const reallocateWork = workspace.work().size < 2 * A.sizes[ 0 ]; + bool const reallocateRWork = workspace.rwork().size < 24 * A.sizes[ 0 ]; + bool const reallocateIWork = workspace.iwork().size < 10 * A.sizes[ 0 ]; + + if( reallocateWork || reallocateRWork || reallocateIWork ) + { + OptimalSizeCalculation< std::complex< T > > optimalSizes; + internal::heevr( backend, decompositionOptions, A, eigenvalues, eigenvectors, support, optimalSizes, storageType, false ); + + if( reallocateWork ) + { + workspace.resizeWork( MemorySpace::host, optimalSizes.optimalWorkSize() ); + } + + if( reallocateRWork ) + { + workspace.resizeRWork( MemorySpace::host, optimalSizes.optimalRWorkSize() ); + } + + if( reallocateIWork ) + { + workspace.resizeIWork( MemorySpace::host, optimalSizes.optimalIWorkSize() ); + } + } + + return internal::heevr( backend, decompositionOptions, A, eigenvalues, eigenvectors, support, workspace, storageType, true ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// explicit instantiations. +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template DenseInt heevr< float >( + BuiltInBackends const backend, + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< float > > const & A, + Vector< float > const & eigenvalues, + Matrix< std::complex< float > > const & eigenvectors, + Vector< DenseInt > const & support, + Workspace< std::complex< float > > & workspace, + SymmetricMatrixStorageType const storageType ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template DenseInt heevr< double >( + BuiltInBackends const backend, + EigenDecompositionOptions const decompositionOptions, + Matrix< std::complex< double > > const & A, + Vector< double > const & eigenvalues, + Matrix< std::complex< double > > const & eigenvectors, + Vector< DenseInt > const & support, + Workspace< std::complex< double > > & workspace, + SymmetricMatrixStorageType const storageType ); + +} // namespace dense +} // namespace LvArray diff --git a/src/dense/eigenDecomposition.hpp b/src/dense/eigenDecomposition.hpp new file mode 100644 index 00000000..5e7f3819 --- /dev/null +++ b/src/dense/eigenDecomposition.hpp @@ -0,0 +1,220 @@ +#pragma once + +#include "common.hpp" + +namespace LvArray +{ +namespace dense +{ + +/** + * + */ +struct EigenDecompositionOptions +{ + /** + * + */ + enum Type + { + EIGENVALUES, + EIGENVALUES_AND_VECTORS, + }; + + /** + * + */ + enum Range + { + ALL, + IN_INTERVAL, + BY_INDEX, + }; + + /** + * + */ + EigenDecompositionOptions( Type const typeP, double const abstolP=0 ): + type{ typeP }, + abstol{ abstolP } + { + LVARRAY_ERROR_IF( type != EIGENVALUES && type != EIGENVALUES_AND_VECTORS, "Wrong type provided: type = " << type ); + } + + /** + * + */ + EigenDecompositionOptions( + Type const typeP, + double const rangeMinP, + double const rangeMaxP, + double const abstolP ): + type{ typeP }, + range{ IN_INTERVAL }, + rangeMin{ rangeMinP }, + rangeMax{ rangeMaxP }, + abstol{ abstolP } + { + LVARRAY_ERROR_IF( type != EIGENVALUES && type != EIGENVALUES_AND_VECTORS, "Wrong type provided: type = " << type ); + LVARRAY_ERROR_IF_GE( rangeMin, rangeMax ); + } + + /** + * TODO: Not sure how I feel about the one based indexing for eigenvalues by index. 
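+ * The bounds follow the LAPACK IL/IU convention: one-based and inclusive on
+ * both ends. For example, a usage sketch requesting the three smallest
+ * eigenvalues (the explicit DenseInt casts avoid any ambiguity with the
+ * interval-based constructor above):
+ * @code
+ * EigenDecompositionOptions const opts( EigenDecompositionOptions::EIGENVALUES,
+ *                                       DenseInt( 1 ),
+ *                                       DenseInt( 3 ),
+ *                                       0.0 );
+ * @endcode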
+ */
+  EigenDecompositionOptions(
+    Type const typeP,
+    DenseInt const indexMinP,
+    DenseInt const indexMaxP,
+    double const abstolP ):
+    type{ typeP },
+    range{ BY_INDEX },
+    indexMin{ indexMinP },
+    indexMax{ indexMaxP },
+    abstol{ abstolP }
+  {
+    LVARRAY_ERROR_IF( type != EIGENVALUES && type != EIGENVALUES_AND_VECTORS, "Wrong type provided: type = " << type );
+    LVARRAY_ERROR_IF_LT( indexMin, 1 );
+    LVARRAY_ERROR_IF_GT( indexMin, indexMax );
+  }
+
+  /**
+   * @return The LAPACK JOBZ argument: "N" for eigenvalues only, "V" for eigenvalues and eigenvectors.
+   */
+  char const * typeArg() const
+  {
+    static constexpr char const * const eigenvalueString = "N";
+    static constexpr char const * const eigenvectorString = "V";
+
+    return type == EIGENVALUES ? eigenvalueString : eigenvectorString;
+  }
+
+  /**
+   * @return The LAPACK RANGE argument: "A" for ALL, "V" for IN_INTERVAL, "I" for BY_INDEX.
+   */
+  char const * rangeArg() const
+  {
+    static constexpr char const * const allString = "A";
+    static constexpr char const * const intervalString = "V";
+    static constexpr char const * const indexString = "I";
+
+    if( range == ALL )
+    { return allString; }
+
+    return range == IN_INTERVAL ? intervalString : indexString;
+  }
+
+  /// The type of the decomposition.
+  Type const type;
+
+  /// The range of eigenvalues to compute.
+  Range const range = ALL;
+
+  /// The lower bound of the search interval, used when range == IN_INTERVAL.
+  double const rangeMin = std::numeric_limits< double >::max();
+
+  /// The upper bound of the search interval, used when range == IN_INTERVAL.
+  double const rangeMax = std::numeric_limits< double >::lowest();
+
+  /// The one-based index of the smallest eigenvalue to compute, used when range == BY_INDEX.
+  DenseInt const indexMin = std::numeric_limits< DenseInt >::max();
+
+  /// The one-based index of the largest eigenvalue to compute, used when range == BY_INDEX.
+  DenseInt const indexMax = std::numeric_limits< DenseInt >::lowest();
+
+  /// The absolute error tolerance to which each eigenvalue is computed.
+  double const abstol = 0;
+};
+
+
+/**
+ * @brief Compute selected eigenvalues, and optionally eigenvectors, of the Hermitian matrix @p A using (c, z)heevr.
+ */
+template< typename T >
+DenseInt heevr(
+  BuiltInBackends const backend,
+  EigenDecompositionOptions const decompositionOptions,
+  Matrix< std::complex< T > > const & A,
+  Vector< T > const & eigenValues,
+  Matrix< std::complex< T > > const & eigenVectors,
+  Vector< DenseInt > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType );
+
+/**
+ * @brief Overload of heevr that wraps the ArraySlice arguments in Matrix and Vector views.
+ */
+template< typename BACK_END, typename T, int USD_A, int USD_V, typename INDEX_TYPE >
+DenseInt heevr(
+  BACK_END && backend,
+  EigenDecompositionOptions const decompositionOptions,
+  ArraySlice< std::complex< T >, 2, USD_A, INDEX_TYPE > const & A,
+  ArraySlice< T, 1, 0, INDEX_TYPE > const & eigenValues,
+  ArraySlice< std::complex< T >, 2, USD_V, INDEX_TYPE > const & eigenVectors,
+  ArraySlice< DenseInt, 1, 0, INDEX_TYPE > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType )
+{
+  Matrix< std::complex< T > > AMatrix( A );
+  Vector< T > eigenValuesVector( eigenValues );
+  Matrix< std::complex< T > > eigenVectorsMatrix( eigenVectors );
+  Vector< DenseInt > supportVector( support );
+
+  return heevr(
+    std::forward< BACK_END >( backend ),
+    decompositionOptions,
+    AMatrix,
+    eigenValuesVector,
+    eigenVectorsMatrix,
+    supportVector,
+    workspace,
+    storageType );
+}
+
+/**
+ * @brief Overload of heevr that moves the ArrayView arguments to the backend's memory space before dispatching.
+ */
+template< typename BACK_END, typename T, int USD_A, int USD_V, typename INDEX_TYPE, template< typename > class BUFFER_TYPE >
+DenseInt heevr(
+  BACK_END && backend,
+  EigenDecompositionOptions const decompositionOptions,
+  ArrayView< std::complex< T >, 2, USD_A, INDEX_TYPE, BUFFER_TYPE > const & A,
+  ArrayView< T, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & eigenValues,
+  ArrayView< std::complex< T >, 2, USD_V, INDEX_TYPE, BUFFER_TYPE > const & eigenVectors,
+  ArrayView< DenseInt, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & support,
+  Workspace< std::complex< T > > & workspace,
+  SymmetricMatrixStorageType const storageType )
+{
+  MemorySpace const space = getSpaceForBackend( backend );
+
+  // The A matrix isn't touched
because it is destroyed. + A.move( space, false ); + eigenVectors.move( space, true ); + +#if defined( LVARRAY_USE_MAGMA ) + // MAGMA wants the eigenvalues and support on the CPU. + if( backend == BuiltInBackends::MAGMA_GPU ) + { + eigenValues.move( MemorySpace::host, true ); + support.move( MemorySpace::host, true ); + } + else +#endif + { + eigenValues.move( space, true ); + support.move( space, true ); + } + + return heevr( + std::forward< BACK_END >( backend ), + decompositionOptions, + A.toSlice(), + eigenValues.toSlice(), + eigenVectors.toSlice(), + support.toSlice(), + workspace, + storageType ); +} + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/dense/linearSolve.cpp b/src/dense/linearSolve.cpp new file mode 100644 index 00000000..33d5f503 --- /dev/null +++ b/src/dense/linearSolve.cpp @@ -0,0 +1,278 @@ +#include "linearSolve.hpp" +#include "backendHelpers.hpp" + +extern "C" +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_SGESV LVARRAY_LAPACK_FORTRAN_MANGLE( sgesv ) +void LVARRAY_SGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + float * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + float * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_DGESV LVARRAY_LAPACK_FORTRAN_MANGLE( dgesv ) +void LVARRAY_DGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + double * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + double * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_CGESV LVARRAY_LAPACK_FORTRAN_MANGLE( cgesv ) +void LVARRAY_CGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + std::complex< float > * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + std::complex< float > * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#define LVARRAY_ZGESV LVARRAY_LAPACK_FORTRAN_MANGLE( zgesv ) +void LVARRAY_ZGESV( + LvArray::dense::DenseInt const * N, + LvArray::dense::DenseInt const * NRHS, + std::complex< double > * A, + LvArray::dense::DenseInt const * LDA, + LvArray::dense::DenseInt * IPIV, + std::complex< double > * B, + LvArray::dense::DenseInt const * LDB, + LvArray::dense::DenseInt * INFO ); + +} // extern "C" + +namespace LvArray +{ +namespace dense +{ + +template< typename T > +void gesv( + BuiltInBackends const backend, + Matrix< T > const & A, + Matrix< T > const & B, + Vector< DenseInt > const & pivots ) +{ + LVARRAY_ERROR_IF( !A.isSquare(), "The matrix A must be square." ); + LVARRAY_ERROR_IF( !A.isColumnMajor(), "The matrix A must be column major." ); + + LVARRAY_ERROR_IF_NE( A.sizes[ 0 ], B.sizes[ 0 ] ); + LVARRAY_ERROR_IF( !B.isColumnMajor(), "The matrix B must be column major." 
); + + LVARRAY_ERROR_IF_NE( pivots.size, A.sizes[ 0 ] ); + + DenseInt const N = A.sizes[ 1 ]; + DenseInt const NRHS = B.sizes[ 1 ]; + DenseInt const LDA = A.strides[ 1 ]; + DenseInt const LDB = B.strides[ 1 ]; + DenseInt INFO = 0; + + if( backend == BuiltInBackends::LAPACK ) + { + if( std::is_same< T, float >::value ) + { + LVARRAY_SGESV( + &N, + &NRHS, + reinterpret_cast< float * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< float * >( B.data ), + &LDB, + &INFO ); + } + if( std::is_same< T, double >::value ) + { + LVARRAY_DGESV( + &N, + &NRHS, + reinterpret_cast< double * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< double * >( B.data ), + &LDB, + &INFO ); + } + if( IsComplexT< T, float > ) + { + LVARRAY_CGESV( + &N, + &NRHS, + reinterpret_cast< std::complex< float > * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< std::complex< float > * >( B.data ), + &LDB, + &INFO ); + } + if( IsComplexT< T, double > ) + { + LVARRAY_ZGESV( + &N, + &NRHS, + reinterpret_cast< std::complex< double > * >( A.data ), + &LDA, + pivots.data, + reinterpret_cast< std::complex< double > * >( B.data ), + &LDB, + &INFO ); + } + } +#if defined( LVARRAY_USE_MAGMA ) + else if( backend == BuiltInBackends::MAGMA ) + { + if( std::is_same< T, float >::value ) + { + magma_sgesv( + N, + NRHS, + reinterpret_cast< float * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< float * >( B.data ), + LDB, + &INFO ); + } + if( std::is_same< T, double >::value ) + { + magma_dgesv( + N, + NRHS, + reinterpret_cast< double * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< double * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, float > ) + { + magma_cgesv( + N, + NRHS, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaFloatComplex * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, double > ) + { + magma_zgesv( + N, + NRHS, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaDoubleComplex * >( B.data ), + LDB, + &INFO ); + } + } + else if( backend == BuiltInBackends::MAGMA_GPU ) + { + if( std::is_same< T, float >::value ) + { + magma_sgesv_gpu( + N, + NRHS, + reinterpret_cast< float * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< float * >( B.data ), + LDB, + &INFO ); + } + if( std::is_same< T, double >::value ) + { + magma_dgesv_gpu( + N, + NRHS, + reinterpret_cast< double * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< double * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, float > ) + { + magma_cgesv_gpu( + N, + NRHS, + reinterpret_cast< magmaFloatComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaFloatComplex * >( B.data ), + LDB, + &INFO ); + } + if( IsComplexT< T, double > ) + { + magma_zgesv_gpu( + N, + NRHS, + reinterpret_cast< magmaDoubleComplex * >( A.data ), + LDA, + pivots.data, + reinterpret_cast< magmaDoubleComplex * >( B.data ), + LDB, + &INFO ); + } + } +#endif + else + { + LVARRAY_ERROR( "Unknown built in backend: " << static_cast< int >( backend ) ); + } + + LVARRAY_ERROR_IF( INFO < 0, "The " << -INFO << "-th argument had an illegal value." ); + LVARRAY_ERROR_IF( INFO > 0, "The factorization has been completed but U( " << INFO - 1 << ", " << INFO - 1 << + " ) is exactly zero so the solution could not be computed." 
);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template void gesv< float >(
+  BuiltInBackends const backend,
+  Matrix< float > const & A,
+  Matrix< float > const & B,
+  Vector< DenseInt > const & pivots );
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template void gesv< double >(
+  BuiltInBackends const backend,
+  Matrix< double > const & A,
+  Matrix< double > const & B,
+  Vector< DenseInt > const & pivots );
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template void gesv< std::complex< float > >(
+  BuiltInBackends const backend,
+  Matrix< std::complex< float > > const & A,
+  Matrix< std::complex< float > > const & B,
+  Vector< DenseInt > const & pivots );
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template void gesv< std::complex< double > >(
+  BuiltInBackends const backend,
+  Matrix< std::complex< double > > const & A,
+  Matrix< std::complex< double > > const & B,
+  Vector< DenseInt > const & pivots );
+
+
+} // namespace dense
+} // namespace LvArray
diff --git a/src/dense/linearSolve.hpp b/src/dense/linearSolve.hpp
new file mode 100644
index 00000000..3efe7719
--- /dev/null
+++ b/src/dense/linearSolve.hpp
@@ -0,0 +1,85 @@
+#pragma once
+
+#include "common.hpp"
+
+namespace LvArray
+{
+namespace dense
+{
+
+/**
+ * @brief Solves the matrix equation A X = B for X using (s, d, c, z)gesv.
+ *
+ * @tparam T The type of values in the matrices. Must be one of float, double, std::complex< float >, or std::complex< double >.
+ * @param backend The built in backend that implements (s, d, c, z)gesv.
+ * @param A The input matrix, which is overwritten with L and U from the LU decomposition.
+ * @param B The input right hand side, which is overwritten with the solution X.
+ * @param pivots The pivot indices from the row interchanges used when factoring A.
+ *
+ * @note When using @c MAGMA_GPU as the backend both @p A and @p B should be on the GPU while @p pivots
+ *   remains on the host.
+ */
+template< typename T >
+void gesv(
+  BuiltInBackends const backend,
+  Matrix< T > const & A,
+  Matrix< T > const & B,
+  Vector< DenseInt > const & pivots );
+
+/**
+ * @brief Overload of gesv that wraps the ArraySlice arguments in Matrix and Vector views.
+ */
+template< typename BACK_END, typename T, int USD_A, int NDIM_B, int USD_B, typename INDEX_TYPE >
+void gesv(
+  BACK_END && backend,
+  ArraySlice< T, 2, USD_A, INDEX_TYPE > const & A,
+  ArraySlice< T, NDIM_B, USD_B, INDEX_TYPE > const & B,
+  ArraySlice< DenseInt, 1, 0, INDEX_TYPE > const & pivots )
+{
+  Matrix< T > AMatrix( A );
+  Matrix< T > BMatrix( B );
+  Vector< DenseInt > pivotsVector( pivots );
+
+  gesv(
+    std::forward< BACK_END >( backend ),
+    AMatrix,
+    BMatrix,
+    pivotsVector );
+}
+
+/**
+ * @brief Overload of gesv that moves the ArrayView arguments to the backend's memory space before dispatching.
+ */
+template< typename BACK_END, typename T, int USD_A, int NDIM_B, int USD_B, typename INDEX_TYPE, template< typename > class BUFFER_TYPE >
+void gesv(
+  BACK_END && backend,
+  ArrayView< T, 2, USD_A, INDEX_TYPE, BUFFER_TYPE > const & A,
+  ArrayView< T, NDIM_B, USD_B, INDEX_TYPE, BUFFER_TYPE > const & B,
+  ArrayView< DenseInt, 1, 0, INDEX_TYPE, BUFFER_TYPE > const & pivots )
+{
+  // TODO(corbett5): Unclear about the touch here since A is destroyed but the LU decomposition may still be useful.
+  MemorySpace const space = getSpaceForBackend( backend );
+  A.move( space, true );
+  B.move( space, true );
+
+#if defined( LVARRAY_USE_MAGMA )
+  // MAGMA wants the pivots on the CPU.
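+  // The GPU-interface routines magma_(s, d, c, z)gesv_gpu take device pointers
+  // for A and B but a host pointer for the pivot indices, so only the pivots
+  // stay in host memory for the MAGMA_GPU backend.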
+ if( backend == BuiltInBackends::MAGMA_GPU ) + { + pivots.move( MemorySpace::host, true ); + } + else +#endif + { + pivots.move( space, true ); + } + + return gesv( + std::forward< BACK_END >( backend ), + A.toSlice(), + B.toSlice(), + pivots.toSlice() ); +} + +} // namespace dense +} // namespace LvArray \ No newline at end of file diff --git a/src/math.hpp b/src/math.hpp index f832e0fa..d45f68f3 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -45,7 +45,7 @@ namespace internal * @return @p u converted to @tparam T. */ template< typename T, typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T convert( T const, U const u ) { return u; } @@ -55,7 +55,7 @@ T convert( T const, U const u ) * @return The number of values stored in @tparam T, by default this is 1. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr int numValues( T const ) { return 1; } @@ -76,7 +76,7 @@ struct SingleType * @param x The value to return. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr SingleType< T > getFirst( T const x ) { return x; } @@ -86,7 +86,7 @@ SingleType< T > getFirst( T const x ) * @param x The value to return. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr SingleType< T > getSecond( T const x ) { return x; } @@ -110,7 +110,7 @@ T lessThan( T const x, T const y ) * @return @p u converted to @c __half. */ template< typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr __half convert( __half const, U const u ) { return __float2half_rn( u ); } @@ -122,7 +122,7 @@ __half convert( __half const, U const u ) * @return A @c __half2 with both halves having value @p u converted to @c __half. */ template< typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr __half2 convert( __half2 const, U const u ) { return __float2half2_rn( u ); } @@ -131,10 +131,10 @@ __half2 convert( __half2 const, U const u ) * @param u The value to convert. * @return A @c __half2 with both halves having value @p u. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE __half2 convert( __half2 const, __half const u ) { -#if defined( __CUDA_ARCH__ ) +#if defined( LVARRAY_DEVICE_COMPILE ) return __half2half2( u ); #else return __float2half2_rn( u ); @@ -151,7 +151,7 @@ __half2 convert( __half2 const, __half const u ) * @return A @c __half2 containing @p u as the first value and @p v as the second. */ template< typename U, typename V > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr __half2 convert( __half2 const, U const u, V const v ) { return __floats2half2_rn( u, v ); } @@ -161,10 +161,10 @@ __half2 convert( __half2 const, U const u, V const v ) * @param v The second value to convert. * @return A @c __half2 containing @p u as the first value and @p v as the second. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE __half2 convert( __half2 const, __half const u, __half const v ) { -#if defined( __CUDA_ARCH__ ) +#if defined( LVARRAY_DEVICE_COMPILE ) return __halves2half2( u, v ); #else return __floats2half2_rn( u, v ); @@ -175,7 +175,7 @@ __half2 convert( __half2 const, __half const u, __half const v ) * @brief Return the number of values stored in a @c __half2, which is 2. 
* @return The number of values stored in a @c __half2, which is 2. */ -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr int numValues( __half2 const & ) { return 2; } @@ -193,7 +193,7 @@ struct SingleType< __half2 > * @return The fist @c __half in @p x. * @param x The value to query. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half getFirst( __half2 const x ) { return __low2half( x ); } @@ -201,16 +201,19 @@ __half getFirst( __half2 const x ) * @return The second @c __half in @p x. * @param x The value to query. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half getSecond( __half2 const x ) { return __high2half( x ); } +#endif + +#if defined( LVARRAY_USE_DEVICE ) /** * @return 1 if @p x is less than @p y, else 0. * @param x The first value. * @param y The second value. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half lessThan( __half const x, __half const y ) { return __hlt( x, y ); } @@ -219,10 +222,9 @@ __half lessThan( __half const x, __half const y ) * @param x The first value. * @param y The second value. */ -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 lessThan( __half2 const x, __half2 const y ) { return __hlt2( x, y ); } - #endif } // namespace internal @@ -238,7 +240,7 @@ __half2 lessThan( __half2 const x, __half2 const y ) * @return The number of values stored in type @tparam T. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr int numValues() { return internal::numValues( T() ); } @@ -258,7 +260,7 @@ using SingleType = typename internal::SingleType< T >::type; * @return @p u converted to @tparam T. */ template< typename T, typename U > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T convert( U const u ) { return internal::convert( T(), u ); } @@ -273,7 +275,7 @@ T convert( U const u ) * @return @p u, @p v converted to @tparam T. */ template< typename T, typename U, typename V > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T convert( U const u, V const v ) { return internal::convert( T(), u, v ); } @@ -284,7 +286,7 @@ T convert( U const u, V const v ) * @note If @code numValues< T >() == 1 @endcode then @p x is returned. */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE SingleType< T > getFirst( T const x ) { return internal::getFirst( x ); } @@ -295,7 +297,7 @@ SingleType< T > getFirst( T const x ) * @note If @code numValues< T >() == 1 @endcode then @p x is returned. */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE SingleType< T > getSecond( T const x ) { return internal::getSecond( x ); } @@ -306,35 +308,37 @@ SingleType< T > getSecond( T const x ) * @param b The second number. 
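 * @note On device this forwards to @c ::max, on the host to @c std::max.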
*/ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr std::enable_if_t< std::is_arithmetic< T >::value, T > max( T const a, T const b ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::max( a, b ); #else return std::max( a, b ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc max( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half max( __half const a, __half const b ) { -#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmax( a, b ); +#elif defined(LVARRAY_USE_HIP) + return __hgt( a, b ) ? a : b; #else return a > b ? a : b; #endif } /// @copydoc max( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 max( __half2 const a, __half2 const b ) { -#if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +#if defined(LVARRAY_USE_CUDA) && CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) return __hmax2( a, b ); #else __half2 const aFactor = __hge2( a, b ); @@ -353,11 +357,11 @@ __half2 max( __half2 const a, __half2 const b ) * @param b The second number. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr std::enable_if_t< std::is_arithmetic< T >::value, T > min( T const a, T const b ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::min( a, b ); #else return std::min( a, b ); @@ -367,7 +371,8 @@ min( T const a, T const b ) #if defined( LVARRAY_USE_CUDA ) /// @copydoc min( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE +LVARRAY_FORCE_INLINE __half min( __half const a, __half const b ) { #if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) @@ -378,7 +383,8 @@ __half min( __half const a, __half const b ) } /// @copydoc min( T, T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE +LVARRAY_FORCE_INLINE __half2 min( __half2 const a, __half2 const b ) { #if CUDART_VERSION > 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) @@ -398,20 +404,20 @@ __half2 min( __half2 const a, __half2 const b ) * @note This set of overloads is valid for any numeric type. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T abs( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::abs( x ); #else return std::abs( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc abs( T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half abs( __half const x ) { #if CUDART_VERSION > 11000 @@ -422,7 +428,7 @@ __half abs( __half const x ) } /// @copydoc abs( T ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 abs( __half2 const x ) { #if CUDART_VERSION > 11000 @@ -440,7 +446,7 @@ __half2 abs( __half2 const x ) * @param x The value to square. */ template< typename T > -LVARRAY_HOST_DEVICE inline constexpr +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE constexpr T square( T const x ) { return x * x; } @@ -457,10 +463,10 @@ T square( T const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is @c double. 
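 * For example, @c sqrt( 2 ) calls the templated overload and returns
 * @c std::sqrt( 2 ) as a @c double on the host.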
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float sqrt( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sqrtf( x ); #else return std::sqrt( x ); @@ -469,25 +475,25 @@ float sqrt( float const x ) /// @copydoc sqrt( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double sqrt( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sqrt( double( x ) ); #else return std::sqrt( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc sqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half sqrt( __half const x ) { return ::hsqrt( x ); } /// @copydoc sqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 sqrt( __half2 const x ) { return ::h2sqrt( x ); } @@ -499,10 +505,10 @@ __half2 sqrt( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float invSqrt( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::rsqrtf( x ); #else return 1 / std::sqrt( x ); @@ -511,25 +517,25 @@ float invSqrt( float const x ) /// @copydoc invSqrt( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double invSqrt( T const x ) { -#if defined( __CUDA_ARCH__ ) +#if defined( LVARRAY_DEVICE_COMPILE ) return ::rsqrt( double( x ) ); #else return 1 / std::sqrt( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc invSqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half invSqrt( __half const x ) { return ::hrsqrt( x ); } /// @copydoc invSqrt( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 invSqrt( __half2 const x ) { return ::h2rsqrt( x ); } @@ -548,10 +554,10 @@ __half2 invSqrt( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float sin( float const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sinf( theta ); #else return std::sin( theta ); @@ -560,25 +566,25 @@ float sin( float const theta ) /// @copydoc sin( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double sin( T const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::sin( double( theta ) ); #else return std::sin( theta ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc sin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half sin( __half const theta ) { return ::hsin( theta ); } /// @copydoc sin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 sin( __half2 const theta ) { return ::h2sin( theta ); } @@ -590,10 +596,10 @@ __half2 sin( __half2 const theta ) * @note This set of overloads is valid for any numeric type. If @p theta is not a float * it is converted to a double and the return type is double. 
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float cos( float const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::cosf( theta ); #else return std::cos( theta ); @@ -602,25 +608,25 @@ float cos( float const theta ) /// @copydoc cos( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double cos( T const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::cos( double( theta ) ); #else return std::cos( theta ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc cos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half cos( __half const theta ) { return ::hcos( theta ); } /// @copydoc cos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 cos( __half2 const theta ) { return ::h2cos( theta ); } @@ -632,11 +638,15 @@ __half2 cos( __half2 const theta ) * @param sinTheta The sine of @p theta. * @param cosTheta The cosine of @p theta. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE void sincos( float const theta, float & sinTheta, float & cosTheta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) + #if defined(LVARRAY_USE_CUDA) ::sincos( theta, &sinTheta, &cosTheta ); + #elif defined(LVARRAY_USE_HIP) + ::sincosf( theta, &sinTheta, &cosTheta ); + #endif #else sinTheta = std::sin( theta ); cosTheta = std::cos( theta ); @@ -645,11 +655,11 @@ void sincos( float const theta, float & sinTheta, float & cosTheta ) /// @copydoc sincos( float, float &, float & ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE void sincos( double const theta, double & sinTheta, double & cosTheta ) { -#if defined(__CUDA_ARCH__) - ::sincos( theta, &sinTheta, &cosTheta ); +#if defined(LVARRAY_DEVICE_COMPILE) + ::sincos( theta, &sinTheta, &cosTheta ); // hip and cuda versions both use double #else sinTheta = std::sin( theta ); cosTheta = std::cos( theta ); @@ -658,10 +668,10 @@ void sincos( double const theta, double & sinTheta, double & cosTheta ) /// @copydoc sincos( float, float &, float & ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE void sincos( T const theta, double & sinTheta, double & cosTheta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) double s, c; ::sincos( theta, &s, &c ); sinTheta = s; @@ -672,10 +682,10 @@ void sincos( T const theta, double & sinTheta, double & cosTheta ) #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc sincos( float, float &, float & ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE void sincos( __half const theta, __half & sinTheta, __half & cosTheta ) { sinTheta = ::hsin( theta ); @@ -683,7 +693,7 @@ void sincos( __half const theta, __half & sinTheta, __half & cosTheta ) } /// @copydoc sincos( float, float &, float & ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE void sincos( __half2 const theta, __half2 & sinTheta, __half2 & cosTheta ) { sinTheta = ::h2sin( theta ); @@ -698,10 +708,10 @@ void sincos( __half2 const theta, __half2 & sinTheta, __half2 & cosTheta ) * @note This set of overloads is valid for any numeric type. If @p theta is not a float * it is converted to a double and the return type is double. 
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float tan( float const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::tanf( theta ); #else return std::tan( theta ); @@ -710,20 +720,20 @@ float tan( float const theta ) /// @copydoc tan( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double tan( T const theta ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::tan( double( theta ) ); #else return std::tan( theta ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc tan( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half tan( __half const theta ) { __half s, c; @@ -732,7 +742,7 @@ __half tan( __half const theta ) } /// @copydoc tan( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 tan( __half2 const theta ) { __half2 s, c; @@ -764,7 +774,7 @@ namespace internal * @note Modified from https://developer.download.nvidia.com/cg/asin.html */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE T asinImpl( T const x ) { T const negate = lessThan( x, math::convert< T >( 0 ) ); @@ -786,7 +796,7 @@ T asinImpl( T const x ) * @note Modified from https://developer.download.nvidia.com/cg/acos.html */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE T acosImpl( T const x ) { T const negate = lessThan( x, math::convert< T >( 0 ) ); @@ -808,7 +818,8 @@ T acosImpl( T const x ) * @note Modified from https://developer.download.nvidia.com/cg/atan2.html */ template< typename T > -LVARRAY_DEVICE inline +LVARRAY_DEVICE +LVARRAY_FORCE_INLINE T atan2Impl( T const y, T const x ) { T const absX = abs( x ); @@ -842,10 +853,10 @@ T atan2Impl( T const y, T const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float asin( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::asinf( x ); #else return std::asin( x ); @@ -854,25 +865,25 @@ float asin( float const x ) /// @copydoc asin( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double asin( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::asin( double( x ) ); #else return std::asin( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc asin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half asin( __half const x ) { return internal::asinImpl( x ); } /// @copydoc asin( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 asin( __half2 const x ) { return internal::asinImpl( x ); } @@ -884,10 +895,10 @@ __half2 asin( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. 
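 * On device the @c __half and @c __half2 overloads use the polynomial
 * approximation in @c internal::acosImpl.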
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float acos( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::acosf( x ); #else return std::acos( x ); @@ -896,25 +907,25 @@ float acos( float const x ) /// @copydoc acos( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double acos( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::acos( double( x ) ); #else return std::acos( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc acos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half acos( __half const x ) { return internal::acosImpl( x ); } /// @copydoc acos( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 acos( __half2 const x ) { return internal::acosImpl( x ); } @@ -927,10 +938,10 @@ __half2 acos( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float atan2( float const y, float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::atan2f( y, x ); #else return std::atan2( y, x ); @@ -939,10 +950,10 @@ float atan2( float const y, float const x ) /// @copydoc atan2( float, float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double atan2( T const y, T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::atan2( double( y ), double( x ) ); #else return std::atan2( y, x ); @@ -952,12 +963,12 @@ double atan2( T const y, T const x ) #if defined( LVARRAY_USE_CUDA ) /// @copydoc atan2( float, float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half atan2( __half const y, __half const x ) { return internal::atan2Impl( y, x ); } /// @copydoc atan2( float, float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 atan2( __half2 const y, __half2 const x ) { return internal::atan2Impl( y, x ); } @@ -976,10 +987,10 @@ __half2 atan2( __half2 const y, __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. */ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float exp( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::expf( x ); #else return std::exp( x ); @@ -988,25 +999,25 @@ float exp( float const x ) /// @copydoc exp( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double exp( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::exp( double( x ) ); #else return std::exp( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc exp( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half exp( __half const x ) { return ::hexp( x ); } /// @copydoc exp( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 exp( __half2 const x ) { return ::h2exp( x ); } @@ -1018,10 +1029,10 @@ __half2 exp( __half2 const x ) * @note This set of overloads is valid for any numeric type. If @p x is integral it is converted to @c double * and the return type is double. 
*/ -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE float log( float const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::logf( x ); #else return std::log( x ); @@ -1030,25 +1041,25 @@ float log( float const x ) /// @copydoc log( float ) template< typename T > -LVARRAY_HOST_DEVICE inline +LVARRAY_HOST_DEVICE LVARRAY_FORCE_INLINE double log( T const x ) { -#if defined(__CUDA_ARCH__) +#if defined(LVARRAY_DEVICE_COMPILE) return ::log( double( x ) ); #else return std::log( x ); #endif } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_DEVICE ) /// @copydoc log( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half log( __half const x ) { return ::hlog( x ); } /// @copydoc log( float ) -LVARRAY_DEVICE inline +LVARRAY_DEVICE LVARRAY_FORCE_INLINE __half2 log( __half2 const x ) { return ::h2log( x ); } diff --git a/src/sortedArrayManipulation.hpp b/src/sortedArrayManipulation.hpp index 7e9cae5d..d4bdbeed 100644 --- a/src/sortedArrayManipulation.hpp +++ b/src/sortedArrayManipulation.hpp @@ -211,7 +211,7 @@ LVARRAY_HOST_DEVICE inline void makeSorted( RandomAccessIterator const first, RandomAccessIterator const last, Compare && comp=Compare() ) { -#ifdef __CUDA_ARCH__ +#if defined(LVARRAY_DEVICE_COMPILE) if( last - first > internal::INTROSORT_THRESHOLD ) { internal::introsortLoop( first, last, comp ); diff --git a/src/system.cpp b/src/system.cpp index 25a2ec13..a6532ac5 100644 --- a/src/system.cpp +++ b/src/system.cpp @@ -417,11 +417,16 @@ std::string calculateSize( size_t const bytes ) suffix = "MB"; shift = 20; } - else + else if( bytes >> 10 != 0 ) { suffix = "KB"; shift = 10; } + else + { + suffix = "B"; + shift = 0; + } double const units = double( bytes ) / ( 1 << shift ); diff --git a/unitTests/CMakeLists.txt b/unitTests/CMakeLists.txt index 4d91681e..3ac33255 100644 --- a/unitTests/CMakeLists.txt +++ b/unitTests/CMakeLists.txt @@ -149,3 +149,8 @@ install(TARGETS testTensorOps if( ENABLE_PYLVARRAY ) add_subdirectory( python ) endif() + +if( ENABLE_LAPACK ) + add_subdirectory( dense ) +endif() + diff --git a/unitTests/dense/CMakeLists.txt b/unitTests/dense/CMakeLists.txt new file mode 100644 index 00000000..f87b2fda --- /dev/null +++ b/unitTests/dense/CMakeLists.txt @@ -0,0 +1,34 @@ +################################################################################################### +# Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors. +# All rights reserved. +# See the LICENSE file for details. 
+# SPDX-License-Identifier: (BSD-3-Clause)
+###################################################################################################
+
+#
+# Specify list of tests
+#
+set( testSources
+     testEigenDecomposition.cpp
+     testLinearSolve.cpp
+     testgemm.cpp
+)
+
+#
+# Add gtest C++ based tests
+#
+foreach(test ${testSources})
+    get_filename_component( test_name ${test} NAME_WE )
+    blt_add_executable( NAME ${test_name}
+                        SOURCES ${test}
+                        OUTPUT_DIR ${TEST_OUTPUT_DIRECTORY}
+                        DEPENDS_ON gtest lvarray lvarraydense ${lvarray_dependencies} )
+
+    target_include_directories( ${test_name} PUBLIC ${CMAKE_CURRENT_LIST_DIR}/../../src )
+
+    blt_add_test( NAME ${test_name}
+                  COMMAND ${test_name} )
+
+    install(TARGETS ${test_name}
+            DESTINATION bin)
+endforeach()
+
+
diff --git a/unitTests/dense/testEigenDecomposition.cpp b/unitTests/dense/testEigenDecomposition.cpp
new file mode 100644
index 00000000..bc29aa05
--- /dev/null
+++ b/unitTests/dense/testEigenDecomposition.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors.
+ * All rights reserved.
+ * See the LICENSE file for details.
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ */
+
+// Source includes
+#include "dense/eigenDecomposition.hpp"
+
+#include "../testUtils.hpp"
+
+#if defined( LVARRAY_USE_MAGMA )
+  #include <magma.h>
+#endif
+
+namespace LvArray
+{
+namespace testing
+{
+
+using namespace dense;
+
+template< typename T >
+using Array1d = Array< T, 1, RAJA::PERM_I, DenseInt, DEFAULT_BUFFER >;
+
+template< typename T, typename PERM >
+using Array2d = Array< T, 2, PERM, DenseInt, DEFAULT_BUFFER >;
+
+// TODO(corbett5): significantly improve this test.
+
+template< typename T >
+struct HEEVR_TEST
+{
+  HEEVR_TEST( BuiltInBackends const backend ):
+    m_backend( backend )
+  {
+    m_matrix.setName( "matrix" );
+    m_eigenvalues.setName( "m_eigenvalues" );
+    m_eigenvectors.setName( "eigenvectors" );
+    m_support.setName( "support" );
+  }
+
+  void threeByThreeEigenvalues()
+  {
+    resize( 3, 3, 0 );
+
+    m_matrix( 1, 1 ) = 2;
+    m_matrix( 0, 0 ) = 3;
+    m_matrix( 2, 2 ) = -4;
+
+    SymmetricMatrixStorageType storageType = SymmetricMatrixStorageType::UPPER_TRIANGULAR;
+
+    heevr(
+      m_backend,
+      EigenDecompositionOptions( EigenDecompositionOptions::EIGENVALUES ),
+      m_matrix.toView(),
+      m_eigenvalues.toView(),
+      m_eigenvectors.toView(),
+      m_support,
+      m_workspace,
+      storageType );
+
+    // The eigenvalues are returned in ascending order.
+    EXPECT_DOUBLE_EQ( m_eigenvalues[ 0 ], -4 );
+    EXPECT_DOUBLE_EQ( m_eigenvalues[ 1 ], 2 );
+    EXPECT_DOUBLE_EQ( m_eigenvalues[ 2 ], 3 );
+  }
+
+private:
+  void resize( DenseInt const n, DenseInt const nvals, DenseInt const nvec )
+  {
+    m_matrix.resize( n, n );
+    m_eigenvalues.resize( nvals );
+    m_eigenvectors.resize( n, nvec );
+    m_support.resize( 2 * n );
+  }
+
+  BuiltInBackends const m_backend;
+  Array2d< std::complex< T >, RAJA::PERM_JI > m_matrix;
+  Array1d< T > m_eigenvalues;
+  Array2d< std::complex< T >, RAJA::PERM_JI > m_eigenvectors;
+  Array1d< int > m_support;
+  ArrayWorkspace< std::complex< T >, ChaiBuffer > m_workspace;
+};
+
+TEST( eigenvalues_float, lapack )
+{
+  HEEVR_TEST< float > test( BuiltInBackends::LAPACK );
+
+  test.threeByThreeEigenvalues();
+}
+
+TEST( eigenvalues_double, lapack )
+{
+  HEEVR_TEST< double > test( BuiltInBackends::LAPACK );
+
+  test.threeByThreeEigenvalues();
+}
+
+#if defined( LVARRAY_USE_MAGMA )
+
+TEST( eigenvalues_float, magma )
+{
+  HEEVR_TEST< float > test( BuiltInBackends::MAGMA );
+
+  test.threeByThreeEigenvalues();
+}
+
+TEST( eigenvalues_double, magma )
+{
+  HEEVR_TEST< double > test( BuiltInBackends::MAGMA );
+
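+  // Same three-by-three checks as the LAPACK runs, dispatched through MAGMA.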
+
+TEST( eigenvalues_float, lapack )
+{
+  HEEVR_TEST< float > test( BuiltInBackends::LAPACK );
+
+  test.threeByThreeEigenvalues();
+}
+
+TEST( eigenvalues_double, lapack )
+{
+  HEEVR_TEST< double > test( BuiltInBackends::LAPACK );
+
+  test.threeByThreeEigenvalues();
+}
+
+#if defined( LVARRAY_USE_MAGMA )
+
+TEST( eigenvalues_float, magma )
+{
+  HEEVR_TEST< float > test( BuiltInBackends::MAGMA );
+
+  test.threeByThreeEigenvalues();
+}
+
+TEST( eigenvalues_double, magma )
+{
+  HEEVR_TEST< double > test( BuiltInBackends::MAGMA );
+
+  test.threeByThreeEigenvalues();
+}
+
+TEST( eigenvalues_float, magma_gpu )
+{
+  HEEVR_TEST< float > test( BuiltInBackends::MAGMA_GPU );
+
+  test.threeByThreeEigenvalues();
+}
+
+TEST( eigenvalues_double, magma_gpu )
+{
+  HEEVR_TEST< double > test( BuiltInBackends::MAGMA_GPU );
+
+  test.threeByThreeEigenvalues();
+}
+
+#endif
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv )
+{
+#if defined( LVARRAY_USE_MAGMA )
+  magma_init();
+#endif
+
+  ::testing::InitGoogleTest( &argc, argv );
+  int const result = RUN_ALL_TESTS();
+
+#if defined( LVARRAY_USE_MAGMA )
+  magma_finalize();
+#endif
+
+  return result;
+}
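Both dense test binaries bracket RUN_ALL_TESTS() with magma_init() / magma_finalize(). A small RAII guard would make the pairing impossible to forget as more dense test files are added; a sketch (hypothetical helper, built only from the MAGMA calls already used above):

    #if defined( LVARRAY_USE_MAGMA )
    // Initializes MAGMA on construction and tears it down on scope exit.
    struct MagmaScope
    {
      MagmaScope() { magma_init(); }
      ~MagmaScope() { magma_finalize(); }
    };
    #endif

Declaring a MagmaScope as the first line of main() would then replace both #if blocks.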
diff --git a/unitTests/dense/testLinearSolve.cpp b/unitTests/dense/testLinearSolve.cpp
new file mode 100644
index 00000000..7a1ab3c8
--- /dev/null
+++ b/unitTests/dense/testLinearSolve.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors.
+ * All rights reserved.
+ * See the LICENSE file for details.
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ */
+
+// Source includes
+#include "dense/linearSolve.hpp"
+
+#include "../testUtils.hpp"
+
+#include "output.hpp"
+
+#if defined( LVARRAY_USE_MAGMA )
+  #include <magma_v2.h>
+#endif
+
+#define EXPECT_COMPLEX_NEAR( z1, z2, absError ) \
+  EXPECT_NEAR( std::real( z1 ), std::real( z2 ), absError ); \
+  EXPECT_NEAR( std::imag( z1 ), std::imag( z2 ), absError )
+
+namespace LvArray
+{
+namespace testing
+{
+
+using namespace dense;
+
+template< typename T >
+using Array1d = Array< T, 1, RAJA::PERM_I, DenseInt, DEFAULT_BUFFER >;
+
+template< typename T, typename PERM >
+using Array2d = Array< T, 2, PERM, DenseInt, DEFAULT_BUFFER >;
+
+
+template< typename T >
+struct GESV_Test : public ::testing::Test
+{
+  void test( BuiltInBackends const backend, DenseInt const N, DenseInt const nrhs )
+  {
+    Array2d< T, RAJA::PERM_JI > A( N, N );
+    Array2d< T, RAJA::PERM_JI > B( N, nrhs );
+    Array1d< DenseInt > pivots( N );
+
+    for( DenseInt row = 0; row < N; ++row )
+    {
+      for( DenseInt col = 0; col < N; ++col )
+      {
+        A( row, col ) = randomNumber();
+      }
+
+      for( DenseInt col = 0; col < nrhs; ++col )
+      {
+        B( row, col ) = randomNumber();
+      }
+    }
+
+    Array2d< T, RAJA::PERM_JI > ACopy( A );
+    Array2d< T, RAJA::PERM_JI > X( B );
+    gesv( backend, ACopy.toView(), X.toView(), pivots );
+
+    // TODO(corbett5): replace this with matrix matrix multiplication
+    X.move( MemorySpace::host, true );
+    for( DenseInt i = 0; i < N; ++i )
+    {
+      for( DenseInt j = 0; j < nrhs; ++j )
+      {
+        T dot = 0;
+        for( DenseInt k = 0; k < N; ++k )
+        {
+          dot += A( i, k ) * X( k, j );
+        }
+
+        EXPECT_COMPLEX_NEAR( dot, B( i, j ), 10 * N * std::numeric_limits< RealVersion< T > >::epsilon() );
+      }
+    }
+  }
+
+private:
+
+  template< typename _T=T >
+  std::enable_if_t< !IsComplex< _T >, T >
+  randomNumber()
+  { return m_dist( m_gen ); }
+
+  template< typename _T=T >
+  std::enable_if_t< IsComplex< _T >, T >
+  randomNumber()
+  { return { m_dist( m_gen ), m_dist( m_gen ) }; }
+
+  std::mt19937_64 m_gen;
+  std::uniform_real_distribution< RealVersion< T > > m_dist;
+};
+
+using GESV_Test_types = ::testing::Types<
+  float,
+  double,
+  std::complex< float >,
+  std::complex< double >
+  >;
+TYPED_TEST_SUITE( GESV_Test, GESV_Test_types, );
+
+TYPED_TEST( GESV_Test, LAPACK_2x2 )
+{
+  this->test( BuiltInBackends::LAPACK, 2, 1 );
+  this->test( BuiltInBackends::LAPACK, 2, 2 );
+}
+
+TYPED_TEST( GESV_Test, LAPACK_10x10 )
+{
+  this->test( BuiltInBackends::LAPACK, 10, 1 );
+  this->test( BuiltInBackends::LAPACK, 10, 3 );
+}
+
+TYPED_TEST( GESV_Test, LAPACK_100x100 )
+{
+  this->test( BuiltInBackends::LAPACK, 100, 1 );
+  this->test( BuiltInBackends::LAPACK, 100, 10 );
+}
+
+TYPED_TEST( GESV_Test, LAPACK_1000x1000 )
+{
+  this->test( BuiltInBackends::LAPACK, 1000, 1 );
+  this->test( BuiltInBackends::LAPACK, 1000, 10 );
+}
+
+#if defined( LVARRAY_USE_MAGMA )
+
+TYPED_TEST( GESV_Test, MAGMA_2x2 )
+{
+  this->test( BuiltInBackends::MAGMA, 2, 1 );
+  this->test( BuiltInBackends::MAGMA, 2, 2 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_10x10 )
+{
+  this->test( BuiltInBackends::MAGMA, 10, 1 );
+  this->test( BuiltInBackends::MAGMA, 10, 3 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_100x100 )
+{
+  this->test( BuiltInBackends::MAGMA, 100, 1 );
+  this->test( BuiltInBackends::MAGMA, 100, 10 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_1000x1000 )
+{
+  this->test( BuiltInBackends::MAGMA, 1000, 1 );
+  this->test( BuiltInBackends::MAGMA, 1000, 10 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_2x2 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 2, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 2, 2 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_10x10 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 10, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 10, 3 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_100x100 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 100, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 100, 10 );
+}
+
+TYPED_TEST( GESV_Test, MAGMA_GPU_1000x1000 )
+{
+  this->test( BuiltInBackends::MAGMA_GPU, 1000, 1 );
+  this->test( BuiltInBackends::MAGMA_GPU, 1000, 10 );
+}
+
+#endif
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv )
+{
+#if defined( LVARRAY_USE_MAGMA )
+  magma_init();
+#endif
+
+  ::testing::InitGoogleTest( &argc, argv );
+  int const result = RUN_ALL_TESTS();
+
+#if defined( LVARRAY_USE_MAGMA )
+  magma_finalize();
+#endif
+
+  return result;
+}
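A note on the GESV tolerance: each entry of A * X is an N-term dot product of unit-scale random values, so its rounding error grows roughly linearly in N; the `10 * N * epsilon` bound is that estimate with a safety factor of 10. For example, with T = double and N = 100:

    // Roughly the worst-case rounding error of a 100-term dot product, times 10.
    double const tol = 10 * 100 * std::numeric_limits< double >::epsilon();  // about 2.2e-13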
diff --git a/unitTests/dense/testgemm.cpp b/unitTests/dense/testgemm.cpp
new file mode 100644
index 00000000..51f50773
--- /dev/null
+++ b/unitTests/dense/testgemm.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2021, Lawrence Livermore National Security, LLC and LvArray contributors.
+ * All rights reserved.
+ * See the LICENSE file for details.
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ */
+
+// Source includes
+#include "dense/dense.hpp"
+#include "dense/BlasLapackInterface.hpp"
+
+#include "../testUtils.hpp"
+
+#include <random>
+
+#if defined( LVARRAY_USE_MAGMA )
+  #include <magma_v2.h>
+#endif
+
+#define EXPECT_COMPLEX_NEAR( z1, z2, absError ) \
+  EXPECT_NEAR( std::real( z1 ), std::real( z2 ), absError ); \
+  EXPECT_NEAR( std::imag( z1 ), std::imag( z2 ), absError )
+
+namespace LvArray
+{
+namespace testing
+{
+
+// This should probably go in a common place
+template< typename T, typename PERM >
+using Array2d = Array< T, 2, PERM, std::ptrdiff_t, DEFAULT_BUFFER >;
+
+template< typename T >
+std::enable_if_t< std::is_floating_point< T >::value, T >
+randomValue( std::mt19937 & gen )
+{ return std::uniform_real_distribution< T >{ -1, 1 }( gen ); }
+
+template< typename T >
+std::enable_if_t< dense::IsComplex< T >, T >
+randomValue( std::mt19937 & gen )
+{
+  return { std::uniform_real_distribution< dense::RealVersion< T > >{ -1, 1 }( gen ),
+           std::uniform_real_distribution< dense::RealVersion< T > >{ -1, 1 }( gen ) };
+}
+
+template< typename T, typename PERM >
+Array2d< T, PERM > randomMatrix( std::ptrdiff_t const N, std::ptrdiff_t const M )
+{
+  std::mt19937 gen( std::random_device{}() );
+
+  Array2d< T, PERM > const ret( N, M );
+
+  for( std::ptrdiff_t r = 0; r < N; ++r )
+  {
+    for( std::ptrdiff_t c = 0; c < M; ++c )
+    {
+      ret( r, c ) = T{10} * randomValue< T >( gen );
+    }
+  }
+
+  return ret;
+}
+
+template< typename T, typename PERM >
+std::enable_if_t< std::is_floating_point< T >::value >
+checkEqual( Array2d< T, PERM > const & lhs, Array2d< T, PERM > const & rhs, double rTol )
+{
+  ASSERT_EQ( lhs.size( 0 ), rhs.size( 0 ) );
+  ASSERT_EQ( lhs.size( 1 ), rhs.size( 1 ) );
+
+  for( std::ptrdiff_t i = 0; i < lhs.size(); ++i )
+  {
+    EXPECT_NEAR( lhs.data()[ i ], rhs.data()[ i ], std::abs( lhs.data()[ i ] ) * rTol );
+  }
+}
+
+template< typename T, typename PERM >
+std::enable_if_t< dense::IsComplex< T > >
+checkEqual( Array2d< T, PERM > const & lhs, Array2d< T, PERM > const & rhs, double rTol )
+{
+  ASSERT_EQ( lhs.size( 0 ), rhs.size( 0 ) );
+  ASSERT_EQ( lhs.size( 1 ), rhs.size( 1 ) );
+
+  for( std::ptrdiff_t i = 0; i < lhs.size(); ++i )
+  {
+    EXPECT_COMPLEX_NEAR( lhs.data()[ i ], rhs.data()[ i ], std::abs( lhs.data()[ i ] ) * rTol );
+  }
+}
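+
+// GemmTest below checks gemm against the textbook definition,
+//   C(i,j) <- alpha * sum_k A(i,k) * B(k,j) + beta * C(i,j),
+// recomputed with a naive triple loop, and also verifies with a relative tolerance
+// of zero that the backend left A and B bit-identical, i.e. did not modify its inputs.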
+
+template< typename INTERFACE, typename T, typename PERM_A, typename PERM_B >
+struct GemmTest
+{
+  void Rij_eq_AikBkj()
+  {
+    std::mt19937 gen( std::random_device{}() );
+
+    std::ptrdiff_t const N = std::uniform_int_distribution< std::ptrdiff_t >{ 0, 20 }( gen );
+    std::ptrdiff_t const M = std::uniform_int_distribution< std::ptrdiff_t >{ 0, 20 }( gen );
+    std::ptrdiff_t const K = std::uniform_int_distribution< std::ptrdiff_t >{ 0, 20 }( gen );
+
+    T const alpha = T{10} * randomValue< T >( gen );
+    T const beta = T{10} * randomValue< T >( gen );
+
+    Array2d< T, PERM_A > const A = randomMatrix< T, PERM_A >( N, K );
+    Array2d< T, PERM_B > const B = randomMatrix< T, PERM_B >( K, M );
+    Array2d< T, RAJA::PERM_JI > const C = randomMatrix< T, RAJA::PERM_JI >( N, M );
+
+    Array2d< T, PERM_A > const Acopy = A;
+    Array2d< T, PERM_B > const Bcopy = B;
+    Array2d< T, RAJA::PERM_JI > const Ccopy = C;
+
+    dense::gemm< INTERFACE >( dense::Operation::NO_OP, dense::Operation::NO_OP, alpha, A, B, beta, C );
+
+    A.move( MemorySpace::host, false );
+    B.move( MemorySpace::host, false );
+    C.move( MemorySpace::host, false );
+
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < M; ++j )
+      {
+        T dot = 0;
+        for( std::ptrdiff_t k = 0; k < K; ++k )
+        {
+          dot += Acopy( i, k ) * Bcopy( k, j );
+        }
+
+        Ccopy( i, j ) = alpha * dot + beta * Ccopy( i, j );
+      }
+    }
+
+    checkEqual( A, Acopy, 0 );
+    checkEqual( B, Bcopy, 0 );
+    checkEqual( C, Ccopy, 1e3 * std::numeric_limits< dense::RealVersion< T > >::epsilon() );
+  }
+};
+
+TEST( LapackInterface_float, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< float >, float, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_double, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< double >, double, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_float, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< float > >, std::complex< float >, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_double, Rij_eq_AikBkj )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< double > >, std::complex< double >, RAJA::PERM_JI, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_float, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< float >, float, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_double, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< double >, double, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_float, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< float > >, std::complex< float >, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+TEST( LapackInterface_complex_double, Rij_eq_AikBkj_rowMajorA )
+{
+  GemmTest< dense::BlasLapackInterface< std::complex< double > >, std::complex< double >, RAJA::PERM_IJ, RAJA::PERM_JI >().Rij_eq_AikBkj();
+}
+
+
+} // namespace testing
+} // namespace LvArray
+
+// This is the default gtest main method. It is included for ease of debugging.
+int main( int argc, char * * argv )
+{
+#if defined( LVARRAY_USE_MAGMA )
+  magma_init();
+#endif
+
+  ::testing::InitGoogleTest( &argc, argv );
+  int const result = RUN_ALL_TESTS();
+
+#if defined( LVARRAY_USE_MAGMA )
+  magma_finalize();
+#endif
+
+  return result;
+}
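The eight TEST blocks above differ only in the scalar type and in the permutation of A; they could be collapsed into a typed test suite, as testLinearSolve.cpp already does for GESV_Test. A sketch (hypothetical type list, same gtest machinery):

    using GemmTypes = ::testing::Types< float, double, std::complex< float >, std::complex< double > >;
    // The permutation axis (RAJA::PERM_JI vs RAJA::PERM_IJ for A) would become a tuple
    // element, mirroring the tuple-based type lists used by the tensorOps tests below.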
diff --git a/unitTests/testArray1DOfArray1D.cpp b/unitTests/testArray1DOfArray1D.cpp
index faa53b52..7ff271cb 100644
--- a/unitTests/testArray1DOfArray1D.cpp
+++ b/unitTests/testArray1DOfArray1D.cpp
@@ -233,7 +233,7 @@ using Array1DOfArray1DTestTypes = ::testing::Types<
 , std::pair< Array1D< Tensor, ChaiBuffer >, serialPolicy >
 , std::pair< Array1D< TestString, ChaiBuffer >, serialPolicy >
 #endif
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::pair< Array1D< int, ChaiBuffer >, parallelDevicePolicy< 32 > >
 , std::pair< Array1D< Tensor, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
diff --git a/unitTests/testArray1DOfArray1DOfArray1D.cpp b/unitTests/testArray1DOfArray1DOfArray1D.cpp
index 5dc93fe8..5038d778 100644
--- a/unitTests/testArray1DOfArray1DOfArray1D.cpp
+++ b/unitTests/testArray1DOfArray1DOfArray1D.cpp
@@ -272,7 +272,7 @@ using Array1DOfArray1DOfArray1DTestTypes = ::testing::Types<
 , std::pair< Array1D< Tensor, ChaiBuffer >, serialPolicy >
 , std::pair< Array1D< TestString, ChaiBuffer >, serialPolicy >
 #endif
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::pair< Array1D< int, ChaiBuffer >, parallelDevicePolicy< 32 > >
 , std::pair< Array1D< Tensor, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
diff --git a/unitTests/testArrayOfArrays.cpp b/unitTests/testArrayOfArrays.cpp
index 784fd448..aa20086b 100644
--- a/unitTests/testArrayOfArrays.cpp
+++ b/unitTests/testArrayOfArrays.cpp
@@ -1284,7 +1284,7 @@ using ArrayOfArraysViewTestTypes = ::testing::Types<
 , std::pair< ArrayOfArrays< TestString, std::ptrdiff_t, ChaiBuffer >, serialPolicy >
 #endif
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::pair< ArrayOfArrays< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
 , std::pair< ArrayOfArrays< Tensor, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
@@ -1467,7 +1467,7 @@ using ArrayOfArraysViewAtomicTestTypes = ::testing::Types<
 , std::pair< ArrayOfArrays< TestString, std::ptrdiff_t, ChaiBuffer >, parallelHostPolicy >
 #endif
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::pair< ArrayOfArrays< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
 , std::pair< ArrayOfArrays< double, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
diff --git a/unitTests/testArrayOfSets.cpp b/unitTests/testArrayOfSets.cpp
index d3b9f540..ac71a76b 100644
--- a/unitTests/testArrayOfSets.cpp
+++ b/unitTests/testArrayOfSets.cpp
@@ -925,7 +925,7 @@ using ArrayOfSetsViewTestTypes = ::testing::Types<
 , std::pair< ArrayOfSets< TestString, std::ptrdiff_t, ChaiBuffer >, serialPolicy >
 #endif
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::pair< ArrayOfSets< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
 , std::pair< ArrayOfSets< Tensor, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
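The `( CUDA || HIP ) && CHAI` condition is now repeated verbatim across a dozen-plus test type lists. Since this change set already introduces LVARRAY_USE_DEVICE (see the math.hpp hunks above), the guards could be collapsed, assuming that macro is defined whenever either device backend is enabled:

    #if defined(LVARRAY_USE_DEVICE) && defined(LVARRAY_USE_CHAI)
    , std::pair< ArrayOfSets< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > >
    #endif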
diff --git a/unitTests/testArray_ChaiBuffer.cpp b/unitTests/testArray_ChaiBuffer.cpp
index 34825981..8bd5aaed 100644
--- a/unitTests/testArray_ChaiBuffer.cpp
+++ b/unitTests/testArray_ChaiBuffer.cpp
@@ -42,6 +42,10 @@ class ArrayTest : public ::testing::Test
 auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) );
 std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::cuda };
 std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool };
+  #elif defined(LVARRAY_USE_HIP)
+  auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) );
+  std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::hip };
+  std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool };
 #else
 std::initializer_list< MemorySpace > const spaces = { MemorySpace::host };
 std::initializer_list< umpire::Allocator > const allocators = { hostPool };
@@ -61,13 +65,19 @@ class ArrayTest : public ::testing::Test
 array.move( MemorySpace::cuda, true );
 EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "DEVICE_pool" );
+  array.move( MemorySpace::host, true );
+  EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "HOST_pool" );
+  #elif defined(LVARRAY_USE_HIP)
+  array.move( MemorySpace::hip, true );
+  EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "DEVICE_pool" );
+
 array.move( MemorySpace::host, true );
 EXPECT_EQ( rm.getAllocator( array.data() ).getName(), "HOST_pool" );
 #endif
 }
 
 #if defined( LVARRAY_USE_CUDA )
-  void testDeviceAlloc()
+  void testCudaDeviceAlloc()
 {
 Array< int, 1, RAJA::PERM_I, int, ChaiBuffer > array;
@@ -86,6 +96,26 @@ class ArrayTest : public ::testing::Test
 }
 }
 #endif
+#if defined(LVARRAY_USE_HIP)
+  void testHIPDeviceAlloc()
+  {
+    Array< T, 1, RAJA::PERM_I, int, ChaiBuffer > array;
+
+    array.resizeWithoutInitializationOrDestruction( MemorySpace::hip, 100 );
+
+    T * const devPtr = array.data();
+    forall< parallelDevicePolicy< 32 > >( array.size(), [devPtr] LVARRAY_DEVICE ( int const i )
+    {
+      new ( &devPtr[ i ] ) T( i );
+    } );
+
+    array.move( MemorySpace::host, true );
+    for( int i = 0; i < array.size(); ++i )
+    {
+      EXPECT_EQ( array[ i ], T( i ) );
+    }
+  }
+#endif
 };
 
/// The list of types to instantiate ArrayTest with.
@@ -104,7 +134,15 @@ TYPED_TEST( ArrayTest, AllocatorConstruction ) TYPED_TEST( ArrayTest, DeviceAlloc ) { - this->testDeviceAlloc(); + this->testCudaDeviceAlloc(); +} + +#endif +#if defined(LVARRAY_USE_HIP) + +TYPED_TEST( ArrayTest, DeviceAlloc ) +{ + this->testHIPDeviceAlloc(); } #endif diff --git a/unitTests/testCRSMatrix.cpp b/unitTests/testCRSMatrix.cpp index 987aa4e9..3c6c0556 100644 --- a/unitTests/testCRSMatrix.cpp +++ b/unitTests/testCRSMatrix.cpp @@ -1036,7 +1036,7 @@ using CRSMatrixViewTestTypes = ::testing::Types< , std::pair< CRSMatrix< TestString, int, std::ptrdiff_t, ChaiBuffer >, serialPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< CRSMatrix< int, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< CRSMatrix< Tensor, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif @@ -1276,7 +1276,7 @@ using CRSMatrixViewAtomicTestTypes = ::testing::Types< , std::pair< CRSMatrix< double, int, std::ptrdiff_t, ChaiBuffer >, parallelHostPolicy > #endif -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< CRSMatrix< int, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< CRSMatrix< double, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testChaiBuffer.cpp b/unitTests/testChaiBuffer.cpp index 8c6d9937..27b3401a 100644 --- a/unitTests/testChaiBuffer.cpp +++ b/unitTests/testChaiBuffer.cpp @@ -41,6 +41,10 @@ class ChaiBufferTest : public ::testing::Test auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::cuda }; std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; + #elif defined( LVARRAY_USE_HIP ) + auto devicePool = rm.makeAllocator< umpire::strategy::QuickPool >( "DEVICE_pool", rm.getAllocator( "DEVICE" ) ); + std::initializer_list< MemorySpace > const spaces = { MemorySpace::host, MemorySpace::hip }; + std::initializer_list< umpire::Allocator > const allocators = { hostPool, devicePool }; #else std::initializer_list< MemorySpace > const spaces = { MemorySpace::host }; std::initializer_list< umpire::Allocator > const allocators = { hostPool }; @@ -62,6 +66,12 @@ class ChaiBufferTest : public ::testing::Test buffer.move( MemorySpace::cuda, true ); EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "DEVICE_pool" ); + buffer.move( MemorySpace::host, true ); + EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "HOST_pool" ); + #elif defined(LVARRAY_USE_HIP) + buffer.move( MemorySpace::hip, true ); + EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "DEVICE_pool" ); + buffer.move( MemorySpace::host, true ); EXPECT_EQ( rm.getAllocator( buffer.data() ).getName(), "HOST_pool" ); #endif @@ -188,6 +198,126 @@ class ChaiBufferTest : public ::testing::Test EXPECT_EQ( buffer[ i ], T( i ) ); } + bufferManipulation::free( buffer, size ); + } +#elif defined( LVARRAY_USE_HIP ) + void testMove() + { + ChaiBuffer< T > buffer( true ); + + int const size = 100; + buffer.reallocate( 0, MemorySpace::host, size ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + + for( int i = 0; i < size; ++i ) + { + new ( &buffer[ i ] ) T( i ); + } + + buffer.move( 
MemorySpace::hip, true ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + T * const devPtr = buffer.data(); + + forall< parallelDevicePolicy< 32 > >( size, [devPtr] LVARRAY_DEVICE ( int const i ) + { + devPtr[ i ] += devPtr[ i ]; + } ); + + // Check that the device changes are seen on the host. Then modify the values without touching. + buffer.move( MemorySpace::host, false ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + for( int i = 0; i < size; ++i ) + { + EXPECT_EQ( buffer[ i ], T( i ) + T( i ) ); + buffer[ i ] = T( 0 ); + } + + buffer.move( MemorySpace::hip, true ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + forall< parallelDevicePolicy< 32 > >( size, [devPtr] LVARRAY_DEVICE ( int const i ) + { + devPtr[ i ] += devPtr[ i ]; + } ); + + buffer.move( MemorySpace::host, false ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + for( int i = 0; i < size; ++i ) + { + EXPECT_EQ( buffer[ i ], T( i ) + T( i ) + T( i ) + T( i ) ); + } + + bufferManipulation::free( buffer, size ); + } + + void testCapture() + { + ChaiBuffer< T > buffer( true ); + + int const size = 100; + buffer.reallocate( 0, MemorySpace::host, size ); + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + + for( int i = 0; i < size; ++i ) + { + new ( &buffer[ i ] ) T( i ); + } + + forall< parallelDevicePolicy< 32 > >( size, [buffer] LVARRAY_DEVICE ( int const i ) + { + buffer[ i ] += buffer[ i ]; + } ); + + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + + + // Check that the device changes are seen on the host. Then modify the values without touching. + ChaiBuffer< T const > constBuffer( buffer ); + forall< serialPolicy >( size, [constBuffer] ( int const i ) + { + EXPECT_EQ( constBuffer[ i ], T( i ) + T( i ) ); + const_cast< T & >( constBuffer[ i ] ) = T( 0 ); + } ); + + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + EXPECT_EQ( constBuffer.getPreviousSpace(), MemorySpace::host ); + + forall< parallelDevicePolicy< 32 > >( size, [buffer] LVARRAY_DEVICE ( int const i ) + { + buffer[ i ] += buffer[ i ]; + } ); + + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::hip ); + + forall< serialPolicy >( size, [constBuffer] ( int const i ) + { + EXPECT_EQ( constBuffer[ i ], T( i ) + T( i ) + T( i ) + T( i ) ); + } ); + + EXPECT_EQ( buffer.getPreviousSpace(), MemorySpace::host ); + EXPECT_EQ( constBuffer.getPreviousSpace(), MemorySpace::host ); + + bufferManipulation::free( buffer, size ); + } + + void testDeviceRealloc() + { + ChaiBuffer< T > buffer( true ); + + int const size = 100; + buffer.reallocate( 0, MemorySpace::hip, size ); + + T * const devPtr = buffer.data(); + forall< parallelDevicePolicy< 32 > >( size, [devPtr] LVARRAY_DEVICE ( int const i ) + { + new ( &devPtr[ i ] ) T( i ); + } ); + + buffer.move( MemorySpace::host, true ); + for( int i = 0; i < size; ++i ) + { + EXPECT_EQ( buffer[ i ], T( i ) ); + } + bufferManipulation::free( buffer, size ); } #endif @@ -205,7 +335,7 @@ TYPED_TEST( ChaiBufferTest, AllocatorConstruction ) this->testAllocatorConstruction(); } -#if defined( LVARRAY_USE_CUDA ) +#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP ) TYPED_TEST( ChaiBufferTest, Move ) { diff --git a/unitTests/testMath.cpp b/unitTests/testMath.cpp index 08502c4f..f6d193cb 100644 --- a/unitTests/testMath.cpp +++ b/unitTests/testMath.cpp @@ -145,12 +145,14 @@ using TestMathTypes = ::testing::Types< , std::pair< long long int, serialPolicy > , std::pair< float, serialPolicy > , std::pair< double, serialPolicy > -#if defined( 
LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
 , std::pair< int, parallelDevicePolicy< 32 > >
 , std::pair< long int, parallelDevicePolicy< 32 > >
 , std::pair< long long int, parallelDevicePolicy< 32 > >
 , std::pair< float, parallelDevicePolicy< 32 > >
 , std::pair< double, parallelDevicePolicy< 32 > >
+#endif
+#if defined( LVARRAY_USE_CUDA )
 , std::pair< __half, parallelDevicePolicy< 32 > >
 #endif
 >;
@@ -331,7 +333,7 @@ struct TestMath2 : public ::testing::Test
 }
 };
-#if defined( LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
 using TestMath2Types = ::testing::Types<
 std::pair< __half2, parallelDevicePolicy< 32 > >
@@ -403,7 +405,8 @@ void forAllHalvesinMinus1to1( bool const include1, LAMBDA && lambda )
 }
 } );
 }
-
+#endif
+#if defined(LVARRAY_USE_CUDA)
 void asinHalfAccuracy()
 {
 RAJA::ReduceMax< RAJA::cuda_reduce, double > maxDiff( 0 );
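Note that the half-precision coverage is split deliberately: the __half2 TestMath2 list now compiles for both backends, while asinHalfAccuracy() stays CUDA-only because it uses RAJA::cuda_reduce. The HIP memcpy tests that follow spell out RAJA::hip_exec< 32 > where the CUDA originals use the parallelDevicePolicy alias; if parallelDevicePolicy< N > maps to RAJA::hip_exec< N > under LVARRAY_USE_HIP (an assumption, consistent with its use in the other HIP tests above), the loops could be written identically in both branches:

    // Hypothetical, assuming parallelDevicePolicy< 32 > == RAJA::hip_exec< 32 > under HIP:
    forall< parallelDevicePolicy< 32 > >( y.size(), [yPtr] LVARRAY_DEVICE ( std::ptrdiff_t const i )
    {
      PORTABLE_EXPECT_EQ( yPtr[ i ], i );
      yPtr[ i ] *= 2;
    } );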
diff --git a/unitTests/testMemcpy.cpp b/unitTests/testMemcpy.cpp
index f3adcece..0e44243d 100644
--- a/unitTests/testMemcpy.cpp
+++ b/unitTests/testMemcpy.cpp
@@ -242,7 +242,106 @@ void testAsyncMemcpyDevice()
 {
 EXPECT_EQ( x[ i ], -i );
 }
 }
+#elif defined(LVARRAY_USE_HIP)
+template< template< typename > class BUFFER_TYPE >
+void testMemcpyDevice()
+{
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > x( 100 );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    x[ i ] = i;
+  }
+
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > y( x.size() );
+  y.move( MemorySpace::hip );
+  int * yPtr = y.data();
+
+  memcpy< 0, 0 >( y, {}, x.toViewConst(), {} );
+
+  forall< RAJA::hip_exec< 32 > >( y.size(), [yPtr] LVARRAY_DEVICE ( std::ptrdiff_t const i )
+  {
+    PORTABLE_EXPECT_EQ( yPtr[ i ], i );
+    yPtr[ i ] *= 2;
+  } );
+
+  memcpy< 0, 0 >( x, {}, y.toViewConst(), {} );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    EXPECT_EQ( x[ i ], 2 * i );
+  }
+
+  // Move y to the CPU but then capture and modify a view on device. This way y's data pointer is still pointing
+  // to host memory but the subsequent memcpy should pick up that its previous space is on device.
+  y.move( MemorySpace::host );
+
+  ArrayView< int, 1, 0, std::ptrdiff_t, BUFFER_TYPE > const yView = y.toView();
+  forall< RAJA::hip_exec< 32 > >( y.size(), [yView] LVARRAY_DEVICE ( std::ptrdiff_t const i )
+  {
+    yView[ i ] = -i;
+  } );
+
+  memcpy< 0, 0 >( x, {}, y.toViewConst(), {} );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    EXPECT_EQ( x[ i ], -i );
+  }
+}
+
+template< template< typename > class BUFFER_TYPE >
+void testAsyncMemcpyDevice()
+{
+  camp::resources::Resource stream{ camp::resources::Hip{} };
+
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > x( 100 );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    x[ i ] = i;
+  }
+
+  Array< int, 1, RAJA::PERM_I, std::ptrdiff_t, BUFFER_TYPE > y( x.size() );
+  y.move( MemorySpace::hip );
+  int * yPtr = y.data();
+
+  camp::resources::Event e = memcpy< 0, 0 >( stream, y.toView(), {}, x.toViewConst(), {} );
+  stream.wait_for( &e );
+
+  forall< RAJA::hip_exec< 32 > >( y.size(), [yPtr] LVARRAY_DEVICE ( std::ptrdiff_t const i )
+  {
+    PORTABLE_EXPECT_EQ( yPtr[ i ], i );
+    yPtr[ i ] *= 2;
+  } );
+
+  e = memcpy< 0, 0 >( stream, x, {}, y.toViewConst(), {} );
+  stream.wait_for( &e );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    EXPECT_EQ( x[ i ], 2 * i );
+  }
+
+  // Move y to the CPU but then capture and modify a view on device. This way y's data pointer is still pointing
+  // to host memory but the subsequent memcpy should pick up that its previous space is on device.
+  y.move( MemorySpace::host );
+
+  ArrayView< int, 1, 0, std::ptrdiff_t, BUFFER_TYPE > const yView = y.toView();
+  forall< RAJA::hip_exec< 32 > >( y.size(), [yView] LVARRAY_DEVICE ( std::ptrdiff_t const i )
+  {
+    yView[ i ] = -i;
+  } );
+
+  e = memcpy< 0, 0 >( stream, x, {}, y.toViewConst(), {} );
+  stream.wait_for( &e );
+
+  for( std::ptrdiff_t i = 0; i < x.size(); ++i )
+  {
+    EXPECT_EQ( x[ i ], -i );
+  }
+}
 #endif
 
 TEST( TestMemcpy, MallocBuffer1D )
@@ -282,7 +381,7 @@ TEST( TestMemcpy, ChaiBuffer2D )
 {
 testMemcpy2D< ChaiBuffer >();
 }
-#if defined( LVARRAY_USE_CUDA )
+#if defined( LVARRAY_USE_CUDA ) || defined( LVARRAY_USE_HIP )
 TEST( TestMemcpy, ChaiBufferDevice )
 {
diff --git a/unitTests/testSortedArray.cpp b/unitTests/testSortedArray.cpp
index 5198bd24..fe52ddfc 100644
--- a/unitTests/testSortedArray.cpp
+++ b/unitTests/testSortedArray.cpp
@@ -451,7 +451,7 @@ using SortedArrayViewTestTypes = ::testing::Types<
 std::pair< SortedArray< int, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy >
 , std::pair< SortedArray< Tensor, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy >
 , std::pair< SortedArray< TestString, INDEX_TYPE, DEFAULT_BUFFER >, serialPolicy >
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::pair< SortedArray< int, INDEX_TYPE, ChaiBuffer >, parallelDevicePolicy< 32 > >
 , std::pair< SortedArray< Tensor, INDEX_TYPE, ChaiBuffer >, parallelDevicePolicy< 32 > >
 #endif
diff --git a/unitTests/testSortedArrayManipulation.cpp b/unitTests/testSortedArrayManipulation.cpp
index 2d784cb2..ae376cb4 100644
--- a/unitTests/testSortedArrayManipulation.cpp
+++ b/unitTests/testSortedArrayManipulation.cpp
@@ -190,7 +190,7 @@ using SingleArrayTestTypes = ::testing::Types<
 , std::tuple< TestString, sortedArrayManipulation::less< TestString >, serialPolicy >
 , std::tuple< TestString, sortedArrayManipulation::greater< TestString >, serialPolicy >
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::tuple< int, sortedArrayManipulation::less< int >, parallelDevicePolicy< 256 > >
 , std::tuple< int, sortedArrayManipulation::greater< int >, parallelDevicePolicy< 256 > >
 , std::tuple< Tensor, sortedArrayManipulation::less< Tensor >, parallelDevicePolicy< 256 > >
@@ -290,7 +290,7 @@ using DualArrayTestTypes = ::testing::Types<
 , std::tuple< TestString, TestString, sortedArrayManipulation::less< TestString >, serialPolicy >
 , std::tuple< TestString, TestString, sortedArrayManipulation::greater< TestString >, serialPolicy >
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::tuple< int, int, sortedArrayManipulation::less< int >, parallelDevicePolicy< 256 > >
 , std::tuple< int, int, sortedArrayManipulation::greater< int >, parallelDevicePolicy< 256 > >
 , std::tuple< Tensor, Tensor, sortedArrayManipulation::less< Tensor >, parallelDevicePolicy< 256 > >
diff --git a/unitTests/testSparsityPattern.cpp b/unitTests/testSparsityPattern.cpp
index 50ec30f9..fee7a995 100644
--- a/unitTests/testSparsityPattern.cpp
+++ b/unitTests/testSparsityPattern.cpp
@@ -1016,7 +1016,7 @@ using SparsityPatternViewTestTypes = ::testing::Types<
 #endif
 #endif
-#if defined(LVARRAY_USE_CUDA) && 
defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< SparsityPattern< int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #if !defined( __ibmxl__ ) , std::pair< SparsityPattern< uint, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > @@ -1171,7 +1171,7 @@ using CRSMatrixTestTypes = ::testing::Types< std::pair< CRSMatrix< int, int, std::ptrdiff_t, MallocBuffer >, serialPolicy > , std::pair< CRSMatrix< Tensor, int, std::ptrdiff_t, MallocBuffer >, serialPolicy > , std::pair< CRSMatrix< TestString, int, std::ptrdiff_t, MallocBuffer >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::pair< CRSMatrix< int, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > , std::pair< CRSMatrix< Tensor, int, std::ptrdiff_t, ChaiBuffer >, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testStackArray.cpp b/unitTests/testStackArray.cpp index 249ccebb..e29206ab 100644 --- a/unitTests/testStackArray.cpp +++ b/unitTests/testStackArray.cpp @@ -281,7 +281,7 @@ using StackArrayCaptureTestTypes = ::testing::Types< , std::pair< RAJA::PERM_KIJ, serialPolicy > , std::pair< RAJA::PERM_KJI, serialPolicy > -#if defined(LVARRAY_USE_CUDA) +#if defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) , std::pair< RAJA::PERM_I, parallelDevicePolicy< 32 > > , std::pair< RAJA::PERM_IJ, parallelDevicePolicy< 32 > > , std::pair< RAJA::PERM_JI, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsEigen.cpp b/unitTests/testTensorOpsEigen.cpp index 46ff354d..2c556ec7 100644 --- a/unitTests/testTensorOpsEigen.cpp +++ b/unitTests/testTensorOpsEigen.cpp @@ -243,7 +243,7 @@ using TestEigendecompositionTypes = ::testing::Types< , std::tuple< double, double, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< std::int64_t, double, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< float, float, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< std::int64_t, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< float, float, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsFixedSize.cpp b/unitTests/testTensorOpsFixedSize.cpp index e66fd5a3..21392a17 100644 --- a/unitTests/testTensorOpsFixedSize.cpp +++ b/unitTests/testTensorOpsFixedSize.cpp @@ -569,7 +569,7 @@ using FixedSizeSquareMatrixTestTypes = ::testing::Types< std::tuple< double, std::integral_constant< int, 2 >, serialPolicy > , std::tuple< double, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< double, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > > #endif @@ -616,6 +616,5 @@ TYPED_TEST( FixedSizeSquareMatrixTest, denseToSymmetric ) { this->denseToSymmetric(); } - } // namespace testing } // namespace LvArray diff --git a/unitTests/testTensorOpsInverse.hpp b/unitTests/testTensorOpsInverse.hpp index 4909a686..9edfa950 
100644 --- a/unitTests/testTensorOpsInverse.hpp +++ b/unitTests/testTensorOpsInverse.hpp @@ -375,7 +375,7 @@ using InverseTestTypes = ::testing::Types< , std::tuple< float, float, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< int, double, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< float, float, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< int, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > @@ -400,7 +400,7 @@ using InverseFloatOnlyTestTypes = ::testing::Types< , std::tuple< double, double, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< float, float, std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< float, float, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< double, double, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsNoSize.cpp b/unitTests/testTensorOpsNoSize.cpp index b08e5ae1..8c1112d4 100644 --- a/unitTests/testTensorOpsNoSize.cpp +++ b/unitTests/testTensorOpsNoSize.cpp @@ -349,7 +349,7 @@ using NoSizeTestTypes = ::testing::Types< std::tuple< double, serialPolicy > , std::tuple< int, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, parallelDevicePolicy< 32 > > , std::tuple< int, parallelDevicePolicy< 32 > > #endif diff --git a/unitTests/testTensorOpsOneSize.cpp b/unitTests/testTensorOpsOneSize.cpp index fc351c75..78946638 100644 --- a/unitTests/testTensorOpsOneSize.cpp +++ b/unitTests/testTensorOpsOneSize.cpp @@ -693,7 +693,7 @@ using OneSizeTestTypes = ::testing::Types< , std::tuple< int, std::integral_constant< int, 3 >, serialPolicy > , std::tuple< double, std::integral_constant< int, 6 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, std::integral_constant< int, 2 >, parallelDevicePolicy< 32 > > , std::tuple< int, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > > , std::tuple< double, std::integral_constant< int, 6 >, parallelDevicePolicy< 32 > > diff --git a/unitTests/testTensorOpsThreeSizes.hpp b/unitTests/testTensorOpsThreeSizes.hpp index 5a27092a..b4546a9b 100644 --- a/unitTests/testTensorOpsThreeSizes.hpp +++ b/unitTests/testTensorOpsThreeSizes.hpp @@ -530,7 +530,7 @@ using ThreeSizesTestTypes = ::testing::Types< std::integral_constant< int, 3 >, serialPolicy > -#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI) +#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI) , std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, diff --git a/unitTests/testTensorOpsTwoSizes.hpp b/unitTests/testTensorOpsTwoSizes.hpp index 07978011..5492b2b5 100644 --- a/unitTests/testTensorOpsTwoSizes.hpp +++ b/unitTests/testTensorOpsTwoSizes.hpp @@ 
-930,7 +930,7 @@ using TwoSizesTestTypes = ::testing::Types<
 , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, serialPolicy >
 , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, serialPolicy >
-#if defined(LVARRAY_USE_CUDA) && defined(LVARRAY_USE_CHAI)
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
 , std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
 , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, parallelDevicePolicy< 32 > >
 , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
diff --git a/unitTests/testTensorOpsTwoSizes1.cpp b/unitTests/testTensorOpsTwoSizes1.cpp
index 7f5a97d5..96ac793c 100644
--- a/unitTests/testTensorOpsTwoSizes1.cpp
+++ b/unitTests/testTensorOpsTwoSizes1.cpp
@@ -6,13 +6,941 @@
 */
 
// Source includes
-#include "testTensorOpsTwoSizes.hpp"
+//#include "testTensorOpsTwoSizes.hpp"
+
+// Source includes
+#include "tensorOps.hpp"
+#include "Array.hpp"
+#include "testUtils.hpp"
+#include "output.hpp"
+#include "testTensorOpsCommon.hpp"
+
+// TPL includes
+#include <gtest/gtest.h>
 
namespace LvArray
{
namespace testing
{
 
+template< typename T_N_M_POLICY_TUPLE >
+class TwoSizesTest : public ::testing::Test
+{
+public:
+  using T = std::tuple_element_t< 0, T_N_M_POLICY_TUPLE >;
+  static constexpr std::ptrdiff_t N = std::tuple_element_t< 1, T_N_M_POLICY_TUPLE > {};
+  static constexpr std::ptrdiff_t M = std::tuple_element_t< 2, T_N_M_POLICY_TUPLE > {};
+  using POLICY = std::tuple_element_t< 3, T_N_M_POLICY_TUPLE >;
+
+  void SetUp() override
+  {
+    fill( m_matrixA_IJK.toSlice(), m_matrixASeed );
+    fill( m_matrixA_IKJ.toSlice(), m_matrixASeed );
+    fill( m_matrixA_KJI.toSlice(), m_matrixASeed );
+    fill( m_matrixA_local, m_matrixASeed );
+
+    fill( m_matrixB_IJK.toSlice(), m_matrixBSeed );
+    fill( m_matrixB_IKJ.toSlice(), m_matrixBSeed );
+    fill( m_matrixB_KJI.toSlice(), m_matrixBSeed );
+    fill( m_matrixB_local, m_matrixBSeed );
+
+    fill( m_matrixNN_IJK.toSlice(), m_matrixNNSeed );
+    fill( m_matrixNN_IKJ.toSlice(), m_matrixNNSeed );
+    fill( m_matrixNN_KJI.toSlice(), m_matrixNNSeed );
+    fill( m_matrixNN_local, m_matrixNNSeed );
+
+    fill( m_matrixMN_IJK.toSlice(), m_matrixMNSeed );
+    fill( m_matrixMN_IKJ.toSlice(), m_matrixMNSeed );
+    fill( m_matrixMN_KJI.toSlice(), m_matrixMNSeed );
+    fill( m_matrixMN_local, m_matrixMNSeed );
+
+    fill( m_vectorN_IJ.toSlice(), m_vectorNSeed );
+    fill( m_vectorN_JI.toSlice(), m_vectorNSeed );
+    fill( m_vectorN_local, m_vectorNSeed );
+
+    fill( m_vectorM_IJ.toSlice(), m_vectorMSeed );
+    fill( m_vectorM_JI.toSlice(), m_vectorMSeed );
+    fill( m_vectorM_local, m_vectorMSeed );
+  }
+
+  void testScale()
+  {
+    T scale = T( 3.14 );
+    T result[ N ][ M ];
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < M; ++j )
+      {
+        result[ i ][ j ] = m_matrixA_local[ i ][ j ] * scale;
+      }
+    }
+
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    std::ptrdiff_t const aSeed = m_matrixASeed;
+    forall< POLICY >( 1, [scale, result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, aSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      tensorOps::scale< N, M >( matrixA_IJK[ 0 ], scale );
+      CHECK_EQUALITY_2D( N, M, matrixA_IJK[ 0 ], result );
+
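+      // The same check is repeated below for each storage permutation and for a
+      // stack-local matrix: every layout must produce the identical scaled result.
+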
tensorOps::scale< N, M >( matrixA_IKJ[ 0 ], scale ); + CHECK_EQUALITY_2D( N, M, matrixA_IKJ[ 0 ], result ); + + tensorOps::scale< N, M >( matrixA_KJI[ 0 ], scale ); + CHECK_EQUALITY_2D( N, M, matrixA_KJI[ 0 ], result ); + + T matrix_local[ N ][ M ]; + fill( matrix_local, aSeed ); + tensorOps::scale< N, M >( matrix_local, scale ); + CHECK_EQUALITY_2D( N, M, matrix_local, result ); + } ); + } + + void testFill() + { + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + forall< POLICY >( 1, [matrixA_IJK, matrixA_IKJ, matrixA_KJI] LVARRAY_HOST_DEVICE ( int ) + { + for( int i = 0; i < 3; ++i ) + { + T const value = 3.14 * i; + tensorOps::fill< N, M >( matrixA_IJK[ 0 ], value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrixA_IJK( 0, j, k ), value ); + } + } + + tensorOps::fill< N, M >( matrixA_IKJ[ 0 ], value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrixA_IKJ( 0, j, k ), value ); + } + } + + tensorOps::fill< N, M >( matrixA_KJI[ 0 ], value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrixA_KJI( 0, j, k ), value ); + } + } + + T matrix_local[ N ][ M ]; + tensorOps::fill< N, M >( matrix_local, value ); + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + for( std::ptrdiff_t k = 0; k < M; ++k ) + { + PORTABLE_EXPECT_EQ( matrix_local[ j ][ k ], value ); + } + } + } + } ); + } + + void testAiBj() + { + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = m_vectorN_local[ i ] * m_vectorM_local[ j ]; + } + } + + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, vectorN_IJ, vectorN_JI, vectorN_local, + vectorM_IJ, vectorM_JI, vectorM_local, matrixSeed] LVARRAY_HOST_DEVICE ( int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( matrix, matrixSeed ); \ + tensorOps::Rij_eq_AiBj< N, M >( matrix, vectorN, vectorM ); \ + CHECK_EQUALITY_2D( N, M, matrix, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], 
vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testPlusAiBj() + { + T result[ N ][ M ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + result[ i ][ j ] = m_matrixA_local[ i ][ j ] + m_vectorN_local[ i ] * m_vectorM_local[ j ]; + } + } + + ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView(); + ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView(); + ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const matrixSeed = m_matrixASeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, vectorN_IJ, vectorN_JI, vectorN_local, vectorM_IJ, vectorM_JI, vectorM_local, matrixSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( matrix, matrixSeed ); \ + tensorOps::Rij_add_AiBj< N, M >( matrix, vectorN, vectorM ); \ + CHECK_EQUALITY_2D( N, M, matrix, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T matrix_local[ N ][ M ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], 
vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testAijBj() + { + T result[ N ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + dot += m_matrixA_local[ i ][ j ] * m_vectorM_local[ j ]; + } + result[ i ] = dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorN_JI = m_vectorN_JI.toView(); + + ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const vectorNSeed = m_vectorNSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorM_IJ, vectorM_JI, vectorM_local, vectorNSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorN, vectorNSeed ); \ + tensorOps::Ri_eq_AijBj< N, M >( vectorN, matrix, vectorM ); \ + CHECK_EQUALITY_1D( N, vectorN, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T vectorN_local[ N ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testPlusAijBj() + { + T result[ N ]; + for( std::ptrdiff_t i = 0; i < N; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < M; ++j ) + { + dot += m_matrixA_local[ i ][ j ] * m_vectorM_local[ j ]; + } + result[ i ] = m_vectorN_local[ i ] + dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorN_JI = m_vectorN_JI.toView(); + + 
ArrayViewT< T const, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorM_JI = m_vectorM_JI.toViewConst(); + T const ( &vectorM_local )[ M ] = m_vectorM_local; + + std::ptrdiff_t const vectorNSeed = m_vectorNSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorM_IJ, vectorM_JI, vectorM_local, vectorNSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorN, vectorNSeed ); \ + tensorOps::Ri_add_AijBj< N, M >( vectorN, matrix, vectorM ); \ + CHECK_EQUALITY_1D( N, vectorN, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, vectorN, vectorM1 ); \ + _TEST( matrix, vectorN, vectorM2 ) + + T vectorN_local[ N ]; + + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local ); + + #undef _TEST_PERMS + #undef _TEST + } ); + } + + void testAjiBj() + { + T result[ M ]; + for( std::ptrdiff_t i = 0; i < M; ++i ) + { + T dot = 0; + for( std::ptrdiff_t j = 0; j < N; ++j ) + { + dot += m_matrixA_local[ j ][ i ] * m_vectorN_local[ j ]; + } + result[ i ] = dot; + } + + ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst(); + ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst(); + ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst(); + T const ( &matrix_local )[ N ][ M ] = m_matrixA_local; + + ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst(); + ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst(); + T const ( &vectorN_local )[ N ] = m_vectorN_local; + + ArrayViewT< T, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toView(); + ArrayViewT< T, 2, 0 > const vectorM_JI = m_vectorM_JI.toView(); + + std::ptrdiff_t const vectorMSeed = m_vectorMSeed; + + forall< POLICY >( 1, + [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI, vectorN_local, vectorM_IJ, vectorM_JI, vectorMSeed] LVARRAY_HOST_DEVICE ( + int ) + { + #define _TEST( matrix, vectorN, vectorM ) \ + fill( vectorM, vectorMSeed ); \ + tensorOps::Ri_eq_AjiBj< M, N >( vectorM, matrix, vectorN ); \ + CHECK_EQUALITY_1D( M, vectorM, result ) + + #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \ + _TEST( matrix, vectorN, vectorM0 ); \ + _TEST( matrix, 
vectorN, vectorM1 ); \
+        _TEST( matrix, vectorN, vectorM2 )
+
+      T vectorM_local[ M ];
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testPlusAjiBj()
+  {
+    T result[ M ];
+    for( std::ptrdiff_t i = 0; i < M; ++i )
+    {
+      T dot = 0;
+      for( std::ptrdiff_t j = 0; j < N; ++j )
+      {
+        dot += m_matrixA_local[ j ][ i ] * m_vectorN_local[ j ];
+      }
+      result[ i ] = m_vectorM_local[ i ] + dot;
+    }
+
+    ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst();
+    T const ( &matrix_local )[ N ][ M ] = m_matrixA_local;
+
+    ArrayViewT< T const, 2, 1 > const vectorN_IJ = m_vectorN_IJ.toViewConst();
+    ArrayViewT< T const, 2, 0 > const vectorN_JI = m_vectorN_JI.toViewConst();
+    T const ( &vectorN_local )[ N ] = m_vectorN_local;
+
+    ArrayViewT< T, 2, 1 > const vectorM_IJ = m_vectorM_IJ.toView();
+    ArrayViewT< T, 2, 0 > const vectorM_JI = m_vectorM_JI.toView();
+
+    std::ptrdiff_t const vectorMSeed = m_vectorMSeed;
+
+    forall< POLICY >( 1,
+                      [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrix_local, vectorN_IJ, vectorN_JI,
+                       vectorN_local, vectorM_IJ, vectorM_JI, vectorMSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( matrix, vectorN, vectorM ) \
+        fill( vectorM, vectorMSeed ); \
+        tensorOps::Ri_add_AjiBj< M, N >( vectorM, matrix, vectorN ); \
+        CHECK_EQUALITY_1D( M, vectorM, result )
+
+      #define _TEST_PERMS( matrix, vectorN, vectorM0, vectorM1, vectorM2 ) \
+        _TEST( matrix, vectorN, vectorM0 ); \
+        _TEST( matrix, vectorN, vectorM1 ); \
+        _TEST( matrix, vectorN, vectorM2 )
+
+      T vectorM_local[ M ];
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrix_local, vectorN_IJ[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrix_local, vectorN_JI[ 0 ], vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+      _TEST_PERMS( matrix_local, vectorN_local, vectorM_IJ[ 0 ], vectorM_JI[ 0 ], vectorM_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testCopy()
+  {
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst();
+    T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local;
+
+    std::ptrdiff_t const matrixSeed = m_matrixASeed;
+
+    forall< POLICY >( 1, [matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( dstMatrix, srcMatrix ) \
+        fill( dstMatrix, matrixSeed ); \
+        tensorOps::copy< N, M >( dstMatrix, srcMatrix ); \
+        CHECK_EQUALITY_2D( N, M, dstMatrix, srcMatrix )
+
+      #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \
+        _TEST( dstMatrix, srcMatrix0 ); \
+        _TEST( dstMatrix, srcMatrix1 ); \
+        _TEST( dstMatrix, srcMatrix2 ); \
+        _TEST( dstMatrix, srcMatrix3 )
+
+      T matrix_local[ N ][ M ];
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testScaledCopy()
+  {
+    T const scale = T( 3.14 );
+    T result[ N ][ M ];
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < M; ++j )
+      {
+        result[ i ][ j ] = scale * m_matrixB_local[ i ][ j ];
+      }
+    }
+
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst();
+    T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local;
+
+    std::ptrdiff_t const matrixSeed = m_matrixASeed;
+
+    forall< POLICY >( 1, [scale, result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( dstMatrix, srcMatrix ) \
+        fill( dstMatrix, matrixSeed ); \
+        tensorOps::scaledCopy< N, M >( dstMatrix, srcMatrix, scale ); \
+        CHECK_EQUALITY_2D( N, M, dstMatrix, result )
+
+      #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \
+        _TEST( dstMatrix, srcMatrix0 ); \
+        _TEST( dstMatrix, srcMatrix1 ); \
+        _TEST( dstMatrix, srcMatrix2 ); \
+        _TEST( dstMatrix, srcMatrix3 )
+
+      T matrix_local[ N ][ M ];
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testAdd()
+  {
+    T result[ N ][ M ];
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < M; ++j )
+      {
+        result[ i ][ j ] = m_matrixA_local[ i ][ j ] + m_matrixB_local[ i ][ j ];
+      }
+    }
+
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst();
+    T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local;
+
+    std::ptrdiff_t const matrixSeed = m_matrixASeed;
+
+    forall< POLICY >( 1, [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( dstMatrix, srcMatrix ) \
+        fill( dstMatrix, matrixSeed ); \
+        tensorOps::add< N, M >( dstMatrix, srcMatrix ); \
+        CHECK_EQUALITY_2D( N, M, dstMatrix, result )
+
+      #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \
+        _TEST( dstMatrix, srcMatrix0 ); \
+        _TEST( dstMatrix, srcMatrix1 ); \
+        _TEST( dstMatrix, srcMatrix2 ); \
+        _TEST( dstMatrix, srcMatrix3 )
+
+      T matrix_local[ N ][ M ];
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrix_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testScaledAdd()
+  {
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    ArrayViewT< T const, 3, 2 > const matrixB_IJK_view = m_matrixB_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixB_IKJ_view = m_matrixB_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixB_KJI_view = m_matrixB_KJI.toViewConst();
+
+    T const ( &matrixB_local )[ N ][ M ] = m_matrixB_local;
+
+    std::ptrdiff_t const matrixSeed = m_matrixASeed;
+
+    forall< POLICY >( 1, [matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixB_IJK_view, matrixB_IKJ_view, matrixB_KJI_view, matrixB_local, matrixSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( dstMatrix, srcMatrix ) \
+        fill( dstMatrix, matrixSeed ); \
+        tensorOps::scaledAdd< N, M >( dstMatrix, srcMatrix, scale ); \
+        CHECK_EQUALITY_2D( N, M, dstMatrix, result )
+
+      #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \
+        _TEST( dstMatrix, srcMatrix0 ); \
+        _TEST( dstMatrix, srcMatrix1 ); \
+        _TEST( dstMatrix, srcMatrix2 ); \
+        _TEST( dstMatrix, srcMatrix3 )
+
+      T matrixA_local[ N ][ M ];
+      fill( matrixA_local, matrixSeed );
+
+      T const scale = T( 3.14 );
+      T result[ N ][ M ];
+      for( std::ptrdiff_t i = 0; i < N; ++i )
+      {
+        for( std::ptrdiff_t j = 0; j < M; ++j )
+        {
+          result[ i ][ j ] = matrixA_local[ i ][ j ] + scale * matrixB_local[ i ][ j ];
+        }
+      }
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+      _TEST_PERMS( matrixA_local, matrixB_IJK_view[ 0 ], matrixB_IKJ_view[ 0 ], matrixB_KJI_view[ 0 ], matrixB_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testAkiAkj()
+  {
+    T result[ N ][ N ];
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < N; ++j )
+      {
+        T dot = 0;
+        for( std::ptrdiff_t k = 0; k < M; ++k )
+        {
+          dot += m_matrixMN_local[ k ][ i ] * m_matrixMN_local[ k ][ j ];
+        }
+        result[ i ][ j ] = dot;
+      }
+    }
+
+    ArrayViewT< T const, 3, 2 > const matrixMN_IJK = m_matrixMN_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixMN_IKJ = m_matrixMN_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixMN_KJI = m_matrixMN_KJI.toViewConst();
+    T const ( &matrixMN_local )[ M ][ N ] = m_matrixMN_local;
+
+    ArrayViewT< T, 3, 2 > const matrixNN_IJK = m_matrixNN_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixNN_IKJ = m_matrixNN_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixNN_KJI = m_matrixNN_KJI.toView();
+
+    std::ptrdiff_t const matrixNNSeed = m_matrixNNSeed;
+
+    forall< POLICY >( 1,
+                      [result, matrixMN_IJK, matrixMN_IKJ, matrixMN_KJI, matrixMN_local, matrixNN_IJK,
+                       matrixNN_IKJ, matrixNN_KJI, matrixNNSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( matrixNN, matrixMN ) \
+        fill( matrixNN, matrixNNSeed ); \
+        tensorOps::Rij_eq_AkiAkj< N, M >( matrixNN, matrixMN ); \
+        CHECK_EQUALITY_2D( N, N, matrixNN, result )
+
+      #define _TEST_PERMS( matrixNN, matrixMN0, matrixMN1, matrixMN2, matrixMN3 ) \
+        _TEST( matrixNN, matrixMN0 ); \
+        _TEST( matrixNN, matrixMN1 ); \
+        _TEST( matrixNN, matrixMN2 ); \
+        _TEST( matrixNN, matrixMN3 )
+
+      T matrixNN_local[ N ][ N ];
+
+      _TEST_PERMS( matrixNN_IJK[ 0 ], matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixNN_IKJ[ 0 ], matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixNN_KJI[ 0 ], matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixNN_local, matrixMN_IJK[ 0 ], matrixMN_IKJ[ 0 ], matrixMN_KJI[ 0 ], matrixMN_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testPlusAikAjk()
+  {
+    T result[ N ][ N ];
+    for( std::ptrdiff_t i = 0; i < N; ++i )
+    {
+      for( std::ptrdiff_t j = 0; j < N; ++j )
+      {
+        T dot = 0;
+        for( std::ptrdiff_t k = 0; k < M; ++k )
+        {
+          dot += m_matrixA_local[ i ][ k ] * m_matrixA_local[ j ][ k ];
+        }
+        result[ i ][ j ] = m_matrixNN_local[ i ][ j ] + dot;
+      }
+    }
+
+    ArrayViewT< T const, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toViewConst();
+    T const ( &matrixA_local )[ N ][ M ] = m_matrixA_local;
+
+    ArrayViewT< T, 3, 2 > const matrixNN_IJK = m_matrixNN_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixNN_IKJ = m_matrixNN_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixNN_KJI = m_matrixNN_KJI.toView();
+
+    std::ptrdiff_t const matrixNNSeed = m_matrixNNSeed;
+
+    forall< POLICY >( 1,
+                      [result, matrixA_IJK, matrixA_IKJ, matrixA_KJI, matrixA_local, matrixNN_IJK,
+                       matrixNN_IKJ, matrixNN_KJI, matrixNNSeed] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( matrixNN, matrixA ) \
+        fill( matrixNN, matrixNNSeed ); \
+        tensorOps::Rij_add_AikAjk< N, M >( matrixNN, matrixA ); \
+        CHECK_EQUALITY_2D( N, N, matrixNN, result )
+
+      #define _TEST_PERMS( matrixNN, matrixA0, matrixA1, matrixA2, matrixA3 ) \
+        _TEST( matrixNN, matrixA0 ); \
+        _TEST( matrixNN, matrixA1 ); \
+        _TEST( matrixNN, matrixA2 ); \
+        _TEST( matrixNN, matrixA3 )
+
+      T matrixNN_local[ N ][ N ];
+
+      _TEST_PERMS( matrixNN_IJK[ 0 ], matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local );
+      _TEST_PERMS( matrixNN_IKJ[ 0 ], matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local );
+      _TEST_PERMS( matrixNN_KJI[ 0 ], matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local );
+      _TEST_PERMS( matrixNN_local, matrixA_IJK[ 0 ], matrixA_IKJ[ 0 ], matrixA_KJI[ 0 ], matrixA_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+  void testTranspose()
+  {
+    ArrayViewT< T, 3, 2 > const matrixA_IJK = m_matrixA_IJK.toView();
+    ArrayViewT< T, 3, 1 > const matrixA_IKJ = m_matrixA_IKJ.toView();
+    ArrayViewT< T, 3, 0 > const matrixA_KJI = m_matrixA_KJI.toView();
+
+    ArrayViewT< T const, 3, 2 > const matrixMN_IJK_view = m_matrixMN_IJK.toViewConst();
+    ArrayViewT< T const, 3, 1 > const matrixMN_IKJ_view = m_matrixMN_IKJ.toViewConst();
+    ArrayViewT< T const, 3, 0 > const matrixMN_KJI_view = m_matrixMN_KJI.toViewConst();
+    T const ( &matrixMN_local )[ M ][ N ] = m_matrixMN_local;
+
+    std::ptrdiff_t const matrixSeed = m_matrixASeed;
+
+    forall< POLICY >( 1, [=] LVARRAY_HOST_DEVICE ( int )
+    {
+      #define _TEST( dstMatrix, srcMatrix ) \
+        fill( dstMatrix, matrixSeed ); \
+        tensorOps::transpose< N, M >( dstMatrix, srcMatrix ); \
+        for( int i = 0; i < N; ++i ) \
+        { \
+          for( int j = 0; j < M; ++j ) \
+          { \
+            PORTABLE_EXPECT_EQ( dstMatrix[ i ][ j ], srcMatrix[ j ][ i ] ); \
+          } \
+        }
+
+      #define _TEST_PERMS( dstMatrix, srcMatrix0, srcMatrix1, srcMatrix2, srcMatrix3 ) \
+        _TEST( dstMatrix, srcMatrix0 ); \
+        _TEST( dstMatrix, srcMatrix1 ); \
+        _TEST( dstMatrix, srcMatrix2 ); \
+        _TEST( dstMatrix, srcMatrix3 )
+
+      T matrix_local[ N ][ M ];
+
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_IJK[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_IKJ[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrixA_KJI[ 0 ], matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrix_local, matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrix_local, matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+      _TEST_PERMS( matrix_local, matrixMN_IJK_view[ 0 ], matrixMN_IKJ_view[ 0 ], matrixMN_KJI_view[ 0 ], matrixMN_local );
+
+      #undef _TEST_PERMS
+      #undef _TEST
+    } );
+  }
+
+private:
+  std::ptrdiff_t const m_matrixASeed = 0;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixA_IJK { 1, N, M };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixA_IKJ { 1, N, M };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixA_KJI { 1, N, M };
+  T m_matrixA_local[ N ][ M ];
+
+  std::ptrdiff_t const m_matrixBSeed = m_matrixASeed + N * M;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixB_IJK { 1, N, M };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixB_IKJ { 1, N, M };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixB_KJI { 1, N, M };
+  T m_matrixB_local[ N ][ M ];
+
+  std::ptrdiff_t const m_matrixNNSeed = m_matrixBSeed + N * M;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixNN_IJK { 1, N, N };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixNN_IKJ { 1, N, N };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixNN_KJI { 1, N, N };
+  T m_matrixNN_local[ N ][ N ];
+
+  std::ptrdiff_t const m_matrixMNSeed = m_matrixNNSeed + N * N;
+  ArrayT< T, RAJA::PERM_IJK > m_matrixMN_IJK { 1, M, N };
+  ArrayT< T, RAJA::PERM_IKJ > m_matrixMN_IKJ { 1, M, N };
+  ArrayT< T, RAJA::PERM_KJI > m_matrixMN_KJI { 1, M, N };
+  T m_matrixMN_local[ M ][ N ];
+
+  std::ptrdiff_t const m_vectorNSeed = m_matrixMNSeed + N * M;
+  ArrayT< T, RAJA::PERM_IJ > m_vectorN_IJ { 1, N };
+  ArrayT< T, RAJA::PERM_JI > m_vectorN_JI { 1, N };
+  T m_vectorN_local[ N ];
+
+  std::ptrdiff_t const m_vectorMSeed = m_vectorNSeed + N;
+  ArrayT< T, RAJA::PERM_IJ > m_vectorM_IJ { 1, M };
+  ArrayT< T, RAJA::PERM_JI > m_vectorM_JI { 1, M };
+  T m_vectorM_local[ M ];
+};
+
+
+using TwoSizesTestTypes = ::testing::Types<
+  std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, serialPolicy >
+  , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, serialPolicy >
+  , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, serialPolicy >
+
+#if ( defined(LVARRAY_USE_CUDA) || defined(LVARRAY_USE_HIP) ) && defined(LVARRAY_USE_CHAI)
+  , std::tuple< double, std::integral_constant< int, 2 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
+  , std::tuple< int, std::integral_constant< int, 5 >, std::integral_constant< int, 4 >, parallelDevicePolicy< 32 > >
+  , std::tuple< double, std::integral_constant< int, 3 >, std::integral_constant< int, 3 >, parallelDevicePolicy< 32 > >
+#endif
+  >;
+
+TYPED_TEST_SUITE( TwoSizesTest, TwoSizesTestTypes, );
+
 TYPED_TEST( TwoSizesTest, scale )
 {
   this->testScale();
diff --git a/unitTests/testTypeManipulation.cpp b/unitTests/testTypeManipulation.cpp
index 5bef9a4c..494fb038 100644
--- a/unitTests/testTypeManipulation.cpp
+++ b/unitTests/testTypeManipulation.cpp
@@ -78,6 +78,23 @@ CUDA_TEST( typeManipulation, forEachArg )
     }, intReducer, floatReducer, doubleReducer );
   } );
+  EXPECT_EQ( intReducer.get(), 2 );
+  EXPECT_EQ( floatReducer.get(), 4 );
+  EXPECT_EQ( doubleReducer.get(), 7 );
+#elif defined(LVARRAY_USE_HIP)
+  // Test on device.
+  RAJA::ReduceSum< RAJA::hip_reduce, int > intReducer( 1 );
+  RAJA::ReduceSum< RAJA::hip_reduce, float > floatReducer( 3 );
+  RAJA::ReduceSum< RAJA::hip_reduce, double > doubleReducer( 6 );
+  forall< parallelDevicePolicy< 32 > >( 1, [intReducer, floatReducer, doubleReducer] LVARRAY_DEVICE ( int )
+  {
+    // This has to be a host-device lambda to avoid errors.
+    typeManipulation::forEachArg( [] LVARRAY_HOST_DEVICE ( auto & reducer )
+    {
+      reducer += 1;
+    }, intReducer, floatReducer, doubleReducer );
+  } );
+
   EXPECT_EQ( intReducer.get(), 2 );
   EXPECT_EQ( floatReducer.get(), 4 );
   EXPECT_EQ( doubleReducer.get(), 7 );
 
diff --git a/unitTests/testUtils.hpp b/unitTests/testUtils.hpp
index 5a2db2bf..a4a3efa1 100644
--- a/unitTests/testUtils.hpp
+++ b/unitTests/testUtils.hpp
@@ -20,6 +20,7 @@
 // TPL includes
 #include
+#include
 #include
 
 // System includes
 
@@ -72,6 +73,19 @@ struct RAJAHelper< RAJA::cuda_exec< N > >
   static constexpr MemorySpace space = MemorySpace::cuda;
 };
 
+#elif defined(LVARRAY_USE_HIP)
+
+template< unsigned long THREADS_PER_BLOCK >
+using parallelDevicePolicy = RAJA::hip_exec< THREADS_PER_BLOCK >;
+
+template< unsigned long N >
+struct RAJAHelper< RAJA::hip_exec< N > >
+{
+  using ReducePolicy = RAJA::hip_reduce;
+  using AtomicPolicy = RAJA::hip_atomic;
+  static constexpr MemorySpace space = MemorySpace::hip;
+};
+
 #endif
 
 template< typename POLICY, typename INDEX_TYPE, typename LAMBDA >
@@ -103,14 +117,14 @@ LAYOUT const & getRAJAViewLayout( RAJA::View< T, LAYOUT > const & view )
 }
 
 
-#ifndef __CUDA_ARCH__
-#define PORTABLE_EXPECT_EQ( L, R ) EXPECT_EQ( L, R )
-#define PORTABLE_EXPECT_NEAR( L, R, EPSILON ) EXPECT_LE( math::abs( ( L ) -( R ) ), EPSILON ) << \
-  STRINGIZE( L ) " = " << ( L ) << "\n" << STRINGIZE( R ) " = " << ( R );
-#else
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 #define PORTABLE_EXPECT_EQ( L, R ) LVARRAY_ERROR_IF_NE( L, R )
 #define PORTABLE_EXPECT_NEAR( L, R, EPSILON ) LVARRAY_ERROR_IF_GE_MSG( math::abs( ( L ) -( R ) ), EPSILON, \
                                                                        STRINGIZE( L ) " = " << ( L ) << "\n" << STRINGIZE( R ) " = " << ( R ) );
+#else
+#define PORTABLE_EXPECT_EQ( L, R ) EXPECT_EQ( L, R )
+#define PORTABLE_EXPECT_NEAR( L, R, EPSILON ) EXPECT_LE( math::abs( ( L ) -( R ) ), EPSILON ) << \
+  STRINGIZE( L ) " = " << ( L ) << "\n" << STRINGIZE( R ) " = " << ( R );
 #endif
 
 // Comparator that compares a std::pair by it's first object.
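
A note on the last testUtils.hpp hunk: the PORTABLE_EXPECT_* macros are selected per compilation pass, not per build. nvcc defines __CUDA_ARCH__ and hipcc defines __HIP_DEVICE_COMPILE__ only while compiling device code, so the same macro name resolves to a device-safe check in device code and to a gtest assertion on the host. A minimal standalone sketch of the same dispatch idea (the CHECK_EQ name is hypothetical and not part of the diff):

#include <cassert>
#include <gtest/gtest.h>

// Device compilation pass: gtest is unusable in device code, so fall back to a
// device-safe assert. Host compilation pass: use the normal gtest assertion.
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
  #define CHECK_EQ( a, b ) assert( ( a ) == ( b ) )
#else
  #define CHECK_EQ( a, b ) EXPECT_EQ( a, b )
#endif

Because a LVARRAY_HOST_DEVICE lambda is compiled once for the host and once for the device, a single test body written against CHECK_EQ gets the right behavior in both passes without any runtime branching.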
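
Likewise, the RAJAHelper specialization added for RAJA::hip_exec mirrors the existing CUDA one: it maps an execution policy to the matching reduction policy, atomic policy, and memory space so the tests can stay policy-generic. A rough sketch of how such a trait is consumed (sumOnPolicy is hypothetical; RAJAHelper, forall, and LVARRAY_HOST_DEVICE are the testUtils.hpp utilities, and for the device policies `values` must point to device-reachable memory, e.g. a CHAI-managed allocation):

// Sum the first n entries of an array under any registered policy. The
// reduction policy is looked up from the execution policy via RAJAHelper, so
// the same code runs with serialPolicy, RAJA::cuda_exec, or RAJA::hip_exec.
template< typename POLICY, typename T >
T sumOnPolicy( T const * const values, std::ptrdiff_t const n )
{
  RAJA::ReduceSum< typename RAJAHelper< POLICY >::ReducePolicy, T > total( 0 );
  forall< POLICY >( n, [total, values] LVARRAY_HOST_DEVICE ( std::ptrdiff_t const i )
  {
    total += values[ i ];  // reducers are captured by value and combined by RAJA
  } );
  return total.get();
}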