From b7a322deb6d5b4be1021153991a10108f8c9e84c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 7 Jul 2020 19:22:41 -0700 Subject: [PATCH 01/41] [CI] Add RMM as an optional dependency --- CMakeLists.txt | 7 ++++++ Jenkinsfile | 34 +++++++++++++++++++++++++++ cmake/ExternalLibs.cmake | 27 +++++++++++++++++++++ src/common/device_helpers.cuh | 33 ++++++++++++++++++++++---- tests/ci_build/Dockerfile.rmm | 39 +++++++++++++++++++++++++++++++ tests/ci_build/build_via_cmake.sh | 13 ++++++++++- 6 files changed, 147 insertions(+), 6 deletions(-) create mode 100644 cmake/ExternalLibs.cmake create mode 100644 tests/ci_build/Dockerfile.rmm diff --git a/CMakeLists.txt b/CMakeLists.txt index d8190c5fb809..830f2669ec21 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,7 @@ option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF) option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF) set(GPU_COMPUTE_VER "" CACHE STRING "Semicolon separated list of compute versions to be built against, e.g. '35;61'") +option(USE_RMM "Build with RAPIDS Memory Manager (RMM)" OFF) ## Copied From dmlc option(USE_HDFS "Build with HDFS support" OFF) option(USE_AZURE "Build with AZURE support" OFF) @@ -79,6 +80,9 @@ endif (R_LIB AND GOOGLE_TEST) if (USE_AVX) message(SEND_ERROR "The option 'USE_AVX' is deprecated as experimental AVX features have been removed from XGBoost.") endif (USE_AVX) +if (USE_RMM AND NOT (USE_CUDA)) + message(SEND_ERROR "`USE_RMM` must be enabled with `USE_CUDA` flag.") +endif (USE_RMM AND NOT (USE_CUDA)) #-- Sanitizer if (USE_SANITIZER) @@ -170,6 +174,9 @@ endif (R_LIB) # Plugin add_subdirectory(${xgboost_SOURCE_DIR}/plugin) +# 3rd-party libs +include(cmake/ExternalLibs.cmake) + #-- library if (BUILD_STATIC_LIB) add_library(xgboost STATIC) diff --git a/Jenkinsfile b/Jenkinsfile index 40db6f9c0fe9..2c9c40b4480e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -66,6 +66,7 @@ pipeline { 'build-cpu-non-omp': { BuildCPUNonOmp() }, 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') }, 'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') }, + 'build-gpu-rmm-cuda10.2': { BuildCUDAWithRMM(cuda_version: '10.2') }, 'build-jvm-packages': { BuildJVMPackages(spark_version: '2.4.3') }, 'build-jvm-doc': { BuildJVMDoc() } ]) @@ -84,6 +85,7 @@ pipeline { 'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) }, 'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') }, 'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) }, + 'test-rmm-cpp-gpu': { TestCppGPUWithRMM(cuda_version: '10.2') }, 'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') }, 'test-jvm-jdk11': { CrossTestJVMwithJDK(jdk_version: '11') }, 'test-jvm-jdk12': { CrossTestJVMwithJDK(jdk_version: '12') }, @@ -262,6 +264,22 @@ def BuildCUDA(args) { } } +def BuildCUDAWithRMM(args) { + node('linux && cpu_build') { + unstash name: 'srcs' + echo "Build with CUDA ${args.cuda_version} and RMM" + def container_type = "rmm" + def docker_binary = "docker" + def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" + sh """ + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=rmm_test -DUSE_CUDA=ON -DUSE_RMM=ON + """ + echo 'Stashing C++ test executable (testxgboost)...' 
+ stash name: 'xgboost_rmm_cpp_tests', includes: 'build/testxgboost' + deleteDir() + } +} + def BuildJVMPackages(args) { node('linux && cpu') { unstash name: 'srcs' @@ -368,6 +386,22 @@ def TestCppGPU(args) { } } +def TestCppGPUWithRMM(args) { + node('linux && gpu') { + unstash name: 'xgboost_rmm_cpp_tests' + unstash name: 'srcs' + echo "Test C++, CUDA ${args.cuda_version} with RMM" + def container_type = "rmm" + def docker_binary = "nvidia-docker" + def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" + echo "Using a single GPU" + sh """ + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate rmm_test && build/testxgboost --gtest_filter=-*.MGPU_*" + """ + deleteDir() + } +} + def CrossTestJVMwithJDK(args) { node('linux && cpu') { unstash name: 'xgboost4j_jar' diff --git a/cmake/ExternalLibs.cmake b/cmake/ExternalLibs.cmake new file mode 100644 index 000000000000..7181699d508f --- /dev/null +++ b/cmake/ExternalLibs.cmake @@ -0,0 +1,27 @@ +# RMM +if (USE_RMM) + # Use Conda env if available + if(DEFINED ENV{CONDA_PREFIX}) + set(CMAKE_PREFIX_PATH "$ENV{CONDA_PREFIX};${CMAKE_PREFIX_PATH}") + message(STATUS "Detected Conda environment, CMAKE_PREFIX_PATH set to: ${CMAKE_PREFIX_PATH}") + else() + message(STATUS "No Conda environment detected") + endif() + + find_path(RMM_INCLUDE "rmm" + HINTS "$ENV{RMM_ROOT}/include") + + find_library(RMM_LIBRARY "rmm" + HINTS "$ENV{RMM_ROOT}/lib" "$ENV{RMM_ROOT}/build") + + if ((NOT RMM_LIBRARY) OR (NOT RMM_INCLUDE)) + message(FATAL_ERROR "Could not locate RMM library") + endif () + + message(STATUS "RMM: RMM_LIBRARY set to ${RMM_LIBRARY}") + message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}") + + target_include_directories(objxgboost PUBLIC ${RMM_INCLUDE}) + target_link_libraries(objxgboost PUBLIC ${RMM_LIBRARY} cuda) + target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1) +endif () diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index d339dc72d712..c948ce130450 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -36,7 +36,15 @@ #ifdef XGBOOST_USE_NCCL #include "nccl.h" -#endif +#endif // XGBOOST_USE_NCCL + +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include "rmm/mr/device/cuda_memory_resource.hpp" +#include "rmm/mr/device/default_memory_resource.hpp" +#include "rmm/mr/device/device_memory_resource.hpp" +#include "rmm/mr/device/pool_memory_resource.hpp" +#include "rmm/mr/device/thrust_allocator_adaptor.hpp" +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__) @@ -370,12 +378,21 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) { } namespace detail { + +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator; +#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBBaseDeviceAllocator = thrust::device_malloc_allocator; +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + /** * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose. 
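As context for the allocator change above: the patch selects XGBoost's base device allocator at compile time, using RMM's Thrust adaptor when XGBOOST_USE_RMM is defined and Thrust's plain cudaMalloc/cudaFree allocator otherwise. A minimal standalone sketch of that pattern follows; it assumes the RMM 0.14-era headers and rmm::mr::thrust_allocator API referenced in this diff, and the BaseDeviceAllocator/DeviceVector names are illustrative rather than the upstream identifiers.

    // Sketch of the compile-time allocator switch (illustrative, assumes the RMM 0.14-era API).
    #include <thrust/device_malloc_allocator.h>
    #include <thrust/device_vector.h>

    #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
    #include <rmm/mr/device/thrust_allocator_adaptor.hpp>
    // With RMM enabled, allocations are routed through the currently set RMM
    // device memory resource via the Thrust adaptor.
    template <typename T>
    using BaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
    #else
    // Without RMM, fall back to plain cudaMalloc/cudaFree through Thrust.
    template <typename T>
    using BaseDeviceAllocator = thrust::device_malloc_allocator<T>;
    #endif

    // Any vector built on the alias follows the build-time choice above.
    template <typename T>
    using DeviceVector = thrust::device_vector<T, BaseDeviceAllocator<T>>;

Plugging the alias into thrust::device_vector like this is what lets the rest of the device code stay unchanged whichever backend is built in.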
*/ template -struct XGBDefaultDeviceAllocatorImpl : thrust::device_malloc_allocator { - using SuperT = thrust::device_malloc_allocator; +struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { + using SuperT = XGBBaseDeviceAllocator; using pointer = thrust::device_ptr; // NOLINT template struct rebind // NOLINT @@ -391,13 +408,19 @@ struct XGBDefaultDeviceAllocatorImpl : thrust::device_malloc_allocator { GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); return SuperT::deallocate(ptr, n); } +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + using cuda_mr = rmm::mr::cuda_memory_resource; + using pool_mr = rmm::mr::pool_memory_resource; + XGBDefaultDeviceAllocatorImpl() : SuperT(new pool_mr(new cuda_mr), cudaStream_t{0}) {} +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 }; /** - * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end and logs allocations if verbose. Does not initialise memory on construction. + * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end and logs + * allocations if verbose. Does not initialise memory on construction. */ template -struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator { +struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { using pointer = thrust::device_ptr; // NOLINT template struct rebind // NOLINT diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm new file mode 100644 index 000000000000..18730a9c0889 --- /dev/null +++ b/tests/ci_build/Dockerfile.rmm @@ -0,0 +1,39 @@ +ARG CUDA_VERSION +FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu18.04 + +# Environment +ENV DEBIAN_FRONTEND noninteractive +SHELL ["/bin/bash", "-c"] # Use Bash as shell + +# Install all basic requirements +RUN \ + apt-get update && \ + apt-get install -y wget unzip bzip2 libgomp1 build-essential ninja-build git && \ + # Python + wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash Miniconda3.sh -b -p /opt/python && \ + # CMake + wget -nv -nc https://cmake.org/files/v3.13/cmake-3.13.0-Linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.13.0-Linux-x86_64.sh --skip-license --prefix=/usr + +ENV PATH=/opt/python/bin:$PATH + +# Create new Conda environment with RMM +RUN \ + conda create -n rmm_test -c nvidia -c rapidsai -c conda-forge -c defaults \ + python=3.7 rmm=0.14 cudatoolkit=$CUDA_VERSION + +ENV GOSU_VERSION 1.10 + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/build_via_cmake.sh b/tests/ci_build/build_via_cmake.sh index 98808141b0e8..8fc0c0acc66f 100755 --- a/tests/ci_build/build_via_cmake.sh +++ b/tests/ci_build/build_via_cmake.sh @@ -1,10 +1,21 @@ #!/usr/bin/env bash set -e +if [[ "$1" == --conda-env=* ]] +then + conda_env=$(echo "$1" | sed 's/^--conda-env=//g' -) + echo "Activating Conda environment ${conda_env}" + shift 1 + cmake_args="$@" + source activate ${conda_env} +else + cmake_args="$@" +fi + rm -rf build mkdir build cd build -cmake .. "$@" -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON +cmake .. 
${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON make clean make -j$(nproc) cd .. From e15845d4e72e890c2babe31a988b26503a7d9038 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 8 Jul 2020 16:08:59 -0700 Subject: [PATCH 02/41] Replace caching allocator with pool allocator from RMM --- src/common/device_helpers.cuh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index c948ce130450..3870f6082d2d 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -420,7 +420,7 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { * allocations if verbose. Does not initialise memory on construction. */ template -struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { +struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator { using pointer = thrust::device_ptr; // NOLINT template struct rebind // NOLINT @@ -462,8 +462,13 @@ using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; /*! Be careful that the initialization constructor is a no-op, which means calling * `vec.resize(n)` won't initialize the memory region to 0. Instead use * `vec.resize(n, 0)`*/ +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBCachingDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; +#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 template using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl; +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 /** \brief Specialisation of thrust device vector using custom allocator. */ template using device_vector = thrust::device_vector>; // NOLINT From 812c2092240e2e9011d319602656f8e26236b1f1 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 8 Jul 2020 21:43:52 -0700 Subject: [PATCH 03/41] Revert "Replace caching allocator with pool allocator from RMM" This reverts commit e15845d4e72e890c2babe31a988b26503a7d9038. --- src/common/device_helpers.cuh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 3870f6082d2d..e8054d9948b0 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -462,13 +462,8 @@ using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; /*! Be careful that the initialization constructor is a no-op, which means calling * `vec.resize(n)` won't initialize the memory region to 0. Instead use * `vec.resize(n, 0)`*/ -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -template -using XGBCachingDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; -#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 template using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl; -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 /** \brief Specialisation of thrust device vector using custom allocator. 
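The later commits in this series (patches 04 through 07 below) converge on routing these allocators through RMM's process-wide default resource instead of constructing a pool inside each allocator. Below is a minimal sketch of that setup, using only the RMM 0.14-era types already named in these diffs (cuda_memory_resource, pool_memory_resource, set_default_resource); the RMMPoolGuard wrapper and the main() scaffolding are illustrative additions, not upstream code.

    // Sketch: install an RMM pool as the default device memory resource
    // (illustrative; mirrors the ownership arrangement adopted later in this series).
    #include <rmm/mr/device/cuda_memory_resource.hpp>
    #include <rmm/mr/device/default_memory_resource.hpp>
    #include <rmm/mr/device/pool_memory_resource.hpp>

    struct RMMPoolGuard {
      rmm::mr::cuda_memory_resource cuda_mr;
      rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr;
      // Both resources live inside the guard, so they stay valid for as long
      // as the guard does; keep the guard alive for the whole process.
      RMMPoolGuard() : cuda_mr{}, pool_mr{&cuda_mr} {
        rmm::mr::set_default_resource(&pool_mr);
      }
    };

    int main() {
      RMMPoolGuard guard;  // callers of rmm::mr::get_default_resource() now draw from the pool
      // ... run training / tests here ...
      return 0;
    }

Keeping the guard on the stack in main() addresses the same lifetime concern the series works through in patches 06, 07 and 11: the default resource must outlive every allocation made through it.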
*/ template using device_vector = thrust::device_vector>; // NOLINT From a8911128d0dcf598e5ed50adf0db4d777a873d35 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 8 Jul 2020 21:48:41 -0700 Subject: [PATCH 04/41] Use rmm::mr::get_default_resource() --- src/common/device_helpers.cuh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index e8054d9948b0..ab40a46ba2d0 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -39,10 +39,7 @@ #endif // XGBOOST_USE_NCCL #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -#include "rmm/mr/device/cuda_memory_resource.hpp" #include "rmm/mr/device/default_memory_resource.hpp" -#include "rmm/mr/device/device_memory_resource.hpp" -#include "rmm/mr/device/pool_memory_resource.hpp" #include "rmm/mr/device/thrust_allocator_adaptor.hpp" #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 @@ -409,9 +406,7 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { return SuperT::deallocate(ptr, n); } #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - using cuda_mr = rmm::mr::cuda_memory_resource; - using pool_mr = rmm::mr::pool_memory_resource; - XGBDefaultDeviceAllocatorImpl() : SuperT(new pool_mr(new cuda_mr), cudaStream_t{0}) {} + XGBDefaultDeviceAllocatorImpl() : SuperT(rmm::mr::get_default_resource(), cudaStream_t{0}) {} #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 }; From b5eb54d8c71ca62bdf2d0012f4fcd17ecef133e9 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 10:05:59 -0700 Subject: [PATCH 05/41] Try setting default resource (doesn't work yet) --- tests/cpp/helpers.cc | 4 ++++ tests/cpp/helpers.cu | 12 ++++++++++++ tests/cpp/helpers.h | 2 ++ tests/cpp/test_main.cc | 3 +++ 4 files changed, 21 insertions(+) diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 2274e57e7307..e3b3ccb88194 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -478,4 +478,8 @@ std::unique_ptr CreateTrainedGBM( return gbm; } +#ifndef XGBOOST_USE_CUDA +void SetUpRMMResource() {} +#endif // XGBOOST_USE_CUDA + } // namespace xgboost diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index 9b70ea543231..7d240cb60bc7 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -4,6 +4,11 @@ #include "../../src/data/device_adapter.cuh" #include "../../src/data/iterative_device_dmatrix.h" +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include "rmm/mr/device/default_memory_resource.hpp" +#include "rmm/mr/device/cnmem_memory_resource.hpp" +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + namespace xgboost { CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows, @@ -40,4 +45,11 @@ std::shared_ptr RandomDataGenerator::GenerateDeviceDMatrix(bool with_la 0, bins_); return m; } + +void SetUpRMMResource() { +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + rmm::mr::cnmem_memory_resource pool_mr{}; + rmm::mr::set_default_resource(&pool_mr); +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +} } // namespace xgboost diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 0783b9a89dba..441e8a6b0c5d 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -352,5 +352,7 @@ inline int Next(DataIterHandle self) { return static_cast(self)->Next(); } +void SetUpRMMResource(); + } // namespace xgboost #endif diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc index d9f3e8f338a8..eaf2274250d3 100644 --- a/tests/cpp/test_main.cc +++ 
b/tests/cpp/test_main.cc @@ -5,7 +5,10 @@ #include #include +#include "helpers.h" + int main(int argc, char ** argv) { + xgboost::SetUpRMMResource(); xgboost::Args args {{"verbosity", "2"}}; xgboost::ConsoleLogger::Configure(args); From 6abd4c0e29904bf9b67000bb9acf1bcd8cccea03 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 10:31:02 -0700 Subject: [PATCH 06/41] Allocate pool_mr in the heap --- tests/cpp/helpers.cu | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index 7d240cb60bc7..c5ec9cc72dc9 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -5,8 +5,10 @@ #include "../../src/data/iterative_device_dmatrix.h" #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include #include "rmm/mr/device/default_memory_resource.hpp" -#include "rmm/mr/device/cnmem_memory_resource.hpp" +#include "rmm/mr/device/cuda_memory_resource.hpp" +#include "rmm/mr/device/pool_memory_resource.hpp" #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 namespace xgboost { @@ -48,8 +50,11 @@ std::shared_ptr RandomDataGenerator::GenerateDeviceDMatrix(bool with_la void SetUpRMMResource() { #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - rmm::mr::cnmem_memory_resource pool_mr{}; - rmm::mr::set_default_resource(&pool_mr); + using cuda_mr_t = rmm::mr::cuda_memory_resource; + using pool_mr_t = rmm::mr::pool_memory_resource; + auto cuda_mr = std::make_unique(); + auto pool_mr = std::make_unique(cuda_mr.release()); + rmm::mr::set_default_resource(pool_mr.release()); #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 } } // namespace xgboost From 2bdbc238c80c88ab0ba2793850b4ab2c9ddb9da3 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 12:04:45 -0700 Subject: [PATCH 07/41] Prevent leaking pool_mr handle --- tests/cpp/helpers.cc | 14 +++++++++++--- tests/cpp/helpers.cu | 20 +++++++++++++++----- tests/cpp/helpers.h | 5 ++++- tests/cpp/test_main.cc | 3 ++- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index e3b3ccb88194..04c7f934c621 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -478,8 +478,16 @@ std::unique_ptr CreateTrainedGBM( return gbm; } -#ifndef XGBOOST_USE_CUDA -void SetUpRMMResource() {} -#endif // XGBOOST_USE_CUDA +#if !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1 +class RMMAllocator {}; + +void DeleteRMMResource(RMMAllocator* r) { + delete r; +} + +RMMAllocatorPtr SetUpRMMResource() { + return RMMAllocatorPtr(nullptr, DeleteRMMResource); +} +#endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1 } // namespace xgboost diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index c5ec9cc72dc9..ebaa0a49f20c 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -48,13 +48,23 @@ std::shared_ptr RandomDataGenerator::GenerateDeviceDMatrix(bool with_la return m; } -void SetUpRMMResource() { #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - using cuda_mr_t = rmm::mr::cuda_memory_resource; - using pool_mr_t = rmm::mr::pool_memory_resource; +using cuda_mr_t = rmm::mr::cuda_memory_resource; +using pool_mr_t = rmm::mr::pool_memory_resource; +class RMMAllocator { + public: + std::unique_ptr handle; +}; + +void DeleteRMMResource(RMMAllocator* r) { + delete r; +} + +RMMAllocatorPtr SetUpRMMResource() { auto cuda_mr = std::make_unique(); auto pool_mr = std::make_unique(cuda_mr.release()); - rmm::mr::set_default_resource(pool_mr.release()); -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM 
== 1 + rmm::mr::set_default_resource(pool_mr.get()); + return RMMAllocatorPtr(new RMMAllocator{std::move(pool_mr)}, DeleteRMMResource); } +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 } // namespace xgboost diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 441e8a6b0c5d..967223fed9bb 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -352,7 +353,9 @@ inline int Next(DataIterHandle self) { return static_cast(self)->Next(); } -void SetUpRMMResource(); +class RMMAllocator; +using RMMAllocatorPtr = std::unique_ptr; +RMMAllocatorPtr SetUpRMMResource(); } // namespace xgboost #endif diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc index eaf2274250d3..ed35ff775e82 100644 --- a/tests/cpp/test_main.cc +++ b/tests/cpp/test_main.cc @@ -3,12 +3,13 @@ #include #include #include +#include #include #include "helpers.h" int main(int argc, char ** argv) { - xgboost::SetUpRMMResource(); + auto rmm_alloc = xgboost::SetUpRMMResource(); xgboost::Args args {{"verbosity", "2"}}; xgboost::ConsoleLogger::Configure(args); From c7236326ae2a32d42ab0dab2627dcb94af87a31d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 12:57:49 -0700 Subject: [PATCH 08/41] Separate EXPECT_DEATH() in separate test suite suffixed DeathTest --- tests/cpp/common/test_span.cc | 82 +++++++++++++++---- tests/cpp/common/test_span.cu | 2 +- tests/cpp/common/test_transform_range.cc | 2 +- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 2 +- 4 files changed, 68 insertions(+), 20 deletions(-) diff --git a/tests/cpp/common/test_span.cc b/tests/cpp/common/test_span.cc index c49763f3f61b..53fd32e1a1f0 100644 --- a/tests/cpp/common/test_span.cc +++ b/tests/cpp/common/test_span.cc @@ -97,11 +97,6 @@ TEST(Span, FromPtrLen) { } } - { - auto lazy = [=]() {Span tmp (arr, 5);}; - EXPECT_DEATH(lazy(), "\\[xgboost\\] Condition .* failed.\n"); - } - // dynamic extent { Span s (arr, 16); @@ -122,6 +117,15 @@ TEST(Span, FromPtrLen) { } } +TEST(SpanDeathTest, FromPtrLen) { + float arr[16]; + InitializeRange(arr, arr+16); + { + auto lazy = [=]() {Span tmp (arr, 5);}; + EXPECT_DEATH(lazy(), "\\[xgboost\\] Condition .* failed.\n"); + } +} + TEST(Span, FromFirstLast) { float arr[16]; InitializeRange(arr, arr+16); @@ -285,7 +289,13 @@ TEST(Span, ElementAccess) { ASSERT_EQ(i, arr[j]); ++j; } +} + +TEST(SpanDeathTest, ElementAccess) { + float arr[16]; + InitializeRange(arr, arr + 16); + Span s (arr); EXPECT_DEATH(s[16], "\\[xgboost\\] Condition .* failed.\n"); EXPECT_DEATH(s[-1], "\\[xgboost\\] Condition .* failed.\n"); @@ -312,7 +322,9 @@ TEST(Span, FrontBack) { ASSERT_EQ(s.front(), 0); ASSERT_EQ(s.back(), 3); } +} +TEST(SpanDeathTest, FrontBack) { { Span s; EXPECT_DEATH(s.front(), "\\[xgboost\\] Condition .* failed.\n"); @@ -340,10 +352,6 @@ TEST(Span, FirstLast) { for (size_t i = 0; i < first.size(); ++i) { ASSERT_EQ(first[i], arr[i]); } - auto constexpr kOne = static_cast::index_type>(-1); - EXPECT_DEATH(s.first(), "\\[xgboost\\] Condition .* failed.\n"); - EXPECT_DEATH(s.first<17>(), "\\[xgboost\\] Condition .* failed.\n"); - EXPECT_DEATH(s.first<32>(), "\\[xgboost\\] Condition .* failed.\n"); } { @@ -359,10 +367,6 @@ TEST(Span, FirstLast) { for (size_t i = 0; i < last.size(); ++i) { ASSERT_EQ(last[i], arr[i+12]); } - auto constexpr kOne = static_cast::index_type>(-1); - EXPECT_DEATH(s.last(), "\\[xgboost\\] Condition .* failed.\n"); - EXPECT_DEATH(s.last<17>(), "\\[xgboost\\] Condition .* failed.\n"); - 
EXPECT_DEATH(s.last<32>(), "\\[xgboost\\] Condition .* failed.\n"); } // dynamic extent @@ -379,10 +383,6 @@ TEST(Span, FirstLast) { ASSERT_EQ(first[i], s[i]); } - EXPECT_DEATH(s.first(-1), "\\[xgboost\\] Condition .* failed.\n"); - EXPECT_DEATH(s.first(17), "\\[xgboost\\] Condition .* failed.\n"); - EXPECT_DEATH(s.first(32), "\\[xgboost\\] Condition .* failed.\n"); - delete [] arr; } @@ -399,6 +399,50 @@ TEST(Span, FirstLast) { ASSERT_EQ(s[12 + i], last[i]); } + delete [] arr; + } +} + +TEST(SpanDeathTest, FirstLast) { + // static extent + { + float arr[16]; + InitializeRange(arr, arr + 16); + + Span s (arr); + auto constexpr kOne = static_cast::index_type>(-1); + EXPECT_DEATH(s.first(), "\\[xgboost\\] Condition .* failed.\n"); + EXPECT_DEATH(s.first<17>(), "\\[xgboost\\] Condition .* failed.\n"); + EXPECT_DEATH(s.first<32>(), "\\[xgboost\\] Condition .* failed.\n"); + } + + { + float arr[16]; + InitializeRange(arr, arr + 16); + + Span s (arr); + auto constexpr kOne = static_cast::index_type>(-1); + EXPECT_DEATH(s.last(), "\\[xgboost\\] Condition .* failed.\n"); + EXPECT_DEATH(s.last<17>(), "\\[xgboost\\] Condition .* failed.\n"); + EXPECT_DEATH(s.last<32>(), "\\[xgboost\\] Condition .* failed.\n"); + } + + // dynamic extent + { + float *arr = new float[16]; + InitializeRange(arr, arr + 16); + Span s (arr, 16); + EXPECT_DEATH(s.first(-1), "\\[xgboost\\] Condition .* failed.\n"); + EXPECT_DEATH(s.first(17), "\\[xgboost\\] Condition .* failed.\n"); + EXPECT_DEATH(s.first(32), "\\[xgboost\\] Condition .* failed.\n"); + + delete [] arr; + } + + { + float *arr = new float[16]; + InitializeRange(arr, arr + 16); + Span s (arr, 16); EXPECT_DEATH(s.last(-1), "\\[xgboost\\] Condition .* failed.\n"); EXPECT_DEATH(s.last(17), "\\[xgboost\\] Condition .* failed.\n"); EXPECT_DEATH(s.last(32), "\\[xgboost\\] Condition .* failed.\n"); @@ -420,7 +464,11 @@ TEST(Span, Subspan) { auto s4 = s1.subspan(2, dynamic_extent); ASSERT_EQ(s1.data() + 2, s4.data()); ASSERT_EQ(s4.size(), s1.size() - 2); +} +TEST(SpanDeathTest, Subspan) { + int arr[16] {0}; + Span s1 (arr); EXPECT_DEATH(s1.subspan(-1, 0), "\\[xgboost\\] Condition .* failed.\n"); EXPECT_DEATH(s1.subspan(17, 0), "\\[xgboost\\] Condition .* failed.\n"); diff --git a/tests/cpp/common/test_span.cu b/tests/cpp/common/test_span.cu index 00e00d4f4c3c..7e9336902b36 100644 --- a/tests/cpp/common/test_span.cu +++ b/tests/cpp/common/test_span.cu @@ -221,7 +221,7 @@ struct TestElementAccess { } }; -TEST(GPUSpan, ElementAccess) { +TEST(GPUSpanDeathTest, ElementAccess) { dh::safe_cuda(cudaSetDevice(0)); auto test_element_access = []() { thrust::host_vector h_vec (16); diff --git a/tests/cpp/common/test_transform_range.cc b/tests/cpp/common/test_transform_range.cc index 68319dfd3ff0..84163ea669b7 100644 --- a/tests/cpp/common/test_transform_range.cc +++ b/tests/cpp/common/test_transform_range.cc @@ -59,7 +59,7 @@ TEST(Transform, DeclareUnifiedTest(Basic)) { } #if !defined(__CUDACC__) -TEST(Transform, Exception) { +TEST(TransformDeathTest, Exception) { size_t const kSize {16}; std::vector h_in(kSize); const HostDeviceVector in_vec{h_in, -1}; diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 3210a25a1d5f..4879ca080937 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -119,7 +119,7 @@ void TestIncorrectRow() { }); } -TEST(RowPartitioner, IncorrectRow) { +TEST(RowPartitionerDeathTest, IncorrectRow) { ASSERT_DEATH({ TestIncorrectRow(); 
},".*"); } } // namespace tree From 78c22544d89306d5f4b24ea19278b40b5e181b87 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 13:01:05 -0700 Subject: [PATCH 09/41] Turn off death tests for RMM --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2c9c40b4480e..e2f3980dc9bd 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -396,7 +396,7 @@ def TestCppGPUWithRMM(args) { def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" echo "Using a single GPU" sh """ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate rmm_test && build/testxgboost --gtest_filter=-*.MGPU_*" + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate rmm_test && build/testxgboost --gtest_filter=-*.MGPU_*:*DeathTest.*" """ deleteDir() } From a520fa157a2d1c67b6c92a72c60a390ae365faf7 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 16:09:52 -0700 Subject: [PATCH 10/41] Address reviewer's feedback --- tests/cpp/CMakeLists.txt | 2 ++ tests/cpp/helpers.cc | 29 ++++++++++++++++++++++++++--- tests/cpp/helpers.cu | 27 --------------------------- tests/cpp/helpers.h | 2 +- tests/cpp/test_main.cc | 2 +- 5 files changed, 30 insertions(+), 32 deletions(-) diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index 75948e5251f7..f6cfc24b26f1 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -29,6 +29,8 @@ if (USE_CUDA) $<$:${GEN_CODE}>) target_compile_definitions(testxgboost PRIVATE -DXGBOOST_USE_CUDA=1) + find_package(CUDA) + target_include_directories(testxgboost PRIVATE ${CUDA_INCLUDE_DIRS}) set_target_properties(testxgboost PROPERTIES CUDA_SEPARABLE_COMPILATION OFF) diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 04c7f934c621..9e7683c6757b 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -20,6 +20,13 @@ #include "../../src/gbm/gbtree_model.h" #include "xgboost/predictor.h" +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include +#include "rmm/mr/device/default_memory_resource.hpp" +#include "rmm/mr/device/cuda_memory_resource.hpp" +#include "rmm/mr/device/pool_memory_resource.hpp" +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + bool FileExists(const std::string& filename) { struct stat st; return stat(filename.c_str(), &st) == 0; @@ -478,14 +485,30 @@ std::unique_ptr CreateTrainedGBM( return gbm; } -#if !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1 -class RMMAllocator {}; +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +using cuda_mr_t = rmm::mr::cuda_memory_resource; +using pool_mr_t = rmm::mr::pool_memory_resource; +class RMMAllocator { + public: + std::unique_ptr handle; +}; void DeleteRMMResource(RMMAllocator* r) { delete r; } -RMMAllocatorPtr SetUpRMMResource() { +RMMAllocatorPtr SetUpRMMResourceForCppTests() { + auto cuda_mr = std::make_unique(); + auto pool_mr = std::make_unique(cuda_mr.release()); + rmm::mr::set_default_resource(pool_mr.get()); + return RMMAllocatorPtr(new RMMAllocator{std::move(pool_mr)}, DeleteRMMResource); +} +#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +class RMMAllocator {}; + +void DeleteRMMResource(RMMAllocator* r) {} + +RMMAllocatorPtr SetUpRMMResourceForCppTests() { return RMMAllocatorPtr(nullptr, DeleteRMMResource); } #endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1 diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index ebaa0a49f20c..9b70ea543231 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ 
-4,13 +4,6 @@ #include "../../src/data/device_adapter.cuh" #include "../../src/data/iterative_device_dmatrix.h" -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -#include -#include "rmm/mr/device/default_memory_resource.hpp" -#include "rmm/mr/device/cuda_memory_resource.hpp" -#include "rmm/mr/device/pool_memory_resource.hpp" -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - namespace xgboost { CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows, @@ -47,24 +40,4 @@ std::shared_ptr RandomDataGenerator::GenerateDeviceDMatrix(bool with_la 0, bins_); return m; } - -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -using cuda_mr_t = rmm::mr::cuda_memory_resource; -using pool_mr_t = rmm::mr::pool_memory_resource; -class RMMAllocator { - public: - std::unique_ptr handle; -}; - -void DeleteRMMResource(RMMAllocator* r) { - delete r; -} - -RMMAllocatorPtr SetUpRMMResource() { - auto cuda_mr = std::make_unique(); - auto pool_mr = std::make_unique(cuda_mr.release()); - rmm::mr::set_default_resource(pool_mr.get()); - return RMMAllocatorPtr(new RMMAllocator{std::move(pool_mr)}, DeleteRMMResource); -} -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 } // namespace xgboost diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 967223fed9bb..eb1b5d7733a2 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -355,7 +355,7 @@ inline int Next(DataIterHandle self) { class RMMAllocator; using RMMAllocatorPtr = std::unique_ptr; -RMMAllocatorPtr SetUpRMMResource(); +RMMAllocatorPtr SetUpRMMResourceForCppTests(); } // namespace xgboost #endif diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc index ed35ff775e82..cb2b78679e11 100644 --- a/tests/cpp/test_main.cc +++ b/tests/cpp/test_main.cc @@ -9,7 +9,7 @@ #include "helpers.h" int main(int argc, char ** argv) { - auto rmm_alloc = xgboost::SetUpRMMResource(); + auto rmm_alloc = xgboost::SetUpRMMResourceForCppTests(); xgboost::Args args {{"verbosity", "2"}}; xgboost::ConsoleLogger::Configure(args); From a73391cb6e5603cccb3a4c7420c5976bb5a864e7 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 19:35:27 -0700 Subject: [PATCH 11/41] Prevent leaking of cuda_mr --- tests/cpp/helpers.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 9e7683c6757b..3b5cfeced15c 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -490,7 +490,10 @@ using cuda_mr_t = rmm::mr::cuda_memory_resource; using pool_mr_t = rmm::mr::pool_memory_resource; class RMMAllocator { public: - std::unique_ptr handle; + cuda_mr_t cuda_mr; + pool_mr_t pool_mr; + RMMAllocator() : cuda_mr(), pool_mr(&cuda_mr) {} + ~RMMAllocator() = default; }; void DeleteRMMResource(RMMAllocator* r) { @@ -498,10 +501,9 @@ void DeleteRMMResource(RMMAllocator* r) { } RMMAllocatorPtr SetUpRMMResourceForCppTests() { - auto cuda_mr = std::make_unique(); - auto pool_mr = std::make_unique(cuda_mr.release()); - rmm::mr::set_default_resource(pool_mr.get()); - return RMMAllocatorPtr(new RMMAllocator{std::move(pool_mr)}, DeleteRMMResource); + auto ptr = RMMAllocatorPtr(new RMMAllocator(), DeleteRMMResource); + rmm::mr::set_default_resource(&ptr->pool_mr); + return ptr; } #else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 class RMMAllocator {}; From fa4ec118255ef2455cf6ec5149c88248676537bb Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 20:26:04 -0700 Subject: [PATCH 12/41] Fix Jenkinsfile syntax --- Jenkinsfile | 14 +++++++------- 1 file 
changed, 7 insertions(+), 7 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6c27372d0f74..882b6753b04a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -280,12 +280,12 @@ def BuildCUDA(args) { stash name: "xgboost_cpp_tests_cuda${args.cuda_version}", includes: 'build/testxgboost' if (args.build_rmm) { echo "Build with CUDA ${args.cuda_version} and RMM" - def container_type = "rmm" - def docker_binary = "docker" - def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" + container_type = "rmm" + docker_binary = "docker" + docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" sh """ rm -rf build/ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=rmm_test -DUSE_CUDA=ON -DUSE_RMM=ON ${arch_flag} + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=rmm_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_RMM=ON ${arch_flag} """ echo 'Stashing C++ test executable (testxgboost)...' stash name: "xgboost_cpp_tests_rmm_cuda${args.cuda_version}", includes: 'build/testxgboost' @@ -398,9 +398,9 @@ def TestCppGPU(args) { sh "rm -rfv build/" unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" echo "Test C++, CUDA ${args.host_cuda_version} with RMM" - def container_type = "rmm" - def docker_binary = "nvidia-docker" - def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" + container_type = "rmm" + docker_binary = "nvidia-docker" + docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" echo "Using a single GPU" sh """ ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate rmm_test && build/testxgboost --gtest_filter=-*DeathTest.*" From 871fc296f0ddbe74ba650ebf586e5b75f570d17f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 20:28:09 -0700 Subject: [PATCH 13/41] Remove unnecessary function in Jenkinsfile --- Jenkinsfile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 882b6753b04a..52952729cb9d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -410,12 +410,6 @@ def TestCppGPU(args) { } } -def TestCppGPUWithRMM(args) { - node('linux && gpu') { - deleteDir() - } -} - def CrossTestJVMwithJDK(args) { node('linux && cpu') { unstash name: 'xgboost4j_jar' From 48051dfa97eeaa998d0655bc10f6f22dd945a419 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 20:59:12 -0700 Subject: [PATCH 14/41] [CI] Install NCCL into RMM container --- tests/ci_build/Dockerfile.rmm | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm index 18730a9c0889..8f52f91f3736 100644 --- a/tests/ci_build/Dockerfile.rmm +++ b/tests/ci_build/Dockerfile.rmm @@ -1,5 +1,6 @@ ARG CUDA_VERSION -FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu18.04 +FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu16.04 +ARG CUDA_VERSION # Environment ENV DEBIAN_FRONTEND noninteractive @@ -16,6 +17,13 @@ RUN \ wget -nv -nc https://cmake.org/files/v3.13/cmake-3.13.0-Linux-x86_64.sh --no-check-certificate && \ bash cmake-3.13.0-Linux-x86_64.sh --skip-license --prefix=/usr +# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) +RUN \ + export CUDA_SHORT=`echo $CUDA_VERSION | egrep -o '[0-9]+\.[0-9]'` && \ + export NCCL_VERSION=2.7.5-1 && \ + apt-get update && \ + apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} 
libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} + ENV PATH=/opt/python/bin:$PATH # Create new Conda environment with RMM From c0a05ce091bd6e3ed0d504738a9e69ab581cca4f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 21:07:11 -0700 Subject: [PATCH 15/41] Run Python tests --- Jenkinsfile | 25 ++++++++++++------------- tests/ci_build/test_python.sh | 8 +++++++- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 52952729cb9d..b188f83e9f56 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -89,9 +89,9 @@ pipeline { parallel ([ 'test-python-cpu': { TestPythonCPU() }, 'test-python-gpu-cuda10.0': { TestPythonGPU(host_cuda_version: '10.0') }, - 'test-python-gpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2') }, + 'test-python-gpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', test_rmm: true) }, 'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, - 'test-python-mgpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', multi_gpu: true) }, + 'test-python-mgpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, 'test-cpp-gpu-cuda10.2': { TestCppGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', test_rmm: true) }, 'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, 'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '3.0.0') }, @@ -286,7 +286,11 @@ def BuildCUDA(args) { sh """ rm -rf build/ ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=rmm_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_RMM=ON ${arch_flag} + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal" + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} manylinux2010_x86_64 """ + echo 'Stashing Python wheel...' + stash name: "xgboost_whl_rmm_cuda${args.cuda_version}", includes: 'python-package/dist/*.whl' echo 'Stashing C++ test executable (testxgboost)...' stash name: "xgboost_cpp_tests_rmm_cuda${args.cuda_version}", includes: 'build/testxgboost' } @@ -354,17 +358,12 @@ def TestPythonGPU(args) { def container_type = "gpu" def docker_binary = "nvidia-docker" def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" - if (args.multi_gpu) { - echo "Using multiple GPUs" - sh """ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh mgpu - """ - } else { - echo "Using a single GPU" - sh """ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh gpu - """ - } + def mgpu_indicator = (args.multi_gpu) ? 
'mgpu' : 'gpu' + sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" + sh "rm -rfv build/ python-package/dist/" + unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" + sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" deleteDir() } } diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh index fbd3644667d8..8aa73fd58452 100755 --- a/tests/ci_build/test_python.sh +++ b/tests/ci_build/test_python.sh @@ -26,12 +26,17 @@ function install_xgboost { fi } +function uninstall_xgboost { + pip uninstall -y xgboost +} + # Run specified test suite case "$suite" in gpu) source activate gpu_test install_xgboost pytest -v -s -rxXs --fulltrace -m "not mgpu" tests/python-gpu + uninstall_xgboost ;; mgpu) @@ -41,7 +46,7 @@ case "$suite" in cd tests/distributed ./runtests-gpu.sh - cd - + uninstall_xgboost ;; cpu) @@ -49,6 +54,7 @@ case "$suite" in pytest -v -s --fulltrace tests/python cd tests/distributed ./runtests.sh + uninstall_xgboost ;; *) From c12e0a6cd58d143c641a5dbc11554503f2b40dab Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 22:20:57 -0700 Subject: [PATCH 16/41] Try building with RMM, CUDA 10.0 --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index b188f83e9f56..4ce2a555251e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -70,7 +70,7 @@ pipeline { 'build-cpu-non-omp': { BuildCPUNonOmp() }, // Build reference, distribution-ready Python wheel with CUDA 10.0 // using CentOS 6 image - 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') }, + 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0', build_rmm: true) }, // The build-gpu-* builds below use Ubuntu image 'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') }, 'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2', build_rmm: true) }, From a3e0e2f2e5c8dde7a03f4ef7efd6513ffece47cc Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 22:45:48 -0700 Subject: [PATCH 17/41] Do not use RMM for CUDA 10.0 target --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4ce2a555251e..85d92de17203 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -70,7 +70,7 @@ pipeline { 'build-cpu-non-omp': { BuildCPUNonOmp() }, // Build reference, distribution-ready Python wheel with CUDA 10.0 // using CentOS 6 image - 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0', build_rmm: true) }, + 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') }, // The build-gpu-* builds below use Ubuntu image 'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') }, 'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2', build_rmm: true) }, @@ -395,7 +395,7 @@ def TestCppGPU(args) { sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost" if (args.test_rmm) { sh "rm -rfv build/" - unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" echo "Test C++, CUDA ${args.host_cuda_version} with RMM" container_type = "rmm" docker_binary = "nvidia-docker" From 3aeab6987af2edda9636ea03f902246043537fc7 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 23:15:47 -0700 Subject: [PATCH 18/41] Actually test for test_rmm flag --- Jenkinsfile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 
deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 85d92de17203..ab564cb39ec6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -360,10 +360,12 @@ def TestPythonGPU(args) { def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" def mgpu_indicator = (args.multi_gpu) ? 'mgpu' : 'gpu' sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" - sh "rm -rfv build/ python-package/dist/" - unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" - unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" - sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" + if (args.test_rmm) { + sh "rm -rfv build/ python-package/dist/" + unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" + sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" + } deleteDir() } } From 862d58092bd9f812dc9aed2a36cc30a1d7c8eb91 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 23:18:54 -0700 Subject: [PATCH 19/41] Fix TestPythonGPU --- Jenkinsfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index ab564cb39ec6..f75987400fd6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -362,8 +362,8 @@ def TestPythonGPU(args) { sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" if (args.test_rmm) { sh "rm -rfv build/ python-package/dist/" - unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" - unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_whl_rmm_cuda${args.host_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" } deleteDir() @@ -393,7 +393,7 @@ def TestCppGPU(args) { echo "Test C++, CUDA ${args.host_cuda_version}" def container_type = "gpu" def docker_binary = "nvidia-docker" - def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" + def docker_args = "--build-arg CUDA_VERSION=${artifact_cuda_version}" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost" if (args.test_rmm) { sh "rm -rfv build/" From 2a064bffd19d7e8931ef0d1aaf8dacbe74f896d3 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 29 Jul 2020 09:19:59 +0000 Subject: [PATCH 20/41] Use CNMeM allocator, since pool allocator doesn't yet support multiGPU --- tests/cpp/helpers.cc | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 3b5cfeced15c..a8733a316891 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -22,9 +22,10 @@ #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 #include +#include +#include #include "rmm/mr/device/default_memory_resource.hpp" -#include "rmm/mr/device/cuda_memory_resource.hpp" -#include "rmm/mr/device/pool_memory_resource.hpp" +#include "rmm/mr/device/cnmem_memory_resource.hpp" #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 bool FileExists(const std::string& filename) { @@ -486,13 +487,18 @@ std::unique_ptr CreateTrainedGBM( } #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -using cuda_mr_t = rmm::mr::cuda_memory_resource; -using pool_mr_t = rmm::mr::pool_memory_resource; + 
+std::vector GetVisibleGPUs() { + std::vector gpus(common::AllVisibleGPUs()); + std::iota(gpus.begin(), gpus.end(), 0); + return gpus; +} + +using cnmem_mr_t = rmm::mr::cnmem_memory_resource; class RMMAllocator { public: - cuda_mr_t cuda_mr; - pool_mr_t pool_mr; - RMMAllocator() : cuda_mr(), pool_mr(&cuda_mr) {} + cnmem_mr_t cnmem_mr; + RMMAllocator() : cnmem_mr(0, GetVisibleGPUs()) {} ~RMMAllocator() = default; }; @@ -502,7 +508,7 @@ void DeleteRMMResource(RMMAllocator* r) { RMMAllocatorPtr SetUpRMMResourceForCppTests() { auto ptr = RMMAllocatorPtr(new RMMAllocator(), DeleteRMMResource); - rmm::mr::set_default_resource(&ptr->pool_mr); + rmm::mr::set_default_resource(&ptr->cnmem_mr); return ptr; } #else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 From 789021fa31112e25b683aef39fff375403060141 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 30 Jul 2020 21:39:23 +0000 Subject: [PATCH 21/41] Use 10.0 container to build RMM-enabled XGBoost --- Jenkinsfile | 12 ++++++------ tests/ci_build/Dockerfile.rmm | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1c30760ef9e5..62e5a70ee891 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -70,10 +70,10 @@ pipeline { 'build-cpu-non-omp': { BuildCPUNonOmp() }, // Build reference, distribution-ready Python wheel with CUDA 10.0 // using CentOS 6 image - 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') }, + 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0', build_rmm: true) }, // The build-gpu-* builds below use Ubuntu image 'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') }, - 'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2', build_rmm: true) }, + 'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2') }, 'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0') }, 'build-jvm-packages-gpu-cuda10.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.0', cuda_version: '10.0') }, 'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.0') }, @@ -386,8 +386,8 @@ def TestPythonGPU(args) { sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" if (args.test_rmm) { sh "rm -rfv build/ python-package/dist/" - unstash name: "xgboost_whl_rmm_cuda${args.host_cuda_version}" - unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" + unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" } deleteDir() @@ -417,11 +417,11 @@ def TestCppGPU(args) { echo "Test C++, CUDA ${args.host_cuda_version}" def container_type = "gpu" def docker_binary = "nvidia-docker" - def docker_args = "--build-arg CUDA_VERSION=${artifact_cuda_version}" + def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost" if (args.test_rmm) { sh "rm -rfv build/" - unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" echo "Test C++, CUDA ${args.host_cuda_version} with RMM" container_type = "rmm" docker_binary = "nvidia-docker" diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm index 8f52f91f3736..feac054274c1 100644 --- a/tests/ci_build/Dockerfile.rmm +++ b/tests/ci_build/Dockerfile.rmm @@ -20,7 +20,7 @@ RUN \ # NCCL2 (License: 
https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
     export CUDA_SHORT=`echo $CUDA_VERSION | egrep -o '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.7.5-1 && \
+    export NCCL_VERSION=2.4.8-1 && \
     apt-get update && \
     apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT}
libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} From a4b86a9ea8848e02c527e2072cf297590ced477c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 09:53:51 +0000 Subject: [PATCH 23/41] Fix Jenkinsfile --- Jenkinsfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1c30760ef9e5..8b51fcb817b2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -386,8 +386,8 @@ def TestPythonGPU(args) { sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" if (args.test_rmm) { sh "rm -rfv build/ python-package/dist/" - unstash name: "xgboost_whl_rmm_cuda${args.host_cuda_version}" - unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" + unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" } deleteDir() @@ -417,11 +417,11 @@ def TestCppGPU(args) { echo "Test C++, CUDA ${args.host_cuda_version}" def container_type = "gpu" def docker_binary = "nvidia-docker" - def docker_args = "--build-arg CUDA_VERSION=${artifact_cuda_version}" + def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost" if (args.test_rmm) { sh "rm -rfv build/" - unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" echo "Test C++, CUDA ${args.host_cuda_version} with RMM" container_type = "rmm" docker_binary = "nvidia-docker" From e5eb262c8feb1d16b7e2dc3559d960117f71764b Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 09:48:10 +0000 Subject: [PATCH 24/41] [CI] Assign larger /dev/shm to NCCL --- Jenkinsfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8b51fcb817b2..4902251fedf8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -383,12 +383,14 @@ def TestPythonGPU(args) { def docker_binary = "nvidia-docker" def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" def mgpu_indicator = (args.multi_gpu) ? 'mgpu' : 'gpu' - sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" + // Allocate extra space in /dev/shm to enable NCCL + def docker_extra_params = (args.multi_gpu) ? 
"CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'" : '' + sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" if (args.test_rmm) { sh "rm -rfv build/ python-package/dist/" unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" - sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" + sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" } deleteDir() } From 4cf7f0007b1b995d1e0b48ce9b69a5949a4a1285 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 09:48:30 +0000 Subject: [PATCH 25/41] Use 10.2 artifact to run multi-GPU Python tests --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4902251fedf8..1af5192c4c85 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -92,7 +92,7 @@ pipeline { 'test-python-gpu-cuda10.0': { TestPythonGPU(host_cuda_version: '10.0') }, 'test-python-gpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', test_rmm: true) }, 'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, - 'test-python-mgpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, + 'test-python-mgpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, 'test-cpp-gpu-cuda10.2': { TestCppGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', test_rmm: true) }, 'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, 'test-jvm-jdk8-cuda10.0': { CrossTestJVMwithJDKGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.0') }, From d023a503f88751b192a56cc2b9bdb2250bfb559c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 09:45:28 +0000 Subject: [PATCH 26/41] Add CUDA 10.0 -> 11.0 cross-version test; remove CUDA 10.0 target --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1af5192c4c85..60a90e66548c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -89,8 +89,8 @@ pipeline { script { parallel ([ 'test-python-cpu': { TestPythonCPU() }, - 'test-python-gpu-cuda10.0': { TestPythonGPU(host_cuda_version: '10.0') }, 'test-python-gpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', test_rmm: true) }, + 'test-python-gpu-cuda11.0-cross': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '11.0') }, 'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, 'test-python-mgpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, 'test-cpp-gpu-cuda10.2': { TestCppGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', test_rmm: true) }, From abc64a311e9d0894cfc760fbaead3760b07f2b37 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 10:40:23 +0000 Subject: [PATCH 27/41] Rename Conda env rmm_test -> gpu_test --- Jenkinsfile | 4 ++-- tests/ci_build/Dockerfile.rmm | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 60a90e66548c..8fbac6bebc21 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -287,7 +287,7 @@ def BuildCUDA(args) { docker_args = "--build-arg 
CUDA_VERSION=${args.cuda_version}" sh """ rm -rf build/ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=rmm_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_RMM=ON ${arch_flag} + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_RMM=ON ${arch_flag} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal" ${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} manylinux2010_x86_64 """ @@ -430,7 +430,7 @@ def TestCppGPU(args) { docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" echo "Using a single GPU" sh """ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate rmm_test && build/testxgboost --gtest_filter=-*DeathTest.*" + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --gtest_filter=-*DeathTest.*" """ } deleteDir() diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm index 8f52f91f3736..d5ebca97aec8 100644 --- a/tests/ci_build/Dockerfile.rmm +++ b/tests/ci_build/Dockerfile.rmm @@ -28,7 +28,7 @@ ENV PATH=/opt/python/bin:$PATH # Create new Conda environment with RMM RUN \ - conda create -n rmm_test -c nvidia -c rapidsai -c conda-forge -c defaults \ + conda create -n gpu_test -c nvidia -c rapidsai -c conda-forge -c defaults \ python=3.7 rmm=0.14 cudatoolkit=$CUDA_VERSION ENV GOSU_VERSION 1.10 From 1e7e42e8003b6d9b880770c80f234e22743d531f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 11:09:25 +0000 Subject: [PATCH 28/41] Use env var to opt into CNMeM pool for C++ tests --- Jenkinsfile | 4 ++-- tests/cpp/helpers.cc | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8fbac6bebc21..e6afcc229a74 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -428,9 +428,9 @@ def TestCppGPU(args) { container_type = "rmm" docker_binary = "nvidia-docker" docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" - echo "Using a single GPU" + def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e XGBOOST_CPP_TEST_USE_RMM_POOL=1'" sh """ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --gtest_filter=-*DeathTest.*" + ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --gtest_filter=-*DeathTest.*" """ } deleteDir() diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index a8733a316891..030c7c2918ae 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -507,6 +507,9 @@ void DeleteRMMResource(RMMAllocator* r) { } RMMAllocatorPtr SetUpRMMResourceForCppTests() { + if (dmlc::GetEnv("XGBOOST_CPP_TEST_USE_RMM_POOL", 0) != 1) { + return RMMAllocatorPtr(nullptr, DeleteRMMResource); + } auto ptr = RMMAllocatorPtr(new RMMAllocator(), DeleteRMMResource); rmm::mr::set_default_resource(&ptr->cnmem_mr); return ptr; From 1069ae0af62b48566ed32688a21ca1b1c32e341e Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 11:39:22 -0700 Subject: [PATCH 29/41] Use identical CUDA version for RMM builds and tests --- Jenkinsfile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Jenkinsfile 
b/Jenkinsfile index e6afcc229a74..551c236db094 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -89,10 +89,11 @@ pipeline { script { parallel ([ 'test-python-cpu': { TestPythonCPU() }, - 'test-python-gpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', test_rmm: true) }, + // artifact_cuda_version doesn't apply to RMM tests; RMM tests will always match CUDA version between artifact and host env + 'test-python-gpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.2', test_rmm: true) }, 'test-python-gpu-cuda11.0-cross': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '11.0') }, 'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, - 'test-python-mgpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, + 'test-python-mgpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, 'test-cpp-gpu-cuda10.2': { TestCppGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', test_rmm: true) }, 'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, 'test-jvm-jdk8-cuda10.0': { CrossTestJVMwithJDKGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.0') }, @@ -388,8 +389,8 @@ def TestPythonGPU(args) { sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" if (args.test_rmm) { sh "rm -rfv build/ python-package/dist/" - unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" - unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_whl_rmm_cuda${args.host_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" } deleteDir() @@ -423,7 +424,7 @@ def TestCppGPU(args) { sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost" if (args.test_rmm) { sh "rm -rfv build/" - unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" echo "Test C++, CUDA ${args.host_cuda_version} with RMM" container_type = "rmm" docker_binary = "nvidia-docker" From 99a75209bb77150df61ebfebf636f0b394a6b1b0 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 6 Aug 2020 06:46:55 +0000 Subject: [PATCH 30/41] Use Pytest fixtures to enable RMM pool in Python tests --- Jenkinsfile | 2 +- tests/ci_build/test_python.sh | 18 +++++++---- tests/python-gpu/conftest.py | 32 ++++++++++++++++++++ tests/python-gpu/test_gpu_with_dask.py | 41 +++++++++++++------------- 4 files changed, 67 insertions(+), 26 deletions(-) create mode 100644 tests/python-gpu/conftest.py diff --git a/Jenkinsfile b/Jenkinsfile index 551c236db094..a1a93a628953 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -391,7 +391,7 @@ def TestPythonGPU(args) { sh "rm -rfv build/ python-package/dist/" unstash name: "xgboost_whl_rmm_cuda${args.host_cuda_version}" unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" - sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" + sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator} 
--use-rmm-pool" } deleteDir() } diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh index 79eff9618971..28cd5e526b3b 100755 --- a/tests/ci_build/test_python.sh +++ b/tests/ci_build/test_python.sh @@ -2,7 +2,15 @@ set -e set -x -suite=$1 +if [ "$#" -lt 1 ] +then + suite='' + args='' +else + suite=$1 + shift 1 + args="$@" +fi # Install XGBoost Python package function install_xgboost { @@ -35,14 +43,14 @@ case "$suite" in gpu) source activate gpu_test install_xgboost - pytest -v -s -rxXs --fulltrace -m "not mgpu" tests/python-gpu + pytest -v -s -rxXs --fulltrace -m "not mgpu" ${args} tests/python-gpu uninstall_xgboost ;; mgpu) source activate gpu_test install_xgboost - pytest -v -s -rxXs --fulltrace -m "mgpu" tests/python-gpu + pytest -v -s -rxXs --fulltrace -m "mgpu" ${args} tests/python-gpu cd tests/distributed ./runtests-gpu.sh @@ -52,14 +60,14 @@ case "$suite" in cpu) source activate cpu_test install_xgboost - pytest -v -s --fulltrace tests/python + pytest -v -s -rxXs --fulltrace ${args} tests/python cd tests/distributed ./runtests.sh uninstall_xgboost ;; *) - echo "Usage: $0 {gpu|mgpu|cpu}" + echo "Usage: $0 {gpu|mgpu|cpu} [extra args to pass to pytest]" exit 1 ;; esac diff --git a/tests/python-gpu/conftest.py b/tests/python-gpu/conftest.py new file mode 100644 index 000000000000..af2d6aa10b6d --- /dev/null +++ b/tests/python-gpu/conftest.py @@ -0,0 +1,32 @@ +import pytest +import logging + +def has_rmm(): + try: + import rmm + return True + except ImportError: + return False + +@pytest.fixture(scope='module', autouse=True) +def setup_rmm_pool(request, pytestconfig): + if pytestconfig.getoption('--use-rmm-pool') and request.module.__name__ != 'test_gpu_with_dask': + if not has_rmm(): + raise ImportError('The --use-rmm-pool option requires the RMM package') + import rmm + from dask_cuda.utils import get_n_gpus + rmm.reinitialize(pool_allocator=True, devices=list(range(get_n_gpus()))) + +@pytest.fixture(scope='module', autouse=True) +def local_cuda_cluster_rmm_kwargs(request, pytestconfig): + if pytestconfig.getoption('--use-rmm-pool') and request.module.__name__ == 'test_gpu_with_dask': + if not has_rmm(): + raise ImportError('The --use-rmm-pool option requires the RMM package') + import rmm + from dask_cuda.utils import get_n_gpus + rmm.reinitialize() + return {'rmm_pool_size': '8GB'} + return {} + +def pytest_addoption(parser): + parser.addoption('--use-rmm-pool', action='store_true', default=False, help='Use RMM pool') diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py index 45209b25871e..b3e5d8a14ed2 100644 --- a/tests/python-gpu/test_gpu_with_dask.py +++ b/tests/python-gpu/test_gpu_with_dask.py @@ -3,7 +3,6 @@ import pytest import numpy as np import asyncio -import unittest import xgboost import subprocess from hypothesis import given, strategies, settings, note @@ -151,24 +150,24 @@ def run_gpu_hist(params, num_rounds, dataset, DMatrixT, client): assert tm.non_increasing(history['train'][dataset.metric]) -class TestDistributedGPU(unittest.TestCase): +class TestDistributedGPU: @pytest.mark.skipif(**tm.no_dask()) @pytest.mark.skipif(**tm.no_cudf()) @pytest.mark.skipif(**tm.no_dask_cudf()) @pytest.mark.skipif(**tm.no_dask_cuda()) @pytest.mark.mgpu - def test_dask_dataframe(self): - with LocalCUDACluster() as cluster: + def test_dask_dataframe(self, local_cuda_cluster_rmm_kwargs): + with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: with Client(cluster) as client: run_with_dask_dataframe(dxgb.DaskDMatrix, 
client) run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client) - @given(parameter_strategy, strategies.integers(1, 20), - tm.dataset_strategy) + @given(params=parameter_strategy, num_rounds=strategies.integers(1, 20), + dataset=tm.dataset_strategy) @settings(deadline=duration(seconds=120)) @pytest.mark.mgpu - def test_gpu_hist(self, params, num_rounds, dataset): - with LocalCUDACluster(n_workers=2) as cluster: + def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster_rmm_kwargs): + with LocalCUDACluster(n_workers=2, **local_cuda_cluster_rmm_kwargs) as cluster: with Client(cluster) as client: run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, client) @@ -177,8 +176,8 @@ def test_gpu_hist(self, params, num_rounds, dataset): @pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.mgpu - def test_dask_array(self): - with LocalCUDACluster() as cluster: + def test_dask_array(self, local_cuda_cluster_rmm_kwargs): + with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: with Client(cluster) as client: run_with_dask_array(dxgb.DaskDMatrix, client) run_with_dask_array(dxgb.DaskDeviceQuantileDMatrix, client) @@ -186,15 +185,15 @@ def test_dask_array(self): @pytest.mark.skipif(**tm.no_dask()) @pytest.mark.skipif(**tm.no_dask_cuda()) @pytest.mark.mgpu - def test_empty_dmatrix(self): - with LocalCUDACluster() as cluster: + def test_empty_dmatrix(self, local_cuda_cluster_rmm_kwargs): + with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: with Client(cluster) as client: parameters = {'tree_method': 'gpu_hist', 'debug_synchronize': True} run_empty_dmatrix_reg(client, parameters) run_empty_dmatrix_cls(client, parameters) - def run_quantile(self, name): + def run_quantile(self, name, local_cuda_cluster_rmm_kwargs): if sys.platform.startswith("win"): pytest.skip("Skipping dask tests on Windows") @@ -217,7 +216,7 @@ def runit(worker_addr, rabit_args): env[port[0]] = port[1] return subprocess.run([exe, test], env=env, stdout=subprocess.PIPE) - with LocalCUDACluster() as cluster: + with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: with Client(cluster) as client: workers = list(dxgb._get_client_workers(client).keys()) rabit_args = client.sync(dxgb._get_rabit_args, workers, client) @@ -235,14 +234,14 @@ def runit(worker_addr, rabit_args): @pytest.mark.skipif(**tm.no_dask()) @pytest.mark.mgpu @pytest.mark.gtest - def test_quantile_basic(self): - self.run_quantile('AllReduceBasic') + def test_quantile_basic(self, local_cuda_cluster_rmm_kwargs): + self.run_quantile('AllReduceBasic', local_cuda_cluster_rmm_kwargs) @pytest.mark.skipif(**tm.no_dask()) @pytest.mark.mgpu @pytest.mark.gtest - def test_quantile_same_on_all_workers(self): - self.run_quantile('SameOnAllWorkers') + def test_quantile_same_on_all_workers(self, local_cuda_cluster_rmm_kwargs): + self.run_quantile('SameOnAllWorkers', local_cuda_cluster_rmm_kwargs) async def run_from_dask_array_asyncio(scheduler_address): @@ -272,8 +271,10 @@ async def run_from_dask_array_asyncio(scheduler_address): return output -def test_with_asyncio(): - with LocalCUDACluster() as cluster: +@pytest.mark.skipif(**tm.no_dask()) +@pytest.mark.mgpu +def test_with_asyncio(local_cuda_cluster_rmm_kwargs): + with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: with Client(cluster) as client: address = client.scheduler.address output = asyncio.run(run_from_dask_array_asyncio(address)) From 92d14814d8d967172b765932cccd1e56d0afea48 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 7 Aug 2020 23:18:01 
+0000 Subject: [PATCH 31/41] Move RMM to plugin/CMakeLists.txt; use PLUGIN_RMM --- CMakeLists.txt | 11 ++++------- Jenkinsfile | 2 +- cmake/ExternalLibs.cmake | 27 --------------------------- plugin/CMakeLists.txt | 19 +++++++++++++++++++ 4 files changed, 24 insertions(+), 35 deletions(-) delete mode 100644 cmake/ExternalLibs.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 88867ce3780d..134e13498ba5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,6 @@ option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF) option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF) set(GPU_COMPUTE_VER "" CACHE STRING "Semicolon separated list of compute versions to be built against, e.g. '35;61'") -option(USE_RMM "Build with RAPIDS Memory Manager (RMM)" OFF) ## Copied From dmlc option(USE_HDFS "Build with HDFS support" OFF) option(USE_AZURE "Build with AZURE support" OFF) @@ -61,6 +60,7 @@ address, leak, undefined and thread.") ## Plugins option(PLUGIN_LZ4 "Build lz4 plugin" OFF) option(PLUGIN_DENSE_PARSER "Build dense parser plugin" OFF) +option(PLUGIN_RMM "Build with RAPIDS Memory Manager (RMM)" OFF) option(ADD_PKGCONFIG "Add xgboost.pc into system." ON) #-- Checks for building XGBoost @@ -83,9 +83,9 @@ endif (R_LIB AND GOOGLE_TEST) if (USE_AVX) message(SEND_ERROR "The option 'USE_AVX' is deprecated as experimental AVX features have been removed from XGBoost.") endif (USE_AVX) -if (USE_RMM AND NOT (USE_CUDA)) - message(SEND_ERROR "`USE_RMM` must be enabled with `USE_CUDA` flag.") -endif (USE_RMM AND NOT (USE_CUDA)) +if (PLUGIN_RMM AND NOT (USE_CUDA)) + message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.") +endif (PLUGIN_RMM AND NOT (USE_CUDA)) if (ENABLE_ALL_WARNINGS) if ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) message(SEND_ERROR "ENABLE_ALL_WARNINGS is only available for Clang and GCC.") @@ -194,9 +194,6 @@ endif (R_LIB) # Plugin add_subdirectory(${xgboost_SOURCE_DIR}/plugin) -# 3rd-party libs -include(cmake/ExternalLibs.cmake) - #-- library if (BUILD_STATIC_LIB) add_library(xgboost STATIC) diff --git a/Jenkinsfile b/Jenkinsfile index a1a93a628953..54d449a79b33 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -288,7 +288,7 @@ def BuildCUDA(args) { docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" sh """ rm -rf build/ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_RMM=ON ${arch_flag} + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON ${arch_flag} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal" ${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} manylinux2010_x86_64 """ diff --git a/cmake/ExternalLibs.cmake b/cmake/ExternalLibs.cmake deleted file mode 100644 index 7181699d508f..000000000000 --- a/cmake/ExternalLibs.cmake +++ /dev/null @@ -1,27 +0,0 @@ -# RMM -if (USE_RMM) - # Use Conda env if available - if(DEFINED ENV{CONDA_PREFIX}) - set(CMAKE_PREFIX_PATH "$ENV{CONDA_PREFIX};${CMAKE_PREFIX_PATH}") - message(STATUS "Detected Conda environment, CMAKE_PREFIX_PATH set to: ${CMAKE_PREFIX_PATH}") - else() - message(STATUS "No Conda environment detected") - endif() - - 
find_path(RMM_INCLUDE "rmm" - HINTS "$ENV{RMM_ROOT}/include") - - find_library(RMM_LIBRARY "rmm" - HINTS "$ENV{RMM_ROOT}/lib" "$ENV{RMM_ROOT}/build") - - if ((NOT RMM_LIBRARY) OR (NOT RMM_INCLUDE)) - message(FATAL_ERROR "Could not locate RMM library") - endif () - - message(STATUS "RMM: RMM_LIBRARY set to ${RMM_LIBRARY}") - message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}") - - target_include_directories(objxgboost PUBLIC ${RMM_INCLUDE}) - target_link_libraries(objxgboost PUBLIC ${RMM_LIBRARY} cuda) - target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1) -endif () diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index 86f784ca0155..1eb1274a397c 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -6,3 +6,22 @@ endif (PLUGIN_LZ4) if (PLUGIN_DENSE_PARSER) target_sources(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/dense_parser/dense_libsvm.cc) endif (PLUGIN_DENSE_PARSER) + +if (PLUGIN_RMM) + find_path(RMM_INCLUDE "rmm" + HINTS "$ENV{RMM_ROOT}/include") + + find_library(RMM_LIBRARY "rmm" + HINTS "$ENV{RMM_ROOT}/lib" "$ENV{RMM_ROOT}/build") + + if ((NOT RMM_LIBRARY) OR (NOT RMM_INCLUDE)) + message(FATAL_ERROR "Could not locate RMM library") + endif () + + message(STATUS "RMM: RMM_LIBRARY set to ${RMM_LIBRARY}") + message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}") + + target_include_directories(objxgboost PUBLIC ${RMM_INCLUDE}) + target_link_libraries(objxgboost PUBLIC ${RMM_LIBRARY} cuda) + target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1) +endif (PLUGIN_RMM) From e74fd0dcb0ca45fedaf056011795ec519e25d2b4 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 8 Aug 2020 01:19:00 +0000 Subject: [PATCH 32/41] Use per-device MR; use command arg in gtest --- Jenkinsfile | 3 +-- src/common/device_helpers.cuh | 15 +++++++++-- src/predictor/gpu_predictor.cu | 24 +++++++++++------- tests/cpp/helpers.cc | 46 +++++++++++++++++++++++----------- tests/cpp/helpers.h | 2 +- tests/cpp/test_main.cc | 2 +- 6 files changed, 62 insertions(+), 30 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 54d449a79b33..274caeb3e352 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -429,9 +429,8 @@ def TestCppGPU(args) { container_type = "rmm" docker_binary = "nvidia-docker" docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" - def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e XGBOOST_CPP_TEST_USE_RMM_POOL=1'" sh """ - ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --gtest_filter=-*DeathTest.*" + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --use-rmm-pool --gtest_filter=-*DeathTest.*" """ } deleteDir() diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index ab40a46ba2d0..748aaf9c3524 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -39,7 +39,8 @@ #endif // XGBOOST_USE_NCCL #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -#include "rmm/mr/device/default_memory_resource.hpp" +#include +#include "rmm/mr/device/per_device_resource.hpp" #include "rmm/mr/device/thrust_allocator_adaptor.hpp" #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 @@ -406,7 +407,8 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { return SuperT::deallocate(ptr, n); } #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBDefaultDeviceAllocatorImpl() : SuperT(rmm::mr::get_default_resource(), cudaStream_t{0}) {} + 
XGBDefaultDeviceAllocatorImpl() + : SuperT(rmm::mr::get_current_device_resource(), cudaStream_t{0}) {} #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 }; @@ -770,6 +772,15 @@ xgboost::common::Span ToSpan(thrust::device_vector& vec, return ToSpan(vec, offset, size); } +template ::index_type> +xgboost::common::Span ToSpan( + std::unique_ptr& vec, + IndexT offset = 0, + IndexT size = std::numeric_limits::max()) { + return ToSpan(*vec.get(), offset, size); +} + // thrust begin, similiar to std::begin template thrust::device_ptr tbegin(xgboost::HostDeviceVector& vector) { // NOLINT diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 9a498136d41a..fa31af2e4c8e 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -213,9 +213,10 @@ __global__ void PredictKernel(Data data, class DeviceModel { public: - dh::device_vector nodes; - dh::device_vector tree_segments; - dh::device_vector tree_group; + // Need to lazily construct the vectors because GPU id is only known at runtime + std::unique_ptr> nodes; + std::unique_ptr> tree_segments; + std::unique_ptr> tree_group; size_t tree_beg_; // NOLINT size_t tree_end_; // NOLINT int num_group; @@ -224,16 +225,16 @@ class DeviceModel { const thrust::host_vector& h_tree_segments, const thrust::host_vector& h_nodes, size_t tree_begin, size_t tree_end) { - nodes.resize(h_nodes.size()); - dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(), + nodes->resize(h_nodes.size()); + dh::safe_cuda(cudaMemcpyAsync(nodes->data().get(), h_nodes.data(), sizeof(RegTree::Node) * h_nodes.size(), cudaMemcpyHostToDevice)); - tree_segments.resize(h_tree_segments.size()); - dh::safe_cuda(cudaMemcpyAsync(tree_segments.data().get(), h_tree_segments.data(), + tree_segments->resize(h_tree_segments.size()); + dh::safe_cuda(cudaMemcpyAsync(tree_segments->data().get(), h_tree_segments.data(), sizeof(size_t) * h_tree_segments.size(), cudaMemcpyHostToDevice)); - tree_group.resize(model.tree_info.size()); - dh::safe_cuda(cudaMemcpyAsync(tree_group.data().get(), model.tree_info.data(), + tree_group->resize(model.tree_info.size()); + dh::safe_cuda(cudaMemcpyAsync(tree_group->data().get(), model.tree_info.data(), sizeof(int) * model.tree_info.size(), cudaMemcpyHostToDevice)); this->tree_beg_ = tree_begin; @@ -243,6 +244,11 @@ class DeviceModel { void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) { dh::safe_cuda(cudaSetDevice(gpu_id)); + // Allocate device vectors using correct GPU ID context + nodes.reset(new dh::device_vector()); + tree_segments.reset(new dh::device_vector()); + tree_group.reset(new dh::device_vector()); + CHECK_EQ(model.param.size_leaf_vector, 0); // Copy decision trees to device thrust::host_vector h_tree_segments{}; diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 030c7c2918ae..0d9d3770498b 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -24,8 +24,9 @@ #include #include #include -#include "rmm/mr/device/default_memory_resource.hpp" -#include "rmm/mr/device/cnmem_memory_resource.hpp" +#include "rmm/mr/device/per_device_resource.hpp" +#include "rmm/mr/device/cuda_memory_resource.hpp" +#include "rmm/mr/device/pool_memory_resource.hpp" #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 bool FileExists(const std::string& filename) { @@ -488,17 +489,23 @@ std::unique_ptr CreateTrainedGBM( #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -std::vector GetVisibleGPUs() { - std::vector gpus(common::AllVisibleGPUs()); - 
std::iota(gpus.begin(), gpus.end(), 0); - return gpus; -} - -using cnmem_mr_t = rmm::mr::cnmem_memory_resource; +using cuda_mr_t = rmm::mr::cuda_memory_resource; +using pool_mr_t = rmm::mr::pool_memory_resource<cuda_mr_t>; class RMMAllocator { public: - cnmem_mr_t cnmem_mr; - RMMAllocator() : cnmem_mr(0, GetVisibleGPUs()) {} + std::vector<std::unique_ptr<cuda_mr_t>> cuda_mr; + std::vector<std::unique_ptr<pool_mr_t>> pool_mr; + int n_gpu; + RMMAllocator() : n_gpu(common::AllVisibleGPUs()) { + int current_device; + CHECK_EQ(cudaGetDevice(&current_device), cudaSuccess); + for (int i = 0; i < n_gpu; ++i) { + CHECK_EQ(cudaSetDevice(i), cudaSuccess); + cuda_mr.push_back(std::unique_ptr<cuda_mr_t>(new cuda_mr_t)); + pool_mr.push_back(std::unique_ptr<pool_mr_t>(new pool_mr_t(cuda_mr[i].get()))); + } + CHECK_EQ(cudaSetDevice(current_device), cudaSuccess); + } ~RMMAllocator() = default; }; @@ -506,12 +513,21 @@ void DeleteRMMResource(RMMAllocator* r) { delete r; } -RMMAllocatorPtr SetUpRMMResourceForCppTests() { - if (dmlc::GetEnv("XGBOOST_CPP_TEST_USE_RMM_POOL", 0) != 1) { +RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv) { + bool use_rmm_pool = false; + for (int i = 1; i < argc; ++i) { + if (argv[i] == std::string("--use-rmm-pool")) { + use_rmm_pool = true; + } + } + if (!use_rmm_pool) { return RMMAllocatorPtr(nullptr, DeleteRMMResource); } + LOG(INFO) << "Using RMM memory pool"; auto ptr = RMMAllocatorPtr(new RMMAllocator(), DeleteRMMResource); - rmm::mr::set_default_resource(&ptr->cnmem_mr); + for (int i = 0; i < ptr->n_gpu; ++i) { + rmm::mr::set_per_device_resource(rmm::cuda_device_id(i), ptr->pool_mr[i].get()); + } return ptr; } #else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 @@ -519,7 +535,7 @@ class RMMAllocator {}; void DeleteRMMResource(RMMAllocator* r) {} -RMMAllocatorPtr SetUpRMMResourceForCppTests() { +RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv) { return RMMAllocatorPtr(nullptr, DeleteRMMResource); } #endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1 diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index eb1b5d7733a2..5d4ce6cefa68 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -355,7 +355,7 @@ inline int Next(DataIterHandle self) { class RMMAllocator; using RMMAllocatorPtr = std::unique_ptr<RMMAllocator, void(*)(RMMAllocator*)>; -RMMAllocatorPtr SetUpRMMResourceForCppTests(); +RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv); } // namespace xgboost #endif diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc index cb2b78679e11..b93329c2e788 100644 --- a/tests/cpp/test_main.cc +++ b/tests/cpp/test_main.cc @@ -9,11 +9,11 @@ #include "helpers.h" int main(int argc, char ** argv) { - auto rmm_alloc = xgboost::SetUpRMMResourceForCppTests(); xgboost::Args args {{"verbosity", "2"}}; xgboost::ConsoleLogger::Configure(args); testing::InitGoogleTest(&argc, argv); testing::FLAGS_gtest_death_test_style = "threadsafe"; + auto rmm_alloc = xgboost::SetUpRMMResourceForCppTests(argc, argv); return RUN_ALL_TESTS(); } From 2ee04b3eac05cf26af1942bd0e78d9fd009d8268 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Fri, 7 Aug 2020 19:35:29 -0700 Subject: [PATCH 33/41] Set CMake prefix path to use Conda env --- tests/ci_build/build_via_cmake.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/ci_build/build_via_cmake.sh b/tests/ci_build/build_via_cmake.sh index 3af8bbe9b20d..44c9b5d4a29b 100755 --- a/tests/ci_build/build_via_cmake.sh +++ b/tests/ci_build/build_via_cmake.sh @@ -8,14 +8,16 @@ then shift 1 cmake_args="$@" source activate ${conda_env} + cmake_prefix_flag="-DCMAKE_PREFIX_PATH=$CONDA_PREFIX"
else cmake_args="$@" + cmake_prefix_flag='' fi rm -rf build mkdir build cd build -cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -GNinja +cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -GNinja ${cmake_prefix_flag} ninja clean time ninja -v cd .. From 87422a2f120fac78c8f372cf317b9709cddeb30d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 7 Aug 2020 19:57:46 -0700 Subject: [PATCH 34/41] Use 0.15 nightly version of RMM --- tests/ci_build/Dockerfile.gpu | 4 ++-- tests/ci_build/Dockerfile.rmm | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index 055caf3c1028..efc3d9186067 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -17,8 +17,8 @@ ENV PATH=/opt/python/bin:$PATH # Create new Conda environment with cuDF, Dask, and cuPy RUN \ - conda create -n gpu_test -c rapidsai -c nvidia -c conda-forge -c defaults \ - python=3.7 cudf=0.14 cudatoolkit=$CUDA_VERSION dask dask-cuda dask-cudf cupy \ + conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \ + python=3.7 cudf=0.15* rmm=0.15* cudatoolkit=$CUDA_VERSION dask dask-cuda dask-cudf cupy \ numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis ENV GOSU_VERSION 1.10 diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm index d5ebca97aec8..a92f09c47f28 100644 --- a/tests/ci_build/Dockerfile.rmm +++ b/tests/ci_build/Dockerfile.rmm @@ -28,8 +28,8 @@ ENV PATH=/opt/python/bin:$PATH # Create new Conda environment with RMM RUN \ - conda create -n gpu_test -c nvidia -c rapidsai -c conda-forge -c defaults \ - python=3.7 rmm=0.14 cudatoolkit=$CUDA_VERSION + conda create -n gpu_test -c nvidia -c rapidsai-nightly -c rapidsai -c conda-forge -c defaults \ + python=3.7 rmm=0.15* cudatoolkit=$CUDA_VERSION ENV GOSU_VERSION 1.10 From 9021a75ea7263bef9ce94fb02d77d184eeac208a Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 7 Aug 2020 20:00:59 -0700 Subject: [PATCH 35/41] Remove unnecessary header --- src/common/device_helpers.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 748aaf9c3524..8971f705dea8 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -39,7 +39,6 @@ #endif // XGBOOST_USE_NCCL #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -#include #include "rmm/mr/device/per_device_resource.hpp" #include "rmm/mr/device/thrust_allocator_adaptor.hpp" #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 From 377580a94966767880870cf4bb9283f45bdc1e82 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 8 Aug 2020 08:42:42 +0000 Subject: [PATCH 36/41] Fix a unit test when cudf is missing --- python-package/xgboost/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index b29eac7959f3..9491efd1c38c 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -317,7 +317,7 @@ def _is_cudf_df(data): import cudf except ImportError: return False - return isinstance(data, cudf.DataFrame) + return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame) def _cudf_array_interfaces(data): From 3df7cc3b668283b9b9233ded0021a21102877889 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 10 Aug 2020 
19:57:58 +0000 Subject: [PATCH 37/41] Add RMM demos --- demo/rmm_plugin/README.md | 31 ++++++++++++++++++++++ demo/rmm_plugin/rmm_mgpu_with_dask.py | 27 ++++++++++++++++++++ demo/rmm_plugin/rmm_singlegpu.py | 14 ++++++++++++ tests/pytest.ini | 3 ++- tests/python-gpu/conftest.py | 11 ++++++++- tests/python-gpu/test_gpu_demos.py | 1 + tests/python-gpu/test_gpu_with_dask.py | 2 ++ 7 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 demo/rmm_plugin/README.md create mode 100644 demo/rmm_plugin/rmm_mgpu_with_dask.py create mode 100644 demo/rmm_plugin/rmm_singlegpu.py diff --git a/demo/rmm_plugin/README.md b/demo/rmm_plugin/README.md new file mode 100644 index 000000000000..ad73c61f3097 --- /dev/null +++ b/demo/rmm_plugin/README.md @@ -0,0 +1,31 @@ +Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL) +==================================================================== +The [RAPIDS Memory Manager (RMM)](https://github.com/rapidsai/rmm) library provides a collection of +efficient memory allocators for NVIDIA GPUs. It is now possible to use XGBoost with memory +allocators provided by RMM, by enabling the RMM integration plugin. + +The demos in this directory highlight one RMM allocator in particular: **the pool sub-allocator**. +This allocator addresses the slow speed of `cudaMalloc()` by allocating a large chunk of memory +upfront. Subsequent allocations will draw from the pool of already allocated memory and thus avoid +the overhead of calling `cudaMalloc()` directly. See +[these GTC talk slides](https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf) +for more details. + +Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this, +run CMake with the option `-DPLUGIN_RMM=ON` (`-DUSE_CUDA=ON` is also required): +``` +cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON +make -j4 +``` +CMake will attempt to locate the RMM library in your build environment. You may choose to build +RMM from source, or install it using the Conda package manager. If CMake cannot find RMM, you +should specify the location of RMM via the CMake prefix path: +``` +# If using Conda: +cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX +# If using RMM installed at a custom location +cmake ..
-DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm +``` + +* [Using RMM with a single GPU](./rmm_singlegpu.py) +* [Using RMM with a local Dask cluster consisting of multiple GPUs](./rmm_mgpu_with_dask.py) diff --git a/demo/rmm_plugin/rmm_mgpu_with_dask.py b/demo/rmm_plugin/rmm_mgpu_with_dask.py new file mode 100644 index 000000000000..eac0c5da4822 --- /dev/null +++ b/demo/rmm_plugin/rmm_mgpu_with_dask.py @@ -0,0 +1,27 @@ +import xgboost as xgb +from sklearn.datasets import make_classification +import dask +from dask.distributed import Client +from dask_cuda import LocalCUDACluster + +def main(client): + X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3) + X = dask.array.from_array(X) + y = dask.array.from_array(y) + dtrain = xgb.dask.DaskDMatrix(client, X, label=y) + + params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3, + 'tree_method': 'gpu_hist'} + output = xgb.dask.train(client, params, dtrain, num_boost_round=100, + evals=[(dtrain, 'train')]) + bst = output['booster'] + history = output['history'] + for i, e in enumerate(history['train']['merror']): + print(f'[{i}] train-merror: {e}') + +if __name__ == '__main__': + # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option to + # LocalCUDACluster constructor. + with LocalCUDACluster(rmm_pool_size='2GB') as cluster: + with Client(cluster) as client: + main(client) diff --git a/demo/rmm_plugin/rmm_singlegpu.py b/demo/rmm_plugin/rmm_singlegpu.py new file mode 100644 index 000000000000..c56e0a0cef43 --- /dev/null +++ b/demo/rmm_plugin/rmm_singlegpu.py @@ -0,0 +1,14 @@ +import xgboost as xgb +import rmm +from sklearn.datasets import make_classification + +# Initialize RMM pool allocator +rmm.reinitialize(pool_allocator=True) + +X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3) +dtrain = xgb.DMatrix(X, label=y) + +params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3, + 'tree_method': 'gpu_hist'} +# XGBoost will automatically use the RMM pool allocator +bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, 'train')]) diff --git a/tests/pytest.ini b/tests/pytest.ini index 136782056f95..f34505d3532c 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -2,4 +2,5 @@ markers = mgpu: Mark a test that requires multiple GPUs to run. ci: Mark a test that runs only on CI. - gtest: Mark a test that requires C++ Google Test executable. \ No newline at end of file + gtest: Mark a test that requires C++ Google Test executable. + no_rmm_pool_setup: Mark a test to skip the setup_rmm_pool() fixture. diff --git a/tests/python-gpu/conftest.py b/tests/python-gpu/conftest.py index af2d6aa10b6d..1ed7cae7d308 100644 --- a/tests/python-gpu/conftest.py +++ b/tests/python-gpu/conftest.py @@ -8,9 +8,18 @@ def has_rmm(): except ImportError: return False +def get_module_attributes(module): + if not hasattr(module, 'pytestmark'): + return [] + if isinstance(module.pytestmark, list): + return [x.name for x in module.pytestmark] + return [module.pytestmark.name] + @pytest.fixture(scope='module', autouse=True) def setup_rmm_pool(request, pytestconfig): - if pytestconfig.getoption('--use-rmm-pool') and request.module.__name__ != 'test_gpu_with_dask': + if (pytestconfig.getoption('--use-rmm-pool') and + 'no_rmm_pool_setup' not in get_module_attributes(request.module)): + print('!!! 
Setting up RMM pool') if not has_rmm(): raise ImportError('The --use-rmm-pool option requires the RMM package') import rmm diff --git a/tests/python-gpu/test_gpu_demos.py b/tests/python-gpu/test_gpu_demos.py index a3a9aaff5b08..58dc262e6b4e 100644 --- a/tests/python-gpu/test_gpu_demos.py +++ b/tests/python-gpu/test_gpu_demos.py @@ -6,6 +6,7 @@ import testing as tm import test_demos as td # noqa +pytestmark = pytest.mark.no_rmm_pool_setup @pytest.mark.skipif(**tm.no_cupy()) def test_data_iterator(): diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py index b3e5d8a14ed2..dc9ba0a49d4d 100644 --- a/tests/python-gpu/test_gpu_with_dask.py +++ b/tests/python-gpu/test_gpu_with_dask.py @@ -12,6 +12,8 @@ if sys.platform.startswith("win"): pytest.skip("Skipping dask tests on Windows", allow_module_level=True) +pytestmark = pytest.mark.no_rmm_pool_setup + sys.path.append("tests/python") from test_with_dask import run_empty_dmatrix_reg # noqa from test_with_dask import run_empty_dmatrix_cls # noqa From 567fb330a6beeb32dc0d7e873e64ebd41fe71c4f Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Mon, 10 Aug 2020 13:34:09 -0700 Subject: [PATCH 38/41] Remove print() --- tests/python-gpu/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python-gpu/conftest.py b/tests/python-gpu/conftest.py index 1ed7cae7d308..05ddf10c5036 100644 --- a/tests/python-gpu/conftest.py +++ b/tests/python-gpu/conftest.py @@ -19,7 +19,6 @@ def get_module_attributes(module): def setup_rmm_pool(request, pytestconfig): if (pytestconfig.getoption('--use-rmm-pool') and 'no_rmm_pool_setup' not in get_module_attributes(request.module)): - print('!!! Setting up RMM pool') if not has_rmm(): raise ImportError('The --use-rmm-pool option requires the RMM package') import rmm From 1e63c465713e492de3531b811ae6d09a2cb80905 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 11 Aug 2020 22:21:43 +0000 Subject: [PATCH 39/41] Use HostDeviceVector in GPU predictor --- src/common/device_helpers.cuh | 9 ----- src/common/host_device_vector.cu | 2 ++ src/predictor/gpu_predictor.cu | 58 ++++++++++++-------------------- 3 files changed, 23 insertions(+), 46 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 8971f705dea8..beb94680f493 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -771,15 +771,6 @@ xgboost::common::Span ToSpan(thrust::device_vector& vec, return ToSpan(vec, offset, size); } -template ::index_type> -xgboost::common::Span ToSpan( - std::unique_ptr& vec, - IndexT offset = 0, - IndexT size = std::numeric_limits::max()) { - return ToSpan(*vec.get(), offset, size); -} - // thrust begin, similiar to std::begin template thrust::device_ptr tbegin(xgboost::HostDeviceVector& vector) { // NOLINT diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 7470f1c07731..39a0fbe9efb0 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -11,6 +11,7 @@ #include "xgboost/data.h" #include "xgboost/host_device_vector.h" +#include "xgboost/tree_model.h" #include "device_helpers.cuh" namespace xgboost { @@ -402,6 +403,7 @@ template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_row_t template class HostDeviceVector; // bst_feature_t +template class HostDeviceVector; #if defined(__APPLE__) /* diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index fa31af2e4c8e..c05688eaf4d8 100644 --- 
a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -214,44 +214,20 @@ __global__ void PredictKernel(Data data, class DeviceModel { public: // Need to lazily construct the vectors because GPU id is only known at runtime - std::unique_ptr> nodes; - std::unique_ptr> tree_segments; - std::unique_ptr> tree_group; + HostDeviceVector nodes; + HostDeviceVector tree_segments; + HostDeviceVector tree_group; size_t tree_beg_; // NOLINT size_t tree_end_; // NOLINT int num_group; - void CopyModel(const gbm::GBTreeModel& model, - const thrust::host_vector& h_tree_segments, - const thrust::host_vector& h_nodes, - size_t tree_begin, size_t tree_end) { - nodes->resize(h_nodes.size()); - dh::safe_cuda(cudaMemcpyAsync(nodes->data().get(), h_nodes.data(), - sizeof(RegTree::Node) * h_nodes.size(), - cudaMemcpyHostToDevice)); - tree_segments->resize(h_tree_segments.size()); - dh::safe_cuda(cudaMemcpyAsync(tree_segments->data().get(), h_tree_segments.data(), - sizeof(size_t) * h_tree_segments.size(), - cudaMemcpyHostToDevice)); - tree_group->resize(model.tree_info.size()); - dh::safe_cuda(cudaMemcpyAsync(tree_group->data().get(), model.tree_info.data(), - sizeof(int) * model.tree_info.size(), - cudaMemcpyHostToDevice)); - this->tree_beg_ = tree_begin; - this->tree_end_ = tree_end; - this->num_group = model.learner_model_param->num_output_group; - } - void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) { dh::safe_cuda(cudaSetDevice(gpu_id)); - // Allocate device vectors using correct GPU ID context - nodes.reset(new dh::device_vector()); - tree_segments.reset(new dh::device_vector()); - tree_group.reset(new dh::device_vector()); CHECK_EQ(model.param.size_leaf_vector, 0); // Copy decision trees to device - thrust::host_vector h_tree_segments{}; + tree_segments = std::move(HostDeviceVector({}, gpu_id)); + auto& h_tree_segments = tree_segments.HostVector(); h_tree_segments.reserve((tree_end - tree_begin) + 1); size_t sum = 0; h_tree_segments.push_back(sum); @@ -260,13 +236,21 @@ class DeviceModel { h_tree_segments.push_back(sum); } - thrust::host_vector h_nodes(h_tree_segments.back()); + nodes = std::move(HostDeviceVector(h_tree_segments.back(), RegTree::Node(), + gpu_id)); + auto& h_nodes = nodes.HostVector(); for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { auto& src_nodes = model.trees.at(tree_idx)->GetNodes(); std::copy(src_nodes.begin(), src_nodes.end(), h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]); } - CopyModel(model, h_tree_segments, h_nodes, tree_begin, tree_end); + + tree_group = std::move(HostDeviceVector(model.tree_info.size(), 0, gpu_id)); + auto& h_tree_group = tree_group.HostVector(); + std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size()); + this->tree_beg_ = tree_begin; + this->tree_end_ = tree_end; + this->num_group = model.learner_model_param->num_output_group; } }; @@ -293,8 +277,8 @@ class GPUPredictor : public xgboost::Predictor { dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} ( PredictKernel, data, - dh::ToSpan(model_.nodes), predictions->DeviceSpan().subspan(batch_offset), - dh::ToSpan(model_.tree_segments), dh::ToSpan(model_.tree_group), + model_.nodes.DeviceSpan(), predictions->DeviceSpan().subspan(batch_offset), + model_.tree_segments.DeviceSpan(), model_.tree_group.DeviceSpan(), model_.tree_beg_, model_.tree_end_, num_features, num_rows, entry_start, use_shared, model_.num_group); } @@ -309,8 +293,8 @@ class GPUPredictor : public 
xgboost::Predictor { dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} ( PredictKernel, batch, - dh::ToSpan(model_.nodes), out_preds->DeviceSpan().subspan(batch_offset), - dh::ToSpan(model_.tree_segments), dh::ToSpan(model_.tree_group), + model_.nodes.DeviceSpan(), out_preds->DeviceSpan().subspan(batch_offset), + model_.tree_segments.DeviceSpan(), model_.tree_group.DeviceSpan(), model_.tree_beg_, model_.tree_end_, batch.NumFeatures(), num_rows, entry_start, use_shared, model_.num_group); } @@ -441,8 +425,8 @@ class GPUPredictor : public xgboost::Predictor { dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} ( PredictKernel, m->Value(), - dh::ToSpan(d_model.nodes), out_preds->predictions.DeviceSpan(), - dh::ToSpan(d_model.tree_segments), dh::ToSpan(d_model.tree_group), + d_model.nodes.DeviceSpan(), out_preds->predictions.DeviceSpan(), + d_model.tree_segments.DeviceSpan(), d_model.tree_group.DeviceSpan(), tree_begin, tree_end, m->NumColumns(), info.num_row_, entry_start, use_shared, output_groups); } From ad216c51bba4b04d3bf039507c7b8398e9a47fae Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 11 Aug 2020 22:52:30 +0000 Subject: [PATCH 40/41] Simplify pytest setup; use LocalCUDACluster fixture --- tests/pytest.ini | 1 - tests/python-gpu/conftest.py | 37 +++++---- tests/python-gpu/test_gpu_demos.py | 2 - tests/python-gpu/test_gpu_with_dask.py | 107 ++++++++++++------------- 4 files changed, 74 insertions(+), 73 deletions(-) diff --git a/tests/pytest.ini b/tests/pytest.ini index f34505d3532c..5a0d27a6cec6 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -3,4 +3,3 @@ markers = mgpu: Mark a test that requires multiple GPUs to run. ci: Mark a test that runs only on CI. gtest: Mark a test that requires C++ Google Test executable. - no_rmm_pool_setup: Mark a test to skip the setup_rmm_pool() fixture. 
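The conftest.py rewrite shown next replaces the module-scoped RMM kwargs fixture with a function-scoped `local_cuda_cluster` fixture that individual tests configure through pytest's indirect parametrization. For readers unfamiliar with that mechanism, here is a minimal, self-contained sketch; the dict-based `local_cluster` stand-in is illustrative only (it takes the place of a real `LocalCUDACluster`) and is not part of this patch series:

```python
import pytest

@pytest.fixture(scope='function')
def local_cluster(request):
    # Indirectly parametrized tests attach their kwargs to request.param;
    # tests that merely request the fixture get the defaults.
    kwargs = dict(getattr(request, 'param', {}))
    cluster = {'kwargs': kwargs, 'closed': False}  # stand-in for LocalCUDACluster(**kwargs)
    yield cluster                                  # hand the live cluster to the test body
    cluster['closed'] = True                       # teardown runs even if the test fails

@pytest.mark.parametrize('local_cluster', [{'n_workers': 2}], indirect=['local_cluster'])
def test_two_workers(local_cluster):
    assert local_cluster['kwargs'] == {'n_workers': 2}

def test_defaults(local_cluster):
    assert local_cluster['kwargs'] == {}
```

A function-scoped, yielding fixture guarantees each test gets a fresh cluster and that cleanup runs after every test, which is the shape the conftest.py diff below adopts.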
diff --git a/tests/python-gpu/conftest.py b/tests/python-gpu/conftest.py index 05ddf10c5036..1865ce529a98 100644 --- a/tests/python-gpu/conftest.py +++ b/tests/python-gpu/conftest.py @@ -1,6 +1,10 @@ +import sys import pytest import logging +sys.path.append("tests/python") +import testing as tm # noqa + def has_rmm(): try: import rmm @@ -8,33 +12,34 @@ def has_rmm(): except ImportError: return False -def get_module_attributes(module): - if not hasattr(module, 'pytestmark'): - return [] - if isinstance(module.pytestmark, list): - return [x.name for x in module.pytestmark] - return [module.pytestmark.name] - -@pytest.fixture(scope='module', autouse=True) +@pytest.fixture(scope='session', autouse=True) def setup_rmm_pool(request, pytestconfig): - if (pytestconfig.getoption('--use-rmm-pool') and - 'no_rmm_pool_setup' not in get_module_attributes(request.module)): + if pytestconfig.getoption('--use-rmm-pool'): if not has_rmm(): raise ImportError('The --use-rmm-pool option requires the RMM package') import rmm from dask_cuda.utils import get_n_gpus - rmm.reinitialize(pool_allocator=True, devices=list(range(get_n_gpus()))) + rmm.reinitialize(pool_allocator=True, initial_pool_size=1024*1024*1024, + devices=list(range(get_n_gpus()))) -@pytest.fixture(scope='module', autouse=True) -def local_cuda_cluster_rmm_kwargs(request, pytestconfig): - if pytestconfig.getoption('--use-rmm-pool') and request.module.__name__ == 'test_gpu_with_dask': +@pytest.fixture(scope='function') +def local_cuda_cluster(request, pytestconfig): + kwargs = {} + if hasattr(request, 'param'): + kwargs.update(request.param) + if pytestconfig.getoption('--use-rmm-pool'): if not has_rmm(): raise ImportError('The --use-rmm-pool option requires the RMM package') import rmm from dask_cuda.utils import get_n_gpus rmm.reinitialize() - return {'rmm_pool_size': '8GB'} - return {} + kwargs['rmm_pool_size'] = '2GB' + if tm.no_dask_cuda()['condition']: + raise ImportError('The local_cuda_cluster fixture requires dask_cuda package') + from dask_cuda import LocalCUDACluster + cluster = LocalCUDACluster(**kwargs) + yield cluster + cluster.close() def pytest_addoption(parser): parser.addoption('--use-rmm-pool', action='store_true', default=False, help='Use RMM pool') diff --git a/tests/python-gpu/test_gpu_demos.py b/tests/python-gpu/test_gpu_demos.py index 58dc262e6b4e..f74d2adc2823 100644 --- a/tests/python-gpu/test_gpu_demos.py +++ b/tests/python-gpu/test_gpu_demos.py @@ -6,8 +6,6 @@ import testing as tm import test_demos as td # noqa -pytestmark = pytest.mark.no_rmm_pool_setup - @pytest.mark.skipif(**tm.no_cupy()) def test_data_iterator(): script = os.path.join(td.PYTHON_DEMO_DIR, 'data_iterator.py') diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py index dc9ba0a49d4d..a06bfc28361f 100644 --- a/tests/python-gpu/test_gpu_with_dask.py +++ b/tests/python-gpu/test_gpu_with_dask.py @@ -12,8 +12,6 @@ if sys.platform.startswith("win"): pytest.skip("Skipping dask tests on Windows", allow_module_level=True) -pytestmark = pytest.mark.no_rmm_pool_setup - sys.path.append("tests/python") from test_with_dask import run_empty_dmatrix_reg # noqa from test_with_dask import run_empty_dmatrix_cls # noqa @@ -24,7 +22,6 @@ try: import dask.dataframe as dd from xgboost import dask as dxgb - from dask_cuda import LocalCUDACluster from dask.distributed import Client from dask import array as da import cudf @@ -158,44 +155,45 @@ class TestDistributedGPU: @pytest.mark.skipif(**tm.no_dask_cudf()) 
@pytest.mark.skipif(**tm.no_dask_cuda()) @pytest.mark.mgpu - def test_dask_dataframe(self, local_cuda_cluster_rmm_kwargs): - with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: - with Client(cluster) as client: - run_with_dask_dataframe(dxgb.DaskDMatrix, client) - run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client) + def test_dask_dataframe(self, local_cuda_cluster): + with Client(local_cuda_cluster) as client: + run_with_dask_dataframe(dxgb.DaskDMatrix, client) + run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client) @given(params=parameter_strategy, num_rounds=strategies.integers(1, 20), dataset=tm.dataset_strategy) @settings(deadline=duration(seconds=120)) + @pytest.mark.skipif(**tm.no_dask()) + @pytest.mark.skipif(**tm.no_dask_cuda()) + @pytest.mark.parametrize('local_cuda_cluster', [{'n_workers': 2}], indirect=['local_cuda_cluster']) @pytest.mark.mgpu - def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster_rmm_kwargs): - with LocalCUDACluster(n_workers=2, **local_cuda_cluster_rmm_kwargs) as cluster: - with Client(cluster) as client: - run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, - client) - run_gpu_hist(params, num_rounds, dataset, - dxgb.DaskDeviceQuantileDMatrix, client) + def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster): + with Client(local_cuda_cluster) as client: + run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, + client) + run_gpu_hist(params, num_rounds, dataset, + dxgb.DaskDeviceQuantileDMatrix, client) @pytest.mark.skipif(**tm.no_cupy()) + @pytest.mark.skipif(**tm.no_dask()) + @pytest.mark.skipif(**tm.no_dask_cuda()) @pytest.mark.mgpu - def test_dask_array(self, local_cuda_cluster_rmm_kwargs): - with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: - with Client(cluster) as client: - run_with_dask_array(dxgb.DaskDMatrix, client) - run_with_dask_array(dxgb.DaskDeviceQuantileDMatrix, client) + def test_dask_array(self, local_cuda_cluster): + with Client(local_cuda_cluster) as client: + run_with_dask_array(dxgb.DaskDMatrix, client) + run_with_dask_array(dxgb.DaskDeviceQuantileDMatrix, client) @pytest.mark.skipif(**tm.no_dask()) @pytest.mark.skipif(**tm.no_dask_cuda()) @pytest.mark.mgpu - def test_empty_dmatrix(self, local_cuda_cluster_rmm_kwargs): - with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: - with Client(cluster) as client: - parameters = {'tree_method': 'gpu_hist', - 'debug_synchronize': True} - run_empty_dmatrix_reg(client, parameters) - run_empty_dmatrix_cls(client, parameters) - - def run_quantile(self, name, local_cuda_cluster_rmm_kwargs): + def test_empty_dmatrix(self, local_cuda_cluster): + with Client(local_cuda_cluster) as client: + parameters = {'tree_method': 'gpu_hist', + 'debug_synchronize': True} + run_empty_dmatrix_reg(client, parameters) + run_empty_dmatrix_cls(client, parameters) + + def run_quantile(self, name, local_cuda_cluster): if sys.platform.startswith("win"): pytest.skip("Skipping dask tests on Windows") @@ -218,32 +216,33 @@ def runit(worker_addr, rabit_args): env[port[0]] = port[1] return subprocess.run([exe, test], env=env, stdout=subprocess.PIPE) - with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: - with Client(cluster) as client: - workers = list(dxgb._get_client_workers(client).keys()) - rabit_args = client.sync(dxgb._get_rabit_args, workers, client) - futures = client.map(runit, - workers, - pure=False, - workers=workers, - rabit_args=rabit_args) - results = client.gather(futures) - for ret 
-                for ret in results:
-                    msg = ret.stdout.decode('utf-8')
-                    assert msg.find('1 test from GPUQuantile') != -1, msg
-                    assert ret.returncode == 0, msg
+        with Client(local_cuda_cluster) as client:
+            workers = list(dxgb._get_client_workers(client).keys())
+            rabit_args = client.sync(dxgb._get_rabit_args, workers, client)
+            futures = client.map(runit,
+                                 workers,
+                                 pure=False,
+                                 workers=workers,
+                                 rabit_args=rabit_args)
+            results = client.gather(futures)
+            for ret in results:
+                msg = ret.stdout.decode('utf-8')
+                assert msg.find('1 test from GPUQuantile') != -1, msg
+                assert ret.returncode == 0, msg

     @pytest.mark.skipif(**tm.no_dask())
+    @pytest.mark.skipif(**tm.no_dask_cuda())
     @pytest.mark.mgpu
     @pytest.mark.gtest
-    def test_quantile_basic(self, local_cuda_cluster_rmm_kwargs):
-        self.run_quantile('AllReduceBasic', local_cuda_cluster_rmm_kwargs)
+    def test_quantile_basic(self, local_cuda_cluster):
+        self.run_quantile('AllReduceBasic', local_cuda_cluster)

     @pytest.mark.skipif(**tm.no_dask())
+    @pytest.mark.skipif(**tm.no_dask_cuda())
     @pytest.mark.mgpu
     @pytest.mark.gtest
-    def test_quantile_same_on_all_workers(self, local_cuda_cluster_rmm_kwargs):
-        self.run_quantile('SameOnAllWorkers', local_cuda_cluster_rmm_kwargs)
+    def test_quantile_same_on_all_workers(self, local_cuda_cluster):
+        self.run_quantile('SameOnAllWorkers', local_cuda_cluster)


 async def run_from_dask_array_asyncio(scheduler_address):
@@ -274,11 +273,11 @@ async def run_from_dask_array_asyncio(scheduler_address):

 @pytest.mark.skipif(**tm.no_dask())
+@pytest.mark.skipif(**tm.no_dask_cuda())
 @pytest.mark.mgpu
-def test_with_asyncio(local_cuda_cluster_rmm_kwargs):
-    with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster:
-        with Client(cluster) as client:
-            address = client.scheduler.address
-            output = asyncio.run(run_from_dask_array_asyncio(address))
-            assert isinstance(output['booster'], xgboost.Booster)
-            assert isinstance(output['history'], dict)
+def test_with_asyncio(local_cuda_cluster):
+    with Client(local_cuda_cluster) as client:
+        address = client.scheduler.address
+        output = asyncio.run(run_from_dask_array_asyncio(address))
+        assert isinstance(output['booster'], xgboost.Booster)
+        assert isinstance(output['history'], dict)

From b4195cd5ea4f91b858527117a34c805e55a1f716 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Tue, 11 Aug 2020 22:55:49 +0000
Subject: [PATCH 41/41] Address reviewers' comments

---
 tests/cpp/helpers.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 0d9d3770498b..858b651981fb 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -489,20 +489,20 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(

 #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-using cuda_mr_t = rmm::mr::cuda_memory_resource;
-using pool_mr_t = rmm::mr::pool_memory_resource<cuda_mr_t>;
+using CUDAMemoryResource = rmm::mr::cuda_memory_resource;
+using PoolMemoryResource = rmm::mr::pool_memory_resource<CUDAMemoryResource>;
 class RMMAllocator {
  public:
-  std::vector<std::unique_ptr<cuda_mr_t>> cuda_mr;
-  std::vector<std::unique_ptr<pool_mr_t>> pool_mr;
+  std::vector<std::unique_ptr<CUDAMemoryResource>> cuda_mr;
+  std::vector<std::unique_ptr<PoolMemoryResource>> pool_mr;
   int n_gpu;
   RMMAllocator() : n_gpu(common::AllVisibleGPUs()) {
     int current_device;
     CHECK_EQ(cudaGetDevice(&current_device), cudaSuccess);
     for (int i = 0; i < n_gpu; ++i) {
       CHECK_EQ(cudaSetDevice(i), cudaSuccess);
-      cuda_mr.push_back(std::unique_ptr<cuda_mr_t>(new cuda_mr_t));
-      pool_mr.push_back(std::unique_ptr<pool_mr_t>(new pool_mr_t(cuda_mr[i].get())));
+      cuda_mr.push_back(std::make_unique<CUDAMemoryResource>());
+      pool_mr.push_back(std::make_unique<PoolMemoryResource>(cuda_mr[i].get()));
     }
     CHECK_EQ(cudaSetDevice(current_device), cudaSuccess);
   }
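For reference, the following standalone sketch (not part of any patch in this series) shows the kind of per-GPU pool construction that the RMMAllocator helper above exercises. The type aliases mirror the renamed ones in tests/cpp/helpers.cc; the final set_default_resource call is an assumption based on the default_memory_resource.hpp header this series includes in device_helpers.cuh, and later RMM releases rename it to set_current_device_resource.

// Illustrative sketch only, assuming the RMM headers pulled in by this series.
#include <memory>
#include <vector>

#include <cuda_runtime_api.h>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/default_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

using CUDAMemoryResource = rmm::mr::cuda_memory_resource;
using PoolMemoryResource = rmm::mr::pool_memory_resource<CUDAMemoryResource>;

int main() {
  int n_gpu = 0;
  if (cudaGetDeviceCount(&n_gpu) != cudaSuccess || n_gpu == 0) {
    return 1;  // no usable GPU
  }

  // One plain CUDA resource plus one pooling resource per visible device,
  // mirroring the cuda_mr/pool_mr vectors kept alive by RMMAllocator.
  std::vector<std::unique_ptr<CUDAMemoryResource>> cuda_mr;
  std::vector<std::unique_ptr<PoolMemoryResource>> pool_mr;
  for (int i = 0; i < n_gpu; ++i) {
    cudaSetDevice(i);
    cuda_mr.push_back(std::make_unique<CUDAMemoryResource>());
    // Each pool sub-allocates from the plain CUDA resource on its device,
    // so repeated cudaMalloc/cudaFree round-trips are avoided.
    pool_mr.push_back(std::make_unique<PoolMemoryResource>(cuda_mr[i].get()));
  }

  // Route subsequent RMM allocations through the pool on device 0
  // (API name assumed from the RMM version targeted by this series).
  cudaSetDevice(0);
  rmm::mr::set_default_resource(pool_mr[0].get());
  return 0;
}

The resources must outlive every allocation served from them, which is why the test harness keeps them in long-lived vectors rather than constructing them on the fly.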